Diese Seite gehört zum Wikipedia-Archiv.

# !/usr/bin/perl

use Digest::MD5 qw(md5_base64);

$bs = "Linux"; # oder "Windows"

sub xmlunesc {
	my $text = shift;
	$text =~ s/&lt;/</sg;
	$text =~ s/&gt;/>/sg;
	$text =~ s/&apos;/'/sg;
	$text =~ s/&quot;/"/sg; #"
	$text =~ s/&amp;/&/sg;
	$text;
}

sub xmlsiteinfo {
	while($xml =~ /<namespace key="(.*?)"(?: \/>|>(.*?)<\/namespace>)/sg) {
		$namespace{$2} = $1 if defined $2;
	}
	$xml = "";
}

sub xmlrevision {
	$xml =~ s/<revision>(.*?)<\/revision>//s;
	my $revcontent = $1;
	$rev = {};
	xmlpage() unless defined $page;
	$xml = "";
	while($revcontent =~ /<(id|timestamp|contributor|minor|comment|text)(?: xml:space="preserve"| type="(.*?)")*(?:\s*\/>|>(.*?)<\/\1>)/sg) {
		my ($tag, $type, $content) = ($1, $2, $3);
		$content = "" unless defined $content;
		if($tag =~ /^text/) {
			$rev->{"text"} = xmlunesc($content);
		} elsif($tag eq "contributor") {
			if($content =~ /<username>(.*?)<\/username>\s*<id>(.*?)<\/id>/) {
				$rev->{"user_text"} = xmlunesc($1);
				$rev->{"user"} = $2;
			} elsif($content =~ /<ip>(.*?)<\/ip>/) {
				$rev->{"user_text"} = xmlunesc($1);
				$rev->{"user"} = 0;
			} else {
				$rev->{"user_text"} = "_";
				$rev->{"user"} = 0;
			}
		} elsif($tag eq "comment") {
			$rev->{"comment"} = xmlunesc($content);
		} elsif($tag eq "timestamp") {
			$content =~ /^(....)-(..)-(..)T(..):(..):(..)Z$/;
			$rev->{"timestamp"} = "$1$2$3$4$5$6";
		} else {
			$rev->{$tag} = $content;
		}
	}
	revision();
}

sub xmlpage {
	while($xml =~ s/<(title|id|restrictions)(?:\s*\/>|>(.*?)<\/\1>)//s) {
		my ($tag, $content) = ($1, $2);
		if($tag eq "title") {
			$content = xmlunesc($content);
			if($content =~ /(.+?):(.+)/ && defined $namespace{$1}) {
				$page->{"namespace"} = $namespace{$1};
				$content = $2;
			} else {
				$page->{"namespace"} = 0;
			}
		}
		$page->{$tag} = defined $content ? $content : "";
	}
}

sub revision {
	my $is_redirect = $rev->{"text"} =~ /^# ?redirect/i;
	my $len = length($rev->{"text"});
	return unless defined $page->{"title"};
	my $text_md5 = md5_base64($rev->{"text"});
	my $loeschlink = $rev->{"text"} =~ /\[\[Wikipedia:(Löschkandidaten|Seiten, die gelöscht werden sollten)/s ||
		$rev->{"text"} =~ /\{\{(msg:)?(vfd|Lösch|URV)/is;
	$page->{"title"} = "_" unless defined $page->{"title"};
	$page->{"title"} =~ s/\s/_/sg;
	$rev->{"user_text"} =~ s/\s/_/sg;
	$rev->{"comment"} = "_" unless defined $rev->{"comment"};
	$rev->{"comment"} =~ s/\s/_/sg;
	printf ZSF1 "%s %7i %i %i %s %20s %i %s\n", $rev->{"timestamp"}, $len, $is_redirect,
		$loeschlink, $text_md5, $rev->{"user_text"}, $page->{"namespace"}, $page->{"title"};
}

sub kategorien {
	my $links = $page->{"title"};
	while($rev->{"text"} =~ /\[\[Kategorie:([^\|\]]*)/sg) {
		my $kat = $1;
		$kat =~ s/\s/_/sg;
		$links .=  " " . $kat;
	}
	print KATLINKS "$links\n" if $page->{"namespace"}==0;
	print KATTREE  "$links\n" if $page->{"namespace"}==14;
}

sub sortiere {
	my $tmp = $ENV{"LC_ALL"};
	$ENV{"LC_ALL"} = "C";
	system "sort $_[0] /O $_[1]" if $bs eq "Windows";
	system "sort -T ./sort-tmp -S 400M -o $_[1] $_[0]" if $bs eq "Linux";
	$ENV{"LC_ALL"} = $tmp;
	unlink $_[0];
}

mkdir "sort-tmp" unless -d "sort-tmp";
open ZSF1, ">zsf1";
open KATLINKS, ">katlinks";
open KATTREE, ">kattree";
while(<>) {
	$xml .= $_;
	if(/^\s*<\/siteinfo>/) {
		xmlsiteinfo();
	}
	next unless defined %namespace;
	if(/^\s*<\/revision>/) {
		xmlrevision();
	} elsif(/^\s*<\/page>/) {
		kategorien();
		$xml = "";
		$page = undef;
	}
}
close ZSF1; close KATLINKS; close KATTREE;
sortiere("zsf1", "zsf");
rmdir "sort-tmp";