import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.beans.XMLDecoder;
import java.beans.XMLEncoder;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/**
 * Command-line bot that pages through the members of the category
 * {@code Kategorie:Begriffsklärung} via the de.wikipedia.org API and records
 * every member whose linked pages all carry the {@code missing} attribute
 * ("vollrot").
 *
 * <p>Progress (continuation token, user agent and the collected titles) is
 * persisted to {@code state.xml} with {@link XMLEncoder} from a JVM shutdown
 * hook, and the collected titles are additionally written as wiki links to
 * {@code out.wiki}. When {@code state.xml} exists at start-up the bot resumes
 * from it.
 *
 * <p>The shutdown hook may run while the main thread is still crawling (e.g.
 * when the process is interrupted), therefore all access to the collected
 * titles is synchronized on the set and the continuation token is volatile.
 * The class has to stay a public bean with a no-arg constructor and
 * getter/setter pairs because {@link XMLEncoder}/{@link XMLDecoder} rely on
 * them.
 */
public class BKLCheckBot {

    private static final File STATE_FILE = new File("state.xml");
    private static final Logger logger = Logger.getLogger("bklCheckBot");
    /** Charset name used for URL encoding and for the generated wikitext. */
    private static final String ENCODING = "UTF-8";

    /** Continuation token of the batch currently being processed; persisted. */
    private volatile String cmContinue;
    /** Continuation token announced by the most recent response; null when there is none. */
    private transient String nextCMContinue;
    /** Titles found so far, in discovery order. Guarded by its own monitor. */
    private final Set<String> vollroteBKLs = new LinkedHashSet<>();
    private String userAgent;
    private final XMLInputFactory factory = createFactory();

    /**
     * Starts the bot.
     *
     * @param args on the very first start {@code args[0]} must contain the
     *             User-Agent to send; ignored when a state file exists
     * @throws IllegalStateException if there is neither a state file nor an argument
     * @throws IOException           on network or file errors
     * @throws XMLStreamException    if an API response cannot be parsed
     */
    public static void main(String[] args) throws IOException,
            XMLStreamException {
        final BKLCheckBot bot;
        if (STATE_FILE.exists()) {
            bot = readState();
        } else if (0 == args.length) {
            throw new IllegalStateException(
                    "Beim ersten Start bitte User-Agent als Parameter angeben, z. B. E-Mail-Adresse. Siehe https://www.mediawiki.org/wiki/API#Identifying_your_client");
        } else {
            bot = new BKLCheckBot();
            bot.userAgent = args[0];
        }
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                // The two outputs are independent: a failure of one must not
                // prevent the other from being written.
                try {
                    bot.writeState();
                } catch (IOException | RuntimeException e) {
                    logger.log(Level.WARNING, "writeState", e);
                }
                try {
                    bot.writeWikitext();
                } catch (IOException | RuntimeException e) {
                    logger.log(Level.WARNING, "writeWikitext", e);
                }
            }
        });
        bot.run();
    }

    /**
     * Creates the StAX factory used for all API responses. DTD processing and
     * external entities are switched off because the responses are fetched
     * from the network and the parsing code cannot handle a DOCTYPE anyway.
     */
    private static XMLInputFactory createFactory() {
        XMLInputFactory result = XMLInputFactory.newInstance();
        result.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        result.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
                Boolean.FALSE);
        return result;
    }

    /**
     * Writes every collected title as {@code [[title]]} on its own line to
     * {@code out.wiki}, encoded as UTF-8.
     */
    private void writeWikitext() throws IOException {
        // Copy under the lock so the file I/O does not block the crawler and
        // the iteration cannot collide with a concurrent add.
        List<String> snapshot;
        synchronized (vollroteBKLs) {
            snapshot = new ArrayList<>(vollroteBKLs);
        }
        try (FileOutputStream out = new FileOutputStream("out.wiki");
                Writer writer = new OutputStreamWriter(out, ENCODING)) {
            for (String s : snapshot) {
                writer.write("[[" + s + "]]\n");
            }
        }
    }

    /**
     * Processes the category batch by batch, starting at {@link #cmContinue},
     * until a response no longer announces a continuation. {@code cmContinue}
     * is advanced only after a batch has been processed completely, so an
     * interrupted batch is repeated on the next start. After a complete pass
     * {@code cmContinue} is {@code null}.
     */
    private void run() throws IOException, XMLStreamException {
        do {
            for (String bkl : getBKLs()) {
                if (istVollrot(bkl)) {
                    logger.info("Vollrot: " + bkl);
                    synchronized (vollroteBKLs) {
                        vollroteBKLs.add(bkl);
                    }
                }
            }
            cmContinue = nextCMContinue;
        } while (null != cmContinue);
    }

    /**
     * Queries the pages linked from {@code bkl} (article namespace only) and
     * tells whether all of them are missing.
     *
     * @param bkl title of the page to check
     * @return {@code true} if the response lists at least one linked page and
     *         every listed page has the {@code missing} attribute
     */
    private boolean istVollrot(String bkl) throws IOException,
            XMLStreamException {
        String spec = "http://de.wikipedia.org/w/api.php?format=xml&action=query&generator=links&gplnamespace=0&gpllimit=max&titles="
                + URLEncoder.encode(bkl, ENCODING);
        try (InputStream stream = getStream(spec)) {
            return allLinkedPagesMissing(stream);
        }
    }

    /**
     * Parses a {@code generator=links} response.
     *
     * @param stream the uncompressed XML response; not closed by this method
     * @return {@code false} if the response contains no {@code page} element
     *         or a {@code page} without {@code missing} attribute, otherwise
     *         {@code true}
     */
    private boolean allLinkedPagesMissing(InputStream stream)
            throws XMLStreamException {
        // Expected shape of the response:
        //
        // <?xml version="1.0"?>
        // <api>
        // <limits links="500" />
        // <query>
        // <pages>
        // <page ns="0" title="Francis de Quervain" missing="" />
        // <page ns="0" title="Marcel Roland de Quervain" missing="" />
        // <page pageid="3046608" ns="0"
        // title="Alfred de Quervain (Geophysiker)" />
        // <page pageid="941244" ns="0" title="Alfred de Quervain (Theologe)" />
        // <page pageid="1563710" ns="0" title="Fritz de Quervain" />
        // </pages>
        // </query>
        // </api>
        XMLStreamReader reader = factory.createXMLStreamReader(stream);
        try {
            reader.nextTag();
            assert "api".equals(reader.getLocalName());
            // Skip ahead to the first <page>. Reaching </api> first means the
            // response lists no linked pages; calling nextTag() beyond that
            // point would throw.
            do {
                int event = reader.nextTag();
                if (END_ELEMENT == event
                        && "api".equals(reader.getLocalName())) {
                    return false;
                }
            } while (!"page".equals(reader.getLocalName()));
            do {
                assert START_ELEMENT == reader.getEventType();
                if (null == reader.getAttributeValue(null, "missing")) {
                    return false;
                }
                reader.nextTag(); // </page>
                assert END_ELEMENT == reader.getEventType();
                reader.nextTag(); // next <page> or </pages>
            } while ("page".equals(reader.getLocalName()));
            return true;
        } finally {
            reader.close();
        }
    }

    /**
     * Fetches one batch of category members, continuing at
     * {@link #cmContinue} if that is set. As a side effect
     * {@link #nextCMContinue} is set to the continuation token of the
     * response, or to {@code null} if the response has none.
     *
     * @return the titles of this batch, never {@code null}
     */
    private List<String> getBKLs() throws IOException, XMLStreamException {
        String spec = "http://de.wikipedia.org/w/api.php?format=xml&action=query&list=categorymembers&cmnamespace=0&cmprop=title&cmtitle=Kategorie:Begriffskl%C3%A4rung";
        String from = cmContinue;
        if (null != from) {
            // The token contains characters such as '|' that are not allowed
            // verbatim in a query string.
            spec = spec + "&cmcontinue=" + URLEncoder.encode(from, ENCODING);
        }
        try (InputStream stream = getStream(spec)) {
            return parseCategoryMembers(stream);
        }
    }

    /**
     * Parses a {@code list=categorymembers} response and updates
     * {@link #nextCMContinue}.
     *
     * @param stream the uncompressed XML response; not closed by this method
     * @return the {@code title} attributes of all {@code cm} elements in
     *         document order
     */
    private List<String> parseCategoryMembers(InputStream stream)
            throws XMLStreamException {
        // Expected shape of the response (query-continue is optional):
        //
        // <api>
        // <query-continue>
        // <categorymembers
        // cmcontinue="page|233a31204c4947410a312e204c494741|1536594"/>
        // </query-continue>
        // <query>
        // <categorymembers>
        // <cm ns="0" title="1. Dalai Lama"/>
        // <cm ns="0" title="1 Decembrie"/>
        // <cm ns="0" title="I. Korps"/>
        // </categorymembers>
        // </query>
        // </api>

        // Forget the token of the previous response; otherwise the final
        // batch (which has no query-continue) would be requested forever.
        nextCMContinue = null;
        List<String> result = new ArrayList<>();
        XMLStreamReader reader = factory.createXMLStreamReader(stream);
        try {
            reader.nextTag();
            assert "api".equals(reader.getLocalName());
            reader.nextTag();
            if ("query-continue".equals(reader.getLocalName())) {
                reader.nextTag();
                assert "categorymembers".equals(reader.getLocalName());
                nextCMContinue = reader.getAttributeValue(null, "cmcontinue");
                reader.nextTag(); // </categorymembers>
                assert "categorymembers".equals(reader.getLocalName());
                reader.nextTag(); // </query-continue>
                assert "query-continue".equals(reader.getLocalName());
                reader.nextTag();
            }
            assert "query".equals(reader.getLocalName());
            reader.nextTag();
            assert "categorymembers".equals(reader.getLocalName());
            reader.nextTag();
            while ("cm".equals(reader.getLocalName())) {
                result.add(reader.getAttributeValue(null, "title"));
                reader.nextTag(); // </cm>
                assert "cm".equals(reader.getLocalName());
                reader.nextTag(); // next <cm> or </categorymembers>
            }
        } finally {
            reader.close();
        }
        return result;
    }

    /**
     * Opens {@code spec} with the configured User-Agent, offering gzip
     * compression, and returns the decoded response body.
     *
     * @param spec the complete request URL
     * @return the response body; gunzipped if and only if the server declared
     *         {@code Content-Encoding: gzip}. The caller must close it.
     */
    private InputStream getStream(String spec)
            throws MalformedURLException, IOException {
        URL url = new URL(spec);
        URLConnection connection = url.openConnection();
        connection.setRequestProperty("Accept-encoding", "gzip");
        connection.setRequestProperty("User-Agent", userAgent);
        connection.connect();
        logger.info(spec);
        InputStream raw = connection.getInputStream();
        // gzip was only offered; the server is free to answer uncompressed.
        if (!"gzip".equalsIgnoreCase(connection.getContentEncoding())) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        } catch (IOException | RuntimeException e) {
            // The GZIP header is read eagerly; do not leak the connection
            // stream when that fails.
            raw.close();
            throw e;
        }
    }

    /** Restores a bot from {@code state.xml}. */
    private static BKLCheckBot readState() throws FileNotFoundException {
        try (XMLDecoder decoder = new XMLDecoder(
                new FileInputStream(STATE_FILE))) {
            return (BKLCheckBot) decoder.readObject();
        }
    }

    /** Persists the bean properties of this bot to {@code state.xml}. */
    private void writeState() throws IOException {
        try (XMLEncoder encoder = new XMLEncoder(new FileOutputStream(
                STATE_FILE))) {
            encoder.writeObject(this);
        }
    }

    /** @return the continuation token to resume from, or {@code null} to start at the beginning */
    public String getCmContinue() {
        return cmContinue;
    }

    /** @param s the continuation token to resume from, or {@code null} */
    public void setCmContinue(String s) {
        cmContinue = s;
    }

    /** @return the User-Agent header value sent with every request */
    public String getUserAgent() {
        return userAgent;
    }

    /** @param s the User-Agent header value to send with every request */
    public void setUserAgent(String s) {
        userAgent = s;
    }

    /** @return an independent copy of the titles collected so far, in discovery order */
    public Set<String> getVollroteBKLs() {
        synchronized (vollroteBKLs) {
            return new LinkedHashSet<>(vollroteBKLs);
        }
    }

    /**
     * Replaces the collected titles with a copy of {@code set}.
     *
     * @param set the titles to take over; must not be {@code null}
     */
    public void setVollroteBKLs(Set<String> set) {
        // Copy first so a null argument fails before anything is cleared.
        Set<String> copy = new LinkedHashSet<>(set);
        synchronized (vollroteBKLs) {
            vollroteBKLs.clear();
            vollroteBKLs.addAll(copy);
        }
    }
}