--- In
billiontriples@yahoogroups.com, Peter Mika <pmika@...> wrote:
>
> Dear All,
>
> After some long and careful consideration, we have made the decision
not
> to invent our own format for exchanging data but to rely on an
existing
> format known as WARC [1], in particular WARC version 0.9. WARC
archives
> store provenance (URL) and timestamp in the header. The only
additional
> agreement we need to make is that we are going to encode files in
> N-Triples format. (If that is a problem, let us know.)
>
> What convinced us ultimately about WARC is the excellent tool support
in
> the form of a Java API from the Laboratory for Web Algorithmics [2] of
> the Università degli Studi di Milano <http://www.unimi.it/>. The API can
> be downloaded from [3] and there is a separate tarball with all the
> dependencies. (The license is LGPL.) One of the nice features of this
> API is the ability to work with streams of compressed WARC records,
> where metadata about each record is stored in the gzip header. This
> means that the metadata can be read without uncompressing the content
of
> the record itself. Further, there are skip pointers in the file, which
> means that a record can be easily skipped over.
>
> To make it really easy, I've also created sample code that
demonstrates
> how to create WARC archives from a set of files or a directory
structure
> on disk, and how to read back the resulting WARC archive. The code is
> simply attached to this email, if all is well. (First time I send
> attachments to a Y! Group.) Many thanks to Sebastiano Vigna, one of
the
> authors of the LAW API, for his help and advice.
>
> To support the Challenge, we at Yahoo! Research Barcelona are also
hard
> at work to get permission to release a microformat crawl of 100
million
> triples. We hope this will be a significant contribution to the
> state-of-the-art and will complement the existing data sets to be
> provided by Semantic Web search engines.
>
> As always, your comments and questions are more than appreciated. In
> particular those of you planning to provide some data, please let us
> know if you need any further help.
>
> Thanks,
> Peter
>
> [1]
http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html
> [2]
http://law.dsi.unimi.it/
> [3]
>
http://law.dsi.unimi.it/index.php?option=com_content&task=section&id=5&Itemid=42
>
> package com.yahoo.corp.barcelona.billiontriples;
>
> import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
> import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
> import it.unimi.dsi.law.warc.io.GZWarcRecord;
> import it.unimi.dsi.law.warc.io.WarcRecord;
> import it.unimi.dsi.law.warc.util.BURL;
> import it.unimi.dsi.law.warc.util.BasicHttpResponse;
>
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.OutputStream;
> import java.io.UnsupportedEncodingException;
> import java.net.URLDecoder;
> import java.util.Date;
>
> import javax.xml.transform.TransformerConfigurationException;
>
> /** Sample code for creating Warc packages. This class is executable.
> *
> * @author pmika@...
> *
> */
> public class WarcPackager {
>
> public final static int MAX_RECORDS = -1;
>
> private int count = 0;
>
> //MODIFY THIS if your filenames are not URLs
> protected BURL getURL(File file) {
> BURL result = null;
> try {
> result = BURL.parse(URLDecoder.decode(file.getName(), "UTF-8"));
> } catch (UnsupportedEncodingException e) {
>
> e.printStackTrace();
> }
> return result;
> }
>
> //MODIFY this if the last modification date of the file != crawl date
> protected Date getDate(File file) {
> return new Date(file.lastModified());
> }
>
> private WarcRecord createRecord(File file) throws
UnsupportedEncodingException, IOException {
> GZWarcRecord result = new GZWarcRecord();
>
> InputStream fis = new FileInputStream(file);
>
> BasicHttpResponse response = new BasicHttpResponse();
>
> BURL url = getURL(file);
>
> if (url == null) {
> throw new IllegalArgumentException("Warning: getURL() returned null
for " + file);
> }
>
> response.url(getURL(file));
>
> response.statusLine("HTTP/1.1 200 OK");
> response.status(200);
> response.contentAsStream(new FastBufferedInputStream(fis));
>
> response.toWarcRecord(result);
>
> Date date = getDate(file);
> if (date == null) {
> throw new IllegalArgumentException("Warning: getDate() returned
null for " + file);
> }
>
> result.header.creationDate = getDate(file);
>
> return result;
> }
>
>
> //recursive
> public void processFileOrDir(OutputStream out, File file) throws
IOException {
>
> //if MAX_RECORDS is specified, and we've reached the limit, return
> if (MAX_RECORDS != -1 && count > MAX_RECORDS) {
> return;
> }
>
> if (count++ % 99999 == 0) System.err.println("Processed " + count +
" files.");
>
> if (file.isDirectory()) {
> for (String name : file.list()) {
> processFileOrDir(out, new File(file.getAbsolutePath() +
System.getProperty("file.separator") + name));
> }
> } else {
> //Catch exceptions: failure to write a single file should not make
us abort
> try {
> WarcRecord record = createRecord(file);
> record.write(out);
> } catch (Exception e) {
> System.err.println(e);
> }
> }
>
> }
>
>
> /**
> * Package the files or directories passed in as arguments.
> * Directories are processed recursively.
> *
> * The result is printed to standard out, errors/diagnostic messages
to std err.
> *
> * @param args
> * @throws TransformerConfigurationException
> * @throws IOException
> * @throws UnsupportedEncodingException
> */
> public static void main(String[] args) throws
TransformerConfigurationException, UnsupportedEncodingException,
IOException {
>
> if (args.length < 1) {
> System.err.println("Usage: WarcPackage <fileOrDir> ...");
> }
>
> FastBufferedOutputStream out = new
FastBufferedOutputStream(System.out);
> WarcPackager packager = new WarcPackager();
>
> for (String arg: args) {
> packager.processFileOrDir(out, new File(arg));
> }
>
> out.close();
>
>
> }
> }
>
> package com.yahoo.corp.barcelona.billiontriples;
>
> import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
> import it.unimi.dsi.fastutil.io.MeasurableInputStream;
> import it.unimi.dsi.law.warc.filters.Filter;
> import it.unimi.dsi.law.warc.filters.Filters;
> import it.unimi.dsi.law.warc.io.GZWarcRecord;
> import it.unimi.dsi.law.warc.io.WarcFilteredIterator;
> import it.unimi.dsi.law.warc.io.WarcRecord;
> import it.unimi.dsi.law.warc.util.BURL;
> import it.unimi.dsi.law.warc.util.WarcHttpResponse;
>
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileNotFoundException;
> import java.io.IOException;
>
> import org.openrdf.model.Statement;
> import org.openrdf.rio.RDFHandlerException;
> import org.openrdf.rio.RDFParseException;
> import org.openrdf.rio.helpers.RDFHandlerBase;
> import org.openrdf.rio.ntriples.NTriplesParser;
>
> /**
>  * Sample code for reading WARC packages.
>  *
>  * This class is executable. It iterates over the records of a compressed
>  * WARC archive, prints the URI of each record to standard out and counts
>  * the lines of each record's content. Helper methods for dumping the
>  * content and for counting triples with an N-Triples parser are provided
>  * as alternatives.
>  *
>  * @author pmika@...
>  */
> public class WarcReader {
>
>     /** A progress line is printed to std err every this many records. */
>     private static final int PROGRESS_INTERVAL = 100000;
>
>     private NTriplesParser parser = new NTriplesParser();
>
>     private CountHandler countHandler = new CountHandler();
>
>     // long, not int: the totals are accumulated over a whole archive and
>     // an int would overflow past about 2.1 billion.
>     private long tripleCount = 0;
>     private long lineCount = 0;
>
>     /**
>      * RDF handler that counts the statements of a single parse run.
>      * The counter is reset in startRDF().
>      */
>     public class CountHandler extends RDFHandlerBase {
>
>         private int count = 0;
>
>         @Override
>         public void endRDF() throws RDFHandlerException {
>             super.endRDF();
>         }
>
>         @Override
>         public void handleStatement(Statement st) {
>             count++;
>         }
>
>         @Override
>         public void startRDF() throws RDFHandlerException {
>             super.startRDF();
>             count = 0;
>         }
>
>     }
>
>     /** A URL filter that accepts everything. */
>     public static class TrueFilter extends Filter<BURL> {
>
>         @Override
>         public boolean accept( BURL x ) {
>             return true;
>         }
>
>         @Override
>         public String toExternalForm() {
>             return "true";
>         }
>
>     }
>
>     /**
>      * Parses a block as N-Triples and adds the number of statements to
>      * the running triple total. If parsing fails, the stack trace is
>      * printed and the total is left unchanged for this block.
>      *
>      * @param block the content to parse
>      * @param base the base URI handed to the parser
>      */
>     public void countTriples(MeasurableInputStream block, String base) {
>         parser.setRDFHandler(countHandler);
>
>         try {
>             parser.parse(block, base);
>             tripleCount += countHandler.count;
>         } catch (RDFParseException e) {
>             e.printStackTrace();
>         } catch (RDFHandlerException e) {
>             e.printStackTrace();
>         } catch (IOException e) {
>             e.printStackTrace();
>         }
>     }
>
>     /**
>      * Counts the lines of a block and adds them to the running line
>      * total. A final line without a trailing newline is counted too.
>      * Blank lines are counted as well, so for N-Triples input this is
>      * only an approximation of the number of triples.
>      *
>      * @param block the content to scan; it is read to its end
>      * @throws IOException if reading the block fails
>      */
>     public void countLines(MeasurableInputStream block)
>             throws IOException {
>         int c;
>         // Starts as '\n' so that an empty block contributes no line.
>         int last = '\n';
>         while ((c = block.read()) != -1) {
>             if (c == '\n') {
>                 lineCount++;
>             }
>             last = c;
>         }
>         if (last != '\n') {
>             // The content ended in the middle of a line.
>             lineCount++;
>         }
>     }
>
>     /**
>      * Copies a block byte by byte to standard out.
>      *
>      * @param block the content to dump; it is read to its end
>      * @throws IOException if reading the block fails
>      */
>     public void dumpContent(MeasurableInputStream block)
>             throws IOException {
>         int c;
>         while ((c = block.read()) != -1) {
>             System.out.write(c);
>         }
>         // write(int) may leave bytes buffered in System.out.
>         System.out.flush();
>     }
>
>     /**
>      * Reads the WARC archive named by the first argument and prints the
>      * number of lines and records found. With no arguments a usage line
>      * is printed and nothing is read.
>      *
>      * @param args args[0] is the path of the archive to read
>      * @throws FileNotFoundException if the archive cannot be opened
>      */
>     public static void main(String[] args) throws FileNotFoundException {
>         if (args.length < 1) {
>             System.err.println("Usage: WarcReader <file>");
>             return;
>         }
>
>         final FastBufferedInputStream in = new FastBufferedInputStream(
>                 new FileInputStream(new File(args[0])));
>         WarcReader reader = new WarcReader();
>         int urlCount = 0;
>
>         try {
>             GZWarcRecord record = new GZWarcRecord();
>             Filter<WarcRecord> filter =
>                     Filters.adaptFilterBURL2WarcRecord(new TrueFilter());
>             WarcFilteredIterator it =
>                     new WarcFilteredIterator(in, record, filter);
>             WarcHttpResponse response = new WarcHttpResponse();
>
>             try {
>                 while (it.hasNext()) {
>                     WarcRecord nextRecord = it.next();
>                     // Count only records that were actually read.
>                     urlCount++;
>
>                     // Get the HttpResponse
>                     try {
>                         response.fromWarcRecord(nextRecord);
>                         System.out.println("Processing: "
>                                 + nextRecord.header.subjectUri);
>
>                         // Alternatives to counting lines:
>                         //   reader.dumpContent(...) dumps the content;
>                         //   reader.countTriples(..., subjectUri string)
>                         //   counts triples by parsing the RDF.
>
>                         // Count the lines, which approximates the
>                         // number of triples in N-Triples format.
>                         reader.countLines(response.contentAsStream());
>                     } catch (IOException e) {
>                         // A bad record must not stop the whole run.
>                         e.printStackTrace();
>                     }
>
>                     if (urlCount % PROGRESS_INTERVAL == 0) {
>                         System.err.println(
>                                 "Processed " + urlCount + " files.");
>                     }
>                 }
>             } catch (RuntimeException re) {
>                 // Still best effort: report the totals gathered so far,
>                 // but no longer hide why iteration stopped.
>                 System.err.println("Warning: stopped reading " + args[0]
>                         + " after " + urlCount + " records: " + re);
>             }
>         } finally {
>             try {
>                 in.close();
>             } catch (IOException e) {
>                 System.err.println(
>                         "Warning: could not close " + args[0] + ": " + e);
>             }
>         }
>
>         System.out.println("Counted " + reader.lineCount
>                 + " triples from " + urlCount + " urls.");
>     }
>
> }
>