80Legs file extractor

#!/usr/bin/python
 
# Original Code
# http://github.com/turian/py80legsformat by Joseph Turian
 
import csv
import hashlib
import io
import os
import struct
import sys
import zipfile
from cStringIO import StringIO
from optparse import OptionParser
 
class EightyLegs:
    """Extract the pages stored in an 80legs ``.80`` file or ``.zip`` archive.

    A ``.80`` stream, as parsed here, is a header of two 4-byte integers
    (class id, version id) followed by records.  Each record is a 4-byte
    URL length, the UTF-8 encoded URL, a 4-byte data length and the raw
    page data.

    ``parse()`` writes every page to ``<directory>/<md5 of url>.html`` and
    an index of ``md5<TAB>url`` rows to ``<directory>.tsv``.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    # Header values a stream must start with to be accepted by read().
    _CLASS_ID = 218217067
    _VERSION_ID = 1

    # "=" means native byte order with the standard 4-byte int size, i.e.
    # what the original native "i" format produced on every platform where
    # its ``calcsize("i") == 4`` check passed.
    # NOTE(review): the files are presumably little-endian; native order is
    # kept to match the previous behaviour -- confirm against the format.
    _HEADER = struct.Struct("=ii")
    _LENGTH = struct.Struct("=i")

    def __init__(self, filename):
        """Remember the input path and derive the output location from it.

        ``directory`` is the input path with everything from the first dot
        of its *base name* removed.  Dots in the directory part are left
        alone, so ``./run.1/data.80`` maps to ``./run.1/data``.
        """
        self.filename = filename
        head, tail = os.path.split(filename)
        # Fall back to splitext for dot-files (".foo.80"), whose first-dot
        # stem would otherwise be empty.
        stem = tail.split('.')[0] or os.path.splitext(tail)[0]
        self.directory = os.path.join(head, stem)

    @staticmethod
    def _read_exact(file, size, what):
        """Read exactly ``size`` bytes from ``file`` or raise ValueError.

        ``what`` names the field being read and is only used in the error
        message.
        """
        if size < 0:
            # read(-1) would silently consume the rest of the stream.
            raise ValueError("negative %s size: %d" % (what, size))
        chunk = file.read(size)
        if len(chunk) != size:
            raise ValueError("truncated %s: expected %d bytes, got %d"
                             % (what, size, len(chunk)))
        return chunk

    def read(self, file):
        """Yield ``(url, data)`` for every record of a ``.80`` stream.

        ``file`` must be a binary file-like object.  ``url`` is the URL
        decoded from UTF-8; ``data`` is the raw page content as a byte
        string.

        Raises ValueError if the header does not match the expected class
        and version ids, or if the stream ends in the middle of a record
        (UnicodeDecodeError, a ValueError subclass, for a URL that is not
        valid UTF-8).
        """
        header = self._read_exact(file, self._HEADER.size, "file header")
        ids = self._HEADER.unpack(header)
        if ids != (self._CLASS_ID, self._VERSION_ID):
            raise ValueError("unsupported 80legs header %r (expected %r)"
                             % (ids, (self._CLASS_ID, self._VERSION_ID)))

        while True:
            raw_size = file.read(self._LENGTH.size)
            if not raw_size:
                # Clean end of stream exactly on a record boundary.
                return
            if len(raw_size) != self._LENGTH.size:
                raise ValueError("truncated URL size field")
            (url_size,) = self._LENGTH.unpack(raw_size)
            url = self._read_exact(file, url_size, "URL").decode("utf-8")

            raw_size = self._read_exact(file, self._LENGTH.size,
                                        "data size field")
            (data_size,) = self._LENGTH.unpack(raw_size)
            data = self._read_exact(file, data_size, "page data")
            yield (url, data)

    def read_zip(self, file):
        """Yield ``(url, data)`` from every ``.80`` member of a zip archive.

        ``file`` is a path or a binary file-like object accepted by
        ``zipfile.ZipFile``.  Members whose name does not end in ``.80`` are
        skipped.  Each member is read fully into memory before parsing.
        """
        archive = zipfile.ZipFile(file, 'r')
        try:
            for info in archive.infolist():
                if not info.filename.endswith('.80'):
                    continue
                for record in self.read(io.BytesIO(archive.read(info))):
                    yield record
        finally:
            archive.close()

    def parse(self):
        """Extract ``self.filename`` into ``self.directory`` plus a TSV index.

        Creates the output directory if needed, prints each URL as it is
        processed, writes the page bytes to ``<md5>.html`` and appends a
        ``[md5, url]`` row to ``<directory>.tsv``.  The md5 is taken over
        the UTF-8 encoding of the URL.
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        py3 = sys.version_info[0] >= 3
        tsv_path = "%s.tsv" % self.directory
        if py3:
            # The csv module wants a text file with newline translation off.
            tsv_file = open(tsv_path, 'w', newline='', encoding='utf-8')
        else:
            # The Python 2 csv module writes byte strings.
            tsv_file = open(tsv_path, 'wb')

        with tsv_file:
            tsv = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')

            # The input is binary; text mode would corrupt it on platforms
            # that translate line endings.
            with open(self.filename, 'rb') as source:
                if self.filename.endswith('.zip'):
                    records = self.read_zip(source)
                else:
                    records = self.read(source)

                for url, data in records:
                    print(url)
                    url_bytes = url.encode("utf-8")
                    # Hash the encoded bytes: hashing the unicode object
                    # fails for non-ASCII URLs (and always on Python 3).
                    digest = hashlib.md5(url_bytes).hexdigest()
                    page_path = os.path.join(self.directory,
                                             "%s.html" % digest)
                    with open(page_path, 'wb') as page:
                        page.write(data)
                    tsv.writerow([digest, url if py3 else url_bytes])
 
def main():
    """Command-line entry point: extract the file given with -f/--file.

    Prints the help text and exits with status 0 when no input file was
    supplied; otherwise builds an EightyLegs for the file and parses it.
    """
    usage_text = 'Usage: %prog -f 19970_20966_a_1.zip or %prog -f 19970_20966_a_1.80'
    option_parser = OptionParser(usage=usage_text)
    option_parser.add_option('-f', '--file', dest='filename', help='input file')

    # Positional arguments are accepted by optparse but not used here.
    parsed_options = option_parser.parse_args()[0]
    input_path = parsed_options.filename

    if input_path is None:
        option_parser.print_help()
        sys.exit(0)

    EightyLegs(input_path).parse()
 
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()

I had issues with the original version, which would break on some scrapes. I'm hoping to pass this code on to the original author so the changes can be merged back.

0 comments

Post new comment

  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>
  • Lines and paragraphs break automatically.
  • You can enable syntax highlighting of source code with the following tags: <code>, <blockcode>, <pre>, <apache>, <c>, <cpp>, <drupal5>, <drupal6>, <java>, <javascript>, <php>, <python>, <ruby>. The supported tag styles are: <foo>, [foo].

More information about formatting options

Drupal theme by Kiwi Themes.