80Legs file extractor
#!/usr/bin/python
# Original code: http://github.com/turian/py80legsformat by Joseph Turian
#
# Extracts the pages stored in an 80legs result file (a raw ``.80`` file or a
# ``.zip`` archive of ``.80`` files).  Each page is written to
# ``<name>/<md5 of url>.html`` and ``<name>.tsv`` maps each MD5 to its URL.
#
# File layout as read by this script (4-byte ints in native byte order):
#   classID, versionID, then repeated (URLSIZE, url bytes, DATASIZE, data bytes)

import csv
import hashlib
import os
import struct
import sys
import zipfile

try:
    from cStringIO import StringIO  # Python 2: in-memory byte buffer
except ImportError:
    from io import BytesIO as StringIO  # Python 3 equivalent

from optparse import OptionParser

_PY2 = sys.version_info[0] == 2

# "=" selects native byte order with *standard* sizes, so an int is always
# 4 bytes and no run-time calcsize() check is required.
_INT = struct.Struct("=i")
_HEADER = struct.Struct("=ii")

# (classID, versionID) expected at the start of every .80 file.
_MAGIC = (218217067, 1)


def _read_exact(stream, size, what):
    """Read exactly *size* bytes from *stream*; raise ValueError otherwise."""
    if size < 0:
        raise ValueError("corrupt 80legs file: negative %s size %d"
                         % (what, size))
    chunk = stream.read(size)
    if len(chunk) != size:
        raise ValueError("truncated 80legs file: expected %d bytes of %s, "
                         "got %d" % (size, what, len(chunk)))
    return chunk


def _read_size(stream):
    """Return the next 4-byte size field, or None at a clean end of file."""
    raw = stream.read(_INT.size)
    if not raw:
        return None
    if len(raw) != _INT.size:
        raise ValueError("truncated 80legs file: incomplete size field")
    return _INT.unpack(raw)[0]


def _open_tsv(path):
    """Open *path* for the csv module in the way each Python version needs."""
    if _PY2:
        # The Python 2 csv module writes byte strings.
        return open(path, "wb")
    # newline="" stops the text layer from translating the "\n" terminator.
    return open(path, "w", newline="", encoding="utf-8")


class EightyLegs:
    """Reader/extractor for one 80legs result file.

    Attributes:
        filename: path of the input ``.80`` or ``.zip`` file.
        directory: output directory; the input path with everything after
            the first dot of its base name removed.
    """

    def __init__(self, filename):
        self.filename = filename
        # Only strip the extension from the base name: splitting the whole
        # path on "." breaks for paths such as "./x.80" or "/data/v1.2/x.80".
        parent, base = os.path.split(filename)
        stem = base.split('.')[0] or os.path.splitext(base)[0]
        self.directory = os.path.join(parent, stem)

    def read(self, file):
        """Yield ``(url, data)`` for every record in a binary ``.80`` stream.

        Args:
            file: file-like object opened in binary mode.

        Yields:
            Tuples of the UTF-8 decoded URL and the raw page bytes.

        Raises:
            ValueError: if the header is not the expected classID/versionID
                pair or a record is truncated.
        """
        header = file.read(_HEADER.size)
        if len(header) != _HEADER.size or _HEADER.unpack(header) != _MAGIC:
            raise ValueError("not an 80legs file: unexpected header")

        while True:
            url_size = _read_size(file)
            if url_size is None:
                return  # clean EOF between records
            url = _read_exact(file, url_size, "URL").decode("utf-8")

            data_size = _read_size(file)
            if data_size is None:
                raise ValueError("truncated 80legs file: no data for %r"
                                 % (url,))
            # Keep the payload as raw bytes; it is written back verbatim.
            data = _read_exact(file, data_size, "data")
            yield (url, data)

    def read_zip(self, file):
        """Yield ``(url, data)`` from every ``.80`` member of a zip archive.

        Args:
            file: path or binary file-like object accepted by ZipFile.
        """
        archive = zipfile.ZipFile(file, 'r')
        try:
            for info in archive.infolist():
                if info.filename.endswith('.80'):
                    member = StringIO(archive.read(info.filename))
                    for record in self.read(member):
                        yield record
        finally:
            archive.close()

    def parse(self):
        """Extract every page of ``self.filename`` to disk.

        Creates ``self.directory`` if needed, writes one ``<md5>.html`` file
        per record into it, writes ``<directory>.tsv`` with ``md5<TAB>url``
        rows, and prints each URL as it is processed.
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        if self.filename.endswith('.zip'):
            reader = self.read_zip
        else:
            reader = self.read

        # Binary mode for the input: the format is length-prefixed bytes.
        with open(self.filename, 'rb') as source, \
                _open_tsv("%s.tsv" % self.directory) as index:
            tsv = csv.writer(index, delimiter='\t', lineterminator='\n')
            for url, data in reader(source):
                print(url)
                # Hash the UTF-8 bytes explicitly; identical to the old
                # implicit encoding for ASCII URLs, and works for the rest.
                digest = hashlib.md5(url.encode("utf-8")).hexdigest()
                page_path = "%s/%s.html" % (self.directory, digest)
                with open(page_path, 'wb') as page:
                    page.write(data)
                tsv.writerow([digest, url.encode("utf-8") if _PY2 else url])


def main():
    """Command-line entry point: extract the file given with -f/--file."""
    usage = 'Usage: %prog -f 19970_20966_a_1.zip or %prog -f 19970_20966_a_1.80'
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--file', dest='filename', help='input file')
    (options, args) = parser.parse_args()
    if options.filename is None:
        parser.print_help()
        sys.exit(0)
    legs = EightyLegs(options.filename)
    legs.parse()


if __name__ == '__main__':
    main()
I had issues with the original version, which would break on some scrapes. I'm hoping to pass this code on to the original author so it can be merged back.
Drupal theme by Kiwi Themes.









0 comments
Post new comment