"""Extract META tag information (and the page title) from HTML documents."""

import string

try:
    # Python 2: the SGML-based parser this module was originally written for.
    from htmllib import HTMLParser
    from formatter import NullFormatter

    class _BaseParser(HTMLParser):
        """htmllib parser constructed with a NullFormatter.

        htmllib itself dispatches <meta> tags to ``do_meta`` and stores the
        document title in ``self.title``.
        """

        def __init__(self):
            HTMLParser.__init__(self, NullFormatter())

except ImportError:
    # Python 3: htmllib no longer exists, so build the same two features
    # (do_meta dispatch and a ``title`` attribute) on top of html.parser.
    from html.parser import HTMLParser

    class _BaseParser(HTMLParser):
        """html.parser shim providing ``do_meta`` dispatch and ``title``."""

        def __init__(self):
            HTMLParser.__init__(self)
            self.title = None
            # A list while inside <title>...</title>, otherwise None.
            self._title_chunks = None

        def handle_starttag(self, tag, attrs):
            if tag == "meta":
                self.do_meta(attrs)
            elif tag == "title":
                self._title_chunks = []

        def handle_data(self, data):
            if self._title_chunks is not None:
                self._title_chunks.append(data)

        def handle_endtag(self, tag):
            if tag == "title" and self._title_chunks is not None:
                # Collapse runs of whitespace, as htmllib does for the title.
                self.title = " ".join("".join(self._title_chunks).split())
                self._title_chunks = None


class MetaParser(_BaseParser):
    """HTML parser that collects META name/content pairs.

    After feeding a document, ``meta_dict`` maps each lower-cased META
    ``name`` to its ``content`` value, and ``title`` holds the document
    title (``None`` when no TITLE element was seen).
    """

    def __init__(self):
        _BaseParser.__init__(self)
        self.meta_dict = {}

    def do_meta(self, attrs):
        """Record one META tag.

        ``attrs`` is a list of ``(key, value)`` 2-tuples. The tag is stored
        only when both a non-empty ``name`` and a non-empty ``content``
        attribute are present.
        """
        name = content = None
        for key, value in attrs:
            if key == "name":
                # html.parser reports valueless attributes as None.
                name = value.lower() if value else None
            elif key == "content":
                content = value
        if name and content:
            self.meta_dict[name] = content


def getmeta(file):
    """Return the META tags of the HTML document at path ``file`` as a dict.

    Keys are lower-cased META names and values are the matching ``content``
    strings. When the document has a title it is added under the key
    ``"title"``.

    Raises whatever ``open`` raises if the file cannot be opened.
    """
    parser = MetaParser()
    # The original read from an undefined name (``fp``) and never closed
    # the file; read from the handle that was actually opened.
    with open(file) as f:
        while True:
            chunk = f.read(10000)
            if not chunk:
                break
            parser.feed(chunk)
    parser.close()
    # The title is extracted by the base class.
    if parser.title:
        parser.meta_dict["title"] = parser.title
    return parser.meta_dict