from htmllib import HTMLParser
from formatter import NullFormatter
import string
class MetaParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self, NullFormatter())
self.meta_dict = {}
def do_meta(self, attrs):
# this method is called for META tags
name = content = None
# attrs is a list of 2-tuples
for k, v in attrs:
if k == "name":
name = string.lower(v)
elif k == "content":
content = v
if name and content:
self.meta_dict[name] = content
def getmeta(file):
    """Extract META tags (and the title) from an HTML document.

    ``file`` is the name of the file to open.  Returns a dictionary
    mapping lower-cased META names to their content; when the document
    has a title, it is stored under the ``"title"`` key.  Errors raised
    by ``open()`` or while reading propagate to the caller.
    """
    p = MetaParser()
    f = open(file)
    # try/finally guarantees the file is closed even if parsing fails.
    try:
        # Feed the parser in chunks so large documents are never held
        # in memory all at once.
        while True:
            s = f.read(10000)
            if not s:
                break
            p.feed(s)
    finally:
        f.close()
    # Flush any data still buffered inside the parser.
    p.close()
    # the title tag is extracted by the base class
    if p.title:
        p.meta_dict["title"] = p.title
    return p.meta_dict