from html.parser import HTMLParser
from html.entities import name2codepoint
class MyHTMLParser(HTMLParser):
    """Trace HTML parse events and collect the text of specially styled spans.

    Every handler prints the event it receives.  In addition, text that
    appears after a ``<span>`` start tag whose ``style`` attribute equals
    ``TARGET_STYLE`` exactly, and before the next ``</span>`` end tag, is
    appended to ``anchors``.

    NOTE(review): per the ``html.parser`` documentation, when
    ``convert_charrefs`` is true (the default on Python 3.5+) the parser
    converts character references itself and does not call
    ``handle_entityref`` / ``handle_charref`` from ``feed()``; they are kept
    so the class also behaves sensibly when they are invoked.
    """

    # Exact ``style`` attribute value that switches recording on.  The match
    # is a plain string comparison, so whitespace and ordering must agree.
    TARGET_STYLE = 'color: #263645; font-weight: normal;'

    def __init__(self):
        super().__init__()
        # Text chunks received while ``record`` was set, in document order.
        self.anchors = []
        # True from a matching <span> start tag until the next </span>.
        self.record = False

    def handle_starttag(self, tag, attrs):
        """Print the tag and its attributes; start recording on a target span.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser.
        """
        print("Start tag:", tag)
        for attr in attrs:
            print(" attr:", attr)
        # attrs holds (name, value) tuples, so a membership test is the same
        # check as scanning for a ``style`` attribute with the target value.
        if tag == "span" and ('style', self.TARGET_STYLE) in attrs:
            self.record = True

    def handle_endtag(self, tag):
        """Print the end tag; any ``</span>`` (even a nested one) stops recording."""
        print("End tag :", tag)
        if tag == "span":
            self.record = False

    def handle_data(self, data):
        """Print the text chunk and keep it when recording is active."""
        print("Data :", data)
        if self.record:
            self.anchors.append(data)

    def handle_comment(self, data):
        """Print the content of an HTML comment."""
        print("Comment :", data)

    def handle_entityref(self, name):
        """Print the character for a named entity such as ``gt``.

        Names missing from ``name2codepoint`` are echoed back as ``&name;``
        instead of raising ``KeyError``.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            c = "&{};".format(name)
        else:
            c = chr(codepoint)
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference such as ``62`` or ``x3E``.

        Both ``x`` and ``X`` hexadecimal prefixes are accepted.  A reference
        that is not a valid number or code point is echoed back as
        ``&#name;`` instead of raising.
        """
        try:
            if name.startswith(('x', 'X')):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            # int() rejects malformed digits; chr() rejects out-of-range values.
            c = "&#{};".format(name)
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print a declaration such as a doctype."""
        print("Decl :", data)
# Parse the local file "1.html" and print the text collected from the
# target <span> elements.  The context manager closes the file even if
# reading or parsing raises.
with open("1.html") as f:
    p = MyHTMLParser()
    p.feed(f.read())
# close() makes the parser process any data it is still buffering.
p.close()
print(p.anchors)
Result
['Date', 'Scan Information']
Reference:
0 意見:
張貼留言