算是一个练习
之前都是用 BeautifulSoup 来提取需要的信息,相对来说操作简单,但是需要安装这个额外的库,而且有时候感觉还是比较慢。
之前也知道 Python 内置了 HTMLParser,可以用来解析 HTML,但是一直没用过,这次就用它来做。
关键目标
- 尽量保持原有对齐
- mediainfo分离
- 图片
实现代码
直接上代码了。
from html.parser import HTMLParser
class THTMLParser(HTMLParser):
    """Collect the text found inside the ``<div id="kdescr">`` element.

    Feed the page with :meth:`feed`; the extracted pieces are then available
    in ``record_data`` (a list of strings, one entry per text chunk or image).

    While recording:

    * text nodes are appended to ``record_data`` (``\\xa0`` becomes a space);
    * text following an ``<x>`` start tag is glued onto the previous entry;
    * ``<legend>`` contents are skipped;
    * ``<fieldset>`` blocks and ``<hr>``-delimited sections are wrapped in
      ``[quote]`` / ``[/quote]``;
    * ``<img src=...>`` becomes ``[img]...[/img]``;
    * two consecutive ``<br>`` tags add a trailing newline to the last entry.

    Attributes:
        brflag: 1 when the most recent relevant event was a ``<br>``, else 0.
        qflag: quote state - ``None``, ``"quote_start"`` (the next text opens
            a quote) or ``"find_next"`` (a quote is open and awaits its end).
        imgtag: unused; kept so existing attribute access keeps working.
        xtag: True after an ``<x>`` start tag until the next recorded text.
        recording: True while text should be captured.
        record_step: nesting depth of ``div``/``fieldset`` inside the target.
        record_data: the extracted strings.
    """

    # Tags whose start tag raises the nesting depth and whose end tag lowers it.
    _DEPTH_TAGS = ("div", "fieldset")
    # Void elements; handled identically whether written <br> or <br />.
    _VOID_TAGS = ("br", "hr", "img")
    # Stand-alone headings that get a newline put in front of them.
    _SECTION_TITLES = ("Video", "Audio", "Other")

    def __init__(self):
        super().__init__()
        self.brflag = 0
        self.qflag = None
        self.imgtag = None
        self.xtag = False
        self.recording = False
        self.record_step = 0  # depth reached inside the extracted element
        self.record_data = []

    def handle_starttag(self, tag, attrs):
        """Track nesting depth and switch recording on at the target div."""
        if tag in self._VOID_TAGS:
            # "<br>" carries the same meaning as "<br />".
            self.handle_startendtag(tag, attrs)
            return
        if tag == "legend":
            # Skip the legend text; recording resumes at </legend>.
            self.recording = False
        elif tag == "x":
            self.xtag = True
        elif tag == "div":
            if self.recording:
                self.record_step += 1
            elif any(k == "id" and v == "kdescr" for k, v in attrs):
                self.recording = True  # start recording
                self.record_step += 1
        elif tag == "fieldset":
            if self.recording:
                self.record_step += 1
                if self.qflag is None:
                    self.qflag = "quote_start"

    def handle_endtag(self, tag):
        """Lower the nesting depth and close an open fieldset quote."""
        if tag == "legend":
            # Only resume when the legend sat inside the target element;
            # a legend elsewhere on the page must not start recording.
            if self.record_step > 0:
                self.recording = True
            return
        # Only the tags that raised the depth may lower it; inline tags such
        # as </b> or </span> must not end the recording early.
        if self.recording and tag in self._DEPTH_TAGS:
            self.record_step -= 1
            # Back at the starting tag: the extracted part is finished.
            if self.record_step == 0:
                self.recording = False
        if tag == "fieldset" and self.qflag == "find_next":
            self.record_data[-1] += "[/quote]"
            self.qflag = None

    def handle_startendtag(self, tag, attrs):
        """Handle ``<br />``, ``<hr />`` and ``<img />`` while recording."""
        if not self.recording:
            return
        if tag == "br":
            # A second consecutive <br> turns into a blank line by ending the
            # previous entry with a newline (at most one).
            if self.brflag and self.record_data:
                last = self.record_data[-1]
                if last and not last.endswith("\n"):
                    self.record_data[-1] = last + "\n"
            self.brflag = 1
        elif tag == "hr":
            # <hr> toggles a quote: the first arms it, the next one closes it.
            if self.qflag is None:
                self.qflag = "quote_start"
            elif self.qflag == "find_next":
                self.record_data[-1] += "[/quote]"
                self.qflag = None
        elif tag == "img":
            for key, value in attrs:
                # Skip a valueless src so "[img]None[/img]" is never emitted.
                if key == "src" and value:
                    self.record_data.append(f"[img]{value}[/img]")

    def handle_data(self, data: str):
        """Store a text node when recording is active."""
        if not self.recording:
            return
        # U+3000 (full-width space) is deliberately left untouched because it
        # is used for alignment.
        text = data.strip("\n\t").replace("\xa0", " ")
        if text == "":
            return
        if self.brflag == 1:
            self.brflag = 0
        # Gluing needs a previous entry; without one, fall back to appending.
        glue = self.xtag and bool(self.record_data)
        self.xtag = False
        if glue:
            self.record_data[-1] += text
            return
        if self.qflag == "quote_start":
            text = "[quote]" + text
            self.qflag = "find_next"
        if text in self._SECTION_TITLES:
            text = "\n" + text
        self.record_data.append(text)

    def handle_comment(self, data):
        """Ignore HTML comments."""

    def handle_entityref(self, name):
        """Ignore named character references."""

    def handle_charref(self, name):
        """Ignore numeric character references."""
def main(source_path="torrent.html", target_path="torrent.md"):
    """Extract the description from *source_path* and write it to *target_path*.

    Both files are read and written as UTF-8. The extracted pieces are joined
    with newlines.
    """
    with open(source_path, "r", encoding="utf-8") as f:
        content = f.read()
    parser = THTMLParser()
    parser.feed(content)
    # Flush text the parser may still be buffering at end of input.
    parser.close()
    with open(target_path, "w", encoding="utf-8") as f:
        # The character count returned by write() is not needed.
        f.write("\n".join(parser.record_data))


if __name__ == "__main__":
    main()