2020年7月

算是一个练习

之前都是用 BeautifulSoup 来提取需要的信息,相对来说操作简单,但是需要安装这个额外的库,而且有时候感觉还是比较慢。

之前也知道 Python 内置了 HTMLParser,可以用来解析 HTML,但是一直没用过,这次就用它来做。

关键目标

  • 尽量保持原有对齐
  • mediainfo分离
  • 图片

实现代码

直接上代码了。

from html.parser import HTMLParser

class THTMLParser(HTMLParser):
    """Extract the contents of ``<div id="kdescr">`` as BBCode-like fragments.

    Feed HTML with :meth:`feed`; the extracted pieces accumulate in
    ``record_data`` (a list of strings the caller typically joins with
    newlines).  Within the region:

    * text nodes become list entries (``\\xa0`` is replaced by a space,
      full-width spaces are kept because they are used for alignment);
    * ``<img src=...>`` becomes ``[img]...[/img]``;
    * ``<fieldset>`` / ``<hr>`` open a ``[quote]`` that is closed at the
      matching ``</fieldset>`` or the next ``<hr>``;
    * ``<legend>`` content is skipped;
    * two consecutive ``<br>`` add an explicit newline to the last entry;
    * entries equal to ``Video``, ``Audio`` or ``Other`` get a leading
      newline so mediainfo sections are separated.
    """

    # Container tags whose nesting depth decides when the region is closed.
    _DEPTH_TAGS = frozenset(("div", "fieldset"))
    # Void elements that may be written as ``<br>`` as well as ``<br />``.
    _VOID_TAGS = frozenset(("br", "hr", "img"))
    # Section headings that should be preceded by a blank line.
    _SECTION_TITLES = ("Video", "Audio", "Other")

    def __init__(self):
        super().__init__()
        self.brflag = 0            # 1 after a <br> that has not been followed by text yet
        self.qflag = None          # None -> "quote_start" -> "find_next" -> None
        self.imgtag = None         # unused here; kept so the attribute still exists
        self.xtag = False          # True after <x>: next text is glued to the last entry
        self.recording = False     # True while text inside the region is being collected
        self.record_step = 0       # nesting depth of div/fieldset inside the region
        self.record_data = []      # extracted fragments, in document order

    def handle_starttag(self, tag, attrs):
        """Track region entry/depth and quote starts for an opening tag."""
        if tag in self._VOID_TAGS:
            # ``<br>``/``<hr>``/``<img>`` without a trailing slash arrive here;
            # treat them exactly like their ``<br />`` spelling.
            self.handle_startendtag(tag, attrs)
            return
        if tag == "legend":
            # Skip the legend text.
            self.recording = False
        elif tag == "x":
            self.xtag = True
        elif tag == "div":
            if self.recording:
                self.record_step += 1
            elif ("id", "kdescr") in attrs:
                # Entering the region: start recording at depth 1.
                self.recording = True
                self.record_step += 1
        elif tag == "fieldset" and self.recording:
            # Only fieldsets inside the region count; one outside must not
            # arm the quote state for the first recorded text.
            self.record_step += 1
            if self.qflag is None:
                self.qflag = "quote_start"

    def handle_endtag(self, tag):
        """Track depth, leave the region and close quotes for a closing tag."""
        if tag == "legend":
            # Resume only if we are actually inside the region; a legend
            # elsewhere in the page must not switch recording on.
            if self.record_step > 0:
                self.recording = True
            return
        # Only the tags that increased the depth may decrease it, otherwise
        # the first inline closing tag (</b>, </span>, ...) would end the
        # region prematurely.
        if self.recording and tag in self._DEPTH_TAGS:
            self.record_step -= 1
            # Back at the starting tag: the region has been fully traversed.
            if self.record_step == 0:
                self.recording = False
        if tag == "fieldset" and self.qflag == "find_next" and self.record_data:
            self.record_data[-1] += "[/quote]"
            self.qflag = None

    def handle_startendtag(self, tag, attrs):
        """Handle void/self-closing tags (``<tagname />``) inside the region."""
        if not self.recording:
            return
        if tag == "br":
            if self.brflag != 0:
                # Second <br> in a row: make the line break explicit.
                if self.record_data:
                    last = self.record_data[-1]
                    if last and not last.endswith("\n"):
                        self.record_data[-1] = last + "\n"
                self.brflag = 0
            self.brflag += 1
        elif tag == "hr":
            if self.qflag is None:
                self.qflag = "quote_start"
            elif self.qflag == "find_next" and self.record_data:
                self.record_data[-1] += "[/quote]"
                self.qflag = None
        elif tag == "img":
            for key, value in attrs:
                if key == "src":
                    self.record_data.append(f"[img]{value}[/img]")

    def handle_data(self, data: str):
        """Record a text node that lies inside the region."""
        if not self.recording:
            return
        # Full-width spaces (\u3000) are deliberately kept: they align columns.
        text = data.strip("\n\t").replace("\xa0", " ")
        if text == "":
            return
        if self.brflag == 1:
            self.brflag = 0
        if self.xtag and self.record_data:
            self.record_data[-1] += text
        else:
            # Also reached when <x> text arrives before anything was recorded;
            # there is no previous entry to extend, so start a new one.
            if self.qflag == "quote_start":
                text = "[quote]" + text
                self.qflag = "find_next"
            self.record_data.append(text)
        self.xtag = False
        if self.record_data[-1] in self._SECTION_TITLES:
            self.record_data[-1] = "\n" + self.record_data[-1]

    def handle_comment(self, data):
        """Ignore HTML comments."""

    def handle_entityref(self, name):
        """Ignore named character references."""

    def handle_charref(self, name):
        """Ignore numeric character references."""

if __name__ == "__main__":
    # Read the saved page, extract the description, and write it out with one
    # fragment per line.
    with open("torrent.html", "r", encoding="utf-8") as f:
        content = f.read()
    parser = THTMLParser()
    parser.feed(content)
    # Flush any text the parser is still buffering at end of input.
    parser.close()
    with open("torrent.md", "w", encoding="utf-8") as f:
        f.write("\n".join(parser.record_data))

提取效果预览

2020-07-05T10:24:53.png