两种markdown文件解析方法

我常用Markdown写东西。写比较大的项目时,比如写一本包含很多章节和栏目的书,可能要用Python做一些自动处理工作,如给所有儿歌中的难字加标记,统一处理文件里涉及的所有图片,统计分散在各处的某个栏目的情况,等等。这时需要先对Markdown文件进行解析,获得元素树(token/element tree),以便后续处理。下面是两种实现方法:第一种更可靠;第二种方法,做过爬虫的人会比较熟悉。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import mistune
from lxml import etree
from IPython.display import display

# Sample Markdown document parsed by every example that follows.
# It is a raw string so the LaTeX backslashes (\pm, \sqrt, \over) are kept
# literally instead of being read as invalid escape sequences.
# The "berfor" spelling is left untouched: it is runtime data and the
# recorded outputs reproduce it.
markdown_data = r"""
# 标题1-1

TEXT after title1-1
TEXT after title1-2

* title1-1 li1
* title1-1 li2

1. title1-1 li1
1. title1-1 li2

TEXT berfor image ![image](img_url.png) TEXT after image

[GitHub](https://github.com/gera2ld/markmap)

**inline** ~~text~~ *styles*

`inline code`

Katex - $x = {-b \pm \sqrt{b^2-4ac} \over 2a}$

```js
console.log('code block');
```

## title2-1

## title2-2

# title1-2
"""
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
</div>

</div>
<div class="cell border-box-sizing text_cell rendered" markdown="1">
<div class="inner_cell" markdown="1">
<div class="text_cell_render border-box-sizing rendered_html" markdown="1">
 # 通过扩展mistune.Renderer获取tokenTree
</div>
</div>
</div>
<div class="cell border-box-sizing code_cell rendered" markdown="1">
<div class="input">

```python

class TokenTreeRenderer(mistune.Renderer):
    """Renderer that records mistune's render calls instead of emitting HTML.

    Any attribute that is not defined directly on this class resolves to a
    stand-in callable returning ``[(name, args, kwargs)]``.  Because
    ``placeholder()`` returns a list, the rendered pieces are lists as well,
    so parsing yields a nested list of ``(name, args, kwargs)`` tuples.
    """

    # Must be defined on this class: ``__getattribute__`` below only lets
    # through names found in this class's own ``__dict__`` (original note:
    # "options is required").
    options = {}

    def placeholder(self):
        """Return the empty container that rendered pieces are added to."""
        return []

    def __getattribute__(self, name):
        """Return the real attribute for own/special names, else a recorder.

        Args:
            name: Attribute name being looked up on the renderer instance.

        Returns:
            The genuine attribute when ``name`` is defined on
            ``TokenTreeRenderer`` itself or is a special ``__dunder__`` name;
            otherwise a callable that returns ``[(name, args, kwargs)]``.

        Raises:
            AttributeError: For a special name that does not exist.
        """
        # Special names (``__class__``, ``__dict__``, ``__reduce_ex__`` ...)
        # keep their normal meaning; otherwise introspection, copying and
        # debuggers would receive a fake method instead of the real object.
        is_special = name.startswith("__") and name.endswith("__")
        if is_special or name in TokenTreeRenderer.__dict__:
            return object.__getattribute__(self, name)

        def fake_method(*args, **kwargs):
            # Record the call itself rather than rendering anything.
            return [(name, args, kwargs)]

        return fake_method

# Parse the sample with the recording renderer: the result is the token tree
# (a nested list) rather than an HTML string.
markdown = mistune.Markdown(renderer=TokenTreeRenderer())
tokenTree = markdown.render(markdown_data)  # equivalent: markdown(markdown_data)
display(tokenTree)
[('header', ([('text', ('标题1-1',), {})], 1, '标题1-1'), {}),
 ('paragraph',
  ([('text', ('TEXT after title1-1\nTEXT after title1-2',), {})],),
  {}),
 ('list',
  ([('list_item', ([('text', ('title1-1 li1',), {})],), {}),
    ('list_item', ([('text', ('title1-1 li2',), {})],), {})],
   False),
  {}),
 ('list',
  ([('list_item', ([('text', ('title1-1 li1',), {})],), {}),
    ('list_item', ([('text', ('title1-1 li2',), {})],), {})],
   True),
  {}),
 ('paragraph',
  ([('text', ('TEXT berfor image ',), {}),
    ('image', ('img_url.png', None, 'image'), {}),
    ('text', (' TEXT after image',), {})],),
  {}),
 ('paragraph',
  ([('link',
     ('https://github.com/gera2ld/markmap', None, [('text', ('GitHub',), {})]),
     {})],),
  {}),
 ('paragraph',
  ([('double_emphasis', ([('text', ('inline',), {})],), {}),
    ('text', (' ',), {}),
    ('strikethrough', ([('text', ('text',), {})],), {}),
    ('text', (' ',), {}),
    ('emphasis', ([('text', ('styles',), {})],), {})],),
  {}),
 ('paragraph', ([('codespan', ('inline code',), {})],), {}),
 ('paragraph',
  ([('text', ('Katex - $x = {-b ',), {}),
    ('text', ('\\pm ',), {}),
    ('text', ('\\sqrt{b^2-4ac} ',), {}),
    ('text', ('\\over 2a}$',), {})],),
  {}),
 ('block_code', ("console.log('code block');", 'js'), {}),
 ('header', ([('text', ('title2-1',), {})], 2, 'title2-1'), {}),
 ('header', ([('text', ('title2-2',), {})], 2, 'title2-2'), {}),
 ('header', ([('text', ('title1-2',), {})], 1, 'title1-2'), {})]

# 或通过lxml获取元素树

1
2
3
4
# With the default renderer the same input becomes an HTML string.
# One-call shortcut: html = mistune.markdown(markdown_data)
markdown = mistune.Markdown()
html = markdown.render(markdown_data)
print(html)
<h1>标题1-1</h1>
<p>TEXT after title1-1
TEXT after title1-2</p>
<ul>
<li>title1-1 li1</li>
<li>title1-1 li2</li>
</ul>
<ol>
<li>title1-1 li1</li>
<li>title1-1 li2</li>
</ol>
<p>TEXT berfor image <img src="img_url.png" alt="image"> TEXT after image</p>
<p><a href="https://github.com/gera2ld/markmap">GitHub</a></p>
<p><strong>inline</strong> <del>text</del> <em>styles</em></p>
<p><code>inline code</code></p>
<p>Katex - $x = {-b \pm \sqrt{b^2-4ac} \over 2a}$</p>
<pre><code class="lang-js">console.log(&#39;code block&#39;);
</code></pre>
<h2>title2-1</h2>
<h2>title2-2</h2>
<h1>title1-2</h1>


1
2
3
# Element tree: parse the HTML string with lxml; ``root`` is the <html> element.
root = etree.HTML(html)
print(root)
<Element html at 0x1c6dacc29c0>

1
2
3
# Element tags: root[0] is <body>, so this lists the tag of every top-level
# block of the rendered document.
tags = [child.tag for child in root[0]]
print(tags)
['h1', 'p', 'ul', 'ol', 'p', 'p', 'p', 'p', 'p', 'pre', 'h2', 'h2', 'h1']

1
2
# Element text: root[0][1] is the second child of <body>, i.e. the first <p>.
print(root[0][1].text)
TEXT after title1-1
TEXT after title1-2

1
2
3
# Serialize the tree: with encoding='utf-8', tostring() returns bytes.
print(etree.tostring(root, encoding='utf-8'))
# method="text" keeps only the text content and drops the tags.
print(etree.tostring(root, method="text", encoding='utf-8', pretty_print=True))
b'<html><body><h1>\xe6\xa0\x87\xe9\xa2\x981-1</h1>\n<p>TEXT after title1-1\nTEXT after title1-2</p>\n<ul>\n<li>title1-1 li1</li>\n<li>title1-1 li2</li>\n</ul>\n<ol>\n<li>title1-1 li1</li>\n<li>title1-1 li2</li>\n</ol>\n<p>TEXT berfor image <img src="img_url.png" alt="image"/> TEXT after image</p>\n<p><a href="https://github.com/gera2ld/markmap">GitHub</a></p>\n<p><strong>inline</strong> <del>text</del> <em>styles</em></p>\n<p><code>inline code</code></p>\n<p>Katex - $x = {-b \\pm \\sqrt{b^2-4ac} \\over 2a}$</p>\n<pre><code class="lang-js">console.log(\'code block\');\n</code></pre>\n<h2>title2-1</h2>\n<h2>title2-2</h2>\n<h1>title1-2</h1>\n</body></html>'
b"\xe6\xa0\x87\xe9\xa2\x981-1\nTEXT after title1-1\nTEXT after title1-2\n\ntitle1-1 li1\ntitle1-1 li2\n\n\ntitle1-1 li1\ntitle1-1 li2\n\nTEXT berfor image  TEXT after image\nGitHub\ninline text styles\ninline code\nKatex - $x = {-b \\pm \\sqrt{b^2-4ac} \\over 2a}$\nconsole.log('code block');\n\ntitle2-1\ntitle2-2\ntitle1-2\n"

1
2
3
4
5
6
7
8
# Find nodes with XPath
display(root.xpath("string()")) # all text as one string # lxml.etree only!
display(root.xpath("//text()")) # list of text nodes # lxml.etree only!

# Same query as above, through a precompiled XPath object
build_text_list = etree.XPath("//text()")
path = build_text_list(root)
print(path)
"标题1-1\nTEXT after title1-1\nTEXT after title1-2\n\ntitle1-1 li1\ntitle1-1 li2\n\n\ntitle1-1 li1\ntitle1-1 li2\n\nTEXT berfor image  TEXT after image\nGitHub\ninline text styles\ninline code\nKatex - $x = {-b \\pm \\sqrt{b^2-4ac} \\over 2a}$\nconsole.log('code block');\n\ntitle2-1\ntitle2-2\ntitle1-2\n"
['标题1-1',
 '\n',
 'TEXT after title1-1\nTEXT after title1-2',
 '\n',
 '\n',
 'title1-1 li1',
 '\n',
 'title1-1 li2',
 '\n',
 '\n',
 '\n',
 'title1-1 li1',
 '\n',
 'title1-1 li2',
 '\n',
 '\n',
 'TEXT berfor image ',
 ' TEXT after image',
 '\n',
 'GitHub',
 '\n',
 'inline',
 ' ',
 'text',
 ' ',
 'styles',
 '\n',
 'inline code',
 '\n',
 'Katex - $x = {-b \\pm \\sqrt{b^2-4ac} \\over 2a}$',
 '\n',
 "console.log('code block');\n",
 '\n',
 'title2-1',
 '\n',
 'title2-2',
 '\n',
 'title1-2',
 '\n']
['标题1-1', '\n', 'TEXT after title1-1\nTEXT after title1-2', '\n', '\n', 'title1-1 li1', '\n', 'title1-1 li2', '\n', '\n', '\n', 'title1-1 li1', '\n', 'title1-1 li2', '\n', '\n', 'TEXT berfor image ', ' TEXT after image', '\n', 'GitHub', '\n', 'inline', ' ', 'text', ' ', 'styles', '\n', 'inline code', '\n', 'Katex - $x = {-b \\pm \\sqrt{b^2-4ac} \\over 2a}$', '\n', "console.log('code block');\n", '\n', 'title2-1', '\n', 'title2-2', '\n', 'title1-2', '\n']

1
2
3
4
5
6
7
# Get the parent element of an XPath text result
print(path[0])
print(path[0].getparent().tag)

print(path[0].is_text) # True when the string is an element's .text
print(path[1].is_text)
print(path[1].is_tail) # True when the string is an element's .tail (here the "\n" right after </h1>)
标题1-1
h1
True
False
True

1
2
3
# Walk the whole tree, printing each element's tag and text.
for element in root.iter():
    print(element.tag, element.text, sep=" - ")
html - None
body - None
h1 - 标题1-1
p - TEXT after title1-1
TEXT after title1-2
ul - 

li - title1-1 li1
li - title1-1 li2
ol - 

li - title1-1 li1
li - title1-1 li2
p - TEXT berfor image 
img - None
p - None
a - GitHub
p - None
strong - inline
del - text
em - styles
p - None
code - inline code
p - Katex - $x = {-b \pm \sqrt{b^2-4ac} \over 2a}$
pre - None
code - console.log('code block');

h2 - title2-1
h2 - title2-2
h1 - title1-2

1
2
3
# Filter while iterating: only <h1> elements are visited.
for heading in root.iter("h1"):
    print(heading.tag, heading.text, sep=" - ")
h1 - 标题1-1
h1 - title1-2

1
2
3
# Filter on several tags: an element is visited when it is an <h1> OR a <p>.
for node in root.iter("h1", "p"):
    print(node.tag, node.text, sep=" - ")
h1 - 标题1-1
p - TEXT after title1-1
TEXT after title1-2
p - TEXT berfor image 
p - None
p - None
p - None
p - Katex - $x = {-b \pm \sqrt{b^2-4ac} \over 2a}$
h1 - title1-2

1
2
3
4
5
6
7
8
# Finding descendant elements

# A bare tag name matches direct children only; find() returns None when
# nothing matches, whereas findall() returns an empty list.
print(root.find("h2"))
print(root.find(".//h2"))  # ".//" searches at any depth; first match only
print(root.find(".//h2"))  # repeating the lookup returns the same element
print(list(root.iterfind(".//h2")))  # iterfind() iterates over the matches
print(root.findall(".//h2"))  # all matches as a list
print(root.findall(".//h1[@x]"))  # attribute predicate: <h1> elements that have an "x" attribute
None
<Element h2 at 0x1c6dae19380>
<Element h2 at 0x1c6dae19380>
[<Element h2 at 0x1c6dae19380>, <Element h2 at 0x1c6dae193c0>]
[<Element h2 at 0x1c6dae19380>, <Element h2 at 0x1c6dae193c0>]
[]