# Regular Expression

In [1]:
import re

# Sample document shared by the regular-expression examples in this section.
html = """
<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="UTF-8" />
        <title>Hello, world</title>
    </head>
    <body>
        <h1>从 HTML 提取数据的方式</h1>
        <ul>
            <li>正则表达式</li>
            <li>CSS 选择器</li>
            <li>XPath</li>
        </ul>
    </body>
</html>
"""

# Without re.DOTALL, `.` never matches a newline, so the capture group stays
# on the one line that holds the <title> element.
pattern = r"<title>(.*)</title>"
title_regex = re.compile(pattern)
head_title = title_regex.findall(html)
print(head_title)

['Hello, world']


In [2]:
# Greedy mode: `.*` consumes as much as it can, and with re.DOTALL it also
# crosses newlines, so one match spans from the first <li> to the last </li>.

li_pattern = r"<li>(.*)</li>"
greedy_matches = re.findall(li_pattern, html, flags=re.DOTALL)
print(greedy_matches)

['正则表达式</li>\n            <li>CSS 选择器</li>\n            <li>XPath']


In [3]:
# Non-greedy mode: `.*?` stops at the nearest </li>, so every list item
# becomes its own match.

li_pattern = r"<li>(.*?)</li>"
lazy_regex = re.compile(li_pattern, re.DOTALL)
print(lazy_regex.findall(html))

['正则表达式', 'CSS 选择器', 'XPath']


# CSS Selector

In [4]:
# Uncomment the command below to install beautifulsoup4
# if you haven't installed it yet.

# !pip install beautifulsoup4

In [5]:
from bs4 import BeautifulSoup

# Sample document for the CSS-selector examples: two items share the
# "option" class, one is addressed by id, and one has no attributes.
html = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <title>Hello, world</title>
  </head>
  <body>
    <h1>从 HTML 提取数据的方式有哪些</h1>
    <ul>
      <li class="option">正则表达式</li>
      <li id="css-selector">CSS 选择器</li>
      <li class="option">XPath</li>
      <li>...</li>
    </ul>
  </body>
</html>
"""

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed and emits GuessedAtParserWarning, so the parse could
# differ between environments. "html.parser" ships with Python.
soup = BeautifulSoup(html, "html.parser")

# ID selector: select() always returns a list, here with a single element.
css = soup.select("li#css-selector")
print(css)


[<li id="css-selector">CSS 选择器</li>]


In [6]:
# Class selector: keep only the <li> elements whose class is "option".
option = soup.select("li.option")
print(option)

[<li class="option">正则表达式</li>, <li class="option">XPath</li>]


In [7]:
# Type selector: collect the text content of every <li> element.
li_content = [item.get_text() for item in soup.select("li")]
print(li_content)

['正则表达式', 'CSS 选择器', 'XPath', '...']


# XPath

In [8]:
# Uncomment the command below to install lxml
# if you haven't installed it yet.

# !pip install lxml

In [9]:
from lxml import etree

# Sample document for the XPath examples (same structure as the CSS section).
html = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <title>Hello, world</title>
  </head>
  <body>
    <h1>从 HTML 提取数据的方式有哪些</h1>
    <ul>
      <li class="option">正则表达式</li>
      <li id="css-selector">CSS 选择器</li>
      <li class="option">XPath</li>
      <li>...</li>
    </ul>
  </body>
</html>
"""

# Parse the markup into an element tree that XPath queries can run against.
tree = etree.HTML(html)

# Serialization note: etree.tounicode() is deprecated; the documented
# replacement is etree.tostring(..., encoding="unicode"), which returns the
# same text. Each serialized element still ends with its tail (the whitespace
# up to the next tag); pass with_tail=False to drop it.

lis = tree.xpath("//li")  # every <li> element, at any depth in the document
print([etree.tostring(li, encoding="unicode") for li in lis])

# Attribute predicate: xpath() returns a list, so take the single match.
css = tree.xpath("//li[@id='css-selector']")[0]
print(etree.tostring(css, encoding="unicode"))

# contains() is a substring test on the class attribute value.
options = tree.xpath("//li[contains(@class, 'option')]")
print([etree.tostring(opt, encoding="unicode") for opt in options])

# text() selects the text nodes directly, yielding plain strings.
li_content = tree.xpath("//li/text()")
print(li_content)


['<li class="option">正则表达式</li>\n      ', '<li id="css-selector">CSS 选择器</li>\n      ', '<li class="option">XPath</li>\n      ', '<li>...</li>\n    ']
<li id="css-selector">CSS 选择器</li>
      
['<li class="option">正则表达式</li>\n      ', '<li class="option">XPath</li>\n      ']
['正则表达式', 'CSS 选择器', 'XPath', '...']


# JSON

In [10]:
import json

# NOTE: this rebinds the built-in `print` to pprint for the rest of the
# notebook; the cells below rely on it for their wrapped output.
from pprint import pprint as print

# Raw response body, kept as text so it can be parsed below.
API_CONTENT = """
{
    "name": "100gle",
    "platform": "sspai",
    "projects": [
        {
            "id": 148,
            "name": "《Python 自学手册》",
            "pubDate": "2020-08-24"
        },
        {
            "id": 271,
            "name": "《100 小时后请叫我程序员》",
            "pubDate": "2022-04-20"
        }
    ]
}
"""

# Decode the JSON text into native Python objects.
JSON = json.loads(s=API_CONTENT)

In [11]:
# json.loads decoded the top-level JSON object into a Python dict.
print(type(JSON))

<class 'dict'>


In [12]:
# `print` is pprint here (aliased at import time), so the list is wrapped
# with one project per line.
print(JSON["projects"])

[{'id': 148, 'name': '《Python 自学手册》', 'pubDate': '2020-08-24'},
 {'id': 271, 'name': '《100 小时后请叫我程序员》', 'pubDate': '2022-04-20'}]
