{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "# Regular Expression" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Hello, world']\n" ] } ], "source": [ "import re\n", "\n", "html = \"\"\"\n", "\n", "\n", " \n", " \n", " Hello, world\n", " \n", " \n", "

从 HTML 提取数据的方式

\n", "

正则表达式
CSS 选择器
XPath

\n", " \n", "\n", "\"\"\"\n", "\n", "\n", "pattern = r\"(.*)\"\n", "head_title = re.findall(pattern, html)\n", "print(head_title)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['正则表达式\\n

CSS 选择器

\\n

XPath']\n" ] } ], "source": [ "# greedy mode\n", "\n", "li_pattern = r\"

(.*)

\"\n", "print(re.findall(li_pattern, html, re.DOTALL))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['正则表达式', 'CSS 选择器', 'XPath']\n" ] } ], "source": [ "# non-greedy mode\n", "\n", "li_pattern = r\"

(.*?)

\"\n", "print(re.findall(li_pattern, html, re.DOTALL))" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "# CSS Selector" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# uncomment the command below to install beautifulsoup\n", "# if you haven't installed it yet\n", "\n", "# !pip install beautifulsoup4" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[

CSS 选择器

]\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "html = \"\"\"\n", "\n", "\n", " \n", " \n", " Hello, world\n", " \n", " \n", "

从 HTML 提取数据的方式有哪些

\n", "

正则表达式
CSS 选择器
XPath
...

\n", " \n", "\n", "\"\"\"\n", "soup = BeautifulSoup(html)\n", "css = soup.select(\"li#css-selector\")\n", "print(css)\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[

正则表达式

XPath

]\n" ] } ], "source": [ "option = soup.select(\"li.option\")\n", "print(option)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['正则表达式', 'CSS 选择器', 'XPath', '...']\n" ] } ], "source": [ "li_content = [li.text for li in soup.select(\"li\")]\n", "print(li_content)" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%%\n" } }, "source": [ "# XPath" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# uncomment the command below to install lxml\n", "# if you haven't installed it yet\n", "\n", "# !pip install lxml" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['

正则表达式

\\n ', '

CSS 选择器

\\n ', '

XPath

\\n ', '

...

\\n ']\n", "

CSS 选择器

\n", " \n", "['

正则表达式

\\n ', '

XPath

\\n ']\n", "['正则表达式', 'CSS 选择器', 'XPath', '...']\n" ] } ], "source": [ "from lxml import etree\n", "\n", "html = \"\"\"\n", "\n", "\n", " \n", " \n", " Hello, world\n", " \n", " \n", "

从 HTML 提取数据的方式有哪些

\n", "

正则表达式
CSS 选择器
XPath
...

\n", " \n", "\n", "\"\"\"\n", "\n", "tree = etree.HTML(html)\n", "\n", "\n", "lis = tree.xpath(\"//li\") # equal to //li\n", "print([etree.tounicode(li) for li in lis])\n", "\n", "css = tree.xpath(\"//li[@id='css-selector']\")[0]\n", "print(etree.tounicode(css))\n", "\n", "options = tree.xpath(\"//li[contains(@class, 'option')]\")\n", "print([etree.tounicode(opt) for opt in options])\n", "\n", "li_content = tree.xpath(\"//li/text()\")\n", "print(li_content)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# JSON" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pprint import pprint as print\n", "API_CONTENT = \"\"\"\n", "{\n", " \"name\": \"100gle\",\n", " \"platform\": \"sspai\",\n", " \"projects\": [\n", " {\n", " \"id\": 148,\n", " \"name\": \"《Python 自学手册》\",\n", " \"pubDate\": \"2020-08-24\"\n", " },\n", " {\n", " \"id\": 271,\n", " \"name\": \"《100 小时后请叫我程序员》\",\n", " \"pubDate\": \"2022-04-20\"\n", " }\n", " ]\n", "}\n", "\"\"\"\n", "\n", "JSON = json.loads(API_CONTENT)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(JSON))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'id': 148, 'name': '《Python 自学手册》', 'pubDate': '2020-08-24'},\n", " {'id': 271, 'name': '《100 小时后请叫我程序员》', 'pubDate': '2022-04-20'}]\n" ] } ], "source": [ "print(JSON[\"projects\"])" ] } ], "metadata": { "interpreter": { "hash": "13977d4cc82dee5f9d9535ceb495bd0ab12a43c33c664e5f0d53c24cf634b67f" }, "kernelspec": { "display_name": "Python 3.9.0 ('pandas-startup')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", "version": "3.9.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }