feat: 新增Vol07章节示例源码

This commit is contained in:
100gle
2022-05-12 11:18:31 +08:00
parent 29cf2bd5a9
commit fead78da03

280
code/07/regexp.ipynb Normal file
View File

@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 正则表达式基本语法"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 字符"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 精确匹配\n",
"\n",
"import re\n",
"\n",
"text = \"Hello, world\"\n",
"pattern = \"world\"\n",
"print(re.findall(pattern, text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 模糊匹配\n",
"\n",
"import re\n",
"\n",
"text = \"Hello, world\"\n",
"pattern = \".\"\n",
"print(re.findall(pattern, text))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 量词"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"text = \"fooraskdsororaskaaaaadsooo\"\n",
"print(\n",
" \"{n} pattern: \", re.findall(r\"a{3}\", text), \"\\n\",\n",
" \"{n,} pattern: \", re.findall(r\"a{1,}\", text), \"\\n\",\n",
" \"{n, m} pattern: \", re.findall(r\"a{1,3}\", text), \"\\n\",\n",
" sep=\"\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"text = \"fooraskdsororaskaaaaadsooo\"\n",
"print(\n",
" \"? pattern: \", re.findall(r\"a?\", text), \"\\n\",\n",
" \"* pattern: \", re.findall(r\"a*\", text), \"\\n\",\n",
" \"+ pattern: \", re.findall(r\"a+\", text), \"\\n\",\n",
" sep=\"\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 条件"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# anchor\n",
"import re\n",
"\n",
"poetry = \"\"\"The Zen of Python\n",
"Beautiful is better than ugly.\n",
"Explicit is better than implicit.\n",
"Simple is better than complex.\n",
"Complex is better than complicated.\n",
"Flat is better than nested.\n",
"Sparse is better than dense.\n",
"\"\"\"\n",
"\n",
"print(f'Start with \"S\": {re.findall(r\"^S.*\", poetry, re.MULTILINE)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Anti Characters:', re.findall(r\"[^\\nA-Z].*\", poetry, re.MULTILINE))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f'End with \"ed.\": {re.findall(r\".*ed.$\", poetry, re.MULTILINE)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# or\n",
"\n",
"import re\n",
"\n",
"text = \"\"\"\n",
"http://www.google.com\n",
"https://www.google.com\n",
"\"\"\"\n",
"\n",
"print(f\"Match http or https url: {re.findall(r'http.*|https.*', text)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# group and sub string\n",
"\n",
"import re\n",
"\n",
"poetry = \"\"\"The Zen of Python\n",
"Beautiful is better than ugly.\n",
"Explicit is better than implicit.\n",
"Simple is better than complex.\n",
"Complex is better than complicated.\n",
"Flat is better than nested.\n",
"Sparse is better than dense.\n",
"\"\"\"\n",
"groups = re.findall(r\"([A-Z].*?) is better than (.*).\", poetry, re.MULTILINE)\n",
"for group in groups:\n",
" start = group[0].lower()\n",
" end = group[1]\n",
" print(f\"{start} -> {end}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 在 Python 中使用正则表达式"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"poetry = \"\"\"The Zen of Python\n",
"Beautiful is better than ugly.\n",
"Explicit is better than implicit.\n",
"Simple is better than complex.\n",
"Complex is better than complicated.\n",
"Flat is better than nested.\n",
"Sparse is better than dense.\n",
"\"\"\"\n",
"\n",
"print(f're.match: ', re.match(r'^[Tt]he.*', poetry, re.MULTILINE).group())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f're.search: ' , re.search(r'(?P<beautiful>^B.+) is', poetry, re.MULTILINE).group(\"beautiful\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f're.findall: ', re.findall(r'(?P<Title>^[A-Z].+) is', poetry, re.MULTILINE))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f're.split: ', re.split(r' is better than |[\\n\\.]', poetry))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f're.sub: ', re.sub(r'\\s?is better than\\s?', ' -> ', poetry))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pattern object\n",
"\n",
"import re\n",
"\n",
"dates = \"\"\"\n",
"1999/12/31\n",
"2000/1/1\n",
"2000/1/2\n",
"2000/1/3\n",
"2000/1/4\n",
"2000/1/5\n",
"\"\"\"\n",
"\n",
"pattern = re.compile(r'(?P<year>\\d{4})/(?P<month>\\d{1,2})/(?P<day>\\d{1,2})', re.MULTILINE)\n",
"for group in pattern.finditer(dates):\n",
" print(group.groupdict())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}