281 lines
6.0 KiB
Plaintext
281 lines
6.0 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 正则表达式基本语法"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 字符"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 精确匹配\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"text = \"Hello, world\"\n",
|
|
"pattern = \"world\"\n",
|
|
"print(re.findall(pattern, text))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 模糊匹配\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"text = \"Hello, world\"\n",
|
|
"pattern = \".\"\n",
|
|
"print(re.findall(pattern, text))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 量词"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"text = \"fooraskdsororaskaaaaadsooo\"\n",
|
|
"print(\n",
|
|
" \"{n} pattern: \", re.findall(r\"a{3}\", text), \"\\n\",\n",
|
|
" \"{n,} pattern: \", re.findall(r\"a{1,}\", text), \"\\n\",\n",
|
|
" \"{n, m} pattern: \", re.findall(r\"a{1,3}\", text), \"\\n\",\n",
|
|
" sep=\"\",\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"text = \"fooraskdsororaskaaaaadsooo\"\n",
|
|
"print(\n",
|
|
" \"? pattern: \", re.findall(r\"a?\", text), \"\\n\",\n",
|
|
" \"* pattern: \", re.findall(r\"a*\", text), \"\\n\",\n",
|
|
" \"+ pattern: \", re.findall(r\"a+\", text), \"\\n\",\n",
|
|
" sep=\"\",\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 条件"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# anchor\n",
|
|
"import re\n",
|
|
"\n",
|
|
"poetry = \"\"\"The Zen of Python\n",
|
|
"Beautiful is better than ugly.\n",
|
|
"Explicit is better than implicit.\n",
|
|
"Simple is better than complex.\n",
|
|
"Complex is better than complicated.\n",
|
|
"Flat is better than nested.\n",
|
|
"Sparse is better than dense.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"print(f'Start with \"S\": {re.findall(r\"^S.*\", poetry, re.MULTILINE)}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Inside brackets a leading ^ negates the class; the pattern has no ^/$ anchor, so re.MULTILINE is not needed.\n",
"print('Anti Characters:', re.findall(r\"[^\\nA-Z].*\", poetry))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Escape the final dot so only a literal \".\" is accepted before the end of a line.\n",
"# The call is hoisted out of the f-string: before Python 3.12 an f-string expression cannot contain a backslash.\n",
"ed_lines = re.findall(r\".*ed\\.$\", poetry, re.MULTILINE)\n",
"print(f'End with \"ed.\": {ed_lines}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# or\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"text = \"\"\"\n",
|
|
"http://www.google.com\n",
|
|
"https://www.google.com\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"# Real alternation: in `http.*|https.*` the first branch already matches every https URL,\n",
"# so the second branch could never be used. Group the two schemes and require \"://\" after them.\n",
"urls = re.findall(r'(?:https|http)://.*', text)\n",
"print(f\"Match http or https url: {urls}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# group and sub string\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"poetry = \"\"\"The Zen of Python\n",
|
|
"Beautiful is better than ugly.\n",
|
|
"Explicit is better than implicit.\n",
|
|
"Simple is better than complex.\n",
|
|
"Complex is better than complicated.\n",
|
|
"Flat is better than nested.\n",
|
|
"Sparse is better than dense.\n",
|
|
"\"\"\"\n",
|
|
"# Escape the trailing dot so each sentence must really end with \".\"; unescaped, it matched any character.\n",
"groups = re.findall(r\"([A-Z].*?) is better than (.*)\\.\", poetry, re.MULTILINE)\n",
|
|
"for group in groups:\n",
|
|
" start = group[0].lower()\n",
|
|
" end = group[1]\n",
|
|
" print(f\"{start} -> {end}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 在 Python 中使用正则表达式"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"poetry = \"\"\"The Zen of Python\n",
|
|
"Beautiful is better than ugly.\n",
|
|
"Explicit is better than implicit.\n",
|
|
"Simple is better than complex.\n",
|
|
"Complex is better than complicated.\n",
|
|
"Flat is better than nested.\n",
|
|
"Sparse is better than dense.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"# re.match only tries the very start of the string, even when re.MULTILINE is given.\n",
"print('re.match: ', re.match(r'^[Tt]he.*', poetry, re.MULTILINE).group())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# re.search returns the first match found anywhere in the text; the named group is read back by name.\n",
"print('re.search: ', re.search(r'(?P<beautiful>^B.+) is', poetry, re.MULTILINE).group(\"beautiful\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# With exactly one capturing group, re.findall returns that group's text for every match.\n",
"print('re.findall: ', re.findall(r'(?P<Title>^[A-Z].+) is', poetry, re.MULTILINE))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Split on the phrase, on newlines and on literal dots; adjacent delimiters leave empty strings in the result.\n",
"print('re.split: ', re.split(r' is better than |[\\n\\.]', poetry))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Replace the phrase, plus at most one whitespace character on each side, with an arrow.\n",
"print('re.sub: ', re.sub(r'\\s?is better than\\s?', ' -> ', poetry))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# pattern object\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"dates = \"\"\"\n",
|
|
"1999/12/31\n",
|
|
"2000/1/1\n",
|
|
"2000/1/2\n",
|
|
"2000/1/3\n",
|
|
"2000/1/4\n",
|
|
"2000/1/5\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"pattern = re.compile(r'(?P<year>\\d{4})/(?P<month>\\d{1,2})/(?P<day>\\d{1,2})', re.MULTILINE)\n",
|
|
"for group in pattern.finditer(dates):\n",
|
|
" print(group.groupdict())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"language_info": {
|
|
"name": "python"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|