feat: 新增Vol07章节示例源码

2022-05-12 11:18:31 +08:00
parent 29cf2bd5a9
commit fead78da03
1 changed files with 280 additions and 0 deletions
--- a/code/07/regexp.ipynb
+++ b/code/07/regexp.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 正则表达式基本语法"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 字符"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 精确匹配\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "text = \"Hello, world\"\n",
+    "pattern = \"world\"\n",
+    "print(re.findall(pattern, text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 模糊匹配\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "text = \"Hello, world\"\n",
+    "pattern = \".\"\n",
+    "print(re.findall(pattern, text))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 量词"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "text = \"fooraskdsororaskaaaaadsooo\"\n",
+    "print(\n",
+    "    \"{n} pattern: \", re.findall(r\"a{3}\", text), \"\\n\",\n",
+    "    \"{n,} pattern: \", re.findall(r\"a{1,}\", text), \"\\n\",\n",
+    "    \"{n,m} pattern: \", re.findall(r\"a{1,3}\", text), \"\\n\",\n",
+    "    sep=\"\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "text = \"fooraskdsororaskaaaaadsooo\"\n",
+    "print(\n",
+    "    \"? pattern: \", re.findall(r\"a?\", text), \"\\n\",\n",
+    "    \"* pattern: \", re.findall(r\"a*\", text), \"\\n\",\n",
+    "    \"+ pattern: \", re.findall(r\"a+\", text), \"\\n\",\n",
+    "    sep=\"\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 条件"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# anchor\n",
+    "import re\n",
+    "\n",
+    "poetry = \"\"\"The Zen of Python\n",
+    "Beautiful is better than ugly.\n",
+    "Explicit is better than implicit.\n",
+    "Simple is better than complex.\n",
+    "Complex is better than complicated.\n",
+    "Flat is better than nested.\n",
+    "Sparse is better than dense.\n",
+    "\"\"\"\n",
+    "\n",
+    "print(f'Start with \"S\": {re.findall(r\"^S.*\", poetry, re.MULTILINE)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('Anti Characters:', re.findall(r\"[^\\nA-Z].*\", poetry, re.MULTILINE))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f'End with \"ed.\": {re.findall(r\".*ed[.]$\", poetry, re.MULTILINE)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# or\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "text = \"\"\"\n",
+    "http://www.google.com\n",
+    "https://www.google.com\n",
+    "\"\"\"\n",
+    "\n",
+    "print(f\"Match http or https url: {re.findall(r'(?:http|https)://.*', text)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# group and sub string\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "poetry = \"\"\"The Zen of Python\n",
+    "Beautiful is better than ugly.\n",
+    "Explicit is better than implicit.\n",
+    "Simple is better than complex.\n",
+    "Complex is better than complicated.\n",
+    "Flat is better than nested.\n",
+    "Sparse is better than dense.\n",
+    "\"\"\"\n",
+    "groups = re.findall(r\"([A-Z].*?) is better than (.*)\\.\", poetry, re.MULTILINE)\n",
+    "for group in groups:\n",
+    "    start = group[0].lower()\n",
+    "    end = group[1]\n",
+    "    print(f\"{start} -> {end}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 在 Python 中使用正则表达式"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "poetry = \"\"\"The Zen of Python\n",
+    "Beautiful is better than ugly.\n",
+    "Explicit is better than implicit.\n",
+    "Simple is better than complex.\n",
+    "Complex is better than complicated.\n",
+    "Flat is better than nested.\n",
+    "Sparse is better than dense.\n",
+    "\"\"\"\n",
+    "\n",
+    "print('re.match: ', re.match(r'^[Tt]he.*', poetry, re.MULTILINE).group())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('re.search: ', re.search(r'(?P<beautiful>^B.+) is', poetry, re.MULTILINE).group(\"beautiful\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('re.findall: ', re.findall(r'(?P<Title>^[A-Z].+) is', poetry, re.MULTILINE))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('re.split: ', re.split(r' is better than |[\\n\\.]', poetry))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('re.sub: ', re.sub(r'\\s?is better than\\s?', ' -> ', poetry))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pattern object\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "dates = \"\"\"\n",
+    "1999/12/31\n",
+    "2000/1/1\n",
+    "2000/1/2\n",
+    "2000/1/3\n",
+    "2000/1/4\n",
+    "2000/1/5\n",
+    "\"\"\"\n",
+    "\n",
+    "pattern = re.compile(r'(?P<year>\\d{4})/(?P<month>\\d{1,2})/(?P<day>\\d{1,2})', re.MULTILINE)\n",
+    "for group in pattern.finditer(dates):\n",
+    "    print(group.groupdict())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}