From fead78da030450be542cfe81779c51a6d04ba52a Mon Sep 17 00:00:00 2001 From: 100gle <569590461@qq.com> Date: Thu, 12 May 2022 11:18:31 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9EVol07=E7=AB=A0?= =?UTF-8?q?=E8=8A=82=E7=A4=BA=E4=BE=8B=E6=BA=90=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code/07/regexp.ipynb | 280 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 code/07/regexp.ipynb diff --git a/code/07/regexp.ipynb b/code/07/regexp.ipynb new file mode 100644 index 0000000..2956d1d --- /dev/null +++ b/code/07/regexp.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 正则表达式基本语法" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 字符" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 精确匹配\n", + "\n", + "import re\n", + "\n", + "text = \"Hello, world\"\n", + "pattern = \"world\"\n", + "print(re.findall(pattern, text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 模糊匹配\n", + "\n", + "import re\n", + "\n", + "text = \"Hello, world\"\n", + "pattern = \".\"\n", + "print(re.findall(pattern, text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 量词" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "text = \"fooraskdsororaskaaaaadsooo\"\n", + "print(\n", + " \"{n} pattern: \", re.findall(r\"a{3}\", text), \"\\n\",\n", + " \"{n,} pattern: \", re.findall(r\"a{1,}\", text), \"\\n\",\n", + " \"{n, m} pattern: \", re.findall(r\"a{1,3}\", text), \"\\n\",\n", + " sep=\"\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "text = \"fooraskdsororaskaaaaadsooo\"\n", + "print(\n", + " \"? pattern: \", re.findall(r\"a?\", text), \"\\n\",\n", + " \"* pattern: \", re.findall(r\"a*\", text), \"\\n\",\n", + " \"+ pattern: \", re.findall(r\"a+\", text), \"\\n\",\n", + " sep=\"\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 条件" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# anchor\n", + "import re\n", + "\n", + "poetry = \"\"\"The Zen of Python\n", + "Beautiful is better than ugly.\n", + "Explicit is better than implicit.\n", + "Simple is better than complex.\n", + "Complex is better than complicated.\n", + "Flat is better than nested.\n", + "Sparse is better than dense.\n", + "\"\"\"\n", + "\n", + "print(f'Start with \"S\": {re.findall(r\"^S.*\", poetry, re.MULTILINE)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Anti Characters:', re.findall(r\"[^\\nA-Z].*\", poetry, re.MULTILINE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f'End with \"ed.\": {re.findall(r\".*ed.$\", poetry, re.MULTILINE)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# or\n", + "\n", + "import re\n", + "\n", + "text = \"\"\"\n", + "http://www.google.com\n", + "https://www.google.com\n", + "\"\"\"\n", + "\n", + "print(f\"Match http or https url: {re.findall(r'http.*|https.*', text)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# group and sub string\n", + "\n", + "import re\n", + "\n", + "poetry = \"\"\"The Zen of Python\n", + "Beautiful is better than ugly.\n", + "Explicit is better than implicit.\n", + "Simple is better than complex.\n", + "Complex is better than complicated.\n", + "Flat is better than nested.\n", + "Sparse is better than dense.\n", + "\"\"\"\n", + "groups = re.findall(r\"([A-Z].*?) is better than (.*).\", poetry, re.MULTILINE)\n", + "for group in groups:\n", + " start = group[0].lower()\n", + " end = group[1]\n", + " print(f\"{start} -> {end}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 在 Python 中使用正则表达式" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "poetry = \"\"\"The Zen of Python\n", + "Beautiful is better than ugly.\n", + "Explicit is better than implicit.\n", + "Simple is better than complex.\n", + "Complex is better than complicated.\n", + "Flat is better than nested.\n", + "Sparse is better than dense.\n", + "\"\"\"\n", + "\n", + "print(f're.match: ', re.match(r'^[Tt]he.*', poetry, re.MULTILINE).group())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f're.search: ' , re.search(r'(?P^B.+) is', poetry, re.MULTILINE).group(\"beautiful\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f're.findall: ', re.findall(r'(?P^[A-Z].+) is', poetry, re.MULTILINE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f're.split: ', re.split(r' is better than |[\\n\\.]', poetry))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f're.sub: ', re.sub(r'\\s?is better than\\s?', ' -> ', poetry))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pattern object\n", + "\n", + "import re\n", + "\n", + "dates = \"\"\"\n", + "1999/12/31\n", + "2000/1/1\n", + "2000/1/2\n", + "2000/1/3\n", + "2000/1/4\n", + "2000/1/5\n", + "\"\"\"\n", + "\n", + "pattern = re.compile(r'(?P<year>\\d{4})/(?P<month>\\d{1,2})/(?P<day>\\d{1,2})', re.MULTILINE)\n", + "for group in pattern.finditer(dates):\n", + " print(group.groupdict())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}