sspai-100-hours-series-python/code/07/regexp.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 正则表达式基本语法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exact match: a pattern without metacharacters matches that literal substring.\n",
    "\n",
    "import re\n",
    "\n",
    "text = \"Hello, world\"\n",
    "pattern = \"world\"\n",
    "matches = re.findall(pattern, text)\n",
    "print(matches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wildcard match: `.` matches any single character except a newline.\n",
    "\n",
    "import re\n",
    "\n",
    "text = \"Hello, world\"\n",
    "pattern = \".\"\n",
    "matches = re.findall(pattern, text)\n",
    "print(matches)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 量词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "text = \"fooraskdsororaskaaaaadsooo\"\n",
    "\n",
    "# (label, regex) pairs: exactly n, at least n, and between n and m repetitions of \"a\".\n",
    "quantifier_demos = [\n",
    "    (\"{n} pattern: \", r\"a{3}\"),\n",
    "    (\"{n,} pattern: \", r\"a{1,}\"),\n",
    "    (\"{n, m} pattern: \", r\"a{1,3}\"),\n",
    "]\n",
    "for label, regex in quantifier_demos:\n",
    "    print(label, re.findall(regex, text), sep=\"\")\n",
    "print()  # trailing blank line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "text = \"fooraskdsororaskaaaaadsooo\"\n",
    "\n",
    "# (label, regex) pairs: zero-or-one, zero-or-more, and one-or-more repetitions of \"a\".\n",
    "shorthand_demos = [\n",
    "    (\"? pattern: \", r\"a?\"),\n",
    "    (\"* pattern: \", r\"a*\"),\n",
    "    (\"+ pattern: \", r\"a+\"),\n",
    "]\n",
    "for label, regex in shorthand_demos:\n",
    "    print(label, re.findall(regex, text), sep=\"\")\n",
    "print()  # trailing blank line"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 条件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Anchors: with re.MULTILINE, `^` matches at the start of every line,\n",
    "# not only at the start of the whole string.\n",
    "import re\n",
    "\n",
    "poetry = \"\"\"The Zen of Python\n",
    "Beautiful is better than ugly.\n",
    "Explicit is better than implicit.\n",
    "Simple is better than complex.\n",
    "Complex is better than complicated.\n",
    "Flat is better than nested.\n",
    "Sparse is better than dense.\n",
    "\"\"\"\n",
    "\n",
    "starts_with_s = re.findall(r\"^S.*\", poetry, re.MULTILINE)\n",
    "print(f'Start with \"S\": {starts_with_s}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negated character class: inside [...], a leading ^ negates the set instead of\n",
    "# anchoring, so no re.MULTILINE flag is needed. Each match starts at the first\n",
    "# character that is neither a newline nor an uppercase ASCII letter and runs to\n",
    "# the end of that line.\n",
    "print('Anti Characters:', re.findall(r\"[^\\nA-Z].*\", poetry))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `$` with re.MULTILINE anchors at the end of each line. The final dot is escaped\n",
    "# so it matches a literal period rather than any character. The search is done\n",
    "# outside the f-string because a backslash inside an f-string expression is a\n",
    "# SyntaxError before Python 3.12.\n",
    "ends_with_ed = re.findall(r\".*ed\\.$\", poetry, re.MULTILINE)\n",
    "print(f'End with \"ed.\": {ends_with_ed}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternation: `|` matches either the expression on its left or the one on its right.\n",
    "\n",
    "import re\n",
    "\n",
    "text = \"\"\"\n",
    "http://www.google.com\n",
    "https://www.google.com\n",
    "\"\"\"\n",
    "\n",
    "# Group the alternation so it covers only the scheme. In the ungrouped form\n",
    "# `http.*|https.*` the right-hand branch can never be used, because `http.*`\n",
    "# already matches every string that starts with \"https\".\n",
    "urls = re.findall(r\"(?:http|https)://.*\", text)\n",
    "print(f\"Match http or https url: {urls}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capturing groups: each (...) captures part of the match, and re.findall\n",
    "# returns one tuple of captured substrings per match.\n",
    "\n",
    "import re\n",
    "\n",
    "poetry = \"\"\"The Zen of Python\n",
    "Beautiful is better than ugly.\n",
    "Explicit is better than implicit.\n",
    "Simple is better than complex.\n",
    "Complex is better than complicated.\n",
    "Flat is better than nested.\n",
    "Sparse is better than dense.\n",
    "\"\"\"\n",
    "# The trailing period is escaped so it must be a literal \".\"; an unescaped dot\n",
    "# would swallow the last character of a line that has no period. No flag is\n",
    "# passed because the pattern contains no ^ or $ anchors.\n",
    "groups = re.findall(r\"([A-Z].*?) is better than (.*)\\.\", poetry)\n",
    "for better, worse in groups:\n",
    "    print(f\"{better.lower()} -> {worse}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 在 Python 中使用正则表达式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "poetry = \"\"\"The Zen of Python\n",
    "Beautiful is better than ugly.\n",
    "Explicit is better than implicit.\n",
    "Simple is better than complex.\n",
    "Complex is better than complicated.\n",
    "Flat is better than nested.\n",
    "Sparse is better than dense.\n",
    "\"\"\"\n",
    "\n",
    "# re.match only tries the pattern at the very beginning of the string.\n",
    "first_line = re.match(r'^[Tt]he.*', poetry, re.MULTILINE)\n",
    "print('re.match: ', first_line.group())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# re.search scans the string for the first match; the named group is then\n",
    "# looked up by name instead of by position.\n",
    "found = re.search(r'(?P<beautiful>^B.+) is', poetry, re.MULTILINE)\n",
    "print('re.search: ', found.group(\"beautiful\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# With exactly one capturing group, re.findall returns that group's text for each match.\n",
    "titles = re.findall(r'(?P<Title>^[A-Z].+) is', poetry, re.MULTILINE)\n",
    "print('re.findall: ', titles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split on the phrase \" is better than \" as well as on every newline and period.\n",
    "pieces = re.split(r' is better than |[\\n\\.]', poetry)\n",
    "print('re.split: ', pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the phrase, plus at most one whitespace character on each side, with an arrow.\n",
    "rewritten = re.sub(r'\\s?is better than\\s?', ' -> ', poetry)\n",
    "print('re.sub: ', rewritten)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compiled pattern object\n",
    "\n",
    "import re\n",
    "\n",
    "dates = \"\"\"\n",
    "1999/12/31\n",
    "2000/1/1\n",
    "2000/1/2\n",
    "2000/1/3\n",
    "2000/1/4\n",
    "2000/1/5\n",
    "\"\"\"\n",
    "\n",
    "# Compile once, then call the search methods on the pattern object itself.\n",
    "pattern = re.compile(r'(?P<year>\\d{4})/(?P<month>\\d{1,2})/(?P<day>\\d{1,2})', re.MULTILINE)\n",
    "for match in pattern.finditer(dates):\n",
    "    # groupdict() maps each named group to the text it captured.\n",
    "    print(match.groupdict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}