Files
everyone-can-use-english/new-edition-drafts/jupyter-notebooks/spaCy.ipynb
2024-01-18 07:31:45 +08:00

162 lines
3.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b1f9b75a-f626-487d-857a-27717aa788bf",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import sys\n",
"\n",
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"%pip install -U pip setuptools wheel\n",
"%pip install -U spacy\n",
"# Download the model with the kernel's own interpreter for the same reason.\n",
"!{sys.executable} -m spacy download en_core_web_sm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f08b8fa4-59d5-4a51-8979-d29aa7915d32",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"from spacy import displacy\n",
"from pathlib import Path\n",
"\n",
"# Load the English pipeline once; later cells share this `nlp` object.\n",
"MODEL_NAME = 'en_core_web_sm'\n",
"nlp = spacy.load(MODEL_NAME)\n",
"\n",
"def highlight_root(sentence):\n",
"    \"\"\"Display `sentence` with its dependency root highlighted.\n",
"\n",
"    The text is parsed with the module-level `nlp` pipeline. The first token\n",
"    whose dependency label is 'ROOT' is marked as an unlabelled entity span\n",
"    and rendered with displaCy's manual 'ent' style.\n",
"\n",
"    Args:\n",
"        sentence: The raw text to analyse.\n",
"\n",
"    Returns:\n",
"        The root token, or None when no token is labelled 'ROOT' (in that\n",
"        case \"No root found.\" is printed and nothing is rendered).\n",
"    \"\"\"\n",
"    parsed = nlp(sentence)\n",
"\n",
"    # First token labelled ROOT, or None if the parse has none.\n",
"    root = next((tok for tok in parsed if tok.dep_ == 'ROOT'), None)\n",
"\n",
"    if root is None:\n",
"        print(\"No root found.\")\n",
"        return root\n",
"\n",
"    # Character span covered by the root token.\n",
"    span_start = root.idx\n",
"    span_end = span_start + len(root.text)\n",
"    highlight = {'start': span_start, 'end': span_end, 'label': ''}\n",
"    payload = [{'text': sentence, 'ents': [highlight], 'title': None}]\n",
"    displacy.render(payload, style='ent', manual=True)\n",
"    return root"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d58bfcd-a222-4da2-970c-72326c478861",
"metadata": {},
"outputs": [],
"source": [
"# Sample sentence to analyse (apostrophe in \"Earth's\" restored).\n",
"sentence = \"The spectacular aurora light displays that appear in Earth's atmosphere around the north and south magnetic poles were once mysterious phenomena.\"\n",
"\n",
"doc = nlp(sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42ee6c93-b37d-494a-82d6-fb97a0f7acba",
"metadata": {},
"outputs": [],
"source": [
"# Print every token together with its part-of-speech tag\n",
"\n",
"for tok in doc:\n",
"    print(tok.text, tok.pos_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "517506f7-240a-4106-a62b-33dc7f86a391",
"metadata": {},
"outputs": [],
"source": [
"# Highlight the predicate verb of the main clause (the dependency root)\n",
"\n",
"root = highlight_root(sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6c5898f-4b0c-4e6e-81b5-0e8231c0e9cd",
"metadata": {},
"outputs": [],
"source": [
"# Simplified version of the main clause: the root verb plus its direct children\n",
"\n",
"# Place the root between its left and right dependents so word order is kept\n",
"# no matter how many children precede the verb (a fixed insert index is not).\n",
"main_clause_tokens = [*root.lefts, root, *root.rights]\n",
"simplified_sentence = ' '.join(str(t) for t in main_clause_tokens).strip().replace(\" .\", \".\")\n",
"# Upper-case only the first character; str.capitalize() would also lower-case\n",
"# the rest of the string and mangle proper nouns.\n",
"simplified_sentence = simplified_sentence[:1].upper() + simplified_sentence[1:]\n",
"\n",
"print(simplified_sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df071fdb-0618-453a-8482-c818fea98a0a",
"metadata": {},
"outputs": [],
"source": [
"# Draw the dependency relations between the parts of the sentence\n",
"\n",
"dep_options = {'distance': 60}\n",
"displacy.render(doc, style=\"dep\", options=dep_options)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "273694e6-2e32-48e6-9317-5353612a8c89",
"metadata": {},
"outputs": [],
"source": [
"# Save the dependency graph to the file dep-graph.svg\n",
"\n",
"svg = displacy.render(doc, style=\"dep\", jupyter=False)\n",
"output_path = Path(\"dep-graph.svg\")\n",
"# write_text opens, writes and closes the file, so no handle is left open.\n",
"output_path.write_text(svg, encoding=\"utf-8\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}