162 lines
3.9 KiB
Plaintext
162 lines
3.9 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b1f9b75a-f626-487d-857a-27717aa788bf",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Install into the running kernel's environment. `%pip` targets the kernel's own\n",
"# interpreter; `!pip` / `!python` run whichever executables come first on the shell PATH.\n",
"import sys\n",
"\n",
"%pip install -U pip setuptools wheel\n",
"%pip install -U spacy\n",
"# Download the model with the kernel's interpreter so `spacy.load` can find it.\n",
"!\"{sys.executable}\" -m spacy download en_core_web_sm"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f08b8fa4-59d5-4a51-8979-d29aa7915d32",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import spacy\n",
|
||
"from spacy import displacy\n",
|
||
"from pathlib import Path\n",
|
||
"\n",
|
||
"nlp = spacy.load('en_core_web_sm')\n",
|
||
"\n",
|
||
"def highlight_root(sentence):\n",
"    \"\"\"Render `sentence` with its syntactic root highlighted and return that token.\n",
"\n",
"    The sentence is parsed with the module-level `nlp` pipeline. The first token\n",
"    whose dependency label is 'ROOT' is marked as a single span with an empty\n",
"    label and drawn with displaCy's manual entity renderer.\n",
"\n",
"    Args:\n",
"        sentence: Text to parse.\n",
"\n",
"    Returns:\n",
"        The root token, or None when no token is labelled 'ROOT' (in which case\n",
"        \"No root found.\" is printed and nothing is rendered).\n",
"    \"\"\"\n",
"    doc = nlp(sentence)\n",
"\n",
"    # Only the first ROOT is used, even if the text contains several sentences.\n",
"    root = next((token for token in doc if token.dep_ == 'ROOT'), None)\n",
"    if root is None:\n",
"        print(\"No root found.\")\n",
"        return None\n",
"\n",
"    # The span is given as character offsets: token start plus token length.\n",
"    root_start = root.idx\n",
"    root_end = root_start + len(root.text)\n",
"    text = [{\n",
"        'text': sentence,\n",
"        'ents': [{\n",
"            'start': root_start,\n",
"            'end': root_end,\n",
"            'label': '',\n",
"        }],\n",
"        'title': None\n",
"    }]\n",
"    displacy.render(text, style='ent', manual=True)\n",
"    return root"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2d58bfcd-a222-4da2-970c-72326c478861",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Example sentence analysed by the cells that follow.\n",
"sentence = \"The spectacular aurora light displays that appear in Earth’s atmosphere around the north and south magnetic poles were once mysterious phenomena.\"\n",
|
||
"\n",
|
||
"# Parse it once; the part-of-speech and dependency-graph cells reuse `doc`.\n",
"doc = nlp(sentence)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "42ee6c93-b37d-494a-82d6-fb97a0f7acba",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Print the part-of-speech tag of every word\n",
|
||
"\n",
|
||
"for token in doc:\n",
|
||
" print(token.text, token.pos_)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "517506f7-240a-4106-a62b-33dc7f86a391",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Highlight the predicate verb of the main clause\n",
|
||
"\n",
|
||
"# highlight_root() also returns the token; it is kept in `root` for the next cell.\n",
"root = highlight_root(sentence)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e6c5898f-4b0c-4e6e-81b5-0e8231c0e9cd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Simplified version of the main clause: the root plus its direct children only\n",
"\n",
"assert root is not None, \"highlight_root() found no ROOT token\"\n",
"\n",
"# Keep the root at its real position in the sentence (`lefts` / `rights` are its\n",
"# children before / after it) rather than assuming exactly one child precedes it.\n",
"children = [*root.lefts, root, *root.rights]\n",
"clause_text = ' '.join(str(c) for c in children).strip().replace(\" .\", \".\")\n",
"# Upper-case only the first letter; str.capitalize() would also lower-case the\n",
"# rest of the text and mangle proper nouns such as \"Earth\".\n",
"simplified_sentence = clause_text[:1].upper() + clause_text[1:]\n",
"\n",
"print(simplified_sentence)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "df071fdb-0618-453a-8482-c818fea98a0a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Visualize the dependency relations between the parts of the sentence\n",
|
||
"\n",
|
||
"displacy.render(doc, style=\"dep\", options={'distance': 60})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "273694e6-2e32-48e6-9317-5353612a8c89",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Save the dependency graph to the file dep-graph.svg\n",
"\n",
"# With jupyter=False the markup is returned as a string so it can be written out.\n",
"svg = displacy.render(doc, style=\"dep\", jupyter=False)\n",
"output_path = Path(\"dep-graph.svg\")\n",
"# write_text() opens, writes and closes the file in one call, so no handle is left\n",
"# open. The trailing `;` hides the returned character count from the cell output.\n",
"output_path.write_text(svg, encoding=\"utf-8\");"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.18"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|