Package: add 1000-hours repo in workspace (#337)

* add 1000-hours repo in workspace

* update README
This commit is contained in:
an-lee
2024-02-20 15:40:18 +08:00
committed by GitHub
parent 66bad2002c
commit dab09ea644
385 changed files with 21653 additions and 36 deletions

View File

@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b1f9b75a-f626-487d-857a-27717aa788bf",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Install the dependencies into the environment of the running kernel.\n",
"# `%pip` targets the kernel's interpreter, whereas `!pip` / `!python` run\n",
"# whatever is first on the shell PATH, which may be a different environment.\n",
"import sys\n",
"\n",
"%pip install -U pip setuptools wheel\n",
"%pip install -U spacy\n",
"# Download the English model with the same interpreter the kernel uses.\n",
"!\"{sys.executable}\" -m spacy download en_core_web_sm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f08b8fa4-59d5-4a51-8979-d29aa7915d32",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"from spacy import displacy\n",
"from pathlib import Path\n",
"\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"def highlight_root(sentence):\n",
"    \"\"\"Display `sentence` with its dependency root highlighted.\n",
"\n",
"    The text is parsed with the module-level `nlp` pipeline. The first token\n",
"    whose dependency label is 'ROOT' is marked through displaCy's manual\n",
"    entity renderer, using an empty label.\n",
"\n",
"    Args:\n",
"        sentence: Text to parse and display.\n",
"\n",
"    Returns:\n",
"        The root token, or None (after printing a notice) when no token is\n",
"        labelled 'ROOT'.\n",
"    \"\"\"\n",
"    parsed = nlp(sentence)\n",
"\n",
"    # First token labelled ROOT, or None when there is no such token.\n",
"    root = next((tok for tok in parsed if tok.dep_ == 'ROOT'), None)\n",
"\n",
"    if root is None:\n",
"        print(\"No root found.\")\n",
"        return root\n",
"\n",
"    # Span of the root inside the text, used as the highlighted entity.\n",
"    span_start = root.idx\n",
"    highlight = {\n",
"        'start': span_start,\n",
"        'end': span_start + len(root.text),\n",
"        'label': '',\n",
"    }\n",
"    displacy.render(\n",
"        [{'text': sentence, 'ents': [highlight], 'title': None}],\n",
"        style='ent',\n",
"        manual=True,\n",
"    )\n",
"    return root"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d58bfcd-a222-4da2-970c-72326c478861",
"metadata": {},
"outputs": [],
"source": [
"# Sample sentence analysed by the rest of the notebook.\n",
"sentence = \"The spectacular aurora light displays that appear in Earth's atmosphere around the north and south magnetic poles were once mysterious phenomena.\"\n",
"\n",
"# Parse once; the resulting `doc` is reused by the cells below.\n",
"doc = nlp(sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42ee6c93-b37d-494a-82d6-fb97a0f7acba",
"metadata": {},
"outputs": [],
"source": [
"# Print every token next to its part-of-speech tag, one token per line\n",
"\n",
"for token in doc:\n",
"    print(f\"{token.text} {token.pos_}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "517506f7-240a-4106-a62b-33dc7f86a391",
"metadata": {},
"outputs": [],
"source": [
"# Highlight the predicate verb of the main clause.\n",
"# The returned root token is kept in `root` for the cells that follow.\n",
"\n",
"root = highlight_root(sentence=sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6c5898f-4b0c-4e6e-81b5-0e8231c0e9cd",
"metadata": {},
"outputs": [],
"source": [
"# Simplified version of the main clause: the root plus its direct children\n",
"\n",
"# `lefts` / `rights` keep the root at its real position in the word order,\n",
"# however many children precede it (a fixed insert index assumes exactly one).\n",
"clause_tokens = [*root.lefts, root, *root.rights]\n",
"clause_text = ' '.join(token.text for token in clause_tokens).strip().replace(\" .\", \".\")\n",
"\n",
"# Upper-case only the first character: str.capitalize() would also lower-case\n",
"# the rest of the sentence and mangle proper nouns.\n",
"simplified_sentence = clause_text[:1].upper() + clause_text[1:]\n",
"\n",
"print(simplified_sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df071fdb-0618-453a-8482-c818fea98a0a",
"metadata": {},
"outputs": [],
"source": [
"# Draw the dependency relations between the parts of the sentence\n",
"\n",
"dep_options = {'distance': 60}\n",
"displacy.render(doc, style=\"dep\", options=dep_options)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "273694e6-2e32-48e6-9317-5353612a8c89",
"metadata": {},
"outputs": [],
"source": [
"# Save the dependency graph to the file dep-graph.svg\n",
"\n",
"# With jupyter=False the markup is returned by render() so it can be written out.\n",
"svg = displacy.render(doc, style=\"dep\", jupyter=False)\n",
"output_path = Path(\"dep-graph.svg\")\n",
"# write_text opens, writes and closes the file in one call; the previous bare\n",
"# open().write() never closed the handle explicitly.\n",
"output_path.write_text(svg, encoding=\"utf-8\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}