├── .gitignore ├── LICENSE ├── README.md ├── cawler ├── 1-1.csv ├── CitiesChina.json ├── __init__.py ├── basic_information.csv ├── connect_db.py ├── crawler.ipynb ├── crawler.py ├── culture.csv ├── culture.json ├── loc.csv ├── people.csv ├── pre_data.py ├── process_data.ipynb └── process_data.py ├── examples.py ├── khan2020.pdf ├── main.py ├── model └── __init__.py ├── requirements.txt ├── stopwords.txt ├── train └── __init__.py └── utils ├── __init__.py └── task.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | dataset/ 3 | output/ 4 | tmp/ 5 | .vs/ 6 | data/ 7 | paper/ 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | <<<<<<< HEAD 137 | .pyre/ 138 | ======= 139 | .pyre/ 140 | >>>>>>> 2329ddfe15b15c6033d591c97f38e967c53ddfba 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EssayTopicPredict 2 | 3 | ![image](https://img.shields.io/badge/License-Apache--2.0-green) ![image](https://img.shields.io/badge/License-MIT-orange) ![image](https://img.shields.io/badge/License-Anti--996-red) ![image](https://img.shields.io/badge/pypi-v0.0.1a4-yellowgreen) ![image](https://img.shields.io/badge/stars-%3C%201k-blue) ![image](https://img.shields.io/badge/issues-1%20open-brightgreen) 4 | 5 | 通用型高考作文题目预测模型 v2.0 -人工智能框架,仅限交流与科普。 6 | 7 | 8 | ## 项目简介 9 | EssayTopicPredict是基于无监督学习、模式识别与NLP领域的最新模型所构建的生成式AIGC考试题预测框架,目前第一版finetune模型针对高考作文,可以有效生成符合人类认知的文章题目。 10 | | 项目作者 | 主页1 | 主页2 | 11 | | ------------- |:------:|:----:| 12 | | Y1ran | [CSDN](https://y1ran.blog.csdn.net/) |[Github](https://github.com/Y1ran) | 13 | 14 |
15 | 16 | ## 框架说明 17 | - [x] 基于哈工大RoBerta-WWM-EXT、Bertopic、GAN模型的高考题目预测AI 18 | - [x] 支持bert tokenizer,当前版本基于clue chinese vocab 19 | - [x] 17亿参数多模块异构深度神经网络,超2亿条预训练数据 20 | - [x] 可结合作文生成器一起使用:[17亿参数作文杀手](https://colab.research.google.com/github/EssayKillerBrain/EssayKiller_V2/blob/master/colab_online.ipynb) 21 | - [x] 端到端生成,从试卷识别到答题卡输出一条龙服务 22 | - [x] 通过GPT-4模型进行命题指导与判断,同时也可以写/优化范文 23 | 24 | 25 | ## 本地环境 26 | * Ubuntu 18.04.2/ Windows10 x86 27 | * Pandas 0.24.2 28 | * Regex 2019.4.14 29 | * h5py 2.9.0 30 | * Numpy 1.16.2 31 | * Tensorboard 1.15.2 32 | * Tensorflow-gpu 1.15.2 33 | * Requests 2.22.0 34 | * CUDA >= 10.0 35 | * CuDNN >= 7.6.0 36 | * OpenAI API 37 | 38 | ## 开发日志 39 | 40 | * 2022.04.23 本地Git项目建立 41 | * 2022.05.03 整体模型架构搭建,开始语料收集 42 | * 2022.05.13 数据集清洗、语料处理 43 | * 2022.05.21 Bertopic+DBSCAN聚类算法 44 | * 2022.05.31 RoBerta与摘要模型调整 45 | * 2022.05.30 代码Review与开源发布 46 | * 2023.05.24 更新chatgpt-4框架辅助判断 47 | 48 | ## 模型结构 49 | 整个框架分为Proprocess、Bert、DNSCAN 3个模块,每个模块的网络单独训练,参数相互独立。 50 | 51 | ### 1. 例子 52 | 高考语文试卷作文题 53 | >![浙江卷](https://images.shobserver.com/img/2020/7/7/37b2224ee3de441a8a040cb4f5576c2d.jpg) 54 | 55 | 56 | **数据准备** 57 | 58 | 人民日报、央视新闻、微博客户端、人民网4个主要爬虫渠道,通过不同API进行爬取(时间为过去12个月内) 59 | 60 | *修改/train/config.py中train_data_root,validation_data_root以及image_path* 61 | 62 | **训练** 63 | ```bash 64 | cd train 65 | python train.py 66 | ``` 67 | 68 | 69 |
70 | 71 | ### 2. 网络结构 72 | #### 2.1 BERT 73 | 74 | **Whole Word Masking (wwm)**,暂翻译为`全词Mask`或`整词Mask`,是谷歌在2019年5月31日发布的一项BERT的升级版本,主要更改了原预训练阶段的训练样本生成策略。 75 | 简单来说,原有基于WordPiece的分词方式会把一个完整的词切分成若干个子词,在生成训练样本时,这些被分开的子词会随机被mask。 76 | 在`全词Mask`中,如果一个完整的词的部分WordPiece子词被mask,则同属该词的其他部分也会被mask,即`全词Mask`。 77 | 78 | **需要注意的是,这里的mask指的是广义的mask(替换成[MASK];保持原词汇;随机替换成另外一个词),并非只局限于单词替换成`[MASK]`标签的情况。 79 | 更详细的说明及样例请参考:[#4](https://github.com/ymcui/Chinese-BERT-wwm/issues/4)** 80 | 81 | 同理,由于谷歌官方发布的`BERT-base, Chinese`中,中文是以**字**为粒度进行切分,没有考虑到传统NLP中的中文分词(CWS)。 82 | 我们将全词Mask的方法应用在了中文中,使用了中文维基百科(包括简体和繁体)进行训练,并且使用了[哈工大LTP](http://ltp.ai)作为分词工具,即对组成同一个**词**的汉字全部进行Mask。 83 | 84 | 下述文本展示了`全词Mask`的生成样例。 85 | **注意:为了方便理解,下述例子中只考虑替换成[MASK]标签的情况。** 86 | 87 | | 说明 | 样例 | 88 | | :------- | :--------- | 89 | | 原始文本 | 使用语言模型来预测下一个词的probability。 | 90 | | 分词文本 | 使用 语言 模型 来 预测 下 一个 词 的 probability 。 | 91 | | 原始Mask输入 | 使 用 语 言 [MASK] 型 来 [MASK] 测 下 一 个 词 的 pro [MASK] ##lity 。 | 92 | | 全词Mask输入 | 使 用 语 言 [MASK] [MASK] 来 [MASK] [MASK] 下 一 个 词 的 [MASK] [MASK] [MASK] 。 | 93 | 94 | 95 | #### 2.2 DBSCAN 96 | 97 | 基于密度的噪声应用空间聚类(DBSCAN)是一种无监督的ML聚类算法。无监督的意思是它不使用预先标记的目标来聚类数据点。聚类是指试图将相似的数据点分组到人工确定的组或簇中。它可以替代KMeans和层次聚类等流行的聚类算法。 98 | 99 | KMeans vs DBSCAN: 100 | KMeans尤其容易受到异常值的影响。当算法遍历质心时,在达到稳定性和收敛性之前,离群值对质心的移动方式有显著的影响。此外,KMeans在集群大小和密度不同的情况下还存在数据精确聚类的问题。K-Means只能应用球形簇,如果数据不是球形的,它的准确性就会受到影响。最后,KMeans要求我们首先选择希望找到的集群的数量。 101 | 102 | 另一方面,DBSCAN不要求我们指定集群的数量,避免了异常值,并且在任意形状和大小的集群中工作得非常好。它没有质心,聚类簇是通过将相邻的点连接在一起的过程形成的。 103 | 104 | ## 中文模型下载 105 | 本目录中主要包含base模型,故我们不在模型简称中标注`base`字样。对于其他大小的模型会标注对应的标记(例如large)。 106 | 107 | * **`BERT-large模型`**:24-layer, 1024-hidden, 16-heads, 330M parameters 108 | * **`BERT-base模型`**:12-layer, 768-hidden, 12-heads, 110M parameters 109 | 110 | **注意:开源版本不包含MLM任务的权重;如需做MLM任务,请使用额外数据进行二次预训练(和其他下游任务一样)。** 111 | 112 | | 模型简称 | 语料 | Google下载 | 百度网盘下载 | 113 | | :------- | :--------- | :---------: | :---------: | 114 | | **`RBT6, Chinese`** | **EXT数据[1]** | - | **[TensorFlow(密码hniy)](https://pan.baidu.com/s/1_MDAIYIGVgDovWkSs51NDA?pwd=hniy)** | 115 | | **`RBT4, Chinese`** | **EXT数据[1]** | - | **[TensorFlow(密码sjpt)](https://pan.baidu.com/s/1MUrmuTULnMn3L1aw_dXxSA?pwd=sjpt)** | 116 | | **`RBTL3, Chinese`** | **EXT数据[1]** | **[TensorFlow](https://drive.google.com/open?id=1Jzn1hYwmv0kXkfTeIvNT61Rn1IbRc-o8)**
**[PyTorch](https://drive.google.com/open?id=1qs5OasLXXjOnR2XuGUh12NanUl0pkjEv)** | **[TensorFlow(密码s6cu)](https://pan.baidu.com/s/1vV9ClBMbsSpt8wUpfQz62Q?pwd=s6cu)** | 117 | | **`RBT3, Chinese`** | **EXT数据[1]** | **[TensorFlow](https://drive.google.com/open?id=1-rvV0nBDvRCASbRz8M9Decc3_8Aw-2yi)**
**[PyTorch](https://drive.google.com/open?id=1_LqmIxm8Nz1Abvlqb8QFZaxYo-TInOed)** | **[TensorFlow(密码5a57)](https://pan.baidu.com/s/1AnapwWj1YBZ_4E6AAtj2lg?pwd=5a57)** | 118 | | **`RoBERTa-wwm-ext-large, Chinese`** | **EXT数据[1]** | **[TensorFlow](https://drive.google.com/open?id=1dtad0FFzG11CBsawu8hvwwzU2R0FDI94)**
**[PyTorch](https://drive.google.com/open?id=1-2vEZfIFCdM1-vJ3GD6DlSyKT4eVXMKq)** | **[TensorFlow(密码dqqe)](https://pan.baidu.com/s/1F68xzCLWEonTEVP7HQ0Ciw?pwd=dqqe)** | 119 | | **`RoBERTa-wwm-ext, Chinese`** | **EXT数据[1]** | **[TensorFlow](https://drive.google.com/open?id=1jMAKIJmPn7kADgD3yQZhpsqM-IRM1qZt)**
**[PyTorch](https://drive.google.com/open?id=1eHM3l4fMo6DsQYGmey7UZGiTmQquHw25)** | **[TensorFlow(密码vybq)](https://pan.baidu.com/s/1oR0cgSXE3Nz6dESxr98qVA?pwd=vybq)** | 120 | | **`BERT-wwm-ext, Chinese`** | **EXT数据[1]** | **[TensorFlow](https://drive.google.com/open?id=1buMLEjdtrXE2c4G1rpsNGWEx7lUQ0RHi)**
**[PyTorch](https://drive.google.com/open?id=1iNeYFhCBJWeUsIlnW_2K6SMwXkM4gLb_)** | **[TensorFlow(密码wgnt)](https://pan.baidu.com/s/1x-jIw1X2yNYHGak2yiq4RQ?pwd=wgnt)** | 121 | | **`BERT-wwm, Chinese`** | **中文维基** | **[TensorFlow](https://drive.google.com/open?id=1RoTQsXp2hkQ1gSRVylRIJfQxJUgkfJMW)**
**[PyTorch](https://drive.google.com/open?id=1AQitrjbvCWc51SYiLN-cJq4e0WiNN4KY)** | **[TensorFlow(密码qfh8)](https://pan.baidu.com/s/1HDdDXiYxGT5ub5OeO7qdWw?pwd=qfh8)** | 122 | | `BERT-base, Chinese`Google | 中文维基 | [Google Cloud](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip) | - | 123 | | `BERT-base, Multilingual Cased`Google | 多语种维基 | [Google Cloud](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip) | - | 124 | | `BERT-base, Multilingual Uncased`Google | 多语种维基 | [Google Cloud](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip) | - | 125 | 126 | > [1] EXT数据包括:中文维基百科,其他百科、新闻、问答等数据,总词数达5.4B。 127 | 128 | 查看更多哈工大讯飞联合实验室(HFL)发布的资源:https://github.com/ymcui/HFL-Anthology 129 | 130 | ```bash 131 | python run.py --model bert 132 | ``` 133 | 134 | 135 | ## Citation 136 | ``` 137 | @misc{EssayKillerBrain, 138 | author = {Turing's Cat}, 139 | title = {Autowritting Ai Framework}, 140 | year = {2022}, 141 | publisher = {GitHub}, 142 | journal = {GitHub repository}, 143 | howpublished = {\url{https://github.com/AlanTur1ng/EssayTopicPredict}}, 144 | } 145 | ``` 146 | 147 |
148 | 149 | 150 | ## 参考资料 151 | [1] BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding 152 | [2] ERNIE: Enhanced Representation through Knowledge Integration 153 | [3] Fine-tune BERT for Extractive Summarization 154 | [4] EAST: An Efficient and Accurate Scene Text Detector 155 | [5] An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition 156 | [6] Language Models are Unsupervised Multitask Learners 157 | [7] https://github.com/Morizeyao/GPT2-Chinese 158 | [8] https://github.com/argman/EAST 159 | [9] https://github.com/bgshih/crnn 160 | [10] https://github.com/zhiyou720/chinese_summarizer 161 | [11] https://zhuanlan.zhihu.com/p/64737915 162 | [12] https://github.com/ouyanghuiyu/chineseocr_lite 163 | [13] https://github.com/google-research/bert 164 | [14] https://github.com/rowanz/grover 165 | [15] https://github.com/wind91725/gpt2-ml-finetune- 166 | [16] https://github.com/guodongxiaren/README 167 | [17] https://www.jianshu.com/p/55560d3e0e8a 168 | [18] https://github.com/YCG09/chinese_ocr 169 | [19] https://github.com/xiaomaxiao/keras_ocr 170 | [20] https://github.com/nghuyong/ERNIE-Pytorch 171 | [21] https://zhuanlan.zhihu.com/p/43534801 172 | [22] https://blog.csdn.net/xuxunjie147/article/details/87178774/ 173 | [23] https://github.com/JiangYanting/Pre-modern_Chinese_corpus_dataset 174 | [24] https://github.com/brightmart/nlp_chinese_corpus 175 | [25] https://github.com/SophonPlus/ChineseNlpCorpus 176 | [26] https://github.com/THUNLP-AIPoet/Resources 177 | [27] https://github.com/OYE93/Chinese-NLP-Corpus 178 | [28] https://github.com/CLUEbenchmark/CLUECorpus2020 179 | [29] https://github.com/zhiyou720/chinese_summarizer 180 | 181 | 182 | ## 免责声明 183 | 该项目中的内容仅供技术研究与科普,不作为任何结论性依据,不提供任何商业化应用授权 184 | -------------------------------------------------------------------------------- /cawler/CitiesChina.json: -------------------------------------------------------------------------------- 1 | [{"city":"东城","province":"北京","lat":39.92835300000001,"lng":116.416357},{"city":"西城","province":"北京","lat":39.912289,"lng":116.365868},{"city":"朝阳","province":"北京","lat":39.92147,"lng":116.443108},{"city":"丰台","province":"北京","lat":39.858427,"lng":116.287149},{"city":"石景山","province":"北京","lat":39.906611,"lng":116.222982},{"city":"海淀","province":"北京","lat":39.959912,"lng":116.298055},{"city":"门头沟","province":"北京","lat":39.940646,"lng":116.102009},{"city":"房山","province":"北京","lat":39.749144,"lng":116.143267},{"city":"通州","province":"北京","lat":39.909946,"lng":116.656435},{"city":"顺义","province":"北京","lat":40.130347,"lng":116.654651},{"city":"昌平","province":"北京","lat":40.22066,"lng":116.231204},{"city":"大兴","province":"北京","lat":39.726929,"lng":116.341395},{"city":"平谷","province":"北京","lat":40.140701,"lng":117.121383},{"city":"怀柔","province":"北京","lat":40.316304,"lng":116.63198},{"city":"密云","province":"北京","lat":40.376931,"lng":116.843066},{"city":"延庆","province":"北京","lat":40.456951,"lng":115.974848},{"city":"河东","province":"天津","lat":35.0672108,"lng":127.7512687},{"city":"南开","province":"天津","lat":39.10394429999999,"lng":117.1742232},{"city":"河北","province":"天津","lat":38.037057,"lng":114.468665},{"city":"红桥","province":"天津","lat":39.167345,"lng":117.151533},{"city":"东丽","province":"天津","lat":39.086569,"lng":117.314323},{"city":"西青","province":"天津","lat":39.141152,"lng":117.008827},{"city":"津南","province":"天津","lat":37.0142742,"lng":138.6526221},{"city":"北辰","province":"天津","lat":39.224792,"lng":117.135488},{"city":"宁河","province":"天津","lat":39.330087,"lng":117.826724},{"city":"武清","province":"天津","lat":39.384119,"lng":117.044388},{"city":"静海","province":"天津","lat":38.947512,"lng":116.97413},{"city":"宝坻","province":"天津","lat":39.717379,"lng":117.309863},{"city":"蓟县","province":"天津","lat":40.046061,"lng":117.408306},{"city":"滨海新区","province":"天津","lat":39.00366,"lng":117.710496},{"city":"石家庄","province":"河北","lat":38.042307,"lng":114.51486},{"city":"唐山","province":"河北","lat":39.63086699999999,"lng":118.180194},{"city":"秦皇岛","province":"河北","lat":39.935385,"lng":119.600493},{"city":"邯郸","province":"河北","lat":36.625657,"lng":114.538962},{"city":"邢台","province":"河北","lat":37.070589,"lng":114.504844},{"city":"保定","province":"河北","lat":38.874434,"lng":115.46459},{"city":"张家口","province":"河北","lat":40.76754500000001,"lng":114.886335},{"city":"承德","province":"河北","lat":40.952942,"lng":117.96275},{"city":"沧州","province":"河北","lat":38.30447700000001,"lng":116.838835},{"city":"廊坊","province":"河北","lat":39.53804700000001,"lng":116.683752},{"city":"衡水","province":"河北","lat":37.73892000000001,"lng":115.670177},{"city":"太原","province":"山西","lat":37.87059,"lng":112.548879},{"city":"大同","province":"山西","lat":40.076763,"lng":113.300129},{"city":"阳泉","province":"山西","lat":37.856972,"lng":113.580519},{"city":"长治","province":"山西","lat":36.195409,"lng":113.116404},{"city":"晋城","province":"山西","lat":35.490702,"lng":112.851831},{"city":"朔州","province":"山西","lat":39.331595,"lng":112.432825},{"city":"晋中","province":"山西","lat":37.687358,"lng":112.752653},{"city":"运城","province":"山西","lat":35.026412,"lng":111.007529},{"city":"忻州","province":"山西","lat":38.416663,"lng":112.734174},{"city":"临汾","province":"山西","lat":36.088005,"lng":111.518976},{"city":"吕梁","province":"山西","lat":37.518314,"lng":111.144319},{"city":"呼和浩特","province":"内蒙古","lat":40.842585,"lng":111.749181},{"city":"包头","province":"内蒙古","lat":40.657378,"lng":109.840349},{"city":"乌海","province":"内蒙古","lat":39.655389,"lng":106.794249},{"city":"赤峰","province":"内蒙古","lat":42.257817,"lng":118.886856},{"city":"通辽","province":"内蒙古","lat":43.65289,"lng":122.243444},{"city":"鄂尔多斯","province":"内蒙古","lat":39.608266,"lng":109.781327},{"city":"呼伦贝尔","province":"内蒙古","lat":49.211575,"lng":119.765745},{"city":"巴彦淖尔","province":"内蒙古","lat":40.743213,"lng":107.387657},{"city":"乌兰察布","province":"内蒙古","lat":40.994786,"lng":113.132585},{"city":"兴安","province":"内蒙古","lat":25.611705,"lng":110.67167},{"city":"锡林郭勒","province":"内蒙古","lat":43.933454,"lng":116.048222},{"city":"阿拉善","province":"内蒙古","lat":38.851921,"lng":105.728958},{"city":"沈阳","province":"辽宁","lat":41.805699,"lng":123.431472},{"city":"大连","province":"辽宁","lat":38.91400300000001,"lng":121.614682},{"city":"鞍山","province":"辽宁","lat":41.108647,"lng":122.994329},{"city":"抚顺","province":"辽宁","lat":41.880872,"lng":123.957208},{"city":"本溪","province":"辽宁","lat":41.486981,"lng":123.685143},{"city":"丹东","province":"辽宁","lat":40.000787,"lng":124.354451},{"city":"锦州","province":"辽宁","lat":41.095685,"lng":121.126846},{"city":"营口","province":"辽宁","lat":40.667012,"lng":122.235418},{"city":"阜新","province":"辽宁","lat":42.021619,"lng":121.670324},{"city":"辽阳","province":"辽宁","lat":41.267244,"lng":123.236944},{"city":"盘锦","province":"辽宁","lat":41.119997,"lng":122.070714},{"city":"铁岭","province":"辽宁","lat":42.223828,"lng":123.726035},{"city":"朝阳","province":"辽宁","lat":39.92147,"lng":116.443108},{"city":"葫芦岛","province":"辽宁","lat":40.711041,"lng":120.83694},{"city":"长春","province":"吉林","lat":43.817072,"lng":125.323544},{"city":"吉林","province":"吉林","lat":43.89653999999999,"lng":125.325862},{"city":"四平","province":"吉林","lat":43.16642,"lng":124.350398},{"city":"辽源","province":"吉林","lat":42.887767,"lng":125.14366},{"city":"通化","province":"吉林","lat":41.728401,"lng":125.939697},{"city":"白山","province":"吉林","lat":36.1541666,"lng":136.77},{"city":"松原","province":"吉林","lat":45.141789,"lng":124.825118},{"city":"白城","province":"吉林","lat":45.619588,"lng":122.838826},{"city":"延边","province":"吉林","lat":42.909409,"lng":129.471868},{"city":"哈尔滨","province":"黑龙江","lat":45.80377499999999,"lng":126.534967},{"city":"齐齐哈尔","province":"黑龙江","lat":47.35434799999999,"lng":123.918186},{"city":"鸡西","province":"黑龙江","lat":45.295075,"lng":130.969333},{"city":"鹤岗","province":"黑龙江","lat":47.349916,"lng":130.297964},{"city":"双鸭山","province":"黑龙江","lat":46.64650899999999,"lng":131.159133},{"city":"大庆","province":"黑龙江","lat":46.58931,"lng":125.103784},{"city":"伊春","province":"黑龙江","lat":47.72753600000001,"lng":128.841148},{"city":"佳木斯","province":"黑龙江","lat":46.799778,"lng":130.318879},{"city":"七台河","province":"黑龙江","lat":45.771727,"lng":131.003138},{"city":"牡丹江","province":"黑龙江","lat":44.55165299999999,"lng":129.633169},{"city":"黑河","province":"黑龙江","lat":50.245129,"lng":127.528294},{"city":"绥化","province":"黑龙江","lat":46.653845,"lng":126.968887},{"city":"大兴安岭","province":"黑龙江","lat":52.335262,"lng":124.711526},{"city":"黄浦","province":"上海","lat":31.23173,"lng":121.484384},{"city":"徐汇","province":"上海","lat":31.188523,"lng":121.436525},{"city":"长宁","province":"上海","lat":31.220367,"lng":121.424624},{"city":"静安","province":"上海","lat":31.223428,"lng":121.455965},{"city":"普陀","province":"上海","lat":29.97176,"lng":122.323867},{"city":"闸北","province":"上海","lat":31.247105,"lng":121.459384},{"city":"虹口","province":"上海","lat":31.2646,"lng":121.505133},{"city":"杨浦","province":"上海","lat":31.259822,"lng":121.525727},{"city":"闵行","province":"上海","lat":31.112818,"lng":121.381716},{"city":"宝山","province":"上海","lat":31.405457,"lng":121.489612},{"city":"嘉定","province":"上海","lat":31.37586899999999,"lng":121.265374},{"city":"浦东新","province":"上海","lat":31.221517,"lng":121.544379},{"city":"松江","province":"上海","lat":31.032243,"lng":121.227747},{"city":"奉贤","province":"上海","lat":30.917795,"lng":121.474042},{"city":"青浦","province":"上海","lat":31.150681,"lng":121.124178},{"city":"崇明","province":"上海","lat":31.623728,"lng":121.397422},{"city":"南京","province":"江苏","lat":32.060255,"lng":118.796877},{"city":"无锡","province":"江苏","lat":31.49117,"lng":120.31191},{"city":"徐州","province":"江苏","lat":34.205768,"lng":117.284124},{"city":"常州","province":"江苏","lat":31.811226,"lng":119.974062},{"city":"苏州","province":"江苏","lat":31.298979,"lng":120.58529},{"city":"南通","province":"江苏","lat":31.980172,"lng":120.894291},{"city":"连云港","province":"江苏","lat":34.596653,"lng":119.221611},{"city":"淮安","province":"江苏","lat":33.61036,"lng":119.015288},{"city":"盐城","province":"江苏","lat":33.347316,"lng":120.16366},{"city":"扬州","province":"江苏","lat":32.394213,"lng":119.412947},{"city":"镇江","province":"江苏","lat":32.187849,"lng":119.425836},{"city":"泰州","province":"江苏","lat":32.455536,"lng":119.922933},{"city":"宿迁","province":"江苏","lat":33.963232,"lng":118.275198},{"city":"杭州","province":"浙江","lat":30.274085,"lng":120.15507},{"city":"宁波","province":"浙江","lat":29.868336,"lng":121.54399},{"city":"温州","province":"浙江","lat":27.993828,"lng":120.699362},{"city":"嘉兴","province":"浙江","lat":30.753924,"lng":120.758543},{"city":"湖州","province":"浙江","lat":30.894348,"lng":120.086823},{"city":"绍兴","province":"浙江","lat":29.995762,"lng":120.586109},{"city":"金华","province":"浙江","lat":29.079175,"lng":119.647421},{"city":"衢州","province":"浙江","lat":28.97008,"lng":118.859457},{"city":"舟山","province":"浙江","lat":29.98529499999999,"lng":122.207216},{"city":"台州","province":"浙江","lat":28.65638,"lng":121.42076},{"city":"丽水","province":"浙江","lat":28.46763,"lng":119.922796},{"city":"合肥","province":"安徽","lat":31.820592,"lng":117.227219},{"city":"芜湖","province":"安徽","lat":31.352859,"lng":118.432941},{"city":"蚌埠","province":"安徽","lat":32.916287,"lng":117.389719},{"city":"淮南","province":"安徽","lat":32.625478,"lng":116.999933},{"city":"马鞍山","province":"安徽","lat":31.67045199999999,"lng":118.50676},{"city":"淮北","province":"安徽","lat":33.955845,"lng":116.798265},{"city":"铜陵","province":"安徽","lat":30.945516,"lng":117.811267},{"city":"安庆","province":"安徽","lat":30.53192,"lng":117.115101},{"city":"黄山","province":"安徽","lat":29.714748,"lng":118.337621},{"city":"滁州","province":"安徽","lat":32.25342500000001,"lng":118.328322},{"city":"阜阳","province":"安徽","lat":32.890124,"lng":115.814205},{"city":"宿州","province":"安徽","lat":33.646357,"lng":116.964358},{"city":"六安","province":"安徽","lat":31.7337,"lng":116.521855},{"city":"亳州","province":"安徽","lat":33.844582,"lng":115.778676},{"city":"池州","province":"安徽","lat":30.6648,"lng":117.491568},{"city":"宣城","province":"安徽","lat":30.94069,"lng":118.758588},{"city":"福州","province":"福建","lat":26.074508,"lng":119.296494},{"city":"厦门","province":"福建","lat":24.479834,"lng":118.089425},{"city":"莆田","province":"福建","lat":25.454085,"lng":119.007777},{"city":"三明","province":"福建","lat":26.263407,"lng":117.638678},{"city":"泉州","province":"福建","lat":24.874132,"lng":118.675676},{"city":"漳州","province":"福建","lat":24.513025,"lng":117.647093},{"city":"南平","province":"福建","lat":26.641769,"lng":118.177708},{"city":"龙岩","province":"福建","lat":25.075123,"lng":117.017537},{"city":"宁德","province":"福建","lat":26.665617,"lng":119.547933},{"city":"南昌","province":"江西","lat":28.682892,"lng":115.858198},{"city":"景德镇","province":"江西","lat":29.268783,"lng":117.178443},{"city":"萍乡","province":"江西","lat":27.622768,"lng":113.854556},{"city":"九江","province":"江西","lat":29.705077,"lng":116.00193},{"city":"新余","province":"江西","lat":27.817809,"lng":114.917347},{"city":"鹰潭","province":"江西","lat":28.260189,"lng":117.069202},{"city":"赣州","province":"江西","lat":25.831829,"lng":114.93503},{"city":"吉安","province":"江西","lat":27.087637,"lng":114.964696},{"city":"宜春","province":"江西","lat":27.815743,"lng":114.416786},{"city":"抚州","province":"江西","lat":27.949217,"lng":116.358182},{"city":"上饶","province":"江西","lat":28.454863,"lng":117.943433},{"city":"济南","province":"山东","lat":36.651216,"lng":117.12},{"city":"青岛","province":"山东","lat":36.067108,"lng":120.382609},{"city":"淄博","province":"山东","lat":36.81348699999999,"lng":118.054927},{"city":"枣庄","province":"山东","lat":34.810488,"lng":117.323725},{"city":"东营","province":"山东","lat":37.434751,"lng":118.674767},{"city":"烟台","province":"山东","lat":37.463822,"lng":121.447935},{"city":"潍坊","province":"山东","lat":36.706962,"lng":119.161749},{"city":"济宁","province":"山东","lat":35.41498199999999,"lng":116.587282},{"city":"泰安","province":"山东","lat":36.200252,"lng":117.087614},{"city":"威海","province":"山东","lat":37.513068,"lng":122.12042},{"city":"日照","province":"山东","lat":35.416734,"lng":119.526925},{"city":"莱芜","province":"山东","lat":36.213814,"lng":117.676724},{"city":"临沂","province":"山东","lat":35.104674,"lng":118.356414},{"city":"德州","province":"山东","lat":31.9685988,"lng":-99.9018131},{"city":"聊城","province":"山东","lat":36.45703,"lng":115.98546},{"city":"滨州","province":"山东","lat":37.38198,"lng":117.9707},{"city":"菏泽","province":"山东","lat":35.23375,"lng":115.480656},{"city":"郑州","province":"河南","lat":34.746611,"lng":113.625328},{"city":"开封","province":"河南","lat":34.797281,"lng":114.307583},{"city":"洛阳","province":"河南","lat":34.619683,"lng":112.45404},{"city":"平顶山","province":"河南","lat":33.76617,"lng":113.192661},{"city":"安阳","province":"河南","lat":36.097577,"lng":114.392393},{"city":"鹤壁","province":"河南","lat":35.747225,"lng":114.297273},{"city":"新乡","province":"河南","lat":35.30313599999999,"lng":113.927016},{"city":"焦作","province":"河南","lat":35.215893,"lng":113.241823},{"city":"濮阳","province":"河南","lat":35.761829,"lng":115.029216},{"city":"许昌","province":"河南","lat":34.035771,"lng":113.852454},{"city":"漯河","province":"河南","lat":33.580873,"lng":114.016536},{"city":"三门峡","province":"河南","lat":34.772792,"lng":111.200367},{"city":"南阳","province":"河南","lat":32.990664,"lng":112.528308},{"city":"商丘","province":"河南","lat":34.414961,"lng":115.656339},{"city":"信阳","province":"河南","lat":32.146984,"lng":114.091023},{"city":"周口","province":"河南","lat":33.626149,"lng":114.696951},{"city":"驻马店","province":"河南","lat":33.011529,"lng":114.022298},{"city":"济源","province":"河南","lat":35.067243,"lng":112.601919},{"city":"武汉","province":"湖北","lat":30.593099,"lng":114.305393},{"city":"黄石","province":"湖北","lat":44.427963,"lng":-110.588455},{"city":"十堰","province":"湖北","lat":32.629398,"lng":110.798266},{"city":"宜昌","province":"湖北","lat":30.691967,"lng":111.286471},{"city":"襄阳","province":"湖北","lat":32.008986,"lng":112.122415},{"city":"鄂州","province":"湖北","lat":30.39194,"lng":114.894843},{"city":"荆门","province":"湖北","lat":31.035396,"lng":112.199427},{"city":"孝感","province":"湖北","lat":30.917766,"lng":113.957037},{"city":"荆州","province":"湖北","lat":30.335165,"lng":112.239741},{"city":"黄冈","province":"湖北","lat":30.453667,"lng":114.8722},{"city":"咸宁","province":"湖北","lat":29.841438,"lng":114.322439},{"city":"随州","province":"湖北","lat":31.690216,"lng":113.382458},{"city":"恩施","province":"湖北","lat":30.272156,"lng":109.488172},{"city":"仙桃","province":"湖北","lat":30.360882,"lng":113.423482},{"city":"潜江","province":"湖北","lat":30.40211,"lng":112.900079},{"city":"天门","province":"湖北","lat":30.66333699999999,"lng":113.166078},{"city":"神农架","province":"湖北","lat":31.743483,"lng":110.680448},{"city":"长沙","province":"湖南","lat":28.228209,"lng":112.938814},{"city":"株洲","province":"湖南","lat":27.827987,"lng":113.133853},{"city":"湘潭","province":"湖南","lat":27.829738,"lng":112.944049},{"city":"衡阳","province":"湖南","lat":26.893369,"lng":112.572018},{"city":"邵阳","province":"湖南","lat":27.238893,"lng":111.467791},{"city":"岳阳","province":"湖南","lat":29.356804,"lng":113.12873},{"city":"常德","province":"湖南","lat":29.031673,"lng":111.698497},{"city":"张家界","province":"湖南","lat":29.117096,"lng":110.479191},{"city":"益阳","province":"湖南","lat":28.55386,"lng":112.35518},{"city":"郴州","province":"湖南","lat":25.77051,"lng":113.014718},{"city":"永州","province":"湖南","lat":26.420394,"lng":111.613445},{"city":"怀化","province":"湖南","lat":27.569517,"lng":110.001923},{"city":"娄底","province":"湖南","lat":27.700063,"lng":111.993497},{"city":"湘西","province":"湖南","lat":28.314296,"lng":109.739735},{"city":"广州","province":"广东","lat":23.12911,"lng":113.264385},{"city":"韶关","province":"广东","lat":24.810329,"lng":113.597547},{"city":"深圳","province":"广东","lat":22.543096,"lng":114.057865},{"city":"珠海","province":"广东","lat":22.270978,"lng":113.576678},{"city":"汕头","province":"广东","lat":23.354091,"lng":116.681972},{"city":"佛山","province":"广东","lat":23.021479,"lng":113.121436},{"city":"江门","province":"广东","lat":22.579117,"lng":113.081508},{"city":"湛江","province":"广东","lat":21.270746,"lng":110.359336},{"city":"茂名","province":"广东","lat":21.662999,"lng":110.925456},{"city":"肇庆","province":"广东","lat":23.047192,"lng":112.465091},{"city":"惠州","province":"广东","lat":23.112257,"lng":114.415801},{"city":"梅州","province":"广东","lat":24.288615,"lng":116.122239},{"city":"汕尾","province":"广东","lat":22.786186,"lng":115.375159},{"city":"河源","province":"广东","lat":23.743686,"lng":114.700961},{"city":"阳江","province":"广东","lat":21.857958,"lng":111.982232},{"city":"清远","province":"广东","lat":23.681764,"lng":113.056031},{"city":"东莞","province":"广东","lat":23.020673,"lng":113.7518},{"city":"中山","province":"广东","lat":22.516999,"lng":113.392725},{"city":"潮州","province":"广东","lat":23.656704,"lng":116.622756},{"city":"揭阳","province":"广东","lat":23.549993,"lng":116.372831},{"city":"云浮","province":"广东","lat":22.915094,"lng":112.044491},{"city":"南宁","province":"广西","lat":22.817002,"lng":108.366543},{"city":"柳州","province":"广西","lat":24.326038,"lng":109.42841},{"city":"桂林","province":"广西","lat":25.234479,"lng":110.179954},{"city":"梧州","province":"广西","lat":23.476963,"lng":111.279115},{"city":"北海","province":"广西","lat":21.481291,"lng":109.120161},{"city":"防城港","province":"广西","lat":21.68686,"lng":108.353847},{"city":"钦州","province":"广西","lat":21.979934,"lng":108.654147},{"city":"贵港","province":"广西","lat":23.111531,"lng":109.598927},{"city":"玉林","province":"广西","lat":22.636379,"lng":110.164756},{"city":"百色","province":"广西","lat":23.902333,"lng":106.618201},{"city":"贺州","province":"广西","lat":24.403582,"lng":111.566694},{"city":"河池","province":"广西","lat":24.692931,"lng":108.085261},{"city":"来宾","province":"广西","lat":23.750306,"lng":109.221466},{"city":"崇左","province":"广西","lat":22.376533,"lng":107.364711},{"city":"海口","province":"海南","lat":20.044002,"lng":110.198293},{"city":"三亚","province":"海南","lat":18.252847,"lng":109.511909},{"city":"三沙","province":"海南","lat":26.92168,"lng":120.216566},{"city":"五指山","province":"海南","lat":18.775147,"lng":109.516925},{"city":"琼海","province":"海南","lat":19.258342,"lng":110.474648},{"city":"儋州","province":"海南","lat":19.521134,"lng":109.580811},{"city":"文昌","province":"海南","lat":19.543279,"lng":110.79776},{"city":"定安","province":"海南","lat":19.681434,"lng":110.358891},{"city":"屯昌","province":"海南","lat":19.351766,"lng":110.103415},{"city":"澄迈","province":"海南","lat":19.738521,"lng":110.006755},{"city":"临高","province":"海南","lat":19.912026,"lng":109.690508},{"city":"乐东","province":"海南","lat":18.75026,"lng":109.173055},{"city":"陵水","province":"海南","lat":18.506048,"lng":110.037504},{"city":"保亭","province":"海南","lat":18.63913,"lng":109.70259},{"city":"琼中","province":"海南","lat":19.033369,"lng":109.838389},{"city":"万州","province":"重庆","lat":30.807667,"lng":108.408661},{"city":"涪陵","province":"重庆","lat":29.703113,"lng":107.389298},{"city":"渝中","province":"重庆","lat":29.55275,"lng":106.568892},{"city":"大渡口","province":"重庆","lat":29.484527,"lng":106.482347},{"city":"江北","province":"重庆","lat":33.2205924,"lng":130.1575277},{"city":"沙坪坝","province":"重庆","lat":29.541145,"lng":106.456878},{"city":"九龙坡","province":"重庆","lat":29.502272,"lng":106.510676},{"city":"北碚","province":"重庆","lat":29.805108,"lng":106.395612},{"city":"万盛","province":"重庆","lat":39.973373,"lng":139.741562},{"city":"渝北","province":"重庆","lat":29.718143,"lng":106.631187},{"city":"巴南","province":"重庆","lat":29.40240799999999,"lng":106.540257},{"city":"长寿","province":"重庆","lat":29.857996,"lng":107.081283},{"city":"綦江","province":"重庆","lat":29.028067,"lng":106.651362},{"city":"潼南","province":"重庆","lat":30.191013,"lng":105.840556},{"city":"铜梁","province":"重庆","lat":29.844811,"lng":106.056404},{"city":"大足","province":"重庆","lat":29.707032,"lng":105.721733},{"city":"荣昌","province":"重庆","lat":29.405003,"lng":105.594623},{"city":"璧山","province":"重庆","lat":29.592024,"lng":106.227305},{"city":"梁平","province":"重庆","lat":30.674362,"lng":107.803911},{"city":"城口","province":"重庆","lat":31.947633,"lng":108.664214},{"city":"丰都","province":"重庆","lat":29.8635,"lng":107.730895},{"city":"垫江","province":"重庆","lat":30.327717,"lng":107.33339},{"city":"武隆","province":"重庆","lat":29.325601,"lng":107.760025},{"city":"忠县","province":"重庆","lat":30.29956,"lng":108.039002},{"city":"开县","province":"重庆","lat":31.160711,"lng":108.393135},{"city":"云阳","province":"重庆","lat":30.930613,"lng":108.697324},{"city":"奉节","province":"重庆","lat":31.018498,"lng":109.463987},{"city":"巫山","province":"重庆","lat":31.074834,"lng":109.879153},{"city":"巫溪","province":"重庆","lat":31.398604,"lng":109.570062},{"city":"黔江","province":"重庆","lat":29.53361,"lng":108.770678},{"city":"石柱","province":"重庆","lat":29.99928499999999,"lng":108.114069},{"city":"秀山","province":"重庆","lat":28.447997,"lng":109.007094},{"city":"酉阳","province":"重庆","lat":28.841244,"lng":108.767747},{"city":"彭水","province":"重庆","lat":29.293902,"lng":108.165538},{"city":"江津","province":"重庆","lat":35.0114175,"lng":132.2209303},{"city":"合川","province":"重庆","lat":29.972084,"lng":106.27613},{"city":"永川","province":"重庆","lat":29.35611699999999,"lng":105.927376},{"city":"南川","province":"重庆","lat":29.157891,"lng":107.099266},{"city":"两江新区","province":"重庆","lat":29.6017886,"lng":106.5075522},{"city":"成都","province":"四川","lat":30.572816,"lng":104.066801},{"city":"自贡","province":"四川","lat":29.33903,"lng":104.778442},{"city":"攀枝花","province":"四川","lat":26.582347,"lng":101.718637},{"city":"泸州","province":"四川","lat":28.871811,"lng":105.442258},{"city":"德阳","province":"四川","lat":31.126856,"lng":104.397894},{"city":"绵阳","province":"四川","lat":31.467495,"lng":104.678946},{"city":"广元","province":"四川","lat":32.435435,"lng":105.843357},{"city":"遂宁","province":"四川","lat":30.532847,"lng":105.592898},{"city":"内江","province":"四川","lat":29.580229,"lng":105.058433},{"city":"乐山","province":"四川","lat":29.552106,"lng":103.765568},{"city":"南充","province":"四川","lat":30.837793,"lng":106.110698},{"city":"眉山","province":"四川","lat":30.07544,"lng":103.848538},{"city":"宜宾","province":"四川","lat":28.751769,"lng":104.643215},{"city":"广安","province":"四川","lat":30.456224,"lng":106.633088},{"city":"达州","province":"四川","lat":31.209572,"lng":107.468023},{"city":"雅安","province":"四川","lat":29.980537,"lng":103.013261},{"city":"巴中","province":"四川","lat":31.867903,"lng":106.747478},{"city":"资阳","province":"四川","lat":30.128901,"lng":104.627636},{"city":"阿坝","province":"四川","lat":31.899413,"lng":102.224653},{"city":"甘孜","province":"四川","lat":30.04952,"lng":101.962311},{"city":"凉山","province":"四川","lat":27.881611,"lng":102.267335},{"city":"贵阳","province":"贵州","lat":26.647661,"lng":106.630154},{"city":"六盘水","province":"贵州","lat":26.592666,"lng":104.830359},{"city":"遵义","province":"贵州","lat":27.725654,"lng":106.927389},{"city":"安顺","province":"贵州","lat":26.253072,"lng":105.947594},{"city":"铜仁","province":"贵州","lat":27.731515,"lng":109.189598},{"city":"黔西南","province":"贵州","lat":25.087825,"lng":104.906397},{"city":"毕节","province":"贵州","lat":27.283955,"lng":105.291644},{"city":"黔东南","province":"贵州","lat":26.583442,"lng":107.982859},{"city":"黔南","province":"贵州","lat":26.254092,"lng":107.522098},{"city":"昆明","province":"云南","lat":24.880095,"lng":102.832892},{"city":"曲靖","province":"云南","lat":25.49,"lng":103.796167},{"city":"玉溪","province":"云南","lat":24.347324,"lng":102.527197},{"city":"保山","province":"云南","lat":25.112046,"lng":99.16176100000001},{"city":"昭通","province":"云南","lat":27.338257,"lng":103.717465},{"city":"丽江","province":"云南","lat":26.855047,"lng":100.227751},{"city":"普洱","province":"云南","lat":22.825066,"lng":100.966512},{"city":"临沧","province":"云南","lat":23.884154,"lng":100.088824},{"city":"楚雄","province":"云南","lat":25.045532,"lng":101.528068},{"city":"红河","province":"云南","lat":23.36313,"lng":103.374799},{"city":"文山","province":"云南","lat":23.400733,"lng":104.216248},{"city":"西双版纳","province":"云南","lat":22.008811,"lng":100.79715},{"city":"大理","province":"云南","lat":25.606486,"lng":100.267638},{"city":"德宏","province":"云南","lat":24.433353,"lng":98.58489499999999},{"city":"怒江","province":"云南","lat":25.817556,"lng":98.856601},{"city":"迪庆","province":"云南","lat":27.818757,"lng":99.702254},{"city":"拉萨","province":"西藏","lat":29.652491,"lng":91.17210999999999},{"city":"昌都","province":"西藏","lat":31.140969,"lng":97.17202},{"city":"山南","province":"西藏","lat":29.237137,"lng":91.773134},{"city":"日喀则","province":"西藏","lat":29.26686999999999,"lng":88.880583},{"city":"那曲","province":"西藏","lat":31.476202,"lng":92.051239},{"city":"阿里","province":"西藏","lat":32.848685,"lng":-107.3191906},{"city":"林芝","province":"西藏","lat":29.649128,"lng":94.36149},{"city":"西安","province":"陕西","lat":34.341575,"lng":108.93977},{"city":"铜川","province":"陕西","lat":34.896756,"lng":108.945233},{"city":"宝鸡","province":"陕西","lat":34.363184,"lng":107.237743},{"city":"咸阳","province":"陕西","lat":34.329605,"lng":108.708991},{"city":"渭南","province":"陕西","lat":34.499995,"lng":109.509786},{"city":"延安","province":"陕西","lat":36.585445,"lng":109.489757},{"city":"汉中","province":"陕西","lat":33.06748,"lng":107.023323},{"city":"榆林","province":"陕西","lat":38.28539,"lng":109.734589},{"city":"安康","province":"陕西","lat":32.684715,"lng":109.029022},{"city":"商洛","province":"陕西","lat":33.870422,"lng":109.940477},{"city":"兰州市","province":"甘肃","lat":36.061089,"lng":103.834304},{"city":"嘉峪关","province":"甘肃","lat":39.8013401,"lng":98.21616949999999},{"city":"金昌","province":"甘肃","lat":38.520089,"lng":102.188043},{"city":"白银","province":"甘肃","lat":36.545015,"lng":104.138586},{"city":"天水","province":"甘肃","lat":34.580864,"lng":105.724947},{"city":"武威","province":"甘肃","lat":37.928267,"lng":102.638201},{"city":"张掖","province":"甘肃","lat":38.925875,"lng":100.449818},{"city":"平凉","province":"甘肃","lat":35.543061,"lng":106.665131},{"city":"酒泉","province":"甘肃","lat":39.732819,"lng":98.49435199999999},{"city":"庆阳","province":"甘肃","lat":35.709077,"lng":107.643631},{"city":"定西","province":"甘肃","lat":35.580663,"lng":104.626282},{"city":"陇南","province":"甘肃","lat":33.400685,"lng":104.921841},{"city":"临夏","province":"甘肃","lat":35.601182,"lng":103.210539},{"city":"甘南","province":"甘肃","lat":34.983386,"lng":102.911027},{"city":"西宁","province":"青海","lat":36.617134,"lng":101.778224},{"city":"海东","province":"青海","lat":36.50204,"lng":102.104287},{"city":"海北","province":"青海","lat":36.954413,"lng":100.900998},{"city":"黄南","province":"青海","lat":35.519549,"lng":102.015248},{"city":"海南","province":"青海","lat":20.017378,"lng":110.349229},{"city":"果洛","province":"青海","lat":34.471431,"lng":100.244809},{"city":"玉树","province":"青海","lat":32.993107,"lng":97.00878499999999},{"city":"海西","province":"青海","lat":37.377139,"lng":97.36975199999999},{"city":"银川","province":"宁夏","lat":38.487194,"lng":106.230909},{"city":"石嘴山","province":"宁夏","lat":38.983236,"lng":106.383304},{"city":"吴忠","province":"宁夏","lat":37.997461,"lng":106.198394},{"city":"固原","province":"宁夏","lat":36.01585499999999,"lng":106.24261},{"city":"中卫","province":"宁夏","lat":37.499973,"lng":105.196902},{"city":"乌鲁木齐","province":"新疆","lat":43.825592,"lng":87.61684799999999},{"city":"克拉玛依","province":"新疆","lat":45.579889,"lng":84.88920700000001},{"city":"吐鲁番","province":"新疆","lat":42.951384,"lng":89.189655},{"city":"哈密","province":"新疆","lat":42.81842,"lng":93.515151},{"city":"昌吉","province":"新疆","lat":44.013183,"lng":87.30411199999999},{"city":"博尔塔拉","province":"新疆","lat":44.906065,"lng":82.06638699999999},{"city":"巴音郭楞","province":"新疆","lat":41.764115,"lng":86.145298},{"city":"克孜勒苏","province":"新疆","lat":39.714526,"lng":76.167819},{"city":"喀什","province":"新疆","lat":39.46768600000001,"lng":75.99378999999999},{"city":"和田","province":"新疆","lat":37.114157,"lng":79.92221099999999},{"city":"伊犁","province":"新疆","lat":43.91682300000001,"lng":81.324136},{"city":"塔城","province":"新疆","lat":46.745364,"lng":82.980317},{"city":"阿勒泰","province":"新疆","lat":47.844924,"lng":88.14125299999999},{"city":"石河子","province":"新疆","lat":44.306097,"lng":86.080602},{"city":"阿拉尔","province":"新疆","lat":40.547653,"lng":81.28052699999999},{"city":"图木舒克","province":"新疆","lat":39.864867,"lng":79.06933200000002},{"city":"五家渠","province":"新疆","lat":44.166757,"lng":87.54324},{"city":"北屯","province":"新疆","lat":24.1815237,"lng":120.6861019},{"city":"台北市","province":"台湾","lat":25.0329694,"lng":121.5654177},{"city":"高雄市","province":"台湾","lat":22.6272784,"lng":120.3014353},{"city":"基隆市","province":"台湾","lat":25.1276033,"lng":121.7391833},{"city":"台中市","province":"台湾","lat":24.1477358,"lng":120.6736482},{"city":"台南市","province":"台湾","lat":22.9997281,"lng":120.2270277},{"city":"新竹市","province":"台湾","lat":24.8138287,"lng":120.9674798},{"city":"嘉义市","province":"台湾","lat":23.4800751,"lng":120.4491113},{"city":"台北县","province":"台湾","lat":25.0169826,"lng":121.4627868},{"city":"宜兰县","province":"台湾","lat":24.7021073,"lng":121.7377502},{"city":"桃园县","province":"台湾","lat":24.9936281,"lng":121.3009798},{"city":"新竹县","province":"台湾","lat":24.8387226,"lng":121.0177246},{"city":"苗栗县","province":"台湾","lat":24.560159,"lng":120.8214265},{"city":"台中县","province":"台湾","lat":24.1477358,"lng":120.6736482},{"city":"彰化县","province":"台湾","lat":24.0517963,"lng":120.5161352},{"city":"南投县","province":"台湾","lat":23.9609981,"lng":120.9718638},{"city":"云林县","province":"台湾","lat":23.7092033,"lng":120.4313373},{"city":"嘉义县","province":"台湾","lat":23.4518428,"lng":120.2554615},{"city":"台南县","province":"台湾","lat":22.9997281,"lng":120.2270277},{"city":"高雄县","province":"台湾","lat":22.6272784,"lng":120.3014353},{"city":"屏东县","province":"台湾","lat":22.5519759,"lng":120.5487597},{"city":"澎湖县","province":"台湾","lat":23.5711899,"lng":119.5793157},{"city":"台东县","province":"台湾","lat":22.7972447,"lng":121.0713702},{"city":"花莲县","province":"台湾","lat":23.9871589,"lng":121.6015714},{"city":"中西区","province":"香港","lat":22.2730219,"lng":114.1498806},{"city":"东区","province":"香港","lat":37.9248713,"lng":139.0925911},{"city":"九龙城区","province":"香港","lat":22.3232097,"lng":114.1855505},{"city":"观塘区","province":"香港","lat":22.315698,"lng":114.2331057},{"city":"南区","province":"香港","lat":22.2432164,"lng":114.1974398},{"city":"深水埗区","province":"香港","lat":22.3320934,"lng":114.146908},{"city":"黄大仙区","province":"香港","lat":22.3548115,"lng":114.1974398},{"city":"湾仔区","province":"香港","lat":22.2762468,"lng":114.1825781},{"city":"油尖旺区","province":"香港","lat":22.3116028,"lng":114.1706884},{"city":"离岛区","province":"香港","lat":22.2627924,"lng":113.9655419},{"city":"葵青区","province":"香港","lat":22.3549077,"lng":114.1260991},{"city":"北区","province":"香港","lat":35.7528042,"lng":139.7334805},{"city":"西贡区","province":"香港","lat":22.3833893,"lng":114.270976},{"city":"沙田区","province":"香港","lat":22.386408,"lng":114.2093287},{"city":"屯门区","province":"香港","lat":22.3907654,"lng":113.9725161},{"city":"大埔区","province":"香港","lat":22.4423282,"lng":114.165521},{"city":"荃湾区","province":"香港","lat":22.3713227,"lng":114.1141601},{"city":"元朗区","province":"香港","lat":22.4445484,"lng":114.0222095},{"city":"花地玛堂区","province":"澳门","lat":22.2076603,"lng":113.5433129},{"city":"圣安多尼堂区","province":"澳门","lat":22.2008836,"lng":113.5416011},{"city":"大堂区","province":"澳门","lat":22.1904034,"lng":113.5460653},{"city":"望德堂区","province":"澳门","lat":22.1995225,"lng":113.5505295},{"city":"风顺堂区","province":"澳门","lat":22.1877904,"lng":113.5356487},{"city":"氹仔","province":"澳门","lat":22.1596848,"lng":113.563922},{"city":"路环","province":"澳门","lat":22.1260891,"lng":113.563922},{"city":"阿克苏","province":"新疆","lat":41.167548,"lng":80.263387}] 2 | -------------------------------------------------------------------------------- /cawler/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬虫目前是测试版本,代码不完整,请自行补充 3 | """ 4 | 5 | import requests -------------------------------------------------------------------------------- /cawler/connect_db.py: -------------------------------------------------------------------------------- 1 | from neo4j import GraphDatabase, basic_auth 2 | from tqdm import tqdm 3 | 4 | driver = GraphDatabase.driver('bolt://localhost', auth=basic_auth("neo4j", "1231")) 5 | session = driver.session() 6 | with open('people.csv', encoding='utf-8') as f: 7 | text = tqdm(f.readlines()[1:]) 8 | for rel in text: 9 | newsid = rel.split(',')[0] 10 | orgname = rel.split(',')[1].strip('\n') 11 | query = "MATCH (news:News {news_id:" +'"'+ str(newsid) +'"' + "}), (p: Person {person_name:" + '"' + orgname + '"' + "}) MERGE (news) - [r:参与人员] ->(p)" 12 | # print(query) 13 | session.run(query) 14 | session.close() 15 | -------------------------------------------------------------------------------- /cawler/crawler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests\t\t\t\t\t\t# 发起网络请求\n", 10 | "from bs4 import BeautifulSoup\t\t# 解析HTML文本\n", 11 | "import pandas as pd\t\t\t\t\t# 处理数据\n", 12 | "import os\n", 13 | "import time\t\t\t# 处理时间戳\n", 14 | "import json\t\t\t# 用来解析json文本" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 18, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "'''\n", 24 | "用于发起网络请求\n", 25 | "url : Request Url\n", 26 | "kw : Keyword\n", 27 | "page: Page number\n", 28 | "'''\n", 29 | "def fetchUrl(url, kw, page):\n", 30 | " # 请求头\n", 31 | " headers={\n", 32 | " \"Accept\": \"application/json, text/plain, */*\",\n", 33 | " \"Content-Type\": \"application/json;charset=UTF-8\",\n", 34 | " # \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36\",\n", 35 | " \"Referer\": \"https://blog.csdn.net/wenxuhonghe/article/details/113730696\",\n", 36 | " \"Accept-Encoding\" : \"gzip, deflate\",\n", 37 | " \"Accept-Language\": \"zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6\",\n", 38 | " \"Connection\": \"keep-alive\",\n", 39 | " \"Cookie\": \"__jsluid_h=feed647f64ed868f713978e151ffee30; sso_c=0; sfr=1\",\n", 40 | " \"Host\": \"search.people.cn\",\n", 41 | " \"Origin\": \"http://search.people.cn\",\n", 42 | " \"Referer\": \"http://search.people.cn/s/?keyword=%E6%96%87%E5%8C%96&st=0&_=1640775018893\",\n", 43 | " \"User-Agent\" : \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62\"\n", 44 | " }\n", 45 | " # 请求参数\n", 46 | " payloads = {\n", 47 | " \"endTime\": 0,\n", 48 | " \"hasContent\": True,\n", 49 | " \"hasTitle\": True,\n", 50 | " \"isFuzzy\": True,\n", 51 | " \"key\": kw,\n", 52 | " \"limit\": 10,\n", 53 | " \"page\": page,\n", 54 | " \"sortType\": 2,\n", 55 | " \"startTime\": 0,\n", 56 | " \"type\": 0,\n", 57 | " }\n", 58 | "\n", 59 | " # 发起 post 请求\n", 60 | " r = requests.post(url, headers=headers, data=json.dumps(payloads))\n", 61 | " return r.json()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 13, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def parseJson(jsonObj):\n", 71 | " #解析数据\n", 72 | " records = jsonObj[\"data\"][\"records\"]\n", 73 | " for item in records:\n", 74 | " # 这里示例解析了几条,其他数据项如末尾所示,有需要自行解析\n", 75 | " pid = item[\"id\"]\n", 76 | " originalName = item[\"originalName\"]\n", 77 | " belongsName = item[\"belongsName\"]\n", 78 | " content = BeautifulSoup(item[\"content\"], \"html.parser\").text\n", 79 | " displayTime = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(item[\"displayTime\"]/1000))\n", 80 | " subtitle = item[\"subtitle\"]\n", 81 | " title = BeautifulSoup(item[\"title\"], \"html.parser\").text\n", 82 | " url = item[\"url\"]\n", 83 | " \n", 84 | " yield [[pid, title, subtitle, displayTime, originalName, belongsName, content, url]]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 14, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "'''\n", 94 | "用于将数据保存成 csv 格式的文件(以追加的模式)\n", 95 | "path : 保存的路径,若文件夹不存在,则自动创建\n", 96 | "filename: 保存的文件名\n", 97 | "data : 保存的数据内容\n", 98 | "'''\n", 99 | "def saveFile(path, filename, data):\n", 100 | " # 如果路径不存在,就创建路径\n", 101 | " if not os.path.exists(path):\n", 102 | " os.makedirs(path)\n", 103 | " # 保存数据\n", 104 | " dataframe = pd.DataFrame(data)\n", 105 | " dataframe.to_csv(path + filename + \".csv\", encoding='utf_8_sig', mode='a', index=False, sep=',', header=False )" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 22, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "第1页爬取完成\n" 118 | ] 119 | }, 120 | { 121 | "ename": "KeyboardInterrupt", 122 | "evalue": "", 123 | "output_type": "error", 124 | "traceback": [ 125 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 126 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 127 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[0msaveFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"data/\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkw\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"第{}页爬取完成\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpage\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 17\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[1;31m# 爬虫完成提示信息\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 128 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "if __name__ == \"__main__\":\n", 134 | "\t# 起始页,终止页,关键词设置\n", 135 | " start = 1\n", 136 | " end = 2000\n", 137 | " kw = \"文化\"\n", 138 | " # 保存表头行\n", 139 | " headline = [[\"文章id\", \"标题\", \"副标题\", \"发表时间\", \"来源\", \"版面\", \"摘要\", \"链接\"]]\n", 140 | " saveFile(\"data/\", kw, headline)\n", 141 | " #爬取数据\n", 142 | " for page in range(start, end + 1):\n", 143 | " url = \"http://search.people.cn/search-platform/front/search\"\n", 144 | " html = fetchUrl(url, kw, page)\n", 145 | " for data in parseJson(html):\n", 146 | " saveFile(\"data/\", kw, data)\n", 147 | " print(\"第{}页爬取完成\".format(page))\n", 148 | " time.sleep(2)\n", 149 | " \n", 150 | " # 爬虫完成提示信息\n", 151 | " print(\"爬虫执行完毕!数据已保存至以下路径中,请查看!\")\n", 152 | " print(os.getcwd(), \"\\\\data\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/html": [ 163 | "
\n", 164 | "\n", 177 | "\n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | "
标题发表时间版面摘要
0品味传统文化 喜迎中秋佳节2021/9/21 6:57#特别关注#做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引...
1信仰的力量:做红色文化的“追星人”2021/8/7 16:15#滚动#本地#教育#广东频道#做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯...
2版权保护让民族民间文化大放异彩2021/3/10 16:39#采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云#做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族...
3打造“隆平小镇”历史文化街区2021/1/20 18:18#市州县域#做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙...
4忠县:《忠文化干部读本》教材编写协议签订2021/3/25 4:02#区县#座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠...
...............
7897在行走中触摸历史 百年南滨文化探秘活动启动2021/12/8 12:27#要闻#“百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重...
7898第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行2020/8/26 17:51#通辽#8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包...
7899金华历史文化研究成果展示厅启用2021/11/26 9:10#浙江频道#图片#今日热点#11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。
7900四川省第三届残疾人文化艺术节延期举办2021/10/29 8:43#社会#四川频道#10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体...
7901第三届闽东柏柱洋红色文化旅游周在福建福安启动2021/6/24 16:07#新闻列表#转载#福建频道#陈楚楚#综合#宁德#昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。
\n", 267 | "

7902 rows × 4 columns

\n", 268 | "
" 269 | ], 270 | "text/plain": [ 271 | " 标题 发表时间 \\\n", 272 | "0 品味传统文化 喜迎中秋佳节 2021/9/21 6:57 \n", 273 | "1 信仰的力量:做红色文化的“追星人” 2021/8/7 16:15 \n", 274 | "2 版权保护让民族民间文化大放异彩 2021/3/10 16:39 \n", 275 | "3 打造“隆平小镇”历史文化街区 2021/1/20 18:18 \n", 276 | "4 忠县:《忠文化干部读本》教材编写协议签订 2021/3/25 4:02 \n", 277 | "... ... ... \n", 278 | "7897 在行走中触摸历史 百年南滨文化探秘活动启动 2021/12/8 12:27 \n", 279 | "7898 第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行 2020/8/26 17:51 \n", 280 | "7899 金华历史文化研究成果展示厅启用 2021/11/26 9:10 \n", 281 | "7900 四川省第三届残疾人文化艺术节延期举办 2021/10/29 8:43 \n", 282 | "7901 第三届闽东柏柱洋红色文化旅游周在福建福安启动 2021/6/24 16:07 \n", 283 | "\n", 284 | " 版面 \\\n", 285 | "0 #特别关注# \n", 286 | "1 #滚动#本地#教育#广东频道# \n", 287 | "2 #采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云# \n", 288 | "3 #市州县域# \n", 289 | "4 #区县# \n", 290 | "... ... \n", 291 | "7897 #要闻# \n", 292 | "7898 #通辽# \n", 293 | "7899 #浙江频道#图片#今日热点# \n", 294 | "7900 #社会#四川频道# \n", 295 | "7901 #新闻列表#转载#福建频道#陈楚楚#综合#宁德# \n", 296 | "\n", 297 | " 摘要 \n", 298 | "0 做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引... \n", 299 | "1 做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯... \n", 300 | "2 做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族... \n", 301 | "3 做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙... \n", 302 | "4 座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠... \n", 303 | "... ... \n", 304 | "7897    “百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重... \n", 305 | "7898     8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包... \n", 306 | "7899     11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。 \n", 307 | "7900      10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体... \n", 308 | "7901      昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。    \n", 309 | "\n", 310 | "[7902 rows x 4 columns]" 311 | ] 312 | }, 313 | "execution_count": 16, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "import pandas as pd\n", 320 | "data = pd.read_csv('data/文化.csv', encoding='utf-8', usecols=[1,3,5,6])\n", 321 | "data.drop_duplicates(subset='标题', inplace=True)\n", 322 | "pd.set_option('display.max_colwidth', 100)\n", 323 | "data.reset_index().drop('index', axis=1)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 75, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "王小明 B PERSON\n", 336 | "在 O \n", 337 | "北京 B GPE\n", 338 | "的 O \n", 339 | "清华 B ORG\n", 340 | "大学 I ORG\n", 341 | "读书 O \n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "\n", 347 | "from spacy import displacy\n", 348 | "\n", 349 | "import zh_core_web_sm\n", 350 | "\n", 351 | "nlp = zh_core_web_sm.load()\n", 352 | "\n", 353 | "\n", 354 | "def main():\n", 355 | " doc = nlp(\"王小明在北京的清华大学读书\")\n", 356 | " for token in doc:\n", 357 | " print(token.text,token.ent_iob_, token.ent_type_\n", 358 | " )\n", 359 | "\n", 360 | " # displacy.serve(doc)\n", 361 | "\n", 362 | "\n", 363 | "if __name__ == \"__main__\":\n", 364 | " main()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 17, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "data = pd.concat([data, pd.DataFrame(columns=['DATE','ORG','PERSON','GPE'])])" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 18, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "\n", 383 | "data = data.fillna('')" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 9, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/html": [ 394 | "
\n", 395 | "\n", 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | "
标题发表时间版面摘要DATEORGPERSONGPE
0品味传统文化 喜迎中秋佳节2021/9/21 6:57#特别关注#做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引...
4信仰的力量:做红色文化的“追星人”2021/8/7 16:15#滚动#本地#教育#广东频道#做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯...
5版权保护让民族民间文化大放异彩2021/3/10 16:39#采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云#做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族...
6打造“隆平小镇”历史文化街区2021/1/20 18:18#市州县域#做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙...
7忠县:《忠文化干部读本》教材编写协议签订2021/3/25 4:02#区县#座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠...
...........................
18956在行走中触摸历史 百年南滨文化探秘活动启动2021/12/8 12:27#要闻#“百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重...
18957第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行2020/8/26 17:51#通辽#8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包...
18958金华历史文化研究成果展示厅启用2021/11/26 9:10#浙江频道#图片#今日热点#11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。
18959四川省第三届残疾人文化艺术节延期举办2021/10/29 8:43#社会#四川频道#10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体...
18960第三届闽东柏柱洋红色文化旅游周在福建福安启动2021/6/24 16:07#新闻列表#转载#福建频道#陈楚楚#综合#宁德#昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。
\n", 546 | "

7902 rows × 8 columns

\n", 547 | "
" 548 | ], 549 | "text/plain": [ 550 | " 标题 发表时间 \\\n", 551 | "0 品味传统文化 喜迎中秋佳节 2021/9/21 6:57 \n", 552 | "4 信仰的力量:做红色文化的“追星人” 2021/8/7 16:15 \n", 553 | "5 版权保护让民族民间文化大放异彩 2021/3/10 16:39 \n", 554 | "6 打造“隆平小镇”历史文化街区 2021/1/20 18:18 \n", 555 | "7 忠县:《忠文化干部读本》教材编写协议签订 2021/3/25 4:02 \n", 556 | "... ... ... \n", 557 | "18956 在行走中触摸历史 百年南滨文化探秘活动启动 2021/12/8 12:27 \n", 558 | "18957 第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行 2020/8/26 17:51 \n", 559 | "18958 金华历史文化研究成果展示厅启用 2021/11/26 9:10 \n", 560 | "18959 四川省第三届残疾人文化艺术节延期举办 2021/10/29 8:43 \n", 561 | "18960 第三届闽东柏柱洋红色文化旅游周在福建福安启动 2021/6/24 16:07 \n", 562 | "\n", 563 | " 版面 \\\n", 564 | "0 #特别关注# \n", 565 | "4 #滚动#本地#教育#广东频道# \n", 566 | "5 #采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云# \n", 567 | "6 #市州县域# \n", 568 | "7 #区县# \n", 569 | "... ... \n", 570 | "18956 #要闻# \n", 571 | "18957 #通辽# \n", 572 | "18958 #浙江频道#图片#今日热点# \n", 573 | "18959 #社会#四川频道# \n", 574 | "18960 #新闻列表#转载#福建频道#陈楚楚#综合#宁德# \n", 575 | "\n", 576 | " 摘要 \\\n", 577 | "0 做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引... \n", 578 | "4 做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯... \n", 579 | "5 做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族... \n", 580 | "6 做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙... \n", 581 | "7 座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠... \n", 582 | "... ... \n", 583 | "18956    “百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重... \n", 584 | "18957     8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包... \n", 585 | "18958     11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。 \n", 586 | "18959      10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体... \n", 587 | "18960      昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。    \n", 588 | "\n", 589 | " DATE ORG PERSON GPE \n", 590 | "0 \n", 591 | "4 \n", 592 | "5 \n", 593 | "6 \n", 594 | "7 \n", 595 | "... ... .. ... .. \n", 596 | "18956 \n", 597 | "18957 \n", 598 | "18958 \n", 599 | "18959 \n", 600 | "18960 \n", 601 | "\n", 602 | "[7902 rows x 8 columns]" 603 | ] 604 | }, 605 | "execution_count": 9, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "data" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 127, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stderr", 621 | "output_type": "stream", 622 | "text": [ 623 | "7902it [02:34, 51.28it/s]\n" 624 | ] 625 | } 626 | ], 627 | "source": [ 628 | "from tqdm import tqdm\n", 629 | "for i,text in tqdm(enumerate(data['摘要'])):\n", 630 | " doc = nlp(text)\n", 631 | " for token in doc:\n", 632 | " # temp = ''\n", 633 | " if token.ent_iob_=='B':\n", 634 | " if token.ent_type_=='DATE':\n", 635 | " data.iloc[i,4] = data.iloc[i,4]+';'+token.text\n", 636 | " elif token.ent_type_ =='ORG':\n", 637 | " data.iloc[i,5] = data.iloc[i,5]+';'+token.text\n", 638 | " elif token.ent_type_ =='PERSON':\n", 639 | " data.iloc[i,6] = data.iloc[i,6]+';'+token.text\n", 640 | " elif token.ent_type_ =='GPE':\n", 641 | " data.iloc[i,7] = data.iloc[i,7]+';'+token.text\n", 642 | " elif token.ent_iob_=='I':\n", 643 | " if token.ent_type_=='DATE':\n", 644 | " data.iloc[i,4] = data.iloc[i,4]+token.text\n", 645 | " elif token.ent_type_ =='ORG':\n", 646 | " data.iloc[i,5] = data.iloc[i,5]+token.text\n", 647 | " elif token.ent_type_ =='PERSON':\n", 648 | " data.iloc[i,6] = data.iloc[i,6]+token.text\n", 649 | " elif token.ent_type_ =='GPE':\n", 650 | " data.iloc[i,7] = data.iloc[i,7]+token.text" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 128, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "data": { 660 | "text/html": [ 661 | "
\n", 662 | "\n", 675 | "\n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | "
标题发表时间版面摘要DATEORGPERSONGPE
0品味传统文化 喜迎中秋佳节2021/9/21 6:57#特别关注#做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引...;五号;北京市石景山区;江西省博物馆;中国航天;云南分公司;赏晚会;天南地北
4信仰的力量:做红色文化的“追星人”2021/8/7 16:15#滚动#本地#教育#广东频道#做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯...;广东医青年;广东医科大学;广东医青年;徐畅;造星;栏目;广东省
5版权保护让民族民间文化大放异彩2021/3/10 16:39#采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云#做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族...;彰显贵州文化特色和人文;贵州;贵州
6打造“隆平小镇”历史文化街区2021/1/20 18:18#市州县域#做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙...;1月18日;郑明;今年;郑明;长沙市;芙蓉区政协;郑明津;郑明;陈新;长沙
7忠县:《忠文化干部读本》教材编写协议签订2021/3/25 4:02#区县#座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠...;重庆市委党校;丰富忠
...........................
18956在行走中触摸历史 百年南滨文化探秘活动启动2021/12/8 12:27#要闻#“百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重...;百年;12月8日;百年;百年;第一个周末;百年;重庆市文物;重庆市导游协会;重庆市文物;重庆;重庆;南滨路
18957第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行2020/8/26 17:51#通辽#8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包...;25日;三天;市政府;中国艺术摄影学会;科尔沁美食品鉴会;张怡致
18958金华历史文化研究成果展示厅启用2021/11/26 9:10#浙江频道#图片#今日热点#11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。;11月25日;第一天;金华历史文化研究成果
18959四川省第三届残疾人文化艺术节延期举办2021/10/29 8:43#社会#四川频道#10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体...;11月15日至19日;四川省
18960第三届闽东柏柱洋红色文化旅游周在福建福安启动2021/6/24 16:07#新闻列表#转载#福建频道#陈楚楚#综合#宁德#昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。;昨日;福安市;柏柱洋;柏柱洋
\n", 813 | "

7902 rows × 8 columns

\n", 814 | "
" 815 | ], 816 | "text/plain": [ 817 | " 标题 发表时间 \\\n", 818 | "0 品味传统文化 喜迎中秋佳节 2021/9/21 6:57 \n", 819 | "4 信仰的力量:做红色文化的“追星人” 2021/8/7 16:15 \n", 820 | "5 版权保护让民族民间文化大放异彩 2021/3/10 16:39 \n", 821 | "6 打造“隆平小镇”历史文化街区 2021/1/20 18:18 \n", 822 | "7 忠县:《忠文化干部读本》教材编写协议签订 2021/3/25 4:02 \n", 823 | "... ... ... \n", 824 | "18956 在行走中触摸历史 百年南滨文化探秘活动启动 2021/12/8 12:27 \n", 825 | "18957 第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行 2020/8/26 17:51 \n", 826 | "18958 金华历史文化研究成果展示厅启用 2021/11/26 9:10 \n", 827 | "18959 四川省第三届残疾人文化艺术节延期举办 2021/10/29 8:43 \n", 828 | "18960 第三届闽东柏柱洋红色文化旅游周在福建福安启动 2021/6/24 16:07 \n", 829 | "\n", 830 | " 版面 \\\n", 831 | "0 #特别关注# \n", 832 | "4 #滚动#本地#教育#广东频道# \n", 833 | "5 #采编人员信息统计#贵州频道#贵州文教#频道稿件编辑统计查询#顾兰云# \n", 834 | "6 #市州县域# \n", 835 | "7 #区县# \n", 836 | "... ... \n", 837 | "18956 #要闻# \n", 838 | "18957 #通辽# \n", 839 | "18958 #浙江频道#图片#今日热点# \n", 840 | "18959 #社会#四川频道# \n", 841 | "18960 #新闻列表#转载#福建频道#陈楚楚#综合#宁德# \n", 842 | "\n", 843 | " 摘要 \\\n", 844 | "0 做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引... \n", 845 | "4 做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯... \n", 846 | "5 做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族... \n", 847 | "6 做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙... \n", 848 | "7 座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠... \n", 849 | "... ... \n", 850 | "18956    “百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重... \n", 851 | "18957     8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包... \n", 852 | "18958     11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。 \n", 853 | "18959      10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体... \n", 854 | "18960      昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。    \n", 855 | "\n", 856 | " DATE ORG PERSON \\\n", 857 | "0 ;五号 ;北京市石景山区;江西省博物馆;中国航天;云南分公司 ;赏晚会 \n", 858 | "4 ;广东医青年;广东医科大学;广东医青年 ;徐畅;造星;栏目 \n", 859 | "5 ;彰显贵州文化特色和人文 \n", 860 | "6 ;1月18日;郑明;今年;郑明 ;长沙市;芙蓉区政协 ;郑明津;郑明;陈新 \n", 861 | "7 ;重庆市委党校;丰富忠 \n", 862 | "... ... ... ... \n", 863 | "18956 ;百年;12月8日;百年;百年;第一个周末;百年 ;重庆市文物;重庆市导游协会;重庆市文物 \n", 864 | "18957 ;25日;三天 ;市政府;中国艺术摄影学会;科尔沁美食品鉴会 ;张怡致 \n", 865 | "18958 ;11月25日;第一天 ;金华历史文化研究成果 \n", 866 | "18959 ;11月15日至19日 \n", 867 | "18960 ;昨日 \n", 868 | "\n", 869 | " GPE \n", 870 | "0 ;天南地北 \n", 871 | "4 ;广东省 \n", 872 | "5 ;贵州;贵州 \n", 873 | "6 ;长沙 \n", 874 | "7 \n", 875 | "... ... \n", 876 | "18956 ;重庆;重庆;南滨路 \n", 877 | "18957 \n", 878 | "18958 \n", 879 | "18959 ;四川省 \n", 880 | "18960 ;福安市;柏柱洋;柏柱洋 \n", 881 | "\n", 882 | "[7902 rows x 8 columns]" 883 | ] 884 | }, 885 | "execution_count": 128, 886 | "metadata": {}, 887 | "output_type": "execute_result" 888 | } 889 | ], 890 | "source": [ 891 | "data" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 129, 897 | "metadata": {}, 898 | "outputs": [], 899 | "source": [ 900 | "data.to_csv('pro1.csv')" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": 111, 906 | "metadata": {}, 907 | "outputs": [], 908 | "source": [ 909 | "data1 = pd.read_csv('1.csv')" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 114, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [ 918 | "data1 = data1.drop('Column1', axis=1)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 19, 924 | "metadata": {}, 925 | "outputs": [], 926 | "source": [ 927 | "import spacy\n", 928 | "nlp = spacy.load('zh_core_web_lg')" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 51, 934 | "metadata": {}, 935 | "outputs": [ 936 | { 937 | "name": "stderr", 938 | "output_type": "stream", 939 | "text": [ 940 | "7902it [02:35, 50.79it/s]\n" 941 | ] 942 | } 943 | ], 944 | "source": [ 945 | "from tqdm import tqdm\n", 946 | "type_list = ['DATE', 'ORG', 'PERSON','GPE']\n", 947 | "for j,text in tqdm(enumerate(data['摘要'])):\n", 948 | " ent_dict ={'DATE':[],'ORG':[],'PERSON':[],'GPE':[]}\n", 949 | " doc = nlp(text)\n", 950 | " for token in doc.ents:\n", 951 | " if token.label_ in type_list:\n", 952 | " idx = type_list.index(token.label_)\n", 953 | " ent_dict[token.label_].append(token.text)\n", 954 | " for i in type_list:\n", 955 | " ent_dict[i]=list(set(ent_dict[i]))\n", 956 | " data.iloc[j,4]=';'.join(ent_dict['DATE'])\n", 957 | " data.iloc[j,5]=';'.join(ent_dict['ORG'])\n", 958 | " data.iloc[j,6]=';'.join(ent_dict['PERSON'])\n", 959 | " data.iloc[j,7]=';'.join(ent_dict['GPE'])" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 54, 965 | "metadata": {}, 966 | "outputs": [], 967 | "source": [ 968 | "data.to_csv('culture.csv')" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 58, 974 | "metadata": {}, 975 | "outputs": [], 976 | "source": [ 977 | "data.reset_index().drop(['版面','index'], axis=1).to_csv('culture.csv')" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 60, 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "data": { 987 | "text/html": [ 988 | "
\n", 989 | "\n", 1002 | "\n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | "
标题发表时间摘要DATEORGPERSONGPE
0品味传统文化 喜迎中秋佳节2021/9/21 6:57做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引...五号;中秋;中秋节江西省博物馆;中建二局;内蒙古博物院线上北京市;石景山区;云南;中国
1信仰的力量:做红色文化的“追星人”2021/8/7 16:15做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯...广东医科大学徐畅;中诉广东省;广东
2版权保护让民族民间文化大放异彩2021/3/10 16:39做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族...贵州
3打造“隆平小镇”历史文化街区2021/1/20 18:18做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙...1月18日;今年芙蓉区政协;长沙市政协;驻街单位;农科院郑明津;陈新;郑明长沙;长沙市
4忠县:《忠文化干部读本》教材编写协议签订2021/3/25 4:02座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠...市委党校;忠县忠文化干部学院;重庆市委党校弘扬忠忠州;忠县
........................
7897在行走中触摸历史 百年南滨文化探秘活动启动2021/12/8 12:27“百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重...第一个周末;12月8日;百年重庆金牌导游童思斯;重庆市导游协会;重庆市文物保护志愿者服务总队;人民网重庆
7898第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行2020/8/26 17:518月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包...8月25日;三天科尔沁美食品;市政府;中国艺术摄影学会张怡致科左;阿古拉
7899金华历史文化研究成果展示厅启用2021/11/26 9:1011月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。第一天;11月25日金华
7900四川省第三届残疾人文化艺术节延期举办2021/10/29 8:4310月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体...10月28日;11月15日至19日四川省;南充市
7901第三届闽东柏柱洋红色文化旅游周在福建福安启动2021/6/24 16:07昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。柏柱洋闽东;福安市
\n", 1128 | "

7902 rows × 7 columns

\n", 1129 | "
" 1130 | ], 1131 | "text/plain": [ 1132 | " 标题 发表时间 \\\n", 1133 | "0 品味传统文化 喜迎中秋佳节 2021/9/21 6:57 \n", 1134 | "1 信仰的力量:做红色文化的“追星人” 2021/8/7 16:15 \n", 1135 | "2 版权保护让民族民间文化大放异彩 2021/3/10 16:39 \n", 1136 | "3 打造“隆平小镇”历史文化街区 2021/1/20 18:18 \n", 1137 | "4 忠县:《忠文化干部读本》教材编写协议签订 2021/3/25 4:02 \n", 1138 | "... ... ... \n", 1139 | "7897 在行走中触摸历史 百年南滨文化探秘活动启动 2021/12/8 12:27 \n", 1140 | "7898 第三届敖包相会文化旅游节在后旗阿古拉文化旅游主题小镇举行 2020/8/26 17:51 \n", 1141 | "7899 金华历史文化研究成果展示厅启用 2021/11/26 9:10 \n", 1142 | "7900 四川省第三届残疾人文化艺术节延期举办 2021/10/29 8:43 \n", 1143 | "7901 第三届闽东柏柱洋红色文化旅游周在福建福安启动 2021/6/24 16:07 \n", 1144 | "\n", 1145 | " 摘要 \\\n", 1146 | "0 做月饼、猜灯谜、赏晚会……中秋佳节来临,各地举办精彩纷呈的文化活动,人们在游览体验中感受传统节日的文化魅力,乐享美好假期。北京市石景山区五里坨街道新时代文明实践所文化广场举办中秋节游园会,吸引... \n", 1147 | "1 做红色文化的追星者   在众多“追星”团队中,有一支名叫“灯塔筑梦”的特殊队伍,他们既不是根据学院、专业构成的实践队伍,又不是按照特定条件招募组建的,团队的队员有一个共同点——都来自“青马灯... \n", 1148 | "2 做好民族民间文化的版权保护工作是激活文化资产价值的有力手段。乡村振兴战略实施中,人们对精神文化的需求将会更大,通过实施版权保护和运用,不但可以进一步增加优秀文化资源的供给,还可以进一步提高民族... \n", 1149 | "3 做好“农”字文章—— 打造“隆平小镇”历史文化街区 “芙蓉区聚焦农业优势,大有可为!”1月18日,谈起长沙市芙蓉区优势,长沙市政协委员、芙蓉区政协主席郑明津津乐道。通过调研,郑明在今年长沙... \n", 1150 | "4 座谈会前,调研组一行到作为忠文化“忠于事业”教学点的忠州腐乳厂,实地查看忠州豆腐乳生产和销售情况,了解企业秉持忠文化精神,感受忠文化在企业发展中的魅力。   随后,重庆市委党校与忠县签订《忠... \n", 1151 | "... ... \n", 1152 | "7897    “百年南滨”文化深度探秘活动启动。重庆市文物保护志愿者服务总队供图        人民网重庆12月8日电 近日,“百年南滨”文化深度探秘活动在南岸区南滨文化产业园启动。在文保志愿者、重... \n", 1153 | "7898     8月25日,第三届敖包相会文化旅游节在科左后旗阿古拉文化旅游主题小镇拉开帷幕。本次文化旅游节为期三天,由市政府主办,市文旅广电局、科左后旗人民政府承办。副市长张怡致辞并宣布第三届敖包... \n", 1154 | "7899     11月25日,金华历史文化研究成果展示厅启用第一天,就吸引了不少市民前往参观,还有研究者来此查阅文献资料。展示厅内,金华历史文化之脉徐徐展开,述说着金华长期以来历史文化研究取得的丰硕成果。 \n", 1155 | "7900      10月28日,封面新闻记者从四川省第三届残疾人文化艺术节组委会获悉,为切实做好当前疫情防控工作,原定于11月15日至19日举办的四川省第三届残疾人文化艺术节,延期在南充市举办,具体... \n", 1156 | "7901      昨日,在福安市溪柄镇柏柱洋,斗面村村口的闽东苏区纪念馆一片欢歌笑语,第三届闽东柏柱洋红色文化旅游周在这里启动。    \n", 1157 | "\n", 1158 | " DATE ORG PERSON \\\n", 1159 | "0 五号;中秋;中秋节 江西省博物馆;中建二局;内蒙古博物院线上 \n", 1160 | "1 广东医科大学 徐畅;中诉 \n", 1161 | "2 \n", 1162 | "3 1月18日;今年 芙蓉区政协;长沙市政协;驻街单位;农科院 郑明津;陈新;郑明 \n", 1163 | "4 市委党校;忠县忠文化干部学院;重庆市委党校 弘扬忠 \n", 1164 | "... ... ... ... \n", 1165 | "7897 第一个周末;12月8日;百年 重庆金牌导游童思斯;重庆市导游协会;重庆市文物保护志愿者服务总队;人民网 \n", 1166 | "7898 8月25日;三天 科尔沁美食品;市政府;中国艺术摄影学会 张怡致 \n", 1167 | "7899 第一天;11月25日 \n", 1168 | "7900 10月28日;11月15日至19日 \n", 1169 | "7901 柏柱洋 \n", 1170 | "\n", 1171 | " GPE \n", 1172 | "0 北京市;石景山区;云南;中国 \n", 1173 | "1 广东省;广东 \n", 1174 | "2 贵州 \n", 1175 | "3 长沙;长沙市 \n", 1176 | "4 忠州;忠县 \n", 1177 | "... ... \n", 1178 | "7897 重庆 \n", 1179 | "7898 科左;阿古拉 \n", 1180 | "7899 金华 \n", 1181 | "7900 四川省;南充市 \n", 1182 | "7901 闽东;福安市 \n", 1183 | "\n", 1184 | "[7902 rows x 7 columns]" 1185 | ] 1186 | }, 1187 | "execution_count": 60, 1188 | "metadata": {}, 1189 | "output_type": "execute_result" 1190 | } 1191 | ], 1192 | "source": [ 1193 | "data.reset_index().drop(['版面','index'], axis=1)" 1194 | ] 1195 | } 1196 | ], 1197 | "metadata": { 1198 | "interpreter": { 1199 | "hash": "79f39d06ccac4f0a6b3d11477479a3d7a8039e2bca6f83223d20136807394865" 1200 | }, 1201 | "kernelspec": { 1202 | "display_name": "Python 3.7.12 64-bit ('torch': conda)", 1203 | "language": "python", 1204 | "name": "python3" 1205 | }, 1206 | "language_info": { 1207 | "codemirror_mode": { 1208 | "name": "ipython", 1209 | "version": 3 1210 | }, 1211 | "file_extension": ".py", 1212 | "mimetype": "text/x-python", 1213 | "name": "python", 1214 | "nbconvert_exporter": "python", 1215 | "pygments_lexer": "ipython3", 1216 | "version": "3.7.12" 1217 | }, 1218 | "orig_nbformat": 4 1219 | }, 1220 | "nbformat": 4, 1221 | "nbformat_minor": 2 1222 | } 1223 | -------------------------------------------------------------------------------- /cawler/crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | from datetime import datetime 4 | 5 | import requests # 发起网络请求 6 | from bs4 import BeautifulSoup # 解析HTML文本 7 | import pandas as pd # 处理数据 8 | import os 9 | import time # 处理时间戳 10 | import json # 用来解析json文本 11 | 12 | from tqdm import tqdm 13 | 14 | ''' 15 | 用于发起网络请求 16 | url : Request Url 17 | kw : Keyword 18 | page: Page number 19 | ''' 20 | 21 | 22 | def fetchUrl(url, kw, page): 23 | # 请求头 24 | headers = { 25 | "Accept": "application/json, text/plain, */*", 26 | "Content-Type": "application/json;charset=UTF-8", 27 | # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36", 28 | "Accept-Encoding": "gzip, deflate", 29 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 30 | "Connection": "keep-alive", 31 | "Cookie": "__jsluid_h=feed647f64ed868f713978e151ffee30; sso_c=0; sfr=1", 32 | "Host": "search.people.cn", 33 | "Origin": "http://search.people.cn", 34 | "Referer": "http://search.people.cn/s/?keyword=%E6%96%87%E5%8C%96&st=0&_=1640775018893", 35 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62" 36 | } 37 | # 请求参数 38 | payloads = { 39 | "endTime": 0, 40 | "hasContent": True, 41 | "hasTitle": True, 42 | "isFuzzy": True, 43 | "key": kw, 44 | "limit": 10, 45 | "page": page, 46 | "sortType": 2, 47 | "startTime": 0, 48 | "type": 0, 49 | } 50 | 51 | # 发起 post 请求 52 | r = requests.post(url, headers=headers, data=json.dumps(payloads)) 53 | return r.json() 54 | 55 | 56 | def parseJson(jsonObj): 57 | # 解析数据 58 | records = jsonObj["data"]["records"] 59 | for item in records: 60 | # 这里示例解析了几条,其他数据项如末尾所示,有需要自行解析 61 | pid = item["id"] 62 | originalName = item["originalName"] 63 | belongsName = item["belongsName"] 64 | content = BeautifulSoup(item["content"], "html.parser").text 65 | displayTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item["displayTime"] / 1000)) 66 | subtitle = item["subtitle"] 67 | title = BeautifulSoup(item["title"], "html.parser").text 68 | url = item["url"] 69 | 70 | yield [[pid, title, subtitle, displayTime, originalName, belongsName, content, url]] 71 | 72 | 73 | ''' 74 | 用于将数据保存成 csv 格式的文件(以追加的模式) 75 | path : 保存的路径,若文件夹不存在,则自动创建 76 | filename: 保存的文件名 77 | data : 保存的数据内容 78 | ''' 79 | 80 | 81 | def saveFile(path, filename, data): 82 | # 如果路径不存在,就创建路径 83 | if not os.path.exists(path): 84 | os.makedirs(path) 85 | # 保存数据 86 | dataframe = pd.DataFrame(data) 87 | dataframe.to_csv(path + filename + ".csv", encoding='utf_8_sig', mode='a', index=False, sep=',', header=False) 88 | 89 | 90 | if __name__ == "__main__": 91 | # 起始页,终止页,关键词设置 92 | start = 1 93 | end = 200 94 | kw_list = ["文化","经济","科技","农业","乡村","城镇","政治","历史","青年","人民","人民日报","十九届六中全会","进博会","抗疫","脱贫攻坚","数字经济","中国", 95 | "百年","疫苗","碳中和","反垄断","三胎","人口","生育","光刻机技术","5G","东数西算","光伏","新能源","芯片","反诈","饭圈","流量","税","退休","国际", 96 | "外交","美国","俄罗斯","气候","环境","环保","大小周","加班","躺平","创新","美丽中国","乡村振兴","品牌","主旋律","体育","运动","行业","责任","躺平", 97 | "旅游","冬奥","时代","生态","生活","知识","智慧","初心","工人","主义","鲁迅","贸易","金融","资本","国家","居民","数据","信息","安全","军事","边防", 98 | "法制","法治","年轻人","生活","奋斗","宗旨","发展","区域","城市化","财政","年龄","养老","社会","腐败","反腐","教育","菜","粮食","医疗","健康","市场", 99 | "时代","鲁迅","建党","铭记","不忘","纪念","工业","科学","交通","人工智能","淄博","全国统一","德","企业","中国式","外交","制度","行业","文化","孔乙己"] 100 | # 保存表头行 101 | headline = [["文章id", "标题", "副标题", "发表时间", "来源", "版面", "摘要", "链接"]] 102 | for kw in kw_list: 103 | saveFile("data/", "daily_" + kw, headline) 104 | # 爬取数据 105 | bar = tqdm(range(start, end + 1), 106 | desc=f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Word tokenisation", 107 | total=end + 1, 108 | ncols=150) 109 | for page in bar: 110 | url = "http://search.people.cn/search-platform/front/search" 111 | html = fetchUrl(url, kw, page) 112 | for data in parseJson(html): 113 | if datetime.strptime(data[0][3], '%Y-%m-%d %H:%M:%S').year < 2021: 114 | continue 115 | saveFile("data/", kw, data) 116 | print("第{}页爬取完成".format(page)) 117 | time.sleep(2) 118 | 119 | # 爬虫完成提示信息 120 | print("人民网爬虫执行完毕。数据已保存至以下路径中,请查看:") 121 | print(os.getcwd(), "\\data") 122 | -------------------------------------------------------------------------------- /cawler/pre_data.py: -------------------------------------------------------------------------------- 1 | # python批量更换后缀名 2 | import datetime 3 | import os 4 | import sys 5 | 6 | from tqdm import tqdm 7 | 8 | os.chdir(r'D:\桌面\创作\AI预测高考作文\Spider-People-s-daily') 9 | 10 | # 列出当前目录下所有的文件 11 | files = os.listdir('./') 12 | print('files', files) 13 | 14 | for root, dirs, files in os.walk(r'D:\桌面\创作\AI预测高考作文\Spider-People-s-daily\data'): 15 | print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>2", root, dirs, files) 16 | input_col = [] 17 | output_col = [] 18 | count = 0 19 | bar = tqdm(files, 20 | total=len(files), 21 | desc=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Process NFT, POAP, Donation", 22 | ncols=150) 23 | for filename in bar: 24 | print('filename', filename) 25 | portion = os.path.splitext(filename) 26 | os.chdir(root) 27 | # 如果后缀是.dat 28 | if portion[1] == ".csv": 29 | #把原文件后缀名改为 txt 30 | newName = portion[0] + ".csv" 31 | os.renames(filename, newName) 32 | 33 | 34 | ### 提取摘要-这部分放在Tokenization函数里 35 | def tmp(): 36 | import jieba,os,re 37 | from gensim import corpora, models, similarities 38 | 39 | """创建停用词列表""" 40 | def stopwordslist(): 41 | stopwords = [line.strip() for line in open('../stopwords.txt', encoding='UTF-8').readlines()] 42 | return stopwords 43 | 44 | """对句子进行中文分词""" 45 | def seg_depart(sentence): 46 | sentence_depart = jieba.cut(sentence.strip()) 47 | stopwords = stopwordslist() 48 | outstr = '' 49 | for word in sentence_depart: 50 | if word not in stopwords: 51 | outstr += word 52 | outstr += " " 53 | return outstr 54 | 55 | train = [] 56 | for j,line in tqdm(enumerate(df["摘要"])): 57 | line = re.sub(r'[^\u4e00-\u9fa5]+','',line) 58 | line_seg = seg_depart(line.strip()) 59 | line = [word.strip() for word in line_seg.split(' ')] 60 | train.append(line[:-1]) 61 | -------------------------------------------------------------------------------- /cawler/process_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | data = pd.read_csv("data/文化.csv", encoding="utf-8", usecols=[1,3,5,6]) 9 | data.drop_duplicates(subset="标题", inplace=True) 10 | pd.set_option("display.max_colwidth", 100) 11 | data 12 | # data['index']=range(7902) 13 | 14 | 15 | # In[2]: 16 | 17 | 18 | data.reset_index().drop('index',axis=1) 19 | 20 | 21 | # In[3]: 22 | 23 | 24 | data['id']=range(7902) 25 | data 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | data = pd.concat([data, pd.DataFrame(columns=["ORG","PERSON","GPE-P","GPE-C","TOPIC"])]) 32 | data = data.fillna("") 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df = data.reset_index().drop(["index","版面"],axis=1) 39 | 40 | 41 | # In[6]: 42 | 43 | 44 | df[['id']] = df[['id']].astype(int) 45 | 46 | 47 | # In[7]: 48 | 49 | 50 | df.head() 51 | 52 | 53 | # ### 时间处理函数 54 | 55 | # In[8]: 56 | 57 | 58 | df["DATE"] = pd.to_datetime(df["发表时间"], format="%Y/%m/%d").dt.date 59 | 60 | 61 | # In[9]: 62 | 63 | 64 | time_df = df[['id','标题','摘要']] 65 | time_df.to_csv('basic_information.csv') 66 | 67 | 68 | # ### 地区处理 69 | 70 | # In[10]: 71 | 72 | 73 | import json 74 | with open("CitiesChina.json","r", encoding="UTF-8") as f: 75 | city_dict = json.load(f) 76 | 77 | 78 | # In[12]: 79 | 80 | 81 | province_list = ["河北","山西","辽宁","吉林","黑龙江","江苏","浙江","安徽","福建","江西","山东","河南","湖北","湖南","广东","海南","四川","贵州","云南","陕西","甘肃","青海","台湾","上海","北京","天津","重庆"] 82 | 83 | 84 | # In[11]: 85 | 86 | 87 | def find_province(city): 88 | if city[-1:]=="省": 89 | return 1,city[:-1] 90 | if city in province_list: 91 | return 1,city 92 | if city[-1:] in ["市","区"]: 93 | city = city[:-1] 94 | for loc in city_dict: 95 | if loc["city"] == city: 96 | return 2,loc["province"] 97 | return 0,city 98 | 99 | 100 | # ### 提取实体 101 | 102 | # In[13]: 103 | 104 | 105 | find_province("郑州") 106 | 107 | 108 | # In[11]: 109 | 110 | 111 | import spacy 112 | nlp = spacy.load("zh_core_web_lg") 113 | from tqdm import tqdm 114 | type_list = ["ORG", "PERSON","GPE-P","GPE-C"] 115 | for j,text in tqdm(enumerate(df["摘要"])): 116 | ent_dict ={"ORG":[],"PERSON":[],"GPE-P":[],"GPE-C":[]} 117 | doc = nlp(text) 118 | for token in doc.ents: 119 | if token.label_ in type_list: 120 | idx = type_list.index(token.label_) 121 | ent_dict[token.label_].append(token.text.replace("习近","习近平")) 122 | elif token.label_ == "GPE": 123 | # 去中国等字 124 | if token.text in ["中华人民共和国","中国","中华"]: 125 | pass 126 | #找省份 127 | elif find_province(token.text)[0] == 1: 128 | ent_dict["GPE-P"].append(find_province(token.text)[1]) 129 | # ent_dict['GPE-C'].append(token.text) 130 | elif find_province(token.text)[0] == 2: 131 | ent_dict["GPE-P"].append(find_province(token.text)[1]) 132 | ent_dict["GPE-C"].append(find_province(token.text)[1]+token.text) 133 | # ent_dict['GPE-C'].append(token.text) 134 | elif find_province(token.text)[0] == 0: 135 | ent_dict['GPE-C'].append(token.text) 136 | for i in type_list: 137 | ent_dict[i]=list(set(ent_dict[i])) 138 | df.iloc[j,3]=" ".join(ent_dict["ORG"]) 139 | df.iloc[j,4]=" ".join(ent_dict["PERSON"]) 140 | df.iloc[j,5]=" ".join(ent_dict["GPE-P"]) 141 | df.iloc[j,6]=" ".join(ent_dict["GPE-C"]) 142 | 143 | 144 | # In[14]: 145 | 146 | 147 | # 写入csv 148 | import spacy 149 | nlp = spacy.load("zh_core_web_lg") 150 | from tqdm import tqdm 151 | with open('location.txt','w',encoding='utf-8') as f: 152 | f.write('id'+'province'+'\n') 153 | for j,text in tqdm(enumerate(df["摘要"])): 154 | j = str(j) 155 | doc = nlp(text) 156 | for token in doc.ents: 157 | if token.label_ =='GPE': 158 | if token.text in ["中华人民共和国","中国","中华"]: 159 | pass 160 | elif find_province(token.text)[0] == 1: 161 | f.write(j+','+find_province(token.text)[1]+'\n') 162 | # ent_dict['GPE-C'].append(token.text) 163 | elif find_province(token.text)[0] == 2: 164 | f.write(j+','+find_province(token.text)[1]+'\n') 165 | # ent_dict['GPE-C'].append(token.text) 166 | elif find_province(token.text)[0] == 0: 167 | f.write(j+','+token.text+'\n') 168 | 169 | 170 | # In[69]: 171 | 172 | 173 | f =open('people.csv','w',encoding='utf-8') 174 | s = open('org.csv','w',encoding='utf-8') 175 | f.write('id'+','+'people'+'\n') 176 | s.write('id'+','+'org'+'\n') 177 | for j,text in tqdm(enumerate(df["摘要"])): 178 | j = str(j) 179 | doc = nlp(text) 180 | for token in doc.ents: 181 | if token.label_ =='PERSON': 182 | f.write(j+','+token.text+'\n') 183 | if token.label_ =='ORG': 184 | s.write(j+','+token.text+'\n') 185 | f.close() 186 | s.close() 187 | 188 | 189 | # ### 提取摘要 190 | import jieba,os,re 191 | from gensim import corpora, models, similarities 192 | 193 | """创建停用词列表""" 194 | def stopwordslist(): 195 | stopwords = [line.strip() for line in open('../stopwords.txt', encoding='UTF-8').readlines()] 196 | return stopwords 197 | 198 | """对句子进行中文分词""" 199 | def seg_depart(sentence): 200 | sentence_depart = jieba.cut(sentence.strip()) 201 | stopwords = stopwordslist() 202 | outstr = '' 203 | for word in sentence_depart: 204 | if word not in stopwords: 205 | outstr += word 206 | outstr += " " 207 | return outstr 208 | 209 | train = [] 210 | for j,line in tqdm(enumerate(df["摘要"])): 211 | line = re.sub(r'[^\u4e00-\u9fa5]+','',line) 212 | line_seg = seg_depart(line.strip()) 213 | line = [word.strip() for word in line_seg.split(' ')] 214 | train.append(line[:-1]) 215 | 216 | 217 | """构建词频矩阵,训练LDA模型""" 218 | dictionary = corpora.Dictionary(train) 219 | # corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...] 220 | # corpus是把每条新闻ID化后的结果,每个元素是新闻中的每个词语,在字典中的ID和频率 221 | corpus = [dictionary.doc2bow(text) for text in train] 222 | 223 | lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10) 224 | topic_list = lda.print_topics(10) 225 | print("20个主题的单词分布为:\n") 226 | for topic in topic_list: 227 | print(topic) 228 | 229 | 230 | -------------------------------------------------------------------------------- /examples.py: -------------------------------------------------------------------------------- 1 | import datetime, shutil 2 | import logging 3 | import os 4 | from typing import Dict, Any, List 5 | import openai 6 | import jsonlines 7 | 8 | TEST_RESULT = "./test_result/examples.txt" 9 | os.environ["http_proxy"] = "127.0.0.1:7890" 10 | os.environ["https_proxy"] = "127.0.0.1:7890" 11 | 12 | def get_chat_response(title: str) -> str: 13 | """ 14 | 加入prompt话术范文写作,获取GPT-4模型的返回结果 15 | :param array: 16 | :param title: str 17 | :return: 18 | """ 19 | global response 20 | openai.api_key = "your_key" 21 | 22 | # Make a request to the ChatGPT API 23 | messages = [{"role": "system", "content": "你是一个正在参加中国高考的考生,请基于用户输入的命题,用中文写出一篇800字左右的高考作文。 " 24 | "作文必须贴合主题,首尾呼应,结构匀称,立意明确,中心突出,感情真挚,语言流畅,意境深远, " 25 | "引经据典,善于运用修辞方法,构思精巧,见解新颖,具有积极作用。"}, 26 | {"role": 'user', "content": title}] 27 | # print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Load {len(messages)} few-shot data.") 28 | 29 | try: 30 | response = openai.ChatCompletion.create( 31 | model="gpt-4", 32 | messages=messages, 33 | temperature=0.75, 34 | max_tokens=2048, 35 | top_p=1, 36 | frequency_penalty=1, 37 | presence_penalty=0, 38 | ) 39 | except Exception as e: 40 | response = openai.ChatCompletion.create( 41 | model="gpt-3.5-turbo", 42 | messages=messages, 43 | temperature=0.75, 44 | max_tokens=2048, 45 | top_p=1, 46 | frequency_penalty=1, 47 | presence_penalty=0, 48 | ) 49 | logging.warning(e) 50 | finally: 51 | # Print the generated code 52 | print(response["choices"][0]["message"]['content'].strip()) 53 | with jsonlines.open(TEST_RESULT + f"{datetime.datetime.now().strftime('%Y-%m-%d-%H')}" + ".jsonl", 54 | mode='a') as writer: 55 | writer.write({"title": title, "essay": response["choices"][0]["message"]['content'].strip()}) 56 | 57 | 58 | def main(): 59 | inputs = input("请输入高考作文题目:") 60 | get_chat_response(inputs) 61 | 62 | if __name__ == "__main__": 63 | main() -------------------------------------------------------------------------------- /khan2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Turing-Project/EssayTopicPredictV2/030e852d362b333c4566dde1be9a682c48eb0e7e/khan2020.pdf -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import argparse 3 | import logging 4 | import os, langid, datetime, re 5 | from copy import deepcopy 6 | from typing import List 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from numba import Object 11 | from tqdm import tqdm 12 | from bertopic import BERTopic 13 | 14 | from flair.embeddings import TransformerDocumentEmbeddings 15 | 16 | INPUT_PATH = os.path.dirname(os.path.realpath(__file__)) + '/dataset/' 17 | MODEL_PATH = os.path.dirname(os.path.realpath(__file__)) + '/model/' 18 | OUTPUT_PATH = os.path.dirname(os.path.realpath(__file__)) + '/output/' 19 | TMP_PATH = os.path.dirname(os.path.realpath(__file__)) + '/tmp/' 20 | BAN_WORD_PATH = os.path.dirname(os.path.realpath(__file__)) + '/model/english_dictionary.txt' 21 | 22 | for path in [INPUT_PATH, MODEL_PATH, OUTPUT_PATH]: 23 | if not os.path.exists(path): 24 | os.makedirs(path) 25 | 26 | 27 | class EssayTopicPredictModel(Object): 28 | """ 29 | @:param Turing's Cat 30 | @:keyword Essay Topic Predict 31 | @:date: 2022/05/26 32 | """ 33 | 34 | def __init__(self, ): 35 | super().__init__() 36 | self.finalkey = None 37 | self.VISUAL_MODEL = False 38 | self.SAVE_MODEL = False 39 | self.SAVE_JSON = True 40 | self.THRESHOLD = 8 41 | self.config = { 42 | "load_data_batch": 1e6, 43 | "content_types": ['weibo', 'wenzhang', 'yangshi', 'daily'], 44 | "day_period": 3, 45 | "spec_delete_list": [' ', 'http', 'www', 'rt', 'media', 'class', 'jpg', 'com', 'twimg', 'image', 'png'], 46 | } 47 | self.items_dict["weibo"] = [] 48 | self.items_dict["wenzhang"] = [] 49 | self.items_dict["yangshi"] = [] 50 | self.items_dict["daily"] = [] 51 | self.n_gram_range = (2, 2) 52 | self.min_topic_size = 10 53 | self.diversity = 0.1 54 | self.num_scores = 50000 55 | 56 | def word2VecGaussianModel(self): 57 | pass # todo 58 | 59 | def BERTopicModel(self): 60 | """ 61 | BERT tokenize and Clustering with DBSCAN 62 | :return: model saver 63 | """ 64 | model_version = OUTPUT_PATH + "TopicModel" + "_range" + str(self.n_gram_range[0]) + "_min_size" \ 65 | + str(self.min_topic_size) + "_diversity" + str(self.diversity) 66 | 67 | roberta = TransformerDocumentEmbeddings('hfl/chinese-roberta-wwm-ext') 68 | if roberta: 69 | model = BERTopic(embedding_model=roberta, verbose=True, low_memory=True, n_gram_range=self.n_gram_range, 70 | min_topic_size=self.min_topic_size, diversity=self.diversity) 71 | else: 72 | model = BERTopic(embedding_model="all-MiniLM-L6-v2", language="english", calculate_probabilities=True, 73 | n_gram_range=self.n_gram_range, nr_topics='auto', min_topic_size=self.min_topic_size, 74 | diversity=self.diversity, verbose=True) # embedding can be any language 75 | 76 | if len(self.dataset) < 100: 77 | raise Exception(f"Too less feeds are fetched ({len(self.dataset)}<100), please set a longer day period.") 78 | 79 | f"model has been load through hugging face, then start training in{model_version}..." 80 | topics, probabilities = model.fit_transform(self.dataset) 81 | f"{topics=}" \ 82 | f"{probabilities=}" 83 | 84 | topic_count = deepcopy(list(model.topic_sizes.values())[:]) 85 | topic_names = deepcopy(list(model.topic_names.values())[:]) 86 | result = pd.DataFrame(zip(topic_names, topic_count)) 87 | result.to_csv("topic_result.csv", encoding='utf_8_sig', mode='w', index=False, sep=',', header=False) 88 | 89 | del topic_count[-1] 90 | # print(f"{first_Topic=}") 91 | 92 | if self.VISUAL_MODEL: 93 | fig_name = datetime.datetime.now().strftime('%Y%m%d') 94 | # there is a bug in the following fuction located in "python3.8/site-packages/bertopic/plotting/_topics.py" line 49. 95 | # need to change to "topics = sorted(topic_model.get_topic_freq().Topic.to_list()[0:top_n_topics])" 96 | fig1 = model.visualize_topics(top_n_topics=None, width=700, height=700) 97 | fig1.write_html(OUTPUT_PATH + f"{fig_name}_topic.html") 98 | fig2 = model.visualize_barchart(top_n_topics=None, width=400, height=400) 99 | fig2.write_html(OUTPUT_PATH + f"{fig_name}_word_score.html") 100 | fig3 = model.visualize_term_rank() # .visualize_distribution(probabilities[200], min_probability=0.015) 101 | fig3.write_html(OUTPUT_PATH + f"{fig_name}_3.html") 102 | 103 | if self.SAVE_MODEL: 104 | model.save(model_version) 105 | 106 | return topic_names 107 | 108 | @staticmethod 109 | def saveFile(path, filename, data): 110 | if not os.path.exists(path): 111 | os.makedirs(path) 112 | dataframe = pd.DataFrame(data) 113 | dataframe.to_csv(path + filename + ".csv", encoding='utf_8_sig', mode='w', index=False, sep=',', header=False) 114 | 115 | """创建停用词列表""" 116 | 117 | def stopwordslist(self): 118 | stopwords = [line.strip() for line in open('./stopwords.txt', encoding='UTF-8').readlines()] 119 | return stopwords 120 | 121 | def wordTokenPreprocessor(self): 122 | """ 123 | # 1.stripped emoji, URLs/HTML tags, and common English ”stopwords” 124 | # 2.lowercase,tokenized duplication-reduce and stemming/Lemmatization 125 | # 3.filter infrequent words less than 5 time in the entire corpus and short documents 126 | """ 127 | global local_cache 128 | combined_data = pd.DataFrame() 129 | step = 0 130 | local_cache = False 131 | try: 132 | for root, dirs, files in os.walk(TMP_PATH): 133 | for file in files: 134 | filename = os.path.join(root, file) 135 | if os.path.isfile(filename): 136 | combined_data = pd.read_csv(filename, encoding='utf_8_sig', sep=',') 137 | combined_data = combined_data.sample(n=20000, replace=False, weights=None, axis=0) 138 | local_cache = True 139 | break 140 | 141 | # self._fetch_nft_scores(self.num_scores) 142 | if local_cache is not True: 143 | for content_type in self.config["content_types"]: 144 | combined_data = combined_data.append(self.items_dict.get(content_type), ignore_index=True) 145 | 146 | combined_data.drop_duplicates(keep='last') 147 | combined_data.dropna() 148 | 149 | self.finalkey = [] 150 | bar = tqdm(combined_data.index, 151 | desc=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Word tokenisation", 152 | total=len(combined_data), 153 | ncols=150) 154 | for idx in bar: 155 | list_value = combined_data.loc[idx].values.tolist() 156 | if len(str(list_value[0])) < self.THRESHOLD: continue 157 | 158 | sentence = list_value[0] 159 | if sentence is None: 160 | continue 161 | # 0. Check language (only consider English in the first version) 162 | lan_identify, _ = langid.classify(sentence) # identify language the sentence is. 163 | if lan_identify != self.lan_candidates[0]: # en 164 | continue 165 | 166 | # 1. Remove other special characters such as emojis, picture links, website external links and account addresses 167 | if 'http' in sentence: 168 | self._remove_after(sentence, 'http') 169 | regwords = re.sub(r'<.*?>|\\[.*?\\]|\b0\S*?\w\b|http|com', "", sentence) 170 | stopwords = '|'.join(self.stopwordslist()) 171 | sentence = re.sub(stopwords, "", regwords) 172 | 173 | if (len(sentence)) <= 2: 174 | continue 175 | self.dataset.append(sentence) 176 | except Exception as e: 177 | logging.Logger.info("catch error: ", e) 178 | finally: 179 | if local_cache is not True: 180 | self.saveFile(TMP_PATH, "processed_data", self.dataset) 181 | print("final dataset has been saved, with %d" % len(self.dataset)) 182 | self.dataset = [str(x) for x in self.dataset] 183 | 184 | def jsonSummaryCheck(self, dict_path): 185 | if self.json_dict: 186 | try: 187 | self._json_summary_check(dict_path) 188 | except Exception as e: 189 | raise f'summary check failed {e}.' 190 | else: 191 | print('blank json dict') 192 | 193 | def datePreprocess(self, data_path): 194 | for root, dirs, files in os.walk(data_path): 195 | bar = tqdm(files, 196 | total=len(files), 197 | desc=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Process daily, yangshi, weibo", 198 | ncols=150) 199 | for file in bar: 200 | try: 201 | filename = os.path.join(root, file) 202 | """处理数据""" 203 | if file.startswith("weibo"): 204 | dataset = pd.read_csv(filename, encoding="utf-8", header=None) 205 | dataset = list(set([x[0] for x in np.array(dataset).tolist()])) 206 | self.items_dict["weibo"].extend(dataset) 207 | 208 | elif file.startswith("202"): 209 | f = open(filename, "r", encoding="utf-8") 210 | dataset = f.readlines() 211 | if "责编" not in dataset: 212 | self.items_dict["wenzhang"].extend(dataset) 213 | 214 | elif file.startswith("yangshi"): 215 | dataset = pd.read_csv(filename, encoding="utf-8") 216 | dataset["concat"] = dataset["title"] + "。" + dataset["brief"] 217 | dataset = list(set(np.array(dataset["concat"]).tolist())) 218 | self.items_dict["yangshi"].extend(dataset) 219 | 220 | elif file.startswith("daily"): 221 | dataset = pd.read_csv(filename, encoding="utf-8") 222 | dataset["concat"] = dataset.iloc[:, 1] + "。" + dataset.iloc[:, 6] 223 | dataset = list(set(np.array(dataset["concat"]).tolist())) 224 | self.items_dict["daily"] = dataset 225 | except Exception as e: 226 | print("logging error as %s" % e) 227 | 228 | self.PAST_TIME = self._get_past_time() 229 | 230 | def call_gpt_helper(self, api_key: str) \ 231 | -> List[str]: 232 | """ 233 | :parameter: api_key: str 234 | :return: 235 | """ 236 | 237 | 238 | 239 | def test(): 240 | # from sklearn.datasets import fetch_20newsgroups 241 | # docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] 242 | newDataHandler = EssayTopicPredictModel() 243 | newDataHandler.datePreprocess(INPUT_PATH) 244 | newDataHandler.wordTokenPreprocessor() 245 | newDataHandler.BERTopicModel() 246 | print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] finished") 247 | 248 | 249 | if __name__ == '__main__': 250 | parser = argparse.ArgumentParser() 251 | parser.add_argument('--online', default=False, type=str, required=False, help='open trigger') 252 | parser.add_argument('--visualize', default=False, type=str, required=False, help='visual trigger') 253 | 254 | args = parser.parse_args() 255 | print('args:\n' + args.__repr__()) 256 | 257 | if args.online: 258 | pass 259 | else: 260 | test() 261 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Turing-Project/EssayTopicPredictV2/030e852d362b333c4566dde1be9a682c48eb0e7e/model/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | requests~=2.25.1 3 | psycopg2~=2.9.3 4 | pytz~=2021.1 5 | pandas~=1.4.0 6 | tqdm~=4.59.0 7 | sshtunnel~=0.4.0 8 | nltk~=3.6.1 9 | langid~=1.1.6 10 | numpy~=1.20.1 11 | bertopic~=0.9.4 12 | scikit-learn~=0.24.1 -------------------------------------------------------------------------------- /stopwords.txt: -------------------------------------------------------------------------------- 1 | 习近平指出 2 | 习近平强调 3 | 文章强调 4 | 文章指出 5 | 李克强指出 6 | 李克强强调 7 | 栗战书指出 8 | 栗战书强调 9 | 汪洋指出 10 | 汪洋强调 11 | 王沪宁指出 12 | 王沪宁强调 13 | 赵乐际指出 14 | 赵乐际强调 15 | 韩正指出 16 | 韩正强调 17 | 王岐山指出 18 | 王岐山强调 19 | 刘鹤指出 20 | 刘鹤强调 21 | 王毅指出 22 | 王毅强调 23 | 全文 24 | nan 25 | NAN 26 |   27 | 央视频的微博视频 28 | 人民日报的微博视频 29 | 央视频的微博直播 30 | 人民日报的微博直播 31 | 主播 32 | 当地时间 33 | 日前 34 | 目前 35 | 微信公众号消息 36 | 近日 37 | 正在直播 38 | 共同关注 39 | 微博视频 40 | 网页链接 41 | 早安 42 | 晚安 43 | 上午 44 | 下午 45 | 今晚 46 | 昨晚 47 | 昨天 48 | 今天 49 | 厉害了 50 | 51 | -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 版权原因目前不支持自定义训练,避免被非法商用 3 | """ -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Turing-Project/EssayTopicPredictV2/030e852d362b333c4566dde1be9a682c48eb0e7e/utils/__init__.py -------------------------------------------------------------------------------- /utils/task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash # employ bash shell 2 | 3 | daily_excute="TRUE" # define execute period 4 | hourly_excute="FALSE" 5 | 6 | if ["$daily_excute" = "TRUE"]; then 7 | for file in 'ls /usr/Revery-Recommendation/utils' 8 | do 9 | #skip the shell script if it's empty 10 | if [-f $file] ; then 11 | if['ls -l $file|awk "{print $5}"' -gt 0] ; then 12 | python main.py --online true --visualize false & >> trending.log 13 | fi 14 | fi 15 | done 16 | fi 17 | 18 | echo "excution started in daily $daily_excute | or hourly $hourly_excute" # echo is used to printf in terminal 19 | 20 | --------------------------------------------------------------------------------