├── MANIFEST.in ├── build └── lib │ └── cocoNLP │ ├── __init__.py │ ├── __version__.py │ ├── data │ └── stopwords.txt │ └── rake.py ├── cocoNLP.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── requires.txt └── top_level.txt ├── cocoNLP ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── extractor.cpython-35.pyc │ ├── extractor.cpython-36.pyc │ └── extractor.cpython-37.pyc ├── __version__.py ├── config │ ├── .DS_Store │ ├── basic │ │ └── time_nlp │ │ │ ├── .DS_Store │ │ │ ├── EGG-INFO │ │ │ ├── PKG-INFO │ │ │ ├── SOURCES.txt │ │ │ ├── dependency_links.txt │ │ │ ├── not-zip-safe │ │ │ ├── requires.txt │ │ │ └── top_level.txt │ │ │ ├── LunarSolarConverter.py │ │ │ ├── README.md │ │ │ ├── RangeTimeEnum.py │ │ │ ├── StringPreHandler.py │ │ │ ├── Test.py │ │ │ ├── TimeConverter.egg-info │ │ │ ├── PKG-INFO │ │ │ ├── SOURCES.txt │ │ │ ├── dependency_links.txt │ │ │ ├── not-zip-safe │ │ │ ├── requires.txt │ │ │ └── top_level.txt │ │ │ ├── TimeNormalizer.py │ │ │ ├── TimePoint.py │ │ │ ├── TimeUnit.py │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── LunarSolarConverter.cpython-35.pyc │ │ │ ├── LunarSolarConverter.cpython-36.pyc │ │ │ ├── RangeTimeEnum.cpython-35.pyc │ │ │ ├── RangeTimeEnum.cpython-36.pyc │ │ │ ├── StringPreHandler.cpython-35.pyc │ │ │ ├── StringPreHandler.cpython-36.pyc │ │ │ ├── Test.cpython-36.pyc │ │ │ ├── TimeNormalizer.cpython-35.pyc │ │ │ ├── TimeNormalizer.cpython-36.pyc │ │ │ ├── TimePoint.cpython-35.pyc │ │ │ ├── TimePoint.cpython-36.pyc │ │ │ ├── TimeUnit.cpython-35.pyc │ │ │ ├── TimeUnit.cpython-36.pyc │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── __init__.cpython-36.pyc │ │ │ └── resource │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ └── __init__.cpython-36.pyc │ │ │ ├── holi_lunar.json │ │ │ ├── holi_solar.json │ │ │ ├── reg.pkl │ │ │ └── regex.txt │ └── phrase │ │ ├── __pycache__ │ │ └── rake.cpython-36.pyc │ │ ├── data │ │ └── 
stopwords.txt │ │ └── rake.py ├── extractor.py └── extractor.pyc ├── dist ├── .DS_Store ├── cocoNLP-0.0.10.tar.gz ├── cocoNLP-0.0.11.tar.gz ├── cocoNLP-0.0.12.tar.gz ├── cocoNLP-0.0.13.tar.gz └── cocoNLP-0.0.9 │ ├── MANIFEST.in │ ├── PKG-INFO │ ├── cocoNLP.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt │ ├── cocoNLP │ ├── __init__.py │ ├── __version__.py │ ├── config │ │ ├── basic │ │ │ └── time_nlp │ │ │ │ ├── .DS_Store │ │ │ │ ├── EGG-INFO │ │ │ │ ├── PKG-INFO │ │ │ │ ├── SOURCES.txt │ │ │ │ ├── dependency_links.txt │ │ │ │ ├── not-zip-safe │ │ │ │ ├── requires.txt │ │ │ │ └── top_level.txt │ │ │ │ ├── LunarSolarConverter.py │ │ │ │ ├── README.md │ │ │ │ ├── RangeTimeEnum.py │ │ │ │ ├── StringPreHandler.py │ │ │ │ ├── Test.py │ │ │ │ ├── TimeConverter.egg-info │ │ │ │ ├── PKG-INFO │ │ │ │ ├── SOURCES.txt │ │ │ │ ├── dependency_links.txt │ │ │ │ ├── not-zip-safe │ │ │ │ ├── requires.txt │ │ │ │ └── top_level.txt │ │ │ │ ├── TimeNormalizer.py │ │ │ │ ├── TimePoint.py │ │ │ │ ├── TimeUnit.py │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── LunarSolarConverter.cpython-36.pyc │ │ │ │ ├── RangeTimeEnum.cpython-36.pyc │ │ │ │ ├── StringPreHandler.cpython-36.pyc │ │ │ │ ├── Test.cpython-36.pyc │ │ │ │ ├── TimeNormalizer.cpython-36.pyc │ │ │ │ ├── TimePoint.cpython-36.pyc │ │ │ │ ├── TimeUnit.cpython-36.pyc │ │ │ │ └── __init__.cpython-36.pyc │ │ │ │ └── resource │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ └── __init__.cpython-36.pyc │ │ │ │ ├── holi_lunar.json │ │ │ │ ├── holi_solar.json │ │ │ │ ├── reg.pkl │ │ │ │ └── regex.txt │ │ └── phrase │ │ │ ├── __pycache__ │ │ │ └── rake.cpython-36.pyc │ │ │ ├── data │ │ │ └── stopwords.txt │ │ │ └── rake.py │ └── extractor.py │ ├── readme.md │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ └── test.py ├── readme.md ├── requirements.txt ├── setup.py └── test.py /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include cocoNLP/config * 2 | recursive-include cocoNLP/config * 3 | -------------------------------------------------------------------------------- /build/lib/cocoNLP/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | # ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | # -*- coding: utf-8 -*- 10 | 11 | """ 12 | cocoNLP module 13 | :copyright: (c) 2018 by Yang Yang. 14 | :license: MIT, see LICENSE for more details. 15 | """ 16 | -------------------------------------------------------------------------------- /build/lib/cocoNLP/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | # ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | 10 | __title__ = "cocoNLP" 11 | __description__ = "Python implementation of many nlp algorithms" 12 | __url__ = "https://github.com/fighting41love" 13 | __version__ = "0.0.7" 14 | __author__ = "Yang Yang" 15 | __author_email__ = "yangyangfuture@gmail.com" 16 | __license__ = "MIT" 17 | __copyright__ = "Copyright 2018 Yang Yang" 18 | -------------------------------------------------------------------------------- /build/lib/cocoNLP/rake.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Implementation of Rapid Automatic Keyword Extraction algorithm. 3 | 4 | As described in the paper `Automatic keyword extraction from individual 5 | documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley. 
6 | """ 7 | 8 | import string 9 | from collections import Counter, defaultdict 10 | from itertools import chain, groupby, product 11 | import jieba 12 | import re 13 | from enum import Enum 14 | 15 | 16 | class Metric(Enum): 17 | """Different metrics that can be used for ranking.""" 18 | 19 | DEGREE_TO_FREQUENCY_RATIO = 0 # Uses d(w)/f(w) as the metric 20 | WORD_DEGREE = 1 # Uses d(w) alone as the metric 21 | WORD_FREQUENCY = 2 # Uses f(w) alone as the metric 22 | 23 | 24 | class Rake(object): 25 | """Rapid Automatic Keyword Extraction Algorithm.""" 26 | 27 | def __init__( 28 | self, 29 | punctuations=None, 30 | ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, 31 | max_length=100000, 32 | min_length=1, 33 | ): 34 | """Constructor. 35 | 36 | :param stopwords: List of Words to be ignored for keyword extraction. 37 | :param punctuations: Punctuations to be ignored for keyword extraction. 38 | :param language: Language to be used for stopwords 39 | :param max_length: Maximum limit on the number of words in a phrase 40 | (Inclusive. Defaults to 100000) 41 | :param min_length: Minimum limit on the number of words in a phrase 42 | (Inclusive. Defaults to 1) 43 | """ 44 | # By default use degree to frequency ratio as the metric. 45 | if isinstance(ranking_metric, Metric): 46 | self.metric = ranking_metric 47 | else: 48 | self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO 49 | 50 | # If stopwords not provided we use language stopwords by default. 51 | self.stopwords = self.load_stopwords() 52 | 53 | # If punctuations are not provided we ignore all punctuation symbols. 54 | self.punctuations = punctuations 55 | if self.punctuations is None: 56 | self.punctuations = string.punctuation + ',,。!?!?' # add chinese punctuation 57 | 58 | # All things which act as sentence breaks during keyword extraction. 
59 | self.to_ignore = set(chain(self.stopwords, self.punctuations)) 60 | 61 | # Assign min or max length to the attributes 62 | self.min_length = min_length 63 | self.max_length = max_length 64 | 65 | # Stuff to be extracted from the provided text. 66 | self.frequency_dist = None 67 | self.degree = None 68 | self.rank_list = None 69 | self.ranked_phrases = None 70 | 71 | def load_stopwords(self, path = 'data/stopwords.txt'): 72 | """load stopwords list 73 | eg: stopwords_list = load_stopwords(path) 74 | 75 | :param path: 停用词表path,提前整理好的,直接读进来 76 | :return: list 77 | """ 78 | with open(path) as f: 79 | stopwords = f.readlines() 80 | stopwords_list = [] 81 | for word in stopwords: 82 | stopwords_list.append(word.replace('\n', '').replace(' ', '')) 83 | 84 | return stopwords_list 85 | 86 | def tokenize_chinese(self,text): 87 | 88 | sentences = re.split('(。|!|\!|\.|?|\?)', text) # 保留分割符 89 | 90 | new_sents = [] 91 | for i in range(int(len(sentences) / 2)): 92 | sent = sentences[2 * i] + sentences[2 * i + 1] 93 | new_sents.append(sent) 94 | return new_sents 95 | 96 | def extract_keywords_from_text(self, text, min_len, max_len): 97 | """Method to extract keywords from the text provided. 98 | 99 | :param text: Text to extract keywords from, provided as a string. 100 | """ 101 | sentences = self.tokenize_chinese(text) 102 | self.extract_keywords_from_sentences(sentences, min_len, max_len) 103 | 104 | def extract_keywords_from_sentences(self, sentences, min_len, max_len): 105 | """Method to extract keywords from the list of sentences provided. 106 | 107 | :param sentences: Text to extraxt keywords from, provided as a list 108 | of strings, where each string is a sentence. 
109 | """ 110 | phrase_list = self._generate_phrases(sentences, min_len, max_len) 111 | self._build_frequency_dist(phrase_list) 112 | self._build_word_co_occurance_graph(phrase_list) 113 | self._build_ranklist(phrase_list) 114 | 115 | def get_ranked_phrases(self): 116 | """Method to fetch ranked keyword strings. 117 | 118 | :return: List of strings where each string represents an extracted 119 | keyword string. 120 | """ 121 | return self.ranked_phrases 122 | 123 | def get_ranked_phrases_with_scores(self): 124 | """Method to fetch ranked keyword strings along with their scores. 125 | 126 | :return: List of tuples where each tuple is formed of an extracted 127 | keyword string and its score. Ex: (5.68, 'Four Scoures') 128 | """ 129 | return self.rank_list 130 | 131 | def get_word_frequency_distribution(self): 132 | """Method to fetch the word frequency distribution in the given text. 133 | 134 | :return: Dictionary (defaultdict) of the format `word -> frequency`. 135 | """ 136 | return self.frequency_dist 137 | 138 | def get_word_degrees(self): 139 | """Method to fetch the degree of words in the given text. Degree can be 140 | defined as sum of co-occurances of the word with other words in the 141 | given text. 142 | 143 | :return: Dictionary (defaultdict) of the format `word -> degree`. 144 | """ 145 | return self.degree 146 | 147 | def _build_frequency_dist(self, phrase_list): 148 | """Builds frequency distribution of the words in the given body of text. 149 | 150 | :param phrase_list: List of List of strings where each sublist is a 151 | collection of words which form a contender phrase. 152 | """ 153 | self.frequency_dist = Counter(chain.from_iterable(phrase_list)) 154 | 155 | def _build_word_co_occurance_graph(self, phrase_list): 156 | """Builds the co-occurance graph of words in the given body of text to 157 | compute degree of each word. 
158 | 159 | :param phrase_list: List of List of strings where each sublist is a 160 | collection of words which form a contender phrase. 161 | """ 162 | co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0)) 163 | for phrase in phrase_list: 164 | # For each phrase in the phrase list, count co-occurances of the 165 | # word with other words in the phrase. 166 | # 167 | # Note: Keep the co-occurances graph as is, to help facilitate its 168 | # use in other creative ways if required later. 169 | for (word, coword) in product(phrase, phrase): 170 | co_occurance_graph[word][coword] += 1 171 | self.degree = defaultdict(lambda: 0) 172 | for key in co_occurance_graph: 173 | self.degree[key] = sum(co_occurance_graph[key].values()) 174 | 175 | def _build_ranklist(self, phrase_list): 176 | """Method to rank each contender phrase using the formula 177 | 178 | phrase_score = sum of scores of words in the phrase. 179 | word_score = d(w)/f(w) where d is degree and f is frequency. 180 | 181 | :param phrase_list: List of List of strings where each sublist is a 182 | collection of words which form a contender phrase. 183 | """ 184 | self.rank_list = [] 185 | for phrase in phrase_list: 186 | rank = 0.0 187 | for word in phrase: 188 | if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO: 189 | rank += 1.0 * self.degree[word] / self.frequency_dist[word] 190 | elif self.metric == Metric.WORD_DEGREE: 191 | rank += 1.0 * self.degree[word] 192 | else: 193 | rank += 1.0 * self.frequency_dist[word] 194 | self.rank_list.append((rank, " ".join(phrase))) 195 | self.rank_list.sort(reverse=True) 196 | self.ranked_phrases = [ph[1] for ph in self.rank_list] 197 | 198 | def _generate_phrases(self, sentences, min_len, max_len): 199 | """Method to generate contender phrases given the sentences of the text 200 | document. 201 | 202 | :param sentences: List of strings where each string represents a 203 | sentence which forms the text. 
204 | :return: Set of string tuples where each tuple is a collection 205 | of words forming a contender phrase. 206 | """ 207 | phrase_list = set() 208 | # Create contender phrases from sentences. 209 | for sentence in sentences: 210 | word_list = [word for word in list(jieba.cut(sentence))] 211 | phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len)) 212 | return phrase_list 213 | 214 | def _get_phrase_list_from_words(self, word_list, min_len, max_len): 215 | """Method to create contender phrases from the list of words that form 216 | a sentence by dropping stopwords and punctuations and grouping the left 217 | words into phrases. Only phrases in the given length range (both limits 218 | inclusive) would be considered to build co-occurrence matrix. Ex: 219 | 220 | Sentence: Red apples, are good in flavour. 221 | List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour'] 222 | List after dropping punctuations and stopwords. 223 | List of words: ['red', 'apples', *, *, good, *, 'flavour'] 224 | List of phrases: [('red', 'apples'), ('good',), ('flavour',)] 225 | 226 | List of phrases with a correct length: 227 | For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)] 228 | For the range [1, 1]: [('good',), ('flavour',)] 229 | For the range [2, 2]: [('red', 'apples')] 230 | 231 | :param word_list: List of words which form a sentence when joined in 232 | the same order. 233 | :return: List of contender phrases that are formed after dropping 234 | stopwords and punctuations. 
235 | """ 236 | groups = groupby(word_list, lambda x: x not in self.to_ignore) 237 | phrases = [] 238 | for group in groups: 239 | tmp = tuple(group[1]) 240 | len_g1 = len(list(tmp)) 241 | if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase 242 | phrases.append(tuple(tmp)) 243 | 244 | return list( 245 | filter( 246 | lambda x: self.min_length <= len(x) <= self.max_length, phrases 247 | ) 248 | ) 249 | -------------------------------------------------------------------------------- /cocoNLP.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: cocoNLP 3 | Version: 0.0.13 4 | Summary: Python implementation of many nlp algorithms 5 | Home-page: https://github.com/fighting41love 6 | Author: Yang Yang 7 | Author-email: yangyangfuture@gmail.com 8 | License: MIT 9 | Description: UNKNOWN 10 | Keywords: nlp text-mining information extraction 11 | Platform: UNKNOWN 12 | Classifier: Intended Audience :: Developers 13 | Classifier: Intended Audience :: Education 14 | Classifier: License :: OSI Approved :: MIT License 15 | Classifier: Development Status :: 3 - Alpha 16 | Classifier: Operating System :: POSIX 17 | Classifier: Programming Language :: Python :: 2.7 18 | Classifier: Programming Language :: Python :: 3.4 19 | Classifier: Programming Language :: Python :: 3.5 20 | Classifier: Programming Language :: Python :: 3.6 21 | Classifier: Topic :: Software Development :: Build Tools 22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 23 | -------------------------------------------------------------------------------- /cocoNLP.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | MANIFEST.in 3 | readme.md 4 | requirements.txt 5 | setup.py 6 | test.py 7 | cocoNLP/__init__.py 8 | cocoNLP/__version__.py 9 | cocoNLP/extractor.py 10 | cocoNLP.egg-info/PKG-INFO 11 | 
cocoNLP.egg-info/SOURCES.txt 12 | cocoNLP.egg-info/dependency_links.txt 13 | cocoNLP.egg-info/requires.txt 14 | cocoNLP.egg-info/top_level.txt 15 | cocoNLP/config/.DS_Store 16 | cocoNLP/config/basic/time_nlp/.DS_Store 17 | cocoNLP/config/basic/time_nlp/LunarSolarConverter.py 18 | cocoNLP/config/basic/time_nlp/README.md 19 | cocoNLP/config/basic/time_nlp/RangeTimeEnum.py 20 | cocoNLP/config/basic/time_nlp/StringPreHandler.py 21 | cocoNLP/config/basic/time_nlp/Test.py 22 | cocoNLP/config/basic/time_nlp/TimeNormalizer.py 23 | cocoNLP/config/basic/time_nlp/TimePoint.py 24 | cocoNLP/config/basic/time_nlp/TimeUnit.py 25 | cocoNLP/config/basic/time_nlp/__init__.py 26 | cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO 27 | cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt 28 | cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt 29 | cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe 30 | cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt 31 | cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt 32 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO 33 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt 34 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt 35 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe 36 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt 37 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt 38 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc 39 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc 40 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc 41 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc 42 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc 43 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc 44 | cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc 45 | 
cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc 46 | cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc 47 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc 48 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc 49 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc 50 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc 51 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc 52 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc 53 | cocoNLP/config/basic/time_nlp/resource/__init__.py 54 | cocoNLP/config/basic/time_nlp/resource/holi_lunar.json 55 | cocoNLP/config/basic/time_nlp/resource/holi_solar.json 56 | cocoNLP/config/basic/time_nlp/resource/reg.pkl 57 | cocoNLP/config/basic/time_nlp/resource/regex.txt 58 | cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc 59 | cocoNLP/config/phrase/rake.py 60 | cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc 61 | cocoNLP/config/phrase/data/stopwords.txt -------------------------------------------------------------------------------- /cocoNLP.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /cocoNLP.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | pyhanlp 3 | phone 4 | phonenumbers 5 | regex 6 | arrow 7 | -------------------------------------------------------------------------------- /cocoNLP.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | cocoNLP 2 | -------------------------------------------------------------------------------- /cocoNLP/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | 
# ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | # -*- coding: utf-8 -*- 10 | 11 | """ 12 | cocoNLP module 13 | :copyright: (c) 2019 by Yang Yang. 14 | :license: MIT, see LICENSE for more details. 15 | """ 16 | -------------------------------------------------------------------------------- /cocoNLP/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__init__.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/extractor.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/extractor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/__pycache__/extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-37.pyc -------------------------------------------------------------------------------- /cocoNLP/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | # ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | 10 | __title__ = "cocoNLP" 11 | __description__ = "Python implementation of many nlp algorithms" 12 | __url__ = "https://github.com/fighting41love" 13 | __version__ = "0.0.13" 14 | __author__ = "Yang Yang" 15 | __author_email__ = "yangyangfuture@gmail.com" 16 | __license__ = "MIT" 17 | __copyright__ = "Copyright 2019 Yang Yang" 18 | -------------------------------------------------------------------------------- /cocoNLP/config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/.DS_Store -------------------------------------------------------------------------------- 
/cocoNLP/config/basic/time_nlp/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/.DS_Store -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: TimeConverter 3 | Version: 1.1.0 4 | Summary: ... 5 | Home-page: http://test.com 6 | Author: test 7 | Author-email: test@gmail.com 8 | License: MIT Licence 9 | Description: ... 10 | Keywords: time,nlp 11 | Platform: any 12 | Classifier: Programming Language :: Python :: 2.6 13 | Classifier: Programming Language :: Python :: 2.7 14 | Classifier: Programming Language :: Python :: 3.6 15 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LunarSolarConverter.py 2 | README.md 3 | RangeTimeEnum.py 4 | StringPreHandler.py 5 | Test.py 6 | TimeNormalizer.py 7 | TimePoint.py 8 | TimeUnit.py 9 | __init__.py 10 | setup.py 11 | TimeConverter.egg-info/PKG-INFO 12 | TimeConverter.egg-info/SOURCES.txt 13 | TimeConverter.egg-info/dependency_links.txt 14 | TimeConverter.egg-info/not-zip-safe 15 | TimeConverter.egg-info/requires.txt 16 | TimeConverter.egg-info/top_level.txt 17 | resource/__init__.py -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe: 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/12/11 11:08
# @Author : zhm
# @File : LunarSolarConverter.py
# @Software: PyCharm
from pprint import pprint


class Lunar:
    """Value object for a lunar (Chinese calendar) date.

    ``isleap`` marks whether ``lunarMonth`` is the leap (intercalary) month.
    """

    def __init__(self, lunarYear, lunarMonth, lunarDay, isleap):
        self.isleap = isleap
        self.lunarDay = lunarDay
        self.lunarMonth = lunarMonth
        self.lunarYear = lunarYear


class Solar:
    """Value object for a solar (Gregorian) date."""

    def __init__(self, solarYear, solarMonth, solarDay):
        self.solarDay = solarDay
        self.solarMonth = solarMonth
        self.solarYear = solarYear


def GetBitInt(data, length, shift):
    """Return the ``length``-bit unsigned field of ``data`` starting at bit ``shift``."""
    return (data & (((1 << length) - 1) << shift)) >> shift


def SolarToInt(y, m, d):
    """Map a solar date to a linear day count.

    Uses a March-based year (months are rotated by +9 mod 12) so that the
    leap day falls at the end of the counting year, which keeps the
    arithmetic branch-free.
    """
    m = (m + 9) % 12
    y -= int(m / 10)
    return 365 * y + int(y / 4) - int(y / 100) + int(y / 400) + int((m * 306 + 5) / 10) + (d - 1)


def SolarFromInt(g):
    """Inverse of :func:`SolarToInt`: map a linear day count back to a ``Solar``."""
    y = int((10000 * g + 14780) / 3652425)
    ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))
    if ddd < 0:
        # The estimate overshot the year start; step back one year and redo.
        y -= 1
        ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))

    mi = int((100 * ddd + 52) / 3060)
    mm = (mi + 2) % 12 + 1
    y += int((mi + 2) / 12)
    dd = ddd - int((mi * 306 + 5) / 10) + 1
    solar = Solar(y, mm, dd)
    return solar


class LunarSolarConverter:
    #####################################################################################
    # Lunar calendar data table for the years 1888-2111 (first element is the
    # base year, 1887).  Storage format of each element:
    #     bits 16~13      bit 12          bits 11~0
    #     leap month      leap-month      day count of lunar months 1~12
    #     number          day count       (big month vs. small month)
    # Notes: 1. bit0 holds the day count of lunar month 1: 1 means 30 days,
    #           0 means 29 days; bit1 is lunar month 2, and so on.
    #        2. bit12 is the leap month's day count (1 = 30 days, 0 = 29);
    #           bits 16~13 say which month is the leap month (0 = no leap
    #           month that year).
    # Data source: http://data.weather.gov.hk/gts/time/conversion1_text_c.htm
    #####################################################################################
    lunar_month_days = [1887, 0x1694, 0x16aa, 0x4ad5, 0xab6, 0xc4b7, 0x4ae, 0xa56, 0xb52a,
                        0x1d2a, 0xd54, 0x75aa, 0x156a, 0x1096d, 0x95c, 0x14ae, 0xaa4d, 0x1a4c, 0x1b2a, 0x8d55,
                        0xad4, 0x135a, 0x495d,
                        0x95c, 0xd49b, 0x149a, 0x1a4a, 0xbaa5, 0x16a8, 0x1ad4, 0x52da, 0x12b6, 0xe937, 0x92e,
                        0x1496, 0xb64b, 0xd4a,
                        0xda8, 0x95b5, 0x56c, 0x12ae, 0x492f, 0x92e, 0xcc96, 0x1a94, 0x1d4a, 0xada9, 0xb5a, 0x56c,
                        0x726e, 0x125c,
                        0xf92d, 0x192a, 0x1a94, 0xdb4a, 0x16aa, 0xad4, 0x955b, 0x4ba, 0x125a, 0x592b, 0x152a,
                        0xf695, 0xd94, 0x16aa,
                        0xaab5, 0x9b4, 0x14b6, 0x6a57, 0xa56, 0x1152a, 0x1d2a, 0xd54, 0xd5aa, 0x156a, 0x96c,
                        0x94ae, 0x14ae, 0xa4c,
                        0x7d26, 0x1b2a, 0xeb55, 0xad4, 0x12da, 0xa95d, 0x95a, 0x149a, 0x9a4d, 0x1a4a, 0x11aa5,
                        0x16a8, 0x16d4,
                        0xd2da, 0x12b6, 0x936, 0x9497, 0x1496, 0x1564b, 0xd4a, 0xda8, 0xd5b4, 0x156c, 0x12ae,
                        0xa92f, 0x92e, 0xc96,
                        0x6d4a, 0x1d4a, 0x10d65, 0xb58, 0x156c, 0xb26d, 0x125c, 0x192c, 0x9a95, 0x1a94, 0x1b4a,
                        0x4b55, 0xad4,
                        0xf55b, 0x4ba, 0x125a, 0xb92b, 0x152a, 0x1694, 0x96aa, 0x15aa, 0x12ab5, 0x974, 0x14b6,
                        0xca57, 0xa56, 0x1526,
                        0x8e95, 0xd54, 0x15aa, 0x49b5, 0x96c, 0xd4ae, 0x149c, 0x1a4c, 0xbd26, 0x1aa6, 0xb54,
                        0x6d6a, 0x12da, 0x1695d,
                        0x95a, 0x149a, 0xda4b, 0x1a4a, 0x1aa4, 0xbb54, 0x16b4, 0xada, 0x495b, 0x936, 0xf497,
                        0x1496, 0x154a, 0xb6a5,
                        0xda4, 0x15b4, 0x6ab6, 0x126e, 0x1092f, 0x92e, 0xc96, 0xcd4a, 0x1d4a, 0xd64, 0x956c,
                        0x155c, 0x125c, 0x792e,
                        0x192c, 0xfa95, 0x1a94, 0x1b4a, 0xab55, 0xad4, 0x14da, 0x8a5d, 0xa5a, 0x1152b, 0x152a,
                        0x1694, 0xd6aa,
                        0x15aa, 0xab4, 0x94ba, 0x14b6, 0xa56, 0x7527, 0xd26, 0xee53, 0xd54, 0x15aa, 0xa9b5, 0x96c,
                        0x14ae, 0x8a4e,
                        0x1a4c, 0x11d26, 0x1aa4, 0x1b54, 0xcd6a, 0xada, 0x95c, 0x949d, 0x149a, 0x1a2a, 0x5b25,
                        0x1aa4, 0xfb52,
                        0x16b4, 0xaba, 0xa95b, 0x936, 0x1496, 0x9a4b, 0x154a, 0x136a5, 0xda4, 0x15ac]
    # Extra data to speed up lunar -> solar conversion: the solar date of each
    # lunar year's first day (Spring Festival).  Original (legacy) layout note:
    #     bits 12~7: days since Jan 1;  bits 6~5: festival month;  bits 4~0: day.
    # NOTE(review): the methods below actually decode these entries as
    # (year << 9) | (month << 5) | day — see LunarToSolar/SolarToLunar; the
    # legacy bit-layout comment above does not match that usage.  Confirm
    # against the upstream Time-NLP source.
    #####################################################################################
    solar_1_1 = [1887, 0xec04c, 0xec23f, 0xec435, 0xec649, 0xec83e, 0xeca51, 0xecc46, 0xece3a,
                 0xed04d, 0xed242, 0xed436, 0xed64a, 0xed83f, 0xeda53, 0xedc48, 0xede3d, 0xee050, 0xee244, 0xee439,
                 0xee64d,
                 0xee842, 0xeea36, 0xeec4a, 0xeee3e, 0xef052, 0xef246, 0xef43a, 0xef64e, 0xef843, 0xefa37, 0xefc4b,
                 0xefe41,
                 0xf0054, 0xf0248, 0xf043c, 0xf0650, 0xf0845, 0xf0a38, 0xf0c4d, 0xf0e42, 0xf1037, 0xf124a, 0xf143e,
                 0xf1651,
                 0xf1846, 0xf1a3a, 0xf1c4e, 0xf1e44, 0xf2038, 0xf224b, 0xf243f, 0xf2653, 0xf2848, 0xf2a3b, 0xf2c4f,
                 0xf2e45,
                 0xf3039, 0xf324d, 0xf3442, 0xf3636, 0xf384a, 0xf3a3d, 0xf3c51, 0xf3e46, 0xf403b, 0xf424e, 0xf4443,
                 0xf4638,
                 0xf484c, 0xf4a3f, 0xf4c52, 0xf4e48, 0xf503c, 0xf524f, 0xf5445, 0xf5639, 0xf584d, 0xf5a42, 0xf5c35,
                 0xf5e49,
                 0xf603e, 0xf6251, 0xf6446, 0xf663b, 0xf684f, 0xf6a43, 0xf6c37, 0xf6e4b, 0xf703f, 0xf7252, 0xf7447,
                 0xf763c,
                 0xf7850, 0xf7a45, 0xf7c39, 0xf7e4d, 0xf8042, 0xf8254, 0xf8449, 0xf863d, 0xf8851, 0xf8a46, 0xf8c3b,
                 0xf8e4f,
                 0xf9044, 0xf9237, 0xf944a, 0xf963f, 0xf9853, 0xf9a47, 0xf9c3c, 0xf9e50, 0xfa045, 0xfa238, 0xfa44c,
                 0xfa641,
                 0xfa836, 0xfaa49, 0xfac3d, 0xfae52, 0xfb047, 0xfb23a, 0xfb44e, 0xfb643, 0xfb837, 0xfba4a, 0xfbc3f,
                 0xfbe53,
                 0xfc048, 0xfc23c, 0xfc450, 0xfc645, 0xfc839, 0xfca4c, 0xfcc41, 0xfce36, 0xfd04a, 0xfd23d, 0xfd451,
                 0xfd646,
                 0xfd83a, 0xfda4d, 0xfdc43, 0xfde37, 0xfe04b, 0xfe23f, 0xfe453, 0xfe648, 0xfe83c, 0xfea4f, 0xfec44,
                 0xfee38,
                 0xff04c, 0xff241, 0xff436, 0xff64a, 0xff83e, 0xffa51, 0xffc46, 0xffe3a, 0x10004e, 0x100242,
                 0x100437,
                 0x10064b, 0x100841, 0x100a53, 0x100c48, 0x100e3c, 0x10104f, 0x101244, 0x101438, 0x10164c,
                 0x101842, 0x101a35,
                 0x101c49, 0x101e3d, 0x102051, 0x102245, 0x10243a, 0x10264e, 0x102843, 0x102a37, 0x102c4b,
                 0x102e3f, 0x103053,
                 0x103247, 0x10343b, 0x10364f, 0x103845, 0x103a38, 0x103c4c, 0x103e42, 0x104036, 0x104249,
                 0x10443d, 0x104651,
                 0x104846, 0x104a3a, 0x104c4e, 0x104e43, 0x105038, 0x10524a, 0x10543e, 0x105652, 0x105847,
                 0x105a3b, 0x105c4f,
                 0x105e45, 0x106039, 0x10624c, 0x106441, 0x106635, 0x106849, 0x106a3d, 0x106c51, 0x106e47,
                 0x10703c, 0x10724f,
                 0x107444, 0x107638, 0x10784c, 0x107a3f, 0x107c53, 0x107e48]

    def LunarToSolar(self, lunar):
        """Convert a :class:`Lunar` date to the corresponding :class:`Solar` date."""
        days = LunarSolarConverter.lunar_month_days[lunar.lunarYear - LunarSolarConverter.lunar_month_days[0]]
        leap = GetBitInt(days, 4, 13)  # which lunar month (if any) is the leap month
        offset = 0
        loopend = leap
        if not lunar.isleap:

            if lunar.lunarMonth <= leap or leap == 0:

                # Target month precedes (or the year has no) leap month:
                # sum only the regular months before it.
                loopend = lunar.lunarMonth - 1

            else:

                # Target month follows the leap month: include the leap
                # month's slot in the sum as well.
                loopend = lunar.lunarMonth

        # Accumulate the day counts of all preceding months: bit set = 30-day
        # ("big") month, bit clear = 29-day ("small") month.
        for i in range(0, loopend):
            offset += GetBitInt(days, 1, 12 - i) == 1 and 30 or 29

        offset += lunar.lunarDay

        solar11 = LunarSolarConverter.solar_1_1[lunar.lunarYear - LunarSolarConverter.solar_1_1[0]]

        # Decode the solar date of this lunar year's first day,
        # packed as (year << 9 | month << 5 | day).
        y = GetBitInt(solar11, 12, 9)
        m = GetBitInt(solar11, 4, 5)
        d = GetBitInt(solar11, 5, 0)

        return SolarFromInt(SolarToInt(y, m, d) + offset - 1)

    def SolarToLunar(self, solar):
        """Convert a :class:`Solar` date to the corresponding :class:`Lunar` date."""
        lunar = Lunar(0, 0, 0, False)
        index = solar.solarYear - LunarSolarConverter.solar_1_1[0]
        # Pack the query date the same way the table entries are packed so
        # they compare as plain integers.
        data = (solar.solarYear << 9) | (solar.solarMonth << 5) | solar.solarDay
        if LunarSolarConverter.solar_1_1[index] > data:
            # The date falls before this year's lunar new year, so it belongs
            # to the previous lunar year.
            index -= 1

        solar11 = LunarSolarConverter.solar_1_1[index]
        y = GetBitInt(solar11, 12, 9)
        m = GetBitInt(solar11, 4, 5)
        d = GetBitInt(solar11, 5, 0)
        # Days elapsed since that lunar year's first day.
        offset = SolarToInt(solar.solarYear, solar.solarMonth, solar.solarDay) - SolarToInt(y, m, d)

        days = LunarSolarConverter.lunar_month_days[index]
        leap = GetBitInt(days, 4, 13)

        lunarY = index + LunarSolarConverter.solar_1_1[0]
        lunarM = 1
        offset += 1

        # Walk month by month (up to 13 slots in a leap year) until the
        # remaining offset fits inside the current month.
        for i in range(0, 13):

            dm = GetBitInt(days, 1, 12 - i) == 1 and 30 or 29
            if offset > dm:

                lunarM += 1
                offset -= dm

            else:

                break

        lunarD = int(offset)
        lunar.lunarYear = lunarY
        lunar.lunarMonth = lunarM
        lunar.isleap = False
        if leap != 0 and lunarM > leap:

            # Month slots counted past the leap month are shifted back by
            # one; the slot immediately after the leap position IS the leap
            # month itself.
            lunar.lunarMonth = lunarM - 1
            if lunarM == leap + 1:
                lunar.isleap = True

        lunar.lunarDay = lunarD
        return lunar

    def __init__(self):
        # Stateless converter: all data lives in the class-level tables.
        pass


if __name__ == '__main__':
    # Round-trip demo: solar -> lunar -> solar should reproduce the input.
    converter = LunarSolarConverter()
    solar = Solar(2111, 1, 25)
    pprint(vars(solar))
    lunar = converter.SolarToLunar(solar)
    pprint(vars(lunar))
    solar = converter.LunarToSolar(lunar)
    pprint(vars(solar))
    print(len(converter.solar_1_1))
    print("Done")
tn.parse(target=u'过十分钟') # target为待分析语句,timeBase为基准时间默认是当前时间 10 | print(res) 11 | res = tn.parse(target=u'2013年二月二十八日下午四点三十分二十九秒', timeBase='2013-02-28 16:30:29') # target为待分析语句,timeBase为基准时间默认是当前时间 12 | print(res) 13 | res = tn.parse(target=u'我需要大概33天2分钟四秒', timeBase='2013-02-28 16:30:29') # target为待分析语句,timeBase为基准时间默认是当前时间 14 | print(res) 15 | res = tn.parse(target=u'今年儿童节晚上九点一刻') # target为待分析语句,timeBase为基准时间默认是当前时间 16 | print(res) 17 | res = tn.parse(target=u'2个小时以前') # target为待分析语句,timeBase为基准时间默认是当前时间 18 | print(res) 19 | res = tn.parse(target=u'晚上8点到上午10点之间') # target为待分析语句,timeBase为基准时间默认是当前时间 20 | print(res) 21 | 返回结果: 22 | 23 | {"timedelta": "0 days, 0:10:00", "type": "timedelta"} 24 | {"timestamp": "2013-02-28 16:30:29", "type": "timestamp"} 25 | {"type": "timedelta", "timedelta": {"year": 0, "month": 1, "day": 3, "hour": 0, "minute": 2, "second": 4}} 26 | {"timestamp": "2018-06-01 21:15:00", "type": "timestamp"} 27 | {"error": "no time pattern could be extracted."} 28 | {"type": "timespan", "timespan": ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]} 29 | 30 | ## 使用方式 31 | demo:python3 Test.py 32 | 33 | 优化说明 34 | 35 | | 问题 | 以前版本 | 现在版本 | 36 | | ----------- | ---------------------------------------- | ---------------------- | 37 | | 无法解析下下周末 | "timestamp": "2018-04-01 00:00:00" | "timestamp": "2018-04-08 00:00:00" | 38 | | 无法解析 3月4 | "2018-03-01" | "2018-03-04" | 39 | | 无法解析 初一 初二 | cannot parse | "2018-02-16" | 40 | | 晚上8点到上午10点之间 无法解析上午 | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] | ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]| 41 | | 3月21号  错误解析成2019年     | "2019-03-21" | "2018-03-21" | 42 | 43 | 感谢@[tianyuningmou](https://github.com/tianyuningmou) 目前增加了对24节气的支持 44 | 45 | 46 | temp = ['今年春分'] 47 | "timestamp" : "2020-03-20 00:00:00" 48 | 49 | ## TODO 50 | 51 | | 问题 | 现在版本 | 正确 52 | | ----------- | ---------------------------------------- | ---------------------- | 53 | | 晚上8点到上午10点之间 | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] | 
["2018-03-16 20:00:00", "2018-03-17 10:00:00"]" | "timestamp": "2018-04-08 00:00:00" | 54 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/RangeTimeEnum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 16:27 4 | # @Author : zhm 5 | # @File : RangeTimeEnum.py 6 | # @Software: PyCharm 7 | 8 | 9 | 10 | # 范围时间的默认时间点 11 | class RangeTimeEnum(): 12 | day_break = 3 # 黎明 13 | early_morning = 8 # 早 14 | morning = 10 # 上午 15 | noon = 12 # 中午、午间 16 | afternoon = 15 # 下午、午后 17 | night = 18 # 晚上、傍晚 18 | lateNight = 20 # 晚、晚间 19 | midNight = 23 # 深夜 20 | 21 | 22 | if __name__ == "__main__": 23 | print(RangeTimeEnum.afternoon) 24 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/StringPreHandler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 15:42 4 | # @Author : zhm 5 | # @File : StringPreHandler.py 6 | # @Software: PyCharm 7 | import regex as re 8 | 9 | # * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务 10 | class StringPreHandler: 11 | @classmethod 12 | def delKeyword(cls, target, rules): 13 | """ 14 | 该方法删除一字符串中所有匹配某一规则字串 15 | 可用于清理一个字符串中的空白符和语气助词 16 | :param target: 待处理字符串 17 | :param rules: 删除规则 18 | :return: 清理工作完成后的字符串 19 | """ 20 | pattern = re.compile(rules) 21 | res = pattern.sub('', target) 22 | # print res 23 | return res 24 | 25 | 26 | @classmethod 27 | def numberTranslator(cls, target): 28 | """ 29 | 该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字 30 | 如"这里有一千两百个人,六百零五个来自中国"可以转化为 31 | "这里有1200个人,605个来自中国" 32 | 此外添加支持了部分不规则表达方法 33 | 如两万零六百五可转化为20650 34 | 两百一十四和两百十四都可以转化为214 35 | 一六零加一五八可以转化为160+158 36 | 该方法目前支持的正确转化范围是0-99999999 37 | 该功能模块具有良好的复用性 38 | :param target: 待转化的字符串 39 | :return: 转化完毕后的字符串 40 | """ 41 | pattern = 
re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))") 42 | match = pattern.finditer(target) 43 | for m in match: 44 | group = m.group() 45 | s = group.split(u"万") 46 | s = filter(None, s) 47 | num = 0 48 | if len(s) == 2: 49 | num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000 50 | target = pattern.sub(str(num), target, 1) 51 | 52 | pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))") 53 | match = pattern.finditer(target) 54 | for m in match: 55 | group = m.group() 56 | s = group.split(u"千") 57 | s = filter(None, s) 58 | num = 0 59 | if len(s) == 2: 60 | num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100 61 | target = pattern.sub(str(num), target, 1) 62 | 63 | pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)") 64 | match = pattern.finditer(target) 65 | for m in match: 66 | group = m.group() 67 | s = group.split(u"百") 68 | s = filter(None, s) 69 | num = 0 70 | if len(s) == 2: 71 | num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10 72 | target = pattern.sub(str(num), target, 1) 73 | 74 | pattern = re.compile(u"[零一二两三四五六七八九]") 75 | match = pattern.finditer(target) 76 | for m in match: 77 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 78 | 79 | pattern = re.compile(u"(?<=(周|星期))[末天日]") 80 | match = pattern.finditer(target) 81 | for m in match: 82 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 83 | 84 | pattern = re.compile(u"(?> file_out, json.dumps(out, indent=2, ensure_ascii=False).encode('utf-8') 63 | # 64 | # with open('resource/holi_lunar.json') as file_out: 65 | # print json.load(file_out) 66 | 67 | 68 | # dset = [] 69 | # with open('C:/Users/zhm/Desktop/test.txt') as testfile: 70 | # for each in testfile: 71 | # dset.append(each) 72 | # 73 | # def run(query): 74 | # tn = TimeNormalizer() 75 | # res = tn.parse(target=query, timeBase='2013-02-28 16:30:29') 76 | # print res 77 | # if __name__ == '__main__': 78 
| # while True: 79 | # query = random.choice(dset) 80 | # lp = LineProfiler() 81 | # lp_wrapper = lp(run) 82 | # lp_wrapper(query) 83 | # lp.print_stats() 84 | # cProfile.run("run(query)") 85 | 86 | # with open(os.path.dirname(__file__) + '/resource/regex.txt', 'wb') as f: 87 | # f.write(u'((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\\d+个?[年月日天][以之]?[前后])|(\\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\\d+[::]\\d+([::]\\d+)*)\\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\\d+[时点](\\d+)?分?(\\d+秒?)?)\\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\\d+)世)|([0-9]?[0-9]?[0-9]{2}\\.((10)|(11)|(12)|([1-9]))\\.((?=2017 2 | arrow>=0.10 3 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | resource 3 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/TimeNormalizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 16:39 4 | # @Author : zhm 5 | # @File : TimeNormalizer.py 6 | # @Software: PyCharm 7 | import pickle 8 | import regex as re 9 | import arrow 10 | import json 11 | import os 12 | import codecs 13 | 14 | 15 | from cocoNLP.config.basic.time_nlp.StringPreHandler import StringPreHandler 16 | from cocoNLP.config.basic.time_nlp.TimePoint import TimePoint 17 | from cocoNLP.config.basic.time_nlp.TimeUnit import TimeUnit 18 | 19 | # 时间表达式识别的主要工作类 20 | 21 | 22 | class TimeNormalizer: 23 | def __init__(self, isPreferFuture=True): 24 | self.isPreferFuture = isPreferFuture 25 | self.pattern, self.holi_solar, self.holi_lunar = self.init() 26 | 27 | # 这里对一些不规范的表达做转换 28 | 
def _filter(self, input_query): 29 | # 这里对于下个周末这种做转化 把个给移除掉 30 | input_query = StringPreHandler.numberTranslator(input_query) 31 | 32 | rule = u"[0-9]月[0-9]" 33 | pattern = re.compile(rule) 34 | match = pattern.search(input_query) 35 | if match != None: 36 | index = input_query.find('月') 37 | rule = u"日|号" 38 | pattern = re.compile(rule) 39 | match = pattern.search(input_query[index:]) 40 | if match == None: 41 | rule = u"[0-9]月[0-9]+" 42 | pattern = re.compile(rule) 43 | match = pattern.search(input_query) 44 | if match != None: 45 | end = match.span()[1] 46 | input_query = input_query[:end] + '号' + input_query[end:] 47 | 48 | rule = u"月" 49 | pattern = re.compile(rule) 50 | match = pattern.search(input_query) 51 | if match == None: 52 | input_query = input_query.replace('个', '') 53 | 54 | input_query = input_query.replace('中旬', '15号') 55 | input_query = input_query.replace('傍晚', '午后') 56 | input_query = input_query.replace('大年', '') 57 | input_query = input_query.replace('五一', '劳动节') 58 | input_query = input_query.replace('白天', '早上') 59 | input_query = input_query.replace(':', ':') 60 | return input_query 61 | 62 | def init(self): 63 | fpath = os.path.dirname(__file__) + '/resource/reg.pkl' 64 | # print(os.path.dirname(__file__)) 65 | try: 66 | with open(fpath, 'rb') as f: 67 | pattern = pickle.load(f) 68 | except: 69 | with codecs.open(os.path.dirname(__file__) + '/resource/regex.txt', 'r', 'utf-8-sig') as f: 70 | content = f.read() 71 | p = re.compile(content) 72 | with open(fpath, 'wb') as f: 73 | pickle.dump(p, f) 74 | with open(fpath, 'rb') as f: 75 | pattern = pickle.load(f) 76 | with codecs.open(os.path.dirname(__file__) + '/resource/holi_solar.json', 'r', 'utf-8-sig') as f: 77 | holi_solar = json.load(f) 78 | with codecs.open(os.path.dirname(__file__) + '/resource/holi_lunar.json', 'r', 'utf-8-sig') as f: 79 | holi_lunar = json.load(f) 80 | return pattern, holi_solar, holi_lunar 81 | 82 | def parse(self, target, timeBase=None): 83 | """ 84 | 
TimeNormalizer的构造方法,timeBase取默认的系统当前时间 85 | :param timeBase: 基准时间点 86 | :param target: 待分析字符串 87 | :return: 时间单元数组 88 | """ 89 | if timeBase is None: 90 | timeBase = arrow.now() 91 | self.isTimeSpan = False 92 | self.invalidSpan = False 93 | self.timeSpan = '' 94 | self.target = self._filter(target) 95 | self.timeBase = arrow.get(timeBase).format('YYYY-M-D-H-m-s') 96 | self.nowTime = timeBase 97 | self.oldTimeBase = self.timeBase 98 | self.__preHandling() 99 | self.timeToken = self.__timeEx() 100 | dic = {} 101 | res = self.timeToken 102 | 103 | if self.isTimeSpan: 104 | 105 | if self.invalidSpan: 106 | dic['error'] = 'no time pattern could be extracted.' 107 | else: 108 | result = {} 109 | dic['type'] = 'timedelta' 110 | dic['timedelta'] = self.timeSpan 111 | # print(dic['timedelta']) 112 | index = dic['timedelta'].find('days') 113 | 114 | days = int(dic['timedelta'][:index-1]) 115 | result['year'] = int(days / 365) 116 | result['month'] = int(days / 30 - result['year'] * 12) 117 | result['day'] = int(days - result['year'] 118 | * 365 - result['month'] * 30) 119 | index = dic['timedelta'].find(',') 120 | time = dic['timedelta'][index+1:] 121 | time = time.split(':') 122 | result['hour'] = int(time[0]) 123 | result['minute'] = int(time[1]) 124 | result['second'] = int(time[2]) 125 | dic['timedelta'] = result 126 | else: 127 | if len(res) == 0: 128 | dic['error'] = 'no time pattern could be extracted.' 
129 | elif len(res) == 1: 130 | dic['type'] = 'timestamp' 131 | dic['timestamp'] = res[0].time.format("YYYY-MM-DD HH:mm:ss") 132 | else: 133 | dic['type'] = 'timespan' 134 | dic['timespan'] = [res[0].time.format( 135 | "YYYY-MM-DD HH:mm:ss"), res[1].time.format("YYYY-MM-DD HH:mm:ss")] 136 | return json.dumps(dic) 137 | 138 | def __preHandling(self): 139 | """ 140 | 待匹配字符串的清理空白符和语气助词以及大写数字转化的预处理 141 | :return: 142 | """ 143 | self.target = StringPreHandler.delKeyword( 144 | self.target, u"\\s+") # 清理空白符 145 | self.target = StringPreHandler.delKeyword( 146 | self.target, u"[的]+") # 清理语气助词 147 | self.target = StringPreHandler.numberTranslator(self.target) # 大写数字转化 148 | 149 | def __timeEx(self): 150 | """ 151 | 152 | :param target: 输入文本字符串 153 | :param timeBase: 输入基准时间 154 | :return: TimeUnit[]时间表达式类型数组 155 | """ 156 | startline = -1 157 | endline = -1 158 | rpointer = 0 159 | temp = [] 160 | 161 | match = self.pattern.finditer(self.target) 162 | for m in match: 163 | startline = m.start() 164 | if startline == endline: 165 | rpointer -= 1 166 | temp[rpointer] = temp[rpointer] + m.group() 167 | else: 168 | temp.append(m.group()) 169 | endline = m.end() 170 | rpointer += 1 171 | res = [] 172 | # 时间上下文: 前一个识别出来的时间会是下一个时间的上下文,用于处理:周六3点到5点这样的多个时间的识别,第二个5点应识别到是周六的。 173 | contextTp = TimePoint() 174 | # print(self.timeBase) 175 | # print('temp',temp) 176 | for i in range(0, rpointer): 177 | # 这里是一个类嵌套了一个类 178 | res.append(TimeUnit(temp[i], self, contextTp)) 179 | # res[i].tp.tunit[3] = -1 180 | contextTp = res[i].tp 181 | # print(self.nowTime.year) 182 | # print(contextTp.tunit) 183 | res = self.__filterTimeUnit(res) 184 | 185 | return res 186 | 187 | def __filterTimeUnit(self, tu_arr): 188 | """ 189 | 过滤timeUnit中无用的识别词。无用识别词识别出的时间是1970.01.01 00:00:00(fastTime=0) 190 | :param tu_arr: 191 | :return: 192 | """ 193 | if (tu_arr is None) or (len(tu_arr) < 1): 194 | return tu_arr 195 | res = [] 196 | for tu in tu_arr: 197 | if tu.time.timestamp != 0: 198 | res.append(tu) 199 | 
return res 200 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/TimePoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 15:37 4 | # @Author : zhm 5 | # @File : TimePoint.py 6 | # @Software: PyCharm 7 | 8 | 9 | # * 时间表达式单元规范化对应的内部类, 10 | # * 对应时间表达式规范化的每个字段, 11 | # * 六个字段分别是:年-月-日-时-分-秒, 12 | # * 每个字段初始化为-1 13 | class TimePoint: 14 | def __init__(self): 15 | self.tunit = [-1, -1, -1, -1, -1, -1] 16 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/23 13:22 4 | # @Author : zhm 5 | # @File : __init__.py 6 | # @Software: PyCharm -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc -------------------------------------------------------------------------------- 
/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc -------------------------------------------------------------------------------- 
/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/resource/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/12/5 17:29 4 | # @Author : zhm 5 | # @File : __init__.py 6 | # @Software: PyCharm -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/cocoNLP/config/basic/time_nlp/resource/holi_lunar.json: -------------------------------------------------------------------------------- 1 | { 2 | "中和节": "02-02", 3 | "中秋节": "08-15", 4 | "中元节": "07-15", 5 | "端午节": "05-05", 6 | "春节": "01-01", 7 | "元宵节": "01-15", 8 | "重阳节": "09-09", 9 | "7夕节": "07-07", 10 | "初1节": "01-01", 11 | "初2节": "01-02", 12 | "初3节": "01-03", 13 | "初4节": "01-04", 14 | "初5节": "01-05", 15 | "初6节": "01-06", 16 | "初7节": "01-07", 17 | "初8节": "01-08", 18 | "初9节": "01-09", 19 | "初10节": "01-10", 20 | "初11节": "01-11", 21 | "初12节": "01-12", 22 | "初13节": "01-13", 23 | "初14节": "01-14", 24 | "初15节": "01-15" 25 | } 26 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/resource/holi_solar.json: -------------------------------------------------------------------------------- 1 | { 2 | "植树节": "03-12", 3 | "圣诞节": "12-25", 4 | "青年节": "05-04", 5 | "教师节": "09-10", 6 | "儿童节": "06-01", 7 | "元旦节": "01-01", 8 | "国庆节": "10-01", 9 | "劳动节": "05-01", 10 | "妇女节": "03-08", 11 | "建军节": "08-01", 12 | "航海日节": "07-11", 13 | "建党节": "07-01", 14 | "记者节": "11-08", 15 | "情人节":"02-14", 16 | "母亲节":"05-11" 17 | } 18 | -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/resource/reg.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/resource/reg.pkl -------------------------------------------------------------------------------- /cocoNLP/config/basic/time_nlp/resource/regex.txt: -------------------------------------------------------------------------------- 1 | 
((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[::]\d+([::]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((? 82 | """ 83 | with codecs.open(path, 'r', 'utf-8-sig') as f: 84 | stopwords = f.readlines() 85 | stopwords_list = [] 86 | for word in stopwords: 87 | stopwords_list.append(word.replace('\n', '').replace(' ', '')) 88 | 89 | return stopwords_list 90 | 91 | def tokenize_chinese(self,text): 92 | 93 | sentences = re.split(r'(,|。|!|\!|\.|?|\?)', text) # 保留分割符 94 | 95 | new_sents = [] 96 | for i in range(int(len(sentences) / 2)): 97 | sent = sentences[2 * i] + sentences[2 * i + 1] 98 | new_sents.append(sent) 99 | return sentences 100 | 101 | def extract_keywords_from_text(self, text, min_len, max_len): 102 | """Method to extract keywords from the text provided. 103 | 104 | :param text: Text to extract keywords from, provided as a string. 105 | """ 106 | sentences = self.tokenize_chinese(text) 107 | self.extract_keywords_from_sentences(sentences, min_len, max_len) 108 | 109 | def extract_keywords_from_sentences(self, sentences, min_len, max_len): 110 | """Method to extract keywords from the list of sentences provided. 111 | 112 | :param sentences: Text to extraxt keywords from, provided as a list 113 | of strings, where each string is a sentence. 114 | """ 115 | phrase_list = self._generate_phrases(sentences, min_len, max_len) 116 | self._build_frequency_dist(phrase_list) 117 | self._build_word_co_occurance_graph(phrase_list) 118 | self._build_ranklist(phrase_list) 119 | 120 | def get_ranked_phrases(self): 121 | """Method to fetch ranked keyword strings. 
122 | 123 | :return: List of strings where each string represents an extracted 124 | keyword string. 125 | """ 126 | return self.ranked_phrases 127 | 128 | def get_ranked_phrases_with_scores(self): 129 | """Method to fetch ranked keyword strings along with their scores. 130 | 131 | :return: List of tuples where each tuple is formed of an extracted 132 | keyword string and its score. Ex: (5.68, 'Four Scoures') 133 | """ 134 | return self.rank_list 135 | 136 | def get_word_frequency_distribution(self): 137 | """Method to fetch the word frequency distribution in the given text. 138 | 139 | :return: Dictionary (defaultdict) of the format `word -> frequency`. 140 | """ 141 | return self.frequency_dist 142 | 143 | def get_word_degrees(self): 144 | """Method to fetch the degree of words in the given text. Degree can be 145 | defined as sum of co-occurances of the word with other words in the 146 | given text. 147 | 148 | :return: Dictionary (defaultdict) of the format `word -> degree`. 149 | """ 150 | return self.degree 151 | 152 | def _build_frequency_dist(self, phrase_list): 153 | """Builds frequency distribution of the words in the given body of text. 154 | 155 | :param phrase_list: List of List of strings where each sublist is a 156 | collection of words which form a contender phrase. 157 | """ 158 | self.frequency_dist = Counter(chain.from_iterable(phrase_list)) 159 | 160 | def _build_word_co_occurance_graph(self, phrase_list): 161 | """Builds the co-occurance graph of words in the given body of text to 162 | compute degree of each word. 163 | 164 | :param phrase_list: List of List of strings where each sublist is a 165 | collection of words which form a contender phrase. 166 | """ 167 | co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0)) 168 | for phrase in phrase_list: 169 | # For each phrase in the phrase list, count co-occurances of the 170 | # word with other words in the phrase. 
171 | # 172 | # Note: Keep the co-occurances graph as is, to help facilitate its 173 | # use in other creative ways if required later. 174 | for (word, coword) in product(phrase, phrase): 175 | co_occurance_graph[word][coword] += 1 176 | self.degree = defaultdict(lambda: 0) 177 | for key in co_occurance_graph: 178 | self.degree[key] = sum(co_occurance_graph[key].values()) 179 | 180 | def _build_ranklist(self, phrase_list): 181 | """Method to rank each contender phrase using the formula 182 | 183 | phrase_score = sum of scores of words in the phrase. 184 | word_score = d(w)/f(w) where d is degree and f is frequency. 185 | 186 | :param phrase_list: List of List of strings where each sublist is a 187 | collection of words which form a contender phrase. 188 | """ 189 | self.rank_list = [] 190 | for phrase in phrase_list: 191 | rank = 0.0 192 | for word in phrase: 193 | if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO: 194 | rank += 1.0 * self.degree[word] / self.frequency_dist[word] 195 | elif self.metric == Metric.WORD_DEGREE: 196 | rank += 1.0 * self.degree[word] 197 | else: 198 | rank += 1.0 * self.frequency_dist[word] 199 | self.rank_list.append((rank, " ".join(phrase))) 200 | self.rank_list.sort(reverse=True) 201 | self.ranked_phrases = [ph[1] for ph in self.rank_list] 202 | 203 | def _generate_phrases(self, sentences, min_len, max_len): 204 | """Method to generate contender phrases given the sentences of the text 205 | document. 206 | 207 | :param sentences: List of strings where each string represents a 208 | sentence which forms the text. 209 | :return: Set of string tuples where each tuple is a collection 210 | of words forming a contender phrase. 211 | """ 212 | phrase_list = set() 213 | # Create contender phrases from sentences. 
214 | for sentence in sentences: 215 | word_list = [word for word in list(jieba.cut(sentence))] 216 | phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len)) 217 | return phrase_list 218 | 219 | def _get_phrase_list_from_words(self, word_list, min_len, max_len): 220 | """Method to create contender phrases from the list of words that form 221 | a sentence by dropping stopwords and punctuations and grouping the left 222 | words into phrases. Only phrases in the given length range (both limits 223 | inclusive) would be considered to build co-occurrence matrix. Ex: 224 | 225 | Sentence: Red apples, are good in flavour. 226 | List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour'] 227 | List after dropping punctuations and stopwords. 228 | List of words: ['red', 'apples', *, *, good, *, 'flavour'] 229 | List of phrases: [('red', 'apples'), ('good',), ('flavour',)] 230 | 231 | List of phrases with a correct length: 232 | For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)] 233 | For the range [1, 1]: [('good',), ('flavour',)] 234 | For the range [2, 2]: [('red', 'apples')] 235 | 236 | :param word_list: List of words which form a sentence when joined in 237 | the same order. 238 | :return: List of contender phrases that are formed after dropping 239 | stopwords and punctuations. 
240 | """ 241 | groups = groupby(word_list, lambda x: x not in self.to_ignore) 242 | phrases = [] 243 | for group in groups: 244 | tmp = tuple(group[1]) 245 | len_g1 = len(list(tmp)) 246 | if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase 247 | phrases.append(tuple(tmp)) 248 | 249 | return list( 250 | filter( 251 | lambda x: self.min_length <= len(x) <= self.max_length, phrases 252 | ) 253 | ) 254 | -------------------------------------------------------------------------------- /cocoNLP/extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from phone import Phone 4 | from itertools import groupby 5 | import phonenumbers 6 | from pyhanlp import * 7 | from cocoNLP.config.basic.time_nlp.TimeNormalizer import * 8 | 9 | 10 | 11 | __all__ = ['extract_email', 'replace_chinese','extract_cellphone', 'extract_cellphone', 'extract_cellphone_location', 12 | 'get_location', 'extract_locations', 'replace_cellphoneNum', 'extract_time', 'extract_name', 'most_common'] 13 | 14 | class extractor(): 15 | def __init__(self): 16 | pass 17 | 18 | def extract_email(self, text): 19 | """ 20 | extract all email addresses from texts 21 | eg: extract_email('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈') 22 | 23 | 24 | :param: raw_text 25 | :return: email_addresses_list 26 | """ 27 | if text=='': 28 | return [] 29 | eng_texts = self.replace_chinese(text) 30 | eng_texts = eng_texts.replace(' at ','@').replace(' dot ','.') 31 | sep = ',!?:; ,。!?《》、|\\/' 32 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 33 | 34 | email_pattern = r'^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z_-]+)+$' 35 | 36 | emails = [] 37 | for eng_text in eng_split_texts: 38 | result = re.match(email_pattern, eng_text, flags=0) 39 | if result: 40 | emails.append(result.string) 41 | return emails 42 | 43 | def extract_ids(self, text): 44 | """ 
45 | extract all ids from texts 46 | eg: extract_ids('my ids is 150404198812011101 m and dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈') 47 | 48 | 49 | :param: raw_text 50 | :return: ids_list 51 | """ 52 | if text == '': 53 | return [] 54 | eng_texts = self.replace_chinese(text) 55 | sep = ',!?:; :,.。!?《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 56 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 57 | eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele) == 18] 58 | 59 | id_pattern = r'^[1-9][0-7]\d{4}((19\d{2}(0[13-9]|1[012])(0[1-9]|[12]\d|30))|(19\d{2}(0[13578]|1[02])31)|(19\d{2}02(0[1-9]|1\d|2[0-8]))|(19([13579][26]|[2468][048]|0[48])0229))\d{3}(\d|X|x)?$' 60 | 61 | phones = [] 62 | for eng_text in eng_split_texts_clean: 63 | result = re.match(id_pattern, eng_text, flags=0) 64 | if result: 65 | phones.append(result.string.replace('+86','').replace('-','')) 66 | return phones 67 | 68 | def replace_chinese(self, text): 69 | """ 70 | remove all the chinese characters in text 71 | eg: replace_chinese('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈') 72 | 73 | 74 | :param: raw_text 75 | :return: text_without_chinese 76 | """ 77 | if text=='': 78 | return [] 79 | filtrate = re.compile(u'[\u4E00-\u9FA5]') 80 | text_without_chinese = filtrate.sub(r' ', text) 81 | return text_without_chinese 82 | 83 | def extract_cellphone(self, text, nation): 84 | """ 85 | extract all cell phone numbers from texts 86 | eg: extract_email('my email address is sldisd@baidu.com and dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈') 87 | 88 | 89 | :param: raw_text 90 | :return: email_addresses_list 91 | """ 92 | if text=='': 93 | return [] 94 | eng_texts = self.replace_chinese(text) 95 | sep = ',!?:; :,.。!?《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 96 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 97 | eng_split_texts_clean = [ele for ele in 
eng_split_texts if len(ele)>=7 and len(ele)<17] 98 | if nation=='CHN': 99 | phone_pattern = r'^((\+86)?([- ])?)?(|(13[0-9])|(14[0-9])|(15[0-9])|(17[0-9])|(18[0-9])|(19[0-9]))([- ])?\d{3}([- ])?\d{4}([- ])?\d{4}$' 100 | 101 | phones = [] 102 | for eng_text in eng_split_texts_clean: 103 | result = re.match(phone_pattern, eng_text, flags=0) 104 | if result: 105 | phones.append(result.string.replace('+86','').replace('-','')) 106 | return phones 107 | 108 | def extract_cellphone_location(self, phoneNum, nation='CHN'): 109 | """ 110 | extract cellphone number locations according to the given number 111 | eg: extract_cellphone_location('181000765143',nation=CHN) 112 | 113 | 114 | :param: phoneNum, nation 115 | :return: location{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'} 116 | 117 | """ 118 | if nation=='CHN': 119 | p = Phone() 120 | loc_dict = p.find(phoneNum) 121 | if nation!='CHN': 122 | x = phonenumbers.parse(phoneNum, 'GB') 123 | if phonenumbers.is_possible_number(x): 124 | loc_dict = x 125 | # print(loc_dict) 126 | return loc_dict 127 | 128 | def get_location(self, word_pos_list): 129 | """ 130 | get location by the pos of the word, such as 'ns' 131 | eg: get_location('内蒙古赤峰市松山区') 132 | 133 | 134 | :param: word_pos_list 135 | :return: location_list eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区'] 136 | 137 | """ 138 | location_list = [] 139 | if word_pos_list==[]: 140 | return [] 141 | 142 | for i,t in enumerate(word_pos_list): 143 | word = t[0] 144 | nature = t[1] 145 | if nature == 'ns': 146 | loc_tmp = word 147 | count = i + 1 148 | while count < len(word_pos_list): 149 | next_word_pos = word_pos_list[count] 150 | next_pos = next_word_pos[1] 151 | next_word = next_word_pos[0] 152 | if next_pos=='ns' or 'n' == next_pos[0]: 153 | loc_tmp += next_word 154 | else: 155 | break 156 | count += 1 157 | location_list.append(loc_tmp) 158 | 159 | return location_list # max(location_list) 160 | 161 | def 
extract_locations(self, text): 162 | """ 163 | extract locations by from texts 164 | eg: extract_locations('我家住在陕西省安康市汉滨区。') 165 | 166 | 167 | :param: raw_text 168 | :return: location_list eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区'] 169 | 170 | """ 171 | if text=='': 172 | return [] 173 | seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)] 174 | location_list = self.get_location(seg_list) 175 | return location_list 176 | 177 | def replace_cellphoneNum(self, text): 178 | """ 179 | remove cellphone number from texts. If text contains cellphone No., the extract_time will report errors. 180 | hence, we remove it here. 181 | eg: extract_locations('我家住在陕西省安康市汉滨区,我的手机号是181-0006-5143。') 182 | 183 | 184 | :param: raw_text 185 | :return: text_without_cellphone eg: '我家住在陕西省安康市汉滨区,我的手机号是。' 186 | 187 | """ 188 | eng_texts = self.replace_chinese(text) 189 | sep = ',!?:; :,.。!?《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 190 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 191 | eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele)>=7 and len(ele)<17] 192 | for phone_num in eng_split_texts_clean: 193 | text = text.replace(phone_num,'') 194 | return text 195 | 196 | def replace_ids(self, text): 197 | """ 198 | remove cellphone number from texts. If text contains cellphone No., the extract_time will report errors. 199 | hence, we remove it here. 
200 | eg: extract_locations('我家住在陕西省安康市汉滨区,我的身份证号是150404198412011312。') 201 | 202 | 203 | :param: raw_text 204 | :return: text_without_ids eg: '我家住在陕西省安康市汉滨区,我的身份证号号是。' 205 | 206 | """ 207 | if text == '': 208 | return [] 209 | eng_texts = self.replace_chinese(text) 210 | sep = ',!?:; :,.。!?《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 211 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 212 | eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele) == 18] 213 | 214 | id_pattern = r'^[1-9][0-7]\d{4}((19\d{2}(0[13-9]|1[012])(0[1-9]|[12]\d|30))|(19\d{2}(0[13578]|1[02])31)|(19\d{2}02(0[1-9]|1\d|2[0-8]))|(19([13579][26]|[2468][048]|0[48])0229))\d{3}(\d|X|x)?$' 215 | ids = [] 216 | for eng_text in eng_split_texts_clean: 217 | result = re.match(id_pattern, eng_text, flags=0) 218 | if result: 219 | ids.append(result.string) 220 | 221 | for phone_num in ids: 222 | text = text.replace(phone_num,'') 223 | return text 224 | 225 | def extract_time(self, text): 226 | """ 227 | extract timestamp from texts 228 | eg: extract_time('我于2018年1月1日获得1000万美金奖励。') 229 | 230 | 231 | :param: raw_text 232 | :return: time_info eg: {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"} 233 | 234 | """ 235 | if text=='': 236 | return [] 237 | tmp_text = self.replace_cellphoneNum(text) 238 | tmp_text = self.replace_ids(tmp_text) 239 | tn = TimeNormalizer() 240 | res = tn.parse(target=tmp_text) # target为待分析语句,timeBase为基准时间默认是当前时间 241 | return res 242 | 243 | def extract_name(self, text): 244 | """ 245 | extract chinese names from texts 246 | eg: extract_time('急寻王龙,短发,王龙,男,丢失发型短发,...如有线索,请迅速与警方联系:19909156745') 247 | 248 | 249 | :param: raw_text 250 | :return: name_list eg: ['王龙', '王龙'] 251 | 252 | """ 253 | if text=='': 254 | return [] 255 | seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)] 256 | names = [] 257 | for ele_tup in seg_list: 258 | if 'nr' in ele_tup[1]: 259 | names.append(ele_tup[0]) 260 | # 
print(ele_tup[0],ele_tup[1]) 261 | return self.most_common(names) 262 | 263 | def most_common(self, content_list): 264 | """ 265 | return the most common element in a list 266 | eg: extract_time(['王龙','王龙','李二狗']) 267 | 268 | 269 | :param: content_list 270 | :return: name eg: '王龙' 271 | """ 272 | if content_list==[]: 273 | return None 274 | if len(content_list)==0: 275 | return None 276 | return max(set(content_list), key=content_list.count) 277 | 278 | 279 | 280 | 281 | 282 | if __name__ == '__main__': 283 | 284 | text = '急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发,...如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com' 285 | ex = extractor() 286 | 287 | emails = ex.extract_email(text) 288 | cellphones = ex.extract_cellphone(text,nation='CHN') 289 | cell_loc = [] 290 | for cell in cellphones: 291 | cell_loc.append(ex.extract_cellphone_location(cell,'CHN')) 292 | 293 | locations = ex.extract_locations(text) 294 | times = ex.extract_time(text) 295 | names = ex.extract_name(text) 296 | 297 | result_dict = {} 298 | result_dict['email'] = emails 299 | result_dict['cellphone'] = cellphones 300 | result_dict['cellphone_location'] = cell_loc 301 | result_dict['location'] = locations 302 | result_dict['time'] = times 303 | result_dict['name'] = names 304 | for key in result_dict.keys(): 305 | print(key,result_dict[key]) -------------------------------------------------------------------------------- /cocoNLP/extractor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/extractor.pyc -------------------------------------------------------------------------------- /dist/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/.DS_Store 
-------------------------------------------------------------------------------- /dist/cocoNLP-0.0.10.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.10.tar.gz -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.11.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.11.tar.gz -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.12.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.12.tar.gz -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.13.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.13.tar.gz -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include cocoNLP/config * 2 | recursive-include cocoNLP/config * 3 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: cocoNLP 3 | Version: 0.0.9 4 | Summary: Python implementation of many nlp algorithms 5 | Home-page: https://github.com/fighting41love 6 | Author: Yang Yang 7 | Author-email: yangyangfuture@gmail.com 8 | License: MIT 9 | Description: UNKNOWN 
10 | Keywords: nlp text-mining information extraction 11 | Platform: UNKNOWN 12 | Classifier: Intended Audience :: Developers 13 | Classifier: Intended Audience :: Education 14 | Classifier: License :: OSI Approved :: MIT License 15 | Classifier: Development Status :: 3 - Alpha 16 | Classifier: Operating System :: POSIX 17 | Classifier: Programming Language :: Python :: 2.7 18 | Classifier: Programming Language :: Python :: 3.4 19 | Classifier: Programming Language :: Python :: 3.5 20 | Classifier: Programming Language :: Python :: 3.6 21 | Classifier: Topic :: Software Development :: Build Tools 22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 23 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: cocoNLP 3 | Version: 0.0.9 4 | Summary: Python implementation of many nlp algorithms 5 | Home-page: https://github.com/fighting41love 6 | Author: Yang Yang 7 | Author-email: yangyangfuture@gmail.com 8 | License: MIT 9 | Description: UNKNOWN 10 | Keywords: nlp text-mining information extraction 11 | Platform: UNKNOWN 12 | Classifier: Intended Audience :: Developers 13 | Classifier: Intended Audience :: Education 14 | Classifier: License :: OSI Approved :: MIT License 15 | Classifier: Development Status :: 3 - Alpha 16 | Classifier: Operating System :: POSIX 17 | Classifier: Programming Language :: Python :: 2.7 18 | Classifier: Programming Language :: Python :: 3.4 19 | Classifier: Programming Language :: Python :: 3.5 20 | Classifier: Programming Language :: Python :: 3.6 21 | Classifier: Topic :: Software Development :: Build Tools 22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 23 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP.egg-info/SOURCES.txt: 
-------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | readme.md 3 | requirements.txt 4 | setup.py 5 | test.py 6 | cocoNLP/__init__.py 7 | cocoNLP/__version__.py 8 | cocoNLP/extractor.py 9 | cocoNLP.egg-info/PKG-INFO 10 | cocoNLP.egg-info/SOURCES.txt 11 | cocoNLP.egg-info/dependency_links.txt 12 | cocoNLP.egg-info/requires.txt 13 | cocoNLP.egg-info/top_level.txt 14 | cocoNLP/config/basic/time_nlp/.DS_Store 15 | cocoNLP/config/basic/time_nlp/LunarSolarConverter.py 16 | cocoNLP/config/basic/time_nlp/README.md 17 | cocoNLP/config/basic/time_nlp/RangeTimeEnum.py 18 | cocoNLP/config/basic/time_nlp/StringPreHandler.py 19 | cocoNLP/config/basic/time_nlp/Test.py 20 | cocoNLP/config/basic/time_nlp/TimeNormalizer.py 21 | cocoNLP/config/basic/time_nlp/TimePoint.py 22 | cocoNLP/config/basic/time_nlp/TimeUnit.py 23 | cocoNLP/config/basic/time_nlp/__init__.py 24 | cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO 25 | cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt 26 | cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt 27 | cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe 28 | cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt 29 | cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt 30 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO 31 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt 32 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt 33 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe 34 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt 35 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt 36 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc 37 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc 38 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc 39 | cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc 40 | 
cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc 41 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc 42 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc 43 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc 44 | cocoNLP/config/basic/time_nlp/resource/__init__.py 45 | cocoNLP/config/basic/time_nlp/resource/holi_lunar.json 46 | cocoNLP/config/basic/time_nlp/resource/holi_solar.json 47 | cocoNLP/config/basic/time_nlp/resource/reg.pkl 48 | cocoNLP/config/basic/time_nlp/resource/regex.txt 49 | cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc 50 | cocoNLP/config/phrase/rake.py 51 | cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc 52 | cocoNLP/config/phrase/data/stopwords.txt -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | pyhanlp 3 | phone 4 | phonenumbers 5 | regex 6 | arrow 7 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | cocoNLP 2 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | # ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | # -*- coding: utf-8 -*- 10 | 11 | """ 12 | 
cocoNLP module 13 | :copyright: (c) 2018 by Yang Yang. 14 | :license: MIT, see LICENSE for more details. 15 | """ 16 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # _ _ _ ____ 3 | # ___ ___ ___ ___ | \ | | | | _ \ 4 | # / __/ _ \ / __/ _ \| \| | | | |_) | 5 | # | (_| (_) | (_| (_) | |\ | |___| __/ 6 | # \___\___/ \___\___/|_| \_|_____|_| 7 | 8 | 9 | 10 | __title__ = "cocoNLP" 11 | __description__ = "Python implementation of many nlp algorithms" 12 | __url__ = "https://github.com/fighting41love" 13 | __version__ = "0.0.9" 14 | __author__ = "Yang Yang" 15 | __author_email__ = "yangyangfuture@gmail.com" 16 | __license__ = "MIT" 17 | __copyright__ = "Copyright 2018 Yang Yang" 18 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/.DS_Store -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: TimeConverter 3 | Version: 1.1.0 4 | Summary: ... 5 | Home-page: http://test.com 6 | Author: test 7 | Author-email: test@gmail.com 8 | License: MIT Licence 9 | Description: ... 
10 | Keywords: time,nlp 11 | Platform: any 12 | Classifier: Programming Language :: Python :: 2.6 13 | Classifier: Programming Language :: Python :: 2.7 14 | Classifier: Programming Language :: Python :: 3.6 15 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LunarSolarConverter.py 2 | README.md 3 | RangeTimeEnum.py 4 | StringPreHandler.py 5 | Test.py 6 | TimeNormalizer.py 7 | TimePoint.py 8 | TimeUnit.py 9 | __init__.py 10 | setup.py 11 | TimeConverter.egg-info/PKG-INFO 12 | TimeConverter.egg-info/SOURCES.txt 13 | TimeConverter.egg-info/dependency_links.txt 14 | TimeConverter.egg-info/not-zip-safe 15 | TimeConverter.egg-info/requires.txt 16 | TimeConverter.egg-info/top_level.txt 17 | resource/__init__.py -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt: -------------------------------------------------------------------------------- 1 | regex>=2017 2 | arrow>=0.10 3 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | resource 3 | 
-------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/LunarSolarConverter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/12/11 11:08 4 | # @Author : zhm 5 | # @File : LunarSolarConverter.py 6 | # @Software: PyCharm 7 | from pprint import pprint 8 | 9 | 10 | class Lunar: 11 | def __init__(self, lunarYear, lunarMonth, lunarDay, isleap): 12 | self.isleap = isleap 13 | self.lunarDay = lunarDay 14 | self.lunarMonth = lunarMonth 15 | self.lunarYear = lunarYear 16 | 17 | 18 | class Solar: 19 | def __init__(self, solarYear, solarMonth, solarDay): 20 | self.solarDay = solarDay 21 | self.solarMonth = solarMonth 22 | self.solarYear = solarYear 23 | 24 | 25 | def GetBitInt(data, length, shift): 26 | return (data & (((1 << length) - 1) << shift)) >> shift 27 | 28 | 29 | def SolarToInt(y, m, d): 30 | m = (m + 9) % 12 31 | y -= int(m / 10) 32 | return 365 * y + int(y / 4) - int(y / 100) + int(y / 400) + int((m * 306 + 5) / 10) + (d - 1) 33 | 34 | 35 | def SolarFromInt(g): 36 | y = int((10000 * g + 14780) / 3652425) 37 | ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400)) 38 | if ddd < 0: 39 | y -= 1 40 | ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400)) 41 | 42 | mi = int((100 * ddd + 52) / 3060) 43 | mm = (mi + 2) % 12 + 1 44 | y += int((mi + 2) / 12) 45 | dd = ddd - int((mi * 306 + 5) / 10) + 1 46 | solar = Solar(y, mm, dd) 47 | return solar 48 | 49 | 50 | class LunarSolarConverter: 51 | ##################################################################################### 52 | # 1888~2111年农历数据表 53 | # 农历数据 每个元素的存储格式如下: 54 | # 16~13 12 11~0 55 | # 闰几月 闰月日数 1~12月份农历日数(大小月) 56 | # 注:1、bit0表示农历1月份日数,为1表示30天,为0表示29天。bit1表示农历2月份日数,依次类推。 57 | # 2、bit12表示闰月日数,1为30天,0为29天。bit16~bit13表示第几月是闰月(注:为0表示该年无闰月) 58 | # 数据来源参考: http://data.weather.gov.hk/gts/time/conversion1_text_c.htm 
59 | ##################################################################################### 60 | lunar_month_days = [1887, 0x1694, 0x16aa, 0x4ad5, 0xab6, 0xc4b7, 0x4ae, 0xa56, 0xb52a, 61 | 0x1d2a, 0xd54, 0x75aa, 0x156a, 0x1096d, 0x95c, 0x14ae, 0xaa4d, 0x1a4c, 0x1b2a, 0x8d55, 62 | 0xad4, 0x135a, 0x495d, 63 | 0x95c, 0xd49b, 0x149a, 0x1a4a, 0xbaa5, 0x16a8, 0x1ad4, 0x52da, 0x12b6, 0xe937, 0x92e, 64 | 0x1496, 0xb64b, 0xd4a, 65 | 0xda8, 0x95b5, 0x56c, 0x12ae, 0x492f, 0x92e, 0xcc96, 0x1a94, 0x1d4a, 0xada9, 0xb5a, 0x56c, 66 | 0x726e, 0x125c, 67 | 0xf92d, 0x192a, 0x1a94, 0xdb4a, 0x16aa, 0xad4, 0x955b, 0x4ba, 0x125a, 0x592b, 0x152a, 68 | 0xf695, 0xd94, 0x16aa, 69 | 0xaab5, 0x9b4, 0x14b6, 0x6a57, 0xa56, 0x1152a, 0x1d2a, 0xd54, 0xd5aa, 0x156a, 0x96c, 70 | 0x94ae, 0x14ae, 0xa4c, 71 | 0x7d26, 0x1b2a, 0xeb55, 0xad4, 0x12da, 0xa95d, 0x95a, 0x149a, 0x9a4d, 0x1a4a, 0x11aa5, 72 | 0x16a8, 0x16d4, 73 | 0xd2da, 0x12b6, 0x936, 0x9497, 0x1496, 0x1564b, 0xd4a, 0xda8, 0xd5b4, 0x156c, 0x12ae, 74 | 0xa92f, 0x92e, 0xc96, 75 | 0x6d4a, 0x1d4a, 0x10d65, 0xb58, 0x156c, 0xb26d, 0x125c, 0x192c, 0x9a95, 0x1a94, 0x1b4a, 76 | 0x4b55, 0xad4, 77 | 0xf55b, 0x4ba, 0x125a, 0xb92b, 0x152a, 0x1694, 0x96aa, 0x15aa, 0x12ab5, 0x974, 0x14b6, 78 | 0xca57, 0xa56, 0x1526, 79 | 0x8e95, 0xd54, 0x15aa, 0x49b5, 0x96c, 0xd4ae, 0x149c, 0x1a4c, 0xbd26, 0x1aa6, 0xb54, 80 | 0x6d6a, 0x12da, 0x1695d, 81 | 0x95a, 0x149a, 0xda4b, 0x1a4a, 0x1aa4, 0xbb54, 0x16b4, 0xada, 0x495b, 0x936, 0xf497, 82 | 0x1496, 0x154a, 0xb6a5, 83 | 0xda4, 0x15b4, 0x6ab6, 0x126e, 0x1092f, 0x92e, 0xc96, 0xcd4a, 0x1d4a, 0xd64, 0x956c, 84 | 0x155c, 0x125c, 0x792e, 85 | 0x192c, 0xfa95, 0x1a94, 0x1b4a, 0xab55, 0xad4, 0x14da, 0x8a5d, 0xa5a, 0x1152b, 0x152a, 86 | 0x1694, 0xd6aa, 87 | 0x15aa, 0xab4, 0x94ba, 0x14b6, 0xa56, 0x7527, 0xd26, 0xee53, 0xd54, 0x15aa, 0xa9b5, 0x96c, 88 | 0x14ae, 0x8a4e, 89 | 0x1a4c, 0x11d26, 0x1aa4, 0x1b54, 0xcd6a, 0xada, 0x95c, 0x949d, 0x149a, 0x1a2a, 0x5b25, 90 | 0x1aa4, 0xfb52, 91 | 0x16b4, 0xaba, 0xa95b, 0x936, 0x1496, 0x9a4b, 0x154a, 
0x136a5, 0xda4, 0x15ac] 92 | # 额外添加数据,方便快速计算阴历转阳历 每个元素的存储格式如下: 93 | # 12~7 6~5 4~0 94 | # 离元旦多少天 春节月 春节日 95 | ##################################################################################### 96 | solar_1_1 = [1887, 0xec04c, 0xec23f, 0xec435, 0xec649, 0xec83e, 0xeca51, 0xecc46, 0xece3a, 97 | 0xed04d, 0xed242, 0xed436, 0xed64a, 0xed83f, 0xeda53, 0xedc48, 0xede3d, 0xee050, 0xee244, 0xee439, 98 | 0xee64d, 99 | 0xee842, 0xeea36, 0xeec4a, 0xeee3e, 0xef052, 0xef246, 0xef43a, 0xef64e, 0xef843, 0xefa37, 0xefc4b, 100 | 0xefe41, 101 | 0xf0054, 0xf0248, 0xf043c, 0xf0650, 0xf0845, 0xf0a38, 0xf0c4d, 0xf0e42, 0xf1037, 0xf124a, 0xf143e, 102 | 0xf1651, 103 | 0xf1846, 0xf1a3a, 0xf1c4e, 0xf1e44, 0xf2038, 0xf224b, 0xf243f, 0xf2653, 0xf2848, 0xf2a3b, 0xf2c4f, 104 | 0xf2e45, 105 | 0xf3039, 0xf324d, 0xf3442, 0xf3636, 0xf384a, 0xf3a3d, 0xf3c51, 0xf3e46, 0xf403b, 0xf424e, 0xf4443, 106 | 0xf4638, 107 | 0xf484c, 0xf4a3f, 0xf4c52, 0xf4e48, 0xf503c, 0xf524f, 0xf5445, 0xf5639, 0xf584d, 0xf5a42, 0xf5c35, 108 | 0xf5e49, 109 | 0xf603e, 0xf6251, 0xf6446, 0xf663b, 0xf684f, 0xf6a43, 0xf6c37, 0xf6e4b, 0xf703f, 0xf7252, 0xf7447, 110 | 0xf763c, 111 | 0xf7850, 0xf7a45, 0xf7c39, 0xf7e4d, 0xf8042, 0xf8254, 0xf8449, 0xf863d, 0xf8851, 0xf8a46, 0xf8c3b, 112 | 0xf8e4f, 113 | 0xf9044, 0xf9237, 0xf944a, 0xf963f, 0xf9853, 0xf9a47, 0xf9c3c, 0xf9e50, 0xfa045, 0xfa238, 0xfa44c, 114 | 0xfa641, 115 | 0xfa836, 0xfaa49, 0xfac3d, 0xfae52, 0xfb047, 0xfb23a, 0xfb44e, 0xfb643, 0xfb837, 0xfba4a, 0xfbc3f, 116 | 0xfbe53, 117 | 0xfc048, 0xfc23c, 0xfc450, 0xfc645, 0xfc839, 0xfca4c, 0xfcc41, 0xfce36, 0xfd04a, 0xfd23d, 0xfd451, 118 | 0xfd646, 119 | 0xfd83a, 0xfda4d, 0xfdc43, 0xfde37, 0xfe04b, 0xfe23f, 0xfe453, 0xfe648, 0xfe83c, 0xfea4f, 0xfec44, 120 | 0xfee38, 121 | 0xff04c, 0xff241, 0xff436, 0xff64a, 0xff83e, 0xffa51, 0xffc46, 0xffe3a, 0x10004e, 0x100242, 122 | 0x100437, 123 | 0x10064b, 0x100841, 0x100a53, 0x100c48, 0x100e3c, 0x10104f, 0x101244, 0x101438, 0x10164c, 124 | 0x101842, 0x101a35, 125 | 0x101c49, 0x101e3d, 
0x102051, 0x102245, 0x10243a, 0x10264e, 0x102843, 0x102a37, 0x102c4b, 126 | 0x102e3f, 0x103053, 127 | 0x103247, 0x10343b, 0x10364f, 0x103845, 0x103a38, 0x103c4c, 0x103e42, 0x104036, 0x104249, 128 | 0x10443d, 0x104651, 129 | 0x104846, 0x104a3a, 0x104c4e, 0x104e43, 0x105038, 0x10524a, 0x10543e, 0x105652, 0x105847, 130 | 0x105a3b, 0x105c4f, 131 | 0x105e45, 0x106039, 0x10624c, 0x106441, 0x106635, 0x106849, 0x106a3d, 0x106c51, 0x106e47, 132 | 0x10703c, 0x10724f, 133 | 0x107444, 0x107638, 0x10784c, 0x107a3f, 0x107c53, 0x107e48] 134 | 135 | def LunarToSolar(self, lunar): 136 | days = LunarSolarConverter.lunar_month_days[lunar.lunarYear - LunarSolarConverter.lunar_month_days[0]] 137 | leap = GetBitInt(days, 4, 13) 138 | offset = 0 139 | loopend = leap 140 | if not lunar.isleap: 141 | 142 | if lunar.lunarMonth <= leap or leap == 0: 143 | 144 | loopend = lunar.lunarMonth - 1 145 | 146 | else: 147 | 148 | loopend = lunar.lunarMonth 149 | 150 | for i in range(0, loopend): 151 | offset += GetBitInt(days, 1, 12 - i) == 1 and 30 or 29 152 | 153 | offset += lunar.lunarDay 154 | 155 | solar11 = LunarSolarConverter.solar_1_1[lunar.lunarYear - LunarSolarConverter.solar_1_1[0]] 156 | 157 | y = GetBitInt(solar11, 12, 9) 158 | m = GetBitInt(solar11, 4, 5) 159 | d = GetBitInt(solar11, 5, 0) 160 | 161 | return SolarFromInt(SolarToInt(y, m, d) + offset - 1) 162 | 163 | def SolarToLunar(self, solar): 164 | 165 | lunar = Lunar(0, 0, 0, False) 166 | index = solar.solarYear - LunarSolarConverter.solar_1_1[0] 167 | data = (solar.solarYear << 9) | (solar.solarMonth << 5) | solar.solarDay 168 | if LunarSolarConverter.solar_1_1[index] > data: 169 | index -= 1 170 | 171 | solar11 = LunarSolarConverter.solar_1_1[index] 172 | y = GetBitInt(solar11, 12, 9) 173 | m = GetBitInt(solar11, 4, 5) 174 | d = GetBitInt(solar11, 5, 0) 175 | offset = SolarToInt(solar.solarYear, solar.solarMonth, solar.solarDay) - SolarToInt(y, m, d) 176 | 177 | days = LunarSolarConverter.lunar_month_days[index] 178 | leap = 
GetBitInt(days, 4, 13) 179 | 180 | lunarY = index + LunarSolarConverter.solar_1_1[0] 181 | lunarM = 1 182 | offset += 1 183 | 184 | for i in range(0, 13): 185 | 186 | dm = GetBitInt(days, 1, 12 - i) == 1 and 30 or 29 187 | if offset > dm: 188 | 189 | lunarM += 1 190 | offset -= dm 191 | 192 | else: 193 | 194 | break 195 | 196 | lunarD = int(offset) 197 | lunar.lunarYear = lunarY 198 | lunar.lunarMonth = lunarM 199 | lunar.isleap = False 200 | if leap != 0 and lunarM > leap: 201 | 202 | lunar.lunarMonth = lunarM - 1 203 | if lunarM == leap + 1: 204 | lunar.isleap = True 205 | 206 | lunar.lunarDay = lunarD 207 | return lunar 208 | 209 | def __init__(self): 210 | pass 211 | 212 | 213 | if __name__ == '__main__': 214 | converter = LunarSolarConverter() 215 | solar = Solar(2111, 1, 25) 216 | pprint(vars(solar)) 217 | lunar = converter.SolarToLunar(solar) 218 | pprint(vars(lunar)) 219 | solar = converter.LunarToSolar(lunar) 220 | pprint(vars(solar)) 221 | print(len(converter.solar_1_1)) 222 | print("Done") 223 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/README.md: -------------------------------------------------------------------------------- 1 | ## 简介 2 | Time-NLP的python3版本 3 | python 版本https://github.com/sunfiyes/Time-NLPY 4 | Java 版本https://github.com/shinyke/Time-NLP 5 | ## 功能说明 6 | 用于句子中时间词的抽取和转换 7 | 详情请见test.py 8 | 9 | res = tn.parse(target=u'过十分钟') # target为待分析语句,timeBase为基准时间默认是当前时间 10 | print(res) 11 | res = tn.parse(target=u'2013年二月二十八日下午四点三十分二十九秒', timeBase='2013-02-28 16:30:29') # target为待分析语句,timeBase为基准时间默认是当前时间 12 | print(res) 13 | res = tn.parse(target=u'我需要大概33天2分钟四秒', timeBase='2013-02-28 16:30:29') # target为待分析语句,timeBase为基准时间默认是当前时间 14 | print(res) 15 | res = tn.parse(target=u'今年儿童节晚上九点一刻') # target为待分析语句,timeBase为基准时间默认是当前时间 16 | print(res) 17 | res = tn.parse(target=u'2个小时以前') # target为待分析语句,timeBase为基准时间默认是当前时间 18 | print(res) 19 | res = 
tn.parse(target=u'晚上8点到上午10点之间') # target为待分析语句,timeBase为基准时间默认是当前时间 20 | print(res) 21 | 返回结果: 22 | 23 | {"timedelta": "0 days, 0:10:00", "type": "timedelta"} 24 | {"timestamp": "2013-02-28 16:30:29", "type": "timestamp"} 25 | {"type": "timedelta", "timedelta": {"year": 0, "month": 1, "day": 3, "hour": 0, "minute": 2, "second": 4}} 26 | {"timestamp": "2018-06-01 21:15:00", "type": "timestamp"} 27 | {"error": "no time pattern could be extracted."} 28 | {"type": "timespan", "timespan": ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]} 29 | 30 | ## 使用方式 31 | demo:python3 Test.py 32 | 33 | 优化说明 34 | 35 | | 问题 | 以前版本 | 现在版本 | 36 | | ----------- | ---------------------------------------- | ---------------------- | 37 | | 无法解析下下周末 | "timestamp": "2018-04-01 00:00:00" | "timestamp": "2018-04-08 00:00:00" | 38 | | 无法解析 3月4 | "2018-03-01" | "2018-03-04" | 39 | | 无法解析 初一 初二 | cannot parse | "2018-02-16" | 40 | | 晚上8点到上午10点之间 无法解析上午 | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] | ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]| 41 | | 3月21号  错误解析成2019年     | "2019-03-21" | "2018-03-21" | 42 | 43 | 感谢@[tianyuningmou](https://github.com/tianyuningmou) 目前增加了对24节气的支持 44 | 45 | 46 | temp = ['今年春分'] 47 | "timestamp" : "2020-03-20 00:00:00" 48 | 49 | ## TODO 50 | 51 | | 问题 | 现在版本 | 正确 52 | | ----------- | ---------------------------------------- | ---------------------- | 53 | | 晚上8点到上午10点之间 | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] | ["2018-03-16 20:00:00", "2018-03-17 10:00:00"]" | "timestamp": "2018-04-08 00:00:00" | 54 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/RangeTimeEnum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 16:27 4 | # @Author : zhm 5 | # @File : RangeTimeEnum.py 6 | # @Software: PyCharm 7 | 8 | 9 | 10 | # 范围时间的默认时间点 11 | class RangeTimeEnum(): 12 | 
day_break = 3 # 黎明 13 | early_morning = 8 # 早 14 | morning = 10 # 上午 15 | noon = 12 # 中午、午间 16 | afternoon = 15 # 下午、午后 17 | night = 18 # 晚上、傍晚 18 | lateNight = 20 # 晚、晚间 19 | midNight = 23 # 深夜 20 | 21 | 22 | if __name__ == "__main__": 23 | print(RangeTimeEnum.afternoon) 24 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/StringPreHandler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 15:42 4 | # @Author : zhm 5 | # @File : StringPreHandler.py 6 | # @Software: PyCharm 7 | import regex as re 8 | 9 | # * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务 10 | class StringPreHandler: 11 | @classmethod 12 | def delKeyword(cls, target, rules): 13 | """ 14 | 该方法删除一字符串中所有匹配某一规则字串 15 | 可用于清理一个字符串中的空白符和语气助词 16 | :param target: 待处理字符串 17 | :param rules: 删除规则 18 | :return: 清理工作完成后的字符串 19 | """ 20 | pattern = re.compile(rules) 21 | res = pattern.sub('', target) 22 | # print res 23 | return res 24 | 25 | 26 | @classmethod 27 | def numberTranslator(cls, target): 28 | """ 29 | 该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字 30 | 如"这里有一千两百个人,六百零五个来自中国"可以转化为 31 | "这里有1200个人,605个来自中国" 32 | 此外添加支持了部分不规则表达方法 33 | 如两万零六百五可转化为20650 34 | 两百一十四和两百十四都可以转化为214 35 | 一六零加一五八可以转化为160+158 36 | 该方法目前支持的正确转化范围是0-99999999 37 | 该功能模块具有良好的复用性 38 | :param target: 待转化的字符串 39 | :return: 转化完毕后的字符串 40 | """ 41 | pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))") 42 | match = pattern.finditer(target) 43 | for m in match: 44 | group = m.group() 45 | s = group.split(u"万") 46 | s = filter(None, s) 47 | num = 0 48 | if len(s) == 2: 49 | num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000 50 | target = pattern.sub(str(num), target, 1) 51 | 52 | pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))") 53 | match = pattern.finditer(target) 54 | for m in match: 55 | 
group = m.group() 56 | s = group.split(u"千") 57 | s = filter(None, s) 58 | num = 0 59 | if len(s) == 2: 60 | num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100 61 | target = pattern.sub(str(num), target, 1) 62 | 63 | pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)") 64 | match = pattern.finditer(target) 65 | for m in match: 66 | group = m.group() 67 | s = group.split(u"百") 68 | s = filter(None, s) 69 | num = 0 70 | if len(s) == 2: 71 | num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10 72 | target = pattern.sub(str(num), target, 1) 73 | 74 | pattern = re.compile(u"[零一二两三四五六七八九]") 75 | match = pattern.finditer(target) 76 | for m in match: 77 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 78 | 79 | pattern = re.compile(u"(?<=(周|星期))[末天日]") 80 | match = pattern.finditer(target) 81 | for m in match: 82 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 83 | 84 | pattern = re.compile(u"(?=2017 2 | arrow>=0.10 3 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | resource 3 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeNormalizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 16:39 4 | # @Author : zhm 5 | # @File : TimeNormalizer.py 6 | # @Software: PyCharm 7 | import pickle 8 | import regex as re 9 | import arrow 10 | import json 11 | import os 12 | 13 | 14 | from cocoNLP.config.basic.time_nlp.StringPreHandler import StringPreHandler 15 | from cocoNLP.config.basic.time_nlp.TimePoint import TimePoint 16 | from cocoNLP.config.basic.time_nlp.TimeUnit import TimeUnit 17 | 
# Main working class for time-expression recognition and normalisation.
class TimeNormalizer:
    """Recognise time expressions in Chinese text and normalise them.

    The heavy lifting is done by one large compiled regex (cached on disk as
    ``resource/reg.pkl``) that tokenises time fragments, plus :class:`TimeUnit`,
    which interprets each fragment relative to a base time.  Holiday name
    lookups come from the two JSON tables in ``resource/``.
    """

    def __init__(self, isPreferFuture=True):
        # When True, ambiguous expressions are resolved towards the future
        # (e.g. a bare weekday means the coming one, not the past one).
        self.isPreferFuture = isPreferFuture
        self.pattern, self.holi_solar, self.holi_lunar = self.init()

    # Rewrite a few irregular colloquial expressions into forms the regex
    # can recognise.
    def _filter(self, input_query):
        """Pre-normalise *input_query* before regex matching.

        :param input_query: raw query string
        :return: rewritten query string
        """
        # Convert Chinese numerals to Arabic digits first.
        input_query = StringPreHandler.numberTranslator(input_query)

        rule = u"[0-9]月[0-9]"
        pattern = re.compile(rule)
        match = pattern.search(input_query)
        if match is not None:
            index = input_query.find('月')
            rule = u"日|号"
            pattern = re.compile(rule)
            match = pattern.search(input_query[index:])
            if match is None:
                # "3月4" lacks a day marker: append "号" right after the
                # day digits so the month/day regex can match it.
                rule = u"[0-9]月[0-9]+"
                pattern = re.compile(rule)
                match = pattern.search(input_query)
                if match is not None:
                    end = match.span()[1]
                    input_query = input_query[:end] + '号' + input_query[end:]

        rule = u"月"
        pattern = re.compile(rule)
        match = pattern.search(input_query)
        if match is None:
            # Drop the measure word "个" (e.g. "下个周末" -> "下周末") only
            # when no month is present, so "一个月" is left intact.
            input_query = input_query.replace('个', '')

        # Miscellaneous colloquial rewrites; full-width colon is normalised
        # to the ASCII colon expected by the time regex.
        input_query = input_query.replace('中旬', '15号')
        input_query = input_query.replace('傍晚', '午后')
        input_query = input_query.replace('大年', '')
        input_query = input_query.replace('五一', '劳动节')
        input_query = input_query.replace('白天', '早上')
        input_query = input_query.replace(':', ':')
        return input_query

    def init(self):
        """Load the compiled time regex and the holiday lookup tables.

        The compiled pattern is cached as a pickle next to this module; if
        the cache is missing or unreadable it is rebuilt from ``regex.txt``.

        :return: tuple ``(pattern, holi_solar, holi_lunar)``
        """
        fpath = os.path.dirname(__file__) + '/resource/reg.pkl'
        try:
            with open(fpath, 'rb') as f:
                pattern = pickle.load(f)
        except Exception:
            # Cache miss or stale/corrupt pickle: rebuild from the regex
            # source.  (Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            with open(os.path.dirname(__file__) + '/resource/regex.txt', 'r') as f:
                content = f.read()
            p = re.compile(content)
            with open(fpath, 'wb') as f:
                pickle.dump(p, f)
            with open(fpath, 'rb') as f:
                pattern = pickle.load(f)
        with open(os.path.dirname(__file__) + '/resource/holi_solar.json', 'r', encoding='utf-8') as f:
            holi_solar = json.load(f)
        with open(os.path.dirname(__file__) + '/resource/holi_lunar.json', 'r', encoding='utf-8') as f:
            holi_lunar = json.load(f)
        return pattern, holi_solar, holi_lunar

    def parse(self, target, timeBase=None):
        """Analyse *target* and return the recognised time as a JSON string.

        :param target: text to analyse
        :param timeBase: base time the expressions are resolved against;
            defaults to the current time.  (The old signature
            ``timeBase=arrow.now()`` evaluated the default once at import
            time, so long-running processes silently resolved every query
            against a stale "now"; the ``None`` sentinel fixes that while
            staying backward-compatible.)
        :return: JSON string with one of the keys ``timestamp``,
            ``timespan``, ``timedelta`` or ``error``
        """
        if timeBase is None:
            timeBase = arrow.now()
        self.isTimeSpan = False
        self.invalidSpan = False
        self.timeSpan = ''
        self.target = self._filter(target)
        self.timeBase = arrow.get(timeBase).format('YYYY-M-D-H-m-s')
        self.nowTime = timeBase
        self.oldTimeBase = self.timeBase
        self.__preHandling()
        self.timeToken = self.__timeEx()
        dic = {}
        res = self.timeToken

        if self.isTimeSpan:
            if self.invalidSpan:
                dic['error'] = 'no time pattern could be extracted.'
            else:
                result = {}
                dic['type'] = 'timedelta'
                dic['timedelta'] = self.timeSpan
                # self.timeSpan looks like "<D> days, H:M:S"; split it back
                # into calendar fields using 365-day years / 30-day months
                # (an approximation inherent to the original format).
                index = dic['timedelta'].find('days')
                days = int(dic['timedelta'][:index - 1])
                result['year'] = int(days / 365)
                result['month'] = int(days / 30 - result['year'] * 12)
                result['day'] = int(days - result['year'] * 365 - result['month'] * 30)
                index = dic['timedelta'].find(',')
                time = dic['timedelta'][index + 1:]
                time = time.split(':')
                result['hour'] = int(time[0])
                result['minute'] = int(time[1])
                result['second'] = int(time[2])
                dic['timedelta'] = result
        else:
            if len(res) == 0:
                dic['error'] = 'no time pattern could be extracted.'
            elif len(res) == 1:
                dic['type'] = 'timestamp'
                dic['timestamp'] = res[0].time.format("YYYY-MM-DD HH:mm:ss")
            else:
                dic['type'] = 'timespan'
                dic['timespan'] = [res[0].time.format("YYYY-MM-DD HH:mm:ss"),
                                   res[1].time.format("YYYY-MM-DD HH:mm:ss")]
        return json.dumps(dic)

    def __preHandling(self):
        """Clean whitespace and particles and convert Chinese numerals
        in ``self.target`` before matching.
        """
        self.target = StringPreHandler.delKeyword(self.target, u"\\s+")  # strip whitespace
        self.target = StringPreHandler.delKeyword(self.target, u"[的]+")  # strip the particle 的
        self.target = StringPreHandler.numberTranslator(self.target)     # Chinese numerals -> digits

    def __timeEx(self):
        """Match time fragments in ``self.target`` and build TimeUnit objects.

        Adjacent matches (where one ends exactly where the next begins) are
        merged into a single fragment before interpretation.

        :return: list of :class:`TimeUnit` (filtered of degenerate results)
        """
        startline = -1
        endline = -1
        rpointer = 0
        temp = []

        match = self.pattern.finditer(self.target)
        for m in match:
            startline = m.start()
            if startline == endline:
                # This match starts where the previous one ended: fuse them.
                rpointer -= 1
                temp[rpointer] = temp[rpointer] + m.group()
            else:
                temp.append(m.group())
            endline = m.end()
            rpointer += 1
        res = []
        # Time context: each recognised time becomes the context of the next
        # one, so in "周六3点到5点" the second "5点" is still on Saturday.
        contextTp = TimePoint()
        for i in range(0, rpointer):
            res.append(TimeUnit(temp[i], self, contextTp))
            contextTp = res[i].tp
        res = self.__filterTimeUnit(res)

        return res

    def __filterTimeUnit(self, tu_arr):
        """Drop TimeUnits that resolved to the epoch (1970-01-01 00:00:00,
        i.e. ``timestamp == 0``), which marks words that carried no usable
        time information.

        :param tu_arr: list of :class:`TimeUnit`
        :return: filtered list
        """
        if (tu_arr is None) or (len(tu_arr) < 1):
            return tu_arr
        res = []
        for tu in tu_arr:
            if tu.time.timestamp != 0:
                res.append(tu)
        return res
-------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimePoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 15:37 4 | # @Author : zhm 5 | # @File : TimePoint.py 6 | # @Software: PyCharm 7 | 8 | 9 | # * 时间表达式单元规范化对应的内部类, 10 | # * 对应时间表达式规范化的每个字段, 11 | # * 六个字段分别是:年-月-日-时-分-秒, 12 | # * 每个字段初始化为-1 13 | class TimePoint: 14 | def __init__(self): 15 | self.tunit = [-1, -1, -1, -1, -1, -1] 16 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/23 13:22 4 | # @Author : zhm 5 | # @File : __init__.py 6 | # @Software: PyCharm -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc -------------------------------------------------------------------------------- 
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/12/5 17:29 4 | # @Author : zhm 5 | # @File : __init__.py 6 | # @Software: PyCharm -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/holi_lunar.json: -------------------------------------------------------------------------------- 1 | { 2 | "中和节": "02-02", 3 | "中秋节": "08-15", 4 | "中元节": "07-15", 5 | "端午节": "05-05", 6 | "春节": "01-01", 7 | "元宵节": "01-15", 8 | "重阳节": "09-09", 9 | "7夕节": "07-07", 10 | "初1节": "01-01", 11 | "初2节": "01-02", 12 | "初3节": "01-03", 13 | "初4节": "01-04", 14 | "初5节": "01-05", 15 | "初6节": "01-06", 16 | "初7节": 
"01-07", 17 | "初8节": "01-08", 18 | "初9节": "01-09", 19 | "初10节": "01-10", 20 | "初11节": "01-11", 21 | "初12节": "01-12", 22 | "初13节": "01-13", 23 | "初14节": "01-14", 24 | "初15节": "01-15" 25 | } 26 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/holi_solar.json: -------------------------------------------------------------------------------- 1 | { 2 | "植树节": "03-12", 3 | "圣诞节": "12-25", 4 | "青年节": "05-04", 5 | "教师节": "09-10", 6 | "儿童节": "06-01", 7 | "元旦节": "01-01", 8 | "国庆节": "10-01", 9 | "劳动节": "05-01", 10 | "妇女节": "03-08", 11 | "建军节": "08-01", 12 | "航海日节": "07-11", 13 | "建党节": "07-01", 14 | "记者节": "11-08", 15 | "情人节":"02-14", 16 | "母亲节":"05-11" 17 | } 18 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/reg.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/reg.pkl -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/regex.txt: -------------------------------------------------------------------------------- 1 | ((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[::]\d+([::]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((? 
81 | """ 82 | with open(path) as f: 83 | stopwords = f.readlines() 84 | stopwords_list = [] 85 | for word in stopwords: 86 | stopwords_list.append(word.replace('\n', '').replace(' ', '')) 87 | 88 | return stopwords_list 89 | 90 | def tokenize_chinese(self,text): 91 | 92 | sentences = re.split('(。|!|\!|\.|?|\?)', text) # 保留分割符 93 | 94 | new_sents = [] 95 | for i in range(int(len(sentences) / 2)): 96 | sent = sentences[2 * i] + sentences[2 * i + 1] 97 | new_sents.append(sent) 98 | return new_sents 99 | 100 | def extract_keywords_from_text(self, text, min_len, max_len): 101 | """Method to extract keywords from the text provided. 102 | 103 | :param text: Text to extract keywords from, provided as a string. 104 | """ 105 | sentences = self.tokenize_chinese(text) 106 | self.extract_keywords_from_sentences(sentences, min_len, max_len) 107 | 108 | def extract_keywords_from_sentences(self, sentences, min_len, max_len): 109 | """Method to extract keywords from the list of sentences provided. 110 | 111 | :param sentences: Text to extraxt keywords from, provided as a list 112 | of strings, where each string is a sentence. 113 | """ 114 | phrase_list = self._generate_phrases(sentences, min_len, max_len) 115 | self._build_frequency_dist(phrase_list) 116 | self._build_word_co_occurance_graph(phrase_list) 117 | self._build_ranklist(phrase_list) 118 | 119 | def get_ranked_phrases(self): 120 | """Method to fetch ranked keyword strings. 121 | 122 | :return: List of strings where each string represents an extracted 123 | keyword string. 124 | """ 125 | return self.ranked_phrases 126 | 127 | def get_ranked_phrases_with_scores(self): 128 | """Method to fetch ranked keyword strings along with their scores. 129 | 130 | :return: List of tuples where each tuple is formed of an extracted 131 | keyword string and its score. 
Ex: (5.68, 'Four Scoures') 132 | """ 133 | return self.rank_list 134 | 135 | def get_word_frequency_distribution(self): 136 | """Method to fetch the word frequency distribution in the given text. 137 | 138 | :return: Dictionary (defaultdict) of the format `word -> frequency`. 139 | """ 140 | return self.frequency_dist 141 | 142 | def get_word_degrees(self): 143 | """Method to fetch the degree of words in the given text. Degree can be 144 | defined as sum of co-occurances of the word with other words in the 145 | given text. 146 | 147 | :return: Dictionary (defaultdict) of the format `word -> degree`. 148 | """ 149 | return self.degree 150 | 151 | def _build_frequency_dist(self, phrase_list): 152 | """Builds frequency distribution of the words in the given body of text. 153 | 154 | :param phrase_list: List of List of strings where each sublist is a 155 | collection of words which form a contender phrase. 156 | """ 157 | self.frequency_dist = Counter(chain.from_iterable(phrase_list)) 158 | 159 | def _build_word_co_occurance_graph(self, phrase_list): 160 | """Builds the co-occurance graph of words in the given body of text to 161 | compute degree of each word. 162 | 163 | :param phrase_list: List of List of strings where each sublist is a 164 | collection of words which form a contender phrase. 165 | """ 166 | co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0)) 167 | for phrase in phrase_list: 168 | # For each phrase in the phrase list, count co-occurances of the 169 | # word with other words in the phrase. 170 | # 171 | # Note: Keep the co-occurances graph as is, to help facilitate its 172 | # use in other creative ways if required later. 
173 | for (word, coword) in product(phrase, phrase): 174 | co_occurance_graph[word][coword] += 1 175 | self.degree = defaultdict(lambda: 0) 176 | for key in co_occurance_graph: 177 | self.degree[key] = sum(co_occurance_graph[key].values()) 178 | 179 | def _build_ranklist(self, phrase_list): 180 | """Method to rank each contender phrase using the formula 181 | 182 | phrase_score = sum of scores of words in the phrase. 183 | word_score = d(w)/f(w) where d is degree and f is frequency. 184 | 185 | :param phrase_list: List of List of strings where each sublist is a 186 | collection of words which form a contender phrase. 187 | """ 188 | self.rank_list = [] 189 | for phrase in phrase_list: 190 | rank = 0.0 191 | for word in phrase: 192 | if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO: 193 | rank += 1.0 * self.degree[word] / self.frequency_dist[word] 194 | elif self.metric == Metric.WORD_DEGREE: 195 | rank += 1.0 * self.degree[word] 196 | else: 197 | rank += 1.0 * self.frequency_dist[word] 198 | self.rank_list.append((rank, " ".join(phrase))) 199 | self.rank_list.sort(reverse=True) 200 | self.ranked_phrases = [ph[1] for ph in self.rank_list] 201 | 202 | def _generate_phrases(self, sentences, min_len, max_len): 203 | """Method to generate contender phrases given the sentences of the text 204 | document. 205 | 206 | :param sentences: List of strings where each string represents a 207 | sentence which forms the text. 208 | :return: Set of string tuples where each tuple is a collection 209 | of words forming a contender phrase. 210 | """ 211 | phrase_list = set() 212 | # Create contender phrases from sentences. 
213 | for sentence in sentences: 214 | word_list = [word for word in list(jieba.cut(sentence))] 215 | phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len)) 216 | return phrase_list 217 | 218 | def _get_phrase_list_from_words(self, word_list, min_len, max_len): 219 | """Method to create contender phrases from the list of words that form 220 | a sentence by dropping stopwords and punctuations and grouping the left 221 | words into phrases. Only phrases in the given length range (both limits 222 | inclusive) would be considered to build co-occurrence matrix. Ex: 223 | 224 | Sentence: Red apples, are good in flavour. 225 | List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour'] 226 | List after dropping punctuations and stopwords. 227 | List of words: ['red', 'apples', *, *, good, *, 'flavour'] 228 | List of phrases: [('red', 'apples'), ('good',), ('flavour',)] 229 | 230 | List of phrases with a correct length: 231 | For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)] 232 | For the range [1, 1]: [('good',), ('flavour',)] 233 | For the range [2, 2]: [('red', 'apples')] 234 | 235 | :param word_list: List of words which form a sentence when joined in 236 | the same order. 237 | :return: List of contender phrases that are formed after dropping 238 | stopwords and punctuations. 
239 | """ 240 | groups = groupby(word_list, lambda x: x not in self.to_ignore) 241 | phrases = [] 242 | for group in groups: 243 | tmp = tuple(group[1]) 244 | len_g1 = len(list(tmp)) 245 | if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase 246 | phrases.append(tuple(tmp)) 247 | 248 | return list( 249 | filter( 250 | lambda x: self.min_length <= len(x) <= self.max_length, phrases 251 | ) 252 | ) 253 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/cocoNLP/extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from phone import Phone 4 | from itertools import groupby 5 | import phonenumbers 6 | from pyhanlp import * 7 | from cocoNLP.config.basic.time_nlp.TimeNormalizer import * 8 | 9 | 10 | 11 | __all__ = ['extract_email', 'replace_chinese','extract_cellphone', 'extract_cellphone', 'extract_cellphone_location', 12 | 'get_location', 'extract_locations', 'replace_cellphoneNum', 'extract_time', 'extract_name', 'most_common'] 13 | 14 | class extractor(): 15 | def __init__(self): 16 | pass 17 | 18 | def extract_email(self, text): 19 | """ 20 | extract all email addresses from texts 21 | eg: extract_email('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈') 22 | 23 | 24 | :param: raw_text 25 | :return: email_addresses_list 26 | """ 27 | eng_texts = self.replace_chinese(text) 28 | eng_texts = eng_texts.replace(' at ','@').replace(' dot ','.') 29 | sep = ',!?:; ,。!?《》、|\\/' 30 | eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k] 31 | 32 | email_pattern = r'^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$' 33 | 34 | emails = [] 35 | for eng_text in eng_split_texts: 36 | result = re.match(email_pattern, eng_text, flags=0) 37 | if result: 38 | emails.append(result.string) 39 | return emails 40 | 41 | def replace_chinese(self, text): 42 | """ 43 | 
def replace_chinese(self, text):
    """
    Remove all Chinese characters (CJK Unified Ideographs, U+4E00..U+9FA5)
    from *text*, replacing each one with a single space.

    NOTE(review): the ``def`` line of this method sits above the visible
    chunk; the signature is reconstructed from the call site
    ``self.replace_chinese(text)`` in extract_cellphone -- confirm against
    the full file.

    eg: replace_chinese('我的email是ifee@baidu.com哈哈')
        -> '  email ifee@baidu.com  '

    :param text: raw input text
    :return: text with every Chinese character replaced by ' '
    """
    filtrate = re.compile(u'[\u4E00-\u9FA5]')
    text_without_chinese = filtrate.sub(r' ', text)
    return text_without_chinese

def extract_cellphone(self, text, nation):
    """
    Extract all cell phone numbers from *text*.

    eg: extract_cellphone('联系:18100065143,132-6156-2938', 'CHN')
        -> ['18100065143', '13261562938']

    Fixes over the previous version:
    * the CHN pattern contained an empty first alternative
      ``(|13[0-9]|…)`` which ``re`` tried first, so ANY 11-digit string
      matched and the carrier-prefix list was dead code; the pattern now
      requires a 1[3-9]x prefix (this also covers the previously missing
      16x numbers) followed by 4+4 digits, optionally dash-separated.
    * ``result.string`` (the whole candidate) is replaced by
      ``result.group()`` (only the matched span), so trailing junk in a
      candidate no longer leaks into the returned number.
    * the two independent ``if`` tests on *nation* are now an if/else.

    :param text: raw input text (may contain Chinese characters)
    :param nation: 'CHN' for mainland-China numbers; anything else uses
                   the NANP-style pattern
    :return: list of normalized numbers ('+86' and '-' stripped)
    """
    eng_texts = self.replace_chinese(text)
    # every character that may separate two candidate numbers
    sep = ',!?:; :,。!?《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]

    if nation == 'CHN':
        # optional +86, mandatory 1[3-9]x prefix, then 4+4 digits (3-4-4 grouping)
        phone_pattern = r'^(\+86)?([- ])?1[3-9][0-9]([- ])?\d{4}([- ])?\d{4}'
    else:
        # NANP style: (XXX) XXX-XXXX with optional '-', '.' or ' ' separators
        phone_pattern = r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b'

    phones = []
    for eng_text in eng_split_texts:
        result = re.match(phone_pattern, eng_text)
        if result:
            phones.append(result.group().replace('+86', '').replace('-', ''))
    return phones

def extract_cellphone_location(self, phoneNum, nation='CHN'):
    """
    Look up carrier/region information for *phoneNum*.

    eg: extract_cellphone_location('18100065143', nation='CHN')
        -> {'phone': '18100065143', 'province': '上海', 'city': '上海',
            'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}

    Fixes: the two independent ``if`` tests on *nation* are now if/else,
    and ``loc_dict`` is pre-initialized so a non-CHN number rejected by
    ``phonenumbers`` returns None instead of raising NameError.

    :param phoneNum: phone number string
    :param nation: 'CHN' uses the ``phone`` package; anything else is
                   parsed by ``phonenumbers`` with region 'GB'
    :return: dict for CHN numbers, a phonenumbers object otherwise, or
             None when the number cannot be resolved
    """
    loc_dict = None
    if nation == 'CHN':
        p = Phone()
        loc_dict = p.find(phoneNum)
    else:
        x = phonenumbers.parse(phoneNum, 'GB')
        if phonenumbers.is_possible_number(x):
            loc_dict = x
    return loc_dict

def get_location(self, word_pos_list):
    """
    Collect location phrases from a (word, pos_tag) sequence.

    Every word tagged 'ns' (place name) starts a phrase; the following
    words are appended while they are tagged 'ns' or any noun tag
    starting with 'n', so nested place names yield nested results.

    eg: [('陕西省','ns'), ('安康市','ns'), ('汉滨区','ns')]
        -> ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']

    :param word_pos_list: list of (word, pos_tag) tuples
    :return: list of location strings (possibly empty)
    """
    location_list = []
    for i, (word, nature) in enumerate(word_pos_list):
        if nature != 'ns':
            continue
        loc_tmp = word
        count = i + 1
        while count < len(word_pos_list):
            next_word, next_pos = word_pos_list[count]
            # extend the phrase through consecutive place/noun tags
            if next_pos == 'ns' or next_pos[0] == 'n':
                loc_tmp += next_word
            else:
                break
            count += 1
        location_list.append(loc_tmp)
    return location_list

def extract_locations(self, text):
    """
    Extract location phrases from raw text via HanLP segmentation.

    eg: extract_locations('我家住在陕西省安康市汉滨区。')
        -> ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']

    :param text: raw input text
    :return: list of location strings
    """
    seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
    return self.get_location(seg_list)

def replace_cellphoneNum(self, text):
    """
    Remove cell phone numbers from *text* (their digits confuse
    extract_time, so they are stripped first).

    Fix: extract_cellphone returns numbers with '-' already stripped, so
    the old ``text.replace(phone, '')`` could never remove a number
    written as 181-0006-5143; the common 3-4-4 dashed form is now
    removed as well, which makes the documented example actually work.

    eg: replace_cellphoneNum('我的手机号是181-0006-5143。')
        -> '我的手机号是。'

    :param text: raw input text
    :return: text with detected CHN cellphone numbers removed
    """
    for phone_num in self.extract_cellphone(text, 'CHN'):
        dashed = '{}-{}-{}'.format(phone_num[:3], phone_num[3:7], phone_num[7:])
        text = text.replace(phone_num, '').replace(dashed, '')
    return text

def extract_time(self, text):
    """
    Extract timestamp information from *text* using TimeNormalizer.

    eg: extract_time('我于2018年1月1日获得1000万美金奖励。')
        -> '{"type": "timestamp", "timestamp": "2018-01-01 ..."}'

    :param text: raw input text
    :return: result of TimeNormalizer.parse (JSON string)
    """
    # strip phone numbers first: their digit runs are misread as dates
    tmp_text = self.replace_cellphoneNum(text)
    tn = TimeNormalizer()
    # target is the sentence to analyse; timeBase defaults to "now"
    return tn.parse(target=tmp_text)

def extract_name(self, text):
    """
    Return the most frequent person name in *text* (words whose HanLP
    tag contains 'nr').

    eg: extract_name('急寻王龙,...,王龙,男,...') -> '王龙'

    :param text: raw input text
    :return: the most common name, or None when no name is found
    """
    names = [str(t.word) for t in HanLP.segment(text) if 'nr' in str(t.nature)]
    return self.most_common(names)

def most_common(self, content_list):
    """
    Return the element with the highest count in *content_list*.

    Fix: ``max`` over an empty sequence raised ValueError (reachable via
    extract_name on text containing no names); an empty input now
    returns None.

    eg: most_common(['王龙', '王龙', '李二狗']) -> '王龙'

    :param content_list: list of hashable elements
    :return: most frequent element, or None for an empty list
    """
    if not content_list:
        return None
    return max(set(content_list), key=content_list.count)
'急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发,...如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com' 211 | ex = extractor() 212 | 213 | emails = ex.extract_email(text) 214 | cellphones = ex.extract_cellphone(text,nation='CHN') 215 | cell_loc = [] 216 | for cell in cellphones: 217 | cell_loc.append(ex.extract_cellphone_location(cell,'CHN')) 218 | 219 | locations = ex.extract_locations(text) 220 | times = ex.extract_time(text) 221 | names = ex.extract_name(text) 222 | 223 | result_dict = {} 224 | result_dict['email'] = emails 225 | result_dict['cellphone'] = cellphones 226 | result_dict['cellphone_location'] = cell_loc 227 | result_dict['location'] = locations 228 | result_dict['time'] = times 229 | result_dict['name'] = names 230 | for key in result_dict.keys(): 231 | print(key,result_dict[key]) -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/readme.md: -------------------------------------------------------------------------------- 1 | ## This is a Chinese nlp package, which can extract information from texts. 2 | 3 | [![pypiv](https://img.shields.io/pypi/v/rake-nltk.svg)](https://pypi.org/project/cocoNLP/) 4 | [![Thanks](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://www.zhihu.com/people/mountain-blue-64/posts) 5 | 6 | ## It is developed for a public welfare program, a weibo robot [@寻人微博](https://weibo.com/xrwbyangyangfuture). 7 | 8 | ## installation 9 | It works well on macOS Mojave with python=3.6. 
10 | ``` 11 | pip install cocoNLP 12 | ``` 13 | 14 | ## Directly from the repository 15 | 16 | ``` 17 | git clone https://github.com/fighting41love/cocoNLP.git 18 | cd cocoNLP 19 | python setup.py install 20 | ``` 21 | 22 | ## Quick start 23 | 24 | ### Extract basic information from texts 25 | ``` 26 | >>> from cocoNLP.extractor import extractor 27 | 28 | >>> ex = extractor() 29 | 30 | >>> text = '急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发,...如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com' 31 | 32 | # 抽取邮箱 33 | >>> emails = ex.extract_email(text) 34 | >>> print(emails) 35 | 36 | ['baizhantang@sina.com.cn', 'yangyangfuture@gmail.com.cn'] 37 | ``` 38 | 39 | ``` 40 | # 抽取手机号 41 | >>> cellphones = ex.extract_cellphone(text,nation='CHN') 42 | >>> print(cellphones) 43 | 44 | ['18100065143', '13261562938'] 45 | ``` 46 | 47 | ``` 48 | # 抽取手机归属地、运营商 49 | >>> cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones] 50 | >>> print(cell_locs) 51 | 52 | cellphone_location [{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}] 53 | ``` 54 | 55 | ``` 56 | # 抽取地址信息 57 | >>> locations = ex.extract_locations(text) 58 | >>> print(locations) 59 | ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区'] 60 | ``` 61 | ``` 62 | # 抽取时间点 63 | >>> times = ex.extract_time(text) 64 | >>> print(times) 65 | time {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"} 66 | ``` 67 | ``` 68 | # 抽取人名 69 | >>> name = ex.extract_name(text) 70 | >>> print(name) 71 | 特朗普 72 | 73 | ``` 74 | ### Extract phrases from texts 75 | ``` 76 | >>> from cocoNLP.config.phrase import rake 77 | 78 | >>> r = rake.Rake() 79 | 80 | >>> # Extraction given the list of strings where each string is a sentence. 
81 | >>> r.extract_keywords_from_sentences(['2015年5月11日,“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、华某(25岁)名誉权纠纷及成某(38岁)名誉权纠纷二案,要求被诉人公开赔礼道歉、恢复名誉、删除相关视频、断开转载该视频的链接,赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀法院已经受理了这两起案件。原告章泽天诉称,她被许多网友称为“奶茶妹妹”,在网络上获得相当的关注度。2014年4月18日,北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并发布了名为“奶茶妹妹恋情或为炒作,百万炒作团队浮出水面”的视频,该段视频捏造包括“奶茶妹妹走红,实为幕后商业策划”、“100万,奶茶妹妹花巨资,请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论,包括声称其就是原告聘请的“幕后推手和炒作专家”,原告曾花100万聘请其为之宣传策划,原告与刘强东的恋情系两者合作的结果等等。 82 | '],2,4) 83 | 84 | >>> # To get keyword phrases ranked highest to lowest. 85 | >>> ranked_words = r.get_ranked_phrases() 86 | 87 | >>> # To get keyword phrases ranked highest to lowest with scores. 88 | >>> ranked_words_score = r.get_ranked_phrases_with_scores() 89 | 90 | >>> for ele in ranked_words_score: 91 | >>> print(ele) 92 | 93 | (16.0, '要求 被诉人 公开 赔礼道歉') 94 | (15.0, '上述 节目 中 捏造') 95 | (14.5, '该段 视频 捏造 包括') 96 | (14.0, '实为 幕后 商业 策划') 97 | (14.0, '奶茶 妹妹 花 巨资') 98 | (9.5, '删除 相关 视频') 99 | (9.0, '请人 策划 走红') 100 | (9.0, '网络 上 获得') 101 | (9.0, '想方设法 地转 学院') 102 | (9.0, '奶茶 妹妹 走红') 103 | (9.0, '名誉权 纠纷 及成') 104 | (9.0, '名誉权 纠纷 二案') 105 | (8.5, '奶茶 妹妹 恋情') 106 | (8.5, '原告 章泽天 诉称') 107 | (6.0, '奶茶 妹妹') 108 | (5.0, '节目 制作') 109 | (5.0, '幕后 推手') 110 | (5.0, '宣传 策划') 111 | ``` 112 | 113 | 114 | ## References 115 | 116 | This is a python implementation of the algorithm as mentioned in paper [Automatic keyword extraction from individual documents by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley](https://www.researchgate.net/profile/Stuart_Rose/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents/links/55071c570cf27e990e04c8bb.pdf) 117 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/requirements.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | pyhanlp 3 | phone 4 | phonenumbers 5 | regex 6 | arrow 7 | -------------------------------------------------------------------------------- 
/dist/cocoNLP-0.0.9/setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | 5 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from setuptools import setup 4 | from setuptools.command.develop import develop 5 | from setuptools.command.install import install 6 | from subprocess import call 7 | 8 | 9 | here = path.abspath(path.dirname(__file__)) 10 | 11 | 12 | class PostDevelop(develop): 13 | """Post-installation for development mode.""" 14 | 15 | def run(self): 16 | develop.run(self) 17 | 18 | 19 | class PostInstall(install): 20 | """Post-installation for production mode.""" 21 | 22 | def run(self): 23 | install.run(self) 24 | 25 | 26 | class MyInstall(install): 27 | def run(self): 28 | call(["pip install -r requirements.txt --no-clean"], shell=True) 29 | install.run(self) 30 | 31 | # Get package and author details. 32 | about = {} 33 | with open(path.join(here, "cocoNLP", "__version__.py")) as f: 34 | exec(f.read(), about) 35 | 36 | setup( 37 | # Name of the module 38 | name="cocoNLP", 39 | # Details 40 | version=about["__version__"], 41 | description=about["__description__"], 42 | #long_description=long_description, 43 | # The project's main homepage. 44 | url=about["__url__"], 45 | # Author details 46 | author=about["__author__"], 47 | author_email=about["__author_email__"], 48 | # License 49 | license=about["__license__"], 50 | packages=["cocoNLP"], 51 | test_suite="tests", 52 | keywords="nlp text-mining information extraction", 53 | include_package_data=True, 54 | classifiers=[ 55 | # Intended Audience. 56 | "Intended Audience :: Developers", 57 | "Intended Audience :: Education", 58 | # License. 59 | "License :: OSI Approved :: MIT License", 60 | # Project maturity. 
61 | "Development Status :: 3 - Alpha", 62 | # Operating Systems. 63 | "Operating System :: POSIX", 64 | # Supported Languages. 65 | "Programming Language :: Python :: 2.7", 66 | "Programming Language :: Python :: 3.4", 67 | "Programming Language :: Python :: 3.5", 68 | "Programming Language :: Python :: 3.6", 69 | # Topic tags. 70 | "Topic :: Software Development :: Build Tools", 71 | "Topic :: Software Development :: Libraries :: Python Modules", 72 | ], 73 | setup_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"], 74 | install_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"], 75 | cmdclass={'install': MyInstall}, 76 | ) 77 | -------------------------------------------------------------------------------- /dist/cocoNLP-0.0.9/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from cocoNLP.extractor import extractor 3 | 4 | ex = extractor() 5 | 6 | text = '急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发,...如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com' 7 | 8 | # 抽取邮箱 9 | emails = ex.extract_email(text) 10 | print(emails) 11 | 12 | # 抽取手机号 13 | cellphones = ex.extract_cellphone(text,nation='CHN') 14 | print(cellphones) 15 | 16 | # 抽取手机归属地、运营商 17 | cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones] 18 | print(cell_locs) 19 | 20 | # 抽取地址信息 21 | locations = ex.extract_locations(text) 22 | print(locations) 23 | 24 | # 抽取时间点 25 | times = ex.extract_time(text) 26 | print(times) 27 | 28 | # 抽取人名 29 | name = ex.extract_name(text) 30 | print(name) 31 | 32 | 33 | from cocoNLP.config.phrase import rake 34 | 35 | r = rake.Rake() 36 | 37 | # Extraction given the list of strings where each string is a sentence. 
38 | r.extract_keywords_from_sentences(['2015年5月11日,“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、' 39 | '华某(25岁)名誉权纠纷及成某(38岁)名誉权纠纷二案,要求被诉人公开赔礼道歉、恢复名誉、' 40 | '删除相关视频、断开转载该视频的链接,赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀' 41 | '法院已经受理了这两起案件。原告章泽天诉称,她被许多网友称为“奶茶妹妹”,在网络上获得相当的' 42 | '关注度。2014年4月18日,北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并' 43 | '发布了名为“奶茶妹妹恋情或为炒作,百万炒作团队浮出水面”的视频,该段视频捏造包括“奶茶妹妹走红' 44 | ',实为幕后商业策划”、“100万,奶茶妹妹花巨资,请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、' 45 | '想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论,包括声称其就是原告聘请的' 46 | '“幕后推手和炒作专家”,原告曾花100万聘请其为之宣传策划,原告与刘强东的恋情系两者合作的结果等等。'],2,4) 47 | 48 | # To get keyword phrases ranked highest to lowest. 49 | ranked_words = r.get_ranked_phrases() 50 | 51 | # To get keyword phrases ranked highest to lowest with scores. 52 | ranked_words_score = r.get_ranked_phrases_with_scores() 53 | 54 | for ele in ranked_words_score: 55 | print(ele) 56 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## This is a Chinese nlp package, which can extract information from texts. 2 | 3 | [![pypiv](https://img.shields.io/pypi/v/rake-nltk.svg)](https://pypi.org/project/cocoNLP/) 4 | [![Thanks](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://www.zhihu.com/people/mountain-blue-64/posts) 5 | 6 | ## It is developed for a public welfare program, a weibo robot [@寻人微博](https://weibo.com/xrwbyangyangfuture). 7 | 8 | ## installation 9 | It works well on macOS Mojave with python=3.6. 
10 | ``` 11 | pip install cocoNLP 12 | ``` 13 | 14 | ## Directly from the repository 15 | 16 | ``` 17 | git clone https://github.com/fighting41love/cocoNLP.git 18 | cd cocoNLP 19 | python setup.py install 20 | ``` 21 | 22 | ## Quick start 23 | 24 | ### Extract basic information from texts 25 | ``` 26 | >>> from cocoNLP.extractor import extractor 27 | 28 | >>> ex = extractor() 29 | 30 | >>> text = '急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发,...如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com' 31 | 32 | # 抽取邮箱 33 | >>> emails = ex.extract_email(text) 34 | >>> print(emails) 35 | 36 | ['baizhantang@sina.com.cn', 'yangyangfuture@gmail.com.cn'] 37 | ``` 38 | 39 | ``` 40 | # 抽取手机号 41 | >>> cellphones = ex.extract_cellphone(text,nation='CHN') 42 | >>> print(cellphones) 43 | 44 | ['18100065143', '13261562938'] 45 | ``` 46 | 47 | ``` 48 | # 抽取身份证号 49 | >>> ids = ex.extract_ids(text) 50 | >>> print(ids) 51 | 52 | ['410105196904010537'] 53 | ``` 54 | 55 | ``` 56 | # 抽取手机归属地、运营商 57 | >>> cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones] 58 | >>> print(cell_locs) 59 | 60 | cellphone_location [{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}] 61 | ``` 62 | 63 | ``` 64 | # 抽取地址信息 65 | >>> locations = ex.extract_locations(text) 66 | >>> print(locations) 67 | ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区'] 68 | ``` 69 | ``` 70 | # 抽取时间点 71 | >>> times = ex.extract_time(text) 72 | >>> print(times) 73 | time {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"} 74 | ``` 75 | ``` 76 | # 抽取人名 77 | >>> name = ex.extract_name(text) 78 | >>> print(name) 79 | 特朗普 80 | 81 | ``` 82 | ### Extract phrases from texts 83 | ``` 84 | >>> from cocoNLP.config.phrase import rake 85 | 86 | >>> r = rake.Rake() 87 | 88 | >>> # Extraction given the list of strings where each string is a sentence. 
89 | >>> r.extract_keywords_from_sentences(['2015年5月11日,“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、华某(25岁)名誉权纠纷及成某(38岁)名誉权纠纷二案,要求被诉人公开赔礼道歉、恢复名誉、删除相关视频、断开转载该视频的链接,赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀法院已经受理了这两起案件。原告章泽天诉称,她被许多网友称为“奶茶妹妹”,在网络上获得相当的关注度。2014年4月18日,北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并发布了名为“奶茶妹妹恋情或为炒作,百万炒作团队浮出水面”的视频,该段视频捏造包括“奶茶妹妹走红,实为幕后商业策划”、“100万,奶茶妹妹花巨资,请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论,包括声称其就是原告聘请的“幕后推手和炒作专家”,原告曾花100万聘请其为之宣传策划,原告与刘强东的恋情系两者合作的结果等等。 90 | '],2,4) 91 | 92 | >>> # To get keyword phrases ranked highest to lowest. 93 | >>> ranked_words = r.get_ranked_phrases() 94 | 95 | >>> # To get keyword phrases ranked highest to lowest with scores. 96 | >>> ranked_words_score = r.get_ranked_phrases_with_scores() 97 | 98 | >>> for ele in ranked_words_score: 99 | >>> print(ele) 100 | 101 | (16.0, '要求 被诉人 公开 赔礼道歉') 102 | (15.0, '上述 节目 中 捏造') 103 | (14.5, '该段 视频 捏造 包括') 104 | (14.0, '实为 幕后 商业 策划') 105 | (14.0, '奶茶 妹妹 花 巨资') 106 | (9.5, '删除 相关 视频') 107 | (9.0, '请人 策划 走红') 108 | (9.0, '网络 上 获得') 109 | (9.0, '想方设法 地转 学院') 110 | (9.0, '奶茶 妹妹 走红') 111 | (9.0, '名誉权 纠纷 及成') 112 | (9.0, '名誉权 纠纷 二案') 113 | (8.5, '奶茶 妹妹 恋情') 114 | (8.5, '原告 章泽天 诉称') 115 | (6.0, '奶茶 妹妹') 116 | (5.0, '节目 制作') 117 | (5.0, '幕后 推手') 118 | (5.0, '宣传 策划') 119 | ``` 120 | 121 | 122 | ## References 123 | 124 | This is a python implementation of the algorithm as mentioned in paper [Automatic keyword extraction from individual documents by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley](https://www.researchgate.net/profile/Stuart_Rose/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents/links/55071c570cf27e990e04c8bb.pdf) 125 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | pyhanlp 3 | phone 4 | phonenumbers 5 | regex 6 | arrow==0.14.3 7 | -------------------------------------------------------------------------------- 
/setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from setuptools import setup 4 | from setuptools.command.develop import develop 5 | from setuptools.command.install import install 6 | from subprocess import call 7 | 8 | 9 | here = path.abspath(path.dirname(__file__)) 10 | 11 | 12 | class PostDevelop(develop): 13 | """Post-installation for development mode.""" 14 | 15 | def run(self): 16 | develop.run(self) 17 | 18 | 19 | class PostInstall(install): 20 | """Post-installation for production mode.""" 21 | 22 | def run(self): 23 | install.run(self) 24 | 25 | 26 | class MyInstall(install): 27 | def run(self): 28 | call(["pip install -r requirements.txt --no-clean"], shell=True) 29 | install.run(self) 30 | 31 | # Get package and author details. 32 | about = {} 33 | with open(path.join(here, "cocoNLP", "__version__.py")) as f: 34 | exec(f.read(), about) 35 | 36 | setup( 37 | # Name of the module 38 | name="cocoNLP", 39 | # Details 40 | version=about["__version__"], 41 | description=about["__description__"], 42 | #long_description=long_description, 43 | # The project's main homepage. 44 | url=about["__url__"], 45 | # Author details 46 | author=about["__author__"], 47 | author_email=about["__author_email__"], 48 | # License 49 | license=about["__license__"], 50 | packages=["cocoNLP"], 51 | test_suite="tests", 52 | keywords="nlp text-mining information extraction", 53 | include_package_data=True, 54 | classifiers=[ 55 | # Intended Audience. 56 | "Intended Audience :: Developers", 57 | "Intended Audience :: Education", 58 | # License. 59 | "License :: OSI Approved :: MIT License", 60 | # Project maturity. 61 | "Development Status :: 3 - Alpha", 62 | # Operating Systems. 63 | "Operating System :: POSIX", 64 | # Supported Languages. 
65 | "Programming Language :: Python :: 2.7", 66 | "Programming Language :: Python :: 3.4", 67 | "Programming Language :: Python :: 3.5", 68 | "Programming Language :: Python :: 3.6", 69 | # Topic tags. 70 | "Topic :: Software Development :: Build Tools", 71 | "Topic :: Software Development :: Libraries :: Python Modules", 72 | ], 73 | setup_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"], 74 | install_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"], 75 | cmdclass={'install': MyInstall}, 76 | ) 77 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from cocoNLP.extractor import extractor 3 | 4 | ex = extractor() 5 | 6 | text = '急寻特朗普,男孩,于2018年11月27号11时在陕西省安康市汉滨区走失。身份证号码410105196904010537丢失发型短发,...' \ 7 | '如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com13673630861' 8 | text = '急寻特朗普,男孩,于2018年11月27号11时在鼓楼区走失。身份证号码410105196904010537丢失发型短发,...' 
\ 9 | '如有线索,请迅速与警方联系:18100065143,132-6156-2938,baizhantang@sina.com.cn 和yangyangfuture at gmail dot com13673630861' 10 | # text = '4点15分钟后的番茄炒蛋' 11 | # text = '我下午2点15分30秒的番茄炒蛋' 12 | # text = '晚上8点15的番茄炒蛋' 13 | 14 | 15 | # 抽取邮箱 16 | emails = ex.extract_email(text) 17 | print(emails) 18 | 19 | # 抽取手机号 20 | cellphones = ex.extract_cellphone(text,nation='CHN') 21 | print(cellphones) 22 | 23 | # 抽取身份证号 24 | ids = ex.extract_ids(text) 25 | print(ids) 26 | 27 | # 抽取手机归属地、运营商 28 | cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones] 29 | print(cell_locs) 30 | 31 | # 抽取地址信息 32 | locations = ex.extract_locations(text) 33 | print(locations) 34 | 35 | # 抽取时间点 36 | times = ex.extract_time(text) 37 | print(times) 38 | 39 | # 抽取人名 40 | name = ex.extract_name(text) 41 | print(name) 42 | 43 | 44 | from cocoNLP.config.phrase import rake 45 | 46 | r = rake.Rake() 47 | 48 | # Extraction given the list of strings where each string is a sentence. 49 | r.extract_keywords_from_text('2015年5月11日,“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、' 50 | '华某(25岁)名誉权纠纷及成某(38岁)名誉权纠纷二案,要求被诉人公开赔礼道歉、恢复名誉、' 51 | '删除相关视频、断开转载该视频的链接,赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀' 52 | '法院已经受理了这两起案件。原告章泽天诉称,她被许多网友称为“奶茶妹妹”,在网络上获得相当的' 53 | '关注度。2014年4月18日,北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并' 54 | '发布了名为“奶茶妹妹恋情或为炒作,百万炒作团队浮出水面”的视频,该段视频捏造包括“奶茶妹妹走红' 55 | ',实为幕后商业策划”、“100万,奶茶妹妹花巨资,请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、' 56 | '想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论,包括声称其就是原告聘请的' 57 | '“幕后推手和炒作专家”,原告曾花100万聘请其为之宣传策划,原告与刘强东的恋情系两者合作的结果等等。',2,4) 58 | 59 | # r.extract_keywords_from_sentences(['如果您认识的人你要通知他一下就行了好吧对吧因为我们这边都,如果您认识的人你要通知他一下就行了好吧对波因为我们这边都'],2,4) 60 | 61 | # To get keyword phrases ranked highest to lowest. 62 | ranked_words = r.get_ranked_phrases() 63 | 64 | # To get keyword phrases ranked highest to lowest with scores. 
65 | ranked_words_score = r.get_ranked_phrases_with_scores() 66 | print(ranked_words_score) 67 | for ele in ranked_words_score: 68 | print(ele) 69 | --------------------------------------------------------------------------------