├── MANIFEST.in
├── build
    └── lib
    │   └── cocoNLP
    │       ├── __init__.py
    │       ├── __version__.py
    │       ├── data
    │           └── stopwords.txt
    │       └── rake.py
├── cocoNLP.egg-info
    ├── PKG-INFO
    ├── SOURCES.txt
    ├── dependency_links.txt
    ├── requires.txt
    └── top_level.txt
├── cocoNLP
    ├── __init__.py
    ├── __init__.pyc
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   ├── __init__.cpython-36.pyc
    │   ├── __init__.cpython-37.pyc
    │   ├── extractor.cpython-35.pyc
    │   ├── extractor.cpython-36.pyc
    │   └── extractor.cpython-37.pyc
    ├── __version__.py
    ├── config
    │   ├── .DS_Store
    │   ├── basic
    │   │   └── time_nlp
    │   │   │   ├── .DS_Store
    │   │   │   ├── EGG-INFO
    │   │   │       ├── PKG-INFO
    │   │   │       ├── SOURCES.txt
    │   │   │       ├── dependency_links.txt
    │   │   │       ├── not-zip-safe
    │   │   │       ├── requires.txt
    │   │   │       └── top_level.txt
    │   │   │   ├── LunarSolarConverter.py
    │   │   │   ├── README.md
    │   │   │   ├── RangeTimeEnum.py
    │   │   │   ├── StringPreHandler.py
    │   │   │   ├── Test.py
    │   │   │   ├── TimeConverter.egg-info
    │   │   │       ├── PKG-INFO
    │   │   │       ├── SOURCES.txt
    │   │   │       ├── dependency_links.txt
    │   │   │       ├── not-zip-safe
    │   │   │       ├── requires.txt
    │   │   │       └── top_level.txt
    │   │   │   ├── TimeNormalizer.py
    │   │   │   ├── TimePoint.py
    │   │   │   ├── TimeUnit.py
    │   │   │   ├── __init__.py
    │   │   │   ├── __pycache__
    │   │   │       ├── LunarSolarConverter.cpython-35.pyc
    │   │   │       ├── LunarSolarConverter.cpython-36.pyc
    │   │   │       ├── RangeTimeEnum.cpython-35.pyc
    │   │   │       ├── RangeTimeEnum.cpython-36.pyc
    │   │   │       ├── StringPreHandler.cpython-35.pyc
    │   │   │       ├── StringPreHandler.cpython-36.pyc
    │   │   │       ├── Test.cpython-36.pyc
    │   │   │       ├── TimeNormalizer.cpython-35.pyc
    │   │   │       ├── TimeNormalizer.cpython-36.pyc
    │   │   │       ├── TimePoint.cpython-35.pyc
    │   │   │       ├── TimePoint.cpython-36.pyc
    │   │   │       ├── TimeUnit.cpython-35.pyc
    │   │   │       ├── TimeUnit.cpython-36.pyc
    │   │   │       ├── __init__.cpython-35.pyc
    │   │   │       └── __init__.cpython-36.pyc
    │   │   │   └── resource
    │   │   │       ├── __init__.py
    │   │   │       ├── __pycache__
    │   │   │           └── __init__.cpython-36.pyc
    │   │   │       ├── holi_lunar.json
    │   │   │       ├── holi_solar.json
    │   │   │       ├── reg.pkl
    │   │   │       └── regex.txt
    │   └── phrase
    │   │   ├── __pycache__
    │   │       └── rake.cpython-36.pyc
    │   │   ├── data
    │   │       └── stopwords.txt
    │   │   └── rake.py
    ├── extractor.py
    └── extractor.pyc
├── dist
    ├── .DS_Store
    ├── cocoNLP-0.0.10.tar.gz
    ├── cocoNLP-0.0.11.tar.gz
    ├── cocoNLP-0.0.12.tar.gz
    ├── cocoNLP-0.0.13.tar.gz
    └── cocoNLP-0.0.9
    │   ├── MANIFEST.in
    │   ├── PKG-INFO
    │   ├── cocoNLP.egg-info
    │       ├── PKG-INFO
    │       ├── SOURCES.txt
    │       ├── dependency_links.txt
    │       ├── requires.txt
    │       └── top_level.txt
    │   ├── cocoNLP
    │       ├── __init__.py
    │       ├── __version__.py
    │       ├── config
    │       │   ├── basic
    │       │   │   └── time_nlp
    │       │   │   │   ├── .DS_Store
    │       │   │   │   ├── EGG-INFO
    │       │   │   │       ├── PKG-INFO
    │       │   │   │       ├── SOURCES.txt
    │       │   │   │       ├── dependency_links.txt
    │       │   │   │       ├── not-zip-safe
    │       │   │   │       ├── requires.txt
    │       │   │   │       └── top_level.txt
    │       │   │   │   ├── LunarSolarConverter.py
    │       │   │   │   ├── README.md
    │       │   │   │   ├── RangeTimeEnum.py
    │       │   │   │   ├── StringPreHandler.py
    │       │   │   │   ├── Test.py
    │       │   │   │   ├── TimeConverter.egg-info
    │       │   │   │       ├── PKG-INFO
    │       │   │   │       ├── SOURCES.txt
    │       │   │   │       ├── dependency_links.txt
    │       │   │   │       ├── not-zip-safe
    │       │   │   │       ├── requires.txt
    │       │   │   │       └── top_level.txt
    │       │   │   │   ├── TimeNormalizer.py
    │       │   │   │   ├── TimePoint.py
    │       │   │   │   ├── TimeUnit.py
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── __pycache__
    │       │   │   │       ├── LunarSolarConverter.cpython-36.pyc
    │       │   │   │       ├── RangeTimeEnum.cpython-36.pyc
    │       │   │   │       ├── StringPreHandler.cpython-36.pyc
    │       │   │   │       ├── Test.cpython-36.pyc
    │       │   │   │       ├── TimeNormalizer.cpython-36.pyc
    │       │   │   │       ├── TimePoint.cpython-36.pyc
    │       │   │   │       ├── TimeUnit.cpython-36.pyc
    │       │   │   │       └── __init__.cpython-36.pyc
    │       │   │   │   └── resource
    │       │   │   │       ├── __init__.py
    │       │   │   │       ├── __pycache__
    │       │   │   │           └── __init__.cpython-36.pyc
    │       │   │   │       ├── holi_lunar.json
    │       │   │   │       ├── holi_solar.json
    │       │   │   │       ├── reg.pkl
    │       │   │   │       └── regex.txt
    │       │   └── phrase
    │       │   │   ├── __pycache__
    │       │   │       └── rake.cpython-36.pyc
    │       │   │   ├── data
    │       │   │       └── stopwords.txt
    │       │   │   └── rake.py
    │       └── extractor.py
    │   ├── readme.md
    │   ├── requirements.txt
    │   ├── setup.cfg
    │   ├── setup.py
    │   └── test.py
├── readme.md
├── requirements.txt
├── setup.py
└── test.py


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include cocoNLP/config *
2 | recursive-include cocoNLP/config *
3 | 


--------------------------------------------------------------------------------
/build/lib/cocoNLP/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | # -*- coding: utf-8 -*-
10 | 
11 | """
12 | cocoNLP module
13 | :copyright: (c) 2018 by Yang Yang.
14 | :license: MIT, see LICENSE for more details.
15 | """
16 | 


--------------------------------------------------------------------------------
/build/lib/cocoNLP/__version__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | 
10 | __title__ = "cocoNLP"
11 | __description__ = "Python implementation of many nlp algorithms"
12 | __url__ = "https://github.com/fighting41love"
13 | __version__ = "0.0.7"
14 | __author__ = "Yang Yang"
15 | __author_email__ = "yangyangfuture@gmail.com"
16 | __license__ = "MIT"
17 | __copyright__ = "Copyright 2018 Yang Yang"
18 | 


--------------------------------------------------------------------------------
/build/lib/cocoNLP/rake.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Implementation of Rapid Automatic Keyword Extraction algorithm.
  3 | 
  4 | As described in the paper `Automatic keyword extraction from individual
  5 | documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
  6 | """
  7 | 
  8 | import string
  9 | from collections import Counter, defaultdict
 10 | from itertools import chain, groupby, product
 11 | import jieba
 12 | import re
 13 | from enum import Enum
 14 | 
 15 | 
 16 | class Metric(Enum):
 17 |     """Different metrics that can be used for ranking."""
 18 | 
 19 |     DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
 20 |     WORD_DEGREE = 1  # Uses d(w) alone as the metric
 21 |     WORD_FREQUENCY = 2  # Uses f(w) alone as the metric
 22 | 
 23 | 
 24 | class Rake(object):
 25 |     """Rapid Automatic Keyword Extraction Algorithm."""
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         punctuations=None,
 30 |         ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
 31 |         max_length=100000,
 32 |         min_length=1,
 33 |     ):
 34 |         """Constructor.
 35 | 
 36 |         :param stopwords: List of Words to be ignored for keyword extraction.
 37 |         :param punctuations: Punctuations to be ignored for keyword extraction.
 38 |         :param language: Language to be used for stopwords
 39 |         :param max_length: Maximum limit on the number of words in a phrase
 40 |                            (Inclusive. Defaults to 100000)
 41 |         :param min_length: Minimum limit on the number of words in a phrase
 42 |                            (Inclusive. Defaults to 1)
 43 |         """
 44 |         # By default use degree to frequency ratio as the metric.
 45 |         if isinstance(ranking_metric, Metric):
 46 |             self.metric = ranking_metric
 47 |         else:
 48 |             self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO
 49 | 
 50 |         # If stopwords not provided we use language stopwords by default.
 51 |         self.stopwords = self.load_stopwords()
 52 | 
 53 |         # If punctuations are not provided we ignore all punctuation symbols.
 54 |         self.punctuations = punctuations
 55 |         if self.punctuations is None:
 56 |             self.punctuations = string.punctuation + ',，。！？!?' # add chinese punctuation
 57 | 
 58 |         # All things which act as sentence breaks during keyword extraction.
 59 |         self.to_ignore = set(chain(self.stopwords, self.punctuations))
 60 | 
 61 |         # Assign min or max length to the attributes
 62 |         self.min_length = min_length
 63 |         self.max_length = max_length
 64 | 
 65 |         # Stuff to be extracted from the provided text.
 66 |         self.frequency_dist = None
 67 |         self.degree = None
 68 |         self.rank_list = None
 69 |         self.ranked_phrases = None
 70 | 
 71 |     def load_stopwords(self, path = 'data/stopwords.txt'):
 72 |         """load stopwords list
 73 |         eg: stopwords_list = load_stopwords(path)
 74 | 
 75 |         :param path: 停用词表path，提前整理好的，直接读进来
 76 |         :return: list<stopwords>
 77 |         """
 78 |         with open(path) as f:
 79 |             stopwords = f.readlines()
 80 |         stopwords_list = []
 81 |         for word in stopwords:
 82 |             stopwords_list.append(word.replace('\n', '').replace(' ', ''))
 83 | 
 84 |         return stopwords_list
 85 | 
 86 |     def tokenize_chinese(self,text):
 87 | 
 88 |         sentences = re.split('(。|！|\!|\.|？|\?)', text)  # 保留分割符
 89 | 
 90 |         new_sents = []
 91 |         for i in range(int(len(sentences) / 2)):
 92 |             sent = sentences[2 * i] + sentences[2 * i + 1]
 93 |             new_sents.append(sent)
 94 |         return new_sents
 95 | 
 96 |     def extract_keywords_from_text(self, text, min_len, max_len):
 97 |         """Method to extract keywords from the text provided.
 98 | 
 99 |         :param text: Text to extract keywords from, provided as a string.
100 |         """
101 |         sentences = self.tokenize_chinese(text)
102 |         self.extract_keywords_from_sentences(sentences, min_len, max_len)
103 | 
104 |     def extract_keywords_from_sentences(self, sentences, min_len, max_len):
105 |         """Method to extract keywords from the list of sentences provided.
106 | 
107 |         :param sentences: Text to extraxt keywords from, provided as a list
108 |                           of strings, where each string is a sentence.
109 |         """
110 |         phrase_list = self._generate_phrases(sentences, min_len, max_len)
111 |         self._build_frequency_dist(phrase_list)
112 |         self._build_word_co_occurance_graph(phrase_list)
113 |         self._build_ranklist(phrase_list)
114 | 
115 |     def get_ranked_phrases(self):
116 |         """Method to fetch ranked keyword strings.
117 | 
118 |         :return: List of strings where each string represents an extracted
119 |                  keyword string.
120 |         """
121 |         return self.ranked_phrases
122 | 
123 |     def get_ranked_phrases_with_scores(self):
124 |         """Method to fetch ranked keyword strings along with their scores.
125 | 
126 |         :return: List of tuples where each tuple is formed of an extracted
127 |                  keyword string and its score. Ex: (5.68, 'Four Scoures')
128 |         """
129 |         return self.rank_list
130 | 
131 |     def get_word_frequency_distribution(self):
132 |         """Method to fetch the word frequency distribution in the given text.
133 | 
134 |         :return: Dictionary (defaultdict) of the format `word -> frequency`.
135 |         """
136 |         return self.frequency_dist
137 | 
138 |     def get_word_degrees(self):
139 |         """Method to fetch the degree of words in the given text. Degree can be
140 |         defined as sum of co-occurances of the word with other words in the
141 |         given text.
142 | 
143 |         :return: Dictionary (defaultdict) of the format `word -> degree`.
144 |         """
145 |         return self.degree
146 | 
147 |     def _build_frequency_dist(self, phrase_list):
148 |         """Builds frequency distribution of the words in the given body of text.
149 | 
150 |         :param phrase_list: List of List of strings where each sublist is a
151 |                             collection of words which form a contender phrase.
152 |         """
153 |         self.frequency_dist = Counter(chain.from_iterable(phrase_list))
154 | 
155 |     def _build_word_co_occurance_graph(self, phrase_list):
156 |         """Builds the co-occurance graph of words in the given body of text to
157 |         compute degree of each word.
158 | 
159 |         :param phrase_list: List of List of strings where each sublist is a
160 |                             collection of words which form a contender phrase.
161 |         """
162 |         co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
163 |         for phrase in phrase_list:
164 |             # For each phrase in the phrase list, count co-occurances of the
165 |             # word with other words in the phrase.
166 |             #
167 |             # Note: Keep the co-occurances graph as is, to help facilitate its
168 |             # use in other creative ways if required later.
169 |             for (word, coword) in product(phrase, phrase):
170 |                 co_occurance_graph[word][coword] += 1
171 |         self.degree = defaultdict(lambda: 0)
172 |         for key in co_occurance_graph:
173 |             self.degree[key] = sum(co_occurance_graph[key].values())
174 | 
175 |     def _build_ranklist(self, phrase_list):
176 |         """Method to rank each contender phrase using the formula
177 | 
178 |               phrase_score = sum of scores of words in the phrase.
179 |               word_score = d(w)/f(w) where d is degree and f is frequency.
180 | 
181 |         :param phrase_list: List of List of strings where each sublist is a
182 |                             collection of words which form a contender phrase.
183 |         """
184 |         self.rank_list = []
185 |         for phrase in phrase_list:
186 |             rank = 0.0
187 |             for word in phrase:
188 |                 if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
189 |                     rank += 1.0 * self.degree[word] / self.frequency_dist[word]
190 |                 elif self.metric == Metric.WORD_DEGREE:
191 |                     rank += 1.0 * self.degree[word]
192 |                 else:
193 |                     rank += 1.0 * self.frequency_dist[word]
194 |             self.rank_list.append((rank, " ".join(phrase)))
195 |         self.rank_list.sort(reverse=True)
196 |         self.ranked_phrases = [ph[1] for ph in self.rank_list]
197 | 
198 |     def _generate_phrases(self, sentences, min_len, max_len):
199 |         """Method to generate contender phrases given the sentences of the text
200 |         document.
201 | 
202 |         :param sentences: List of strings where each string represents a
203 |                           sentence which forms the text.
204 |         :return: Set of string tuples where each tuple is a collection
205 |                  of words forming a contender phrase.
206 |         """
207 |         phrase_list = set()
208 |         # Create contender phrases from sentences.
209 |         for sentence in sentences:
210 |             word_list = [word for word in list(jieba.cut(sentence))]
211 |             phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len))
212 |         return phrase_list
213 | 
214 |     def _get_phrase_list_from_words(self, word_list, min_len, max_len):
215 |         """Method to create contender phrases from the list of words that form
216 |         a sentence by dropping stopwords and punctuations and grouping the left
217 |         words into phrases. Only phrases in the given length range (both limits
218 |         inclusive) would be considered to build co-occurrence matrix. Ex:
219 | 
220 |         Sentence: Red apples, are good in flavour.
221 |         List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
222 |         List after dropping punctuations and stopwords.
223 |         List of words: ['red', 'apples', *, *, good, *, 'flavour']
224 |         List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
225 | 
226 |         List of phrases with a correct length:
227 |         For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
228 |         For the range [1, 1]: [('good',), ('flavour',)]
229 |         For the range [2, 2]: [('red', 'apples')]
230 | 
231 |         :param word_list: List of words which form a sentence when joined in
232 |                           the same order.
233 |         :return: List of contender phrases that are formed after dropping
234 |                  stopwords and punctuations.
235 |         """
236 |         groups = groupby(word_list, lambda x: x not in self.to_ignore)
237 |         phrases = []
238 |         for group in groups:
239 |             tmp = tuple(group[1])
240 |             len_g1 = len(list(tmp))
241 |             if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase
242 |                 phrases.append(tuple(tmp))
243 | 
244 |         return list(
245 |             filter(
246 |                 lambda x: self.min_length <= len(x) <= self.max_length, phrases
247 |             )
248 |         )
249 | 


--------------------------------------------------------------------------------
/cocoNLP.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: cocoNLP
 3 | Version: 0.0.13
 4 | Summary: Python implementation of many nlp algorithms
 5 | Home-page: https://github.com/fighting41love
 6 | Author: Yang Yang
 7 | Author-email: yangyangfuture@gmail.com
 8 | License: MIT
 9 | Description: UNKNOWN
10 | Keywords: nlp text-mining information extraction
11 | Platform: UNKNOWN
12 | Classifier: Intended Audience :: Developers
13 | Classifier: Intended Audience :: Education
14 | Classifier: License :: OSI Approved :: MIT License
15 | Classifier: Development Status :: 3 - Alpha
16 | Classifier: Operating System :: POSIX
17 | Classifier: Programming Language :: Python :: 2.7
18 | Classifier: Programming Language :: Python :: 3.4
19 | Classifier: Programming Language :: Python :: 3.5
20 | Classifier: Programming Language :: Python :: 3.6
21 | Classifier: Topic :: Software Development :: Build Tools
22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules
23 | 


--------------------------------------------------------------------------------
/cocoNLP.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | MANIFEST.in
 3 | readme.md
 4 | requirements.txt
 5 | setup.py
 6 | test.py
 7 | cocoNLP/__init__.py
 8 | cocoNLP/__version__.py
 9 | cocoNLP/extractor.py
10 | cocoNLP.egg-info/PKG-INFO
11 | cocoNLP.egg-info/SOURCES.txt
12 | cocoNLP.egg-info/dependency_links.txt
13 | cocoNLP.egg-info/requires.txt
14 | cocoNLP.egg-info/top_level.txt
15 | cocoNLP/config/.DS_Store
16 | cocoNLP/config/basic/time_nlp/.DS_Store
17 | cocoNLP/config/basic/time_nlp/LunarSolarConverter.py
18 | cocoNLP/config/basic/time_nlp/README.md
19 | cocoNLP/config/basic/time_nlp/RangeTimeEnum.py
20 | cocoNLP/config/basic/time_nlp/StringPreHandler.py
21 | cocoNLP/config/basic/time_nlp/Test.py
22 | cocoNLP/config/basic/time_nlp/TimeNormalizer.py
23 | cocoNLP/config/basic/time_nlp/TimePoint.py
24 | cocoNLP/config/basic/time_nlp/TimeUnit.py
25 | cocoNLP/config/basic/time_nlp/__init__.py
26 | cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO
27 | cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt
28 | cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt
29 | cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe
30 | cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt
31 | cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt
32 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO
33 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt
34 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt
35 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe
36 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt
37 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt
38 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc
39 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc
40 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc
41 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc
42 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc
43 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc
44 | cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc
45 | cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc
46 | cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc
47 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc
48 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc
49 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc
50 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc
51 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc
52 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc
53 | cocoNLP/config/basic/time_nlp/resource/__init__.py
54 | cocoNLP/config/basic/time_nlp/resource/holi_lunar.json
55 | cocoNLP/config/basic/time_nlp/resource/holi_solar.json
56 | cocoNLP/config/basic/time_nlp/resource/reg.pkl
57 | cocoNLP/config/basic/time_nlp/resource/regex.txt
58 | cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc
59 | cocoNLP/config/phrase/rake.py
60 | cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc
61 | cocoNLP/config/phrase/data/stopwords.txt


--------------------------------------------------------------------------------
/cocoNLP.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cocoNLP.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | pyhanlp
3 | phone
4 | phonenumbers
5 | regex
6 | arrow
7 | 


--------------------------------------------------------------------------------
/cocoNLP.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | cocoNLP
2 | 


--------------------------------------------------------------------------------
/cocoNLP/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | # -*- coding: utf-8 -*-
10 | 
11 | """
12 | cocoNLP module
13 | :copyright: (c) 2019 by Yang Yang.
14 | :license: MIT, see LICENSE for more details.
15 | """
16 | 


--------------------------------------------------------------------------------
/cocoNLP/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__init__.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/extractor.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/extractor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/__pycache__/extractor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/__pycache__/extractor.cpython-37.pyc


--------------------------------------------------------------------------------
/cocoNLP/__version__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | 
10 | __title__ = "cocoNLP"
11 | __description__ = "Python implementation of many nlp algorithms"
12 | __url__ = "https://github.com/fighting41love"
13 | __version__ = "0.0.13"
14 | __author__ = "Yang Yang"
15 | __author_email__ = "yangyangfuture@gmail.com"
16 | __license__ = "MIT"
17 | __copyright__ = "Copyright 2019 Yang Yang"
18 | 


--------------------------------------------------------------------------------
/cocoNLP/config/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/.DS_Store


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/.DS_Store


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: TimeConverter
 3 | Version: 1.1.0
 4 | Summary: ...
 5 | Home-page: http://test.com
 6 | Author: test
 7 | Author-email: test@gmail.com
 8 | License: MIT Licence
 9 | Description: ...
10 | Keywords: time,nlp
11 | Platform: any
12 | Classifier: Programming Language :: Python :: 2.6
13 | Classifier: Programming Language :: Python :: 2.7
14 | Classifier: Programming Language :: Python :: 3.6
15 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LunarSolarConverter.py
 2 | README.md
 3 | RangeTimeEnum.py
 4 | StringPreHandler.py
 5 | Test.py
 6 | TimeNormalizer.py
 7 | TimePoint.py
 8 | TimeUnit.py
 9 | __init__.py
10 | setup.py
11 | TimeConverter.egg-info/PKG-INFO
12 | TimeConverter.egg-info/SOURCES.txt
13 | TimeConverter.egg-info/dependency_links.txt
14 | TimeConverter.egg-info/not-zip-safe
15 | TimeConverter.egg-info/requires.txt
16 | TimeConverter.egg-info/top_level.txt
17 | resource/__init__.py


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
1 | regex>=2017
2 | arrow>=0.10
3 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | 
2 | resource
3 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/LunarSolarConverter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/12/11 11:08
  4 | # @Author  : zhm
  5 | # @File    : LunarSolarConverter.py
  6 | # @Software: PyCharm
  7 | from pprint import pprint
  8 | 
  9 | 
 10 | class Lunar:
 11 |     def __init__(self, lunarYear, lunarMonth, lunarDay, isleap):
 12 |         self.isleap = isleap
 13 |         self.lunarDay = lunarDay
 14 |         self.lunarMonth = lunarMonth
 15 |         self.lunarYear = lunarYear
 16 | 
 17 | 
 18 | class Solar:
 19 |     def __init__(self, solarYear, solarMonth, solarDay):
 20 |         self.solarDay = solarDay
 21 |         self.solarMonth = solarMonth
 22 |         self.solarYear = solarYear
 23 | 
 24 | 
 25 | def GetBitInt(data, length, shift):
 26 |     return (data & (((1 << length) - 1) << shift)) >> shift
 27 | 
 28 | 
 29 | def SolarToInt(y, m, d):
 30 |     m = (m + 9) % 12
 31 |     y -= int(m / 10)
 32 |     return 365 * y + int(y / 4) - int(y / 100) + int(y / 400) + int((m * 306 + 5) / 10) + (d - 1)
 33 | 
 34 | 
 35 | def SolarFromInt(g):
 36 |     y = int((10000 * g + 14780) / 3652425)
 37 |     ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))
 38 |     if ddd < 0:
 39 |         y -= 1
 40 |         ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))
 41 | 
 42 |     mi = int((100 * ddd + 52) / 3060)
 43 |     mm = (mi + 2) % 12 + 1
 44 |     y += int((mi + 2) / 12)
 45 |     dd = ddd - int((mi * 306 + 5) / 10) + 1
 46 |     solar = Solar(y, mm, dd)
 47 |     return solar
 48 | 
 49 | 
 50 | class LunarSolarConverter:
 51 |     #####################################################################################
 52 |     # 1888~2111年农历数据表
 53 |     # 农历数据 每个元素的存储格式如下：
 54 |     #   16~13    12          11~0
 55 |     #  闰几月 闰月日数  1~12月份农历日数(大小月)
 56 |     # 注：1、bit0表示农历1月份日数，为1表示30天，为0表示29天。bit1表示农历2月份日数，依次类推。
 57 |     #     2、bit12表示闰月日数，1为30天，0为29天。bit16~bit13表示第几月是闰月(注：为0表示该年无闰月)
 58 |     # 数据来源参考: http://data.weather.gov.hk/gts/time/conversion1_text_c.htm
 59 |     #####################################################################################
 60 |     lunar_month_days = [1887, 0x1694, 0x16aa, 0x4ad5, 0xab6, 0xc4b7, 0x4ae, 0xa56, 0xb52a,
 61 |                         0x1d2a, 0xd54, 0x75aa, 0x156a, 0x1096d, 0x95c, 0x14ae, 0xaa4d, 0x1a4c, 0x1b2a, 0x8d55,
 62 |                         0xad4, 0x135a, 0x495d,
 63 |                         0x95c, 0xd49b, 0x149a, 0x1a4a, 0xbaa5, 0x16a8, 0x1ad4, 0x52da, 0x12b6, 0xe937, 0x92e,
 64 |                         0x1496, 0xb64b, 0xd4a,
 65 |                         0xda8, 0x95b5, 0x56c, 0x12ae, 0x492f, 0x92e, 0xcc96, 0x1a94, 0x1d4a, 0xada9, 0xb5a, 0x56c,
 66 |                         0x726e, 0x125c,
 67 |                         0xf92d, 0x192a, 0x1a94, 0xdb4a, 0x16aa, 0xad4, 0x955b, 0x4ba, 0x125a, 0x592b, 0x152a,
 68 |                         0xf695, 0xd94, 0x16aa,
 69 |                         0xaab5, 0x9b4, 0x14b6, 0x6a57, 0xa56, 0x1152a, 0x1d2a, 0xd54, 0xd5aa, 0x156a, 0x96c,
 70 |                         0x94ae, 0x14ae, 0xa4c,
 71 |                         0x7d26, 0x1b2a, 0xeb55, 0xad4, 0x12da, 0xa95d, 0x95a, 0x149a, 0x9a4d, 0x1a4a, 0x11aa5,
 72 |                         0x16a8, 0x16d4,
 73 |                         0xd2da, 0x12b6, 0x936, 0x9497, 0x1496, 0x1564b, 0xd4a, 0xda8, 0xd5b4, 0x156c, 0x12ae,
 74 |                         0xa92f, 0x92e, 0xc96,
 75 |                         0x6d4a, 0x1d4a, 0x10d65, 0xb58, 0x156c, 0xb26d, 0x125c, 0x192c, 0x9a95, 0x1a94, 0x1b4a,
 76 |                         0x4b55, 0xad4,
 77 |                         0xf55b, 0x4ba, 0x125a, 0xb92b, 0x152a, 0x1694, 0x96aa, 0x15aa, 0x12ab5, 0x974, 0x14b6,
 78 |                         0xca57, 0xa56, 0x1526,
 79 |                         0x8e95, 0xd54, 0x15aa, 0x49b5, 0x96c, 0xd4ae, 0x149c, 0x1a4c, 0xbd26, 0x1aa6, 0xb54,
 80 |                         0x6d6a, 0x12da, 0x1695d,
 81 |                         0x95a, 0x149a, 0xda4b, 0x1a4a, 0x1aa4, 0xbb54, 0x16b4, 0xada, 0x495b, 0x936, 0xf497,
 82 |                         0x1496, 0x154a, 0xb6a5,
 83 |                         0xda4, 0x15b4, 0x6ab6, 0x126e, 0x1092f, 0x92e, 0xc96, 0xcd4a, 0x1d4a, 0xd64, 0x956c,
 84 |                         0x155c, 0x125c, 0x792e,
 85 |                         0x192c, 0xfa95, 0x1a94, 0x1b4a, 0xab55, 0xad4, 0x14da, 0x8a5d, 0xa5a, 0x1152b, 0x152a,
 86 |                         0x1694, 0xd6aa,
 87 |                         0x15aa, 0xab4, 0x94ba, 0x14b6, 0xa56, 0x7527, 0xd26, 0xee53, 0xd54, 0x15aa, 0xa9b5, 0x96c,
 88 |                         0x14ae, 0x8a4e,
 89 |                         0x1a4c, 0x11d26, 0x1aa4, 0x1b54, 0xcd6a, 0xada, 0x95c, 0x949d, 0x149a, 0x1a2a, 0x5b25,
 90 |                         0x1aa4, 0xfb52,
 91 |                         0x16b4, 0xaba, 0xa95b, 0x936, 0x1496, 0x9a4b, 0x154a, 0x136a5, 0xda4, 0x15ac]
 92 |     # 额外添加数据，方便快速计算阴历转阳历 每个元素的存储格式如下：
 93 |     #    12~7         6~5    4~0
 94 |     #  离元旦多少天  春节月  春节日
 95 |     #####################################################################################
 96 |     solar_1_1 = [1887, 0xec04c, 0xec23f, 0xec435, 0xec649, 0xec83e, 0xeca51, 0xecc46, 0xece3a,
 97 |                  0xed04d, 0xed242, 0xed436, 0xed64a, 0xed83f, 0xeda53, 0xedc48, 0xede3d, 0xee050, 0xee244, 0xee439,
 98 |                  0xee64d,
 99 |                  0xee842, 0xeea36, 0xeec4a, 0xeee3e, 0xef052, 0xef246, 0xef43a, 0xef64e, 0xef843, 0xefa37, 0xefc4b,
100 |                  0xefe41,
101 |                  0xf0054, 0xf0248, 0xf043c, 0xf0650, 0xf0845, 0xf0a38, 0xf0c4d, 0xf0e42, 0xf1037, 0xf124a, 0xf143e,
102 |                  0xf1651,
103 |                  0xf1846, 0xf1a3a, 0xf1c4e, 0xf1e44, 0xf2038, 0xf224b, 0xf243f, 0xf2653, 0xf2848, 0xf2a3b, 0xf2c4f,
104 |                  0xf2e45,
105 |                  0xf3039, 0xf324d, 0xf3442, 0xf3636, 0xf384a, 0xf3a3d, 0xf3c51, 0xf3e46, 0xf403b, 0xf424e, 0xf4443,
106 |                  0xf4638,
107 |                  0xf484c, 0xf4a3f, 0xf4c52, 0xf4e48, 0xf503c, 0xf524f, 0xf5445, 0xf5639, 0xf584d, 0xf5a42, 0xf5c35,
108 |                  0xf5e49,
109 |                  0xf603e, 0xf6251, 0xf6446, 0xf663b, 0xf684f, 0xf6a43, 0xf6c37, 0xf6e4b, 0xf703f, 0xf7252, 0xf7447,
110 |                  0xf763c,
111 |                  0xf7850, 0xf7a45, 0xf7c39, 0xf7e4d, 0xf8042, 0xf8254, 0xf8449, 0xf863d, 0xf8851, 0xf8a46, 0xf8c3b,
112 |                  0xf8e4f,
113 |                  0xf9044, 0xf9237, 0xf944a, 0xf963f, 0xf9853, 0xf9a47, 0xf9c3c, 0xf9e50, 0xfa045, 0xfa238, 0xfa44c,
114 |                  0xfa641,
115 |                  0xfa836, 0xfaa49, 0xfac3d, 0xfae52, 0xfb047, 0xfb23a, 0xfb44e, 0xfb643, 0xfb837, 0xfba4a, 0xfbc3f,
116 |                  0xfbe53,
117 |                  0xfc048, 0xfc23c, 0xfc450, 0xfc645, 0xfc839, 0xfca4c, 0xfcc41, 0xfce36, 0xfd04a, 0xfd23d, 0xfd451,
118 |                  0xfd646,
119 |                  0xfd83a, 0xfda4d, 0xfdc43, 0xfde37, 0xfe04b, 0xfe23f, 0xfe453, 0xfe648, 0xfe83c, 0xfea4f, 0xfec44,
120 |                  0xfee38,
121 |                  0xff04c, 0xff241, 0xff436, 0xff64a, 0xff83e, 0xffa51, 0xffc46, 0xffe3a, 0x10004e, 0x100242,
122 |                  0x100437,
123 |                  0x10064b, 0x100841, 0x100a53, 0x100c48, 0x100e3c, 0x10104f, 0x101244, 0x101438, 0x10164c,
124 |                  0x101842, 0x101a35,
125 |                  0x101c49, 0x101e3d, 0x102051, 0x102245, 0x10243a, 0x10264e, 0x102843, 0x102a37, 0x102c4b,
126 |                  0x102e3f, 0x103053,
127 |                  0x103247, 0x10343b, 0x10364f, 0x103845, 0x103a38, 0x103c4c, 0x103e42, 0x104036, 0x104249,
128 |                  0x10443d, 0x104651,
129 |                  0x104846, 0x104a3a, 0x104c4e, 0x104e43, 0x105038, 0x10524a, 0x10543e, 0x105652, 0x105847,
130 |                  0x105a3b, 0x105c4f,
131 |                  0x105e45, 0x106039, 0x10624c, 0x106441, 0x106635, 0x106849, 0x106a3d, 0x106c51, 0x106e47,
132 |                  0x10703c, 0x10724f,
133 |                  0x107444, 0x107638, 0x10784c, 0x107a3f, 0x107c53, 0x107e48]
134 | 
135 |     def LunarToSolar(self, lunar):
136 |         days = LunarSolarConverter.lunar_month_days[lunar.lunarYear - LunarSolarConverter.lunar_month_days[0]]
137 |         leap = GetBitInt(days, 4, 13)
138 |         offset = 0
139 |         loopend = leap
140 |         if not lunar.isleap:
141 | 
142 |             if lunar.lunarMonth <= leap or leap == 0:
143 | 
144 |                 loopend = lunar.lunarMonth - 1
145 | 
146 |             else:
147 | 
148 |                 loopend = lunar.lunarMonth
149 | 
150 |         for i in range(0, loopend):
151 |             offset += GetBitInt(days, 1, 12 - i) == 1 and 30 or 29
152 | 
153 |         offset += lunar.lunarDay
154 | 
155 |         solar11 = LunarSolarConverter.solar_1_1[lunar.lunarYear - LunarSolarConverter.solar_1_1[0]]
156 | 
157 |         y = GetBitInt(solar11, 12, 9)
158 |         m = GetBitInt(solar11, 4, 5)
159 |         d = GetBitInt(solar11, 5, 0)
160 | 
161 |         return SolarFromInt(SolarToInt(y, m, d) + offset - 1)
162 | 
163 |     def SolarToLunar(self, solar):
164 | 
165 |         lunar = Lunar(0, 0, 0, False)
166 |         index = solar.solarYear - LunarSolarConverter.solar_1_1[0]
167 |         data = (solar.solarYear << 9) | (solar.solarMonth << 5) | solar.solarDay
168 |         if LunarSolarConverter.solar_1_1[index] > data:
169 |             index -= 1
170 | 
171 |         solar11 = LunarSolarConverter.solar_1_1[index]
172 |         y = GetBitInt(solar11, 12, 9)
173 |         m = GetBitInt(solar11, 4, 5)
174 |         d = GetBitInt(solar11, 5, 0)
175 |         offset = SolarToInt(solar.solarYear, solar.solarMonth, solar.solarDay) - SolarToInt(y, m, d)
176 | 
177 |         days = LunarSolarConverter.lunar_month_days[index]
178 |         leap = GetBitInt(days, 4, 13)
179 | 
180 |         lunarY = index + LunarSolarConverter.solar_1_1[0]
181 |         lunarM = 1
182 |         offset += 1
183 | 
184 |         for i in range(0, 13):
185 | 
186 |             dm = GetBitInt(days, 1, 12 - i) == 1 and 30 or 29
187 |             if offset > dm:
188 | 
189 |                 lunarM += 1
190 |                 offset -= dm
191 | 
192 |             else:
193 | 
194 |                 break
195 | 
196 |         lunarD = int(offset)
197 |         lunar.lunarYear = lunarY
198 |         lunar.lunarMonth = lunarM
199 |         lunar.isleap = False
200 |         if leap != 0 and lunarM > leap:
201 | 
202 |             lunar.lunarMonth = lunarM - 1
203 |             if lunarM == leap + 1:
204 |                 lunar.isleap = True
205 | 
206 |         lunar.lunarDay = lunarD
207 |         return lunar
208 | 
209 |     def __init__(self):
210 |         pass
211 | 
212 | 
213 | if __name__ == '__main__':
214 |     converter = LunarSolarConverter()
215 |     solar = Solar(2111, 1, 25)
216 |     pprint(vars(solar))
217 |     lunar = converter.SolarToLunar(solar)
218 |     pprint(vars(lunar))
219 |     solar = converter.LunarToSolar(lunar)
220 |     pprint(vars(solar))
221 |     print(len(converter.solar_1_1))
222 |     print("Done")
223 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/README.md:
--------------------------------------------------------------------------------
 1 | ## 简介
 2 | Time-NLP的python3版本   
 3 | python 版本https://github.com/sunfiyes/Time-NLPY  
 4 | Java 版本https://github.com/shinyke/Time-NLP
 5 | ## 功能说明
 6 | 用于句子中时间词的抽取和转换  
 7 | 详情请见test.py
 8 | 
 9 |     res = tn.parse(target=u'过十分钟') # target为待分析语句，timeBase为基准时间默认是当前时间
10 |     print(res)
11 |     res = tn.parse(target=u'2013年二月二十八日下午四点三十分二十九秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
12 |     print(res)
13 |     res = tn.parse(target=u'我需要大概33天2分钟四秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
14 |     print(res)
15 |     res = tn.parse(target=u'今年儿童节晚上九点一刻') # target为待分析语句，timeBase为基准时间默认是当前时间
16 |     print(res)
17 |     res = tn.parse(target=u'2个小时以前') # target为待分析语句，timeBase为基准时间默认是当前时间
18 |     print(res)
19 |     res = tn.parse(target=u'晚上8点到上午10点之间') # target为待分析语句，timeBase为基准时间默认是当前时间
20 |     print(res)
21 | 返回结果：
22 | 
23 |     {"timedelta": "0 days, 0:10:00", "type": "timedelta"}
24 |     {"timestamp": "2013-02-28 16:30:29", "type": "timestamp"}
25 |     {"type": "timedelta", "timedelta": {"year": 0, "month": 1, "day": 3, "hour": 0, "minute": 2, "second": 4}}
26 |     {"timestamp": "2018-06-01 21:15:00", "type": "timestamp"}
27 |     {"error": "no time pattern could be extracted."}
28 |     {"type": "timespan", "timespan": ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]}
29 |     
30 | ## 使用方式 
31 | demo：python3 Test.py
32 | 
33 | 优化说明
34 |     
35 | | 问题          | 以前版本                                     | 现在版本                    |
36 | | ----------- | ---------------------------------------- | ---------------------- |
37 | | 无法解析下下周末     | "timestamp": "2018-04-01 00:00:00"                                    | "timestamp": "2018-04-08 00:00:00"                 |
38 | | 无法解析 3月4         | "2018-03-01"                                   | "2018-03-04"               |
39 | | 无法解析 初一 初二      | cannot parse                                    | "2018-02-16"              |
40 | | 晚上8点到上午10点之间  无法解析上午      | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] |  ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]|
41 | | 3月21号  错误解析成2019年      | "2019-03-21" | "2018-03-21" |
42 | 
43 | 感谢@[tianyuningmou](https://github.com/tianyuningmou) 目前增加了对24节气的支持
44 | 
45 | 
46 |     temp = ['今年春分']
47 |     "timestamp" : "2020-03-20 00:00:00"
48 | 
49 | ## TODO
50 | 
51 | | 问题          | 现在版本                                     | 正确
52 | | ----------- | ---------------------------------------- | ---------------------- |
53 | | 晚上8点到上午10点之间     |  ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] |  ["2018-03-16 20:00:00", "2018-03-17 10:00:00"]"                                    | "timestamp": "2018-04-08 00:00:00"                 |
54 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/RangeTimeEnum.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2017/11/20 16:27
 4 | # @Author  : zhm
 5 | # @File    : RangeTimeEnum.py
 6 | # @Software: PyCharm
 7 | 
 8 | 
 9 | 
10 | # 范围时间的默认时间点
11 | class RangeTimeEnum():
12 |     day_break = 3  # 黎明
13 |     early_morning = 8  # 早
14 |     morning = 10  # 上午
15 |     noon = 12  # 中午、午间
16 |     afternoon = 15  # 下午、午后
17 |     night = 18  # 晚上、傍晚
18 |     lateNight = 20  # 晚、晚间
19 |     midNight = 23  # 深夜
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     print(RangeTimeEnum.afternoon)
24 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/StringPreHandler.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/11/20 15:42
  4 | # @Author  : zhm
  5 | # @File    : StringPreHandler.py
  6 | # @Software: PyCharm
  7 | import regex as re
  8 | 
  9 | # * 字符串预处理模块，为分析器TimeNormalizer提供相应的字符串预处理服务
 10 | class StringPreHandler:
 11 |     @classmethod
 12 |     def delKeyword(cls, target, rules):
 13 |         """
 14 |         该方法删除一字符串中所有匹配某一规则字串
 15 |         可用于清理一个字符串中的空白符和语气助词
 16 |         :param target: 待处理字符串
 17 |         :param rules: 删除规则
 18 |         :return: 清理工作完成后的字符串
 19 |         """
 20 |         pattern = re.compile(rules)
 21 |         res = pattern.sub('', target)
 22 |         # print res
 23 |         return res
 24 | 
 25 | 
 26 |     @classmethod
 27 |     def numberTranslator(cls, target):
 28 |         """
 29 |         该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字
 30 |         如"这里有一千两百个人，六百零五个来自中国"可以转化为
 31 |         "这里有1200个人，605个来自中国"
 32 |         此外添加支持了部分不规则表达方法
 33 |         如两万零六百五可转化为20650
 34 |         两百一十四和两百十四都可以转化为214
 35 |         一六零加一五八可以转化为160+158
 36 |         该方法目前支持的正确转化范围是0-99999999
 37 |         该功能模块具有良好的复用性
 38 |         :param target: 待转化的字符串
 39 |         :return: 转化完毕后的字符串
 40 |         """
 41 |         pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))")
 42 |         match = pattern.finditer(target)
 43 |         for m in match:
 44 |             group = m.group()
 45 |             s = group.split(u"万")
 46 |             s = filter(None, s)
 47 |             num = 0
 48 |             if len(s) == 2:
 49 |                 num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000
 50 |             target = pattern.sub(str(num), target, 1)
 51 | 
 52 |         pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))")
 53 |         match = pattern.finditer(target)
 54 |         for m in match:
 55 |             group = m.group()
 56 |             s = group.split(u"千")
 57 |             s = filter(None, s)
 58 |             num = 0
 59 |             if len(s) == 2:
 60 |                 num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100
 61 |             target = pattern.sub(str(num), target, 1)
 62 | 
 63 |         pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)")
 64 |         match = pattern.finditer(target)
 65 |         for m in match:
 66 |             group = m.group()
 67 |             s = group.split(u"百")
 68 |             s = filter(None, s)
 69 |             num = 0
 70 |             if len(s) == 2:
 71 |                 num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10
 72 |             target = pattern.sub(str(num), target, 1)
 73 | 
 74 |         pattern = re.compile(u"[零一二两三四五六七八九]")
 75 |         match = pattern.finditer(target)
 76 |         for m in match:
 77 |             target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
 78 | 
 79 |         pattern = re.compile(u"(?<=(周|星期))[末天日]")
 80 |         match = pattern.finditer(target)
 81 |         for m in match:
 82 |             target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
 83 | 
 84 |         pattern = re.compile(u"(?<!(周|星期))0?[0-9]?十[0-9]?")
 85 |         match = pattern.finditer(target)
 86 |         for m in match:
 87 |             group = m.group()
 88 |             s = group.split(u"十")
 89 |             num = 0
 90 |             ten = cls.strToInt(s[0])
 91 |             if ten == 0:
 92 |                 ten = 1
 93 |             unit = cls.strToInt(s[1])
 94 |             num = ten * 10 + unit
 95 |             target = pattern.sub(str(num), target, 1)
 96 | 
 97 |         pattern = re.compile(u"0?[1-9]百[0-9]?[0-9]?")
 98 |         match = pattern.finditer(target)
 99 |         for m in match:
100 |             group = m.group()
101 |             s = group.split(u"百")
102 |             s = filter(None, s)
103 |             num = 0
104 |             if len(s) == 1:
105 |                 hundred = int(s[0])
106 |                 num += hundred * 100
107 |             elif len(s) == 2:
108 |                 hundred = int(s[0])
109 |                 num += hundred * 100
110 |                 num += int(s[1])
111 |             target = pattern.sub(str(num), target, 1)
112 | 
113 |         pattern = re.compile(u"0?[1-9]千[0-9]?[0-9]?[0-9]?")
114 |         match = pattern.finditer(target)
115 |         for m in match:
116 |             group = m.group()
117 |             s = group.split(u"千")
118 |             s = filter(None, s)
119 |             num = 0
120 |             if len(s) == 1:
121 |                 thousand = int(s[0])
122 |                 num += thousand * 1000
123 |             elif len(s) == 2:
124 |                 thousand = int(s[0])
125 |                 num += thousand * 1000
126 |                 num += int(s[1])
127 |             target = pattern.sub(str(num), target, 1)
128 | 
129 |         pattern = re.compile(u"[0-9]+万[0-9]?[0-9]?[0-9]?[0-9]?")
130 |         match = pattern.finditer(target)
131 |         for m in match:
132 |             group = m.group()
133 |             s = group.split(u"万")
134 |             s = filter(None, s)
135 |             num = 0
136 |             if len(s) == 1:
137 |                 tenthousand = int(s[0])
138 |                 num += tenthousand * 10000
139 |             elif len(s) == 2:
140 |                 tenthousand = int(s[0])
141 |                 num += tenthousand * 10000
142 |                 num += int(s[1])
143 |             target = pattern.sub(str(num), target, 1)
144 | 
145 |         return target
146 | 
147 |     @classmethod
148 |     def wordToNumber(cls, s):
149 |         """
150 |         方法numberTranslator的辅助方法，可将[零-九]正确翻译为[0-9]
151 |         :param s: 大写数字
152 |         :return: 对应的整形数，如果不是数字返回-1
153 |         """
154 |         if (s == u'零') or (s == '0'):
155 |             return 0
156 |         elif (s == u'一') or (s == '1'):
157 |             return 1
158 |         elif (s == u'二') or (s == u'两') or (s == '2'):
159 |             return 2
160 |         elif (s == u'三') or (s == '3'):
161 |             return 3
162 |         elif (s == u'四') or (s == '4'):
163 |             return 4
164 |         elif (s == u'五') or (s == '5'):
165 |             return 5
166 |         elif (s == u'六') or (s == '6'):
167 |             return 6
168 |         elif (s == u'七') or (s == u'天') or (s == u'日') or (s == u'末') or (s == '7'):
169 |             return 7
170 |         elif (s == u'八') or (s == '8'):
171 |             return 8
172 |         elif (s == u'九') or (s == '9'):
173 |             return 9
174 |         else:
175 |             return -1
176 | 
177 |     @classmethod
178 |     def strToInt(cls, s):
179 |         try:
180 |             res = int(s)
181 |         except:
182 |             res = 0
183 |         return res


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/Test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2017/11/22 10:21
 4 | # @Author  : zhm
 5 | # @File    : Test.py
 6 | # @Software: PyCharm
 7 | # @Changed : tianyuningmou
 8 | 
 9 | from  cocoNLP.config.basic.time_nlp.TimeNormalizer import TimeNormalizer # 引入包
10 | 
11 | tn = TimeNormalizer()
12 | 
13 | res = tn.parse(target=u'晚上8点到上午10点之间') # target为待分析语句，timeBase为基准时间默认是当前时间
14 | print(res)
15 | 
16 | res = tn.parse(target=u'2013年二月二十八日下午四点三十分二十九秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
17 | print(res)
18 | 
19 | res = tn.parse(target=u'我需要大概33天2分钟四秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
20 | print(res)
21 | 
22 | res = tn.parse(target=u'今年儿童节晚上九点一刻') # target为待分析语句，timeBase为基准时间默认是当前时间
23 | print(res)
24 | 
25 | res = tn.parse(target=u'三日') # target为待分析语句，timeBase为基准时间默认是当前时间
26 | print(res)
27 | 
28 | res = tn.parse(target=u'7点4') # target为待分析语句，timeBase为基准时间默认是当前时间
29 | print(res)
30 | 
31 | res = tn.parse(target=u'今年春分')
32 | print(res)
33 | 
34 | #
35 | #
36 | #
37 | # def boson_analy(pattern, basetime):
38 | #     nlp = BosonNLP('MWSY30Ja.14341.z39Al-kSksZt')
39 | #     result = nlp.convert_time(
40 | #         pattern,
41 | #         datetime.datetime.today())
42 | #     print (json.dumps(result))
43 | #     return json.dumps(result)
44 | #
45 | # with open('C:/Users/zhm/Desktop/test.txt') as testfile:
46 | #     data = []
47 | #     for each in testfile:
48 | #         res = tn.parse(each, arrow.now())
49 | #         res_b = boson_analy(each, datetime.datetime.now())
50 | #         data.append(each+'Boson:'+res_b+'\n'+'Time-NLP:'+res+'\n'+'\n')
51 | # with open('C:/Users/zhm/Desktop/对比.txt', 'wb') as resfile:
52 | #     resfile.writelines(data)
53 | #
54 | # ### 測試輸出文件
55 | 
56 | 
57 | # with open('resource/holi_lunar.json') as file_1:
58 | #     out = json.load(file_1)
59 | #     print type(out)
60 | # with open('resource/holi_lunar.json', 'w') as file_out:
61 | #     print json.dumps(out, indent=2, ensure_ascii=False)
62 | #     print >> file_out, json.dumps(out, indent=2, ensure_ascii=False).encode('utf-8')
63 | #
64 | # with open('resource/holi_lunar.json') as file_out:
65 | #     print json.load(file_out)
66 | 
67 | 
68 | # dset = []
69 | # with open('C:/Users/zhm/Desktop/test.txt') as testfile:
70 | #     for each in testfile:
71 | #         dset.append(each)
72 | #
73 | # def run(query):
74 | #     tn = TimeNormalizer()
75 | #     res = tn.parse(target=query, timeBase='2013-02-28 16:30:29')
76 | #     print res
77 | # if __name__ == '__main__':
78 | #     while True:
79 | #        query = random.choice(dset)
80 | #        lp = LineProfiler()
81 | #        lp_wrapper = lp(run)
82 | #        lp_wrapper(query)
83 | #        lp.print_stats()
84 | #        cProfile.run("run(query)")
85 | 
86 | # with open(os.path.dirname(__file__) + '/resource/regex.txt', 'wb') as f:
87 | #     f.write(u'((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\\d+个?[年月日天][以之]?[前后])|(\\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\\d+[:：]\\d+([:：]\\d+)*)\\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\\d+[时点](\\d+)?分?(\\d+秒?)?)\\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\\d+)世)|([0-9]?[0-9]?[0-9]{2}\\.((10)|(11)|(12)|([1-9]))\\.((?<!\\\\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\\d+)点)|(今年([零一二三四五六七八九十百千万]+|\\d+))|(\\d+[:：]\\d+(分|))|((\\d+):(\\d+))|(\\d+/\\d+/\\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\\d+分)+(\\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\\d*]世纪末?)|((\\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\\d+))|(([零一二三四五六七八九十百千万]+|\\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\\d+点)+(\\d+分)?(\\d+秒)?左右?)|((\\d+)年代)|(本月(\\d+))|(第(\\d+)天)|((\\d+)岁)|((\\d+)年(\\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(今年(\\d+)月(\\d+)日)|((\\d+)月(\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\\d+)年(\\d+)月(\\d+)日)|((\\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\\d+)点)|(去年(\\d+)月(\\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\\d+)日晚(\\d+)时)|(([零一二三四五六七八九十百千万]+|\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\\d+)月(\\d+)日)|(([零一二三四五六七八九十百千万]+|\\d+)周)|((\\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\\d+))|(长久)|(清晨)|((\\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\\d+)时)|(([零一二三四五六七八九十百千万]+|\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+))|(圣诞节)|(学期)|(\\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\\d+)月底)|(当天)|((\\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+)时)|(连[年月日夜])|((\\d+)年(\\d+)月(\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\\d+)月(\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+)时(\\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\\d+)时(\\d+)分)|(同日)|((\\d+)年(\\d+)月底)|((\\d+)分钟)|((\\d+)世纪)|(冬季)|(清明)(节)?|(青年节)|(教师节)|(中元节)|(端午)(节)?|(劳动节)|(7夕)(节)?|(建党节)|(建军节)|(中和节)|(圣诞)(节)?|(中秋)(节)?|(春节)|(元宵)(节)?|(航海日)|(儿童节)|(国庆)(节)?|(植树节)|(元旦)|(重阳节)|(妇女节)|(记者节)|(年代)|(([零一二三四五六七八九十百千万]+|\\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\\d+))|(([零一二三四五六七八九十百千万]+|\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[:：]\d+([:：]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?<!\\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)点)|(今年([零一二三四五六七八九十百千万]+|\d+))|(\d+[:：]\d+(分|))|((\d+):(\d+))|(\d+/\d+/\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(\d+)月|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\d+分)+(\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\d*]世纪末?)|((\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\d+点)+(\d+分)?(\d+秒)?左右?)|((\d+)年代)|(本月(\d+))|(第(\d+)天)|((\d+)岁)|((\d+)年(\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(\d+)月(\d+)日|(\d+)月(\d+)|(今年(\d+)月(\d+)日)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\d+)年(\d+)月(\d+)日)|((\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\d+)点)|(去年(\d+)月(\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\d+)日晚(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\d+)月(\d+)日)|(([零一二三四五六七八九十百千万]+|\d+)周)|((\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\d+))|(长久)|(清晨)|((\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+))|(圣诞节)|(学期)|(\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\d+)月底)|(当天)|((\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时)|(连[年月日夜])|((\d+)年(\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时(\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时(\d+)分)|(同日)|((\d+)年(\d+)月底)|((\d+)分钟)|((\d+)世纪)|(冬季)|(年代)|(([零一二三四五六七八九十百千万]+|\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?))|(([零一二三四五六七八九十百千万]+|\d+)生)|(今天凌晨)|(过去(\d+)年)|(元月)|((\d+)月(\d+)日凌晨)|([前去今明后新]+年)|((\d+)月(\d+))|(夏天)|((\d+)日凌晨(\d+)时许)|((\d+)月(\d+)日)|((\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\d+)月(\d+)日)|((\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\d+)个礼拜)|(昨日)|([年月]初)|((\d+)年的(\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\d+)月份)|(今年(\d+)月(\d+)号)|(今年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)月底)|(未来(\d+)年)|(第([零一二三四五六七八九十百千万]+|\d+)季)|(\d?多年)|(([零一二三四五六七八九十百千万]+|\d+)个星期)|((\d+)年([零一二三四五六七八九十百千万]+|\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)年(\d+)月(\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)天)|(工作日)|(稍后)|((\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)刻)|(很久)|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\d+)月(\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(古代)|(两天)|(\d+个?(小时|星期))|((\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\d+天)+(\d+点)?(\d+分)?(\d+秒)?)|((\d+)日([零一二三四五六七八九十百千万]+|\d+)时)|((\d+)周年)|(([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)日)|(去年(\d+)月)|(过去([零一二三四五六七八九十百千万]+|\d+)年)|((\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\d+)日)|(第([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)点(\d+)分)|([长近多]年)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\d+)早些时候)|((\d+)([零一二三四五六七八九十百千万]+|\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\d+)个小时)|(([零一二三四五六七八九十百千万]+|\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)个月)|((\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)时(\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)年)|(月底)|(这个月)|((\d+)年(\d+)个月)|(\d+大寿)|(周([零一二三四五六七八九十百千万]+|\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\d+)月)|((\d+)月(\d+)号)|((\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(下(个)?月)|(午后)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\d+:\d+:\d+)|(\d+/\d+/\d+:\d+:\d+.\d+)|(\?\?\?\?-\?\?-\?\?T\d+:\d+:\d+)|(\d+-\d+-\d+T\d+:\d+:\d+)|(\d+/\d+/\d+ \d+:\d+:\d+.\d+)|(\d+-\d+-\d+|[0-9]{8})|(((\d+)年)?((10)|(11)|(12)|([1-9]))月(\d+))|((\d[\.\-])?((10)|(11)|(12)|([1-9]))[\.\-](\d+))))|(([零一二三四五六七八九十百千万]+|\\d+)生)|(今天凌晨)|(过去(\\d+)年)|(元月)|((\\d+)月(\\d+)日凌晨)|([前去今明后新]+年)|((\\d+)月(\\d+))|(夏天)|((\\d+)日凌晨(\\d+)时许)|((\\d+)月(\\d+)日)|((\\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\\d+)月(\\d+)日)|((\\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\\d+)早)|(([零一二三四五六七八九十百千万]+|\\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\\d+)个礼拜)|(昨日)|([年月]初)|((\\d+)年的(\\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\\d+)月份)|(今年(\\d+)月(\\d+)号)|(今年([零一二三四五六七八九十百千万]+|\\d+)月)|((\\d+)月底)|(未来(\\d+)年)|(第([零一二三四五六七八九十百千万]+|\\d+)季)|(\\d?多年)|(([零一二三四五六七八九十百千万]+|\\d+)个星期)|((\\d+)年([零一二三四五六七八九十百千万]+|\\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\\d+)天)|((\\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\\d+)月)|((\\d+)年(\\d+)月(\\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)天)|(工作日)|(稍后)|((\\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)刻)|(很久)|((\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\\d+)月(\\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\\d+)时)|(古代)|(两天)|(\\d+个?(小时|星期))|((\\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\\d+天)+(\\d+点)?(\\d+分)?(\\d+秒)?)|((\\d+)日([零一二三四五六七八九十百千万]+|\\d+)时)|((\\d+)周年)|(([零一二三四五六七八九十百千万]+|\\d+)早)|(([零一二三四五六七八九十百千万]+|\\d+)日)|(去年(\\d+)月)|(过去([零一二三四五六七八九十百千万]+|\\d+)年)|((\\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\\d+)日)|(第([零一二三四五六七八九十百千万]+|\\d+)天)|((\\d+)点(\\d+)分)|([长近多]年)|((\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\\d+)早些时候)|((\\d+)([零一二三四五六七八九十百千万]+|\\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\\d+)个小时)|(([零一二三四五六七八九十百千万]+|\\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\\d+)个月)|((\\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\\d+)月)|((\\d+)时(\\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\\d+)年)|(月底)|(这个月)|((\\d+)年(\\d+)个月)|(\\d+大寿)|(周([零一二三四五六七八九十百千万]+|\\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\\d+)月)|((\\d+)月(\\d+)号)|((\\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(下(个)?月)|(午后)|((\\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(\\d+秒)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\\d+:\\d+:\\d+)|(\\d+/\\d+/\\d+:\\d+:\\d+.\\d+)|(\\?\\?\\?\\?-\\?\\?-\\?\\?T\\d+:\\d+:\\d+)|(\\d+-\\d+-\\d+T\\d+:\\d+:\\d+)|(\\d+/\\d+/\\d+ \\d+:\\d+:\\d+.\\d+)|(\\d+-\\d+-\\d+|[0-9]{8})|(((\\d+)年)?((10)|(11)|(12)|([1-9]))月(\\d+))|((\\d[\\.\\-])?((10)|(11)|(12)|([1-9]))[\\.\\-](\\d+))')
88 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: TimeConverter
 3 | Version: 1.1.0
 4 | Summary: ...
 5 | Home-page: http://test.com
 6 | Author: test
 7 | Author-email: test@gmail.com
 8 | License: MIT Licence
 9 | Description: ...
10 | Keywords: time,nlp
11 | Platform: any
12 | Classifier: Programming Language :: Python :: 2.6
13 | Classifier: Programming Language :: Python :: 2.7
14 | Classifier: Programming Language :: Python :: 3.6
15 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LunarSolarConverter.py
 2 | README.md
 3 | RangeTimeEnum.py
 4 | StringPreHandler.py
 5 | Test.py
 6 | TimeNormalizer.py
 7 | TimePoint.py
 8 | TimeUnit.py
 9 | __init__.py
10 | setup.py
11 | TimeConverter.egg-info/PKG-INFO
12 | TimeConverter.egg-info/SOURCES.txt
13 | TimeConverter.egg-info/dependency_links.txt
14 | TimeConverter.egg-info/not-zip-safe
15 | TimeConverter.egg-info/requires.txt
16 | TimeConverter.egg-info/top_level.txt
17 | resource/__init__.py


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | regex>=2017
2 | arrow>=0.10
3 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | 
2 | resource
3 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimeNormalizer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/11/20 16:39
  4 | # @Author  : zhm
  5 | # @File    : TimeNormalizer.py
  6 | # @Software: PyCharm
  7 | import pickle
  8 | import regex as re
  9 | import arrow
 10 | import json
 11 | import os
 12 | import codecs
 13 | 
 14 | 
 15 | from cocoNLP.config.basic.time_nlp.StringPreHandler import StringPreHandler
 16 | from cocoNLP.config.basic.time_nlp.TimePoint import TimePoint
 17 | from cocoNLP.config.basic.time_nlp.TimeUnit import TimeUnit
 18 | 
 19 | # 时间表达式识别的主要工作类
 20 | 
 21 | 
 22 | class TimeNormalizer:
 23 |     def __init__(self, isPreferFuture=True):
 24 |         self.isPreferFuture = isPreferFuture
 25 |         self.pattern, self.holi_solar, self.holi_lunar = self.init()
 26 | 
 27 |     # 这里对一些不规范的表达做转换
 28 |     def _filter(self, input_query):
 29 |         # 这里对于下个周末这种做转化 把个给移除掉
 30 |         input_query = StringPreHandler.numberTranslator(input_query)
 31 | 
 32 |         rule = u"[0-9]月[0-9]"
 33 |         pattern = re.compile(rule)
 34 |         match = pattern.search(input_query)
 35 |         if match != None:
 36 |             index = input_query.find('月')
 37 |             rule = u"日|号"
 38 |             pattern = re.compile(rule)
 39 |             match = pattern.search(input_query[index:])
 40 |             if match == None:
 41 |                 rule = u"[0-9]月[0-9]+"
 42 |                 pattern = re.compile(rule)
 43 |                 match = pattern.search(input_query)
 44 |                 if match != None:
 45 |                     end = match.span()[1]
 46 |                     input_query = input_query[:end] + '号' + input_query[end:]
 47 | 
 48 |         rule = u"月"
 49 |         pattern = re.compile(rule)
 50 |         match = pattern.search(input_query)
 51 |         if match == None:
 52 |             input_query = input_query.replace('个', '')
 53 | 
 54 |         input_query = input_query.replace('中旬', '15号')
 55 |         input_query = input_query.replace('傍晚', '午后')
 56 |         input_query = input_query.replace('大年', '')
 57 |         input_query = input_query.replace('五一', '劳动节')
 58 |         input_query = input_query.replace('白天', '早上')
 59 |         input_query = input_query.replace('：', ':')
 60 |         return input_query
 61 | 
 62 |     def init(self):
 63 |         fpath = os.path.dirname(__file__) + '/resource/reg.pkl'
 64 |         # print(os.path.dirname(__file__))
 65 |         try:
 66 |             with open(fpath, 'rb') as f:
 67 |                 pattern = pickle.load(f)
 68 |         except:
 69 |             with codecs.open(os.path.dirname(__file__) + '/resource/regex.txt', 'r', 'utf-8-sig') as f:
 70 |                 content = f.read()
 71 |             p = re.compile(content)
 72 |             with open(fpath, 'wb') as f:
 73 |                 pickle.dump(p, f)
 74 |             with open(fpath, 'rb') as f:
 75 |                 pattern = pickle.load(f)
 76 |         with codecs.open(os.path.dirname(__file__) + '/resource/holi_solar.json', 'r', 'utf-8-sig') as f:
 77 |             holi_solar = json.load(f)
 78 |         with codecs.open(os.path.dirname(__file__) + '/resource/holi_lunar.json', 'r', 'utf-8-sig') as f:
 79 |             holi_lunar = json.load(f)
 80 |         return pattern, holi_solar, holi_lunar
 81 | 
 82 |     def parse(self, target, timeBase=None):
 83 |         """
 84 |         TimeNormalizer的构造方法，timeBase取默认的系统当前时间
 85 |         :param timeBase: 基准时间点
 86 |         :param target: 待分析字符串
 87 |         :return: 时间单元数组
 88 |         """
 89 |         if timeBase is None:
 90 |             timeBase = arrow.now()
 91 |         self.isTimeSpan = False
 92 |         self.invalidSpan = False
 93 |         self.timeSpan = ''
 94 |         self.target = self._filter(target)
 95 |         self.timeBase = arrow.get(timeBase).format('YYYY-M-D-H-m-s')
 96 |         self.nowTime = timeBase
 97 |         self.oldTimeBase = self.timeBase
 98 |         self.__preHandling()
 99 |         self.timeToken = self.__timeEx()
100 |         dic = {}
101 |         res = self.timeToken
102 | 
103 |         if self.isTimeSpan:
104 | 
105 |             if self.invalidSpan:
106 |                 dic['error'] = 'no time pattern could be extracted.'
107 |             else:
108 |                 result = {}
109 |                 dic['type'] = 'timedelta'
110 |                 dic['timedelta'] = self.timeSpan
111 |                 # print(dic['timedelta'])
112 |                 index = dic['timedelta'].find('days')
113 | 
114 |                 days = int(dic['timedelta'][:index-1])
115 |                 result['year'] = int(days / 365)
116 |                 result['month'] = int(days / 30 - result['year'] * 12)
117 |                 result['day'] = int(days - result['year']
118 |                                     * 365 - result['month'] * 30)
119 |                 index = dic['timedelta'].find(',')
120 |                 time = dic['timedelta'][index+1:]
121 |                 time = time.split(':')
122 |                 result['hour'] = int(time[0])
123 |                 result['minute'] = int(time[1])
124 |                 result['second'] = int(time[2])
125 |                 dic['timedelta'] = result
126 |         else:
127 |             if len(res) == 0:
128 |                 dic['error'] = 'no time pattern could be extracted.'
129 |             elif len(res) == 1:
130 |                 dic['type'] = 'timestamp'
131 |                 dic['timestamp'] = res[0].time.format("YYYY-MM-DD HH:mm:ss")
132 |             else:
133 |                 dic['type'] = 'timespan'
134 |                 dic['timespan'] = [res[0].time.format(
135 |                     "YYYY-MM-DD HH:mm:ss"), res[1].time.format("YYYY-MM-DD HH:mm:ss")]
136 |         return json.dumps(dic)
137 | 
138 |     def __preHandling(self):
139 |         """
140 |         待匹配字符串的清理空白符和语气助词以及大写数字转化的预处理
141 |         :return:
142 |         """
143 |         self.target = StringPreHandler.delKeyword(
144 |             self.target, u"\\s+")  # 清理空白符
145 |         self.target = StringPreHandler.delKeyword(
146 |             self.target, u"[的]+")  # 清理语气助词
147 |         self.target = StringPreHandler.numberTranslator(self.target)  # 大写数字转化
148 | 
149 |     def __timeEx(self):
150 |         """
151 | 
152 |         :param target: 输入文本字符串
153 |         :param timeBase: 输入基准时间
154 |         :return: TimeUnit[]时间表达式类型数组
155 |         """
156 |         startline = -1
157 |         endline = -1
158 |         rpointer = 0
159 |         temp = []
160 | 
161 |         match = self.pattern.finditer(self.target)
162 |         for m in match:
163 |             startline = m.start()
164 |             if startline == endline:
165 |                 rpointer -= 1
166 |                 temp[rpointer] = temp[rpointer] + m.group()
167 |             else:
168 |                 temp.append(m.group())
169 |             endline = m.end()
170 |             rpointer += 1
171 |         res = []
172 |         # 时间上下文： 前一个识别出来的时间会是下一个时间的上下文，用于处理：周六3点到5点这样的多个时间的识别，第二个5点应识别到是周六的。
173 |         contextTp = TimePoint()
174 |         # print(self.timeBase)
175 |         # print('temp',temp)
176 |         for i in range(0, rpointer):
177 |             # 这里是一个类嵌套了一个类
178 |             res.append(TimeUnit(temp[i], self, contextTp))
179 |             # res[i].tp.tunit[3] = -1
180 |             contextTp = res[i].tp
181 |             # print(self.nowTime.year)
182 |             # print(contextTp.tunit)
183 |         res = self.__filterTimeUnit(res)
184 | 
185 |         return res
186 | 
187 |     def __filterTimeUnit(self, tu_arr):
188 |         """
189 |         过滤timeUnit中无用的识别词。无用识别词识别出的时间是1970.01.01 00:00:00(fastTime=0)
190 |         :param tu_arr:
191 |         :return:
192 |         """
193 |         if (tu_arr is None) or (len(tu_arr) < 1):
194 |             return tu_arr
195 |         res = []
196 |         for tu in tu_arr:
197 |             if tu.time.timestamp != 0:
198 |                 res.append(tu)
199 |         return res
200 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/TimePoint.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2017/11/20 15:37
 4 | # @Author  : zhm
 5 | # @File    : TimePoint.py
 6 | # @Software: PyCharm
 7 | 
 8 | 
 9 | #  * 时间表达式单元规范化对应的内部类,
10 | #  * 对应时间表达式规范化的每个字段，
11 | #  * 六个字段分别是：年-月-日-时-分-秒，
12 | #  * 每个字段初始化为-1
13 | class TimePoint:
14 |     def __init__(self):
15 |         self.tunit = [-1, -1, -1, -1, -1, -1]
16 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time    : 2017/11/23 13:22
4 | # @Author  : zhm
5 | # @File    : __init__.py
6 | # @Software: PyCharm


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time    : 2017/12/5 17:29
4 | # @Author  : zhm
5 | # @File    : __init__.py
6 | # @Software: PyCharm


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/holi_lunar.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "中和节": "02-02", 
 3 |   "中秋节": "08-15", 
 4 |   "中元节": "07-15", 
 5 |   "端午节": "05-05", 
 6 |   "春节": "01-01", 
 7 |   "元宵节": "01-15", 
 8 |   "重阳节": "09-09", 
 9 |   "7夕节": "07-07",
10 |   "初1节": "01-01",
11 |   "初2节": "01-02",
12 |   "初3节": "01-03",
13 |   "初4节": "01-04",
14 |   "初5节": "01-05",
15 |   "初6节": "01-06",
16 |   "初7节": "01-07",
17 |   "初8节": "01-08",
18 |   "初9节": "01-09",
19 |   "初10节": "01-10",
20 |   "初11节": "01-11",
21 |   "初12节": "01-12",
22 |   "初13节": "01-13",
23 |   "初14节": "01-14",
24 |   "初15节": "01-15"
25 | }
26 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/holi_solar.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "植树节": "03-12", 
 3 |   "圣诞节": "12-25", 
 4 |   "青年节": "05-04", 
 5 |   "教师节": "09-10", 
 6 |   "儿童节": "06-01", 
 7 |   "元旦节": "01-01", 
 8 |   "国庆节": "10-01", 
 9 |   "劳动节": "05-01", 
10 |   "妇女节": "03-08", 
11 |   "建军节": "08-01", 
12 |   "航海日节": "07-11", 
13 |   "建党节": "07-01", 
14 |   "记者节": "11-08",
15 |   "情人节":"02-14",
16 |   "母亲节":"05-11"
17 | }
18 | 


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/reg.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/basic/time_nlp/resource/reg.pkl


--------------------------------------------------------------------------------
/cocoNLP/config/basic/time_nlp/resource/regex.txt:
--------------------------------------------------------------------------------
1 | ((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[:：]\d+([:：]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?<!\\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)点)|(今年([零一二三四五六七八九十百千万]+|\d+))|(\d+[:：]\d+(分|))|((\d+):(\d+))|(\d+/\d+/\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\d+分)+(\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\d*]世纪末?)|((\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\d+点)+(\d+分)?(\d+秒)?左右?)|((\d+)年代)|(本月(\d+))|(第(\d+)天)|((\d+)岁)|((\d+)年(\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(今年(\d+)月(\d+)日)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\d+)年(\d+)月(\d+)日)|((\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\d+)点)|(去年(\d+)月(\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\d+)日晚(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\d+)月(\d+)日)|(([零一二三四五六七八九十百千万]+|\d+)周)|((\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\d+))|(长久)|(清晨)|((\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+))|(圣诞节)|(学期)|(\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\d+)月底)|(当天)|((\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时)|(连[年月日夜])|((\d+)年(\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时(\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时(\d+)分)|(同日)|((\d+)年(\d+)月底)|((\d+)分钟)|((\d+)世纪)|(冬季)|(清明)(节)?|(立春)|(雨水)|(惊蛰)|(春分)|(谷雨)|(立夏)|(小满 )|(芒种)|(夏至)|(小暑)|(大暑)|(立秋)|(处暑)|(白露)|(秋分)|(寒露)|(霜降)|(立冬)|(小雪)|(大雪)|(冬至)|(小寒)|(大寒)|(青年节)|(教师节)|(中元节)|(端午)(节)?|(劳动节)|(7夕)(节)?|(建党节)|(建军节)|(初13)|(初14)|(初15)|(初12)|(初11)|(初9)|(初8)|(初7)|(初6)|(初5)|(初4)|(初3)|(初2)|(初1)|(情人节)|(母亲节)|(中和节)|(圣诞)(节)?|(中秋)(节)?|(春节)|(元宵)(节)?|(航海日)|(儿童节)|(国庆)(节)?|(植树节)|(元旦)|(重阳节)|(妇女节)|(记者节)|(年代)|(([零一二三四五六七八九十百千万]+|\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[:：]\d+([:：]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?<!\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)点)|(今年([零一二三四五六七八九十百千万]+|\d+))|(\d+[:：]\d+(分|))|((\d+):(\d+))|(\d+/\d+/\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(\d+)月|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\d+分)+(\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\d*]世纪末?)|((\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\d+点)+(\d+分)?(\d+秒)?左右?)|((\d+)年代)|(本月(\d+))|(第(\d+)天)|((\d+)岁)|((\d+)年(\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(\d+)月(\d+)日|(\d+)月(\d+)|(今年(\d+)月(\d+)日)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\d+)年(\d+)月(\d+)日)|((\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\d+)点)|(去年(\d+)月(\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\d+)日晚(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\d+)月(\d+)日)|(([零一二三四五六七八九十百千万]+|\d+)周)|((\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\d+))|(长久)|(清晨)|((\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+))|(圣诞节)|(学期)|(\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\d+)月底)|(当天)|((\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时)|(连[年月日夜])|((\d+)年(\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时(\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时(\d+)分)|(同日)|((\d+)年(\d+)月底)|((\d+)分钟)|((\d+)世纪)|(冬季)|(年代)|(([零一二三四五六七八九十百千万]+|\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?))|(([零一二三四五六七八九十百千万]+|\d+)生)|(今天凌晨)|(过去(\d+)年)|(元月)|((\d+)月(\d+)日凌晨)|([前去今明后新]+年)|((\d+)月(\d+))|(夏天)|((\d+)日凌晨(\d+)时许)|((\d+)月(\d+)日)|((\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\d+)月(\d+)日)|((\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\d+)个礼拜)|(昨日)|([年月]初)|((\d+)年的(\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\d+)月份)|(今年(\d+)月(\d+)号)|(今年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)月底)|(未来(\d+)年)|(第([零一二三四五六七八九十百千万]+|\d+)季)|(\d?多年)|(([零一二三四五六七八九十百千万]+|\d+)个星期)|((\d+)年([零一二三四五六七八九十百千万]+|\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)年(\d+)月(\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)天)|(工作日)|(稍后)|((\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)刻)|(很久)|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\d+)月(\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(古代)|(两天)|(\d+个?(小时|星期))|((\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\d+天)+(\d+点)?(\d+分)?(\d+秒)?)|((\d+)日([零一二三四五六七八九十百千万]+|\d+)时)|((\d+)周年)|(([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)日)|(去年(\d+)月)|(过去([零一二三四五六七八九十百千万]+|\d+)年)|((\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\d+)日)|(第([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)点(\d+)分)|([长近多]年)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\d+)早些时候)|((\d+)([零一二三四五六七八九十百千万]+|\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\d+)个小时)|(([零一二三四五六七八九十百千万]+|\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)个月)|((\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)时(\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)年)|(月底)|(这个月)|((\d+)年(\d+)个月)|(\d+大寿)|(周([零一二三四五六七八九十百千万]+|\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\d+)月)|((\d+)月(\d+)号)|((\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(午后)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\d+:\d+:\d+)|(\d+/\d+/\d+:\d+:\d+.\d+)|(\?\?\?\?-\?\?-\?\?T\d+:\d+:\d+)|(\d+-\d+-\d+T\d+:\d+:\d+)|(\d+/\d+/\d+ \d+:\d+:\d+.\d+)|(\d+-\d+-\d+|[0-9]{8})|(((\d+)年)?((10)|(11)|(12)|([1-9]))月(\d+))|((\d[\.\-])?((10)|(11)|(12)|([1-9]))[\.\-](\d+))))|(([零一二三四五六七八九十百千万]+|\d+)生)|(今天凌晨)|(过去(\d+)年)|(元月)|((\d+)月(\d+)日凌晨)|([前去今明后新]+年)|((\d+)月(\d+))|(夏天)|((\d+)日凌晨(\d+)时许)|((\d+)月(\d+)日)|((\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\d+)月(\d+)日)|((\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\d+)个礼拜)|(昨日)|([年月]初)|((\d+)年的(\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\d+)月份)|(今年(\d+)月(\d+)号)|(今年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)月底)|(未来(\d+)年)|(第([零一二三四五六七八九十百千万]+|\d+)季)|(\d?多年)|(([零一二三四五六七八九十百千万]+|\d+)个星期)|((\d+)年([零一二三四五六七八九十百千万]+|\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)年(\d+)月(\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)天)|(工作日)|(稍后)|((\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)刻)|(很久)|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\d+)月(\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(古代)|(两天)|(\d+个?(小时|星期))|((\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\d+天)+(\d+点)?(\d+分)?(\d+秒)?)|((\d+)日([零一二三四五六七八九十百千万]+|\d+)时)|((\d+)周年)|(([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)日)|(去年(\d+)月)|(过去([零一二三四五六七八九十百千万]+|\d+)年)|((\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\d+)日)|(第([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)点(\d+)分)|([长近多]年)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\d+)早些时候)|((\d+)([零一二三四五六七八九十百千万]+|\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\d+)个小时)|(([零一二三四五六七八九十百千万]+|\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)个月)|((\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)时(\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)年)|(月底)|(这个月)|((\d+)年(\d+)个月)|(\d+大寿)|(周([零一二三四五六七八九十百千万]+|\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\d+)月)|((\d+)月(\d+)号)|((\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(下*个?月)|(午后)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(\d+秒)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\d+:\d+:\d+)|(\d+/\d+/\d+:\d+:\d+.\d+)|(\?\?\?\?-\?\?-\?\?T\d+:\d+:\d+)|(\d+-\d+-\d+T\d+:\d+:\d+)|(\d+/\d+/\d+ \d+:\d+:\d+.\d+)|(\d+-\d+-\d+|[0-9]{8})|(((\d+)年)?((10)|(11)|(12)|([1-9]))月(\d+))|((\d[\.\-])?((10)|(11)|(12)|([1-9]))[\.\-](\d+))


--------------------------------------------------------------------------------
/cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc


--------------------------------------------------------------------------------
/cocoNLP/config/phrase/rake.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Implementation of Rapid Automatic Keyword Extraction algorithm.
  3 | 
  4 | As described in the paper `Automatic keyword extraction from individual
  5 | documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
  6 | """
  7 | 
  8 | import string
  9 | from collections import Counter, defaultdict
 10 | from itertools import chain, groupby, product
 11 | import jieba
 12 | import re
 13 | from enum import Enum
 14 | 
 15 | import os
 16 | import sys
 17 | import codecs
 18 | pwd_path = os.path.abspath(os.path.dirname(__file__))
 19 | 
 20 | 
 21 | class Metric(Enum):
 22 |     """Different metrics that can be used for ranking."""
 23 | 
 24 |     DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
 25 |     WORD_DEGREE = 1  # Uses d(w) alone as the metric
 26 |     WORD_FREQUENCY = 2  # Uses f(w) alone as the metric
 27 | 
 28 | 
 29 | class Rake(object):
 30 |     """Rapid Automatic Keyword Extraction Algorithm."""
 31 | 
 32 |     def __init__(
 33 |         self,
 34 |         punctuations=None,
 35 |         ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
 36 |         max_length=100000,
 37 |         min_length=1,
 38 |     ):
 39 |         """Constructor.
 40 | 
 41 |         :param stopwords: List of Words to be ignored for keyword extraction.
 42 |         :param punctuations: Punctuations to be ignored for keyword extraction.
 43 |         :param language: Language to be used for stopwords
 44 |         :param max_length: Maximum limit on the number of words in a phrase
 45 |                            (Inclusive. Defaults to 100000)
 46 |         :param min_length: Minimum limit on the number of words in a phrase
 47 |                            (Inclusive. Defaults to 1)
 48 |         """
 49 |         # By default use degree to frequency ratio as the metric.
 50 |         if isinstance(ranking_metric, Metric):
 51 |             self.metric = ranking_metric
 52 |         else:
 53 |             self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO
 54 | 
 55 |         # If stopwords not provided we use language stopwords by default.
 56 |         self.stopwords = self.load_stopwords()
 57 | 
 58 |         # If punctuations are not provided we ignore all punctuation symbols.
 59 |         self.punctuations = punctuations
 60 |         if self.punctuations is None:
 61 |             self.punctuations = string.punctuation + ',，。！？!?' # add chinese punctuation
 62 | 
 63 |         # All things which act as sentence breaks during keyword extraction.
 64 |         self.to_ignore = set(chain(self.stopwords, self.punctuations))
 65 | 
 66 |         # Assign min or max length to the attributes
 67 |         self.min_length = min_length
 68 |         self.max_length = max_length
 69 | 
 70 |         # Stuff to be extracted from the provided text.
 71 |         self.frequency_dist = None
 72 |         self.degree = None
 73 |         self.rank_list = None
 74 |         self.ranked_phrases = None
 75 | 
 76 |     def load_stopwords(self, path = pwd_path+'/'+'data/stopwords.txt'):
 77 |         """load stopwords list
 78 |         eg: stopwords_list = load_stopwords(path)
 79 | 
 80 |         :param path: 停用词表path，提前整理好的，直接读进来
 81 |         :return: list<stopwords>
 82 |         """
 83 |         with codecs.open(path, 'r', 'utf-8-sig') as f:
 84 |             stopwords = f.readlines()
 85 |         stopwords_list = []
 86 |         for word in stopwords:
 87 |             stopwords_list.append(word.replace('\n', '').replace(' ', ''))
 88 | 
 89 |         return stopwords_list
 90 | 
 91 |     def tokenize_chinese(self,text):
 92 | 
 93 |         sentences = re.split(r'(，|。|！|\!|\.|？|\?)', text)  # 保留分割符
 94 | 
 95 |         new_sents = []
 96 |         for i in range(int(len(sentences) / 2)):
 97 |             sent = sentences[2 * i] + sentences[2 * i + 1]
 98 |             new_sents.append(sent)
 99 |         return sentences
100 | 
101 |     def extract_keywords_from_text(self, text, min_len, max_len):
102 |         """Method to extract keywords from the text provided.
103 | 
104 |         :param text: Text to extract keywords from, provided as a string.
105 |         """
106 |         sentences = self.tokenize_chinese(text)
107 |         self.extract_keywords_from_sentences(sentences, min_len, max_len)
108 | 
109 |     def extract_keywords_from_sentences(self, sentences, min_len, max_len):
110 |         """Method to extract keywords from the list of sentences provided.
111 | 
112 |         :param sentences: Text to extraxt keywords from, provided as a list
113 |                           of strings, where each string is a sentence.
114 |         """
115 |         phrase_list = self._generate_phrases(sentences, min_len, max_len)
116 |         self._build_frequency_dist(phrase_list)
117 |         self._build_word_co_occurance_graph(phrase_list)
118 |         self._build_ranklist(phrase_list)
119 | 
120 |     def get_ranked_phrases(self):
121 |         """Method to fetch ranked keyword strings.
122 | 
123 |         :return: List of strings where each string represents an extracted
124 |                  keyword string.
125 |         """
126 |         return self.ranked_phrases
127 | 
128 |     def get_ranked_phrases_with_scores(self):
129 |         """Method to fetch ranked keyword strings along with their scores.
130 | 
131 |         :return: List of tuples where each tuple is formed of an extracted
132 |                  keyword string and its score. Ex: (5.68, 'Four Scoures')
133 |         """
134 |         return self.rank_list
135 | 
136 |     def get_word_frequency_distribution(self):
137 |         """Method to fetch the word frequency distribution in the given text.
138 | 
139 |         :return: Dictionary (defaultdict) of the format `word -> frequency`.
140 |         """
141 |         return self.frequency_dist
142 | 
143 |     def get_word_degrees(self):
144 |         """Method to fetch the degree of words in the given text. Degree can be
145 |         defined as sum of co-occurances of the word with other words in the
146 |         given text.
147 | 
148 |         :return: Dictionary (defaultdict) of the format `word -> degree`.
149 |         """
150 |         return self.degree
151 | 
152 |     def _build_frequency_dist(self, phrase_list):
153 |         """Builds frequency distribution of the words in the given body of text.
154 | 
155 |         :param phrase_list: List of List of strings where each sublist is a
156 |                             collection of words which form a contender phrase.
157 |         """
158 |         self.frequency_dist = Counter(chain.from_iterable(phrase_list))
159 | 
160 |     def _build_word_co_occurance_graph(self, phrase_list):
161 |         """Builds the co-occurance graph of words in the given body of text to
162 |         compute degree of each word.
163 | 
164 |         :param phrase_list: List of List of strings where each sublist is a
165 |                             collection of words which form a contender phrase.
166 |         """
167 |         co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
168 |         for phrase in phrase_list:
169 |             # For each phrase in the phrase list, count co-occurances of the
170 |             # word with other words in the phrase.
171 |             #
172 |             # Note: Keep the co-occurances graph as is, to help facilitate its
173 |             # use in other creative ways if required later.
174 |             for (word, coword) in product(phrase, phrase):
175 |                 co_occurance_graph[word][coword] += 1
176 |         self.degree = defaultdict(lambda: 0)
177 |         for key in co_occurance_graph:
178 |             self.degree[key] = sum(co_occurance_graph[key].values())
179 | 
180 |     def _build_ranklist(self, phrase_list):
181 |         """Method to rank each contender phrase using the formula
182 | 
183 |               phrase_score = sum of scores of words in the phrase.
184 |               word_score = d(w)/f(w) where d is degree and f is frequency.
185 | 
186 |         :param phrase_list: List of List of strings where each sublist is a
187 |                             collection of words which form a contender phrase.
188 |         """
189 |         self.rank_list = []
190 |         for phrase in phrase_list:
191 |             rank = 0.0
192 |             for word in phrase:
193 |                 if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
194 |                     rank += 1.0 * self.degree[word] / self.frequency_dist[word]
195 |                 elif self.metric == Metric.WORD_DEGREE:
196 |                     rank += 1.0 * self.degree[word]
197 |                 else:
198 |                     rank += 1.0 * self.frequency_dist[word]
199 |             self.rank_list.append((rank, " ".join(phrase)))
200 |         self.rank_list.sort(reverse=True)
201 |         self.ranked_phrases = [ph[1] for ph in self.rank_list]
202 | 
203 |     def _generate_phrases(self, sentences, min_len, max_len):
204 |         """Method to generate contender phrases given the sentences of the text
205 |         document.
206 | 
207 |         :param sentences: List of strings where each string represents a
208 |                           sentence which forms the text.
209 |         :return: Set of string tuples where each tuple is a collection
210 |                  of words forming a contender phrase.
211 |         """
212 |         phrase_list = set()
213 |         # Create contender phrases from sentences.
214 |         for sentence in sentences:
215 |             word_list = [word for word in list(jieba.cut(sentence))]
216 |             phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len))
217 |         return phrase_list
218 | 
219 |     def _get_phrase_list_from_words(self, word_list, min_len, max_len):
220 |         """Method to create contender phrases from the list of words that form
221 |         a sentence by dropping stopwords and punctuations and grouping the left
222 |         words into phrases. Only phrases in the given length range (both limits
223 |         inclusive) would be considered to build co-occurrence matrix. Ex:
224 | 
225 |         Sentence: Red apples, are good in flavour.
226 |         List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
227 |         List after dropping punctuations and stopwords.
228 |         List of words: ['red', 'apples', *, *, good, *, 'flavour']
229 |         List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
230 | 
231 |         List of phrases with a correct length:
232 |         For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
233 |         For the range [1, 1]: [('good',), ('flavour',)]
234 |         For the range [2, 2]: [('red', 'apples')]
235 | 
236 |         :param word_list: List of words which form a sentence when joined in
237 |                           the same order.
238 |         :return: List of contender phrases that are formed after dropping
239 |                  stopwords and punctuations.
240 |         """
241 |         groups = groupby(word_list, lambda x: x not in self.to_ignore)
242 |         phrases = []
243 |         for group in groups:
244 |             tmp = tuple(group[1])
245 |             len_g1 = len(list(tmp))
246 |             if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase
247 |                 phrases.append(tuple(tmp))
248 | 
249 |         return list(
250 |             filter(
251 |                 lambda x: self.min_length <= len(x) <= self.max_length, phrases
252 |             )
253 |         )
254 | 


--------------------------------------------------------------------------------
/cocoNLP/extractor.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import re
  3 | from phone import Phone
  4 | from itertools import groupby
  5 | import phonenumbers
  6 | from pyhanlp import *
  7 | from cocoNLP.config.basic.time_nlp.TimeNormalizer import *
  8 | 
  9 | 
 10 | 
 11 | __all__ = ['extract_email', 'replace_chinese','extract_cellphone', 'extract_cellphone', 'extract_cellphone_location',
 12 |            'get_location', 'extract_locations', 'replace_cellphoneNum', 'extract_time', 'extract_name', 'most_common']
 13 | 
 14 | class extractor():
 15 |     def __init__(self):
 16 |         pass
 17 | 
 18 |     def extract_email(self, text):
 19 |         """
 20 |         extract all email addresses from texts<string>
 21 |         eg: extract_email('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 22 | 
 23 | 
 24 |         :param: raw_text
 25 |         :return: email_addresses_list<list>
 26 |         """
 27 |         if text=='':
 28 |             return []
 29 |         eng_texts = self.replace_chinese(text)
 30 |         eng_texts = eng_texts.replace(' at ','@').replace(' dot ','.')
 31 |         sep = ',!?:; ，。！？《》、|\\/'
 32 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
 33 | 
 34 |         email_pattern = r'^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z_-]+)+$'
 35 | 
 36 |         emails = []
 37 |         for eng_text in eng_split_texts:
 38 |             result = re.match(email_pattern, eng_text, flags=0)
 39 |             if result:
 40 |                 emails.append(result.string)
 41 |         return emails
 42 | 
 43 |     def extract_ids(self, text):
 44 |         """
 45 |         extract all ids from texts<string>
 46 |         eg: extract_ids('my ids is 150404198812011101 m and dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 47 | 
 48 | 
 49 |         :param: raw_text
 50 |         :return: ids_list<list>
 51 |         """
 52 |         if text == '':
 53 |             return []
 54 |         eng_texts = self.replace_chinese(text)
 55 |         sep = ',!?:; ：，.。！？《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 56 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
 57 |         eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele) == 18]
 58 | 
 59 |         id_pattern = r'^[1-9][0-7]\d{4}((19\d{2}(0[13-9]|1[012])(0[1-9]|[12]\d|30))|(19\d{2}(0[13578]|1[02])31)|(19\d{2}02(0[1-9]|1\d|2[0-8]))|(19([13579][26]|[2468][048]|0[48])0229))\d{3}(\d|X|x)?$'
 60 | 
 61 |         phones = []
 62 |         for eng_text in eng_split_texts_clean:
 63 |             result = re.match(id_pattern, eng_text, flags=0)
 64 |             if result:
 65 |                 phones.append(result.string.replace('+86','').replace('-',''))
 66 |         return phones
 67 | 
 68 |     def replace_chinese(self, text):
 69 |         """
 70 |         remove all the chinese characters in text
 71 |         eg: replace_chinese('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 72 | 
 73 | 
 74 |         :param: raw_text
 75 |         :return: text_without_chinese<str>
 76 |         """
 77 |         if text=='':
 78 |             return []
 79 |         filtrate = re.compile(u'[\u4E00-\u9FA5]')
 80 |         text_without_chinese = filtrate.sub(r' ', text)
 81 |         return text_without_chinese
 82 | 
 83 |     def extract_cellphone(self, text, nation):
 84 |         """
 85 |         extract all cell phone numbers from texts<string>
 86 |         eg: extract_email('my email address is sldisd@baidu.com and dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 87 | 
 88 | 
 89 |         :param: raw_text
 90 |         :return: email_addresses_list<list>
 91 |         """
 92 |         if text=='':
 93 |             return []
 94 |         eng_texts = self.replace_chinese(text)
 95 |         sep = ',!?:; ：，.。！？《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 96 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
 97 |         eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele)>=7 and len(ele)<17]
 98 |         if nation=='CHN':
 99 |             phone_pattern = r'^((\+86)?([- ])?)?(|(13[0-9])|(14[0-9])|(15[0-9])|(17[0-9])|(18[0-9])|(19[0-9]))([- ])?\d{3}([- ])?\d{4}([- ])?\d{4}$'
100 | 
101 |         phones = []
102 |         for eng_text in eng_split_texts_clean:
103 |             result = re.match(phone_pattern, eng_text, flags=0)
104 |             if result:
105 |                 phones.append(result.string.replace('+86','').replace('-',''))
106 |         return phones
107 | 
108 |     def extract_cellphone_location(self, phoneNum, nation='CHN'):
109 |         """
110 |         extract cellphone number locations according to the given number
111 |         eg: extract_cellphone_location('181000765143',nation=CHN)
112 | 
113 | 
114 |         :param: phoneNum<string>, nation<string>
115 |         :return: location<dict>{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}
116 | 
117 |         """
118 |         if nation=='CHN':
119 |             p = Phone()
120 |             loc_dict = p.find(phoneNum)
121 |         if nation!='CHN':
122 |             x = phonenumbers.parse(phoneNum, 'GB')
123 |             if phonenumbers.is_possible_number(x):
124 |                 loc_dict = x
125 |         # print(loc_dict)
126 |         return loc_dict
127 | 
128 |     def get_location(self, word_pos_list):
129 |         """
130 |         get location by the pos of the word, such as 'ns'
131 |         eg: get_location('内蒙古赤峰市松山区')
132 | 
133 | 
134 |         :param: word_pos_list<list>
135 |         :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
136 | 
137 |         """
138 |         location_list = []
139 |         if word_pos_list==[]:
140 |             return []
141 | 
142 |         for i,t in enumerate(word_pos_list):
143 |             word = t[0]
144 |             nature = t[1]
145 |             if nature == 'ns':
146 |                 loc_tmp = word
147 |                 count = i + 1
148 |                 while count < len(word_pos_list):
149 |                     next_word_pos = word_pos_list[count]
150 |                     next_pos = next_word_pos[1]
151 |                     next_word = next_word_pos[0]
152 |                     if next_pos=='ns' or 'n' == next_pos[0]:
153 |                         loc_tmp += next_word
154 |                     else:
155 |                         break
156 |                     count += 1
157 |                 location_list.append(loc_tmp)
158 | 
159 |         return location_list # max(location_list)
160 | 
161 |     def extract_locations(self, text):
162 |         """
163 |         extract locations by from texts
164 |         eg: extract_locations('我家住在陕西省安康市汉滨区。')
165 | 
166 | 
167 |         :param: raw_text<string>
168 |         :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
169 | 
170 |         """
171 |         if text=='':
172 |             return []
173 |         seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
174 |         location_list = self.get_location(seg_list)
175 |         return location_list
176 | 
177 |     def replace_cellphoneNum(self, text):
178 |         """
179 |         remove cellphone number from texts. If text contains cellphone No., the extract_time will report errors.
180 |         hence, we remove it here.
181 |         eg: extract_locations('我家住在陕西省安康市汉滨区，我的手机号是181-0006-5143。')
182 | 
183 | 
184 |         :param: raw_text<string>
185 |         :return: text_without_cellphone<string> eg: '我家住在陕西省安康市汉滨区，我的手机号是。'
186 | 
187 |         """
188 |         eng_texts = self.replace_chinese(text)
189 |         sep = ',!?:; ：，.。！？《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
190 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
191 |         eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele)>=7 and len(ele)<17]
192 |         for phone_num in eng_split_texts_clean:
193 |             text = text.replace(phone_num,'')
194 |         return text
195 | 
196 |     def replace_ids(self, text):
197 |         """
198 |         remove cellphone number from texts. If text contains cellphone No., the extract_time will report errors.
199 |         hence, we remove it here.
200 |         eg: extract_locations('我家住在陕西省安康市汉滨区，我的身份证号是150404198412011312。')
201 | 
202 | 
203 |         :param: raw_text<string>
204 |         :return: text_without_ids<string> eg: '我家住在陕西省安康市汉滨区，我的身份证号号是。'
205 | 
206 |         """
207 |         if text == '':
208 |             return []
209 |         eng_texts = self.replace_chinese(text)
210 |         sep = ',!?:; ：，.。！？《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
211 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
212 |         eng_split_texts_clean = [ele for ele in eng_split_texts if len(ele) == 18]
213 | 
214 |         id_pattern = r'^[1-9][0-7]\d{4}((19\d{2}(0[13-9]|1[012])(0[1-9]|[12]\d|30))|(19\d{2}(0[13578]|1[02])31)|(19\d{2}02(0[1-9]|1\d|2[0-8]))|(19([13579][26]|[2468][048]|0[48])0229))\d{3}(\d|X|x)?$'
215 |         ids = []
216 |         for eng_text in eng_split_texts_clean:
217 |             result = re.match(id_pattern, eng_text, flags=0)
218 |             if result:
219 |                 ids.append(result.string)
220 | 
221 |         for phone_num in ids:
222 |             text = text.replace(phone_num,'')
223 |         return text
224 | 
225 |     def extract_time(self, text):
226 |         """
227 |         extract timestamp from texts
228 |         eg: extract_time('我于2018年1月1日获得1000万美金奖励。')
229 | 
230 | 
231 |         :param: raw_text<string>
232 |         :return: time_info<time_dict> eg: {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"}
233 | 
234 |         """
235 |         if text=='':
236 |             return []
237 |         tmp_text = self.replace_cellphoneNum(text)
238 |         tmp_text = self.replace_ids(tmp_text)
239 |         tn = TimeNormalizer()
240 |         res = tn.parse(target=tmp_text)  # target为待分析语句，timeBase为基准时间默认是当前时间
241 |         return res
242 | 
243 |     def extract_name(self, text):
244 |         """
245 |         extract chinese names from texts
246 |         eg: extract_time('急寻王龙，短发，王龙，男，丢失发型短发，...如有线索，请迅速与警方联系：19909156745')
247 | 
248 | 
249 |         :param: raw_text<string>
250 |         :return: name_list<list> eg: ['王龙', '王龙']
251 | 
252 |         """
253 |         if text=='':
254 |             return []
255 |         seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
256 |         names = []
257 |         for ele_tup in seg_list:
258 |             if 'nr' in ele_tup[1]:
259 |                 names.append(ele_tup[0])
260 |                 # print(ele_tup[0],ele_tup[1])
261 |         return self.most_common(names)
262 | 
263 |     def most_common(self, content_list):
264 |         """
265 |         return the most common element in a list
266 |         eg: extract_time(['王龙'，'王龙'，'李二狗'])
267 | 
268 | 
269 |         :param: content_list<list>
270 |         :return: name<string> eg: '王龙'
271 |         """
272 |         if content_list==[]:
273 |             return None
274 |         if len(content_list)==0:
275 |             return None
276 |         return max(set(content_list), key=content_list.count)
277 | 
278 | 
279 | 
280 | 
281 | 
282 | if __name__ == '__main__':
283 | 
284 |     text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发，...如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com'
285 |     ex = extractor()
286 | 
287 |     emails = ex.extract_email(text)
288 |     cellphones = ex.extract_cellphone(text,nation='CHN')
289 |     cell_loc = []
290 |     for cell in cellphones:
291 |         cell_loc.append(ex.extract_cellphone_location(cell,'CHN'))
292 | 
293 |     locations = ex.extract_locations(text)
294 |     times = ex.extract_time(text)
295 |     names = ex.extract_name(text)
296 | 
297 |     result_dict = {}
298 |     result_dict['email'] = emails
299 |     result_dict['cellphone'] = cellphones
300 |     result_dict['cellphone_location'] = cell_loc
301 |     result_dict['location'] = locations
302 |     result_dict['time'] = times
303 |     result_dict['name'] = names
304 |     for key in result_dict.keys():
305 |         print(key,result_dict[key])


--------------------------------------------------------------------------------
/cocoNLP/extractor.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/cocoNLP/extractor.pyc


--------------------------------------------------------------------------------
/dist/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/.DS_Store


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.10.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.10.tar.gz


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.11.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.11.tar.gz


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.12.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.12.tar.gz


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.13.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.13.tar.gz


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include cocoNLP/config *
2 | recursive-include cocoNLP/config *
3 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: cocoNLP
 3 | Version: 0.0.9
 4 | Summary: Python implementation of many nlp algorithms
 5 | Home-page: https://github.com/fighting41love
 6 | Author: Yang Yang
 7 | Author-email: yangyangfuture@gmail.com
 8 | License: MIT
 9 | Description: UNKNOWN
10 | Keywords: nlp text-mining information extraction
11 | Platform: UNKNOWN
12 | Classifier: Intended Audience :: Developers
13 | Classifier: Intended Audience :: Education
14 | Classifier: License :: OSI Approved :: MIT License
15 | Classifier: Development Status :: 3 - Alpha
16 | Classifier: Operating System :: POSIX
17 | Classifier: Programming Language :: Python :: 2.7
18 | Classifier: Programming Language :: Python :: 3.4
19 | Classifier: Programming Language :: Python :: 3.5
20 | Classifier: Programming Language :: Python :: 3.6
21 | Classifier: Topic :: Software Development :: Build Tools
22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules
23 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: cocoNLP
 3 | Version: 0.0.9
 4 | Summary: Python implementation of many nlp algorithms
 5 | Home-page: https://github.com/fighting41love
 6 | Author: Yang Yang
 7 | Author-email: yangyangfuture@gmail.com
 8 | License: MIT
 9 | Description: UNKNOWN
10 | Keywords: nlp text-mining information extraction
11 | Platform: UNKNOWN
12 | Classifier: Intended Audience :: Developers
13 | Classifier: Intended Audience :: Education
14 | Classifier: License :: OSI Approved :: MIT License
15 | Classifier: Development Status :: 3 - Alpha
16 | Classifier: Operating System :: POSIX
17 | Classifier: Programming Language :: Python :: 2.7
18 | Classifier: Programming Language :: Python :: 3.4
19 | Classifier: Programming Language :: Python :: 3.5
20 | Classifier: Programming Language :: Python :: 3.6
21 | Classifier: Topic :: Software Development :: Build Tools
22 | Classifier: Topic :: Software Development :: Libraries :: Python Modules
23 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | MANIFEST.in
 2 | readme.md
 3 | requirements.txt
 4 | setup.py
 5 | test.py
 6 | cocoNLP/__init__.py
 7 | cocoNLP/__version__.py
 8 | cocoNLP/extractor.py
 9 | cocoNLP.egg-info/PKG-INFO
10 | cocoNLP.egg-info/SOURCES.txt
11 | cocoNLP.egg-info/dependency_links.txt
12 | cocoNLP.egg-info/requires.txt
13 | cocoNLP.egg-info/top_level.txt
14 | cocoNLP/config/basic/time_nlp/.DS_Store
15 | cocoNLP/config/basic/time_nlp/LunarSolarConverter.py
16 | cocoNLP/config/basic/time_nlp/README.md
17 | cocoNLP/config/basic/time_nlp/RangeTimeEnum.py
18 | cocoNLP/config/basic/time_nlp/StringPreHandler.py
19 | cocoNLP/config/basic/time_nlp/Test.py
20 | cocoNLP/config/basic/time_nlp/TimeNormalizer.py
21 | cocoNLP/config/basic/time_nlp/TimePoint.py
22 | cocoNLP/config/basic/time_nlp/TimeUnit.py
23 | cocoNLP/config/basic/time_nlp/__init__.py
24 | cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO
25 | cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt
26 | cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt
27 | cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe
28 | cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt
29 | cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt
30 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO
31 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt
32 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt
33 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe
34 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt
35 | cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt
36 | cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc
37 | cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc
38 | cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc
39 | cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc
40 | cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc
41 | cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc
42 | cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc
43 | cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc
44 | cocoNLP/config/basic/time_nlp/resource/__init__.py
45 | cocoNLP/config/basic/time_nlp/resource/holi_lunar.json
46 | cocoNLP/config/basic/time_nlp/resource/holi_solar.json
47 | cocoNLP/config/basic/time_nlp/resource/reg.pkl
48 | cocoNLP/config/basic/time_nlp/resource/regex.txt
49 | cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc
50 | cocoNLP/config/phrase/rake.py
51 | cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc
52 | cocoNLP/config/phrase/data/stopwords.txt


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | pyhanlp
3 | phone
4 | phonenumbers
5 | regex
6 | arrow
7 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | cocoNLP
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | # -*- coding: utf-8 -*-
10 | 
11 | """
12 | cocoNLP module
13 | :copyright: (c) 2018 by Yang Yang.
14 | :license: MIT, see LICENSE for more details.
15 | """
16 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/__version__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #                      _   _ _     ____
 3 | #   ___ ___   ___ ___ | \ | | |   |  _ \
 4 | #  / __/ _ \ / __/ _ \|  \| | |   | |_) |
 5 | # | (_| (_) | (_| (_) | |\  | |___|  __/
 6 | #  \___\___/ \___\___/|_| \_|_____|_|
 7 | 
 8 | 
 9 | 
10 | __title__ = "cocoNLP"
11 | __description__ = "Python implementation of many nlp algorithms"
12 | __url__ = "https://github.com/fighting41love"
13 | __version__ = "0.0.9"
14 | __author__ = "Yang Yang"
15 | __author_email__ = "yangyangfuture@gmail.com"
16 | __license__ = "MIT"
17 | __copyright__ = "Copyright 2018 Yang Yang"
18 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/.DS_Store


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: TimeConverter
 3 | Version: 1.1.0
 4 | Summary: ...
 5 | Home-page: http://test.com
 6 | Author: test
 7 | Author-email: test@gmail.com
 8 | License: MIT Licence
 9 | Description: ...
10 | Keywords: time,nlp
11 | Platform: any
12 | Classifier: Programming Language :: Python :: 2.6
13 | Classifier: Programming Language :: Python :: 2.7
14 | Classifier: Programming Language :: Python :: 3.6
15 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LunarSolarConverter.py
 2 | README.md
 3 | RangeTimeEnum.py
 4 | StringPreHandler.py
 5 | Test.py
 6 | TimeNormalizer.py
 7 | TimePoint.py
 8 | TimeUnit.py
 9 | __init__.py
10 | setup.py
11 | TimeConverter.egg-info/PKG-INFO
12 | TimeConverter.egg-info/SOURCES.txt
13 | TimeConverter.egg-info/dependency_links.txt
14 | TimeConverter.egg-info/not-zip-safe
15 | TimeConverter.egg-info/requires.txt
16 | TimeConverter.egg-info/top_level.txt
17 | resource/__init__.py


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/requires.txt:
--------------------------------------------------------------------------------
1 | regex>=2017
2 | arrow>=0.10
3 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/EGG-INFO/top_level.txt:
--------------------------------------------------------------------------------
1 | 
2 | resource
3 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/LunarSolarConverter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/12/11 11:08
  4 | # @Author  : zhm
  5 | # @File    : LunarSolarConverter.py
  6 | # @Software: PyCharm
  7 | from pprint import pprint
  8 | 
  9 | 
 10 | class Lunar:
 11 |     def __init__(self, lunarYear, lunarMonth, lunarDay, isleap):
 12 |         self.isleap = isleap
 13 |         self.lunarDay = lunarDay
 14 |         self.lunarMonth = lunarMonth
 15 |         self.lunarYear = lunarYear
 16 | 
 17 | 
 18 | class Solar:
 19 |     def __init__(self, solarYear, solarMonth, solarDay):
 20 |         self.solarDay = solarDay
 21 |         self.solarMonth = solarMonth
 22 |         self.solarYear = solarYear
 23 | 
 24 | 
 25 | def GetBitInt(data, length, shift):
 26 |     return (data & (((1 << length) - 1) << shift)) >> shift
 27 | 
 28 | 
 29 | def SolarToInt(y, m, d):
 30 |     m = (m + 9) % 12
 31 |     y -= int(m / 10)
 32 |     return 365 * y + int(y / 4) - int(y / 100) + int(y / 400) + int((m * 306 + 5) / 10) + (d - 1)
 33 | 
 34 | 
 35 | def SolarFromInt(g):
 36 |     y = int((10000 * g + 14780) / 3652425)
 37 |     ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))
 38 |     if ddd < 0:
 39 |         y -= 1
 40 |         ddd = g - (365 * y + int(y / 4) - int(y / 100) + int(y / 400))
 41 | 
 42 |     mi = int((100 * ddd + 52) / 3060)
 43 |     mm = (mi + 2) % 12 + 1
 44 |     y += int((mi + 2) / 12)
 45 |     dd = ddd - int((mi * 306 + 5) / 10) + 1
 46 |     solar = Solar(y, mm, dd)
 47 |     return solar
 48 | 
 49 | 
 50 | class LunarSolarConverter:
 51 |     #####################################################################################
 52 |     # 1888~2111年农历数据表
 53 |     # 农历数据 每个元素的存储格式如下：
 54 |     #   16~13    12          11~0
 55 |     #  闰几月 闰月日数  1~12月份农历日数(大小月)
 56 |     # 注：1、bit0表示农历1月份日数，为1表示30天，为0表示29天。bit1表示农历2月份日数，依次类推。
 57 |     #     2、bit12表示闰月日数，1为30天，0为29天。bit16~bit13表示第几月是闰月(注：为0表示该年无闰月)
 58 |     # 数据来源参考: http://data.weather.gov.hk/gts/time/conversion1_text_c.htm
 59 |     #####################################################################################
 60 |     lunar_month_days = [1887, 0x1694, 0x16aa, 0x4ad5, 0xab6, 0xc4b7, 0x4ae, 0xa56, 0xb52a,
 61 |                         0x1d2a, 0xd54, 0x75aa, 0x156a, 0x1096d, 0x95c, 0x14ae, 0xaa4d, 0x1a4c, 0x1b2a, 0x8d55,
 62 |                         0xad4, 0x135a, 0x495d,
 63 |                         0x95c, 0xd49b, 0x149a, 0x1a4a, 0xbaa5, 0x16a8, 0x1ad4, 0x52da, 0x12b6, 0xe937, 0x92e,
 64 |                         0x1496, 0xb64b, 0xd4a,
 65 |                         0xda8, 0x95b5, 0x56c, 0x12ae, 0x492f, 0x92e, 0xcc96, 0x1a94, 0x1d4a, 0xada9, 0xb5a, 0x56c,
 66 |                         0x726e, 0x125c,
 67 |                         0xf92d, 0x192a, 0x1a94, 0xdb4a, 0x16aa, 0xad4, 0x955b, 0x4ba, 0x125a, 0x592b, 0x152a,
 68 |                         0xf695, 0xd94, 0x16aa,
 69 |                         0xaab5, 0x9b4, 0x14b6, 0x6a57, 0xa56, 0x1152a, 0x1d2a, 0xd54, 0xd5aa, 0x156a, 0x96c,
 70 |                         0x94ae, 0x14ae, 0xa4c,
 71 |                         0x7d26, 0x1b2a, 0xeb55, 0xad4, 0x12da, 0xa95d, 0x95a, 0x149a, 0x9a4d, 0x1a4a, 0x11aa5,
 72 |                         0x16a8, 0x16d4,
 73 |                         0xd2da, 0x12b6, 0x936, 0x9497, 0x1496, 0x1564b, 0xd4a, 0xda8, 0xd5b4, 0x156c, 0x12ae,
 74 |                         0xa92f, 0x92e, 0xc96,
 75 |                         0x6d4a, 0x1d4a, 0x10d65, 0xb58, 0x156c, 0xb26d, 0x125c, 0x192c, 0x9a95, 0x1a94, 0x1b4a,
 76 |                         0x4b55, 0xad4,
 77 |                         0xf55b, 0x4ba, 0x125a, 0xb92b, 0x152a, 0x1694, 0x96aa, 0x15aa, 0x12ab5, 0x974, 0x14b6,
 78 |                         0xca57, 0xa56, 0x1526,
 79 |                         0x8e95, 0xd54, 0x15aa, 0x49b5, 0x96c, 0xd4ae, 0x149c, 0x1a4c, 0xbd26, 0x1aa6, 0xb54,
 80 |                         0x6d6a, 0x12da, 0x1695d,
 81 |                         0x95a, 0x149a, 0xda4b, 0x1a4a, 0x1aa4, 0xbb54, 0x16b4, 0xada, 0x495b, 0x936, 0xf497,
 82 |                         0x1496, 0x154a, 0xb6a5,
 83 |                         0xda4, 0x15b4, 0x6ab6, 0x126e, 0x1092f, 0x92e, 0xc96, 0xcd4a, 0x1d4a, 0xd64, 0x956c,
 84 |                         0x155c, 0x125c, 0x792e,
 85 |                         0x192c, 0xfa95, 0x1a94, 0x1b4a, 0xab55, 0xad4, 0x14da, 0x8a5d, 0xa5a, 0x1152b, 0x152a,
 86 |                         0x1694, 0xd6aa,
 87 |                         0x15aa, 0xab4, 0x94ba, 0x14b6, 0xa56, 0x7527, 0xd26, 0xee53, 0xd54, 0x15aa, 0xa9b5, 0x96c,
 88 |                         0x14ae, 0x8a4e,
 89 |                         0x1a4c, 0x11d26, 0x1aa4, 0x1b54, 0xcd6a, 0xada, 0x95c, 0x949d, 0x149a, 0x1a2a, 0x5b25,
 90 |                         0x1aa4, 0xfb52,
 91 |                         0x16b4, 0xaba, 0xa95b, 0x936, 0x1496, 0x9a4b, 0x154a, 0x136a5, 0xda4, 0x15ac]
 92 |     # 额外添加数据，方便快速计算阴历转阳历 每个元素的存储格式如下：
 93 |     #    12~7         6~5    4~0
 94 |     #  离元旦多少天  春节月  春节日
 95 |     #####################################################################################
 96 |     solar_1_1 = [1887, 0xec04c, 0xec23f, 0xec435, 0xec649, 0xec83e, 0xeca51, 0xecc46, 0xece3a,
 97 |                  0xed04d, 0xed242, 0xed436, 0xed64a, 0xed83f, 0xeda53, 0xedc48, 0xede3d, 0xee050, 0xee244, 0xee439,
 98 |                  0xee64d,
 99 |                  0xee842, 0xeea36, 0xeec4a, 0xeee3e, 0xef052, 0xef246, 0xef43a, 0xef64e, 0xef843, 0xefa37, 0xefc4b,
100 |                  0xefe41,
101 |                  0xf0054, 0xf0248, 0xf043c, 0xf0650, 0xf0845, 0xf0a38, 0xf0c4d, 0xf0e42, 0xf1037, 0xf124a, 0xf143e,
102 |                  0xf1651,
103 |                  0xf1846, 0xf1a3a, 0xf1c4e, 0xf1e44, 0xf2038, 0xf224b, 0xf243f, 0xf2653, 0xf2848, 0xf2a3b, 0xf2c4f,
104 |                  0xf2e45,
105 |                  0xf3039, 0xf324d, 0xf3442, 0xf3636, 0xf384a, 0xf3a3d, 0xf3c51, 0xf3e46, 0xf403b, 0xf424e, 0xf4443,
106 |                  0xf4638,
107 |                  0xf484c, 0xf4a3f, 0xf4c52, 0xf4e48, 0xf503c, 0xf524f, 0xf5445, 0xf5639, 0xf584d, 0xf5a42, 0xf5c35,
108 |                  0xf5e49,
109 |                  0xf603e, 0xf6251, 0xf6446, 0xf663b, 0xf684f, 0xf6a43, 0xf6c37, 0xf6e4b, 0xf703f, 0xf7252, 0xf7447,
110 |                  0xf763c,
111 |                  0xf7850, 0xf7a45, 0xf7c39, 0xf7e4d, 0xf8042, 0xf8254, 0xf8449, 0xf863d, 0xf8851, 0xf8a46, 0xf8c3b,
112 |                  0xf8e4f,
113 |                  0xf9044, 0xf9237, 0xf944a, 0xf963f, 0xf9853, 0xf9a47, 0xf9c3c, 0xf9e50, 0xfa045, 0xfa238, 0xfa44c,
114 |                  0xfa641,
115 |                  0xfa836, 0xfaa49, 0xfac3d, 0xfae52, 0xfb047, 0xfb23a, 0xfb44e, 0xfb643, 0xfb837, 0xfba4a, 0xfbc3f,
116 |                  0xfbe53,
117 |                  0xfc048, 0xfc23c, 0xfc450, 0xfc645, 0xfc839, 0xfca4c, 0xfcc41, 0xfce36, 0xfd04a, 0xfd23d, 0xfd451,
118 |                  0xfd646,
119 |                  0xfd83a, 0xfda4d, 0xfdc43, 0xfde37, 0xfe04b, 0xfe23f, 0xfe453, 0xfe648, 0xfe83c, 0xfea4f, 0xfec44,
120 |                  0xfee38,
121 |                  0xff04c, 0xff241, 0xff436, 0xff64a, 0xff83e, 0xffa51, 0xffc46, 0xffe3a, 0x10004e, 0x100242,
122 |                  0x100437,
123 |                  0x10064b, 0x100841, 0x100a53, 0x100c48, 0x100e3c, 0x10104f, 0x101244, 0x101438, 0x10164c,
124 |                  0x101842, 0x101a35,
125 |                  0x101c49, 0x101e3d, 0x102051, 0x102245, 0x10243a, 0x10264e, 0x102843, 0x102a37, 0x102c4b,
126 |                  0x102e3f, 0x103053,
127 |                  0x103247, 0x10343b, 0x10364f, 0x103845, 0x103a38, 0x103c4c, 0x103e42, 0x104036, 0x104249,
128 |                  0x10443d, 0x104651,
129 |                  0x104846, 0x104a3a, 0x104c4e, 0x104e43, 0x105038, 0x10524a, 0x10543e, 0x105652, 0x105847,
130 |                  0x105a3b, 0x105c4f,
131 |                  0x105e45, 0x106039, 0x10624c, 0x106441, 0x106635, 0x106849, 0x106a3d, 0x106c51, 0x106e47,
132 |                  0x10703c, 0x10724f,
133 |                  0x107444, 0x107638, 0x10784c, 0x107a3f, 0x107c53, 0x107e48]
134 | 
135 |     def LunarToSolar(self, lunar):
136 |         days = LunarSolarConverter.lunar_month_days[lunar.lunarYear - LunarSolarConverter.lunar_month_days[0]]
137 |         leap = GetBitInt(days, 4, 13)
138 |         offset = 0
139 |         loopend = leap
140 |         if not lunar.isleap:
141 | 
142 |             if lunar.lunarMonth <= leap or leap == 0:
143 | 
144 |                 loopend = lunar.lunarMonth - 1
145 | 
146 |             else:
147 | 
148 |                 loopend = lunar.lunarMonth
149 | 
150 |         for i in range(0, loopend):
151 |             offset += GetBitInt(days, 1, 12 - i) == 1 and 30 or 29
152 | 
153 |         offset += lunar.lunarDay
154 | 
155 |         solar11 = LunarSolarConverter.solar_1_1[lunar.lunarYear - LunarSolarConverter.solar_1_1[0]]
156 | 
157 |         y = GetBitInt(solar11, 12, 9)
158 |         m = GetBitInt(solar11, 4, 5)
159 |         d = GetBitInt(solar11, 5, 0)
160 | 
161 |         return SolarFromInt(SolarToInt(y, m, d) + offset - 1)
162 | 
163 |     def SolarToLunar(self, solar):
164 | 
165 |         lunar = Lunar(0, 0, 0, False)
166 |         index = solar.solarYear - LunarSolarConverter.solar_1_1[0]
167 |         data = (solar.solarYear << 9) | (solar.solarMonth << 5) | solar.solarDay
168 |         if LunarSolarConverter.solar_1_1[index] > data:
169 |             index -= 1
170 | 
171 |         solar11 = LunarSolarConverter.solar_1_1[index]
172 |         y = GetBitInt(solar11, 12, 9)
173 |         m = GetBitInt(solar11, 4, 5)
174 |         d = GetBitInt(solar11, 5, 0)
175 |         offset = SolarToInt(solar.solarYear, solar.solarMonth, solar.solarDay) - SolarToInt(y, m, d)
176 | 
177 |         days = LunarSolarConverter.lunar_month_days[index]
178 |         leap = GetBitInt(days, 4, 13)
179 | 
180 |         lunarY = index + LunarSolarConverter.solar_1_1[0]
181 |         lunarM = 1
182 |         offset += 1
183 | 
184 |         for i in range(0, 13):
185 | 
186 |             dm = GetBitInt(days, 1, 12 - i) == 1 and 30 or 29
187 |             if offset > dm:
188 | 
189 |                 lunarM += 1
190 |                 offset -= dm
191 | 
192 |             else:
193 | 
194 |                 break
195 | 
196 |         lunarD = int(offset)
197 |         lunar.lunarYear = lunarY
198 |         lunar.lunarMonth = lunarM
199 |         lunar.isleap = False
200 |         if leap != 0 and lunarM > leap:
201 | 
202 |             lunar.lunarMonth = lunarM - 1
203 |             if lunarM == leap + 1:
204 |                 lunar.isleap = True
205 | 
206 |         lunar.lunarDay = lunarD
207 |         return lunar
208 | 
209 |     def __init__(self):
210 |         pass
211 | 
212 | 
213 | if __name__ == '__main__':
214 |     converter = LunarSolarConverter()
215 |     solar = Solar(2111, 1, 25)
216 |     pprint(vars(solar))
217 |     lunar = converter.SolarToLunar(solar)
218 |     pprint(vars(lunar))
219 |     solar = converter.LunarToSolar(lunar)
220 |     pprint(vars(solar))
221 |     print(len(converter.solar_1_1))
222 |     print("Done")
223 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/README.md:
--------------------------------------------------------------------------------
 1 | ## 简介
 2 | Time-NLP的python3版本   
 3 | python 版本https://github.com/sunfiyes/Time-NLPY  
 4 | Java 版本https://github.com/shinyke/Time-NLP
 5 | ## 功能说明
 6 | 用于句子中时间词的抽取和转换  
 7 | 详情请见test.py
 8 | 
 9 |     res = tn.parse(target=u'过十分钟') # target为待分析语句，timeBase为基准时间默认是当前时间
10 |     print(res)
11 |     res = tn.parse(target=u'2013年二月二十八日下午四点三十分二十九秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
12 |     print(res)
13 |     res = tn.parse(target=u'我需要大概33天2分钟四秒', timeBase='2013-02-28 16:30:29') # target为待分析语句，timeBase为基准时间默认是当前时间
14 |     print(res)
15 |     res = tn.parse(target=u'今年儿童节晚上九点一刻') # target为待分析语句，timeBase为基准时间默认是当前时间
16 |     print(res)
17 |     res = tn.parse(target=u'2个小时以前') # target为待分析语句，timeBase为基准时间默认是当前时间
18 |     print(res)
19 |     res = tn.parse(target=u'晚上8点到上午10点之间') # target为待分析语句，timeBase为基准时间默认是当前时间
20 |     print(res)
21 | 返回结果：
22 | 
23 |     {"timedelta": "0 days, 0:10:00", "type": "timedelta"}
24 |     {"timestamp": "2013-02-28 16:30:29", "type": "timestamp"}
25 |     {"type": "timedelta", "timedelta": {"year": 0, "month": 1, "day": 3, "hour": 0, "minute": 2, "second": 4}}
26 |     {"timestamp": "2018-06-01 21:15:00", "type": "timestamp"}
27 |     {"error": "no time pattern could be extracted."}
28 |     {"type": "timespan", "timespan": ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]}
29 |     
30 | ## 使用方式 
31 | demo：python3 Test.py
32 | 
33 | 优化说明
34 |     
35 | | 问题          | 以前版本                                     | 现在版本                    |
36 | | ----------- | ---------------------------------------- | ---------------------- |
37 | | 无法解析下下周末     | "timestamp": "2018-04-01 00:00:00"                                    | "timestamp": "2018-04-08 00:00:00"                 |
38 | | 无法解析 3月4         | "2018-03-01"                                   | "2018-03-04"               |
39 | | 无法解析 初一 初二      | cannot parse                                    | "2018-02-16"              |
40 | | 晚上8点到上午10点之间  无法解析上午      | ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] |  ["2018-03-16 20:00:00", "2018-03-16 10:00:00"]|
41 | | 3月21号  错误解析成2019年      | "2019-03-21" | "2018-03-21" |
42 | 
43 | 感谢@[tianyuningmou](https://github.com/tianyuningmou) 目前增加了对24节气的支持
44 | 
45 | 
46 |     temp = ['今年春分']
47 |     "timestamp" : "2020-03-20 00:00:00"
48 | 
49 | ## TODO
50 | 
51 | | 问题          | 现在版本                                     | 正确
52 | | ----------- | ---------------------------------------- | ---------------------- |
53 | | 晚上8点到上午10点之间     |  ["2018-03-16 20:00:00", "2018-03-16 22:00:00"] |  ["2018-03-16 20:00:00", "2018-03-17 10:00:00"]"                                    | "timestamp": "2018-04-08 00:00:00"                 |
54 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/RangeTimeEnum.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2017/11/20 16:27
 4 | # @Author  : zhm
 5 | # @File    : RangeTimeEnum.py
 6 | # @Software: PyCharm
 7 | 
 8 | 
 9 | 
10 | # 范围时间的默认时间点
11 | class RangeTimeEnum():
12 |     day_break = 3  # 黎明
13 |     early_morning = 8  # 早
14 |     morning = 10  # 上午
15 |     noon = 12  # 中午、午间
16 |     afternoon = 15  # 下午、午后
17 |     night = 18  # 晚上、傍晚
18 |     lateNight = 20  # 晚、晚间
19 |     midNight = 23  # 深夜
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     print(RangeTimeEnum.afternoon)
24 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/StringPreHandler.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/11/20 15:42
  4 | # @Author  : zhm
  5 | # @File    : StringPreHandler.py
  6 | # @Software: PyCharm
  7 | import regex as re
  8 | 
  9 | # * 字符串预处理模块，为分析器TimeNormalizer提供相应的字符串预处理服务
 10 | class StringPreHandler:
 11 |     @classmethod
 12 |     def delKeyword(cls, target, rules):
 13 |         """
 14 |         该方法删除一字符串中所有匹配某一规则字串
 15 |         可用于清理一个字符串中的空白符和语气助词
 16 |         :param target: 待处理字符串
 17 |         :param rules: 删除规则
 18 |         :return: 清理工作完成后的字符串
 19 |         """
 20 |         pattern = re.compile(rules)
 21 |         res = pattern.sub('', target)
 22 |         # print res
 23 |         return res
 24 | 
 25 | 
 26 |     @classmethod
 27 |     def numberTranslator(cls, target):
 28 |         """
 29 |         该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字
 30 |         如"这里有一千两百个人，六百零五个来自中国"可以转化为
 31 |         "这里有1200个人，605个来自中国"
 32 |         此外添加支持了部分不规则表达方法
 33 |         如两万零六百五可转化为20650
 34 |         两百一十四和两百十四都可以转化为214
 35 |         一六零加一五八可以转化为160+158
 36 |         该方法目前支持的正确转化范围是0-99999999
 37 |         该功能模块具有良好的复用性
 38 |         :param target: 待转化的字符串
 39 |         :return: 转化完毕后的字符串
 40 |         """
 41 |         pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))")
 42 |         match = pattern.finditer(target)
 43 |         for m in match:
 44 |             group = m.group()
 45 |             s = group.split(u"万")
 46 |             s = filter(None, s)
 47 |             num = 0
 48 |             if len(s) == 2:
 49 |                 num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000
 50 |             target = pattern.sub(str(num), target, 1)
 51 | 
 52 |         pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))")
 53 |         match = pattern.finditer(target)
 54 |         for m in match:
 55 |             group = m.group()
 56 |             s = group.split(u"千")
 57 |             s = filter(None, s)
 58 |             num = 0
 59 |             if len(s) == 2:
 60 |                 num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100
 61 |             target = pattern.sub(str(num), target, 1)
 62 | 
 63 |         pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)")
 64 |         match = pattern.finditer(target)
 65 |         for m in match:
 66 |             group = m.group()
 67 |             s = group.split(u"百")
 68 |             s = filter(None, s)
 69 |             num = 0
 70 |             if len(s) == 2:
 71 |                 num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10
 72 |             target = pattern.sub(str(num), target, 1)
 73 | 
 74 |         pattern = re.compile(u"[零一二两三四五六七八九]")
 75 |         match = pattern.finditer(target)
 76 |         for m in match:
 77 |             target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
 78 | 
 79 |         pattern = re.compile(u"(?<=(周|星期))[末天日]")
 80 |         match = pattern.finditer(target)
 81 |         for m in match:
 82 |             target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1)
 83 | 
 84 |         pattern = re.compile(u"(?<!(周|星期))0?[0-9]?十[0-9]?")
 85 |         match = pattern.finditer(target)
 86 |         for m in match:
 87 |             group = m.group()
 88 |             s = group.split(u"十")
 89 |             num = 0
 90 |             ten = cls.strToInt(s[0])
 91 |             if ten == 0:
 92 |                 ten = 1
 93 |             unit = cls.strToInt(s[1])
 94 |             num = ten * 10 + unit
 95 |             target = pattern.sub(str(num), target, 1)
 96 | 
 97 |         pattern = re.compile(u"0?[1-9]百[0-9]?[0-9]?")
 98 |         match = pattern.finditer(target)
 99 |         for m in match:
100 |             group = m.group()
101 |             s = group.split(u"百")
102 |             s = filter(None, s)
103 |             num = 0
104 |             if len(s) == 1:
105 |                 hundred = int(s[0])
106 |                 num += hundred * 100
107 |             elif len(s) == 2:
108 |                 hundred = int(s[0])
109 |                 num += hundred * 100
110 |                 num += int(s[1])
111 |             target = pattern.sub(str(num), target, 1)
112 | 
113 |         pattern = re.compile(u"0?[1-9]千[0-9]?[0-9]?[0-9]?")
114 |         match = pattern.finditer(target)
115 |         for m in match:
116 |             group = m.group()
117 |             s = group.split(u"千")
118 |             s = filter(None, s)
119 |             num = 0
120 |             if len(s) == 1:
121 |                 thousand = int(s[0])
122 |                 num += thousand * 1000
123 |             elif len(s) == 2:
124 |                 thousand = int(s[0])
125 |                 num += thousand * 1000
126 |                 num += int(s[1])
127 |             target = pattern.sub(str(num), target, 1)
128 | 
129 |         pattern = re.compile(u"[0-9]+万[0-9]?[0-9]?[0-9]?[0-9]?")
130 |         match = pattern.finditer(target)
131 |         for m in match:
132 |             group = m.group()
133 |             s = group.split(u"万")
134 |             s = filter(None, s)
135 |             num = 0
136 |             if len(s) == 1:
137 |                 tenthousand = int(s[0])
138 |                 num += tenthousand * 10000
139 |             elif len(s) == 2:
140 |                 tenthousand = int(s[0])
141 |                 num += tenthousand * 10000
142 |                 num += int(s[1])
143 |             target = pattern.sub(str(num), target, 1)
144 | 
145 |         return target
146 | 
147 |     @classmethod
148 |     def wordToNumber(cls, s):
149 |         """
150 |         方法numberTranslator的辅助方法，可将[零-九]正确翻译为[0-9]
151 |         :param s: 大写数字
152 |         :return: 对应的整形数，如果不是数字返回-1
153 |         """
154 |         if (s == u'零') or (s == '0'):
155 |             return 0
156 |         elif (s == u'一') or (s == '1'):
157 |             return 1
158 |         elif (s == u'二') or (s == u'两') or (s == '2'):
159 |             return 2
160 |         elif (s == u'三') or (s == '3'):
161 |             return 3
162 |         elif (s == u'四') or (s == '4'):
163 |             return 4
164 |         elif (s == u'五') or (s == '5'):
165 |             return 5
166 |         elif (s == u'六') or (s == '6'):
167 |             return 6
168 |         elif (s == u'七') or (s == u'天') or (s == u'日') or (s == u'末') or (s == '7'):
169 |             return 7
170 |         elif (s == u'八') or (s == '8'):
171 |             return 8
172 |         elif (s == u'九') or (s == '9'):
173 |             return 9
174 |         else:
175 |             return -1
176 | 
177 |     @classmethod
178 |     def strToInt(cls, s):
179 |         try:
180 |             res = int(s)
181 |         except:
182 |             res = 0
183 |         return res


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: TimeConverter
 3 | Version: 1.1.0
 4 | Summary: ...
 5 | Home-page: http://test.com
 6 | Author: test
 7 | Author-email: test@gmail.com
 8 | License: MIT Licence
 9 | Description: ...
10 | Keywords: time,nlp
11 | Platform: any
12 | Classifier: Programming Language :: Python :: 2.6
13 | Classifier: Programming Language :: Python :: 2.7
14 | Classifier: Programming Language :: Python :: 3.6
15 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LunarSolarConverter.py
 2 | README.md
 3 | RangeTimeEnum.py
 4 | StringPreHandler.py
 5 | Test.py
 6 | TimeNormalizer.py
 7 | TimePoint.py
 8 | TimeUnit.py
 9 | __init__.py
10 | setup.py
11 | TimeConverter.egg-info/PKG-INFO
12 | TimeConverter.egg-info/SOURCES.txt
13 | TimeConverter.egg-info/dependency_links.txt
14 | TimeConverter.egg-info/not-zip-safe
15 | TimeConverter.egg-info/requires.txt
16 | TimeConverter.egg-info/top_level.txt
17 | resource/__init__.py


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | regex>=2017
2 | arrow>=0.10
3 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeConverter.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | 
2 | resource
3 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimeNormalizer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # @Time    : 2017/11/20 16:39
  4 | # @Author  : zhm
  5 | # @File    : TimeNormalizer.py
  6 | # @Software: PyCharm
  7 | import pickle
  8 | import regex as re
  9 | import arrow
 10 | import json
 11 | import os
 12 | 
 13 | 
 14 | from cocoNLP.config.basic.time_nlp.StringPreHandler import StringPreHandler
 15 | from cocoNLP.config.basic.time_nlp.TimePoint import TimePoint
 16 | from cocoNLP.config.basic.time_nlp.TimeUnit import TimeUnit
 17 | 
 18 | # 时间表达式识别的主要工作类
 19 | class TimeNormalizer:
 20 |     def __init__(self, isPreferFuture=True):
 21 |         self.isPreferFuture = isPreferFuture
 22 |         self.pattern, self.holi_solar, self.holi_lunar = self.init()
 23 | 
 24 |     # 这里对一些不规范的表达做转换
 25 |     def _filter(self, input_query):
 26 |         # 这里对于下个周末这种做转化 把个给移除掉
 27 |         input_query = StringPreHandler.numberTranslator(input_query)
 28 | 
 29 |         rule = u"[0-9]月[0-9]"
 30 |         pattern = re.compile(rule)
 31 |         match = pattern.search(input_query)
 32 |         if match != None:
 33 |             index = input_query.find('月')
 34 |             rule = u"日|号"
 35 |             pattern = re.compile(rule)
 36 |             match = pattern.search(input_query[index:])
 37 |             if match == None:
 38 |                 rule = u"[0-9]月[0-9]+"
 39 |                 pattern = re.compile(rule)
 40 |                 match = pattern.search(input_query)
 41 |                 if match != None:
 42 |                     end = match.span()[1]
 43 |                     input_query = input_query[:end] + '号' + input_query[end:]
 44 | 
 45 |         rule = u"月"
 46 |         pattern = re.compile(rule)
 47 |         match = pattern.search(input_query)
 48 |         if match == None:
 49 |             input_query = input_query.replace('个', '')
 50 | 
 51 |         input_query = input_query.replace('中旬', '15号')
 52 |         input_query = input_query.replace('傍晚', '午后')
 53 |         input_query = input_query.replace('大年', '')
 54 |         input_query = input_query.replace('五一', '劳动节')
 55 |         input_query = input_query.replace('白天', '早上')
 56 |         input_query = input_query.replace('：', ':')
 57 |         return input_query
 58 | 
 59 |     def init(self):
 60 |         fpath = os.path.dirname(__file__) + '/resource/reg.pkl'
 61 |         # print(os.path.dirname(__file__))
 62 |         try:
 63 |             with open(fpath, 'rb') as f:
 64 |                 pattern = pickle.load(f)
 65 |         except:
 66 |             with open(os.path.dirname(__file__) + '/resource/regex.txt', 'r') as f:
 67 |                 content = f.read()
 68 |             p = re.compile(content)
 69 |             with open(fpath, 'wb') as f:
 70 |                 pickle.dump(p, f)
 71 |             with open(fpath, 'rb') as f:
 72 |                 pattern = pickle.load(f)
 73 |         with open(os.path.dirname(__file__) + '/resource/holi_solar.json', 'r', encoding='utf-8') as f:
 74 |             holi_solar = json.load(f)
 75 |         with open(os.path.dirname(__file__) + '/resource/holi_lunar.json', 'r', encoding='utf-8') as f:
 76 |             holi_lunar = json.load(f)
 77 |         return pattern, holi_solar, holi_lunar
 78 | 
 79 |     def parse(self, target, timeBase=arrow.now()):
 80 |         """
 81 |         TimeNormalizer的构造方法，timeBase取默认的系统当前时间
 82 |         :param timeBase: 基准时间点
 83 |         :param target: 待分析字符串
 84 |         :return: 时间单元数组
 85 |         """
 86 |         self.isTimeSpan = False
 87 |         self.invalidSpan = False
 88 |         self.timeSpan = ''
 89 |         self.target = self._filter(target)
 90 |         self.timeBase = arrow.get(timeBase).format('YYYY-M-D-H-m-s')
 91 |         self.nowTime = timeBase
 92 |         self.oldTimeBase = self.timeBase
 93 |         self.__preHandling()
 94 |         self.timeToken = self.__timeEx()
 95 |         dic = {}
 96 |         res = self.timeToken
 97 | 
 98 |         if self.isTimeSpan:
 99 | 
100 |             if self.invalidSpan:
101 |                 dic['error'] = 'no time pattern could be extracted.'
102 |             else:
103 |                 result = {}
104 |                 dic['type'] = 'timedelta'
105 |                 dic['timedelta'] = self.timeSpan
106 |                 # print(dic['timedelta'])
107 |                 index = dic['timedelta'].find('days')
108 | 
109 |                 days = int(dic['timedelta'][:index-1])
110 |                 result['year'] = int(days / 365)
111 |                 result['month'] = int(days / 30 - result['year'] * 12)
112 |                 result['day'] = int(days - result['year'] * 365 - result['month'] * 30)
113 |                 index = dic['timedelta'].find(',')
114 |                 time = dic['timedelta'][index+1:]
115 |                 time = time.split(':')
116 |                 result['hour'] = int(time[0])
117 |                 result['minute'] = int(time[1])
118 |                 result['second'] = int(time[2])
119 |                 dic['timedelta'] = result
120 |         else:
121 |             if len(res) == 0:
122 |                 dic['error'] = 'no time pattern could be extracted.'
123 |             elif len(res) == 1:
124 |                 dic['type'] = 'timestamp'
125 |                 dic['timestamp'] = res[0].time.format("YYYY-MM-DD HH:mm:ss")
126 |             else:
127 |                 dic['type'] = 'timespan'
128 |                 dic['timespan'] = [res[0].time.format("YYYY-MM-DD HH:mm:ss"), res[1].time.format("YYYY-MM-DD HH:mm:ss")]
129 |         return json.dumps(dic)
130 | 
131 |     def __preHandling(self):
132 |         """
133 |         待匹配字符串的清理空白符和语气助词以及大写数字转化的预处理
134 |         :return:
135 |         """
136 |         self.target = StringPreHandler.delKeyword(self.target, u"\\s+")  # 清理空白符
137 |         self.target = StringPreHandler.delKeyword(self.target, u"[的]+")  # 清理语气助词
138 |         self.target = StringPreHandler.numberTranslator(self.target)  # 大写数字转化
139 | 
140 |     def __timeEx(self):
141 |         """
142 | 
143 |         :param target: 输入文本字符串
144 |         :param timeBase: 输入基准时间
145 |         :return: TimeUnit[]时间表达式类型数组
146 |         """
147 |         startline = -1
148 |         endline = -1
149 |         rpointer = 0
150 |         temp = []
151 | 
152 |         match = self.pattern.finditer(self.target)
153 |         for m in match:
154 |             startline = m.start()
155 |             if startline == endline:
156 |                 rpointer -= 1
157 |                 temp[rpointer] = temp[rpointer] + m.group()
158 |             else:
159 |                 temp.append(m.group())
160 |             endline = m.end()
161 |             rpointer += 1
162 |         res = []
163 |         # 时间上下文： 前一个识别出来的时间会是下一个时间的上下文，用于处理：周六3点到5点这样的多个时间的识别，第二个5点应识别到是周六的。
164 |         contextTp = TimePoint()
165 |         # print(self.timeBase)
166 |         # print('temp',temp)
167 |         for i in range(0, rpointer):
168 |             # 这里是一个类嵌套了一个类
169 |             res.append(TimeUnit(temp[i], self, contextTp))
170 |             # res[i].tp.tunit[3] = -1
171 |             contextTp = res[i].tp
172 |             # print(self.nowTime.year)
173 |             # print(contextTp.tunit)
174 |         res = self.__filterTimeUnit(res)
175 | 
176 |         return res
177 | 
178 |     def __filterTimeUnit(self, tu_arr):
179 |         """
180 |         过滤timeUnit中无用的识别词。无用识别词识别出的时间是1970.01.01 00:00:00(fastTime=0)
181 |         :param tu_arr:
182 |         :return:
183 |         """
184 |         if (tu_arr is None) or (len(tu_arr) < 1):
185 |             return tu_arr
186 |         res = []
187 |         for tu in tu_arr:
188 |             if tu.time.timestamp != 0:
189 |                 res.append(tu)
190 |         return res
191 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/TimePoint.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2017/11/20 15:37
 4 | # @Author  : zhm
 5 | # @File    : TimePoint.py
 6 | # @Software: PyCharm
 7 | 
 8 | 
 9 | #  * 时间表达式单元规范化对应的内部类,
10 | #  * 对应时间表达式规范化的每个字段，
11 | #  * 六个字段分别是：年-月-日-时-分-秒，
12 | #  * 每个字段初始化为-1
13 | class TimePoint:
14 |     def __init__(self):
15 |         self.tunit = [-1, -1, -1, -1, -1, -1]
16 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time    : 2017/11/23 13:22
4 | # @Author  : zhm
5 | # @File    : __init__.py
6 | # @Software: PyCharm


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/LunarSolarConverter.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/RangeTimeEnum.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/StringPreHandler.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/Test.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeNormalizer.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimePoint.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/TimeUnit.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time    : 2017/12/5 17:29
4 | # @Author  : zhm
5 | # @File    : __init__.py
6 | # @Software: PyCharm


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/holi_lunar.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "中和节": "02-02", 
 3 |   "中秋节": "08-15", 
 4 |   "中元节": "07-15", 
 5 |   "端午节": "05-05", 
 6 |   "春节": "01-01", 
 7 |   "元宵节": "01-15", 
 8 |   "重阳节": "09-09", 
 9 |   "7夕节": "07-07",
10 |   "初1节": "01-01",
11 |   "初2节": "01-02",
12 |   "初3节": "01-03",
13 |   "初4节": "01-04",
14 |   "初5节": "01-05",
15 |   "初6节": "01-06",
16 |   "初7节": "01-07",
17 |   "初8节": "01-08",
18 |   "初9节": "01-09",
19 |   "初10节": "01-10",
20 |   "初11节": "01-11",
21 |   "初12节": "01-12",
22 |   "初13节": "01-13",
23 |   "初14节": "01-14",
24 |   "初15节": "01-15"
25 | }
26 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/holi_solar.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "植树节": "03-12", 
 3 |   "圣诞节": "12-25", 
 4 |   "青年节": "05-04", 
 5 |   "教师节": "09-10", 
 6 |   "儿童节": "06-01", 
 7 |   "元旦节": "01-01", 
 8 |   "国庆节": "10-01", 
 9 |   "劳动节": "05-01", 
10 |   "妇女节": "03-08", 
11 |   "建军节": "08-01", 
12 |   "航海日节": "07-11", 
13 |   "建党节": "07-01", 
14 |   "记者节": "11-08",
15 |   "情人节":"02-14",
16 |   "母亲节":"05-11"
17 | }
18 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/reg.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/reg.pkl


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/basic/time_nlp/resource/regex.txt:
--------------------------------------------------------------------------------
1 | ((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[:：]\d+([:：]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?<!\\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)点)|(今年([零一二三四五六七八九十百千万]+|\d+))|(\d+[:：]\d+(分|))|((\d+):(\d+))|(\d+/\d+/\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\d+分)+(\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\d*]世纪末?)|((\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\d+点)+(\d+分)?(\d+秒)?左右?)|((\d+)年代)|(本月(\d+))|(第(\d+)天)|((\d+)岁)|((\d+)年(\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(今年(\d+)月(\d+)日)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\d+)年(\d+)月(\d+)日)|((\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\d+)点)|(去年(\d+)月(\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\d+)日晚(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\d+)月(\d+)日)|(([零一二三四五六七八九十百千万]+|\d+)周)|((\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\d+))|(长久)|(清晨)|((\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+))|(圣诞节)|(学期)|(\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\d+)月底)|(当天)|((\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时)|(连[年月日夜])|((\d+)年(\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时(\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时(\d+)分)|(同日)|((\d+)年(\d+)月底)|((\d+)分钟)|((\d+)世纪)|(冬季)|(清明)(节)?|(立春)|(雨水)|(惊蛰)|(春分)|(谷雨)|(立夏)|(小满 )|(芒种)|(夏至)|(小暑)|(大暑)|(立秋)|(处暑)|(白露)|(秋分)|(寒露)|(霜降)|(立冬)|(小雪)|(大雪)|(冬至)|(小寒)|(大寒)|(青年节)|(教师节)|(中元节)|(端午)(节)?|(劳动节)|(7夕)(节)?|(建党节)|(建军节)|(初13)|(初14)|(初15)|(初12)|(初11)|(初9)|(初8)|(初7)|(初6)|(初5)|(初4)|(初3)|(初2)|(初1)|(情人节)|(母亲节)|(中和节)|(圣诞)(节)?|(中秋)(节)?|(春节)|(元宵)(节)?|(航海日)|(儿童节)|(国庆)(节)?|(植树节)|(元旦)|(重阳节)|(妇女节)|(记者节)|(年代)|(([零一二三四五六七八九十百千万]+|\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[:：]\d+([:：]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?<!\d))([0-3][0-9]|[1-9]))|(现在)|(届时)|(这个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)日)|(晚些时候)|(今年)|(长期)|(以前)|(过去)|(时期)|(时代)|(当时)|(近来)|(([零一二三四五六七八九十百千万]+|\d+)夜)|(当前)|(日(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)点)|(今年([零一二三四五六七八九十百千万]+|\d+))|(\d+[:：]\d+(分|))|((\d+):(\d+))|(\d+/\d+/\d+)|(未来)|((充满美丽、希望、挑战的)?未来)|(最近)|(早上)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(日前)|(新世纪)|(小时)|(([0-3][0-9]|[1-9])(日|号))|(明天)|(\d+)月|(([0-3][0-9]|[1-9])[日号])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|([一二三四五六七八九十百千万几多]+[天日周月年][后前左右]*)|(每[年月日天小时分秒钟]+)|((\d+分)+(\d+秒)?)|([一二三四五六七八九十]+来?[岁年])|([新?|\d*]世纪末?)|((\d+)时)|(世纪)|(([零一二三四五六七八九十百千万]+|\d+)岁)|(今年)|([星期周]+[一二三四五六七])|(星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)年)|([本后昨当新后明今去前那这][一二三四五六七八九十]?[年月日天])|(早|早晨|早上|上午|中午|午后|下午|晚上|晚间|夜里|夜|凌晨|深夜)|(回归前后)|((\d+点)+(\d+分)?(\d+秒)?左右?)|((\d+)年代)|(本月(\d+))|(第(\d+)天)|((\d+)岁)|((\d+)年(\d+)月)|([去今明]?[年月](底|末))|(([零一二三四五六七八九十百千万]+|\d+)世纪)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(年度)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)星期)|(年底)|([下个本]+赛季)|(\d+)月(\d+)日|(\d+)月(\d+)|(今年(\d+)月(\d+)日)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(今年晚些时候)|(两个星期)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)周)|(本赛季)|(半个(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(稍晚)|((\d+)号晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)年)|(这个时候)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个小时)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(凌晨)|((\d+)年(\d+)月(\d+)日)|((\d+)个月)|(今天早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(第[一二三四五六七八九十\d+]+季)|(当地时间)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)年)|(早晨)|(一段时间)|([本上]周[一二三四五六七])|(凌晨(\d+)点)|(去年(\d+)月(\d+)日)|(年关)|(如今)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(当晚)|((\d+)日晚(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年(\d+)月(\d+)日)|(([零一二三四五六七八九十百千万]+|\d+)周)|((\d+)月)|(农历)|(两个小时)|(本周([零一二三四五六七八九十百千万]+|\d+))|(长久)|(清晨)|((\d+)号晚)|(春节)|(星期日)|(圣诞)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段)|(现年)|(当日)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分钟)|(\d+(天|日|周|月|年)(后|前|))|((文艺复兴|巴洛克|前苏联|前一|暴力和专制|成年时期|古罗马|我们所处的敏感)+时期)|((\d+)[年月天])|(清早)|(两年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(昨天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+))|(圣诞节)|(学期)|(\d+来?分钟)|(过去(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(星期天)|(夜间)|((\d+)日凌晨)|(([零一二三四五六七八九十百千万]+|\d+)月底)|(当天)|((\d+)日)|(((10)|(11)|(12)|([1-9]))月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(今年(\d+)月份)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时)|(连[年月日夜])|((\d+)年(\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((一|二|两|三|四|五|六|七|八|九|十|百|千|万|几|多|上|\d+)+个?(天|日|周|月|年)(后|前|半|))|((胜利的)日子)|(青春期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|([0-9]{4}年)|(周末)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(([(小学)|初中?|高中?|大学?|研][一二三四五六七八九十]?(\d+)?)?[上下]半?学期)|(([零一二三四五六七八九十百千万]+|\d+)时期)|(午间)|(次年)|(这时候)|(农历新年)|([春夏秋冬](天|季))|((\d+)天)|(元宵节)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)分)|((\d+)月(\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)时(\d+)分)|(傍晚)|(周([零一二三四五六七八九十百千万]+|\d+))|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时(\d+)分)|(同日)|((\d+)年(\d+)月底)|((\d+)分钟)|((\d+)世纪)|(冬季)|(年代)|(([零一二三四五六七八九十百千万]+|\d+)年半)|(今年年底)|(新年)|(本周)|(当地时间星期([零一二三四五六七八九十百千万]+|\d+))|(([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(半小时)|(每周)|(([零一二三四五六七八九十百千万]+|\d+)周年)|((重要|最后)?时刻)|(([零一二三四五六七八九十百千万]+|\d+)期间)|(周日)|(晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(今后)|(([零一二三四五六七八九十百千万]+|\d+)段时间)|(明年)|([12][09][0-9]{2}(年度?))|(([零一二三四五六七八九十百千万]+|\d+)生)|(今天凌晨)|(过去(\d+)年)|(元月)|((\d+)月(\d+)日凌晨)|([前去今明后新]+年)|((\d+)月(\d+))|(夏天)|((\d+)日凌晨(\d+)时许)|((\d+)月(\d+)日)|((\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\d+)月(\d+)日)|((\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\d+)个礼拜)|(昨日)|([年月]初)|((\d+)年的(\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\d+)月份)|(今年(\d+)月(\d+)号)|(今年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)月底)|(未来(\d+)年)|(第([零一二三四五六七八九十百千万]+|\d+)季)|(\d?多年)|(([零一二三四五六七八九十百千万]+|\d+)个星期)|((\d+)年([零一二三四五六七八九十百千万]+|\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)年(\d+)月(\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)天)|(工作日)|(稍后)|((\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)刻)|(很久)|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\d+)月(\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(古代)|(两天)|(\d+个?(小时|星期))|((\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\d+天)+(\d+点)?(\d+分)?(\d+秒)?)|((\d+)日([零一二三四五六七八九十百千万]+|\d+)时)|((\d+)周年)|(([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)日)|(去年(\d+)月)|(过去([零一二三四五六七八九十百千万]+|\d+)年)|((\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\d+)日)|(第([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)点(\d+)分)|([长近多]年)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\d+)早些时候)|((\d+)([零一二三四五六七八九十百千万]+|\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\d+)个小时)|(([零一二三四五六七八九十百千万]+|\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)个月)|((\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)时(\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)年)|(月底)|(这个月)|((\d+)年(\d+)个月)|(\d+大寿)|(周([零一二三四五六七八九十百千万]+|\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\d+)月)|((\d+)月(\d+)号)|((\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(午后)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\d+:\d+:\d+)|(\d+/\d+/\d+:\d+:\d+.\d+)|(\?\?\?\?-\?\?-\?\?T\d+:\d+:\d+)|(\d+-\d+-\d+T\d+:\d+:\d+)|(\d+/\d+/\d+ \d+:\d+:\d+.\d+)|(\d+-\d+-\d+|[0-9]{8})|(((\d+)年)?((10)|(11)|(12)|([1-9]))月(\d+))|((\d[\.\-])?((10)|(11)|(12)|([1-9]))[\.\-](\d+))))|(([零一二三四五六七八九十百千万]+|\d+)生)|(今天凌晨)|(过去(\d+)年)|(元月)|((\d+)月(\d+)日凌晨)|([前去今明后新]+年)|((\d+)月(\d+))|(夏天)|((\d+)日凌晨(\d+)时许)|((\d+)月(\d+)日)|((\d+)点半)|(去年底)|(最后一[天刻])|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|(圣诞节?)|(下?个?(星期|周)(一|二|三|四|五|六|七|天))|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)年)|(当天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(每年的(\d+)月(\d+)日)|((\d+)日晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(星期([零一二三四五六七八九十百千万]+|\d+)晚)|(深夜)|(现如今)|([上中下]+午)|(第(一|二|三|四|五|六|七|八|九|十|百|千|万|几|多|\d+)+个?(天|日|周|月|年))|(昨晚)|(近年)|(今天清晨)|(中旬)|(星期([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)战期间)|(星期)|(昨天晚(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(较早时)|(个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|((民主高中|我们所处的|复仇主义和其它危害人类的灾难性疾病盛行的|快速承包电影主权的|恢复自我美德|人类审美力基础设施|饱受暴力、野蛮、流血、仇恨、嫉妒的|童年|艰苦的童年)+时代)|(元旦)|(([零一二三四五六七八九十百千万]+|\d+)个礼拜)|(昨日)|([年月]初)|((\d+)年的(\d+)月)|(每年)|(([零一二三四五六七八九十百千万]+|\d+)月份)|(今年(\d+)月(\d+)号)|(今年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)月底)|(未来(\d+)年)|(第([零一二三四五六七八九十百千万]+|\d+)季)|(\d?多年)|(([零一二三四五六七八九十百千万]+|\d+)个星期)|((\d+)年([零一二三四五六七八九十百千万]+|\d+)月)|([下上中]午)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(([零一二三四五六七八九十百千万]+|\d+)个(数|多|多少|好几|几|差不多|近|前|后|上|左右)月)|(同([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)号凌晨)|(夜里)|(两个(数|多|多少|好几|几|差不多|近|前|后|上|左右)小时)|(昨天)|(罗马时代)|(目(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)年(\d+)月(\d+)号)|(((10)|(11)|(12)|([1-9]))月份?)|([12][0-9]世纪)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)天)|(工作日)|(稍后)|((\d+)号(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(未来([零一二三四五六七八九十百千万]+|\d+)年)|([0-9]+[天日周月年][后前左右]*)|(([零一二三四五六七八九十百千万]+|\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(最(数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)刻)|(很久)|((\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)岁)|(去年(\d+)月(\d+)号)|(两个月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(古代)|(两天)|(\d+个?(小时|星期))|((\d+)年半)|(较早)|(([零一二三四五六七八九十百千万]+|\d+)个小时)|([一二三四五六七八九十]+周年)|(星期([零一二三四五六七八九十百千万]+|\d+)(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(时刻)|((\d+天)+(\d+点)?(\d+分)?(\d+秒)?)|((\d+)日([零一二三四五六七八九十百千万]+|\d+)时)|((\d+)周年)|(([零一二三四五六七八九十百千万]+|\d+)早)|(([零一二三四五六七八九十百千万]+|\d+)日)|(去年(\d+)月)|(过去([零一二三四五六七八九十百千万]+|\d+)年)|((\d+)个星期)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)(数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(执政期间)|([当前昨今明后春夏秋冬]+天)|(去年(\d+)月份)|(今(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)周)|(两星期)|(([零一二三四五六七八九十百千万]+|\d+)年代)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)天)|(昔日)|(两个半月)|([印尼|北京|美国]?当地时间)|(连日)|(本月(\d+)日)|(第([零一二三四五六七八九十百千万]+|\d+)天)|((\d+)点(\d+)分)|([长近多]年)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午(\d+)时)|(那时)|(冷战时代)|(([零一二三四五六七八九十百千万]+|\d+)天)|(这个星期)|(去年)|(昨天傍晚)|(近期)|(星期([零一二三四五六七八九十百千万]+|\d+)早些时候)|((\d+)([零一二三四五六七八九十百千万]+|\d+)年)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)两个月)|((\d+)个小时)|(([零一二三四五六七八九十百千万]+|\d+)个月)|(当年)|(本月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)([零一二三四五六七八九十百千万]+|\d+)个月)|((\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(目前)|(去年([零一二三四五六七八九十百千万]+|\d+)月)|((\d+)时(\d+)分)|(每月)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)段时间)|((\d+)日晚)|(早(数|多|多少|好几|几|差不多|近|前|后|上|左右)(\d+)点(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(下旬)|((\d+)月份)|(逐年)|(稍(数|多|多少|好几|几|差不多|近|前|后|上|左右))|((\d+)年)|(月底)|(这个月)|((\d+)年(\d+)个月)|(\d+大寿)|(周([零一二三四五六七八九十百千万]+|\d+)早(数|多|多少|好几|几|差不多|近|前|后|上|左右))|(半年)|(今日)|(末日)|(昨天深夜)|(今年(\d+)月)|((\d+)月(\d+)号)|((\d+)日夜)|((早些|某个|晚间|本星期早些|前些)+时候)|(同年)|((北京|那个|更长的|最终冲突的)时间)|(每个月)|(一早)|((\d+)来?[岁年])|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个月)|([鼠牛虎兔龙蛇马羊猴鸡狗猪]年)|(季度)|(早些时候)|(今天)|(每天)|(年半)|(下*个?月)|(午后)|((\d+)日(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|((数|多|多少|好几|几|差不多|近|前|后|上|左右)个星期)|(\d+秒)|(今天(数|多|多少|好几|几|差不多|近|前|后|上|左右)午)|(同[一二三四五六七八九十][年|月|天])|(T\d+:\d+:\d+)|(\d+/\d+/\d+:\d+:\d+.\d+)|(\?\?\?\?-\?\?-\?\?T\d+:\d+:\d+)|(\d+-\d+-\d+T\d+:\d+:\d+)|(\d+/\d+/\d+ \d+:\d+:\d+.\d+)|(\d+-\d+-\d+|[0-9]{8})|(((\d+)年)?((10)|(11)|(12)|([1-9]))月(\d+))|((\d[\.\-])?((10)|(11)|(12)|([1-9]))[\.\-](\d+))


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fighting41love/cocoNLP/6c68f1ddb771de8de6fd8c7a6872554190bbd36a/dist/cocoNLP-0.0.9/cocoNLP/config/phrase/__pycache__/rake.cpython-36.pyc


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/config/phrase/rake.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Implementation of Rapid Automatic Keyword Extraction algorithm.
  3 | 
  4 | As described in the paper `Automatic keyword extraction from individual
  5 | documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
  6 | """
  7 | 
  8 | import string
  9 | from collections import Counter, defaultdict
 10 | from itertools import chain, groupby, product
 11 | import jieba
 12 | import re
 13 | from enum import Enum
 14 | 
 15 | import os
 16 | import sys
 17 | pwd_path = os.path.abspath(os.path.dirname(__file__))
 18 | 
 19 | 
 20 | class Metric(Enum):
 21 |     """Different metrics that can be used for ranking."""
 22 | 
 23 |     DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
 24 |     WORD_DEGREE = 1  # Uses d(w) alone as the metric
 25 |     WORD_FREQUENCY = 2  # Uses f(w) alone as the metric
 26 | 
 27 | 
 28 | class Rake(object):
 29 |     """Rapid Automatic Keyword Extraction Algorithm."""
 30 | 
 31 |     def __init__(
 32 |         self,
 33 |         punctuations=None,
 34 |         ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
 35 |         max_length=100000,
 36 |         min_length=1,
 37 |     ):
 38 |         """Constructor.
 39 | 
 40 |         :param stopwords: List of Words to be ignored for keyword extraction.
 41 |         :param punctuations: Punctuations to be ignored for keyword extraction.
 42 |         :param language: Language to be used for stopwords
 43 |         :param max_length: Maximum limit on the number of words in a phrase
 44 |                            (Inclusive. Defaults to 100000)
 45 |         :param min_length: Minimum limit on the number of words in a phrase
 46 |                            (Inclusive. Defaults to 1)
 47 |         """
 48 |         # By default use degree to frequency ratio as the metric.
 49 |         if isinstance(ranking_metric, Metric):
 50 |             self.metric = ranking_metric
 51 |         else:
 52 |             self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO
 53 | 
 54 |         # If stopwords not provided we use language stopwords by default.
 55 |         self.stopwords = self.load_stopwords()
 56 | 
 57 |         # If punctuations are not provided we ignore all punctuation symbols.
 58 |         self.punctuations = punctuations
 59 |         if self.punctuations is None:
 60 |             self.punctuations = string.punctuation + ',，。！？!?' # add chinese punctuation
 61 | 
 62 |         # All things which act as sentence breaks during keyword extraction.
 63 |         self.to_ignore = set(chain(self.stopwords, self.punctuations))
 64 | 
 65 |         # Assign min or max length to the attributes
 66 |         self.min_length = min_length
 67 |         self.max_length = max_length
 68 | 
 69 |         # Stuff to be extracted from the provided text.
 70 |         self.frequency_dist = None
 71 |         self.degree = None
 72 |         self.rank_list = None
 73 |         self.ranked_phrases = None
 74 | 
 75 |     def load_stopwords(self, path = pwd_path+'/'+'data/stopwords.txt'):
 76 |         """load stopwords list
 77 |         eg: stopwords_list = load_stopwords(path)
 78 | 
 79 |         :param path: 停用词表path，提前整理好的，直接读进来
 80 |         :return: list<stopwords>
 81 |         """
 82 |         with open(path) as f:
 83 |             stopwords = f.readlines()
 84 |         stopwords_list = []
 85 |         for word in stopwords:
 86 |             stopwords_list.append(word.replace('\n', '').replace(' ', ''))
 87 | 
 88 |         return stopwords_list
 89 | 
 90 |     def tokenize_chinese(self,text):
 91 | 
 92 |         sentences = re.split('(。|！|\!|\.|？|\?)', text)  # 保留分割符
 93 | 
 94 |         new_sents = []
 95 |         for i in range(int(len(sentences) / 2)):
 96 |             sent = sentences[2 * i] + sentences[2 * i + 1]
 97 |             new_sents.append(sent)
 98 |         return new_sents
 99 | 
100 |     def extract_keywords_from_text(self, text, min_len, max_len):
101 |         """Method to extract keywords from the text provided.
102 | 
103 |         :param text: Text to extract keywords from, provided as a string.
104 |         """
105 |         sentences = self.tokenize_chinese(text)
106 |         self.extract_keywords_from_sentences(sentences, min_len, max_len)
107 | 
108 |     def extract_keywords_from_sentences(self, sentences, min_len, max_len):
109 |         """Method to extract keywords from the list of sentences provided.
110 | 
111 |         :param sentences: Text to extraxt keywords from, provided as a list
112 |                           of strings, where each string is a sentence.
113 |         """
114 |         phrase_list = self._generate_phrases(sentences, min_len, max_len)
115 |         self._build_frequency_dist(phrase_list)
116 |         self._build_word_co_occurance_graph(phrase_list)
117 |         self._build_ranklist(phrase_list)
118 | 
119 |     def get_ranked_phrases(self):
120 |         """Method to fetch ranked keyword strings.
121 | 
122 |         :return: List of strings where each string represents an extracted
123 |                  keyword string.
124 |         """
125 |         return self.ranked_phrases
126 | 
127 |     def get_ranked_phrases_with_scores(self):
128 |         """Method to fetch ranked keyword strings along with their scores.
129 | 
130 |         :return: List of tuples where each tuple is formed of an extracted
131 |                  keyword string and its score. Ex: (5.68, 'Four Scoures')
132 |         """
133 |         return self.rank_list
134 | 
135 |     def get_word_frequency_distribution(self):
136 |         """Method to fetch the word frequency distribution in the given text.
137 | 
138 |         :return: Dictionary (defaultdict) of the format `word -> frequency`.
139 |         """
140 |         return self.frequency_dist
141 | 
142 |     def get_word_degrees(self):
143 |         """Method to fetch the degree of words in the given text. Degree can be
144 |         defined as sum of co-occurances of the word with other words in the
145 |         given text.
146 | 
147 |         :return: Dictionary (defaultdict) of the format `word -> degree`.
148 |         """
149 |         return self.degree
150 | 
151 |     def _build_frequency_dist(self, phrase_list):
152 |         """Builds frequency distribution of the words in the given body of text.
153 | 
154 |         :param phrase_list: List of List of strings where each sublist is a
155 |                             collection of words which form a contender phrase.
156 |         """
157 |         self.frequency_dist = Counter(chain.from_iterable(phrase_list))
158 | 
159 |     def _build_word_co_occurance_graph(self, phrase_list):
160 |         """Builds the co-occurance graph of words in the given body of text to
161 |         compute degree of each word.
162 | 
163 |         :param phrase_list: List of List of strings where each sublist is a
164 |                             collection of words which form a contender phrase.
165 |         """
166 |         co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
167 |         for phrase in phrase_list:
168 |             # For each phrase in the phrase list, count co-occurances of the
169 |             # word with other words in the phrase.
170 |             #
171 |             # Note: Keep the co-occurances graph as is, to help facilitate its
172 |             # use in other creative ways if required later.
173 |             for (word, coword) in product(phrase, phrase):
174 |                 co_occurance_graph[word][coword] += 1
175 |         self.degree = defaultdict(lambda: 0)
176 |         for key in co_occurance_graph:
177 |             self.degree[key] = sum(co_occurance_graph[key].values())
178 | 
179 |     def _build_ranklist(self, phrase_list):
180 |         """Method to rank each contender phrase using the formula
181 | 
182 |               phrase_score = sum of scores of words in the phrase.
183 |               word_score = d(w)/f(w) where d is degree and f is frequency.
184 | 
185 |         :param phrase_list: List of List of strings where each sublist is a
186 |                             collection of words which form a contender phrase.
187 |         """
188 |         self.rank_list = []
189 |         for phrase in phrase_list:
190 |             rank = 0.0
191 |             for word in phrase:
192 |                 if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
193 |                     rank += 1.0 * self.degree[word] / self.frequency_dist[word]
194 |                 elif self.metric == Metric.WORD_DEGREE:
195 |                     rank += 1.0 * self.degree[word]
196 |                 else:
197 |                     rank += 1.0 * self.frequency_dist[word]
198 |             self.rank_list.append((rank, " ".join(phrase)))
199 |         self.rank_list.sort(reverse=True)
200 |         self.ranked_phrases = [ph[1] for ph in self.rank_list]
201 | 
202 |     def _generate_phrases(self, sentences, min_len, max_len):
203 |         """Method to generate contender phrases given the sentences of the text
204 |         document.
205 | 
206 |         :param sentences: List of strings where each string represents a
207 |                           sentence which forms the text.
208 |         :return: Set of string tuples where each tuple is a collection
209 |                  of words forming a contender phrase.
210 |         """
211 |         phrase_list = set()
212 |         # Create contender phrases from sentences.
213 |         for sentence in sentences:
214 |             word_list = [word for word in list(jieba.cut(sentence))]
215 |             phrase_list.update(self._get_phrase_list_from_words(word_list, min_len, max_len))
216 |         return phrase_list
217 | 
218 |     def _get_phrase_list_from_words(self, word_list, min_len, max_len):
219 |         """Method to create contender phrases from the list of words that form
220 |         a sentence by dropping stopwords and punctuations and grouping the left
221 |         words into phrases. Only phrases in the given length range (both limits
222 |         inclusive) would be considered to build co-occurrence matrix. Ex:
223 | 
224 |         Sentence: Red apples, are good in flavour.
225 |         List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
226 |         List after dropping punctuations and stopwords.
227 |         List of words: ['red', 'apples', *, *, good, *, 'flavour']
228 |         List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
229 | 
230 |         List of phrases with a correct length:
231 |         For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
232 |         For the range [1, 1]: [('good',), ('flavour',)]
233 |         For the range [2, 2]: [('red', 'apples')]
234 | 
235 |         :param word_list: List of words which form a sentence when joined in
236 |                           the same order.
237 |         :return: List of contender phrases that are formed after dropping
238 |                  stopwords and punctuations.
239 |         """
240 |         groups = groupby(word_list, lambda x: x not in self.to_ignore)
241 |         phrases = []
242 |         for group in groups:
243 |             tmp = tuple(group[1])
244 |             len_g1 = len(list(tmp))
245 |             if group[0] and len_g1>=min_len and len_g1<=max_len: # restrict the length of the phrase
246 |                 phrases.append(tuple(tmp))
247 | 
248 |         return list(
249 |             filter(
250 |                 lambda x: self.min_length <= len(x) <= self.max_length, phrases
251 |             )
252 |         )
253 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/cocoNLP/extractor.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import re
  3 | from phone import Phone
  4 | from itertools import groupby
  5 | import phonenumbers
  6 | from pyhanlp import *
  7 | from cocoNLP.config.basic.time_nlp.TimeNormalizer import *
  8 | 
  9 | 
 10 | 
 11 | __all__ = ['extract_email', 'replace_chinese','extract_cellphone', 'extract_cellphone', 'extract_cellphone_location',
 12 |            'get_location', 'extract_locations', 'replace_cellphoneNum', 'extract_time', 'extract_name', 'most_common']
 13 | 
 14 | class extractor():
 15 |     def __init__(self):
 16 |         pass
 17 | 
 18 |     def extract_email(self, text):
 19 |         """
 20 |         extract all email addresses from texts<string>
 21 |         eg: extract_email('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 22 | 
 23 | 
 24 |         :param: raw_text
 25 |         :return: email_addresses_list<list>
 26 |         """
 27 |         eng_texts = self.replace_chinese(text)
 28 |         eng_texts = eng_texts.replace(' at ','@').replace(' dot ','.')
 29 |         sep = ',!?:; ，。！？《》、|\\/'
 30 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
 31 | 
 32 |         email_pattern = r'^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$'
 33 | 
 34 |         emails = []
 35 |         for eng_text in eng_split_texts:
 36 |             result = re.match(email_pattern, eng_text, flags=0)
 37 |             if result:
 38 |                 emails.append(result.string)
 39 |         return emails
 40 | 
 41 |     def replace_chinese(self, text):
 42 |         """
 43 |         remove all the chinese characters in text
 44 |         eg: replace_chinese('我的email是ifee@baidu.com和dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 45 | 
 46 | 
 47 |         :param: raw_text
 48 |         :return: text_without_chinese<str>
 49 |         """
 50 |         filtrate = re.compile(u'[\u4E00-\u9FA5]')
 51 |         text_without_chinese = filtrate.sub(r' ', text)
 52 |         return text_without_chinese
 53 | 
 54 |     def extract_cellphone(self, text, nation):
 55 |         """
 56 |         extract all cell phone numbers from texts<string>
 57 |         eg: extract_email('my email address is sldisd@baidu.com and dsdsd@dsdsd.com,李林的邮箱是eewewe@gmail.com哈哈哈')
 58 | 
 59 | 
 60 |         :param: raw_text
 61 |         :return: email_addresses_list<list>
 62 |         """
 63 |         eng_texts = self.replace_chinese(text)
 64 |         sep = ',!?:; ：，。！？《》、|\\/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 65 |         eng_split_texts = [''.join(g) for k, g in groupby(eng_texts, sep.__contains__) if not k]
 66 |         if nation=='CHN':
 67 |             phone_pattern = r'^(\+86)?([- ])?(|(13[0-9])|(14[0-9])|(15[0-9])|(17[0-9])|(18[0-9])|(19[0-9]))([- ])?\d{3}([- ])?\d{4}([- ])?\d{4}'
 68 | 
 69 |         if nation!='CHN':
 70 |             phone_pattern = r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b'
 71 | 
 72 |         phones = []
 73 |         for eng_text in eng_split_texts:
 74 |             result = re.match(phone_pattern, eng_text, flags=0)
 75 |             if result:
 76 |                 phones.append(result.string.replace('+86','').replace('-',''))
 77 |         return phones
 78 | 
 79 |     def extract_cellphone_location(self, phoneNum, nation='CHN'):
 80 |         """
 81 |         extract cellphone number locations according to the given number
 82 |         eg: extract_cellphone_location('181000765143',nation=CHN)
 83 | 
 84 | 
 85 |         :param: phoneNum<string>, nation<string>
 86 |         :return: location<dict>{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}
 87 | 
 88 |         """
 89 |         if nation=='CHN':
 90 |             p = Phone()
 91 |             loc_dict = p.find(phoneNum)
 92 |         if nation!='CHN':
 93 |             x = phonenumbers.parse(phoneNum, 'GB')
 94 |             if phonenumbers.is_possible_number(x):
 95 |                 loc_dict = x
 96 |         # print(loc_dict)
 97 |         return loc_dict
 98 | 
 99 |     def get_location(self, word_pos_list):
100 |         """
101 |         get location by the pos of the word, such as 'ns'
102 |         eg: get_location('内蒙古赤峰市松山区')
103 | 
104 | 
105 |         :param: word_pos_list<list>
106 |         :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
107 | 
108 |         """
109 |         location_list = []
110 | 
111 |         for i,t in enumerate(word_pos_list):
112 |             word = t[0]
113 |             nature = t[1]
114 |             if nature == 'ns':
115 |                 loc_tmp = word
116 |                 count = i + 1
117 |                 while count < len(word_pos_list):
118 |                     next_word_pos = word_pos_list[count]
119 |                     next_pos = next_word_pos[1]
120 |                     next_word = next_word_pos[0]
121 |                     if next_pos=='ns' or 'n' == next_pos[0]:
122 |                         loc_tmp += next_word
123 |                     else:
124 |                         break
125 |                     count += 1
126 |                 location_list.append(loc_tmp)
127 | 
128 |         return location_list # max(location_list)
129 | 
130 |     def extract_locations(self, text):
131 |         """
132 |         extract locations by from texts
133 |         eg: extract_locations('我家住在陕西省安康市汉滨区。')
134 | 
135 | 
136 |         :param: raw_text<string>
137 |         :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
138 | 
139 |         """
140 |         seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
141 |         location_list = self.get_location(seg_list)
142 |         return location_list
143 | 
144 |     def replace_cellphoneNum(self, text):
145 |         """
146 |         remove cellphone number from texts. If text contains cellphone No., the extract_time will report errors.
147 |         hence, we remove it here.
148 |         eg: extract_locations('我家住在陕西省安康市汉滨区，我的手机号是181-0006-5143。')
149 | 
150 | 
151 |         :param: raw_text<string>
152 |         :return: text_without_cellphone<string> eg: '我家住在陕西省安康市汉滨区，我的手机号是。'
153 | 
154 |         """
155 |         cell_phones = self.extract_cellphone(text,'CHN')
156 |         for phone_num in cell_phones:
157 |             text = text.replace(phone_num,'')
158 |         return text
159 | 
160 |     def extract_time(self, text):
161 |         """
162 |         extract timestamp from texts
163 |         eg: extract_time('我于2018年1月1日获得1000万美金奖励。')
164 | 
165 | 
166 |         :param: raw_text<string>
167 |         :return: time_info<time_dict> eg: {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"}
168 | 
169 |         """
170 |         tmp_text = self.replace_cellphoneNum(text)
171 |         tn = TimeNormalizer()
172 |         res = tn.parse(target=tmp_text)  # target为待分析语句，timeBase为基准时间默认是当前时间
173 |         return res
174 | 
175 |     def extract_name(self, text):
176 |         """
177 |         extract chinese names from texts
178 |         eg: extract_time('急寻王龙，短发，王龙，男，丢失发型短发，...如有线索，请迅速与警方联系：19909156745')
179 | 
180 | 
181 |         :param: raw_text<string>
182 |         :return: name_list<list> eg: ['王龙', '王龙']
183 | 
184 |         """
185 |         seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
186 |         names = []
187 |         for ele_tup in seg_list:
188 |             if 'nr' in ele_tup[1]:
189 |                 names.append(ele_tup[0])
190 |                 # print(ele_tup[0],ele_tup[1])
191 |         return self.most_common(names)
192 | 
193 |     def most_common(self, content_list):
194 |         """
195 |         return the most common element in a list
196 |         eg: extract_time(['王龙'，'王龙'，'李二狗'])
197 | 
198 | 
199 |         :param: content_list<list>
200 |         :return: name<string> eg: '王龙'
201 |         """
202 |         return max(set(content_list), key=content_list.count)
203 | 
204 | 
205 | 
206 | 
207 | 
208 | if __name__ == '__main__':
209 | 
210 |     text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发，...如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com'
211 |     ex = extractor()
212 | 
213 |     emails = ex.extract_email(text)
214 |     cellphones = ex.extract_cellphone(text,nation='CHN')
215 |     cell_loc = []
216 |     for cell in cellphones:
217 |         cell_loc.append(ex.extract_cellphone_location(cell,'CHN'))
218 | 
219 |     locations = ex.extract_locations(text)
220 |     times = ex.extract_time(text)
221 |     names = ex.extract_name(text)
222 | 
223 |     result_dict = {}
224 |     result_dict['email'] = emails
225 |     result_dict['cellphone'] = cellphones
226 |     result_dict['cellphone_location'] = cell_loc
227 |     result_dict['location'] = locations
228 |     result_dict['time'] = times
229 |     result_dict['name'] = names
230 |     for key in result_dict.keys():
231 |         print(key,result_dict[key])


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/readme.md:
--------------------------------------------------------------------------------
  1 | ## This is a Chinese nlp package, which can extract information from texts.
  2 | 
  3 | [![pypiv](https://img.shields.io/pypi/v/rake-nltk.svg)](https://pypi.org/project/cocoNLP/)
  4 | [![Thanks](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://www.zhihu.com/people/mountain-blue-64/posts)
  5 | 
  6 | ## It is developed for a public welfare program, a weibo robot [@寻人微博](https://weibo.com/xrwbyangyangfuture).
  7 | 
  8 | ## installation
  9 | It works well on macOS Mojave with python=3.6.
 10 | ```
 11 | pip install cocoNLP
 12 | ```
 13 | 
 14 | ## Directly from the repository
 15 | 
 16 | ```
 17 | git clone https://github.com/fighting41love/cocoNLP.git
 18 | cd cocoNLP
 19 | python setup.py install
 20 | ```
 21 | 
 22 | ## Quick start
 23 | 
 24 | ### Extract basic information from texts
 25 | ```
 26 | >>> from cocoNLP.extractor import extractor
 27 | 
 28 | >>> ex = extractor()
 29 | 
 30 | >>> text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发，...如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com'
 31 | 
 32 | # 抽取邮箱
 33 | >>> emails = ex.extract_email(text)
 34 | >>> print(emails)
 35 | 
 36 | ['baizhantang@sina.com.cn', 'yangyangfuture@gmail.com.cn']
 37 | ```
 38 | 
 39 | ```
 40 | # 抽取手机号
 41 | >>> cellphones = ex.extract_cellphone(text,nation='CHN')
 42 | >>> print(cellphones)
 43 | 
 44 | ['18100065143', '13261562938']
 45 | ```
 46 | 
 47 | ```
 48 | # 抽取手机归属地、运营商
 49 | >>> cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones]
 50 | >>> print(cell_locs)
 51 | 
 52 | cellphone_location [{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}]
 53 | ```
 54 | 
 55 | ```
 56 | # 抽取地址信息
 57 | >>> locations = ex.extract_locations(text)
 58 | >>> print(locations)
 59 | ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
 60 | ```
 61 | ```
 62 | # 抽取时间点
 63 | >>> times = ex.extract_time(text)
 64 | >>> print(times)
 65 | time {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"}
 66 | ```
 67 | ```
 68 | # 抽取人名
 69 | >>> name = ex.extract_name(text)
 70 | >>> print(name)
 71 | 特朗普
 72 | 
 73 | ```
 74 | ### Extract phrases from texts
 75 | ```
 76 | >>> from cocoNLP.config.phrase import rake
 77 | 
 78 | >>> r = rake.Rake()
 79 | 
 80 | >>> # Extraction given the list of strings where each string is a sentence.
 81 | >>> r.extract_keywords_from_sentences(['2015年5月11日，“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、华某（25岁）名誉权纠纷及成某（38岁）名誉权纠纷二案，要求被诉人公开赔礼道歉、恢复名誉、删除相关视频、断开转载该视频的链接，赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀法院已经受理了这两起案件。原告章泽天诉称，她被许多网友称为“奶茶妹妹”，在网络上获得相当的关注度。2014年4月18日，北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并发布了名为“奶茶妹妹恋情或为炒作，百万炒作团队浮出水面”的视频，该段视频捏造包括“奶茶妹妹走红，实为幕后商业策划”、“100万，奶茶妹妹花巨资，请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论，包括声称其就是原告聘请的“幕后推手和炒作专家”，原告曾花100万聘请其为之宣传策划，原告与刘强东的恋情系两者合作的结果等等。
 82 | '],2,4)
 83 | 
 84 | >>> # To get keyword phrases ranked highest to lowest.
 85 | >>> ranked_words = r.get_ranked_phrases()
 86 | 
 87 | >>> # To get keyword phrases ranked highest to lowest with scores.
 88 | >>> ranked_words_score = r.get_ranked_phrases_with_scores()
 89 | 
 90 | >>> for ele in ranked_words_score:
 91 | >>>     print(ele)
 92 | 
 93 | (16.0, '要求 被诉人 公开 赔礼道歉')
 94 | (15.0, '上述 节目 中 捏造')
 95 | (14.5, '该段 视频 捏造 包括')
 96 | (14.0, '实为 幕后 商业 策划')
 97 | (14.0, '奶茶 妹妹 花 巨资')
 98 | (9.5, '删除 相关 视频')
 99 | (9.0, '请人 策划 走红')
100 | (9.0, '网络 上 获得')
101 | (9.0, '想方设法 地转 学院')
102 | (9.0, '奶茶 妹妹 走红')
103 | (9.0, '名誉权 纠纷 及成')
104 | (9.0, '名誉权 纠纷 二案')
105 | (8.5, '奶茶 妹妹 恋情')
106 | (8.5, '原告 章泽天 诉称')
107 | (6.0, '奶茶 妹妹')
108 | (5.0, '节目 制作')
109 | (5.0, '幕后 推手')
110 | (5.0, '宣传 策划')
111 | ```
112 | 
113 | 
114 | ## References
115 | 
116 | This is a python implementation of the algorithm as mentioned in paper [Automatic keyword extraction from individual documents by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley](https://www.researchgate.net/profile/Stuart_Rose/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents/links/55071c570cf27e990e04c8bb.pdf)
117 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | pyhanlp
3 | phone
4 | phonenumbers
5 | regex
6 | arrow
7 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | tag_build = 
3 | tag_date = 0
4 | 
5 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/setup.py:
--------------------------------------------------------------------------------
 1 | from os import path
 2 | 
 3 | from setuptools import setup
 4 | from setuptools.command.develop import develop
 5 | from setuptools.command.install import install
 6 | from subprocess import call
 7 | 
 8 | 
 9 | here = path.abspath(path.dirname(__file__))
10 | 
11 | 
12 | class PostDevelop(develop):
13 |     """Post-installation for development mode."""
14 | 
15 |     def run(self):
16 |         develop.run(self)
17 | 
18 | 
19 | class PostInstall(install):
20 |     """Post-installation for production mode."""
21 | 
22 |     def run(self):
23 |         install.run(self)
24 | 
25 | 
26 | class MyInstall(install):
27 |     def run(self):
28 |         call(["pip install -r requirements.txt --no-clean"], shell=True)
29 |         install.run(self)
30 | 
31 | # Get package and author details.
32 | about = {}
33 | with open(path.join(here, "cocoNLP", "__version__.py")) as f:
34 |     exec(f.read(), about)
35 | 
36 | setup(
37 |     # Name of the module
38 |     name="cocoNLP",
39 |     # Details
40 |     version=about["__version__"],
41 |     description=about["__description__"],
42 |     #long_description=long_description,
43 |     # The project's main homepage.
44 |     url=about["__url__"],
45 |     # Author details
46 |     author=about["__author__"],
47 |     author_email=about["__author_email__"],
48 |     # License
49 |     license=about["__license__"],
50 |     packages=["cocoNLP"],
51 |     test_suite="tests",
52 |     keywords="nlp text-mining information extraction",
53 |     include_package_data=True,
54 |     classifiers=[
55 |         # Intended Audience.
56 |         "Intended Audience :: Developers",
57 |         "Intended Audience :: Education",
58 |         # License.
59 |         "License :: OSI Approved :: MIT License",
60 |         # Project maturity.
61 |         "Development Status :: 3 - Alpha",
62 |         # Operating Systems.
63 |         "Operating System :: POSIX",
64 |         # Supported Languages.
65 |         "Programming Language :: Python :: 2.7",
66 |         "Programming Language :: Python :: 3.4",
67 |         "Programming Language :: Python :: 3.5",
68 |         "Programming Language :: Python :: 3.6",
69 |         # Topic tags.
70 |         "Topic :: Software Development :: Build Tools",
71 |         "Topic :: Software Development :: Libraries :: Python Modules",
72 |     ],
73 |     setup_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"],
74 |     install_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"],
75 |     cmdclass={'install': MyInstall},
76 | )
77 | 


--------------------------------------------------------------------------------
/dist/cocoNLP-0.0.9/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from cocoNLP.extractor import extractor
 3 | 
 4 | ex = extractor()
 5 | 
 6 | text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发，...如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com'
 7 | 
 8 | # 抽取邮箱
 9 | emails = ex.extract_email(text)
10 | print(emails)
11 | 
12 | # 抽取手机号
13 | cellphones = ex.extract_cellphone(text,nation='CHN')
14 | print(cellphones)
15 | 
16 | # 抽取手机归属地、运营商
17 | cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones]
18 | print(cell_locs)
19 | 
20 | # 抽取地址信息
21 | locations = ex.extract_locations(text)
22 | print(locations)
23 | 
24 | # 抽取时间点
25 | times = ex.extract_time(text)
26 | print(times)
27 | 
28 | # 抽取人名
29 | name = ex.extract_name(text)
30 | print(name)
31 | 
32 | 
33 | from cocoNLP.config.phrase import rake
34 | 
35 | r = rake.Rake()
36 | 
37 | # Extraction given the list of strings where each string is a sentence.
38 | r.extract_keywords_from_sentences(['2015年5月11日，“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、'
39 |                                    '华某（25岁）名誉权纠纷及成某（38岁）名誉权纠纷二案，要求被诉人公开赔礼道歉、恢复名誉、'
40 |                                    '删除相关视频、断开转载该视频的链接，赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀'
41 |                                    '法院已经受理了这两起案件。原告章泽天诉称，她被许多网友称为“奶茶妹妹”，在网络上获得相当的'
42 |                                    '关注度。2014年4月18日，北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并'
43 |                                    '发布了名为“奶茶妹妹恋情或为炒作，百万炒作团队浮出水面”的视频，该段视频捏造包括“奶茶妹妹走红'
44 |                                    '，实为幕后商业策划”、“100万，奶茶妹妹花巨资，请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、'
45 |                                    '想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论，包括声称其就是原告聘请的'
46 |                                    '“幕后推手和炒作专家”，原告曾花100万聘请其为之宣传策划，原告与刘强东的恋情系两者合作的结果等等。'],2,4)
47 | 
48 | # To get keyword phrases ranked highest to lowest.
49 | ranked_words = r.get_ranked_phrases()
50 | 
51 | # To get keyword phrases ranked highest to lowest with scores.
52 | ranked_words_score = r.get_ranked_phrases_with_scores()
53 | 
54 | for ele in ranked_words_score:
55 |     print(ele)
56 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | ## This is a Chinese nlp package, which can extract information from texts.
  2 | 
  3 | [![pypiv](https://img.shields.io/pypi/v/rake-nltk.svg)](https://pypi.org/project/cocoNLP/)
  4 | [![Thanks](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://www.zhihu.com/people/mountain-blue-64/posts)
  5 | 
  6 | ## It is developed for a public welfare program, a weibo robot [@寻人微博](https://weibo.com/xrwbyangyangfuture).
  7 | 
  8 | ## installation
  9 | It works well on macOS Mojave with python=3.6.
 10 | ```
 11 | pip install cocoNLP
 12 | ```
 13 | 
 14 | ## Directly from the repository
 15 | 
 16 | ```
 17 | git clone https://github.com/fighting41love/cocoNLP.git
 18 | cd cocoNLP
 19 | python setup.py install
 20 | ```
 21 | 
 22 | ## Quick start
 23 | 
 24 | ### Extract basic information from texts
 25 | ```
 26 | >>> from cocoNLP.extractor import extractor
 27 | 
 28 | >>> ex = extractor()
 29 | 
 30 | >>> text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。丢失发型短发，...如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com'
 31 | 
 32 | # 抽取邮箱
 33 | >>> emails = ex.extract_email(text)
 34 | >>> print(emails)
 35 | 
 36 | ['baizhantang@sina.com.cn', 'yangyangfuture@gmail.com.cn']
 37 | ```
 38 | 
 39 | ```
 40 | # 抽取手机号
 41 | >>> cellphones = ex.extract_cellphone(text,nation='CHN')
 42 | >>> print(cellphones)
 43 | 
 44 | ['18100065143', '13261562938']
 45 | ```
 46 | 
 47 | ```
 48 | # 抽取身份证号
 49 | >>> ids = ex.extract_ids(text)
 50 | >>> print(ids)
 51 | 
 52 | ['410105196904010537']
 53 | ```
 54 | 
 55 | ```
 56 | # 抽取手机归属地、运营商
 57 | >>> cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones]
 58 | >>> print(cell_locs)
 59 | 
 60 | cellphone_location [{'phone': '18100065143', 'province': '上海', 'city': '上海', 'zip_code': '200000', 'area_code': '021', 'phone_type': '电信'}]
 61 | ```
 62 | 
 63 | ```
 64 | # 抽取地址信息
 65 | >>> locations = ex.extract_locations(text)
 66 | >>> print(locations)
 67 | ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
 68 | ```
 69 | ```
 70 | # 抽取时间点
 71 | >>> times = ex.extract_time(text)
 72 | >>> print(times)
 73 | time {"type": "timestamp", "timestamp": "2018-11-27 11:00:00"}
 74 | ```
 75 | ```
 76 | # 抽取人名
 77 | >>> name = ex.extract_name(text)
 78 | >>> print(name)
 79 | 特朗普
 80 | 
 81 | ```
 82 | ### Extract phrases from texts
 83 | ```
 84 | >>> from cocoNLP.config.phrase import rake
 85 | 
 86 | >>> r = rake.Rake()
 87 | 
 88 | >>> # Extraction given the list of strings where each string is a sentence.
 89 | >>> r.extract_keywords_from_sentences(['2015年5月11日，“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、华某（25岁）名誉权纠纷及成某（38岁）名誉权纠纷二案，要求被诉人公开赔礼道歉、恢复名誉、删除相关视频、断开转载该视频的链接，赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀法院已经受理了这两起案件。原告章泽天诉称，她被许多网友称为“奶茶妹妹”，在网络上获得相当的关注度。2014年4月18日，北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并发布了名为“奶茶妹妹恋情或为炒作，百万炒作团队浮出水面”的视频，该段视频捏造包括“奶茶妹妹走红，实为幕后商业策划”、“100万，奶茶妹妹花巨资，请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论，包括声称其就是原告聘请的“幕后推手和炒作专家”，原告曾花100万聘请其为之宣传策划，原告与刘强东的恋情系两者合作的结果等等。
 90 | '],2,4)
 91 | 
 92 | >>> # To get keyword phrases ranked highest to lowest.
 93 | >>> ranked_words = r.get_ranked_phrases()
 94 | 
 95 | >>> # To get keyword phrases ranked highest to lowest with scores.
 96 | >>> ranked_words_score = r.get_ranked_phrases_with_scores()
 97 | 
 98 | >>> for ele in ranked_words_score:
 99 | >>>     print(ele)
100 | 
101 | (16.0, '要求 被诉人 公开 赔礼道歉')
102 | (15.0, '上述 节目 中 捏造')
103 | (14.5, '该段 视频 捏造 包括')
104 | (14.0, '实为 幕后 商业 策划')
105 | (14.0, '奶茶 妹妹 花 巨资')
106 | (9.5, '删除 相关 视频')
107 | (9.0, '请人 策划 走红')
108 | (9.0, '网络 上 获得')
109 | (9.0, '想方设法 地转 学院')
110 | (9.0, '奶茶 妹妹 走红')
111 | (9.0, '名誉权 纠纷 及成')
112 | (9.0, '名誉权 纠纷 二案')
113 | (8.5, '奶茶 妹妹 恋情')
114 | (8.5, '原告 章泽天 诉称')
115 | (6.0, '奶茶 妹妹')
116 | (5.0, '节目 制作')
117 | (5.0, '幕后 推手')
118 | (5.0, '宣传 策划')
119 | ```
120 | 
121 | 
122 | ## References
123 | 
124 | This is a python implementation of the algorithm as mentioned in paper [Automatic keyword extraction from individual documents by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley](https://www.researchgate.net/profile/Stuart_Rose/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents/links/55071c570cf27e990e04c8bb.pdf)
125 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | pyhanlp
3 | phone
4 | phonenumbers
5 | regex
6 | arrow==0.14.3
7 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from os import path
 2 | 
 3 | from setuptools import setup
 4 | from setuptools.command.develop import develop
 5 | from setuptools.command.install import install
 6 | from subprocess import call
 7 | 
 8 | 
 9 | here = path.abspath(path.dirname(__file__))
10 | 
11 | 
12 | class PostDevelop(develop):
13 |     """Post-installation for development mode."""
14 | 
15 |     def run(self):
16 |         develop.run(self)
17 | 
18 | 
19 | class PostInstall(install):
20 |     """Post-installation for production mode."""
21 | 
22 |     def run(self):
23 |         install.run(self)
24 | 
25 | 
26 | class MyInstall(install):
27 |     def run(self):
28 |         call(["pip install -r requirements.txt --no-clean"], shell=True)
29 |         install.run(self)
30 | 
31 | # Get package and author details.
32 | about = {}
33 | with open(path.join(here, "cocoNLP", "__version__.py")) as f:
34 |     exec(f.read(), about)
35 | 
36 | setup(
37 |     # Name of the module
38 |     name="cocoNLP",
39 |     # Details
40 |     version=about["__version__"],
41 |     description=about["__description__"],
42 |     #long_description=long_description,
43 |     # The project's main homepage.
44 |     url=about["__url__"],
45 |     # Author details
46 |     author=about["__author__"],
47 |     author_email=about["__author_email__"],
48 |     # License
49 |     license=about["__license__"],
50 |     packages=["cocoNLP"],
51 |     test_suite="tests",
52 |     keywords="nlp text-mining information extraction",
53 |     include_package_data=True,
54 |     classifiers=[
55 |         # Intended Audience.
56 |         "Intended Audience :: Developers",
57 |         "Intended Audience :: Education",
58 |         # License.
59 |         "License :: OSI Approved :: MIT License",
60 |         # Project maturity.
61 |         "Development Status :: 3 - Alpha",
62 |         # Operating Systems.
63 |         "Operating System :: POSIX",
64 |         # Supported Languages.
65 |         "Programming Language :: Python :: 2.7",
66 |         "Programming Language :: Python :: 3.4",
67 |         "Programming Language :: Python :: 3.5",
68 |         "Programming Language :: Python :: 3.6",
69 |         # Topic tags.
70 |         "Topic :: Software Development :: Build Tools",
71 |         "Topic :: Software Development :: Libraries :: Python Modules",
72 |     ],
73 |     setup_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"],
74 |     install_requires=["jieba","pyhanlp","phone","phonenumbers","regex","arrow"],
75 |     cmdclass={'install': MyInstall},
76 | )
77 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from cocoNLP.extractor import extractor
 3 | 
 4 | ex = extractor()
 5 | 
 6 | text = '急寻特朗普，男孩，于2018年11月27号11时在陕西省安康市汉滨区走失。身份证号码410105196904010537丢失发型短发，...' \
 7 |        '如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com13673630861'
 8 | text = '急寻特朗普，男孩，于2018年11月27号11时在鼓楼区走失。身份证号码410105196904010537丢失发型短发，...' \
 9 |        '如有线索，请迅速与警方联系：18100065143，132-6156-2938，baizhantang@sina.com.cn 和yangyangfuture at gmail dot com13673630861'
10 | # text = '4点15分钟后的番茄炒蛋'
11 | # text = '我下午2点15分30秒的番茄炒蛋'
12 | # text = '晚上8点15的番茄炒蛋'
13 | 
14 | 
15 | # 抽取邮箱
16 | emails = ex.extract_email(text)
17 | print(emails)
18 | 
19 | # 抽取手机号
20 | cellphones = ex.extract_cellphone(text,nation='CHN')
21 | print(cellphones)
22 | 
23 | # 抽取身份证号
24 | ids = ex.extract_ids(text)
25 | print(ids)
26 | 
27 | # 抽取手机归属地、运营商
28 | cell_locs = [ex.extract_cellphone_location(cell,'CHN') for cell in cellphones]
29 | print(cell_locs)
30 | 
31 | # 抽取地址信息
32 | locations = ex.extract_locations(text)
33 | print(locations)
34 | 
35 | # 抽取时间点
36 | times = ex.extract_time(text)
37 | print(times)
38 | 
39 | # 抽取人名
40 | name = ex.extract_name(text)
41 | print(name)
42 | 
43 | 
44 | from cocoNLP.config.phrase import rake
45 | 
46 | r = rake.Rake()
47 | 
48 | # Extraction given the list of strings where each string is a sentence.
49 | r.extract_keywords_from_text('2015年5月11日，“奶茶妹妹”章泽天分别起诉北京搜狐互联网信息服务有限公司、'
50 |                                    '华某（25岁）名誉权纠纷及成某（38岁）名誉权纠纷二案，要求被诉人公开赔礼道歉、恢复名誉、'
51 |                                    '删除相关视频、断开转载该视频的链接，赔偿经济损失、精神损害抚慰金共计170万元。北京市海淀'
52 |                                    '法院已经受理了这两起案件。原告章泽天诉称，她被许多网友称为“奶茶妹妹”，在网络上获得相当的'
53 |                                    '关注度。2014年4月18日，北京搜狐互联网信息服务有限公司的“搜狐视频娱乐播报调查”节目制作并'
54 |                                    '发布了名为“奶茶妹妹恋情或为炒作，百万炒作团队浮出水面”的视频，该段视频捏造包括“奶茶妹妹走红'
55 |                                    '，实为幕后商业策划”、“100万，奶茶妹妹花巨资，请人策划走红”、“奶茶妹妹在清华大学挂科、作弊、'
56 |                                    '想方设法地转学院”等等。华某在上述节目中捏造了大量的对原告的虚假言论，包括声称其就是原告聘请的'
57 |                                    '“幕后推手和炒作专家”，原告曾花100万聘请其为之宣传策划，原告与刘强东的恋情系两者合作的结果等等。',2,4)
58 | 
59 | # r.extract_keywords_from_sentences(['如果您认识的人你要通知他一下就行了好吧对吧因为我们这边都，如果您认识的人你要通知他一下就行了好吧对波因为我们这边都'],2,4)
60 | 
61 | # To get keyword phrases ranked highest to lowest.
62 | ranked_words = r.get_ranked_phrases()
63 | 
64 | # To get keyword phrases ranked highest to lowest with scores.
65 | ranked_words_score = r.get_ranked_phrases_with_scores()
66 | print(ranked_words_score)
67 | for ele in ranked_words_score:
68 |     print(ele)
69 | 


--------------------------------------------------------------------------------