├── .dockerignore ├── .github └── workflows │ └── main.yml ├── .gitignore ├── Dockerfile ├── README.md ├── main.py ├── pyunit_address ├── __init__.py ├── address.bz2 ├── address.py ├── addressType.py ├── correctionAddress.py ├── findAddress.py ├── hot.zip ├── multitree.py ├── supplementAddress.py └── tool.py ├── requirements.txt ├── setup.py └── test.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ 141 | .idea/ 142 | .github/ 143 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: ADDRESS 2 | 3 | on: push 4 | 5 | jobs: 6 | deploy: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - name: 下载代码 12 | uses: actions/checkout@master 13 | 14 | - name: 下载Python环境 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: '3.8' 18 | 19 | - name: 初始化Python 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools wheel twine 23 | 24 | - name: 构建Python包并上传到PYPI 25 | env: 26 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 27 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 28 | run: | 29 | pip install -r requirements.txt 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | 33 | - name: 构建Docker并上传到DockerHub 34 | uses: docker/build-push-action@v1.1.0 35 | with: 36 | username: ${{ secrets.DOCKER_USERNAME }} 37 | password: ${{ secrets.DOCKER_PASSWORD }} 38 | repository: jtyoui/pyunit-address 39 | tag_with_ref: true 40 | tag_with_sha: true -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .idea/* 2 | venv/* 3 | *.pyc 4 | build/* 5 | dist/* 6 | *.egg-info/* 7 | .pytest_cache/* 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.8-alpine3.10 2 | 3 | MAINTAINER Jytoui 4 | 5 | COPY requirements.txt /app/requirements.txt 6 | 7 | # 加入pip源 8 | ENV pypi https://pypi.douban.com/simple 9 | 10 | # 更换APK源 11 | RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories 12 | 13 | # 安装Python3环境 14 | RUN apk add --no-cache --virtual mypacks \ 15 | gcc \ 16 | python3-dev \ 17 | linux-headers \ 18 | musl-dev \ 19 | && pip3 install --no-cache-dir -r /app/requirements.txt -i ${pypi} && \ 20 | apk del mypacks 21 | 22 | COPY ./pyunit_address /app/pyunit_address 23 | COPY ./main.py /app/main.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **PyUnit-Address** [![](https://gitee.com/tyoui/logo/raw/master/logo/photolog.png)][1] 2 | 3 | ## 字符串地址查询,支持自定义地址词库 4 | 5 | [![](https://img.shields.io/badge/Python-3.7-green.svg)](https://pypi.org/project/pyunit-address/) 6 | 7 | ## 安装 8 | 9 | pip install pyunit-address 10 | 11 | ## 说明 12 | 13 | 该算法有两个词库,一个是全国五级地址,统计时间是2019年。这个地址库是默认加载。不能删除也不能替换。 14 | 如果需要提取非规则的地址,则实用深度模型: https://github.com/PyUnit/pyunit-ner 15 | 建议两者一起使用,互相补足。 16 | 17 | ## 测试 18 | 19 | ```python 20 | from pyunit_address import * 21 | import time 22 | 23 | address = Address(is_max_address=True) 24 | address.add_vague_text(['红花岗', '花溪']) # 加入地址名称 25 | address.add_vague_text('贵州省-遵义市-遵义县-虾子镇-乐安村-乐石台') # 加入一串有顺序的地址 26 | address.add_vague_text('自定义词库.txt') # 加载词库文件,词库文件中的每一行,可以是一串顺序地址,也可以是一个地址 27 | 28 | 29 | def all_test(): 30 | string_ = 
'我家在红花岗,你家在贵州贵阳花溪区,他家在贵州省遵义市花溪区' 31 | finds = find_address(address, string_) 32 | for find in finds: 33 | print() 34 | print('地址', find) 35 | print('补全地址', supplement_address(address, find)) 36 | print('纠错地址', correct_address(address, find)) 37 | print('--------------------------') 38 | 39 | 40 | # 地址 红花岗 41 | # 补全地址 ['贵州省-遵义市-红花岗区'] 42 | # 纠错地址 贵州省-遵义市-红花岗区 43 | # -------------------------- 44 | # 45 | # 地址 贵州贵阳花溪区 46 | # 补全地址 ['贵州省-贵阳市-花溪区'] 47 | # 纠错地址 贵州省-贵阳市-花溪区 48 | # -------------------------- 49 | # 50 | # 地址 贵州省遵义市花溪区 注:这个地址是错误的 51 | # 补全地址 [] 注:错误的地址无法补全 52 | # 纠错地址 贵州省-贵阳市-花溪区 注:错误的地址被纠正为对的地址 53 | # -------------------------- 54 | 55 | 56 | if __name__ == '__main__': 57 | start = time.time() 58 | all_test() 59 | print(time.time() - start) # 0.0002001047134399414秒 60 | 61 | ``` 62 | 63 | ## 查询地址 64 | 65 | ```python 66 | from pyunit_address import Address, find_address 67 | 68 | 69 | def test(): 70 | address = Address(is_max_address=True) 71 | 72 | # 添加词库,可以是一个字符串、可以是列表字符串、可以是词库文件,一个词语占一行 73 | address.add_vague_text('红花岗') # 在默认词库上追加地址词库 74 | address.add_vague_text('贵州省-遵义市-遵义县-虾子镇-乐安村') # 添加补全地址 75 | address.add_vague_text(['花溪', '贵州省-遵义市-遵义县-虾子镇-乐安村']) # 加载词库列表,替换默认词库 76 | address.add_vague_text('自定义词库.txt') # 加载词库文件,替换默认词库 77 | af = find_address(address, '我家在贵州遵义红花岗区') 78 | print(af) 79 | 80 | 81 | if __name__ == '__main__': 82 | test() 83 | ``` 84 | 85 | ### 自动补全地址:输入一句话 86 | 87 | ```python 88 | from pyunit_address import Address, supplement_address 89 | 90 | 91 | def test_supplement_address(): 92 | address = Address(is_max_address=True) 93 | asu = supplement_address(address, '我家在遵义县') # [贵州省-遵义市-遵义县] 94 | print(asu) 95 | 96 | 97 | if __name__ == '__main__': 98 | test_supplement_address() 99 | ``` 100 | 101 | ### 自动纠正地址 102 | 103 | ```python 104 | from pyunit_address import Address, correct_address 105 | 106 | 107 | def correct_address_test(): 108 | address = Address(is_max_address=True) 109 | print(correct_address(address, '贵州省遵义市花溪区')) # 贵州省-贵阳市-花溪区 
110 | 111 | 112 | if __name__ == '__main__': 113 | correct_address_test() 114 | ``` 115 | 116 | ## Docker部署 117 | 118 | docker pull jtyoui/pyunit-address 119 | docker run -d -P jtyoui/pyunit-address 120 | 121 | ## Swagger在线api文档 122 | 123 | http://localhost:xxx/docs 124 | 125 | ### 寻找地址的请求参数 126 | 127 | |**参数名**|**类型**|**是否可以为空**|**说明**| 128 | |------|------|-------|--------| 129 | |data|string|YES|输入一句带有地址的句子| 130 | 131 | ### 请求示例 132 | 133 | > #### Python3 Requests测试 134 | 135 | ```python 136 | import requests 137 | 138 | url = "http://127.0.0.1:2312/pyunit/address/find" 139 | data = { 140 | 'data': '我家在贵州龙里' 141 | } 142 | response = requests.get(url, params=data).json() 143 | print(response) 144 | ``` 145 | 146 | > #### 返回结果 147 | 148 | ```json 149 | { 150 | "code": 200, 151 | "result": [ 152 | { 153 | "address": "龙里", 154 | "correct_address": "贵州省-黔南布依族苗族自治州-龙里县", 155 | "supplement_address": [ 156 | { 157 | "key": "贵州省-黔南布依族苗族自治州-龙里县" 158 | } 159 | ], 160 | "type": "区县" 161 | } 162 | ] 163 | } 164 | ``` 165 | 166 | ### 增加地址词库请求参数 167 | 168 | |**参数名**|**类型**|**是否可以为空**|**说明**| 169 | |------|------|-------|--------| 170 | |data|string|YES|JSON 列表字符串:要增加的地址,有顺序的地址用 - 分开| 171 | 172 | ### 请求示例 173 | 174 | > #### Python3 Requests测试 175 | 176 | ```python 177 | import json 178 | import requests 179 | 180 | url = "http://127.0.0.1:2312/pyunit/address/add" 181 | data = { 182 | 'data': json.dumps(['贵州省-贵阳市-观山湖区-观山湖公园', '金融大街', '小吃城']) 183 | } 184 | response = requests.get(url, params=data).json() 185 | print(response) 186 | ``` 187 | 188 | ### 删除地址词库请求参数 189 | 190 | |**参数名**|**类型**|**是否可以为空**|**说明**| 191 | |------|------|-------|--------| 192 | |data|string|YES|JSON 列表字符串:要删除的地址词语| 193 | 194 | ### 请求示例 195 | 196 | > #### Python3 Requests测试 197 | 198 | ```python 199 | import json 200 | 201 | import requests 202 | 203 | url = "http://127.0.0.1:2312/pyunit/address/del" 204 | data = { 205 | 'data': json.dumps(['金融大街', '小吃城']), 206 | } 207 | response = requests.get(url, params=data).json() 208 | print(response)
209 | ``` 210 | 211 | > #### 返回结果 212 | 213 | ```json 214 | { 215 | "code": 200, 216 | "result": "del success" 217 | } 218 | ``` 219 | 220 | ## TODO 221 | 222 | - [x] 自动寻找最长地址长度 223 | - [x] 全国五级地址新词库 224 | - [x] 支持自定义地址词库 225 | - [x] 不支持非规则地址 226 | - [x] 支持地址自动补全 227 | - [x] 支持快速高效搜索 228 | - [x] 支持纠错地址 229 | 230 | *** 231 | 232 | [1]: https://blog.jtyoui.com 233 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python3.8 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2020/5/7 上午11:27 4 | # @Author: Jtyoui@qq.com 5 | # @Notes : flask 启动 6 | import json 7 | import os 8 | from fastapi import FastAPI, Query 9 | from fastapi.middleware.cors import CORSMiddleware 10 | from pydantic import BaseModel 11 | from pyunit_address import AddressType 12 | 13 | docs_url = os.environ.get('DOCS', "/docs") 14 | app = FastAPI(title='地址抽取', description='基于规则抽取、地址抽取接口文档', version='1.0', docs_url=docs_url) 15 | 16 | app.add_middleware( 17 | CORSMiddleware, 18 | allow_origins=['*'], 19 | allow_credentials=True, 20 | allow_methods=["*"], 21 | allow_headers=["*"], 22 | ) 23 | address = AddressType() 24 | 25 | 26 | class ResponseModal(BaseModel): 27 | """返回格式类型""" 28 | msg: str = 'success' 29 | code: int = 200 30 | result: list = [] 31 | 32 | 33 | @app.get('/pyunit/address/find', description='查找地址接口', response_model=ResponseModal) 34 | def correct(data: str = Query(..., description='输入一句话')): 35 | try: 36 | result = address.address_message(data) 37 | return ResponseModal(result=result) 38 | except Exception as e: 39 | return ResponseModal(code=0, msg=str(e)) 40 | 41 | 42 | @app.get('/pyunit/address/add', response_model=ResponseModal) 43 | def adds(data: str = Query(..., description="增加地址,有顺序的地址用-分开。地址的格式:['贵州省-贵阳市-观山湖区-观山湖公园', '金融大街']")): 44 | try: 45 | words = json.loads(data) 46 | if isinstance(words, list): 47 | address.add_vague_text(words, '-') 48 | 
return ResponseModal(msg='add success') 49 | else: 50 | return ResponseModal(code=400, msg='data not is list') 51 | except Exception as e: 52 | return ResponseModal(code=0, msg=str(e)) 53 | 54 | 55 | @app.get('/pyunit/address/del', response_model=ResponseModal) 56 | def delete(data: str = Query(..., description="删除地址。地址的格式:['金融大街']")): 57 | try: 58 | words = json.loads(data) 59 | if isinstance(words, list): 60 | address.delete_vague_text(words) 61 | return ResponseModal(msg='del success') 62 | else: 63 | return ResponseModal(code=400, msg='data not is list') 64 | except Exception as e: 65 | return ResponseModal(code=0, msg=str(e)) 66 | -------------------------------------------------------------------------------- /pyunit_address/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2020/2/18 15:23 4 | # @Author: Jtyoui@qq.com 5 | from .address import Address # 地址初始化 6 | from .addressType import AddressType # 得到地址类型 7 | from .correctionAddress import correct_address # 纠错地址 8 | from .findAddress import find_address # 查询地址 9 | from .supplementAddress import supplement_address # 补全地址 10 | from .tool import * 11 | 12 | __version__ = '2021.3.31' 13 | __author__ = 'Jtyoui' 14 | __description__ = '全国五级地址查询' 15 | __email__ = 'jtyoui@qq.com' 16 | __names__ = 'pyUnit_address' 17 | __url__ = 'https://github.com/PyUnit/pyunit-address' 18 | -------------------------------------------------------------------------------- /pyunit_address/address.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyunits/pyunit-address/f754285feaaf136c802aaf4b8b554783e50262fb/pyunit_address/address.bz2 -------------------------------------------------------------------------------- /pyunit_address/address.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- 
class Address:
    """Address dictionary searchable through an Aho-Corasick automaton.

    The bundled ``address.bz2`` archive is decoded into a ``MultiTree`` whose
    root is the node ``'中国'``.  Every place name is registered in ``self.ac``
    and maps to the list of tree nodes carrying that name, so that a keyword
    found in free text can be turned back into full paths by walking the
    ``parent`` links of those nodes.
    """

    def __init__(self, is_max_address=False):
        """Load the bundled dictionary.

        :param is_max_address: stored on the instance as ``is_max_address``;
            no method of this class reads it.
        """
        # Administrative suffixes that people routinely drop when writing an
        # address; used as a regex character class in flag_ac_contain_key.
        self.suffix_stop = '[省市县区]'
        self.ac = ahocorasick.Automaton()
        self.count = itertools.count(0)
        self.is_max_address = is_max_address
        self.root = self._unzip()

    def delete_vague_text(self, words: 'str | Iterable[str]'):
        """Remove words from the automaton.

        :param words: a single word, the path of a UTF-8 text file with one
            word per line, or an iterable of either (handled recursively).

        Ordered addresses such as ``贵州省-贵阳市`` are not split: the string is
        removed as one literal key.  Only the exact key is removed; a
        suffix-stripped alias registered by ``flag_ac_contain_key`` stays.
        """
        if isinstance(words, str):
            words = words.strip()
            if not words:
                return  # blank line in a word file: nothing to remove
            # NOTE(review): any string naming a readable file is opened.  If
            # callers forward untrusted input, this lets them read local files.
            # isfile() rather than exists(): a word that merely matches a
            # directory name must be treated as a word, not passed to open().
            if os.path.isfile(words):
                with open(words, encoding='UTF-8') as fp:
                    for line in fp:
                        self.delete_vague_text(line)
            else:
                self.ac.remove_word(words)
        elif isinstance(words, Iterable):
            for word in words:
                self.delete_vague_text(word)

    def add_vague_text(self, words: 'str | Iterable[str]', separators='-'):
        """Add custom place names to the dictionary.

        :param words: a single word, an ordered address joined by
            ``separators`` (e.g. ``贵州省-遵义市-遵义县-虾子镇``), the path of a
            UTF-8 text file holding one such entry per line, or an iterable of
            any of these (handled recursively).
        :param separators: delimiter that marks an ordered address.

        An ordered address is merged into the tree under ``self.root`` and can
        therefore be completed later.  A plain word is registered as an
        isolated node without a parent: it can be found in text but has no
        path to complete.
        """
        if isinstance(words, str):
            words = words.strip()
            if not words:
                return  # blank line in a word file: never register ''
            # NOTE(review): same file-path caveat as in delete_vague_text.
            if os.path.isfile(words):
                with open(words, encoding='UTF-8') as fp:
                    for line in fp:
                        self.add_vague_text(line, separators)
            elif separators in words:
                self.root.add_value(words, self.ac, separators)
            else:
                node = MultiTree(value=words, parent=None)
                self.flag_ac_contain_key(words, node)
        elif isinstance(words, Iterable):
            for word in words:
                self.add_vague_text(word, separators)

    def recursion_load(self, root, address):
        """Recursively attach the decoded address data below ``root``.

        :param root: tree node that receives the new children.
        :param address: either a ``list`` of leaf names or a ``dict`` mapping
            a place name to its own nested ``list``/``dict`` of sub-places.
        """
        if isinstance(address, list):
            for addr in address:
                name = reset_key(addr)
                leaf = MultiTree(value=name, parent=root)
                root.add_children(leaf)
                # NOTE(review): the automaton entry points at ``root`` (the
                # parent), not at ``leaf``, unlike the dict branch below.  Kept
                # as in the original; confirm whether this is intentional.
                self.flag_ac_contain_key(name, root)
        else:
            for value, children in address.items():
                node = MultiTree(value=value, parent=root)
                root.add_children(node)
                self.flag_ac_contain_key(value, node)
                self.recursion_load(node, children)

    def _unzip(self) -> 'MultiTree':
        """Decode the bundled ``address.bz2`` and build the address tree.

        :return: the root node (value ``'中国'``) of the populated tree.
        """
        archive = os.path.join(os.path.dirname(__file__), 'address.bz2')
        # Read as bytes and decode explicitly: text-mode newline translation
        # would shift the character offsets used by the slice below.
        with bz2.BZ2File(archive) as bz:
            text = bz.read().decode('utf-8')
        # The JSON document is the slice [512:-1134] of the decoded text.
        # json.loads() no longer accepts ``encoding`` (TypeError on 3.9+).
        address = json.loads(text[512:-1134])
        root = MultiTree(value='中国', parent=None)
        self.recursion_load(root, address)
        return root

    def max_match_cut(self, sentence):
        """Cut ``sentence`` with forward maximum matching against the automaton.

        :param sentence: text to scan.
        :return: list of the dictionary words found, in order of appearance;
            candidates of length 1 are discarded.
        """
        words = ['']
        for char in sentence:
            # Extend the current candidate while it is still a prefix of some
            # dictionary word; otherwise start a new candidate at this char.
            if self.ac.match(words[-1] + char):
                words[-1] += char
            else:
                words.append(char)
        # A candidate may be only a prefix of a word, so check membership.
        return [w for w in words if len(w) > 1 and w in self.ac]

    def flag_ac_contain_key(self, key, obj):
        """Register ``obj`` in the automaton under ``key`` and its short form.

        The short form is ``key`` with every character of ``suffix_stop``
        removed (e.g. ``遵义市`` -> ``遵义``).  It is registered only when it
        differs from ``key`` and is longer than one character.

        Each name is handled independently: ``obj`` is appended to the node
        list already stored for that name, or a new list is created.  This
        keeps earlier nodes that share a short form (``遵义市`` and ``遵义县``
        both map to ``遵义``) instead of overwriting them.

        :param key: place name.
        :param obj: tree node to associate with the name.
        """
        stop_key = re.sub(self.suffix_stop, '', key)
        names = [key]
        if len(stop_key) > 1 and stop_key != key:
            names.append(stop_key)
        for name in names:
            if name in self.ac:
                self.ac.get(name).append(obj)
            else:
                self.ac.add_word(name, [obj])
class AddressType:
    """Find addresses and well-known places in text and classify them.

    ``self.hot`` maps a hot-spot name to its address and ``self.address`` is
    the ``Address`` dictionary used for rule-based extraction.
    """

    def __init__(self):
        """Load the hot-spot table and the address dictionary.

        If the environment variable ``PYUNIT_ADDRESS_HOT_FILE`` is set, it
        names a UTF-8 text file with one ``name address`` pair per line,
        separated by whitespace.  Otherwise the bundled ``hot.zip`` is used;
        its member ``hot.txt`` holds tab-separated pairs.
        """
        hot_file = os.environ.get('PYUNIT_ADDRESS_HOT_FILE', None)
        if hot_file:
            with open(hot_file, encoding='utf-8') as fp:
                self.hot = self._parse_hot_lines((line.strip() for line in fp), None)
        else:
            zip_file = os.path.join(os.path.dirname(__file__), 'hot.zip')
            with zipfile.ZipFile(zip_file, 'r') as zips:
                data = zips.read('hot.txt').decode('utf-8')
            # splitlines() accepts both CRLF and LF line endings.
            cleaned = (line.replace(' ', '') for line in data.splitlines())
            self.hot = self._parse_hot_lines(cleaned, '\t')
        self.address = Address()

    @staticmethod
    def _parse_hot_lines(lines, separator):
        """Turn ``name<separator>address`` lines into a dict; skip empty lines.

        :param lines: iterable of already-cleaned text lines.
        :param separator: field delimiter, or ``None`` for any whitespace.
        :raises ValueError: if a non-empty line does not hold exactly two fields.
        """
        hot = {}
        for line in lines:
            if not line:
                continue
            name, addr = line.split(separator)
            hot[name] = addr
        return hot

    @staticmethod
    def get_address_type(address):
        """Classify an address string by the administrative markers it contains.

        Markers are checked in this order, and the first hit wins:
        ``区``/``县`` -> ``'区县'``, ``路``/``号`` -> ``'街道'``,
        ``市`` -> ``'城市'``, ``省`` -> ``'省份'``.

        >>> AddressType.get_address_type('云南省')
        '省份'

        >>> AddressType.get_address_type('贵州省贵阳市')
        '城市'

        >>> AddressType.get_address_type('金阳路105号')
        '街道'

        >>> AddressType.get_address_type('观山湖区')
        '区县'

        :param address: address text; an empty value (``''``, ``[]``, ``None``)
            is accepted and yields ``None``.
        :return: the type label, or ``None`` when no marker is present.
        """
        if not address:
            return None
        if '区' in address or '县' in address:
            return '区县'
        if '路' in address or '号' in address:
            return '街道'
        if '市' in address:
            return '城市'
        if '省' in address:
            return '省份'
        return None

    def address_message(self, word):
        """Extract hot spots and addresses from ``word``.

        :param word: text to analyse; a falsy value yields an empty list.
        :return: list of dicts with the keys ``address``,
            ``supplement_address``, ``correct_address`` and ``type``.  Hot-spot
            hits come first (type ``'地理热点'``), then the addresses returned by
            ``find_address``.
        """
        import logging  # local import: only needed for the timing message

        if not word:
            return []
        start = time.time()
        result = [
            {'address': key, 'supplement_address': [], 'correct_address': ca, 'type': '地理热点'}
            for key, ca in self.hot.items()
            if key in word
        ]
        for find in find_address(self.address, word):
            sa = supplement_address(self.address, find)  # completed candidates
            ca = correct_address(self.address, find)  # corrected address
            result.append({
                'address': find,
                'supplement_address': [{'key': i} for i in sa],
                'correct_address': ca,
                'type': self.get_address_type(ca),
            })
        # Timing goes to the debug log instead of stdout.
        logging.getLogger(__name__).debug('address_message took %.6fs', time.time() - start)
        return result

    def add_vague_text(self, words, separators):
        """Forward to ``Address.add_vague_text`` on the wrapped dictionary."""
        self.address.add_vague_text(words, separators)

    def delete_vague_text(self, words):
        """Forward to ``Address.delete_vague_text`` on the wrapped dictionary."""
        self.address.delete_vague_text(words)
list(sorted(filter_address.items(), key=lambda x: x[1], reverse=True)) # 根据频率排序 45 | max_seq = sort_address[0][1] # 获取最大的频率 46 | for address, flag in sort_address: 47 | if max_seq == flag: 48 | max_seq_list.append(address) # 获取最大的频率组 49 | if max_length_address: 50 | return max(max_seq_list, key=lambda x: len(x)) # 返回字符串最长的一个地址 51 | else: 52 | return max_seq_list 53 | return [] 54 | -------------------------------------------------------------------------------- /pyunit_address/findAddress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2020/4/6 12:31 4 | # @Author: Jtyoui@qq.com 5 | # @interpret: 自动寻找地址 6 | import re 7 | from .tool import remove_subset 8 | 9 | 10 | def checkout_re_address(address, text): 11 | """检验是否是有效地址 12 | 检验: xx省xx市xx组团x栋|xx(号)楼xx层|x座 13 | xx的有效数字不超过5位数 14 | """ 15 | compiles = re.search(rf'{address}(\w+?)([a-zA-Z\d]+组团|[栋楼层座])', text, flags=re.S) 16 | if compiles: 17 | addr = compiles.group(1) 18 | if '的' not in addr: 19 | return compiles.group() 20 | else: 21 | compiles = re.search(rf'{address}(\w+?)\d+号', text, flags=re.S) 22 | return compiles.group() if compiles else False 23 | return False 24 | 25 | 26 | def find_address(cls, data: str, is_max_address=True, ignore_special_characters=True) -> list: 27 | """查找地址 28 | 29 | :param cls: Address类对象 30 | :param data: 查找地址数据 31 | :param is_max_address: 是否查找最长地址 32 | :param ignore_special_characters: 是否去掉特殊字符 33 | :return: 地址列表 34 | """ 35 | if ignore_special_characters: 36 | data = re.sub(r"[!#$%&'()*+,-./::,。?!;‘’、《》;<=>?@[\]^_`{|}~\s]", '', data) 37 | ls = cls.max_match_cut(data) 38 | if is_max_address: 39 | max_address = [] 40 | match = re.sub('|'.join(sorted(ls, key=lambda x: len(x), reverse=True)), lambda x: '*' * len(x.group()), data) 41 | for addr in re.finditer(r'[*]+', match): 42 | address = data[addr.start():addr.end()] 43 | address = checkout_re_address(address, data) 44 | if 
class MultiTree:
    """Node of an n-ary tree; each node knows its parent and its children."""

    def __init__(self, value, parent):
        """Create a node.

        :param value: label of the node (a place name).
        :param parent: parent node, or ``None`` for a root / isolated node.
        """
        self.parent = parent
        self.value = value
        self.children = []

    def __repr__(self):
        return f'{type(self).__name__}(value={self.value!r})'

    def add_children(self, children):
        """Append one child node."""
        self.children.append(children)

    def add_value(self, values: str, ac, separators: str = '-'):
        """Merge one path into the tree below this node.

        Existing nodes are reused.  For every segment that is missing, a new
        node is created and registered in ``ac`` under the segment text: it is
        appended to the list already stored for that text, or stored as a new
        one-element list.

        Segments are stripped of surrounding whitespace, and empty segments
        (from a doubled, leading or trailing separator) are skipped, so no
        empty-valued node and no ``''`` key is ever created.

        :param values: path such as ``'贵州省-遵义市-遵义县-虾子镇'``.
        :param ac: automaton-like object providing ``in``, ``get(key)`` and
            ``add_word(key, value)``.
        :param separators: delimiter between the path segments.
        """
        node = self
        for part in values.split(separators):
            part = part.strip()
            if not part:
                continue
            child = next((c for c in node.children if c.value == part), None)
            if child is None:
                child = MultiTree(value=part, parent=node)
                node.add_children(child)
                if part in ac:
                    ac.get(part).append(child)
                else:
                    ac.add_word(part, [child])
            node = child
def supplement_address(cls, address_name, is_max_address=None, is_order=False, link: str = '-') -> list:
    """Complete a fragmentary address into full administrative paths.

    The keywords found in ``address_name`` are looked up in the automaton,
    every matching tree node is expanded into its full path, and only the
    paths that contain *all* keywords are kept.  Paths that are substrings of
    a longer kept path are dropped.

    :param cls: ``Address`` instance.
    :param address_name: text holding the address fragments, e.g. ``山西孝义``.
    :param is_max_address: ``None`` (default) returns every candidate;
        ``True`` returns only the longest candidate; ``False`` returns only the
        shortest.  The comparison is by identity, so other truthy/falsy values
        behave like ``None``.
    :param is_order: when true, the keywords must appear in the path in the
        same order as in ``address_name``.
    :param link: string used to join the levels of each returned path.
    :return: list of completed paths (possibly empty).
    """
    keys = cls.max_match_cut(address_name)
    # Build the paths here rather than through key_to_address(), so that the
    # ``link`` argument is honoured instead of being silently ignored.
    paths = [search(node, link) for key in keys for node in cls.ac.get(key)]
    candidates = remove_subset(filter(satisfy_filter(keys, is_order), paths))
    if candidates:
        if is_max_address is True:
            return [max(candidates, key=len)]
        if is_max_address is False:
            return [min(candidates, key=len)]
    return candidates
with open('README.md', encoding='utf-8') as f: 9 | long_text = f.read() 10 | 11 | with open('requirements.txt', encoding='utf-8') as f: 12 | install_requires = f.read().strip().splitlines() 13 | 14 | setup( 15 | name=__names__.lower(), 16 | version=__version__, 17 | description=__description__, 18 | long_description=long_text, 19 | long_description_content_type="text/markdown", 20 | url=__url__, 21 | author=__author__, 22 | author_email=__email__, 23 | license='MIT Licence', 24 | packages=find_packages(), 25 | platforms='any', 26 | package_data={'': ['*']}, 27 | install_requires=install_requires, 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: MIT License", 31 | "Operating System :: OS Independent", 32 | ], 33 | zip_safe=True, 34 | ) 35 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2020/2/18 15:23 4 | # @Author: Jtyoui@qq.com 5 | import time 6 | 7 | from pyunit_address import * 8 | 9 | address = Address(is_max_address=True) 10 | address.add_vague_text(['红花岗', '花溪']) 11 | address.add_vague_text('贵州省-遵义市-遵义县-虾子镇-乐安村-乐石台') 12 | 13 | 14 | def find_address_test(): 15 | af = find_address(address, '我家在贵州遵义红花岗区,你家在贵州贵阳花溪区') 16 | print(af) # ['贵州遵义红花岗区', '贵州贵阳花溪区'] 17 | 18 | 19 | def test_supplement_address(): 20 | print(supplement_address(address, '我家在遵义市乐石台', is_order=True)) # ['贵州省-遵义市-遵义县-虾子镇-乐安村-乐石台'] 21 | print(supplement_address(address, '山西孝义镇')) # ['山西省-吕梁市-文水县-孝义镇'] 22 | print(supplement_address(address, '我在三家镇乐安村')) # ['海南省-省直辖县级行政区划-东方市-三家镇-乐安村'] 23 | print(supplement_address(address, '我在新舟镇')) # ['贵州省-遵义市-遵义县-新舟镇'] 24 | 25 | 26 | def correct_address_test(): 27 | print(correct_address(address, '贵州省遵义市花溪区', False)) # ['贵州省-遵义市', '贵州省-贵阳市-花溪区'],未开启最长地址 28 | 29 | 30 | def all_test(): 31 | string_ = 
'我家在红花岗,你家在贵州贵阳花溪区,他家在贵州省遵义市花溪区' 32 | finds = find_address(address, string_) 33 | for find in finds: 34 | print() 35 | print('地址', find) 36 | print('补全地址', supplement_address(address, find)) 37 | print('纠错地址', correct_address(address, find)) 38 | print('--------------------------') 39 | 40 | 41 | def optimization(): 42 | data = """ 43 | 我家住在贵阳市观山湖区中天会展城A2组团1栋 44 | 嗯,行好的,观山湖区金朱东路11号贵州金融城11号楼3层哈 45 | 公司在贵阳市观山湖区金融城maxB座贵州小爱机器人有限公司 46 | 我现在在贵阳上班,我想把我之前遵义交的公积金转过来 47 | 花溪区政府里面.那当前的话,鉴于花溪区政府的地址,已经搬至了花溪区两家坡大数据产业园3号楼花溪区政务服务大厅里面。 48 | 白云区,长坡岭国家森林公园,融创中国控股有限公司(楼盘:楼盘名称融创云麓长林),市民来电反映自己在今年9月25日购房已交20000元的定金和首付以及各项手续费用总共79824元,当时有签订认筹合同,市民表示房开公司不允许市民使用公积金贷款购房,市民对此表示不理解,于是向白云区住建局反映,但未解决市民的诉求,市民希望房开允许市民使用公积金贷款购房或者将定金及各项手续费全额退还,请相关职能部门及时处理。 49 | 白云区,融创云麓,长林楼盘,市民来电反映2020年9月30日,在白云区融创云麓长林购置一套房子时被告知不能用公积金组合贷款,于是其没有继续签合同,现在房开不给组合贷款买房,也不明确答复市民的首付款能不能退,一直拖到现在,市民希望尽快协调处理公积金贷款购房事宜,请相关职能部门及时处理。(市民需要职能部门回复) 50 | 经开区,开发大道,融创城,市民来电反映其10月18日去到该处看房,该处告知购买后先商贷,但两年后可以转为公积金贷款,市民便缴纳了2万元的订金,但后来却告知交房需要2年半,交房后再过2年半才能转,当时市民使用微信转账的费用,经开区住建局之前告知市民只能通过协商处理并且告知其融创会与其联系处理问题,市民至今未接到融创处理电话,市民表示自己是被欺骗消费,并且之前协商多次无果,市民持有沟通贷款时录音,市民现要求退换此笔费用,请相关职能部门及时处理。(同案件:2010271053065) 51 | """.strip().split('\n') 52 | for line in data: 53 | print(find_address(address, line.strip())) 54 | 55 | 56 | if __name__ == '__main__': 57 | # start = time.time() 58 | # find_address_test() 59 | # test_supplement_address() 60 | # correct_address_test() 61 | # all_test() 62 | # print(time.time() - start) 63 | optimization() 64 | --------------------------------------------------------------------------------