├── .github └── workflows │ └── java8.yml ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.md ├── README_old.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── bitlap │ │ └── geocoding │ │ ├── Geocoding.kt │ │ ├── GeocodingX.kt │ │ ├── core │ │ ├── AddressInterpreter.kt │ │ ├── AddressPersister.kt │ │ ├── Computer.kt │ │ ├── Context.kt │ │ ├── RegionCache.kt │ │ ├── Segmenter.kt │ │ ├── TermIndexVisitor.kt │ │ ├── impl │ │ │ ├── DefaultAddressInterpreter.kt │ │ │ ├── DefaultAddressPersister.kt │ │ │ ├── DefaultRegoinCache.kt │ │ │ ├── RegionInterpreterVisitor.kt │ │ │ └── SimilarityComputer.kt │ │ └── segment │ │ │ ├── AsciiSegmenter.kt │ │ │ ├── IKAnalyzerSegmenter.kt │ │ │ ├── SimpleSegmenter.kt │ │ │ ├── SmartCNSegmenter.kt │ │ │ └── WordSegmenter.kt │ │ ├── index │ │ ├── TermIndexBuilder.kt │ │ ├── TermIndexEntry.kt │ │ ├── TermIndexItem.kt │ │ └── TermType.kt │ │ ├── model │ │ ├── Address.kt │ │ ├── AddressEntity.kt │ │ ├── Division.kt │ │ ├── RegionEntity.kt │ │ └── RegionType.kt │ │ ├── similarity │ │ ├── Document.kt │ │ ├── MatchedResult.kt │ │ ├── MatchedTerm.kt │ │ └── Term.kt │ │ └── utils │ │ └── StringHelper.kt └── resources │ ├── IKAnalyzer.cfg.xml │ ├── core │ └── region.dat │ ├── dic │ ├── community.dic │ ├── region.dic │ └── stop.dic │ ├── logback.xml │ └── word.local.conf └── test ├── java └── org │ └── bitlap │ └── geocoding │ ├── TestCustomDatSave.kt │ ├── TestNormalizing.kt │ ├── TestNormalizingAddRegionEntry.kt │ ├── TestNormalizingCustom.kt │ ├── TestSegments.kt │ ├── TestSimilarity.kt │ └── region │ ├── Main.java │ ├── README.md │ ├── RegionDatFileHelper.java │ ├── RegionSqlHelper.java │ ├── model │ └── RegionEntity.java │ ├── sql │ └── china.sql │ └── util │ ├── JdbcUtil.java │ └── OutUtil.java └── resources ├── address.txt ├── region_2021.dat └── sql └── create.sql /.github/workflows/java8.yml: -------------------------------------------------------------------------------- 1 | name: Java 8 CI 2 | 3 | on: 4 | push: 5 | 
branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v2 18 | with: 19 | fetch-depth: 1 20 | - name: Set up JDK 1.8 21 | uses: actions/setup-java@v1 22 | with: 23 | java-version: 1.8 24 | - name: Build with Maven 25 | run: mvn --batch-mode --update-snapshots verify 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jar 2 | !/gradle/wrapper/gradle-wrapper.jar 3 | *.war 4 | *~ 5 | *.class 6 | *.lock 7 | *.DS_Store 8 | *.swp 9 | *.out 10 | target/ 11 | build/ 12 | *.iml 13 | *.ipr 14 | *.iws 15 | .gradle/ 16 | .settings/ 17 | .classpath 18 | .project 19 | .metadata/ 20 | .idea/ 21 | logs/ 22 | dev.properties 23 | dependency-reduced-pom.xml 24 | *.rdb 25 | *.orig -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | * 1.1.3 4 | * 新增自定义地址设置 5 | * 1.1.4 6 | * 修复一些匹配错误的bug 7 | * 1.1.6 8 | * 升级地址库和包版本, 修复一些匹配错误的地址 9 | * 1.2.0 10 | * geocoding项目转移到组织:bitlap/geocoding 11 | * 1.3.0 12 | * 新增自定义地址文件库配置参数 13 | * 添加自定义地址新增replace参数 14 | * 1.3.1 15 | * 修复normalizing方法多线程调用报错的问题 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright © 2018 IceMimosa 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the “Software”), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | the Software, and to permit 
persons to whom the Software is furnished to do so, 9 | subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Project stage](https://img.shields.io/badge/Project%20Stage-Production%20Ready-brightgreen.svg)](https://github.com/bitlap/bitlap/wiki/Project-Stages) 3 | [![Java 8 CI](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml/badge.svg)](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml) 4 | [![Maven Central](https://img.shields.io/maven-central/v/org.bitlap/geocoding)](https://central.sonatype.com/artifact/org.bitlap/geocoding) 5 | 6 | # 介绍 7 | 本项目旨在将不规范(或者连续)的文本地址进行尽可能的**标准化**, 以及对两个地址进行**相似度的计算**。 8 | 9 | 地理编码技术, 主要分为如下步骤 10 | * 地址标准库 11 | * 地址标准化 12 | * 相似度计算 13 | 14 | ## pom 15 | 16 | ```xml 17 | 18 | 19 | org.bitlap 20 | geocoding 21 | 1.3.1 22 | 23 | 24 | ``` 25 | 26 | # 1. 
数据测试 27 | 28 | 方法调用: `Geocoding` 类 29 | * normalizing: 标准化 30 | * analyze: 解析成分词文档 31 | * similarity: 相似度计算 32 | * similarityWithResult: 相似度计算, 返回包含更多丰富的数据 33 | 34 | ## 1.1 标准化 35 | 36 | ```java 37 | >> 输入: 山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303 38 | >> 输出: 39 | Address( 40 | provinceId=370000000000, province=山东省, 41 | cityId=370200000000, city=青岛市, 42 | districtId=370203000000, district=市北区, 43 | streetId=370203030000, street=水清沟街道, 44 | townId=null, town=null, 45 | villageId=null, village=null, 46 | road=九江路, 47 | roadNum=20号, 48 | buildingNum=3号楼2单元1303, 49 | text=大都会 50 | ) 51 | ``` 52 | 53 | ```java 54 | >> 输入: 上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近) 55 | >> 输出: 56 | Address( 57 | provinceId=310000000000, province=上海, 58 | cityId=310100000000, city=上海市, 59 | districtId=310113000000, district=宝山区, 60 | streetId=null, street=null, 61 | townId=null, town=null, 62 | villageId=null, village=null, 63 | road=新沪路, 64 | roadNum=58弄, 65 | buildingNum=11-802, 66 | text=水韵华庭水韵华庭附近 67 | ) 68 | ``` 69 | 70 | * 返回的对象解释 71 | * province相关: 省 72 | * city相关: 市 73 | * district相关: 区、县 74 | * street相关: 街道 75 | * town相关: 乡镇 76 | * village相关: 村 77 | * road: 道路 78 | * roadNum: 路号 79 | * buildingNum: 建筑物号 80 | * text: 标准化后未匹配的地址。一般包含小区, 商场名称等信息 81 | 82 | > 注: 如果对text的结果不是很满意, 比如出现重复或不准确, 可以通过分词的手段解决 83 | 84 | ## 1.2 相似度 85 | 86 | ```java 87 | >> 输入: 88 | 浙江金华义乌市南陈小区8幢2号 89 | 浙江金华义乌市稠城街道浙江省义乌市宾王路99号后面南陈小区8栋2号 90 | >> 输出: 91 | 0.8451542547285166 92 | ``` 93 | 94 | ```java 95 | >> 输入: 96 | 山东省沂水县四十里堡镇东艾家庄村206号 97 | 浙江金华义乌市南陈小区8幢2号 98 | >> 输出: 99 | 0.0 100 | ``` 101 | 102 | ## 1.3 自定义地址文件设置 103 | 104 | ```kotlin 105 | // 加载自定义地址文件 106 | val geocoding = GeocodingX("region_2021.dat") 107 | 108 | // 添加自定义区县"临平区" 109 | geocoding.addRegionEntry(330113000000, 330100000000, "临平区", RegionType.District, "", true) 110 | 111 | // 保存自定义字典文件 112 | geocoding.save("xxx.dat") 113 | ``` 114 | 115 | ## 1.4 自定义地址设置 116 | 117 | ```kotlin 118 | // 100000000000 代表中国的ID 119 | Geocoding.addRegionEntry(88888888, 
100000000000, "尼玛省", RegionType.Province) 120 | Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City) 121 | Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District) 122 | 123 | >> 输入: 中国尼玛省尼玛市泥煤市泥煤大道888号xxx 124 | >> 输出: 125 | Address( 126 | provinceId=88888888, province=尼玛省, 127 | cityId=8888888, city=尼玛市, 128 | districtId=888888, district=泥煤市, 129 | streetId=null, street=null, 130 | townId=null, town=null, 131 | villageId=null, village=null, 132 | road=泥煤大道, 133 | roadNum=888号, 134 | buildingNum=null, 135 | text=xxx 136 | ) 137 | ``` 138 | 139 | > Tips: 可以从「国家标准地址库」中获取「父级城市ID」 140 | 141 | # 2. 说明 142 | 143 | ## 2.1 标准地址库 144 | 项目目前采用的是 [~~淘宝物流4级地址~~][1] (已过期,可通过淘宝收货地址获取实际调用地址)的标准地址库, 也可以采用`国家的标准地址库` (对应的github库, [中国5级行政区域mysql库][3]). 145 | * [国家标准地址库2023](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023) 146 | * [国家标准地址库2022](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022) 147 | * [国家标准地址库2021](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2021) 148 | 149 | ### 导入中国5级行政区域mysql库注意事项 150 | 151 | [参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md) 152 | 153 | ## 2.2 标准地址库(兼容本项目) 154 | 155 | | 标准库文件 | 描述 | 参考 | 感谢 | 156 | |-----------------|-------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------| 157 | | region_2021.dat | 国家标准地址库2021 | [ISSUE-163](https://github.com/bitlap/geocoding/issues/163) | [TsLenMo](https://github.com/TsLenMo)、[weijiang.lin](https://github.com/linweijiang) | 158 | 159 | 使用方式:文件下载到`classpath`,使用自定义的`GeocodingX`类即可。 160 | 161 | ## 2.3 标准化 162 | 1. 首先基于正则提取出道路、建筑物号等信息 163 | 2. 省市区等匹配 164 | 1. 将标准的地址库建立**倒排索引** 165 | 2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条 166 | 3. 对所有匹配结果进行标准行政区域从属关系校验 167 | 168 | ## 2.4 相似度计算 169 | 1. 对输入的两个地址进行标准化 170 | 2. 对省市区等信息分配不同的权重 171 | 3. 对道路号, 建筑号进行语义处理, 分配权重 172 | 4. 对剩余文本(text)使用**IK Analyzer**进行分词 173 | 5. 
对两个结果集使用**余弦相似度算法**计算相似度 174 | 175 | 176 | 项目参考[address-semantic-search][4],简化了流程,修复了各种不规则错误,使得使用更加方便。 177 | 178 | ## 感谢 179 | 180 | * Python封装库:[casuallyName/Geocoding](https://github.com/casuallyName/Geocoding) 181 | 182 | 183 | ## Release Log 184 | 185 | [Change Log](./CHANGES.md) 186 | 187 | ## LICENSE 188 | 189 | MIT 190 | 191 | [1]:https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do 192 | [3]:https://github.com/kakuilan/china_area_mysql 193 | [4]:https://github.com/liuzhibin-cn/address-semantic-search 194 | -------------------------------------------------------------------------------- /README_old.md: -------------------------------------------------------------------------------- 1 | 2 | [![Java 8 CI](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml/badge.svg)](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml) 3 | 4 | # 介绍 5 | 本项目旨在将不规范(或者连续)的文本地址进行尽可能的**标准化**, 以及对两个地址进行**相似度的计算**。 6 | 7 | 地理编码技术, 主要分为如下步骤 8 | * 地址标准库 9 | * 地址标准化 10 | * 相似度计算 11 | 12 | ## pom 13 | 14 | ```xml 15 | 16 | 17 | io.patamon.geocoding 18 | geocoding 19 | 1.1.6 20 | 21 | 22 | 23 | 24 | 25 | geocoding 26 | github release repository 27 | https://maven.pkg.github.com/IceMimosa/geocoding 28 | 29 | 30 | ``` 31 | 32 | > PS: 需要申请github token才能访问, [Authenticating to GitHub Packages](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-apache-maven-registry#authenticating-to-github-packages). 比如在 `~/.m2/settings.xml` 添加如下, [token申请地址](https://github.com/settings/tokens) 33 | 34 | ```xml 35 | 36 | 37 | geocoding 38 | [YOUR_NAME] 39 | [YOUR_TOKEN] 40 | 41 | 42 | ``` 43 | 44 | 45 | # 1. 
数据测试 46 | 47 | 方法调用: `Geocoding` 类 48 | * normalizing: 标准化 49 | * analyze: 解析成分词文档 50 | * similarity: 相似度计算 51 | * similarityWithResult: 相似度计算, 返回包含更多丰富的数据 52 | 53 | ## 1.1 标准化 54 | 55 | ```java 56 | >> 输入: 山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303 57 | >> 输出: 58 | Address( 59 | provinceId=370000000000, province=山东省, 60 | cityId=370200000000, city=青岛市, 61 | districtId=370203000000, district=市北区, 62 | streetId=370203030000, street=水清沟街道, 63 | townId=null, town=null, 64 | villageId=null, village=null, 65 | road=九江路, 66 | roadNum=20号, 67 | buildingNum=3号楼2单元1303, 68 | text=大都会 69 | ) 70 | ``` 71 | 72 | ```java 73 | >> 输入: 上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近) 74 | >> 输出: 75 | Address( 76 | provinceId=310000000000, province=上海, 77 | cityId=310100000000, city=上海市, 78 | districtId=310113000000, district=宝山区, 79 | streetId=null, street=null, 80 | townId=null, town=null, 81 | villageId=null, village=null, 82 | road=新沪路, 83 | roadNum=58弄, 84 | buildingNum=11-802, 85 | text=水韵华庭水韵华庭附近 86 | ) 87 | ``` 88 | 89 | * 返回的对象解释 90 | * province相关: 省 91 | * city相关: 市 92 | * district相关: 区、县 93 | * street相关: 街道 94 | * town相关: 乡镇 95 | * village相关: 村 96 | * road: 道路 97 | * roadNum: 路号 98 | * buildingNum: 建筑物号 99 | * text: 标准化后未匹配的地址。一般包含小区, 商场名称等信息 100 | 101 | > 注: 如果对text的结果不是很满意, 比如出现重复或不准确, 可以通过分词的手段解决 102 | 103 | ## 1.2 相似度 104 | 105 | ```java 106 | >> 输入: 107 | 浙江金华义乌市南陈小区8幢2号 108 | 浙江金华义乌市稠城街道浙江省义乌市宾王路99号后面南陈小区8栋2号 109 | >> 输出: 110 | 0.8451542547285166 111 | ``` 112 | 113 | ```java 114 | >> 输入: 115 | 山东省沂水县四十里堡镇东艾家庄村206号 116 | 浙江金华义乌市南陈小区8幢2号 117 | >> 输出: 118 | 0.0 119 | ``` 120 | 121 | ## 1.3 自定义地址设置 122 | 123 | ```kotlin 124 | // 100000000000 代表中国的ID 125 | Geocoding.addRegionEntry(88888888, 100000000000, "尼玛省", RegionType.Province) 126 | Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City) 127 | Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District) 128 | 129 | >> 输入: 中国尼玛省尼玛市泥煤市泥煤大道888号xxx 130 | >> 输出: 131 | Address( 132 | provinceId=88888888, 
province=尼玛省, 133 | cityId=8888888, city=尼玛市, 134 | districtId=888888, district=泥煤市, 135 | streetId=null, street=null, 136 | townId=null, town=null, 137 | villageId=null, village=null, 138 | road=泥煤大道, 139 | roadNum=888号, 140 | buildingNum=null, 141 | text=xxx 142 | ) 143 | ``` 144 | 145 | > Tips: 可以从「国家标准地址库」中获取「父级城市ID」 146 | 147 | # 2. 说明 148 | 149 | ## 2.1 标准地址库 150 | 项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用[国家的标准地址库][2] (对应的github库, [中国5级行政区域mysql库][3]). 151 | 152 | ### 导入中国5级行政区域mysql库注意事项 153 | 154 | [参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md) 155 | 156 | ## 2.2 标准化 157 | 1. 首先基于正则提取出道路、建筑物号等信息 158 | 2. 省市区等匹配 159 | 1. 将标准的地址库建立**倒排索引** 160 | 2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条 161 | 3. 对所有匹配结果进行标准行政区域从属关系校验 162 | 163 | ## 2.3 相似度计算 164 | 1. 对输入的两个地址进行标准化 165 | 2. 对省市区等信息分配不同的权重 166 | 3. 对道路号, 建筑号进行语义处理, 分配权重 167 | 4. 对剩余文本(text)使用**IK Analyzer**进行分词 168 | 5. 对两个结果集使用**余弦相似度算法**计算相似度 169 | 170 | 171 | 项目参考[address-semantic-search][4],简化了流程,修复了各种不规则错误,使得使用更加方便。 172 | 173 | ## 感谢 174 | 175 | * Python封装库:[casuallyName/Geocoding](https://github.com/casuallyName/Geocoding) 176 | 177 | 178 | ## Release Log 179 | 180 | * 1.1.3 181 | * 新增自定义地址设置 182 | * 1.1.4 183 | * 修复一些匹配错误的bug 184 | * 1.1.6 185 | * 升级地址库和包版本, 修复一些匹配错误的地址 186 | * 1.2.0 187 | - geocoding项目转移到组织:bitlap/geocoding 188 | 189 | [1]:https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do 190 | [2]:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html 191 | [3]:https://github.com/kakuilan/china_area_mysql 192 | [4]:https://github.com/liuzhibin-cn/address-semantic-search 193 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.bitlap 8 | geocoding 9 | 1.3.1 10 | 11 | geocoding 12 | 地理编码技术,提供地址标准化和相似度计算。 13 | https://github.com/bitlap/geocoding 14 | 2018 15 | 16 | 17 | 18 
| MIT 19 | https://opensource.org/licenses/MIT 20 | repo 21 | 22 | 23 | 24 | 25 | scm:git:git://github.com/bitlap/geocoding.git 26 | https://github.com/bitlap/geocoding 27 | HEAD 28 | 29 | 30 | 31 | 32 | IceMimosa 33 | ChenKai 34 | http://patamon.me 35 | chk19940609@gmail.com 36 | 37 | 38 | jxnu-liguobin 39 | 梦境迷离 40 | https://dreamylost.cn 41 | dreamylost@outlook.com 42 | 43 | 44 | overcat 45 | Jun Luo 46 | https://keybase.io/overcat 47 | 4catcode@gmail.com 48 | 49 | 50 | cheese8 51 | cheese8 52 | https://github.com/cheese8 53 | 54 | 55 | 56 | 57 | UTF-8 58 | UTF-8 59 | 1.6.10 60 | 1.6.0 61 | 8.5.2 62 | 1.3 63 | 2.9.0 64 | 2012_u6 65 | true 66 | 1.8 67 | 68 | 69 | 70 | 71 | org.jetbrains.kotlin 72 | kotlin-stdlib 73 | ${kotlin.version} 74 | 75 | 76 | com.google.code.gson 77 | gson 78 | ${gson.version} 79 | 80 | 81 | 82 | commons-io 83 | commons-io 84 | 2.11.0 85 | test 86 | 87 | 88 | com.google.guava 89 | guava 90 | 31.1-jre 91 | test 92 | 93 | 94 | 95 | org.jetbrains.kotlin 96 | kotlin-test-junit 97 | ${kotlin.version} 98 | test 99 | 100 | 101 | junit 102 | junit 103 | 4.13.2 104 | test 105 | 106 | 107 | mysql 108 | mysql-connector-java 109 | 8.0.28 110 | test 111 | 112 | 113 | 114 | 126 | 147 | 148 | com.janeluo 149 | ikanalyzer 150 | ${ik.analyzer.version} 151 | 152 | 153 | org.apache.lucene 154 | lucene-core 155 | 156 | 157 | org.apache.lucene 158 | lucene-queryparser 159 | 160 | 161 | org.apache.lucene 162 | lucene-analyzers-common 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | org.apache.maven.plugins 172 | maven-compiler-plugin 173 | 3.8.1 174 | 175 | 1.8 176 | 1.8 177 | 178 | 179 | 180 | kotlin-maven-plugin 181 | org.jetbrains.kotlin 182 | ${kotlin.version} 183 | 184 | 185 | compile 186 | process-sources 187 | 188 | compile 189 | 190 | 191 | 192 | test-compile 193 | process-test-sources 194 | 195 | test-compile 196 | 197 | 198 | 199 | 200 | 201 | org.apache.maven.plugins 202 | maven-source-plugin 203 | 3.2.1 204 | 205 | 206 | package 207 | 208 | jar 
/**
 * Desc: Main service class of the library, exposed as a singleton.
 *       Every function forwards to the shared [DEFAULT] instance; `@JvmStatic` makes each of
 *       them callable from Java as a static method.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
object Geocoding {

    /** Shared [GeocodingX] built with its no-argument constructor. */
    @JvmField
    val DEFAULT = GeocodingX()

    /**
     * Address normalization: cleans a non-standard address into the standard address format.
     *
     * @param address raw address text
     * @return the result of [GeocodingX.normalizing] on [DEFAULT]; may be `null`
     */
    @JvmStatic
    fun normalizing(address: String): Address? = DEFAULT.normalizing(address)

    /**
     * Splits an address into a segmented [Document].
     *
     * @param address raw address text
     * @return the result of [GeocodingX.analyze] on [DEFAULT]; may be `null`
     */
    @JvmStatic
    fun analyze(address: String): Document? = DEFAULT.analyze(address)

    /**
     * Splits an already-normalized address into a segmented [Document].
     *
     * @param address normalized address, may be `null`
     * @return the result of [GeocodingX.analyze] on [DEFAULT]; may be `null`
     */
    @JvmStatic
    fun analyze(address: Address?): Document? = DEFAULT.analyze(address)

    /**
     * Computes the similarity of two raw address strings.
     *
     * @return the similarity value reported by [GeocodingX.similarity] on [DEFAULT]
     */
    @JvmStatic
    fun similarity(address1: String, address2: String): Double = DEFAULT.similarity(address1, address2)

    /**
     * Computes the similarity of two already-normalized addresses.
     *
     * @return the similarity value reported by [GeocodingX.similarity] on [DEFAULT]
     */
    @JvmStatic
    fun similarity(address1: Address?, address2: Address?): Double = DEFAULT.similarity(address1, address2)

    /**
     * Computes the similarity of two raw address strings and returns the full match result.
     */
    @JvmStatic
    fun similarityWithResult(address1: String, address2: String): MatchedResult =
        DEFAULT.similarityWithResult(address1, address2)

    /**
     * Computes the similarity of two already-normalized addresses and returns the full match result.
     */
    @JvmStatic
    fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult =
        DEFAULT.similarityWithResult(address1, address2)

    /**
     * Looks up the region entries matching [text] (described by the original author as a
     * depth-first match).
     *
     * The return type carries an explicit type argument: a bare `List` is not a valid Kotlin type.
     */
    @JvmStatic
    fun match(text: String): List<RegionEntity> = DEFAULT.match(text)

    /** Returns the [Context] backing [DEFAULT]. */
    @JvmStatic
    fun getContext(): Context = DEFAULT.ctx

    /**
     * Registers a custom region entry on [DEFAULT].
     *
     * `@JvmOverloads` lets Java callers omit the trailing defaulted parameters, just like
     * Kotlin callers can.
     *
     * @param id ID of the region
     * @param parentId ID of the parent region; it must already exist
     * @param name name of the region
     * @param type region type, see [RegionType]
     * @param alias alias of the region
     * @param replace whether to replace an old entry; per the original documentation an entry is
     *                replaced when all fields other than [id] are equal
     * @return this object, so calls can be chained
     */
    @JvmStatic
    @JvmOverloads
    fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): Geocoding {
        DEFAULT.addRegionEntry(id, parentId, name, type, alias, replace)
        return this
    }
}
/**
 * A geocoding instance bound to its own [Context], which allows a custom region data file
 * and parsing mode.
 */
open class GeocodingX(val ctx: Context) {

    /** Non-strict instance backed by the bundled `core/region.dat`. */
    constructor(): this(false)

    /** Instance backed by the bundled `core/region.dat` with the given [strict] mode. */
    constructor(strict: Boolean): this("core/region.dat", strict)

    /** Non-strict instance backed by the region file at [dataClassPath]. */
    constructor(dataClassPath: String): this(dataClassPath, false)

    /**
     * @param dataClassPath classpath location of the custom region data file
     * @param strict parsing mode, `false` by default. When province and city are missing and
     *               exactly one parent candidate matches, the match succeeds in either mode.
     *   * true: strict mode — when province and city are missing and more than one parent
     *     candidate matches, `null` is returned
     *   * false: non-strict mode — when province and city are missing and more than one parent
     *     candidate matches, an arbitrary province/city candidate is used
     */
    constructor(dataClassPath: String, strict: Boolean): this(Context(dataClassPath, strict))

    /**
     * Address normalization: cleans a non-standard address into the standard address format.
     *
     * @return the [Address] built from the interpreter output; may be `null`
     */
    fun normalizing(address: String): Address? {
        return Address.build(ctx.interpreter.interpret(address))
    }

    /**
     * Normalizes [address] and splits it into a segmented [Document].
     *
     * @return `null` when normalization yields no address
     */
    fun analyze(address: String): Document? {
        return normalizing(address)?.let { ctx.computer.analyze(it) }
    }

    /**
     * Splits an already-normalized address into a segmented [Document].
     *
     * @return `null` when [address] is `null`
     */
    fun analyze(address: Address?): Document? {
        return address?.let { ctx.computer.analyze(it) }
    }

    /**
     * Computes the similarity of two raw address strings; both are normalized first.
     */
    fun similarity(address1: String, address2: String): Double {
        return similarityWithResult(address1, address2).similarity
    }

    /**
     * Computes the similarity of two already-normalized addresses.
     */
    fun similarity(address1: Address?, address2: Address?): Double {
        return similarityWithResult(address1, address2).similarity
    }

    /**
     * Computes the similarity of two raw address strings and returns the full match result.
     */
    fun similarityWithResult(address1: String, address2: String): MatchedResult {
        return ctx.computer.compute(
            normalizing(address1),
            normalizing(address2)
        )
    }

    /**
     * Computes the similarity of two already-normalized addresses and returns the full match result.
     */
    fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
        return ctx.computer.compute(address1, address2)
    }

    /**
     * Looks up the region entries matching [text] (described by the original author as a
     * depth-first match). Index items without a value are dropped.
     *
     * The return type carries an explicit type argument: a bare `List` is not a valid Kotlin type.
     * NOTE(review): assumes the index item's `value` is a `RegionEntity?` — confirm against
     * TermIndexItem.
     */
    fun match(text: String): List<RegionEntity> {
        val terms = ctx.interpreter.getTermIndexBuilder().fullMatch(text) ?: emptyList()
        return terms.mapNotNull { it.value }
    }

    /**
     * Registers a custom region entry in both the region cache and the term index.
     *
     * `@JvmOverloads` lets Java callers omit the trailing defaulted parameters.
     *
     * @param id ID of the region
     * @param parentId ID of the parent region; it must already exist
     * @param name name of the region; must not be blank
     * @param type region type, see [RegionType]
     * @param alias alias of the region
     * @param replace whether to replace an old entry; per the original documentation an entry is
     *                replaced when all fields other than [id] are equal
     * @return this instance, so calls can be chained
     * @throws IllegalArgumentException when the parent region does not exist or [name] is blank
     */
    @JvmOverloads
    fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): GeocodingX {
        // Validation order is kept: parent existence first, then the name check.
        requireNotNull(ctx.persister.getRegion(parentId)) { "Parent Address is not exists, parentId is $parentId" }
        require(name.isNotBlank()) { "name should not be blank." }

        // Build the region object.
        val region = RegionEntity().apply {
            this.id = id
            this.parentId = parentId
            this.name = name
            this.alias = alias
            this.type = type
            // The list of child regions is initialised here for now.
            this.children = arrayListOf()
        }

        // 1. Add to cache (id -> Region)
        ctx.persister.addRegionEntity(region)
        // 2. Build term index
        ctx.interpreter.getTermIndexBuilder().indexRegions(listOf(region), replace)
        return this
    }

    /**
     * Saves the region data through the persister.
     *
     * @param path destination handed to the persister's `save`
     */
    fun save(path: String) {
        ctx.persister.save(path)
    }
}
/**
 * Desc: Logic related to the similarity algorithm
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/5
 */
interface Computer {

    /**
     * Converts a standardized address into a document object.
     * Intended steps, per the original documentation:
     *   1. segment the address text into words
     *   2. compute the IDF of each part
     */
    fun analyze(address: Address): Document

    /**
     * Computes the similarity of two standardized addresses.
     * Intended steps, per the original documentation:
     *   1. turn both addresses into a Document
     *   2. assign a weight to every Term of each Document
     *   3. compute the cosine similarity of the two term sets
     */
    fun compute(addr1: Address?, addr2: Address?): MatchedResult
}
/**
 * Desc: Abstraction for obtaining region entities.
 *       By default they come from region.dat; other sources such as a database are possible.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
interface RegionCache {

    /**
     * Loads the complete region list, organised as a tree that follows the administrative
     * divisions, and returns it.
     */
    fun get(): RegionEntity

    /**
     * Returns the regions as a map: the key is the region id, the value is the region entity.
     *
     * The type arguments are spelled out because a bare `Map` is not a valid Kotlin type;
     * region ids are `Long` everywhere else in the API.
     */
    fun getCache(): Map<Long, RegionEntity>

    /**
     * Adds one region entry.
     */
    fun addRegionEntity(entity: RegionEntity)

    /**
     * Saves a new dat file.
     *
     * @param path destination of the file
     */
    fun save(path: String)
}
15 | * 开始一轮词条匹配。 16 | */ 17 | fun startRound() 18 | 19 | /** 20 | * 匹配到一个索引条目,由访问者确定是否是可接受的匹配项。 21 | * 索引条目 [entry] 下的items一定包含一个或多个索引对象 22 | * 23 | * @return 可以接受返回true, 否则返回false。对于可以接受的索引条目调用 [endVisit] 结束访问 24 | */ 25 | fun visit(entry: TermIndexEntry, text: String, pos: Int): Boolean 26 | 27 | /** 28 | * [visit] 接受某个索引项之后当前匹配的指针位置 29 | */ 30 | fun position(): Int 31 | 32 | /** 33 | * 结束索引访问 34 | */ 35 | fun endVisit(entry: TermIndexEntry, text: String, pos: Int) 36 | 37 | /** 38 | * 结束一轮词条匹配。 39 | */ 40 | fun endRound() 41 | 42 | /** 43 | * 是否匹配上了结果 44 | */ 45 | fun hasResult(): Boolean 46 | 47 | /** 48 | * 获取访问后最终匹配结果 49 | */ 50 | fun devision(): Division 51 | 52 | fun matchCount(): Int 53 | fun fullMatchCount(): Int 54 | 55 | /** 56 | * 获取最终匹配结果的终止位置 57 | */ 58 | fun endPosition(): Int 59 | 60 | /** 61 | * 状态复位 62 | */ 63 | fun reset() 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/org/bitlap/geocoding/core/impl/DefaultAddressInterpreter.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.core.impl 2 | 3 | import org.bitlap.geocoding.core.AddressInterpreter 4 | import org.bitlap.geocoding.core.AddressPersister 5 | import org.bitlap.geocoding.core.TermIndexVisitor 6 | import org.bitlap.geocoding.index.TermIndexBuilder 7 | import org.bitlap.geocoding.index.TermType 8 | import org.bitlap.geocoding.model.AddressEntity 9 | import org.bitlap.geocoding.model.RegionEntity 10 | import org.bitlap.geocoding.utils.head 11 | import org.bitlap.geocoding.utils.remove 12 | import org.bitlap.geocoding.utils.removeRepeatNum 13 | import org.bitlap.geocoding.utils.tail 14 | import org.bitlap.geocoding.utils.take 15 | import java.util.regex.Pattern 16 | 17 | /** 18 | * Desc: 地址解析操作 19 | * 从地址文本中解析出省、市、区、街道、乡镇、道路等地址组成部分 20 | * Mail: chk19940609@gmail.com 21 | * Created by IceMimosa 22 | * Date: 2017/1/17 23 | */ 24 | open class 
DefaultAddressInterpreter(val persister: AddressPersister, val strict: Boolean) : AddressInterpreter { 25 | 26 | private var indexBuilder: TermIndexBuilder? = null 27 | private val ignoringRegionNames = mutableListOf( 28 | // JD, Tmall 29 | "其它区", "其他地区", "其它地区", "全境", "城区", "城区以内", "城区以外", "郊区", "县城内", "内环以内", "开发区", "经济开发区", "经济技术开发区", 30 | // ehaier (from TMall or HP) 31 | "省直辖", "省直辖市县", 32 | // Others 33 | "地区", "市区" 34 | ) 35 | 36 | init { 37 | // Initialise the index builder 38 | indexBuilder = TermIndexBuilder(persister.getRootRegion(), ignoringRegionNames) 39 | } 40 | 41 | 42 | companion object { 43 | // Special characters, set 1 44 | private val specialChars1 = "  \r\n\t,,。·..;;::、!@$%*^`~=+&'\"|_-\\/".toCharArray() 45 | // Special characters, set 2: wrapping (bracket / quote) characters 46 | private val specialChars2 = "{}【】〈〉<>[]「」“”()()".toCharArray() 47 | 48 | /** 49 | * Matches the case with no road number 50 | * xx路xx号楼 51 | * xx路xx-xx 52 | */ 53 | private val P_BUILDING_NUM0 = Pattern.compile( // the fullwidth hyphen is spelled \uFF0D: a bare '-' between 一 and — is a reversed (illegal) character range 54 | //"((路|街|巷)[0-9]+号([0-9A-Z一二三四五六七八九十][\\#\\-一-/\\\\]|楼)?)?([0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|\\#楼?)){0,1}([一二三四五六七八九十东西南北甲乙丙0-9]+([\\#\\-一-/\\\\]|单元|门|梯|层|座))?([0-9]+(室|房)?)?" 55 | "((路|街|巷)[0-9]+号([0-9A-Z一二三四五六七八九十][\\#\\-一\uFF0D—/\\\\]|楼)?)?([0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|楼|\\#楼?)){0,1}([一二三四五六七八九十东西南北甲乙丙0-9]+([\\#\\-一\uFF0D—/\\\\]|单元|门|梯|层|座|组))?([0-9]+([\\#\\-一\uFF0D—/\\\\]|室|房|层|楼|号|户)?)?([0-9]+号?)?" 56 | ) 57 | /** 58 | * 标准匹配building的模式:xx栋xx单元xxx。
59 | * 注1:山东青岛市南区宁夏路118号4号楼6单元202。如果正则模式开始位置不使用(路[0-9]+号)?,则第一个符合条件的匹配结果是【118号4】, 60 | * 按照逻辑会将匹配结果及之后的所有字符当做building,导致最终结果为:118号4号楼6单元202 61 | * 62 | * 所以需要先匹配 (路[0-9]+号)? 63 | */ 64 | private val P_BUILDING_NUM1 = Pattern.compile( 65 | "((路|街|巷)[0-9]+号)?([0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|\\#楼?)){0,1}([一二三四五六七八九十东西南北甲乙丙0-9]+(单元|门|梯|层|座))?([0-9]+(室|房)?)?" 66 | ) 67 | /** 68 | * Validation pattern for building numbers. The building match (building1M) can also hit plain digits and other unsuitable text; this pattern (building1V) rules those out. 69 | */ 70 | private val P_BUILDING_NUM_V = Pattern.compile( 71 | "(栋|幢|橦|号楼|号|\\#|\\#楼|单元|室|房|门)+" 72 | ) 73 | /** 74 | * Building pattern: 12-2-302, 12栋3单元302. The fullwidth hyphen is spelled \uFF0D; a bare '-' between 一 and / is a reversed (illegal) range. 75 | */ 76 | private val P_BUILDING_NUM2 = Pattern.compile( 77 | "[A-Za-z0-9]+([\\#\\-一\uFF0D/\\\\]+[A-Za-z0-9]+)+" 78 | ) 79 | /** 80 | * Building pattern: 10组21号 (rural addresses) 81 | */ 82 | private val P_BUILDING_NUM3 = Pattern.compile( 83 | "[0-9]+(组|通道)[A-Z0-9\\-一]+号?" 84 | ) 85 | 86 | // Simple bracket matching; the named group "bracket" is read by extractBrackets 87 | private val BRACKET_PATTERN = Pattern.compile( 88 | "(?<bracket>([\\((\\{\\<〈\\[【「][^\\))\\}\\>〉\\]】」]*[\\))\\}\\>〉\\]】」]))" 89 | ) 90 | 91 | // Road information; the named groups "road", "ex" and "roadnum" are read by extractRoad 92 | private val P_ROAD = Pattern.compile( 93 | "^(?<road>([\u4e00-\u9fa5]{2,6}(路|街坊|街|道|大街|大道)))(?<ex>[甲乙丙丁])?(?<roadnum>[0-90123456789一二三四五六七八九十]+(号院|号楼|号大院|号|號|巷|弄|院|区|条|\\#院|\\#))?" 
94 | ) 95 | // Building information left unmatched inside the road text 96 | private val P_ROAD_BUILDING = Pattern.compile( 97 | "[0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|\\#楼?)" 98 | ) 99 | 100 | // Town / village information; the named groups z (镇), x (乡) and c (村) are read by extractTownVillage 101 | private val P_TOWN1 = Pattern.compile("^((?<z>[\u4e00-\u9fa5]{2,2}(镇|乡))(?<c>[\u4e00-\u9fa5]{1,3}村)?)") 102 | private val P_TOWN2 = Pattern.compile("^((?<z>[\u4e00-\u9fa5]{1,3}镇)?(?<x>[\u4e00-\u9fa5]{1,3}乡)?(?<c>[\u4e00-\u9fa5]{1,3}村(?!(村|委|公路|(东|西|南|北)?(大街|大道|路|街))))?)") 103 | private val P_TOWN3 = Pattern.compile("^(?<c>[\u4e00-\u9fa5]{1,3}村(?!(村|委|公路|(东|西|南|北)?(大街|大道|路|街))))?") 104 | private var invalidTown: MutableSet<String> = mutableSetOf() 105 | private var invalidTownFollowings: MutableSet<String> = mutableSetOf() 106 | init { 107 | invalidTownFollowings.add("政府") 108 | invalidTownFollowings.add("大街") 109 | invalidTownFollowings.add("大道") 110 | invalidTownFollowings.add("社区") 111 | invalidTownFollowings.add("小区") 112 | invalidTownFollowings.add("小学") 113 | invalidTownFollowings.add("中学") 114 | invalidTownFollowings.add("医院") 115 | invalidTownFollowings.add("银行") 116 | invalidTownFollowings.add("中心") 117 | invalidTownFollowings.add("卫生") 118 | invalidTownFollowings.add("一小") 119 | invalidTownFollowings.add("一中") 120 | invalidTownFollowings.add("政局") 121 | invalidTownFollowings.add("企局") 122 | 123 | invalidTown.add("新村") 124 | invalidTown.add("外村") 125 | invalidTown.add("大村") 126 | invalidTown.add("后村") 127 | invalidTown.add("东村") 128 | invalidTown.add("南村") 129 | invalidTown.add("北村") 130 | invalidTown.add("西村") 131 | invalidTown.add("上村") 132 | invalidTown.add("下村") 133 | invalidTown.add("一村") 134 | invalidTown.add("二村") 135 | invalidTown.add("三村") 136 | invalidTown.add("四村") 137 | invalidTown.add("五村") 138 | invalidTown.add("六村") 139 | invalidTown.add("七村") 140 | invalidTown.add("八村") 141 | invalidTown.add("九村") 142 | invalidTown.add("十村") 143 | invalidTown.add("中村") 144 | invalidTown.add("街村") 145 | invalidTown.add("头村") 146 | invalidTown.add("店村") 147 | invalidTown.add("桥村") 148 | invalidTown.add("楼村") 149 | 
invalidTown.add("老村") 150 | invalidTown.add("户村") 151 | invalidTown.add("山村") 152 | invalidTown.add("才村") 153 | invalidTown.add("子村") 154 | invalidTown.add("旧村") 155 | invalidTown.add("文村") 156 | invalidTown.add("全村") 157 | invalidTown.add("和村") 158 | invalidTown.add("湖村") 159 | invalidTown.add("甲村") 160 | invalidTown.add("乙村") 161 | invalidTown.add("丙村") 162 | invalidTown.add("邻村") 163 | invalidTown.add("乡村") 164 | invalidTown.add("村二村") 165 | invalidTown.add("中关村") 166 | invalidTown.add("城乡") 167 | invalidTown.add("县乡") 168 | invalidTown.add("头乡") 169 | invalidTown.add("牌乡") 170 | invalidTown.add("茶乡") 171 | invalidTown.add("水乡") 172 | invalidTown.add("港乡") 173 | invalidTown.add("巷乡") 174 | invalidTown.add("七乡") 175 | invalidTown.add("站乡") 176 | invalidTown.add("西乡") 177 | invalidTown.add("宝乡") 178 | invalidTown.add("还乡") 179 | invalidTown.add("古镇") 180 | invalidTown.add("小镇") 181 | invalidTown.add("街镇") 182 | invalidTown.add("城镇") 183 | invalidTown.add("环镇") 184 | invalidTown.add("湾镇") 185 | invalidTown.add("岗镇") 186 | invalidTown.add("镇镇") 187 | invalidTown.add("场镇") 188 | invalidTown.add("新镇") 189 | invalidTown.add("乡镇") 190 | invalidTown.add("屯镇") 191 | invalidTown.add("大镇") 192 | invalidTown.add("南镇") 193 | invalidTown.add("店镇") 194 | invalidTown.add("铺镇") 195 | invalidTown.add("关镇") 196 | invalidTown.add("口镇") 197 | invalidTown.add("和镇") 198 | invalidTown.add("建镇") 199 | invalidTown.add("集镇") 200 | invalidTown.add("庙镇") 201 | invalidTown.add("河镇") 202 | invalidTown.add("村镇") 203 | } 204 | } 205 | 206 | /** 207 | * 将`脏`地址进行标准化处理, 解析成 [AddressEntity] 208 | */ 209 | override fun interpret(address: String?): AddressEntity? { 210 | return interpret(address, RegionInterpreterVisitor(persister, strict)) 211 | } 212 | 213 | private fun interpret(address: String?, visitor: TermIndexVisitor): AddressEntity? 
{ 214 | if (address.isNullOrBlank()) return null 215 | 216 | val entity = AddressEntity(address) 217 | 218 | // Strip leading junk (aimed at user-entered data) 219 | prepare(entity) 220 | // extractBuildingNum: pull out the building number 221 | extractBuildingNum(entity) 222 | // Remove special characters 223 | removeSpecialChars(entity) 224 | // Extract bracketed content 225 | var brackets = extractBrackets(entity) 226 | // Strip the bracket characters themselves 227 | brackets = brackets.remove(specialChars2) 228 | removeBrackets(entity) 229 | // Extract the standard administrative-division address 230 | extractRegion(entity, visitor) 231 | // Tidy up the province / city / district / street matches 232 | removeRedundancy(entity, visitor) 233 | // Extract road information 234 | extractRoad(entity) 235 | // Extract rural (town / village) information 236 | // extractTownVillage(entity) 237 | 238 | entity.text = entity.text!!.replace(Regex("[0-9A-Za-z\\#]+(单元|楼|室|层|米|户|\\#)"), "") // Regex overload on purpose: replace(String, String) would look for the pattern text literally 239 | entity.text = entity.text!!.replace(Regex("[一二三四五六七八九十]+(单元|楼|室|层|米|户)"), "") 240 | if (brackets.isNotEmpty()) { 241 | entity.text = entity.text + brackets 242 | // If no road was found yet, it may be inside the bracketed text 243 | if (entity.road.isNullOrBlank()) extractRoad(entity) 244 | } 245 | 246 | return entity 247 | } 248 | 249 | // Strip leading junk 250 | private fun prepare(entity: AddressEntity) { 251 | // Drop leading digits, letters, whitespace and punctuation 252 | if (entity.text.isNullOrBlank()) return 253 | 254 | val p = Pattern.compile("[ \\da-zA-Z\r\n\t,,。·..;;::、!@$%*^`~=+&'\"|_\\-\\/]") 255 | entity.text = entity.text?.trimStart { 256 | p.matcher("$it").find() 257 | } 258 | 259 | // Normalise dash-like characters (ー, _, fullwidth hyphen, —, /) and "--" to '-'. The hyphen is spelled \uFF0D so it cannot form a '_'..'—' range, which would also rewrite a-z 260 | entity.text = entity.text?.replace(Regex("[ー_\uFF0D—/]|(--)"), "-") 261 | } 262 | 263 | // Extract the building number 264 | private fun extractBuildingNum(entity: AddressEntity): Boolean { 265 | if (entity.text.isNullOrBlank()) return false 266 | 267 | var found = false // whether a match was found 268 | var building: String? 
// 最后匹配的文本 269 | 270 | // 使用 P_BUILDING_NUM0 先进行匹配 271 | var matcher = P_BUILDING_NUM0.matcher(entity.text) 272 | while (matcher.find()) { 273 | if (matcher.start() == matcher.end()) continue 274 | building = entity.text!!.take(matcher.start(), matcher.end() - 1) 275 | // 查看匹配数量, 对building进行最小匹配 276 | var notEmptyGroups = 0 277 | for (i in 0 until matcher.groupCount()) { 278 | if (matcher.group(i) != null) notEmptyGroups++ 279 | } 280 | // 如果匹配group的数量大于3, 并且匹配到了building 281 | // 去除前面的 `xx路xx号` 前缀 282 | if (P_BUILDING_NUM_V.matcher(building).find() && notEmptyGroups > 3) { 283 | var pos = matcher.start() 284 | if (building.startsWith("路") || building.startsWith("街") || building.startsWith("巷")) { 285 | if (building.contains("号楼")) pos += building.indexOf("路") + 1 286 | else pos += building.indexOf("号") + 1 287 | building = entity.text!!.take(pos, matcher.end() - 1) 288 | } 289 | entity.buildingNum = building 290 | entity.text = entity.text.head(pos) + entity.text!!.take(matcher.end()) 291 | found = true 292 | break 293 | } 294 | } 295 | 296 | if (!found) { 297 | matcher = P_BUILDING_NUM1.matcher(entity.text) 298 | while (matcher.find()) { 299 | if (matcher.start() == matcher.end()) continue 300 | building = entity.text!!.take(matcher.start(), matcher.end() - 1) 301 | // 查看匹配数量, 对building进行最小匹配 302 | var notEmptyGroups = 0 303 | for (i in 0 until matcher.groupCount()) { 304 | if (matcher.group(i) != null) notEmptyGroups++ 305 | } 306 | // 如果匹配group的数量大于3, 并且匹配到了building 307 | // 去除前面的 `xx路xx号` 前缀 308 | if (P_BUILDING_NUM_V.matcher(building).find() && notEmptyGroups > 3) { 309 | var pos = matcher.start() 310 | if (building.startsWith("路") || building.startsWith("街") || building.startsWith("巷")) { 311 | pos += building.indexOf("号") + 1 312 | building = entity.text!!.take(pos, matcher.end() - 1) 313 | } 314 | entity.buildingNum = building 315 | entity.text = entity.text.head(pos) + entity.text!!.take(matcher.end()) 316 | found = true 317 | break 318 | } 319 | } 320 | } 
321 | 322 | if (!found) { 323 | //xx-xx-xx(xx栋xx单元xxx) 324 | matcher = P_BUILDING_NUM2.matcher(entity.text) 325 | if (matcher.find()) { 326 | entity.buildingNum = entity.text!!.take(matcher.start(), matcher.end() - 1) 327 | entity.text = entity.text.head(matcher.start()) + entity.text!!.take(matcher.end()) 328 | found = true 329 | } 330 | } 331 | if (!found) { 332 | //xx组xx号, xx通道xx号 333 | matcher = P_BUILDING_NUM3.matcher(entity.text) 334 | if (matcher.find()) { 335 | entity.buildingNum = entity.text!!.take(matcher.start(), matcher.end() - 1) 336 | entity.text = entity.text.head(matcher.start()) + entity.text!!.take(matcher.end()) 337 | found = true 338 | } 339 | } 340 | return found 341 | } 342 | 343 | // 去除特殊字符 344 | private fun removeSpecialChars(entity: AddressEntity) { 345 | if (entity.text.isNullOrBlank()) return 346 | 347 | var text = entity.text!! 348 | // 1. 删除特殊字符1, 简单场景比 replaceAll 优化了10~20倍 349 | text = text.remove(specialChars1) 350 | 351 | // 2. 删除连续出现6个以上的数字, TODO: 可能真会出现, 这个暂做这个处理 352 | text = text.removeRepeatNum(6) 353 | entity.text = text 354 | 355 | // 去除building 356 | var building = entity.buildingNum 357 | if (building.isNullOrBlank()) return 358 | building = building.remove(specialChars1, "-一-_#") 359 | building = building.removeRepeatNum(6) 360 | entity.buildingNum = building 361 | } 362 | 363 | // 去除包裹的特殊字符 364 | private fun removeBrackets(entity: AddressEntity) { 365 | if (entity.text.isNullOrBlank()) return 366 | entity.text = entity.text!!.remove(specialChars2) 367 | } 368 | 369 | // 提取包括的数据 370 | private fun extractBrackets(entity: AddressEntity): String { 371 | if (entity.text.isNullOrBlank()) return "" 372 | 373 | // 匹配出带有 `Brackets` 的文字 374 | // 最后将文字拼接到 text 中 375 | val matcher = BRACKET_PATTERN.matcher(entity.text) 376 | var found = false 377 | val brackets = StringBuilder() 378 | while (matcher.find()) { 379 | val bracket = matcher.group("bracket") 380 | if (bracket.length <= 2) continue // 如果没有文字 381 | 
brackets.append(bracket.take(1, bracket.length - 2)) 382 | found = true 383 | } 384 | if (found) { 385 | val result = brackets.toString() 386 | entity.text = matcher.replaceAll("") 387 | return result 388 | } 389 | return "" 390 | } 391 | 392 | 393 | // 提取标准4级地址 394 | private fun extractRegion(entity: AddressEntity, visitor: TermIndexVisitor): Boolean { 395 | if (entity.text.isNullOrBlank()) return false 396 | 397 | // 开始匹配 398 | visitor.reset() 399 | indexBuilder!!.deepMostQuery(entity.text, visitor) 400 | entity.province = visitor.devision().province 401 | entity.city = visitor.devision().city 402 | entity.district = visitor.devision().district 403 | entity.street = visitor.devision().street 404 | entity.town = visitor.devision().town 405 | entity.village = visitor.devision().village 406 | entity.text = entity.text!!.take(visitor.endPosition() + 1) 407 | return visitor.hasResult() 408 | } 409 | 410 | 411 | private fun removeRedundancy(entity: AddressEntity, visitor: TermIndexVisitor): Boolean { 412 | if (entity.text.isNullOrBlank() || !entity.hasProvince() || !entity.hasCity()) return false 413 | 414 | var removed = false 415 | // 采用后序数组方式匹配省市区 416 | var endIndex = entity.text!!.length - 2 417 | var i = 0 418 | while (i < endIndex) { 419 | visitor.reset() 420 | indexBuilder!!.deepMostQuery(entity.text, i, visitor) 421 | if (visitor.matchCount() < 2 || visitor.fullMatchCount() < 1) { 422 | // 没有匹配上,或者匹配上的行政区域个数少于2个认当做无效匹配 423 | i++ 424 | continue 425 | } 426 | // 匹配上的省份、地级市不正确 427 | if (entity.province!! != visitor.devision().province || entity.city!! 
!= visitor.devision().city) { 428 | i++ 429 | continue 430 | } 431 | // 正确匹配,进行回馈 432 | val devision = visitor.devision() 433 | // > 修复区信息 434 | if (!entity.hasDistrict() && devision.hasDistrict() && devision.district!!.parentId == entity.city!!.id) 435 | entity.district = devision.district 436 | // > 修复街道信息 437 | if (entity.hasDistrict() && !entity.hasStreet() 438 | && devision.hasStreet() && devision.street!!.parentId == entity.district!!.id) { 439 | entity.street = devision.street 440 | } 441 | // > 修复乡镇信息 442 | if (entity.hasDistrict() && !entity.hasTown() 443 | && devision.hasTown() && devision.town!!.parentId == entity.district!!.id) 444 | entity.town = devision.town 445 | else if (entity.hasDistrict() && entity.hasTown() && entity.town!! == entity.street 446 | && devision.hasTown() 447 | && devision.town!! != devision.street 448 | && devision.town!!.parentId == entity.district!!.id) 449 | entity.town = devision.town 450 | if (entity.hasDistrict() && !entity.hasVillage() && devision.hasVillage() 451 | && devision.village!!.parentId == entity.district!!.id) 452 | entity.village = devision.village 453 | 454 | // 正确匹配上,删除 455 | entity.text = entity.text!!.take(visitor.endPosition() + 1) 456 | endIndex = entity.text!!.length 457 | i = 0 458 | removed = true 459 | } 460 | return removed 461 | } 462 | 463 | // 提取道路信息 464 | private fun extractRoad(entity: AddressEntity): Boolean { 465 | if (entity.text.isNullOrBlank()) return false 466 | // 如果已经提取过了 467 | if (entity.road != null && entity.road!!.isNotEmpty()) return true 468 | val matcher = P_ROAD.matcher(entity.text) 469 | if (matcher.find()) { 470 | val road = matcher.group("road") 471 | val ex = matcher.group("ex") 472 | var roadNum: String? 
= matcher.group("roadnum") 473 | roadNum = (ex ?: "") + if (roadNum == null) "" else roadNum 474 | val leftText = entity.text!!.take(road.length + roadNum.length) 475 | if (leftText.startsWith("小区")) return false 476 | entity.road = fixRoad(road) 477 | // 仅包含【甲乙丙丁】单个汉字,不能作为门牌号 478 | if (roadNum.length == 1) { 479 | entity.text = roadNum + leftText 480 | } else { 481 | entity.roadNum = roadNum 482 | entity.text = leftText 483 | } 484 | // 修复road中存在building的问题 485 | if (entity.buildingNum.isNullOrBlank()) { 486 | fixRoadBuilding(entity) 487 | } 488 | return true 489 | } 490 | return false 491 | } 492 | 493 | // 修复重复出现的情况 494 | private fun fixRoad(road: String): String { 495 | if (road.isBlank() || road.length % 2 == 1) return road 496 | // 从中间截取 497 | val first = road.substring(0, road.length / 2) 498 | val second = road.substring(road.length / 2) 499 | if (first == second) { 500 | return first 501 | } 502 | return road 503 | } 504 | 505 | // 修复road中存在 xx号楼 的问题 506 | private fun fixRoadBuilding(entity: AddressEntity): Boolean { 507 | if (entity.text.isNullOrBlank()) return false 508 | val matcher = P_ROAD_BUILDING.matcher(entity.text) 509 | // 最开始匹配, 先这样处理 510 | if (matcher.find() && matcher.start() == 0) { 511 | entity.buildingNum = entity.text!!.take(matcher.start(), matcher.end() - 1) 512 | entity.text = entity.text.head(matcher.start()) + entity.text!!.take(matcher.end()) 513 | return true 514 | } 515 | return false 516 | } 517 | 518 | // 提取农村信息 519 | private fun extractTownVillage(addr: AddressEntity) { 520 | if (extractTownVillage(addr, P_TOWN1, "z", null, "c") >= 0) return 521 | if (addr.hasTown()) 522 | extractTownVillage(addr, P_TOWN3, null, null, "c") 523 | else 524 | extractTownVillage(addr, P_TOWN2, "z", "x", "c") 525 | } 526 | 527 | //返回值: 528 | // 1: 执行了匹配操作,匹配成功 529 | //-1: 执行了匹配操作,未匹配上 530 | // 0: 未执行匹配操作 531 | private fun extractTownVillage(addr: AddressEntity, pattern: Pattern, gz: String?, gx: String?, gc: String?): Int { 532 | if 
(addr.text.isNullOrBlank() || !addr.hasDistrict()) return 0 533 | 534 | var result = -1 535 | val matcher = pattern.matcher(addr.text) 536 | 537 | if (matcher.find()) { 538 | val text = addr.text!! 539 | var c: String? = if (gc == null) null else matcher.group("c") 540 | var ic = if (gc == null) -1 else matcher.end("c") 541 | 542 | if (gz != null) { 543 | val z = matcher.group(gz) 544 | val iz = matcher.end(gz) 545 | if (!z.isNullOrBlank()) { //镇 546 | if (z.length == 2 && text.startsWith("村", z.length)) { 547 | c = z + "村" 548 | ic = iz + 1 549 | } else if (isAcceptableTownFollowingChars(z, text, z.length)) { 550 | if (acceptTown(z, addr.district) >= 0) { 551 | addr.text = text.take(iz) 552 | result = 1 553 | } 554 | } 555 | } 556 | } 557 | 558 | if (gx != null) { 559 | val x = matcher.group(gx) 560 | val ix = matcher.end(gx) 561 | if (!x.isNullOrBlank()) { //镇 562 | if (x.length == 2 && text.startsWith("村", x.length)) { 563 | c = x + "村" 564 | ic = ix + 1 565 | } else if (isAcceptableTownFollowingChars(x, text, x.length)) { 566 | if (acceptTown(x, addr.district) >= 0) { 567 | addr.text = text.take(ix) 568 | result = 1 569 | } 570 | } 571 | } 572 | } 573 | 574 | if (!c.isNullOrBlank()) { //村 575 | if (c.endsWith("农村")) return result 576 | var leftString = text.take(ic) 577 | if (c.endsWith("村村")) { 578 | c = c.head(c.length - 1) 579 | leftString = "村" + leftString 580 | } 581 | if (leftString.startsWith("委") || leftString.startsWith("民委员")) { 582 | leftString = "村" + leftString 583 | } 584 | if (c!!.length >= 4 && (c[0] == '东' || c[0] == '西' || c[0] == '南' || c[0] == '北')) 585 | c = c.tail(c.length - 1) 586 | if (c!!.length == 2 && !isAcceptableTownFollowingChars(c, leftString, 0)) return ic 587 | if (acceptTown(c, addr.district) >= 0) { 588 | addr.text = leftString 589 | result = 1 590 | } 591 | } 592 | } 593 | return result 594 | } 595 | 596 | private fun isAcceptableTownFollowingChars(matched: String, text: String?, start: Int): Boolean { 597 | if (text == null 
|| start >= text.length) return true 598 | if (matched.length == 4) { 599 | when (text[start]) { 600 | '区', '县', '乡', '镇', '村', '街', '路' -> return false 601 | } 602 | } 603 | var s1 = text.take(start, start + 1) 604 | if (invalidTownFollowings.contains(s1)) return false 605 | s1 = text.take(start, start + 2) 606 | if (invalidTownFollowings.contains(s1)) return false 607 | return true 608 | } 609 | 610 | //返回值: 611 | // -1: 无效的匹配 612 | // 0: 有效的匹配,无需执行添加操作 613 | // 1: 有效的匹配,已经执行添加操作 614 | private fun acceptTown(town: String?, district: RegionEntity?): Int { 615 | if (town.isNullOrBlank() || district == null) return -1 616 | if (invalidTown.contains(town)) return -1 617 | 618 | // 已加入bas_region表,不再添加 619 | val items = indexBuilder!!.fullMatch(town) 620 | if (items != null) { 621 | for (item in items) { 622 | if (item.type != TermType.Town && item.type != TermType.Street && item.type != TermType.Village) 623 | continue 624 | val region = item.value as RegionEntity 625 | if (region.parentId == district.id) return 0 626 | } 627 | } 628 | 629 | // 排除一些特殊情况:草滩街镇、西乡街镇等 630 | if (town.length == 4 && town[2] == '街') return -1 631 | 632 | return 1 633 | } 634 | 635 | /** 636 | * 获取 [TermIndexBuilder] 637 | */ 638 | override fun getTermIndexBuilder(): TermIndexBuilder { 639 | return this.indexBuilder!! 640 | } 641 | } 642 | 643 | -------------------------------------------------------------------------------- /src/main/java/org/bitlap/geocoding/core/impl/DefaultAddressPersister.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.core.impl 2 | 3 | import org.bitlap.geocoding.core.AddressPersister 4 | import org.bitlap.geocoding.core.RegionCache 5 | import org.bitlap.geocoding.model.RegionEntity 6 | 7 | /** 8 | * Desc: 地址持久层的操作, 这边暂时只是对标准地址库的处理. 
9 | * 暂时不将标准化后的地址存储在数据出中。 10 | * Mail: chk19940609@gmail.com 11 | * Created by IceMimosa 12 | * Date: 2017/1/17 13 | */ 14 | open class DefaultAddressPersister ( 15 | // 行政规划准地址库 16 | private val regionCache: RegionCache 17 | ) : AddressPersister { 18 | 19 | /** 20 | * 获取行政规划地址树状结构关系 21 | */ 22 | override fun getRootRegion(): RegionEntity { 23 | return regionCache.get() 24 | } 25 | 26 | /** 27 | * 根据id获取 28 | */ 29 | override fun getRegion(id: Long): RegionEntity? { 30 | return regionCache.getCache()[id] 31 | } 32 | 33 | /** 34 | * 新增一个region信息 35 | */ 36 | override fun addRegionEntity(entity: RegionEntity) { 37 | regionCache.addRegionEntity(entity) 38 | } 39 | 40 | /** 41 | * 保存一个新的dat文件 42 | */ 43 | override fun save(path: String) { 44 | regionCache.save(path) 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/org/bitlap/geocoding/core/impl/DefaultRegoinCache.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.core.impl 2 | 3 | import com.google.gson.Gson 4 | import org.bitlap.geocoding.core.RegionCache 5 | import org.bitlap.geocoding.model.RegionEntity 6 | import org.bitlap.geocoding.model.RegionType 7 | import java.io.ByteArrayInputStream 8 | import java.io.ByteArrayOutputStream 9 | import java.io.FileOutputStream 10 | import java.util.* 11 | import java.util.zip.GZIPInputStream 12 | import java.util.zip.GZIPOutputStream 13 | import kotlin.text.Charsets.UTF_8 14 | 15 | /** 16 | * Desc: 默认 [RegionEntity] 获取的缓存类 17 | * 默认从 region.dat 中获取 18 | * Mail: chk19940609@gmail.com 19 | * Created by IceMimosa 20 | * Date: 2017/1/12 21 | */ 22 | open class DefaultRegionCache(dataClassPath: String) : RegionCache { 23 | 24 | private var regions: RegionEntity? 
= null 25 | private val REGION_CACHE = hashMapOf<Long, RegionEntity>() 26 | 27 | init { 28 | // Load the region data 29 | if (regions == null) { 30 | val input = this.javaClass.classLoader.getResourceAsStream(dataClassPath) 31 | ?: throw IllegalArgumentException("Geocoding data file [$dataClassPath] does not exist.") 32 | regions = Gson().fromJson(decode(input.use { String(it.readBytes()) }), RegionEntity::class.java) // use {} closes the classpath stream once it has been read 33 | } 34 | // Populate the id -> region cache 35 | REGION_CACHE[regions!!.id] = regions!! 36 | loadChildrenInCache(regions) 37 | } 38 | 39 | private fun loadChildrenInCache(parent: RegionEntity?) { 40 | // Lowest levels reached (Street / Village / PlatformL4 / Town): stop 41 | if (parent == null || parent.type == RegionType.Street || 42 | parent.type == RegionType.Village || 43 | parent.type == RegionType.PlatformL4 || 44 | parent.type == RegionType.Town) return 45 | 46 | // Recurse into children 47 | parent.children?.forEach { 48 | REGION_CACHE[it.id] = it 49 | this.loadChildrenInCache(it) 50 | } 51 | } 52 | 53 | /** 54 | * Decompress the data: MIME Base64 decode, then gunzip. The gzip stream is closed after reading. 55 | */ 56 | private fun decode(dat: String): String { 57 | return GZIPInputStream(ByteArrayInputStream(Base64.getMimeDecoder().decode(dat))).use { String(it.readBytes()) } 58 | } 59 | 60 | /** 61 | * Return the full region tree, organised by administrative division. 62 | */ 63 | override fun get(): RegionEntity { 64 | if (regions == null) throw IllegalArgumentException("行政规划区域数据加载失败!") 65 | return regions!! 
66 | } 67 | 68 | /** 69 | * Return the region map: the key is the region id, the value is the region entity. 70 | */ 71 | override fun getCache(): Map<Long, RegionEntity> { 72 | return REGION_CACHE 73 | } 74 | 75 | /** 76 | * Add a region entry: cache it (and its children) and attach it to its cached parent, if any. 77 | */ 78 | override fun addRegionEntity(entity: RegionEntity) { 79 | this.loadChildrenInCache(entity) 80 | this.REGION_CACHE[entity.id] = entity 81 | this.REGION_CACHE[entity.parentId]?.children?.add(entity) 82 | } 83 | 84 | /** 85 | * Save a new dat file: the region tree as JSON, gzip-compressed, then MIME Base64-encoded. 86 | */ 87 | override fun save(path: String) { 88 | val gzip = ByteArrayOutputStream() 89 | GZIPOutputStream(gzip, 8192).use { gzipos -> 90 | gzipos.write(Gson().toJson(regions, RegionEntity::class.java).toByteArray(UTF_8)) 91 | } 92 | val dat = Base64.getMimeEncoder().encode(gzip.toByteArray()) 93 | FileOutputStream(path).use { it.write(dat) } // use {} closes the file handle even when the write fails 94 | } 95 | } -------------------------------------------------------------------------------- /src/main/java/org/bitlap/geocoding/core/impl/RegionInterpreterVisitor.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.core.impl 2 | 3 | import org.bitlap.geocoding.core.AddressPersister 4 | import org.bitlap.geocoding.core.TermIndexVisitor 5 | import org.bitlap.geocoding.index.TermIndexEntry 6 | import org.bitlap.geocoding.index.TermIndexItem 7 | import org.bitlap.geocoding.index.TermType 8 | import org.bitlap.geocoding.model.Division 9 | import org.bitlap.geocoding.model.RegionEntity 10 | import org.bitlap.geocoding.model.RegionType.City 11 | import org.bitlap.geocoding.model.RegionType.CityLevelDistrict 12 | import org.bitlap.geocoding.model.RegionType.District 13 | import org.bitlap.geocoding.model.RegionType.PlatformL4 14 | import org.bitlap.geocoding.model.RegionType.Province 15 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity1 16 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity2 17 | import org.bitlap.geocoding.model.RegionType.Street 18 | import org.bitlap.geocoding.model.RegionType.Town 19 | 
import org.bitlap.geocoding.model.RegionType.Village 20 | 21 | import java.util.ArrayDeque 22 | 23 | /** 24 | * Desc: 基于倒排索引搜索匹配省市区行政区划的访问者 25 | * Mail: chk19940609@gmail.com 26 | * Created by IceMimosa 27 | * Date: 2017/1/12 28 | */ 29 | open class RegionInterpreterVisitor ( 30 | // 地址持久层对象 31 | val persister: AddressPersister, 32 | val strict: Boolean 33 | ) : TermIndexVisitor { 34 | 35 | private var currentLevel = 0 36 | private var deepMostLevel = 0 37 | private var currentPos = -1 38 | private var deepMostPos = -1 39 | 40 | private var fullMatchCount = 0 41 | private var deepMostFullMatchCount = 0 42 | 43 | private val deepMostDivision = Division() 44 | private val curDivision = Division() 45 | private val stack = ArrayDeque() 46 | 47 | companion object { 48 | private val ambiguousChars = mutableListOf('市', '县', '区', '镇', '乡') 49 | } 50 | 51 | /** 52 | * 开始一轮词条匹配。 53 | */ 54 | override fun startRound() { 55 | currentLevel++ 56 | } 57 | 58 | /** 59 | * 匹配到一个索引条目,由访问者确定是否是可接受的匹配项。 60 | * 索引条目 [entry] 下的items一定包含一个或多个索引对象 61 | * 62 | * @return 可以接受返回true, 否则返回false。对于可以接受的索引条目调用 [endVisit] 结束访问 63 | */ 64 | override fun visit(entry: TermIndexEntry, text: String, pos: Int): Boolean { 65 | // 找到最匹配的 被索引对象. 没有匹配对象,匹配不成功,返回 66 | val acceptableItem = findAcceptableItem(entry, text, pos) ?: return false 67 | 68 | // acceptableItem可能为TermType.Ignore类型,此时其value并不是RegionEntity对象,因此下面region的值可能为null 69 | val region = acceptableItem.value as? RegionEntity 70 | 71 | // 更新当前状态 72 | stack.push(acceptableItem) // 匹配项压栈 73 | // 使用全名匹配的词条数 74 | if (isFullMatch(entry, region)) 75 | fullMatchCount++ 76 | currentPos = positioning(region, entry, text, pos) // 当前结束的位置 77 | updateCurrentDivisionState(region, entry) // 刷新当前已经匹配上的省市区 78 | 79 | return true 80 | } 81 | 82 | private fun findAcceptableItem(entry: TermIndexEntry, text: String, pos: Int): TermIndexItem? { 83 | var mostPriority = -1 84 | var acceptableItem: TermIndexItem? 
= null 85 | 86 | // 每个 被索引对象循环,找出最匹配的 87 | loop@ for (item in entry.items) { 88 | // 仅处理省市区类型的 被索引对象,忽略其它类型的 89 | if (!isAcceptableItemType(item.type!!)) continue 90 | 91 | //省市区中的特殊名称 92 | if (item.type == TermType.Ignore) { 93 | if (acceptableItem == null) { 94 | mostPriority = 4 95 | acceptableItem = item 96 | } 97 | continue 98 | } 99 | 100 | val region = item.value as RegionEntity 101 | // 从未匹配上任何一个省市区,则从全部被索引对象中找出一个级别最高的 102 | if (!curDivision.hasProvince()) { 103 | 104 | // 在为匹配上任务省市区情况下, 由于 `xx路` 的xx是某县区/市区/省的别名, 如江苏路, 绍兴路等等, 导致错误的匹配。 105 | // 如 延安路118号, 错误匹配上了延安县 106 | if (!isFullMatch(entry, region) && pos + 1 <= text.length - 1) { 107 | if (region.type == Province 108 | || region.type == City 109 | || region.type == CityLevelDistrict || region.type == District 110 | || region.type == Street || region.type == PlatformL4 111 | || region.type == Town) { // 县区或街道 112 | 113 | // 如果是某某路, 街等 114 | when (text[pos + 1]) { 115 | '路', '街', '巷', '道' -> continue@loop 116 | } 117 | } 118 | } 119 | 120 | if (mostPriority == -1) { 121 | mostPriority = region.type.value 122 | acceptableItem = item 123 | } 124 | if (region.type.value < mostPriority) { 125 | mostPriority = region.type.value 126 | acceptableItem = item 127 | } 128 | continue 129 | } 130 | 131 | // 对于省市区全部匹配, 并且当前term属于非完全匹配的时候 132 | // 需要忽略掉当前term, 以免污染已经匹配的省市区 133 | if (!isFullMatch(entry, region) && hasThreeDivision()) { 134 | when (region.type) { 135 | Province -> { 136 | if (region.id != curDivision.province!!.id) { 137 | continue@loop 138 | } 139 | } 140 | City, CityLevelDistrict -> { 141 | if (region.id != curDivision.city!!.id) { 142 | continue@loop 143 | } 144 | } 145 | District -> { 146 | if (region.id != curDivision.district!!.id) { 147 | continue@loop 148 | } 149 | } 150 | else -> { } 151 | } 152 | } 153 | 154 | // 已经匹配上部分省市区,按下面规则判断最匹配项 155 | // 高优先级的排除情况 156 | if (!isFullMatch(entry, region) && pos + 1 <= text.length - 1) { // 使用别名匹配,并且后面还有一个字符 157 | // 1. 
湖南益阳沅江市万子湖乡万子湖村 158 | // 错误匹配方式:提取省市区时,将【万子湖村】中的字符【万子湖】匹配成【万子湖乡】,剩下一个【村】。 159 | // 2. 广东广州白云区均和街新市镇 160 | // 白云区下面有均和街道,街道、乡镇使用别名匹配时,后续字符不能是某些行政区域和道路关键字符 161 | if (region.type == Province 162 | || region.type == City 163 | || region.type in listOf(CityLevelDistrict, District) 164 | || region.type == Street 165 | || region.type == Town) { //街道、乡镇 166 | when (text[pos + 1]) { 167 | '区', '县', '乡', '镇', '村', '街', '路' -> continue@loop 168 | '大' -> if (pos + 2 <= text.length - 1) { 169 | val c = text[pos + 2] 170 | if (c == '街' || c == '道') continue@loop 171 | } 172 | } 173 | } 174 | } 175 | 176 | // 1. 匹配度最高的情况,正好是下一级行政区域 177 | if (region.parentId == curDivision.leastRegion().id) { 178 | acceptableItem = item 179 | break 180 | } 181 | 182 | // 2. 中间缺一级的情况。 183 | if (mostPriority == -1 || mostPriority > 2) { 184 | val parent = persister.getRegion(region.parentId) 185 | // 2.1 缺地级市 186 | if (!curDivision.hasCity() && curDivision.hasProvince() && region.type == District 187 | && curDivision.province!!.id == parent!!.parentId) { 188 | mostPriority = 2 189 | acceptableItem = item 190 | continue 191 | } 192 | // 2.2 缺区县 193 | if (!curDivision.hasDistrict() && curDivision.hasCity() 194 | && (region.type == Street || region.type == Town 195 | || region.type == PlatformL4 || region.type == Village) 196 | && curDivision.city!!.id == parent!!.parentId) { 197 | mostPriority = 2 198 | acceptableItem = item 199 | continue 200 | } 201 | } 202 | 203 | // 3. 
地址中省市区重复出现的情况 204 | if (mostPriority == -1 || mostPriority > 3) { 205 | if (curDivision.hasProvince() && curDivision.province!!.id == region.id || 206 | curDivision.hasCity() && curDivision.city!!.id == region.id || 207 | curDivision.hasDistrict() && curDivision.district!!.id == region.id || 208 | curDivision.hasStreet() && curDivision.street!!.id == region.id || 209 | curDivision.hasTown() && curDivision.town!!.id == region.id || 210 | curDivision.hasVillage() && curDivision.village!!.id == region.id) { 211 | mostPriority = 3 212 | acceptableItem = item 213 | continue 214 | } 215 | } 216 | 217 | // 4. 容错 218 | if (mostPriority == -1 || mostPriority > 4) { 219 | // 4.1 新疆阿克苏地区阿拉尔市 220 | // 到目前为止,新疆下面仍然有地级市【阿克苏地区】 221 | //【阿拉尔市】是县级市,以前属于地级市【阿克苏地区】,目前已变成新疆的省直辖县级行政区划 222 | // 即,老的行政区划关系为:新疆->阿克苏地区->阿拉尔市 223 | // 新的行政区划关系为: 224 | // 新疆->阿克苏地区 225 | // 新疆->阿拉尔市 226 | // 错误匹配方式:新疆 阿克苏地区 阿拉尔市,会导致在【阿克苏地区】下面无法匹配到【阿拉尔市】 227 | // 正确匹配结果:新疆 阿拉尔市 228 | if (region.type == CityLevelDistrict 229 | && curDivision.hasProvince() && curDivision.province!!.id == region.parentId) { 230 | mostPriority = 4 231 | acceptableItem = item 232 | continue 233 | } 234 | // 4.2 地级市-区县从属关系错误,但区县对应的省份正确,则将使用区县的地级市覆盖已匹配的地级市 235 | // 主要是地级市的管辖范围有调整,或者由于外部系统地级市与区县对应关系有调整导致 236 | if (region.type == District // 必须是普通区县 237 | && curDivision.hasCity() && curDivision.hasProvince() 238 | && isFullMatch(entry, region) // 使用的全名匹配 239 | && curDivision.city!!.id != region.parentId) { 240 | val city = persister.getRegion(region.parentId)!! // 区县的地级市 241 | if (city.parentId == curDivision.province!!.id && !hasThreeDivision()) { 242 | mostPriority = 4 243 | acceptableItem = item 244 | continue 245 | } 246 | } 247 | } 248 | 249 | // 5. 
    /**
     * Tells whether [entry] matched [region] by its full name.
     *
     * A match counts as "full" when the entry key is as long as the region name, or when the
     * region is a Street whose name ends with "街道" and the key is exactly one character shorter
     * (i.e. "xx街道" matched through an "xx镇" / "xx乡" alias).
     *
     * @return false when [region] is null.
     */
    private fun isFullMatch(entry: TermIndexEntry, region: RegionEntity?): Boolean {
        if (region == null) return false
        if (entry.key!!.length == region.name.length) return true
        if (region.type == Street && region.name.endsWith("街道") && region.name.length == entry.key!!.length + 1)
            return true // "xx街道" matched via its "xx镇"/"xx乡" alias is treated as a full-name match
        return false
    }

    /**
     * Whether an index item of the given [type] can be accepted while interpreting regions
     * (province / city / district / street / town / village, plus ignorable terms).
     */
    private fun isAcceptableItemType(type: TermType): Boolean {
        return when (type) {
            TermType.Province, TermType.City, TermType.District,
            TermType.Street, TermType.Town, TermType.Village, TermType.Ignore -> true
            else -> false
        }
    }

    /**
     * Whether province, city and district are all matched already AND form a consistent
     * parent chain (city belongs to province, district belongs to city).
     */
    private fun hasThreeDivision(): Boolean {
        return (curDivision.hasProvince() && curDivision.hasCity() && curDivision.hasDistrict())
                && (curDivision.city!!.parentId == curDivision.province!!.id)
                && (curDivision.district!!.parentId == curDivision.city!!.id)
    }

    /**
     * Computes the text position to continue from after [acceptedRegion] was accepted at [pos].
     *
     * @return [pos], or `pos + 1` when the character following an alias match has to be swallowed.
     */
    private fun positioning(acceptedRegion: RegionEntity?, entry: TermIndexEntry, text: String, pos: Int): Int {
        if (acceptedRegion == null) return pos
        // Cases where the pointer has to be adjusted:
        // 1. 山东泰安肥城市桃园镇桃园镇山东省泰安市肥城县桃园镇东伏村
        //    Wrong match: while extracting the regions, "肥城" inside "肥城县" is matched as "肥城市",
        //    which would leave a dangling "县" behind.
        if ((acceptedRegion.type == City || acceptedRegion.type == District
                        || acceptedRegion.type == Street)
                && !isFullMatch(entry, acceptedRegion) && pos + 1 <= text.length - 1) {
            val c = text[pos + 1]
            if (ambiguousChars.contains(c)) { // the next character is one of the ambiguous suffix characters
                // Keep the pointer if some child region starts with that character: it belongs to the child.
                for (child in acceptedRegion.children ?: arrayListOf()) {
                    if (child.name[0] == c) return pos
                }
                return pos + 1
            }
            // fix: the lowest level (town / street) has already been matched
            if (curDivision.hasTown() || curDivision.hasStreet()) {
                // Names such as "xx小区" / "xx苑" may be named after a town.
                // This condition is always true here: the ambiguous case returned above.
                if (!ambiguousChars.contains(c)) {
                    deepMostPos = currentPos // original note: "do not move the current pointer"
                }
            }
        }
        return pos
    }

    /**
     * Updates the state of the currently matched division with a newly accepted [region].
     *
     * @param region the accepted region; null is ignored
     * @param entry the index entry that produced [region]; its item count decides whether
     *              city/province may be back-filled in strict mode
     */
    private fun updateCurrentDivisionState(region: RegionEntity?, entry: TermIndexEntry) {
        if (region == null) return
        // region is a repetition of something already matched: nothing to update
        if (region == curDivision.province || region == curDivision.city
                || region == curDivision.district || region == curDivision.street
                || region == curDivision.town || region == curDivision.village)
            return

        // non-strict mode, or the entry carries exactly one item (the parent is unambiguous)
        val needUpdateCityAndProvince = !strict || (entry.items.size == 1)
        when (region.type) {
            Province, ProvinceLevelCity1 -> {
                curDivision.province = region
                curDivision.city = null
            }
            City, ProvinceLevelCity2 -> {
                curDivision.city = region
                if (!curDivision.hasProvince())
                    curDivision.province = persister.getRegion(region.parentId)
            }
            CityLevelDistrict -> {
                // this region type fills both the city slot and the district slot
                curDivision.city = region
                curDivision.district = region
                if (!curDivision.hasProvince())
                    curDivision.province = persister.getRegion(region.parentId)
            }
            District -> {
                curDivision.district = region
                // a district was matched successfully, so the city is force-updated to its parent
                curDivision.city = persister.getRegion(curDivision.district!!.parentId)
                if (!curDivision.hasProvince())
                    curDivision.province = persister.getRegion(curDivision.city!!.parentId)
            }
            Street, PlatformL4 -> {
                if (!curDivision.hasStreet()) curDivision.street = region
                if (!curDivision.hasDistrict()) curDivision.district = persister.getRegion(region.parentId)
                if (needUpdateCityAndProvince) {
                    updateCityAndProvince(curDivision.district)
                }
            }
            Town -> {
                if (!curDivision.hasTown()) curDivision.town = region
                if (!curDivision.hasDistrict()) curDivision.district = persister.getRegion(region.parentId)
                if (needUpdateCityAndProvince) {
                    updateCityAndProvince(curDivision.district)
                }
            }
            Village -> {
                if (!curDivision.hasVillage()) curDivision.village = region
                if (!curDivision.hasDistrict()) curDivision.district = persister.getRegion(region.parentId)
                if (needUpdateCityAndProvince) {
                    updateCityAndProvince(curDivision.district)
                }
            }
            else -> { }
        }
    }

    /**
     * Back-fills the city (and, if also missing, the province) from the parent chain of the given
     * district. Does nothing when the district is null or a city is already set.
     */
    private fun updateCityAndProvince(distinct: RegionEntity?) {
        if (distinct == null) return
        if (!curDivision.hasCity()) {
            curDivision.city = persister.getRegion(distinct.parentId)?.also { city ->
                if (!curDivision.hasProvince()) {
                    curDivision.province = persister.getRegion(city.parentId)
                }
            }
        }
    }

    /**
     * Current pointer position after [visit] accepted an index item.
     */
    override fun position(): Int {
        return this.currentPos
    }

    /**
     * Ends the visit of an index entry: records the deepest match so far, pops the visited item
     * and rolls the current division back to what the remaining stack still supports.
     */
    override fun endVisit(entry: TermIndexEntry, text: String, pos: Int) {
        this.checkDeepMost()

        val indexTerm = stack.pop() // pop the index item that is being left
        currentPos = pos - entry.key!!.length // restore the current position pointer
        val region = indexTerm.value as? RegionEntity
        // NOTE(review): the counter is incremented while leaving the entry — confirm against visit(),
        // which is outside this chunk.
        if (isFullMatch(entry, region)) fullMatchCount++ // update the number of full-name matches
        if (indexTerm.type == TermType.Ignore) return // ignorable items never changed the matched division

        // Scan the stack once: find street, town and village, plus `least`, the first
        // province/city/district level item encountered (described upstream as the lowest level).
        var least: RegionEntity? = null
        var street: RegionEntity? = null
        var town: RegionEntity? = null
        var village: RegionEntity? = null
        stack.forEach {
            if (it.type == TermType.Ignore) return@forEach
            val r = it.value as RegionEntity
            when (r.type) {
                Street, PlatformL4 -> {
                    street = r
                    return@forEach
                }
                Town -> {
                    town = r
                    return@forEach
                }
                Village -> {
                    village = r
                    return@forEach
                }
                else -> { }
            }
            if (least == null) {
                least = r
                return@forEach
            }
        }
        if (street == null) curDivision.street = null // no street left among the remaining matches
        if (town == null) curDivision.town = null // no town left among the remaining matches
        if (village == null) curDivision.village = null // no village left among the remaining matches
        // province / city / district are only cleared once street, town and village are all gone
        if (curDivision.hasStreet() || curDivision.hasTown() || curDivision.hasVillage()) return
        if (least != null) {
            when (least!!.type) {
                Province, ProvinceLevelCity1 -> {
                    curDivision.city = null
                    curDivision.district = null
                    return
                }
                City, ProvinceLevelCity2 -> {
                    curDivision.district = null
                    return
                }
                else -> return
            }
        }
        // least is null: nothing (other than ignorable items) is left on the stack
        curDivision.province = null
        curDivision.city = null
        curDivision.district = null
    }

    /**
     * Ends one round of term matching.
     */
    override fun endRound() {
        this.checkDeepMost()
        currentLevel--
    }

    /**
     * Snapshots the current state as the "deepest" result whenever the stack is deeper than any
     * depth recorded so far.
     */
    private fun checkDeepMost() {
        if (stack.size > deepMostLevel) {
            deepMostLevel = stack.size
            deepMostPos = currentPos
            deepMostFullMatchCount = fullMatchCount
            deepMostDivision.province = curDivision.province
            deepMostDivision.city = curDivision.city
            deepMostDivision.district = curDivision.district
            deepMostDivision.street = curDivision.street
            deepMostDivision.town = curDivision.town
            deepMostDivision.village = curDivision.village
        }
    }

    /**
     * Whether a result was matched: the deepest position is positive and a district is known.
     */
    override fun hasResult(): Boolean {
        return deepMostPos > 0 && deepMostDivision.hasDistrict()
    }

    /**
     * The division captured at the deepest match.
     */
    override fun devision(): Division {
        return deepMostDivision
    }

    /**
     * Stack depth recorded at the deepest match.
     */
    override fun matchCount(): Int {
        return deepMostLevel
    }

    /**
     * Full-name match counter recorded at the deepest match.
     */
    override fun fullMatchCount(): Int {
        return deepMostFullMatchCount
    }

    /**
     * End position of the final match result.
     */
    override fun endPosition(): Int {
        return deepMostPos
    }

    /**
     * Resets levels, positions, counters and both divisions to their initial values.
     */
    override fun reset() {
        currentLevel = 0
        deepMostLevel = 0
        currentPos = -1
        deepMostPos = -1
        fullMatchCount = 0
        deepMostFullMatchCount = 0
        deepMostDivision.province = null
        deepMostDivision.city = null
        deepMostDivision.district = null
        deepMostDivision.street = null
        deepMostDivision.town = null
        deepMostDivision.village = null
        curDivision.province = null
        curDivision.city = null
        curDivision.district = null
        curDivision.street = null
        curDivision.town = null
        curDivision.village = null
    }
org.bitlap.geocoding.similarity.Document 8 | import org.bitlap.geocoding.similarity.MatchedResult 9 | import org.bitlap.geocoding.similarity.MatchedTerm 10 | import org.bitlap.geocoding.similarity.Term 11 | import org.bitlap.geocoding.similarity.Term.TermType 12 | import org.bitlap.geocoding.similarity.Term.TermType.Building 13 | import org.bitlap.geocoding.similarity.Term.TermType.City 14 | import org.bitlap.geocoding.similarity.Term.TermType.District 15 | import org.bitlap.geocoding.similarity.Term.TermType.Province 16 | import org.bitlap.geocoding.similarity.Term.TermType.Road 17 | import org.bitlap.geocoding.similarity.Term.TermType.RoadNum 18 | import org.bitlap.geocoding.similarity.Term.TermType.Street 19 | import org.bitlap.geocoding.similarity.Term.TermType.Text 20 | import org.bitlap.geocoding.similarity.Term.TermType.Town 21 | import org.bitlap.geocoding.similarity.Term.TermType.Village 22 | import org.bitlap.geocoding.utils.isAsciiChars 23 | import org.bitlap.geocoding.utils.isNumericChars 24 | 25 | /** 26 | * Desc: 相似度算法相关逻辑 27 | * 28 | * * 关于 TF-IDF 29 | * * TC: 词数 Term Count, 某个词在文档中出现的次数 30 | * * TF: 词频 Term Frequency, 某个词在文档中出现的频率. TF = 该词在文档中出现的次数 / 该文档的总词数 31 | * * IDF: 逆文档词频 Inverse Document Frequency. IDF = log( 语料库文档总数 / ( 包含该词的文档数 + 1 ) ). 
/**
 * Desc: similarity computation for normalized addresses.
 *
 * About TF-IDF:
 * * TC: term count, the number of times a term occurs in a document.
 * * TF: term frequency. TF = occurrences of the term in the document / total terms of the document.
 * * IDF: inverse document frequency. IDF = log( documents in the corpus / ( documents containing the term + 1 ) ).
 *   The "+ 1" keeps the denominator from being zero.
 * * TF-IDF: the feature value of a term, TF-IDF = TF * IDF.
 *
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/5
 */
open class SimilarityComputer : Computer {

    private val segmenter = IKAnalyzerSegmenter() // segmenter for the free text, IK by default

    // private val simpleSegmenter = SimpleSegmenter()
    private val simpleSegmenter = AsciiSegmenter() // used (for now) to split the building number

    // Chinese numerals for 1..9; the index in this string + 1 is the digit value
    private val CN_DIGITS = "一二三四五六七八九"

    // boost constants
    private val BOOST_M = 1.0   // normal weight
    private val BOOST_L = 2.0   // high weight
    private val BOOST_XL = 4.0  // very high weight
    private val BOOST_S = 0.5   // reduced weight
    private val BOOST_XS = 0.25 // strongly reduced weight

    /**
     * Converts a normalized address into a [Document].
     * 1. Segments the remaining free text of the address.
     * 2. Creates one term per structured part (town, village, road, road number, building).
     * 3. Adds the text tokens that are not already covered by a structured term.
     * 4. Assigns an IDF to every term.
     */
    override fun analyze(address: Address): Document {
        val doc = Document()

        var tokens: List<String> = emptyList()
        // 1. segment the text that is left over after address interpretation
        if (!address.text.isNullOrBlank()) {
            tokens = segmenter.segment(address.text!!)
        }

        val terms = arrayListOf<Term>()
        // 2. build the terms
        // 2.1 town (falls back to the street when no town is present)
        val town = if (!address.town.isNullOrBlank()) address.town else address.street
        if (!town.isNullOrBlank()) {
            doc.town = Term(Town, town)
            terms.add(doc.town!!)
        }
        // 2.2 village
        val village = address.village
        if (!village.isNullOrBlank()) {
            doc.village = Term(Village, village)
            terms.add(doc.village!!)
        }
        // 2.3 road
        val road = address.road
        if (!road.isNullOrBlank()) {
            doc.road = Term(Road, road)
            terms.add(doc.road!!)
        }
        // 2.4 road number; it references the road term so the two can be compared together
        val roadNum = address.roadNum
        if (!roadNum.isNullOrBlank()) {
            val roadNumTerm = Term(RoadNum, roadNum)
            doc.roadNum = roadNumTerm
            doc.roadNumValue = translateRoadNum(roadNum)
            roadNumTerm.ref = doc.road
            terms.add(doc.roadNum!!)
        }
        // 2.5 building number, split into its digit / letter parts
        val buildingNum = address.buildingNum
        if (!buildingNum.isNullOrBlank()) {
            translateBuilding(buildingNum).forEach {
                terms.add(Term(Building, it))
            }
        }

        // 3. add the text tokens that are neither one of the structured terms above
        //    nor equal to town / village / road
        val termTexts = terms.map(Term::text)
        tokens.forEach {
            if (!termTexts.contains(it) && town != it && village != it && road != it) {
                terms.add(Term(Text, it))
            }
        }

        // 4. set the IDF of every term
        //    (TF-IDF is not particularly meaningful for address similarity, so this is simplified)
        putIdfs(terms)

        doc.terms = terms
        return doc
    }

    /**
     * Computes the similarity of two normalized addresses.
     * 1. Turns both addresses into documents.
     * 2. Weights the terms of each document.
     * 3. Computes the cosine similarity in both directions and returns the lower one
     *    (0..1, higher means more similar, 1 means identical).
     *
     * @return an empty [MatchedResult] when either address is null or when the two addresses
     *         are not in the same province / city / district.
     */
    override fun compute(addr1: Address?, addr2: Address?): MatchedResult {
        if (addr1 == null || addr2 == null) {
            return MatchedResult()
        }
        // addresses in different province / city / district are considered different
        if (addr1.provinceId != addr2.provinceId || addr1.cityId != addr2.cityId || addr1.districtId != addr2.districtId) {
            return MatchedResult()
        }

        // build the terms of each address
        val doc1 = analyze(addr1)
        val doc2 = analyze(addr2)

        // similarity in both directions
        val cp1 = computeSimilarity(doc1, doc2)
        val cp2 = computeSimilarity(doc2, doc1)

        // for now the smaller of the two results wins
        if (cp1.similarity < cp2.similarity) {
            return cp1
        }
        return cp2
    }

    /**
     * Numeric value of a single digit character.
     *
     * @return 0..9 for ASCII digits and full-width digits (U+FF10..U+FF19), 1..9 for the Chinese
     *         numerals in [CN_DIGITS], and -1 for anything else (including '十').
     */
    private fun digitValueOf(c: Char): Int = when (c) {
        in '0'..'9' -> c - '0'
        in '\uFF10'..'\uFF19' -> c - '\uFF10'
        else -> {
            val idx = CN_DIGITS.indexOf(c)
            if (idx >= 0) idx + 1 else -1
        }
    }

    /**
     * Extracts the number from a road number such as "40号" or "一号院".
     *
     * Digits are collected in order; '十' is expanded depending on its neighbours, because
     * "十号" (10) and "二十号" (20) mean different things. Non-digit characters are skipped.
     *
     * @return the parsed number, or 0 when the input is blank, contains no digit, or the
     *         collected digits do not fit into an Int.
     */
    private fun translateRoadNum(roadNum: String?): Int {
        if (roadNum.isNullOrBlank()) return 0

        val sb = StringBuilder()
        var isTen = false // the previous character was '十'
        for (c in roadNum) {
            if (isTen) {
                val pre = sb.isNotEmpty()       // a digit precedes the '十'
                val post = digitValueOf(c) >= 0 // a digit follows the '十'
                if (pre) {
                    // "二十" followed by a non-digit -> 20; "二十三" -> the 3 is appended below
                    if (!post) sb.append('0')
                } else {
                    // "十三" -> 13; bare "十" -> 10
                    if (post) sb.append('1') else sb.append("10")
                }
                isTen = false
            }
            if (c == '十') {
                isTen = true
                continue
            }
            val digit = digitValueOf(c)
            if (digit >= 0) sb.append(digit)
        }
        // a trailing '十' that was never followed by another character
        if (isTen) {
            if (sb.isNotEmpty()) sb.append('0') else sb.append("10")
        }
        // toIntOrNull covers both the empty buffer and digit strings too long for an Int
        return sb.toString().toIntOrNull() ?: 0
    }

    /**
     * Unlike a road number, a building string may contain several numbers.
     * Extracts the digit and letter runs of [building].
     */
    private fun translateBuilding(building: String?): List<String> {
        if (building.isNullOrBlank()) return emptyList()
        return simpleSegmenter.segment(building)
    }

    /**
     * Assigns an IDF to every term.
     * Simplistic implementation, TODO: no corpus statistics are available yet.
     */
    private fun putIdfs(terms: List<Term>) {
        terms.forEach {
            val key = it.text
            if (key.isNumericChars()) it.idf = 2.0
            else if (key.isAsciiChars()) it.idf = 2.0
            // else it.idf = Math.log(docs / (tdocs + 1))
            else it.idf = 4.0 // default of 4 because there are no corpus statistics
        }
    }

    /**
     * Computes the cosine similarity of [doc1] (the query side) against [doc2].
     */
    private fun computeSimilarity(doc1: Document, doc2: Document): MatchedResult {

        // 1. match rate of the Text terms
        var qTextTermCount = 0      // number of Text terms in doc1
        var dTextTermMatchCount = 0 // number of those that also occur as Text terms in doc2
        // index span (within doc2.terms) covered by the matched terms
        var matchStart = -1
        var matchEnd = -1
        for (term1 in doc1.terms ?: emptyList()) {
            if (term1.type != TermType.Text) continue
            qTextTermCount++
            for ((i, term2) in (doc2.terms ?: emptyList()).withIndex()) {
                if (term2.type != TermType.Text) continue
                if (term1.text == term2.text) {
                    dTextTermMatchCount++
                    if (matchStart == -1) {
                        matchEnd = i
                        matchStart = matchEnd
                        break
                    }
                    if (i > matchEnd)
                        matchEnd = i
                    else if (i < matchStart)
                        matchStart = i
                    break
                }
            }
        }

        // 1.1 match rate: sqrt( matched terms / Text terms of doc1 ) * 0.5 + 0.5
        var termCoord = 1.0
        if (qTextTermCount > 0) {
            termCoord = Math.sqrt(dTextTermMatchCount * 1.0 / qTextTermCount) * 0.5 + 0.5
        }
        // 1.2 density: sqrt( matched terms / span of the matches in doc2 ) * 0.5 + 0.5
        var termDensity = 1.0
        if (qTextTermCount >= 2 && dTextTermMatchCount >= 2) {
            termDensity = Math.sqrt(dTextTermMatchCount * 1.0 / (matchEnd - matchStart + 1)) * 0.5 + 0.5
        }

        // 2. (non-standard) TF-IDF and the intermediate sums of the cosine similarity
        val result = MatchedResult()
        result.doc1 = doc1
        result.doc2 = doc2

        var sumQD = 0.0
        var sumQQ = 0.0
        var sumDD = 0.0
        for (qterm in doc1.terms ?: emptyList()) {
            val qboost = getBoostValue(false, doc1, qterm, doc2, null)
            val q_TF_IDF = qboost * qterm.idf!!
            // the corresponding term of doc2
            var dterm = doc2.getTerm(qterm.text)
            if (dterm == null && RoadNum == qterm.type) {
                // road numbers differ: still pair them up when both documents are on the same road
                if (doc2.roadNum != null && doc2.road != null && doc2.road == qterm.ref)
                    dterm = doc2.roadNum
            }

            val dboost = if (dterm == null) 0.0 else getBoostValue(true, doc1, qterm, doc2, dterm)
            val coord = if (dterm != null && Text == dterm.type) termCoord else 1.0
            val density = if (dterm != null && Text == dterm.type) termDensity else 1.0
            val d_TF_IDF = (if (dterm != null) dterm.idf else qterm.idf)!! * dboost * coord * density

            // record the matched term
            if (dterm != null) {
                val matchedTerm = MatchedTerm(dterm)
                matchedTerm.boost = dboost
                matchedTerm.tfidf = d_TF_IDF
                if (Text == dterm.type) {
                    matchedTerm.density = density
                    matchedTerm.coord = coord
                } else {
                    matchedTerm.density = -1.0
                    matchedTerm.coord = -1.0
                }
                result.terms.add(matchedTerm)
            }

            sumQQ += q_TF_IDF * q_TF_IDF
            sumQD += q_TF_IDF * d_TF_IDF
            sumDD += d_TF_IDF * d_TF_IDF
        }

        // nothing comparable: leave the similarity of the result untouched
        if (sumDD == 0.0 || sumQQ == 0.0) return result

        // cosine similarity
        result.similarity = sumQD / Math.sqrt(sumQQ * sumDD)

        return result
    }

    /**
     * Returns the boost (weight) for a term depending on its type.
     * [forDoc]
     *  > true: compute the weight for [ddoc]; [qdoc], [qterm], [ddoc] and [dterm] are all non-null
     *  > false: compute the weight for [qdoc]; [qdoc], [qterm], [ddoc] are non-null, [dterm] is null
     */
    private fun getBoostValue(forDoc: Boolean, qdoc: Document, qterm: Term, ddoc: Document, dterm: Term?): Double {

        val termType = if (forDoc) dterm!!.type else qterm.type
        var boost = BOOST_M
        when (termType) {
            // province / city / district occur very often (low IDF) but matter most, so boost them
            Province, City, District -> boost = BOOST_XL
            // people are vague about street boundaries and pick streets rather freely, so reduce
            Street -> boost = BOOST_XS
            // town and village
            Town, Village -> {
                boost = BOOST_XS
                if (Town == termType) {
                    // Both documents have a town: boost it. The towns may be equal or different.
                    // > equal: both sides get BOOST_L, which raises the similarity
                    // > different: only the query term gets BOOST_L (the other side cannot match and
                    //   never reaches this function), which widens the gap
                    if (qdoc.town != null && ddoc.town != null) boost = BOOST_L
                } else {
                    // Village: boost when both documents have a village and the query has a town.
                    // As for towns, the villages may be equal or different.
                    if (qdoc.village != null && ddoc.village != null && qdoc.town != null) {
                        if (qdoc.town == ddoc.town) { // same town
                            if (qdoc.village == ddoc.village) boost = BOOST_XL
                            else boost = BOOST_L
                        } else if (ddoc.town != null) { // different towns
                            if (!forDoc) boost = BOOST_L
                            else boost = BOOST_S
                        }
                    }
                }
            }
            // road information
            Road, RoadNum, Building -> {
                // with both a town and a village present, road / road number are not boosted
                if (qdoc.town == null || qdoc.village == null) {
                    if (Road == termType) {
                        if (qdoc.road != null && ddoc.road != null) boost = BOOST_L
                    }
                    // Road number. Note: the road numbers of both documents reach this branch,
                    // unlike Road, Town and Village.
                    // TODO: building currently shares the road-number weighting
                    else {
                        if (qdoc.roadNumValue > 0 && ddoc.roadNumValue > 0 && qdoc.road != null && qdoc.road == ddoc.road) {
                            if (qdoc.roadNumValue == ddoc.roadNumValue)
                                boost = 3.0
                            else
                                boost = if (forDoc)
                                    // the further apart the numbers, the smaller the weight
                                    1 / Math.sqrt(Math.sqrt((Math.abs(qdoc.roadNumValue - ddoc.roadNumValue) + 1).toDouble())) * BOOST_L
                                else
                                    3.0
                        }
                    }
                }
            }
            Text -> boost = BOOST_M
            else -> boost = BOOST_M
        }

        return boost
    }
}
/**
 * Desc: simple segmenter that splits character by character while keeping consecutive digits
 * and consecutive English letters together as one token. Non-ASCII characters are dropped,
 * so in practice only English letters and digits are kept.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/28
 */
class AsciiSegmenter : SimpleSegmenter() {

    /**
     * Segments [text] through the parent implementation with removal of other characters enabled.
     */
    override fun segment(text: String): List<String> = super.segment(text, true)

}
/**
 * Desc: simple segmenter. Every character becomes its own token, except that consecutive
 * digits, and consecutive English letters, are kept together as a single token.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/6
 */
open class SimpleSegmenter : Segmenter {

    /**
     * Segments [text], keeping every character.
     */
    override fun segment(text: String): List<String> = segment(text, false)

    /**
     * Segments [text] into digit runs, letter runs and single other characters.
     *
     * [remove] when true, characters that are neither ASCII digits nor English letters are
     * dropped instead of being emitted as one-character tokens.
     *
     * @return an empty list for blank input.
     */
    protected fun segment(text: String, remove: Boolean): List<String> {
        val tokens = arrayListOf<String>()
        if (text.isBlank()) return tokens

        // Start index of the digit/letter run currently being collected, or -1 when there is none.
        var runStart = -1
        // Whether that run consists of digits (true) or letters (false).
        var runIsDigits = false

        text.forEachIndexed { idx, ch ->
            val isDigit = ch in '0'..'9'
            val isLetter = ch in 'A'..'Z' || ch in 'a'..'z'
            if (isDigit || isLetter) {
                // switching between digits and letters closes the run in progress
                if (runStart >= 0 && runIsDigits != isDigit) {
                    // `take` is the project helper imported by this file — presumably [begin, end] inclusive
                    tokens.add(text.take(runStart, idx - 1))
                    runStart = -1
                }
                if (runStart < 0) {
                    runStart = idx
                    runIsDigits = isDigit
                }
            } else {
                // any other character closes the run and is emitted on its own unless removed
                if (runStart >= 0) {
                    tokens.add(text.take(runStart, idx - 1))
                    runStart = -1
                }
                if (!remove) tokens.add(ch.toString())
            }
        }

        // a run that reaches the end of the text
        if (runStart >= 0) {
            tokens.add(text.take(runStart))
        }
        return tokens
    }
}
-------------------------------------------------------------------------------- /src/main/java/org/bitlap/geocoding/index/TermIndexBuilder.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.index 2 | 3 | import org.bitlap.geocoding.core.TermIndexVisitor 4 | import org.bitlap.geocoding.model.RegionEntity 5 | import org.bitlap.geocoding.model.RegionType.City 6 | import org.bitlap.geocoding.model.RegionType.CityLevelDistrict 7 | import org.bitlap.geocoding.model.RegionType.Country 8 | import org.bitlap.geocoding.model.RegionType.District 9 | import org.bitlap.geocoding.model.RegionType.PlatformL4 10 | import org.bitlap.geocoding.model.RegionType.Province 11 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity1 12 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity2 13 | import org.bitlap.geocoding.model.RegionType.Street 14 | import org.bitlap.geocoding.model.RegionType.Town 15 | import org.bitlap.geocoding.model.RegionType.Village 16 | import org.bitlap.geocoding.utils.head 17 | 18 | /** 19 | * Desc: 行政区划建立倒排索引 20 | * Mail: chk19940609@gmail.com 21 | * Created by IceMimosa 22 | * Date: 2017/1/17 23 | */ 24 | open class TermIndexBuilder( 25 | rootRegion: RegionEntity, 26 | ignoringRegionNames: List 27 | ) { 28 | 29 | private val indexRoot = TermIndexEntry() 30 | 31 | init { 32 | this.indexRegions(rootRegion.children ?: emptyList()) 33 | this.indexIgnoring(ignoringRegionNames) 34 | } 35 | 36 | // 为行政区划(标准地址库建立倒排索引) 37 | @Synchronized 38 | fun indexRegions(regions: List, replace: Boolean = false) { 39 | if (regions.isEmpty()) return 40 | for (region in regions) { 41 | val indexItem = TermIndexItem(convertRegionType(region), region) 42 | for (alias in region.orderedNames ?: emptyList()) { 43 | indexRoot.buildIndex(alias, 0, indexItem, replace) 44 | } 45 | 46 | //1. 为xx街道,建立xx镇、xx乡的别名索引项 47 | //2. 为xx镇,建立xx乡的别名索引项 48 | //3. 
为xx乡,建立xx镇的别名索引项 49 | val rName = region.name 50 | var autoAlias = rName.length <= 5 && region.alias.isEmpty() 51 | && (region.isTown() || rName.endsWith("街道")) 52 | if (autoAlias && rName.length == 5) { 53 | when (region.name[2]) { 54 | '路', '街', '门', '镇', '村', '区' -> autoAlias = false 55 | } 56 | } 57 | if (autoAlias) { 58 | var shortName: String? 59 | if (region.isTown()) { 60 | shortName = rName.head(rName.length - 1) ?: "" 61 | } else { 62 | shortName = rName.head(rName.length - 2) ?: "" 63 | } 64 | // 建立索引 65 | if (shortName.length >= 2) { 66 | indexRoot.buildIndex(shortName, 0, indexItem, replace) 67 | } 68 | if (rName.endsWith("街道") || rName.endsWith("镇")) 69 | indexRoot.buildIndex(shortName + "乡", 0, indexItem, replace) 70 | if (rName.endsWith("街道") || rName.endsWith("乡")) 71 | indexRoot.buildIndex(shortName + "镇", 0, indexItem, replace) 72 | } 73 | 74 | // 递归 75 | if (region.children != null && region.children!!.isNotEmpty()) { 76 | this.indexRegions(region.children!!) 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * 为忽略列表建立倒排索引 83 | */ 84 | @Synchronized 85 | fun indexIgnoring(ignoringRegionNames: List, replace: Boolean = false) { 86 | if (ignoringRegionNames.isEmpty()) return 87 | for (ignore in ignoringRegionNames) { 88 | indexRoot.buildIndex(ignore, 0, TermIndexItem(TermType.Ignore, null), replace) 89 | } 90 | } 91 | 92 | // 获取 region 的类型 93 | private fun convertRegionType(region: RegionEntity): TermType = 94 | when (region.type) { 95 | Country -> TermType.Country 96 | Province, ProvinceLevelCity1 -> TermType.Province 97 | City, ProvinceLevelCity2 -> TermType.City 98 | District, CityLevelDistrict -> TermType.District 99 | PlatformL4 -> TermType.Street 100 | Town -> TermType.Town 101 | Village -> TermType.Village 102 | Street -> if (region.isTown()) TermType.Town else TermType.Street 103 | else -> TermType.Undefined 104 | } 105 | 106 | /** 107 | * 深度优先匹配词条 108 | */ 109 | fun deepMostQuery(text: String?, visitor: TermIndexVisitor) { 110 | if (text == null || 
text.isEmpty()) return 111 | // 判断是否有中国开头 112 | var p = 0 113 | if (text.startsWith("中国") || text.startsWith("天朝")) { 114 | p += 2 115 | } 116 | this.deepMostQuery(text, p, visitor) 117 | } 118 | 119 | fun deepMostQuery(text: String?, pos: Int, visitor: TermIndexVisitor) { 120 | if (text == null || text.isEmpty()) return 121 | // 开始匹配 122 | visitor.startRound() 123 | this.deepFirstQueryRound(text, pos, indexRoot.children ?: emptyMap(), visitor) 124 | visitor.endRound() 125 | } 126 | 127 | private fun deepFirstQueryRound(text: String, pos: Int, entries: Map, visitor: TermIndexVisitor) { 128 | // 获取索引对象 129 | if (pos > text.length - 1) return 130 | val entry = entries[text[pos]] ?: return 131 | 132 | if (entry.children != null && pos + 1 <= text.length - 1) { 133 | this.deepFirstQueryRound(text, pos + 1, entry.children ?: emptyMap(), visitor) 134 | } 135 | if (entry.hasItem()) { 136 | if (visitor.visit(entry, text, pos)) { 137 | // 给访问者一个调整当前指针的机会 138 | val p = visitor.position() 139 | if (p + 1 <= text.length - 1) { 140 | deepMostQuery(text, p + 1, visitor) 141 | } 142 | visitor.endVisit(entry, text, p) 143 | } 144 | } 145 | } 146 | 147 | fun fullMatch(text: String?): List? { 148 | if (text == null || text.isEmpty()) return null 149 | return fullMatch(text, 0, indexRoot.children) 150 | } 151 | 152 | private fun fullMatch(text: String, pos: Int, entries: Map?): List? 
/**
 * One node of the term index trie.
 *
 * Each node is keyed by the prefix that leads to it, holds the index items
 * whose text ends exactly at this node, and owns one child node per next
 * character.
 */
open class TermIndexEntry {
    // Prefix of the indexed text that leads to this node
    var key: String? = null
    // Index items whose text ends at this node
    var items = mutableListOf<TermIndexItem>()
    // Child nodes, keyed by the next character
    var children = hashMapOf<Char, TermIndexEntry>()

    /** Appends [item] to this node and returns the node for chaining. */
    fun addItem(item: TermIndexItem): TermIndexEntry {
        this.items.add(item)
        return this
    }

    /** True when at least one item ends at this node. */
    fun hasItem(): Boolean = this.items.isNotEmpty()

    /**
     * Inserts [item] under the characters of [text] starting at [pos],
     * creating intermediate nodes as needed.
     *
     * Nothing happens when [text] is null/blank or [pos] is out of range.
     * When [replace] is true and the item carries a region, existing items at
     * the terminal node whose region matches via `equalsWithoutId` are removed
     * before the new item is added.
     */
    fun buildIndex(text: String?, pos: Int, item: TermIndexItem, replace: Boolean) {
        if (text.isNullOrBlank() || pos < 0 || pos >= text.length) {
            return
        }
        // Walk (and extend) the trie one character at a time.
        var node: TermIndexEntry = this
        for (i in pos until text.length) {
            val prefix = text.substring(0, i + 1)
            node = node.children.getOrPut(text[i]) {
                TermIndexEntry().also { it.key = prefix }
            }
        }
        val region = item.value
        if (replace && region != null) {
            node.items.removeIf { region.equalsWithoutId(it.value) }
        }
        node.addItem(item)
    }
}
/**
 * Type of an indexed term.
 *
 * An address is not strictly structured text, but it follows regular patterns
 * (province / city / district, road / house number, community / building / room).
 * The term type marks which part of an address a term belongs to, so that
 * similarity computation can weight the parts differently.
 */
enum class TermType(val type: Char) {

    Undefined('0'),
    // Country
    Country('C'),
    // Province
    Province('1'),
    // Prefecture-level city
    City('2'),
    // District / county
    District('3'),
    // Street (sub-district)
    Street('4'),
    // Town / township
    Town('T'),
    // Village
    Village('V'),
    // Road
    Road('R'),
    // House number
    RoadNum('N'),
    // Other address text
    Text('X'),
    // Ignored term
    Ignore('I');

    /**
     * Looks up the constant whose code is [type]; returns [Undefined] when no
     * constant uses that code. Kept for existing callers: the receiver is
     * irrelevant, so prefer [fromType].
     */
    fun toEnum(type: Char): TermType = fromType(type)

    companion object {
        // Code -> constant, built once instead of scanning values() per call.
        private val BY_TYPE: Map<Char, TermType> = values().associateBy { it.type }

        /** Returns the constant whose code is [type], or [Undefined] if none matches. */
        @JvmStatic
        fun fromType(type: Char): TermType = BY_TYPE[type] ?: Undefined
    }
}
/**
 * Flat address value object: ids and names of each administrative level plus
 * the road, road number, building number and leftover text.
 */
open class Address : Serializable {

    // Province
    var provinceId: Long? = null
    var province: String? = null
    // City
    var cityId: Long? = null
    var city: String? = null
    // District
    var districtId: Long? = null
    var district: String? = null
    // Street
    var streetId: Long? = null
    var street: String? = null
    // Town
    var townId: Long? = null
    var town: String? = null
    // Village
    var villageId: Long? = null
    var village: String? = null
    // Road
    var road: String? = null
    // Road number
    var roadNum: String? = null
    // Building information
    var buildingNum: String? = null
    // Remaining text that was not resolved into any field above
    var text: String? = null

    companion object {
        /**
         * Builds an [Address] from [entity]; returns null when [entity] is
         * null or has no province.
         */
        fun build(entity: AddressEntity?): Address? {
            if (entity == null || !entity.hasProvince()) return null
            return Address(
                entity.province?.id, entity.province?.name,
                entity.city?.id, entity.city?.name,
                entity.district?.id, entity.district?.name,
                entity.street?.id, entity.street?.name,
                entity.town?.id, entity.town?.name,
                entity.village?.id, entity.village?.name,
                entity.road,
                entity.roadNum,
                entity.buildingNum,
                entity.text
            )
        }
    }

    constructor()

    constructor(
        provinceId: Long?, province: String?,
        cityId: Long?, city: String?,
        districtId: Long?, district: String?,
        streetId: Long?, street: String?,
        townId: Long?, town: String?,
        villageId: Long?, village: String?,
        road: String?, roadNum: String?, buildingNum: String?, text: String?
    ) {
        this.provinceId = provinceId
        this.province = province
        this.cityId = cityId
        this.city = city
        this.districtId = districtId
        this.district = district
        this.streetId = streetId
        this.street = street
        this.townId = townId
        this.town = town
        this.villageId = villageId
        this.village = village
        this.road = road
        this.roadNum = roadNum
        this.buildingNum = buildingNum
        this.text = text
    }

    // All fields in declaration order; shared by equals() and hashCode().
    private fun fieldValues(): List<Any?> = listOf(
        provinceId, province,
        cityId, city,
        districtId, district,
        streetId, street,
        townId, town,
        villageId, village,
        road, roadNum, buildingNum, text
    )

    override fun toString(): String {
        val rows = listOf(
            "provinceId=$provinceId, province=$province",
            "cityId=$cityId, city=$city",
            "districtId=$districtId, district=$district",
            "streetId=$streetId, street=$street",
            "townId=$townId, town=$town",
            "villageId=$villageId, village=$village",
            "road=$road",
            "roadNum=$roadNum",
            "buildingNum=$buildingNum",
            "text=$text"
        )
        return rows.joinToString(separator = ", \n\t", prefix = "Address(\n\t", postfix = "\n)")
    }

    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        return other is Address && fieldValues() == other.fieldValues()
    }

    override fun hashCode(): Int =
        // Starting from 0 makes the first step equal to the first field's hash.
        fieldValues().fold(0) { acc, value -> 31 * acc + (value?.hashCode() ?: 0) }

}
/**
 * Standardised address entity: the administrative division inherited from
 * [Division] plus the free-text parts extracted while parsing.
 *
 * The setters of [text], [road], [roadNum] and [buildingNum] never store null:
 * a null value is stored as "" and any other value is trimmed. The fields
 * still start out as null until a setter is invoked.
 */
open class AddressEntity constructor() : Division(), Serializable {

    /**
     * Address text left over after parsing
     */
    var text: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed road information
     */
    var road: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed road number
     */
    var roadNum: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed building information
     */
    var buildingNum: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Hash of the source address, kept for uniqueness handling
     */
    var hash: Int? = null

    /**
     * The original source address
     */
    var address: String? = null

    /** Creates an entity whose [address] and [text] both start as [address]. */
    constructor(address: String?) : this() {
        this.address = address
        this.text = address
    }
}
/**
 * Administrative division holder: one optional region per level.
 */
open class Division {

    // Province
    var province: RegionEntity? = null
    // City
    var city: RegionEntity? = null
    // District
    var district: RegionEntity? = null
    // Street
    var street: RegionEntity? = null
    // Town. Reading falls back to [street] when the street itself is a town;
    // writing routes Street / PlatformL4 regions into [street] and ignores
    // null and every other region type.
    var town: RegionEntity? = null
        get() = field ?: this.street?.takeIf { it.isTown() }
        set(town) {
            when (town?.type) {
                RegionType.Town -> field = town
                RegionType.Street, RegionType.PlatformL4 -> this.street = town
                else -> Unit
            }
        }
    // Village
    var village: RegionEntity? = null


    fun hasProvince(): Boolean = this.province != null
    fun hasCity(): Boolean = this.city != null
    fun hasDistrict(): Boolean = this.district != null
    fun hasStreet(): Boolean = this.street != null
    fun hasTown(): Boolean = this.town != null
    fun hasVillage(): Boolean = this.village != null

    /**
     * Returns the lowest-level region that is set, checking village, town,
     * street, district, city and finally province. Throws when not even the
     * province is set.
     */
    fun leastRegion(): RegionEntity =
        this.village ?: this.town ?: this.street ?: this.district ?: this.city ?: this.province!!
}
/**
 * Region entity of the standard address library.
 *
 * Identity ([equals] / [hashCode]) is based on [id] only; use
 * [equalsWithoutId] to compare the descriptive fields instead.
 */
open class RegionEntity : Serializable {

    var id: Long = 0
    var parentId: Long = 0
    var name: String = ""
    var alias = ""
    var type: RegionType = RegionType.Undefined
    var zip = ""
    var children: ArrayList<RegionEntity>? = null

    // Name plus aliases; computed on first read under the instance lock, then cached.
    var orderedNames: List<String>? = null
        get() = synchronized(this) {
            field ?: buildOrderedNames().also { field = it }
        }

    // Collects the name and the non-blank ';'-separated aliases, longest first.
    // Without any alias the result is just the name.
    private fun buildOrderedNames(): List<String> {
        val names = mutableListOf(this.name)
        if (this.alias.isBlank()) return names
        this.alias.split(";").filterTo(names) { it.isNotBlank() }
        // Stable sort: entries of equal length keep their original order.
        names.sortByDescending { it.length }
        return names
    }

    /**
     * True for `Town` regions, and for `Street` regions whose non-blank name
     * has at most 4 characters and ends with '镇' or '乡'.
     */
    fun isTown(): Boolean = when (this.type) {
        RegionType.Town -> true
        RegionType.Street ->
            this.name.isNotBlank() &&
                this.name.length <= 4 &&
                (this.name.last() == '镇' || this.name.last() == '乡')
        else -> false
    }


    override fun equals(other: Any?): Boolean =
        other != null &&
            other.javaClass == RegionEntity::class.java &&
            this.id == (other as RegionEntity).id

    override fun hashCode(): Int = this.id.hashCode()

    /**
     * Compares every descriptive field except [id]. Only exact
     * [RegionEntity] instances (not subclasses) can match.
     */
    fun equalsWithoutId(other: Any?): Boolean {
        if (other == null || other.javaClass != RegionEntity::class.java) return false
        other as RegionEntity
        return parentId == other.parentId &&
            name == other.name &&
            alias == other.alias &&
            type == other.type &&
            zip == other.zip
    }

    override fun toString(): String {
        return "RegionEntity(id=$id, parentId=$parentId, name='$name', alias='$alias', type=$type, zip='$zip')"
    }
}

/**
 * Region type.
 */
enum class RegionType(val value: Int) {
    // Undefined region type
    Undefined(0),
    // Country
    Country(10),
    // Province
    Province(100),
    // Municipality - the level parallel to provinces
    ProvinceLevelCity1(150),
    // Municipality - the level parallel to cities
    ProvinceLevelCity2(151),
    // Prefecture-level city
    City(200),
    // County-level city administered directly by the province
    CityLevelDistrict(250),
    // County / district
    District(300),
    // Street / town level
    Street(450),
    // Platform-specific level-4 address
    PlatformL4(460),
    // Extra: town
    Town(400),
    // Extra: village
    Village(410);
}
/**
 * Document object: the terms extracted from one address plus shortcuts to the
 * town, village and road related terms.
 */
open class Document : Serializable {

    // All terms of the document, in document order, not de-duplicated
    var terms: List<Term>? = null

    // Term.text -> Term. Volatile so that a map published by one thread is
    // seen fully populated by threads reading it without the lock.
    @Volatile
    var termsMap: HashMap<String, Term>? = null

    // Town related terms
    var town: Term? = null
    var village: Term? = null

    // Road information
    var road: Term? = null
    var roadNum: Term? = null
    var roadNumValue = 0

    /**
     * Returns the term whose text equals [text], or null when the document has
     * no terms or none matches.
     *
     * The lookup map is built lazily on first use. It is filled completely
     * before being assigned to [termsMap], so concurrent callers never read a
     * partially built map.
     */
    fun getTerm(text: String?): Term? {
        val all = this.terms
        if (all.isNullOrEmpty()) return null
        val index = this.termsMap ?: synchronized(this) {
            // Re-check under the lock: another thread may have built it meanwhile.
            this.termsMap ?: buildTermsMap(all).also { this.termsMap = it }
        }
        return index[text]
    }

    // Indexes terms by their non-blank text; a later duplicate overwrites an earlier one.
    private fun buildTermsMap(all: List<Term>): HashMap<String, Term> {
        val map = HashMap<String, Term>()
        for (term in all) {
            val key = term.text
            if (!key.isNullOrBlank()) {
                map[key] = term
            }
        }
        return map
    }

    override fun toString(): String {
        return "Document(terms=$terms, town=$town, village=$village, road=$road, roadNum=$roadNum, roadNumValue=$roadNumValue)"
    }

}
/**
 * Result of a similarity match between two addresses.
 */
open class MatchedResult : Serializable {

    // Documents analysed from the two addresses
    var doc1: Document? = null
    var doc2: Document? = null

    // Matched term information
    var terms: ArrayList<MatchedTerm> = arrayListOf()

    // Similarity value
    var similarity = 0.0

    override fun toString(): String {
        return "MatchedResult(\n\tdoc1=$doc1, \n\tdoc2=$doc2, \n\tterms=$terms, \n\tsimilarity=$similarity\n)"
    }
}

/**
 * Information about one matched term.
 */
open class MatchedTerm : Serializable {

    // The matched term
    var term: Term? = null

    // Match rate
    var coord: Double = 0.0

    // Density
    var density: Double = 0.0

    // Weight
    var boost: Double = 0.0

    // Feature value (TF-IDF)
    var tfidf: Double = 0.0

    constructor(term: Term) {
        this.term = term
    }
}
/**
 * A term of an address document.
 *
 * Two terms are equal when their [text] is equal; the type does not take part
 * in [equals] / [hashCode].
 */
open class Term : Serializable {
    // Term content
    var text: String? = null

    // Term type
    var type: TermType? = null

    // Inverse document frequency. Province / city / district terms always
    // read as 0.0 and street terms as 1.0; every other type reads the stored value.
    var idf: Double? = null
        get() = when (type) {
            TermType.Province, TermType.City, TermType.District -> 0.0
            TermType.Street -> 1.0
            else -> field
        }

    // Reference to a related term
    var ref: Term? = null

    /**
     * Creates a term. The text of province, city, district, street, town and
     * ignore terms is interned; other texts are stored as given.
     */
    constructor(type: TermType, text: String?) {
        this.type = type
        this.text = when (type) {
            TermType.Province, TermType.City, TermType.District,
            TermType.Street, TermType.Town, TermType.Ignore -> text?.intern()
            else -> text
        }
    }

    override fun equals(other: Any?): Boolean =
        other != null &&
            other.javaClass == Term::class.java &&
            this.text == (other as Term).text

    override fun hashCode(): Int = this.text?.hashCode() ?: 0

    override fun toString(): String {
        return "Term($text)"
    }


    // Term type, mainly used to weight each part of an address
    enum class TermType(val value: Char) {
        Undefined('0'),
        // Province
        Province('1'),
        // Prefecture-level city
        City('2'),
        // District / county
        District('3'),
        // Street
        Street('4'),
        // Town
        Town('T'),
        // Village
        Village('V'),
        // Road
        Road('R'),
        // House number
        RoadNum('N'),
        // Building number
        Building('B'),
        // Other address text
        Text('X'),
        // Ignored term
        Ignore('I');
    }


}
/**
 * Returns the first [length] characters of the string.
 * A null, blank, or not-longer-than-[length] receiver is returned as is;
 * a non-positive [length] yields "".
 */
fun String?.head(length: Int): String? {
    if (this.isNullOrBlank() || length >= this.length) return this
    return if (length > 0) this.substring(0, length) else ""
}

/**
 * Returns the last [length] characters of the string.
 * A null, blank, or not-longer-than-[length] receiver is returned as is;
 * a non-positive [length] yields "".
 */
fun String?.tail(length: Int): String? {
    if (this.isNullOrBlank() || length >= this.length) return this
    return if (length > 0) this.takeLast(length) else ""
}

/**
 * Substring with lenient bounds.
 * [begin]: start position, inclusive
 */
fun String.take(begin: Int): String = when {
    this.isBlank() || begin <= 0 -> this
    begin >= this.length -> ""
    else -> this.substring(begin)
}

/**
 * Substring with lenient bounds; out-of-range positions are clamped.
 * [begin]: start position, inclusive
 * [end]: end position, inclusive
 */
fun String.take(begin: Int, end: Int): String {
    if (this.isBlank()) return this
    val from = maxOf(begin, 0)
    val to = minOf(end, this.length - 1)
    if (from > to) return ""
    return if (from == 0 && to == this.length - 1) this else this.substring(from, to + 1)
}

/**
 * Removes every character contained in [array], except those that also occur
 * in [exclude]. The receiver itself is returned when nothing was removed.
 */
@JvmOverloads
fun String.remove(array: CharArray, exclude: String = ""): String {
    if (this.isBlank() || array.isEmpty()) return this
    val kept = this.filterNot { it in array && it !in exclude }
    return if (kept.length == this.length) this else kept
}

/**
 * Drops every run of consecutive digits that is [length] or more digits long;
 * shorter runs are kept.
 * [length] : run length from which digits are dropped
 */
fun String.removeRepeatNum(length: Int): String {
    if (this.isBlank() || this.length < length) return this
    val out = StringBuilder(this.length)
    var run = 0
    for (idx in this.indices) {
        val ch = this[idx]
        if (ch in '0'..'9') {
            run++
            continue
        }
        // A digit run just ended: keep it only when it is shorter than [length].
        if (run in 1 until length) {
            out.append(this, idx - run, idx)
        }
        run = 0
        out.append(ch)
    }
    // Trailing digit run at the end of the string
    if (run in 1 until length) {
        out.append(this, this.length - run, this.length)
    }
    return out.toString()
}

/**
 * True when the string is non-blank and consists of the digits 0-9 only.
 */
fun String?.isNumericChars(): Boolean {
    if (this.isNullOrBlank()) return false
    return this.all { it in '0'..'9' }
}

/**
 * True when the string is non-blank and consists of ASCII letters only.
 */
fun String?.isAsciiChars(): Boolean {
    if (this.isNullOrBlank()) return false
    return this.all { it in 'a'..'z' || it in 'A'..'Z' }
}
/**
 * Verifies that a custom region added at runtime is used for normalizing,
 * survives saving the dictionary to a file, and is read back from that file.
 */
class TestCustomDatSave {

    @Test
    fun saveAndNomalizing() {
        val geocoding = GeocodingX("region_2021.dat")
        val address = "浙江省杭州市临平区经济开发区新颜路22号501D"

        // Before the custom district "临平区" is added
        val beforeCustomRegion = Address(
            330000000000, "浙江省",
            330100000000, "杭州市",
            330110000000, "余杭区",
            330110001000, "临平街道",
            null, null,
            null, null,
            null, null,
            "501",
            "区经济开发区新颜路22号D"
        )
        // kotlin.test.assertEquals takes (expected, actual)
        assertEquals(beforeCustomRegion, geocoding.normalizing(address))

        // Add the custom district "临平区"
        geocoding.addRegionEntry(330113000000, 330100000000, "临平区", RegionType.District, "", true)

        val afterCustomRegion = Address(
            330000000000, "浙江省",
            330100000000, "杭州市",
            330113000000, "临平区",
            null, null,
            null, null,
            null, null,
            "新颜路", "22号",
            "501",
            "D"
        )
        assertEquals(afterCustomRegion, geocoding.normalizing(address))

        // Save the dictionary that now contains "临平区" to a custom file
        val filename = "mydata.dat"
        val filePath = "${this.javaClass.classLoader.getResource("").path}/$filename"
        geocoding.save(filePath)

        // Load the custom dictionary file and check that "临平区" is still resolved
        val myGeocoding = GeocodingX(filename)
        assertEquals(afterCustomRegion, myGeocoding.normalizing(address))
    }

}
java.sql.DriverManager 6 | import kotlin.test.assertEquals 7 | 8 | /** 9 | * Desc: 测试地址标准化 10 | * Mail: chk19940609@gmail.com 11 | * Created by IceMimosa 12 | * Date: 2017/1/18 13 | */ 14 | class TestNormalizing { 15 | 16 | @Test 17 | fun testNormalizing() { 18 | assertEquals( 19 | Geocoding.normalizing("江苏泰州兴化市昌荣镇【康琴网吧】 (昌荣镇附近)"), 20 | Address( 21 | 320000000000, "江苏省", 22 | 321200000000, "泰州市", 23 | 321281000000, "兴化市", 24 | 321281119000, "昌荣镇", 25 | 321281119000, "昌荣镇", 26 | null, null, 27 | null, null, 28 | null, 29 | "康琴网吧昌荣镇附近" 30 | ) 31 | ) 32 | assertEquals( 33 | Geocoding.normalizing("中国山东临沂兰山区小埠东社区居委会【绿杨榭公寓31-1-101 】 (绿杨榭公寓附近)"), 34 | Address( 35 | 370000000000, "山东省", 36 | 371300000000, "临沂市", 37 | 371302000000, "兰山区", 38 | null, null, 39 | null, null, 40 | null, null, 41 | null, null, 42 | "31-1-101", 43 | "小埠东社区居委会绿杨榭公寓绿杨榭公寓附近" 44 | ) 45 | ) 46 | assertEquals( 47 | Geocoding.normalizing("抚顺顺城区将军桥【将军水泥厂住宅4-1-102】 (将军桥附近)"), 48 | Address( 49 | 210000000000, "辽宁省", 50 | 210400000000, "抚顺市", 51 | 210411000000, "顺城区", 52 | null, null, 53 | null, null, 54 | null, null, 55 | null, 56 | null, 57 | "4-1-102", 58 | "将军桥将军水泥厂住宅将军桥附近" 59 | ) 60 | ) 61 | assertEquals( 62 | Geocoding.normalizing("中国辽宁沈阳辽中县北一路【虹桥商厦西行100米】(邮政储蓄银行北一路支行附近)"), 63 | Address( 64 | 210000000000, "辽宁省", 65 | 210100000000, "沈阳市", 66 | 210122000000, "辽中县", 67 | null, null, 68 | null, null, 69 | null, null, 70 | "北一路", 71 | "", 72 | null, 73 | "虹桥商厦西行100米邮政储蓄银行北一路支行附近" 74 | ) 75 | ) 76 | assertEquals( 77 | Geocoding.normalizing("辽宁 沈阳 辽中县中国辽宁 沈阳 辽中县虹桥商厦苏宁易购"), 78 | Address( 79 | 210000000000, "辽宁省", 80 | 210100000000, "沈阳市", 81 | 210122000000, "辽中县", 82 | null, null, 83 | null, null, 84 | null, null, 85 | null, 86 | null, 87 | null, 88 | "虹桥商厦苏宁易购" 89 | ) 90 | ) 91 | assertEquals( 92 | Geocoding.normalizing("辽宁沈阳于洪区沈阳市辽中县县城虹桥商厦西侧三单元外跨楼梯3-2-23-"), 93 | Address( 94 | 210000000000, "辽宁省", 95 | 210100000000, "沈阳市", 96 | 210114000000, "于洪区", 97 | null, null, 98 | null, null, 99 | null, null, 100 | 
null, 101 | null, 102 | "3-2-23", 103 | "辽中县县城虹桥商厦西侧三单元外跨楼梯" 104 | ) 105 | ) 106 | assertEquals( 107 | Geocoding.normalizing("山东济宁任城区金宇路【杨柳国际新城K8栋3单元1302】(杨柳国际新城·丽宫附近)"), 108 | Address( 109 | 370000000000, "山东省", 110 | 370800000000, "济宁市", 111 | 370811000000, "任城区", 112 | null, null, 113 | null, null, 114 | null, null, 115 | "金宇路", 116 | "", 117 | "K8栋3单元1302", 118 | "杨柳国际新城杨柳国际新城丽宫附近" 119 | ) 120 | ) 121 | assertEquals( 122 | Geocoding.normalizing("上海宝山区杨行镇宝山区江杨北路98号农贸批发市场蔬菜二区7通道A16号"), 123 | Address( 124 | 310000000000, "上海", 125 | 310100000000, "上海市", 126 | 310113000000, "宝山区", 127 | 310113103000, "杨行镇", 128 | 310113103000, "杨行镇", 129 | null, null, 130 | "江杨北路", 131 | "98号", 132 | "7通道A16号", 133 | "农贸批发市场蔬菜二区" 134 | ) 135 | ) 136 | assertEquals( 137 | Geocoding.normalizing("上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)"), 138 | Address( 139 | 310000000000, "上海", 140 | 310100000000, "上海市", 141 | 310113000000, "宝山区", 142 | null, null, 143 | null, null, 144 | null, null, 145 | "新沪路", 146 | "58弄", 147 | "11-802", 148 | "水韵华庭水韵华庭附近" 149 | ) 150 | ) 151 | // 精确度缺失 152 | assertEquals( 153 | Geocoding.normalizing("赤城街道赤城大厦10E"), 154 | Address( 155 | 330000000000, "浙江省", 156 | 331000000000, "台州市", 157 | 331023000000, "天台县", 158 | 331023001000, "赤城街道", 159 | null, null, 160 | null, null, 161 | null, 162 | null, 163 | null, 164 | "大厦10E" 165 | ) 166 | ) 167 | assertEquals( 168 | Geocoding.normalizing("上海黄浦区内环以内黄浦区小东门聚奎街43号"), 169 | Address( 170 | 310000000000, "上海", 171 | 310100000000, "上海市", 172 | 310101000000, "黄浦区", 173 | null, null, 174 | null, null, 175 | null, null, 176 | "小东门聚奎街", 177 | "43号", 178 | null, 179 | "" 180 | ) 181 | ) 182 | assertEquals( 183 | Geocoding.normalizing("河南信阳平桥区王岗镇【镇上】 (王岗乡(大杨墩)附近)"), 184 | Address( 185 | 410000000000, "河南省", 186 | 411500000000, "信阳市", 187 | 411503000000, "平桥区", 188 | 411503209000, "王岗乡", 189 | 411503209000, "王岗乡", 190 | null, null, 191 | null, 192 | null, 193 | null, 194 | "附近镇上王岗乡大杨墩" 195 | ) 196 | ) 197 | // fix 若干电话号码 198 | 
assertEquals( 199 | Geocoding.normalizing("四川自贡贡井区四川省自贡市贡井区莲花镇四川自贡贡井区莲花镇黄桷村7组22号13298213121/15609000090/18681337139"), 200 | Address( 201 | 510000000000, "四川省", 202 | 510300000000, "自贡市", 203 | 510303000000, "贡井区", 204 | 510303107000, "莲花镇", 205 | 510303107000, "莲花镇", 206 | null, null, 207 | null, 208 | null, 209 | "7组22号", 210 | "黄桷村" 211 | ) 212 | ) 213 | // fix 大云小区, 大云是镇名称的情 214 | assertEquals( 215 | Geocoding.normalizing("浙江嘉兴嘉善县浙江省嘉兴市嘉善县大云镇大云镇大云小区公寓楼1号302室"), 216 | Address( 217 | 330000000000, "浙江省", 218 | 330400000000, "嘉兴市", 219 | 330421000000, "嘉善县", 220 | 330421102000, "大云镇", 221 | 330421102000, "大云镇", 222 | null, null, 223 | null, 224 | null, 225 | "1号302室", 226 | "大云小区公寓楼" 227 | ) 228 | ) 229 | // fix xx路xx号楼 230 | assertEquals( 231 | Geocoding.normalizing("辽宁沈阳铁西区中国辽宁沈阳沈阳市铁西区南十一西路12号楼472 (第九医院(沈阳)附近)"), 232 | Address( 233 | 210000000000, "辽宁省", 234 | 210100000000, "沈阳市", 235 | 210106000000, "铁西区", 236 | null, null, 237 | null, null, 238 | null, null, 239 | "南十一西路", 240 | "", 241 | "12号楼472", 242 | "附近第九医院沈阳" 243 | ) 244 | ) 245 | assertEquals( 246 | Geocoding.normalizing("重庆重庆渝北区重庆渝北区两路镇双龙西路236号5-4(交警12支队红绿灯路口渝达商务宾馆楼上5-4)"), 247 | Address( 248 | 500000000000, "重庆", 249 | 500100000000, "重庆市", 250 | 500112000000, "渝北区", 251 | 500112016000, "两路街道", 252 | null, null, 253 | null, null, 254 | "双龙西路", 255 | "236号", 256 | "5-4", 257 | "交警12支队红绿灯路口渝达商务宾馆楼上54" 258 | ) 259 | ) 260 | assertEquals( 261 | Geocoding.normalizing("山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303"), 262 | Address( 263 | 370000000000, "山东省", 264 | 370200000000, "青岛市", 265 | 370203000000, "市北区", 266 | 370203030000, "水清沟街道", 267 | null, null, 268 | null, null, 269 | "九江路", 270 | "20号", 271 | "3号楼2单元1303", 272 | "大都会" 273 | ) 274 | ) 275 | assertEquals( 276 | Geocoding.normalizing("中国山东青岛城阳区湘潭路【华胥美邦 到了联系20-1-1402】 (中铁华胥美邦附近)"), 277 | Address( 278 | 370000000000, "山东省", 279 | 370200000000, "青岛市", 280 | 370214000000, "城阳区", 281 | null, null, 282 | null, null, 283 | null, null, 284 | "湘潭路", 285 | "", 
286 | "20-1-1402", 287 | "华胥美邦到了联系中铁华胥美邦附近" 288 | ) 289 | ) 290 | assertEquals( 291 | Geocoding.normalizing("辽宁沈阳沈河区辽宁沈阳市沈河区一环内会武街56号4-3-2"), 292 | Address( 293 | 210000000000, "辽宁省", 294 | 210100000000, "沈阳市", 295 | 210103000000, "沈河区", 296 | null, null, 297 | null, null, 298 | null, null, 299 | "一环内会武街", 300 | "56号", 301 | "4-3-2", 302 | "" 303 | ) 304 | ) 305 | // fix 辣鸡数据 306 | assertEquals(Geocoding.normalizing("1008中国"), null) 307 | // fix 3层/楼 308 | assertEquals( 309 | Geocoding.normalizing("清徐县中国山西太原清徐县清徐县人民医院附近苹果社区2号楼1单元3层"), 310 | Address( 311 | 140000000000, "山西省", 312 | 140100000000, "太原市", 313 | 140121000000, "清徐县", 314 | null, null, 315 | null, null, 316 | null, null, 317 | null, 318 | null, 319 | "2号楼1单元3层", 320 | "人民医院附近苹果社区" 321 | ) 322 | ) 323 | // fix 3组 324 | assertEquals( 325 | Geocoding.normalizing("辽宁辽阳宏伟区辽宁省辽阳市宏伟区新村街道龙鼎山小区B区08栋3组401号"), 326 | Address( 327 | 210000000000, "辽宁省", 328 | 211000000000, "辽阳市", 329 | 211004000000, "宏伟区", 330 | 211004003000, "新村街道", 331 | null, null, 332 | null, null, 333 | null, 334 | null, 335 | "08栋3组401号", 336 | "龙鼎山小区B区" 337 | ) 338 | ) 339 | // fix 3门 340 | assertEquals( 341 | Geocoding.normalizing("北京北京市西城区 白纸坊街道右安门内西街甲10号院11楼3门501"), 342 | Address( 343 | 110000000000, "北京", 344 | 110100000000, "北京市", 345 | 110102000000, "西城区", 346 | 110102019000, "白纸坊街道", 347 | null, null, 348 | null, null, 349 | "右安门内西街", 350 | "甲10号院", 351 | "11楼3门501", 352 | "" 353 | ) 354 | ) 355 | // fix 延川是县区的情况, 不能将延川路识别成延川县 356 | assertEquals(Geocoding.normalizing("延川路116号绿城城园东区7号楼2单元802户"), null) 357 | // fix 绍兴路匹配上绍兴市的情况 358 | assertEquals(Geocoding.normalizing("绍兴路59号速递易"), null) 359 | // fix 同上, 不能识别成金水区 360 | assertEquals(Geocoding.normalizing("金水路751号1号楼3单元501"), null) 361 | assertEquals( 362 | Geocoding.normalizing("中国上海上海宝山区 顾村镇菊太路777弄24号602室"), 363 | Address( 364 | 310000000000, "上海", 365 | 310100000000, "上海市", 366 | 310113000000, "宝山区", 367 | 310113109000, "顾村镇", 368 | 310113109000, "顾村镇", 369 | null, null, 370 | "菊太路", 371 
| "777弄", 372 | "24号602室", 373 | "" 374 | ) 375 | ) 376 | // fix字符 — 377 | assertEquals( 378 | Geocoding.normalizing("辽宁大连甘井子区辽宁, 大连, 甘井子区, 泡崖街玉境路26号3—2—1"), 379 | Address( 380 | 210000000000, "辽宁省", 381 | 210200000000, "大连市", 382 | 210211000000, "甘井子区", 383 | null, null, 384 | null, null, 385 | null, null, 386 | "泡崖街玉境路", 387 | "26号", 388 | "3-2-1", 389 | "" 390 | ) 391 | ) 392 | // fix 开发区的影响 393 | assertEquals( 394 | Geocoding.normalizing("山东德州德城区宋官屯街道开发区段庄村"), 395 | Address( 396 | 370000000000, "山东省", 397 | 371400000000, "德州市", 398 | 371402000000, "德城区", 399 | 371402008000, "宋官屯街道", 400 | null, null, 401 | null, null, 402 | null, 403 | null, 404 | null, 405 | "段庄村" 406 | ) 407 | ) 408 | // fix 只有 1号楼 的情 409 | assertEquals( 410 | Geocoding.normalizing("北京市西城区新康街2号院1号楼北侧楼房"), 411 | Address( 412 | 110000000000, "北京", 413 | 110100000000, "北京市", 414 | 110102000000, "西城区", 415 | null, null, 416 | null, null, 417 | null, null, 418 | "新康街", 419 | "2号院", 420 | "1号楼", 421 | "北侧楼房" 422 | ) 423 | ) 424 | // Fix issues #10 425 | assertEquals( 426 | Geocoding.normalizing("福建福州鼓楼区六一路111号金三桥大厦"), 427 | Address( 428 | 350000000000, "福建省", 429 | 350100000000, "福州市", 430 | 350102000000, "鼓楼区", 431 | null, null, 432 | null, null, 433 | null, null, 434 | "六一路", 435 | "111号", 436 | null, 437 | "金三桥大厦" 438 | ) 439 | ) 440 | // Fix issues #8 441 | assertEquals( 442 | Geocoding.normalizing("广东省河源市源城区中山大道16号华怡小区"), 443 | Address( 444 | 440000000000, "广东省", 445 | 441600000000, "河源市", 446 | 441602000000, "源城区", 447 | null, null, 448 | null, null, 449 | null, null, 450 | "中山大道", 451 | "16号", 452 | null, 453 | "华怡小区" 454 | ) 455 | 456 | ) 457 | assertEquals( 458 | Geocoding.normalizing("广东省河源市中山大道16号华怡小区"), 459 | Address( 460 | 440000000000, "广东省", 461 | 441600000000, "河源市", 462 | null, null, 463 | null, null, 464 | null, null, 465 | null, null, 466 | "中山大道", 467 | "16号", 468 | null, 469 | "华怡小区" 470 | ) 471 | ) 472 | // Fix issues #9 473 | assertEquals( 474 | 
Geocoding.normalizing("浙江省杭州市西湖区中国建设银河西湖支行"), 475 | Address( 476 | 330000000000, "浙江省", 477 | 330100000000, "杭州市", 478 | 330106000000, "西湖区", 479 | null, null, 480 | null, null, 481 | null, null, 482 | null, 483 | null, 484 | null, 485 | "中国建设银河西湖支行" 486 | ) 487 | ) 488 | assertEquals( 489 | Geocoding.normalizing("江西赣州市赣县区王母渡镇"), 490 | Address( 491 | 360000000000, "江西省", 492 | 360700000000, "赣州市", 493 | 360721000000, "赣县区", 494 | 360721101000, "王母渡镇", 495 | 360721101000, "王母渡镇", 496 | null, null, 497 | null, 498 | null, 499 | null, 500 | "" 501 | ) 502 | ) 503 | // fix 只有父级地址 504 | assertEquals( 505 | Geocoding.normalizing("灵山镇海榆大道4号绿地城.润园11#楼2单元203"), 506 | Address( 507 | 130000000000, "河北省", 508 | 130600000000, "保定市", 509 | 130634000000, "曲阳县", 510 | 130634101000, "灵山镇", 511 | 130634101000, "灵山镇", 512 | null, null, 513 | "海榆大道", 514 | "4号", 515 | "11#楼2单元203", 516 | "绿地城润园" 517 | ) 518 | ) 519 | } 520 | 521 | 522 | /** 523 | * 将测试数据解析, 载入到数据库, 便于观察 524 | * 525 | * 表结构在 sql/create.sql 526 | * 527 | * 注意: 自行修改数据库连接地址 528 | */ 529 | // @Test 530 | fun testImport() { 531 | Class.forName("com.mysql.jdbc.Driver") 532 | val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/geocoding", "root", "anywhere") 533 | val statement = connection.prepareStatement( 534 | "INSERT INTO `addr_address` (`province`, `city`, `district`, `street`, `text`, `town`, `village`, `road`, `road_num`, `building_num`, `raw_text`) VALUES (?,?,?,?,?,?,?,?,?,?,?)" 535 | ) 536 | TestNormalizing::class.java.classLoader.getResourceAsStream("address.txt").reader().readLines().forEach { 537 | val address = Geocoding.normalizing(it) 538 | statement.setLong(1, address?.provinceId ?: 0) 539 | statement.setLong(2, address?.cityId ?: 0) 540 | statement.setLong(3, address?.districtId ?: 0) 541 | statement.setLong(4, address?.streetId ?: 0) 542 | statement.setString(5, address?.text ?: "") 543 | statement.setString(6, address?.town ?: "") 544 | statement.setString(7, address?.village ?: "") 
545 | statement.setString(8, address?.road ?: "") 546 | statement.setString(9, address?.roadNum ?: "") 547 | statement.setString(10, address?.buildingNum ?: "") 548 | statement.setString(11, it) 549 | 550 | statement.execute() 551 | } 552 | 553 | statement.close() 554 | connection.close() 555 | } 556 | 557 | } 558 | -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/TestNormalizingAddRegionEntry.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding 2 | 3 | import org.bitlap.geocoding.model.Address 4 | import org.bitlap.geocoding.model.RegionType 5 | import org.junit.Test 6 | import kotlin.test.assertEquals 7 | 8 | /** 9 | * Desc: 测试地址标准化 10 | * Mail: chk19940609@gmail.com 11 | * Created by IceMimosa 12 | * Date: 2017/1/18 13 | */ 14 | class TestNormalizingAddRegionEntry { 15 | 16 | @Test 17 | fun testNormalizing() { 18 | Geocoding.addRegionEntry(888888, 321200000000, "泥煤市", RegionType.District) 19 | assertEquals(Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号"), 20 | Address( 21 | 320000000000, "江苏省", 22 | 321200000000, "泰州市", 23 | 888888, "泥煤市", 24 | null, null, 25 | null, null, 26 | null, null, 27 | "泥煤大道", "888号", 28 | null, 29 | "" 30 | ) 31 | ) 32 | Geocoding.addRegionEntry(88888888, 100000000000, "尼玛省", RegionType.Province) 33 | Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City) 34 | Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District) 35 | assertEquals( 36 | Geocoding.normalizing("中国尼玛省尼玛市泥煤市泥煤大道888号xxx"), 37 | Address( 38 | 88888888, "尼玛省", 39 | 8888888, "尼玛市", 40 | 888888, "泥煤市", 41 | null, null, 42 | null, null, 43 | null, null, 44 | "泥煤大道", "888号", 45 | null, 46 | "xxx" 47 | ) 48 | ) 49 | } 50 | 51 | @Test 52 | fun testNormalizingReplace() { 53 | Geocoding.addRegionEntry(888888, 321200000000, "泥煤市", RegionType.District) 54 | assertEquals(Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号"), 55 | Address( 56 | 
320000000000, "江苏省", 57 | 321200000000, "泰州市", 58 | 888888, "泥煤市", 59 | null, null, 60 | null, null, 61 | null, null, 62 | "泥煤大道", "888号", 63 | null, 64 | "" 65 | ) 66 | ) 67 | Geocoding.addRegionEntry(888889, 321200000000, "泥煤市", RegionType.District) 68 | assertEquals(Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号"), 69 | Address( 70 | 320000000000, "江苏省", 71 | 321200000000, "泰州市", 72 | 888889, "泥煤市", 73 | null, null, 74 | null, null, 75 | null, null, 76 | "泥煤大道", "888号", 77 | null, 78 | "" 79 | ) 80 | ) 81 | } 82 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/TestNormalizingCustom.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding 2 | 3 | import org.bitlap.geocoding.model.Address 4 | import org.junit.Test 5 | import kotlin.test.assertEquals 6 | 7 | /** 8 | * Desc: 测试地址标准化 9 | * Mail: chk19940609@gmail.com 10 | * Created by IceMimosa 11 | * Date: 2017/1/18 12 | */ 13 | class TestNormalizingCustom { 14 | 15 | @Test 16 | fun testNormalizing() { 17 | val geocoding = GeocodingX("region_2021.dat") 18 | assertEquals( 19 | geocoding.normalizing("江苏泰州兴化市昌荣镇【康琴网吧】 (昌荣镇附近)"), 20 | Address( 21 | 320000000000, "江苏省", 22 | 321200000000, "泰州市", 23 | 321281000000, "兴化市", 24 | 321281119000, "昌荣镇", 25 | null, null, 26 | null, null, 27 | null, null, 28 | null, 29 | "康琴网吧昌荣镇附近" 30 | ) 31 | ) 32 | assertEquals( 33 | geocoding.normalizing("中国山东临沂兰山区小埠东社区居委会【绿杨榭公寓31-1-101 】 (绿杨榭公寓附近)"), 34 | Address( 35 | 370000000000, "山东省", 36 | 371300000000, "临沂市", 37 | 371302000000, "兰山区", 38 | null, null, 39 | null, null, 40 | null, null, 41 | null, null, 42 | "31-1-101", 43 | "小埠东社区居委会绿杨榭公寓绿杨榭公寓附近" 44 | ) 45 | ) 46 | assertEquals( 47 | geocoding.normalizing("抚顺顺城区将军桥【将军水泥厂住宅4-1-102】 (将军桥附近)"), 48 | Address( 49 | 210000000000, "辽宁省", 50 | 210400000000, "抚顺市", 51 | 210411000000, "顺城区", 52 | null, null, 53 | null, null, 54 | null, null, 55 | null, 56 | null, 57 | 
"4-1-102", 58 | "将军桥将军水泥厂住宅将军桥附近" 59 | ) 60 | ) 61 | assertEquals( 62 | geocoding.normalizing("辽宁沈阳于洪区沈阳市辽中县县城虹桥商厦西侧三单元外跨楼梯3-2-23-"), 63 | Address( 64 | 210000000000, "辽宁省", 65 | 210100000000, "沈阳市", 66 | 210114000000, "于洪区", 67 | null, null, 68 | null, null, 69 | null, null, 70 | null, 71 | null, 72 | "3-2-23", 73 | "辽中县县城虹桥商厦西侧三单元外跨楼梯" 74 | ) 75 | ) 76 | assertEquals( 77 | geocoding.normalizing("山东济宁任城区金宇路【杨柳国际新城K8栋3单元1302】(杨柳国际新城·丽宫附近)"), 78 | Address( 79 | 370000000000, "山东省", 80 | 370800000000, "济宁市", 81 | 370811000000, "任城区", 82 | null, null, 83 | null, null, 84 | null, null, 85 | "金宇路", 86 | "", 87 | "K8栋3单元1302", 88 | "杨柳国际新城杨柳国际新城丽宫附近" 89 | ) 90 | ) 91 | assertEquals( 92 | geocoding.normalizing("上海宝山区杨行镇宝山区江杨北路98号农贸批发市场蔬菜二区7通道A16号"), 93 | Address( 94 | 310000000000, "上海市", 95 | 310100000000, "直辖区", 96 | 310113000000, "宝山区", 97 | 310113103000, "杨行镇", 98 | null, null, 99 | null, null, 100 | "江杨北路", 101 | "98号", 102 | "7通道A16号", 103 | "农贸批发市场蔬菜二区" 104 | ) 105 | ) 106 | assertEquals( 107 | geocoding.normalizing("上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)"), 108 | Address( 109 | 310000000000, "上海市", 110 | 310100000000, "直辖区", 111 | 310113000000, "宝山区", 112 | null, null, 113 | null, null, 114 | null, null, 115 | "新沪路", 116 | "58弄", 117 | "11-802", 118 | "水韵华庭水韵华庭附近" 119 | ) 120 | ) 121 | // 精确度缺失 122 | assertEquals( 123 | geocoding.normalizing("赤城街道赤城大厦10E"), 124 | Address( 125 | 330000000000, "浙江省", 126 | 331000000000, "台州市", 127 | 331023000000, "天台县", 128 | 331023001000, "赤城街道", 129 | null, null, 130 | null, null, 131 | null, 132 | null, 133 | null, 134 | "大厦10E" 135 | ) 136 | ) 137 | assertEquals( 138 | geocoding.normalizing("上海黄浦区内环以内黄浦区小东门聚奎街43号"), 139 | Address( 140 | 310000000000, "上海市", 141 | 310100000000, "直辖区", 142 | 310101000000, "黄浦区", 143 | null, null, 144 | null, null, 145 | null, null, 146 | "小东门聚奎街", 147 | "43号", 148 | null, 149 | "" 150 | ) 151 | ) 152 | // fix 若干电话号码 153 | assertEquals( 154 | 
geocoding.normalizing("四川自贡贡井区四川省自贡市贡井区莲花镇四川自贡贡井区莲花镇黄桷村7组22号13298213121/15609000090/18681337139"), 155 | Address( 156 | 510000000000, "四川省", 157 | 510300000000, "自贡市", 158 | 510303000000, "贡井区", 159 | 510303107000, "莲花镇", 160 | null, null, 161 | null, null, 162 | null, 163 | null, 164 | "7组22号", 165 | "黄桷村" 166 | ) 167 | ) 168 | // fix 大云小区, 大云是镇名称的情 169 | assertEquals( 170 | geocoding.normalizing("浙江嘉兴嘉善县浙江省嘉兴市嘉善县大云镇大云镇大云小区公寓楼1号302室"), 171 | Address( 172 | 330000000000, "浙江省", 173 | 330400000000, "嘉兴市", 174 | 330421000000, "嘉善县", 175 | 330421102000, "大云镇", 176 | null, null, 177 | null, null, 178 | null, 179 | null, 180 | "1号302室", 181 | "大云小区公寓楼" 182 | ) 183 | ) 184 | // fix xx路xx号楼 185 | assertEquals( 186 | geocoding.normalizing("辽宁沈阳铁西区中国辽宁沈阳沈阳市铁西区南十一西路12号楼472 (第九医院(沈阳)附近)"), 187 | Address( 188 | 210000000000, "辽宁省", 189 | 210100000000, "沈阳市", 190 | 210106000000, "铁西区", 191 | null, null, 192 | null, null, 193 | null, null, 194 | "南十一西路", 195 | "", 196 | "12号楼472", 197 | "附近第九医院沈阳" 198 | ) 199 | ) 200 | assertEquals( 201 | geocoding.normalizing("重庆重庆渝北区重庆渝北区两路镇双龙西路236号5-4(交警12支队红绿灯路口渝达商务宾馆楼上5-4)"), 202 | Address( 203 | 500000000000, "重庆市", 204 | 500100000000, "直辖区", 205 | 500112000000, "渝北区", 206 | 500112016000, "两路街道", 207 | null, null, 208 | null, null, 209 | "双龙西路", 210 | "236号", 211 | "5-4", 212 | "交警12支队红绿灯路口渝达商务宾馆楼上54" 213 | ) 214 | ) 215 | assertEquals( 216 | geocoding.normalizing("山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303"), 217 | Address( 218 | 370000000000, "山东省", 219 | 370200000000, "青岛市", 220 | 370203000000, "市北区", 221 | 370203030000, "水清沟街道", 222 | null, null, 223 | null, null, 224 | "九江路", 225 | "20号", 226 | "3号楼2单元1303", 227 | "大都会" 228 | ) 229 | ) 230 | assertEquals( 231 | geocoding.normalizing("中国山东青岛城阳区湘潭路【华胥美邦 到了联系20-1-1402】 (中铁华胥美邦附近)"), 232 | Address( 233 | 370000000000, "山东省", 234 | 370200000000, "青岛市", 235 | 370214000000, "城阳区", 236 | null, null, 237 | null, null, 238 | null, null, 239 | "湘潭路", 240 | "", 241 | "20-1-1402", 242 | 
"华胥美邦到了联系中铁华胥美邦附近" 243 | ) 244 | ) 245 | assertEquals( 246 | geocoding.normalizing("辽宁沈阳沈河区辽宁沈阳市沈河区一环内会武街56号4-3-2"), 247 | Address( 248 | 210000000000, "辽宁省", 249 | 210100000000, "沈阳市", 250 | 210103000000, "沈河区", 251 | null, null, 252 | null, null, 253 | null, null, 254 | "一环内会武街", 255 | "56号", 256 | "4-3-2", 257 | "" 258 | ) 259 | ) 260 | // fix 辣鸡数据 261 | assertEquals(geocoding.normalizing("1008中国"), null) 262 | // fix 3层/楼 263 | assertEquals( 264 | geocoding.normalizing("清徐县中国山西太原清徐县清徐县人民医院附近苹果社区2号楼1单元3层"), 265 | Address( 266 | 140000000000, "山西省", 267 | 140100000000, "太原市", 268 | 140121000000, "清徐县", 269 | null, null, 270 | null, null, 271 | null, null, 272 | null, 273 | null, 274 | "2号楼1单元3层", 275 | "人民医院附近苹果社区" 276 | ) 277 | ) 278 | // fix 3门 279 | assertEquals( 280 | geocoding.normalizing("北京北京市西城区 白纸坊街道右安门内西街甲10号院11楼3门501"), 281 | Address( 282 | 110000000000, "北京市", 283 | 110100000000, "直辖区", 284 | 110102000000, "西城区", 285 | 110102019000, "白纸坊街道", 286 | null, null, 287 | null, null, 288 | "右安门内西街", 289 | "甲10号院", 290 | "11楼3门501", 291 | "" 292 | ) 293 | ) 294 | // fix 延川是县区的情况, 不能将延川路识别成延川县 295 | assertEquals(geocoding.normalizing("延川路116号绿城城园东区7号楼2单元802户"), null) 296 | // fix 同上, 不能识别成金水区 297 | assertEquals(geocoding.normalizing("金水路751号1号楼3单元501"), null) 298 | assertEquals( 299 | geocoding.normalizing("中国上海上海宝山区 顾村镇菊太路777弄24号602室"), 300 | Address( 301 | 310000000000, "上海市", 302 | 310100000000, "直辖区", 303 | 310113000000, "宝山区", 304 | 310113109000, "顾村镇", 305 | null, null, 306 | null, null, 307 | "菊太路", 308 | "777弄", 309 | "24号602室", 310 | "" 311 | ) 312 | ) 313 | // fix字符 — 314 | assertEquals( 315 | geocoding.normalizing("辽宁大连甘井子区辽宁, 大连, 甘井子区, 泡崖街道玉境路26号3—2—1"), 316 | Address( 317 | 210000000000, "辽宁省", 318 | 210200000000, "大连市", 319 | 210211000000, "甘井子区", 320 | 210211007000, "泡崖街道", 321 | null, null, 322 | null, null, 323 | "玉境路", 324 | "26号", 325 | "3-2-1", 326 | "" 327 | ) 328 | ) 329 | // fix 只有 1号楼 的情 330 | assertEquals( 331 | 
geocoding.normalizing("北京市西城区新康街2号院1号楼北侧楼房"), 332 | Address( 333 | 110000000000, "北京市", 334 | 110100000000, "直辖区", 335 | 110102000000, "西城区", 336 | null, null, 337 | null, null, 338 | null, null, 339 | "新康街", 340 | "2号院", 341 | "1号楼", 342 | "北侧楼房" 343 | ) 344 | ) 345 | // Fix issues #10 346 | assertEquals( 347 | geocoding.normalizing("福建福州鼓楼区六一路111号金三桥大厦"), 348 | Address( 349 | 350000000000, "福建省", 350 | 350100000000, "福州市", 351 | 350102000000, "鼓楼区", 352 | null, null, 353 | null, null, 354 | null, null, 355 | "六一路", 356 | "111号", 357 | null, 358 | "金三桥大厦" 359 | ) 360 | ) 361 | // Fix issues #8 362 | assertEquals( 363 | geocoding.normalizing("广东省河源市源城区中山大道16号华怡小区"), 364 | Address( 365 | 440000000000, "广东省", 366 | 441600000000, "河源市", 367 | 441602000000, "源城区", 368 | null, null, 369 | null, null, 370 | null, null, 371 | "中山大道", 372 | "16号", 373 | null, 374 | "华怡小区" 375 | ) 376 | 377 | ) 378 | assertEquals( 379 | geocoding.normalizing("广东省河源市中山大道16号华怡小区"), 380 | Address( 381 | 440000000000, "广东省", 382 | 441600000000, "河源市", 383 | null, null, 384 | null, null, 385 | null, null, 386 | null, null, 387 | "中山大道", 388 | "16号", 389 | null, 390 | "华怡小区" 391 | ) 392 | ) 393 | // Fix issues #9 394 | assertEquals( 395 | geocoding.normalizing("浙江省杭州市西湖区中国建设银河西湖支行"), 396 | Address( 397 | 330000000000, "浙江省", 398 | 330100000000, "杭州市", 399 | 330106000000, "西湖区", 400 | null, null, 401 | null, null, 402 | null, null, 403 | null, 404 | null, 405 | null, 406 | "中国建设银河西湖支行" 407 | ) 408 | ) 409 | assertEquals( 410 | geocoding.normalizing("江西赣州市赣县区王母渡镇"), 411 | Address( 412 | 360000000000, "江西省", 413 | 360700000000, "赣州市", 414 | 360704000000, "赣县区", 415 | 360704101000, "王母渡镇", 416 | null, null, 417 | null, null, 418 | null, 419 | null, 420 | null, 421 | "" 422 | ) 423 | ) 424 | } 425 | 426 | @Test 427 | fun testNormalizingWithStrict() { 428 | // 严格模式 429 | val geocoding = GeocodingX(true) 430 | assertEquals( 431 | geocoding.normalizing("灵山镇海榆大道4号绿地城.润园11#楼2单元203"), 432 | null 433 | ) 434 
| 435 | // 非严格模式 436 | val geocoding2 = GeocodingX(false) 437 | assertEquals( 438 | geocoding2.normalizing("灵山镇海榆大道4号绿地城.润园11#楼2单元203"), 439 | Address( 440 | 130000000000, "河北省", 441 | 130600000000, "保定市", 442 | 130634000000, "曲阳县", 443 | 130634101000, "灵山镇", 444 | 130634101000, "灵山镇", 445 | null, null, 446 | "海榆大道", 447 | "4号", 448 | "11#楼2单元203", 449 | "绿地城润园" 450 | ) 451 | ) 452 | } 453 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/TestSegments.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding 2 | 3 | import org.bitlap.geocoding.core.segment.IKAnalyzerSegmenter 4 | import org.bitlap.geocoding.core.segment.SimpleSegmenter 5 | import org.bitlap.geocoding.core.segment.SmartCNSegmenter 6 | import org.bitlap.geocoding.core.segment.WordSegmenter 7 | import org.junit.Test 8 | 9 | /** 10 | * Desc: 测试 segments 11 | * Mail: chk19940609@gmail.com 12 | * Created by IceMimosa 13 | * Date: 2017/2/6 14 | */ 15 | class TestSegments { 16 | 17 | private val simple = SimpleSegmenter() 18 | private val smart = SmartCNSegmenter() 19 | private val word = WordSegmenter() 20 | private val ik = IKAnalyzerSegmenter() 21 | 22 | @Test 23 | fun test_segments() { 24 | var text = "7号楼1单元102室" 25 | // text = "九鼎2期B7号楼东数新都商贸购物中心附近" 26 | 27 | println(">>> simple 分词: ") 28 | println(simple.segment(text)) 29 | 30 | // println(">>> smart 分词: ") 31 | // println(smart.segment(text)) 32 | 33 | // println(">>> word 分词: ") 34 | // println(word.segment(text)) 35 | 36 | println(">>> ik 分词: ") 37 | println(ik.segment(text)) 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/TestSimilarity.kt: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding 2 | 3 | import org.junit.Test 4 | import java.util.concurrent.Callable 5 | import 
java.util.concurrent.Executors 6 | 7 | /** 8 | * Desc: 测试相似度 9 | * Mail: chk19940609@gmail.com 10 | * Created by IceMimosa 11 | * Date: 2017/2/7 12 | */ 13 | open class TestSimilarity { 14 | 15 | @Test 16 | fun test_similarity() { 17 | // 一般匹配 18 | var text1 = "山东省沂水县四十里堡镇东艾家庄村205号" 19 | var text2 = "山东省沂水县四十里堡镇东艾家庄村206号" 20 | 21 | // 带有building匹配 22 | text1 = "湖南衡阳常宁市湖南省衡阳市常宁市泉峰街道泉峰街道消防大队南园小区A栋1单元601" 23 | text2 = "湖南衡阳常宁市湖南省衡阳市常宁市泉峰街道泉峰街道消防大队南园小区A栋2单元601" 24 | 25 | // 特殊 26 | text1 = "山东青岛李沧区延川路116号绿城城园东区7号楼2单元802户" 27 | text2 = "山东青岛李沧区延川路绿城城园东区7-2-802" 28 | 29 | // 标准化 30 | val addr1 = Geocoding.normalizing(text1) 31 | val addr2 = Geocoding.normalizing(text2) 32 | println("addr1 >>>> $addr1") 33 | println(">>>>>>>>>>>>>>>>>") 34 | println("addr2 >>>> $addr2") 35 | 36 | println("相似度结果分析 >>>>>>>>> " + Geocoding.similarityWithResult(addr1, addr2)) 37 | } 38 | 39 | @Test 40 | fun test_fix_null_test() { 41 | // 一般匹配 42 | val text1 = "中国湖南郴州宜章县梅田镇【梅田镇】(梅田镇附近)" 43 | val text2 = "湖南省郴州市宜章县梅田镇上寮村2组" 44 | 45 | // 标准化 46 | val addr2 = Geocoding.normalizing(text1) 47 | val addr1 = Geocoding.normalizing(text2) 48 | println("addr1 >>>> $addr1") 49 | println(">>>>>>>>>>>>>>>>>") 50 | println("addr2 >>>> $addr2") 51 | 52 | println("相似度结果分析 >>>>>>>>> " + Geocoding.similarityWithResult(addr1, addr2)) 53 | } 54 | 55 | @Test 56 | fun test_similarity_threads() { 57 | val pool = Executors.newFixedThreadPool(10) 58 | 59 | val addr1 = "中国湖南郴州宜章县梅田镇【梅田镇】(梅田镇附近)" 60 | val addr2 = "湖南省郴州市宜章县梅田镇上寮村2组" 61 | 62 | (1 .. 
1000).map { 63 | pool.submit(Callable { 64 | Geocoding.similarity(addr1, addr2) 65 | }) 66 | }.forEach { 67 | val r = it.get() 68 | assert(0.8164965809277261 == r) 69 | } 70 | pool.shutdown() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/Main.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region; 2 | 3 | import java.io.IOException; 4 | 5 | import org.bitlap.geocoding.region.model.RegionEntity; 6 | import org.bitlap.geocoding.region.util.OutUtil; 7 | 8 | public class Main { 9 | 10 | // 导入数据库成功后,执行china.sql,插入数据项:【中国】 11 | public static void main(String[] args) throws IOException { 12 | long start = System.currentTimeMillis(); 13 | String pathname = "/tmp/cnarea" + 20210707 + ".dat"; 14 | RegionDatFileHelper.writeDatFile(pathname); 15 | long end = System.currentTimeMillis(); 16 | OutUtil.info(String.format("cost %s ms", end - start)); 17 | RegionEntity regionEntity = RegionDatFileHelper.readDatFile(pathname); 18 | OutUtil.info(regionEntity.toString()); 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Java 8 CI](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml/badge.svg)](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml) 3 | 4 | # 介绍 5 | 项目目前采用的是 [淘宝物流4级地址](https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do)的标准地址库,即`classpath:src/main/resources/core/region.dat`中的数据, 6 | 本package下代码可将 [中国5级行政区域](https://github.com/kakuilan/china_area_mysql) 处理为兼容geocoding的标准地址库。 7 | 8 | ### 使用步骤 9 | 10 | 1. 成功导入china_area_mysql到数据库 11 | 2. 执行本package下sql/china.sql插入`中国`数据 12 | 3. 修改本package下util/JdbcUtil.java中的jdbc相关参数 13 | 4. 执行本package下Main类中main方法 14 | 5. 
将生成的dat文件改名为region.dat并放入`classpath:src/main/resources/core/` 15 | 16 | ### 注意事项 17 | 本测试配置基于Server version: 8.0.21 MySQL Community Server - GPL环境,其它可能略有差异,可通过下面两个SQL确认配置是否OK 18 | 19 | ``` 20 | show variables like '%CHARACTER%'; 21 | show variables like '%max_allowed_packet%'; 22 | ``` 23 | 24 | 1. 设置max_allowed_packet,[mysqld]下max_allowed_packet = 2000M,[mysqldump]下max_allowed_packet = 2000M 25 | 2. 设置字符集,[client]下default-character-set=utf8mb4,[mysqld]下character-set-server=utf8mb4和init_connect='SET NAMES utf8mb4',[mysql]下default-character-set=utf8mb4 26 | -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/RegionDatFileHelper.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.sql.Connection; 8 | import java.util.Base64; 9 | import java.util.List; 10 | import java.util.zip.GZIPInputStream; 11 | import java.util.zip.GZIPOutputStream; 12 | 13 | import org.apache.commons.io.IOUtils; 14 | 15 | import com.google.common.collect.Lists; 16 | import com.google.common.io.Files; 17 | import com.google.gson.Gson; 18 | 19 | import org.bitlap.geocoding.model.RegionType; 20 | import org.bitlap.geocoding.region.model.RegionEntity; 21 | import org.bitlap.geocoding.region.util.JdbcUtil; 22 | import kotlin.text.Charsets; 23 | 24 | public class RegionDatFileHelper { 25 | 26 | final static List provinceLevelCity1 = Lists.newArrayList("北京市", "天津市", "上海市", "重庆市"); 27 | 28 | public static void writeDatFile(String pathname) throws IOException { 29 | write(pathname, ""); 30 | Connection conn = JdbcUtil.getConnection(); 31 | if (conn == null) return; 32 | List china = Lists.newArrayList(); 33 | List provinces = RegionSqlHelper.findProvinces(conn); 34 | for (int i = 0; i < 
provinces.size(); i++) { 35 | RegionEntity province = provinces.get(i); 36 | List list = RegionSqlHelper.findByProvince(conn, province.getShortName() + "%"); 37 | if (i == 0) { 38 | List tree = parseProvince(list); 39 | china.add(tree.get(0)); 40 | } else { 41 | List tree = parseProvince(list); 42 | china.get(0).getChildren().add(tree.get(0)); 43 | } 44 | } 45 | JdbcUtil.free(conn); 46 | Gson gson = new Gson(); 47 | 48 | byte[] context = encode(gson.toJson(china.get(0))); 49 | write(pathname, new String(context, Charsets.UTF_8)); 50 | } 51 | 52 | private static List parseProvince(List list) { 53 | List province = Lists.newArrayList(); 54 | 55 | for (RegionEntity entity : list) { 56 | if (entity.getParentId().equals(0L)) { 57 | if (entity.getChildren() == null) entity.setChildren(Lists.newArrayList()); 58 | entity.setType(of(entity.getId(), entity.getLevel(), entity.getName())); 59 | province.add(entity); 60 | } 61 | } 62 | 63 | for (RegionEntity item : province) { 64 | item = recursive(item, list, province.size()); 65 | } 66 | 67 | return province; 68 | } 69 | 70 | private static RegionEntity recursive(RegionEntity parent, List list, int j) { 71 | for (int i = j; i < list.size(); i++) { 72 | RegionEntity entity = list.get(i); 73 | if (parent.getId().equals(entity.getParentId())) { 74 | entity = recursive(entity, list, i + 1); 75 | entity.setType(of(entity.getId(), entity.getLevel(), entity.getName())); 76 | if (parent.getChildren() == null) parent.setChildren(Lists.newArrayList()); 77 | parent.getChildren().add(entity); 78 | } 79 | } 80 | return parent; 81 | } 82 | 83 | private static void write(final String fileName, final String contents) throws IOException { 84 | File file = new File(fileName); 85 | // file.deleteOnExit(); 86 | if (!file.exists()) { 87 | Files.createParentDirs(file); 88 | file.createNewFile(); 89 | } 90 | if (contents != null && !contents.trim().isEmpty()) { 91 | Files.write(contents.getBytes(), file); 92 | } 93 | } 94 | 95 | private static 
RegionType of(Long id, int level, String name) { 96 | if (id.equals(100000000000L)) return RegionType.Country; 97 | if (level == 0) { 98 | if (provinceLevelCity1.contains(name)) return RegionType.ProvinceLevelCity1; 99 | return RegionType.Province; 100 | } 101 | if (level == 1) { 102 | if ("直辖区".equalsIgnoreCase(name)) return RegionType.ProvinceLevelCity2; 103 | if ("直辖县".equalsIgnoreCase(name)) return RegionType.CityLevelDistrict; 104 | return RegionType.City; 105 | } 106 | if (level == 2) return RegionType.District; 107 | if (level == 3) { 108 | if (name.matches("乡$")) return RegionType.Town; 109 | if (name.matches("镇$")) return RegionType.Town; 110 | return RegionType.PlatformL4; 111 | } 112 | if (level == 4) return RegionType.Village; 113 | return RegionType.Undefined; 114 | } 115 | 116 | public static RegionEntity readDatFile(String file) throws IOException { 117 | byte[] byteArray = Files.toByteArray(new File(file)); 118 | String json = new String(byteArray); 119 | return new Gson().fromJson(decode(json), RegionEntity.class); 120 | } 121 | 122 | private static String decode(String str) throws IOException { 123 | byte decodedByteArray[] = Base64.getMimeDecoder().decode(str); 124 | GZIPInputStream gzipis = new GZIPInputStream(new ByteArrayInputStream(decodedByteArray)); 125 | return new String(IOUtils.toByteArray(gzipis), Charsets.UTF_8); 126 | } 127 | 128 | private static byte[] encode(String str) throws IOException { 129 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 130 | GZIPOutputStream gzipos = new GZIPOutputStream(out); 131 | gzipos.write(str.getBytes(Charsets.UTF_8)); 132 | gzipos.close(); 133 | return Base64.getMimeEncoder().encode(out.toByteArray()); 134 | } 135 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/RegionSqlHelper.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region; 2 | 3 | 
import java.sql.Connection; 4 | import java.sql.PreparedStatement; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.util.List; 8 | 9 | import com.google.common.collect.Lists; 10 | 11 | import org.bitlap.geocoding.region.model.RegionEntity; 12 | import org.bitlap.geocoding.region.util.JdbcUtil; 13 | import org.bitlap.geocoding.region.util.OutUtil; 14 | 15 | public class RegionSqlHelper { 16 | 17 | private static final String sqlFindAllProvinces = "select `level`, area_code as id, parent_code as parentId, " 18 | + "`name` as `name`, short_name as shortName, merger_name as `alias`, zip_code as zip " 19 | + "from cnarea_2020 where parent_code = 0 order by area_code"; 20 | 21 | private static final String sqlFindByProvince = "select `level`, area_code as id, parent_code as parentId, " 22 | + "`name` as `name`, short_name as shortName, merger_name as `alias`, zip_code as zip " 23 | + "from cnarea_2020 where merger_name like ? order by `level`, parent_code, area_code"; 24 | 25 | public static List findProvinces(Connection conn) { 26 | PreparedStatement pstmt = null; 27 | ResultSet rs = null; 28 | try { 29 | pstmt = conn.prepareStatement(sqlFindAllProvinces); 30 | rs = pstmt.executeQuery(); 31 | OutUtil.info(sqlFindAllProvinces); 32 | return convert(rs); 33 | } catch (SQLException sqle) { 34 | OutUtil.err("Exception: RegionEntityHelper.findProvinces " + sqle.getMessage()); 35 | }finally { 36 | JdbcUtil.free(rs, pstmt); 37 | } 38 | return Lists.newArrayList(); 39 | } 40 | 41 | 42 | public static List findByProvince(Connection conn, String name) { 43 | PreparedStatement pstmt = null; 44 | ResultSet rs = null; 45 | try { 46 | pstmt = conn.prepareStatement(sqlFindByProvince); 47 | pstmt.setString(1, name); 48 | rs = pstmt.executeQuery(); 49 | OutUtil.info(sqlFindByProvince.replace("?", "'" + name + "'")); 50 | return convert(rs); 51 | } catch (SQLException sqle) { 52 | OutUtil.err("Exception: RegionEntityHelper.findByProvince " + 
sqle.getMessage()); 53 | } finally { 54 | JdbcUtil.free(rs, pstmt); 55 | } 56 | return Lists.newArrayList(); 57 | } 58 | 59 | private static List convert(ResultSet rs) throws SQLException { 60 | List list = Lists.newArrayList(); 61 | while (rs != null && rs.next()) { 62 | RegionEntity regionEntity = new RegionEntity(); 63 | regionEntity.setAlias(rs.getString("alias")); 64 | regionEntity.setId(rs.getLong("id")); 65 | regionEntity.setLevel(rs.getInt("level")); 66 | regionEntity.setName(rs.getString("name")); 67 | regionEntity.setParentId(rs.getLong("parentId")); 68 | regionEntity.setShortName(rs.getString("shortName")); 69 | regionEntity.setZip(rs.getString("zip")); 70 | list.add(regionEntity); 71 | } 72 | return list; 73 | } 74 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/model/RegionEntity.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region.model; 2 | 3 | import com.google.gson.annotations.Expose; 4 | import org.bitlap.geocoding.model.RegionType; 5 | 6 | import java.io.Serializable; 7 | import java.util.List; 8 | 9 | public class RegionEntity implements Serializable{ 10 | 11 | private static final long serialVersionUID = 1L; 12 | 13 | private Long id = 0L; 14 | private Long parentId = 0L; 15 | @Expose(serialize = false, deserialize = false) 16 | private Integer level = 0; 17 | private String name = ""; 18 | @Expose(serialize = false, deserialize = false) 19 | private String shortName = ""; 20 | private String alias = ""; 21 | private RegionType type = RegionType.Undefined; 22 | private String zip = ""; 23 | private List children = null; 24 | private List orderedNames = null; 25 | 26 | public Long getId() { 27 | return id; 28 | } 29 | 30 | public void setId(Long id) { 31 | this.id = id; 32 | } 33 | 34 | public Long getParentId() { 35 | return parentId; 36 | } 37 | 38 | public void setParentId(Long parentId) { 
39 | this.parentId = parentId; 40 | } 41 | 42 | public Integer getLevel() { 43 | return level; 44 | } 45 | 46 | public void setLevel(Integer level) { 47 | this.level = level; 48 | } 49 | 50 | public String getName() { 51 | return name; 52 | } 53 | 54 | public void setName(String name) { 55 | this.name = name; 56 | } 57 | 58 | public String getShortName() { 59 | return shortName; 60 | } 61 | 62 | public void setShortName(String shortName) { 63 | this.shortName = shortName; 64 | } 65 | 66 | public String getAlias() { 67 | return alias; 68 | } 69 | 70 | public void setAlias(String alias) { 71 | this.alias = alias; 72 | } 73 | 74 | public RegionType getType() { 75 | return type; 76 | } 77 | 78 | public void setType(RegionType type) { 79 | this.type = type; 80 | } 81 | 82 | public String getZip() { 83 | return zip; 84 | } 85 | 86 | public void setZip(String zip) { 87 | this.zip = zip; 88 | } 89 | 90 | public List getChildren() { 91 | return children; 92 | } 93 | 94 | public void setChildren(List children) { 95 | this.children = children; 96 | } 97 | 98 | public List getOrderedNames() { 99 | return orderedNames; 100 | } 101 | 102 | public void setOrderedNames(List orderedNames) { 103 | this.orderedNames = orderedNames; 104 | } 105 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/sql/china.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO `cnarea_2020` 2 | (`level`, `parent_code`, `area_code`, `zip_code`, `city_code`, `name`, `short_name`, `merger_name`, `pinyin`, `lng`, `lat`) 3 | VALUES 4 | (0, 0, 100000000000, 000000, '', '中国', '中国', '中国', 'ZHONGGUO', 0.000000, 0.000000); -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/util/JdbcUtil.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region.util; 2 
| 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.sql.Statement; 8 | 9 | public class JdbcUtil { 10 | 11 | private static final String driver_class = "com.mysql.cj.jdbc.Driver"; // com.mysql.jdbc.Driver 12 | 13 | private static final String db_url = "jdbc:mysql://localhost:3306/cnarea"; 14 | 15 | private static final String db_userid = "root"; 16 | 17 | private static final String db_password = "12345678"; 18 | 19 | public static Connection getConnection() { 20 | Connection conn = null; 21 | try { 22 | Class.forName(driver_class); 23 | } catch (ClassNotFoundException cnfe) { 24 | OutUtil.err("Exception: JdbcUtil.getConnection driver_class not found"); 25 | return null; 26 | } 27 | try { 28 | conn = DriverManager.getConnection(db_url, db_userid, db_password); 29 | } catch (SQLException sqle) { 30 | OutUtil.err("Exception: JdbcUtil.getConnection get connection failed"); 31 | return null; 32 | } 33 | return conn; 34 | } 35 | 36 | public static void free(ResultSet rs, Statement stmt) { 37 | free(stmt); 38 | free(rs); 39 | } 40 | 41 | public static void free(ResultSet rs) { 42 | if (rs == null) return; 43 | try { 44 | rs.close(); 45 | } catch (SQLException sqle) {} 46 | } 47 | 48 | public static void free(Statement stmt) { 49 | if (stmt == null) return; 50 | try { 51 | stmt.close(); 52 | } catch (SQLException sqle) {} 53 | } 54 | 55 | public static void free(Connection conn) { 56 | if (conn == null) return; 57 | try { 58 | conn.close(); 59 | } catch (SQLException sqle) {} 60 | } 61 | } -------------------------------------------------------------------------------- /src/test/java/org/bitlap/geocoding/region/util/OutUtil.java: -------------------------------------------------------------------------------- 1 | package org.bitlap.geocoding.region.util; 2 | 3 | public class OutUtil { 4 | 5 | public static void err(String str) { 6 | System.err.println(str); 7 | } 8 | 9 | 
public static void info(String str) { 10 | System.out.println(str); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/test/resources/sql/create.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS `addr_address`; 2 | CREATE TABLE `addr_address` ( 3 | `id` BIGINT(11) NOT NULL AUTO_INCREMENT COMMENT 'Address Record ID', 4 | `province` BIGINT(11) NOT NULL DEFAULT '0' COMMENT 'Province ID', 5 | `city` BIGINT(11) NOT NULL DEFAULT '0' COMMENT 'City ID', 6 | `district` BIGINT(11) NOT NULL DEFAULT '0' COMMENT 'District ID', 7 | `street` BIGINT(11) NOT NULL DEFAULT '0' COMMENT 'Street ID', 8 | `text` varchar(100) NOT NULL DEFAULT '' COMMENT 'Address Text', 9 | `town` varchar(20) NOT NULL DEFAULT '' COMMENT '镇', 10 | `village` varchar(5) NOT NULL DEFAULT '' COMMENT '村', 11 | `road` varchar(8) NOT NULL DEFAULT '' COMMENT '道路', 12 | `road_num` varchar(10) NOT NULL DEFAULT '' COMMENT '道路号码', 13 | `building_num` varchar(20) NOT NULL DEFAULT '' COMMENT '几号楼+几单元+房间号', 14 | `hash` int(11) NOT NULL DEFAULT '0' COMMENT 'Address Text Hash Code', 15 | `raw_text` varchar(150) NOT NULL DEFAULT '' COMMENT 'Original Address Text', 16 | `prop1` varchar(20) NOT NULL DEFAULT '' COMMENT '扩展字段:订单号', 17 | `prop2` varchar(20) NOT NULL DEFAULT '' COMMENT '扩展字段:片区ID', 18 | `create_time` date NOT NULL DEFAULT '1900-01-01', 19 | PRIMARY KEY (`id`), 20 | KEY `ix_hash` (`hash`), 21 | KEY `ix_pid_cid_did` (`province`,`city`,`district`) 22 | ) ENGINE=InnoDB AUTO_INCREMENT=10001 DEFAULT CHARSET=utf8; --------------------------------------------------------------------------------