├── .github
└── workflows
│ └── java8.yml
├── .gitignore
├── CHANGES.md
├── LICENSE
├── README.md
├── README_old.md
├── pom.xml
└── src
├── main
├── java
│ └── org
│ │ └── bitlap
│ │ └── geocoding
│ │ ├── Geocoding.kt
│ │ ├── GeocodingX.kt
│ │ ├── core
│ │ ├── AddressInterpreter.kt
│ │ ├── AddressPersister.kt
│ │ ├── Computer.kt
│ │ ├── Context.kt
│ │ ├── RegionCache.kt
│ │ ├── Segmenter.kt
│ │ ├── TermIndexVisitor.kt
│ │ ├── impl
│ │ │ ├── DefaultAddressInterpreter.kt
│ │ │ ├── DefaultAddressPersister.kt
│ │ │ ├── DefaultRegoinCache.kt
│ │ │ ├── RegionInterpreterVisitor.kt
│ │ │ └── SimilarityComputer.kt
│ │ └── segment
│ │ │ ├── AsciiSegmenter.kt
│ │ │ ├── IKAnalyzerSegmenter.kt
│ │ │ ├── SimpleSegmenter.kt
│ │ │ ├── SmartCNSegmenter.kt
│ │ │ └── WordSegmenter.kt
│ │ ├── index
│ │ ├── TermIndexBuilder.kt
│ │ ├── TermIndexEntry.kt
│ │ ├── TermIndexItem.kt
│ │ └── TermType.kt
│ │ ├── model
│ │ ├── Address.kt
│ │ ├── AddressEntity.kt
│ │ ├── Division.kt
│ │ ├── RegionEntity.kt
│ │ └── RegionType.kt
│ │ ├── similarity
│ │ ├── Document.kt
│ │ ├── MatchedResult.kt
│ │ ├── MatchedTerm.kt
│ │ └── Term.kt
│ │ └── utils
│ │ └── StringHelper.kt
└── resources
│ ├── IKAnalyzer.cfg.xml
│ ├── core
│ └── region.dat
│ ├── dic
│ ├── community.dic
│ ├── region.dic
│ └── stop.dic
│ ├── logback.xml
│ └── word.local.conf
└── test
├── java
└── org
│ └── bitlap
│ └── geocoding
│ ├── TestCustomDatSave.kt
│ ├── TestNormalizing.kt
│ ├── TestNormalizingAddRegionEntry.kt
│ ├── TestNormalizingCustom.kt
│ ├── TestSegments.kt
│ ├── TestSimilarity.kt
│ └── region
│ ├── Main.java
│ ├── README.md
│ ├── RegionDatFileHelper.java
│ ├── RegionSqlHelper.java
│ ├── model
│ └── RegionEntity.java
│ ├── sql
│ └── china.sql
│ └── util
│ ├── JdbcUtil.java
│ └── OutUtil.java
└── resources
├── address.txt
├── region_2021.dat
└── sql
└── create.sql
/.github/workflows/java8.yml:
--------------------------------------------------------------------------------
1 | name: Java 8 CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 |
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - name: Checkout
17 | uses: actions/checkout@v2
18 | with:
19 | fetch-depth: 1
20 | - name: Set up JDK 1.8
21 | uses: actions/setup-java@v1
22 | with:
23 | java-version: 1.8
24 | - name: Build with Maven
25 | run: mvn --batch-mode --update-snapshots verify
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.jar
2 | !/gradle/wrapper/gradle-wrapper.jar
3 | *.war
4 | *~
5 | *.class
6 | *.lock
7 | *.DS_Store
8 | *.swp
9 | *.out
10 | target/
11 | build/
12 | *.iml
13 | *.ipr
14 | *.iws
15 | .gradle/
16 | .settings/
17 | .classpath
18 | .project
19 | .metadata/
20 | .idea/
21 | logs/
22 | dev.properties
23 | dependency-reduced-pom.xml
24 | *.rdb
25 | *.orig
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | * 1.1.3
4 | * 新增自定义地址设置
5 | * 1.1.4
6 | * 修复一些匹配错误的bug
7 | * 1.1.6
8 | * 升级地址库和包版本, 修复一些匹配错误的地址
9 | * 1.2.0
10 | * geocoding项目转移到组织:bitlap/geocoding
11 | * 1.3.0
12 | * 新增自定义地址文件库配置参数
13 | * 添加自定义地址新增replace参数
14 | * 1.3.1
15 | * 修复normalizing方法多线程调用报错的问题
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright © 2018 IceMimosa
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of
5 | this software and associated documentation files (the “Software”), to deal in
6 | the Software without restriction, including without limitation the rights to
7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8 | the Software, and to permit persons to whom the Software is furnished to do so,
9 | subject to the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [Project stage](https://github.com/bitlap/bitlap/wiki/Project-Stages)
3 | [Java 8 CI](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml)
4 | [Maven Central](https://central.sonatype.com/artifact/org.bitlap/geocoding)
5 |
6 | # 介绍
7 | 本项目旨在将不规范(或者连续)的文本地址进行尽可能的**标准化**, 以及对两个地址进行**相似度的计算**。
8 |
9 | 地理编码技术, 主要分为如下步骤
10 | * 地址标准库
11 | * 地址标准化
12 | * 相似度计算
13 |
14 | ## pom
15 |
16 | ```xml
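17 | <dependencies>
18 | <dependency>
19 | <groupId>org.bitlap</groupId>
20 | <artifactId>geocoding</artifactId>
21 | <version>1.3.1</version>
22 | </dependency>
23 | </dependencies>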
24 | ```
25 |
26 | # 1. 数据测试
27 |
28 | 方法调用: `Geocoding` 类
29 | * normalizing: 标准化
30 | * analyze: 解析成分词文档
31 | * similarity: 相似度计算
32 | * similarityWithResult: 相似度计算, 返回包含更多丰富的数据
33 |
34 | ## 1.1 标准化
35 |
36 | ```java
37 | >> 输入: 山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303
38 | >> 输出:
39 | Address(
40 | provinceId=370000000000, province=山东省,
41 | cityId=370200000000, city=青岛市,
42 | districtId=370203000000, district=市北区,
43 | streetId=370203030000, street=水清沟街道,
44 | townId=null, town=null,
45 | villageId=null, village=null,
46 | road=九江路,
47 | roadNum=20号,
48 | buildingNum=3号楼2单元1303,
49 | text=大都会
50 | )
51 | ```
52 |
53 | ```java
54 | >> 输入: 上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)
55 | >> 输出:
56 | Address(
57 | provinceId=310000000000, province=上海,
58 | cityId=310100000000, city=上海市,
59 | districtId=310113000000, district=宝山区,
60 | streetId=null, street=null,
61 | townId=null, town=null,
62 | villageId=null, village=null,
63 | road=新沪路,
64 | roadNum=58弄,
65 | buildingNum=11-802,
66 | text=水韵华庭水韵华庭附近
67 | )
68 | ```
69 |
70 | * 返回的对象解释
71 | * province相关: 省
72 | * city相关: 市
73 | * district相关: 区、县
74 | * street相关: 街道
75 | * town相关: 乡镇
76 | * village相关: 村
77 | * road: 道路
78 | * roadNum: 路号
79 | * buildingNum: 建筑物号
80 | * text: 标准化后未匹配的地址。一般包含小区, 商场名称等信息
81 |
82 | > 注: 如果对text的结果不是很满意, 比如出现重复或不准确, 可以通过分词的手段解决
83 |
84 | ## 1.2 相似度
85 |
86 | ```java
87 | >> 输入:
88 | 浙江金华义乌市南陈小区8幢2号
89 | 浙江金华义乌市稠城街道浙江省义乌市宾王路99号后面南陈小区8栋2号
90 | >> 输出:
91 | 0.8451542547285166
92 | ```
93 |
94 | ```java
95 | >> 输入:
96 | 山东省沂水县四十里堡镇东艾家庄村206号
97 | 浙江金华义乌市南陈小区8幢2号
98 | >> 输出:
99 | 0.0
100 | ```
101 |
102 | ## 1.3 自定义地址文件设置
103 |
104 | ```kotlin
105 | // 加载自定义地址文件
106 | val geocoding = GeocodingX("region_2021.dat")
107 |
108 | // 添加自定义区县"临平区"
109 | geocoding.addRegionEntry(330113000000, 330100000000, "临平区", RegionType.District, "", true)
110 |
111 | // 保存自定义字典文件
112 | geocoding.save("xxx.dat")
113 | ```
114 |
115 | ## 1.4 自定义地址设置
116 |
117 | ```kotlin
118 | // 100000000000 代表中国的ID
119 | Geocoding.addRegionEntry(88888888, 100000000000, "尼玛省", RegionType.Province)
120 | Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City)
121 | Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District)
122 |
123 | >> 输入: 中国尼玛省尼玛市泥煤市泥煤大道888号xxx
124 | >> 输出:
125 | Address(
126 | provinceId=88888888, province=尼玛省,
127 | cityId=8888888, city=尼玛市,
128 | districtId=888888, district=泥煤市,
129 | streetId=null, street=null,
130 | townId=null, town=null,
131 | villageId=null, village=null,
132 | road=泥煤大道,
133 | roadNum=888号,
134 | buildingNum=null,
135 | text=xxx
136 | )
137 | ```
138 |
139 | > Tips: 可以从「国家标准地址库」中获取「父级城市ID」
140 |
141 | # 2. 说明
142 |
143 | ## 2.1 标准地址库
144 | 项目目前采用的是 [~~淘宝物流4级地址~~][1] (已过期,可通过淘宝收货地址获取实际调用地址)的标准地址库, 也可以采用`国家的标准地址库` (对应的github库, [中国5级行政区域mysql库][3]).
145 | * [国家标准地址库2023](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023)
146 | * [国家标准地址库2022](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022)
147 | * [国家标准地址库2021](http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2021)
148 |
149 | ### 导入中国5级行政区域mysql库注意事项
150 |
151 | [参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md)
152 |
153 | ## 2.2 标准地址库(兼容本项目)
154 |
155 | | 标准库文件 | 描述 | 参考 | 感谢 |
156 | |-----------------|-------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------|
157 | | region_2021.dat | 国家标准地址库2021 | [ISSUE-163](https://github.com/bitlap/geocoding/issues/163) | [TsLenMo](https://github.com/TsLenMo)、[weijiang.lin](https://github.com/linweijiang) |
158 |
159 | 使用方式:文件下载到`classpath`,使用自定义的`GeocodingX`类即可。
160 |
161 | ## 2.3 标准化
162 | 1. 首先基于正则提取出道路、建筑物号等信息
163 | 2. 省市区等匹配
164 | 1. 将标准的地址库建立**倒排索引**
165 | 2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条
166 | 3. 对所有匹配结果进行标准行政区域从属关系校验
167 |
168 | ## 2.4 相似度计算
169 | 1. 对输入的两个地址进行标准化
170 | 2. 对省市区等信息分配不同的权重
171 | 3. 对道路号, 建筑号进行语义处理, 分配权重
172 | 4. 对剩余文本(text)使用**IK Analyzer**进行分词
173 | 5. 对两个结果集使用**余弦相似度算法**计算相似度
174 |
175 |
176 | 项目参考[address-semantic-search][4],简化了流程,修复了各种不规则错误,使得使用更加方便。
177 |
178 | ## 感谢
179 |
180 | * Python封装库:[casuallyName/Geocoding](https://github.com/casuallyName/Geocoding)
181 |
182 |
183 | ## Release Log
184 |
185 | [Change Log](./CHANGES.md)
186 |
187 | ## LICENSE
188 |
189 | MIT
190 |
191 | [1]:https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do
192 | [3]:https://github.com/kakuilan/china_area_mysql
193 | [4]:https://github.com/liuzhibin-cn/address-semantic-search
194 |
--------------------------------------------------------------------------------
/README_old.md:
--------------------------------------------------------------------------------
1 |
2 | [Java 8 CI](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml)
3 |
4 | # 介绍
5 | 本项目旨在将不规范(或者连续)的文本地址进行尽可能的**标准化**, 以及对两个地址进行**相似度的计算**。
6 |
7 | 地理编码技术, 主要分为如下步骤
8 | * 地址标准库
9 | * 地址标准化
10 | * 相似度计算
11 |
12 | ## pom
13 |
14 | ```xml
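15 | <dependencies>
16 | <dependency>
17 | <groupId>io.patamon.geocoding</groupId>
18 | <artifactId>geocoding</artifactId>
19 | <version>1.1.6</version>
20 | </dependency>
21 | </dependencies>
22 |
23 | <repositories>
24 | <repository>
25 | <id>geocoding</id>
26 | <name>github release repository</name>
27 | <url>https://maven.pkg.github.com/IceMimosa/geocoding</url>
28 | </repository>
29 | </repositories>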
30 | ```
31 |
32 | > PS: 需要申请github token才能访问, [Authenticating to GitHub Packages](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-apache-maven-registry#authenticating-to-github-packages). 比如在 `~/.m2/settings.xml` 添加如下, [token申请地址](https://github.com/settings/tokens)
33 |
34 | ```xml
35 | <servers>
36 | <server>
37 | <id>geocoding</id>
38 | <username>[YOUR_NAME]</username>
39 | <password>[YOUR_TOKEN]</password>
40 | </server>
41 | </servers>
42 | ```
43 |
44 |
45 | # 1. 数据测试
46 |
47 | 方法调用: `Geocoding` 类
48 | * normalizing: 标准化
49 | * analyze: 解析成分词文档
50 | * similarity: 相似度计算
51 | * similarityWithResult: 相似度计算, 返回包含更多丰富的数据
52 |
53 | ## 1.1 标准化
54 |
55 | ```java
56 | >> 输入: 山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303
57 | >> 输出:
58 | Address(
59 | provinceId=370000000000, province=山东省,
60 | cityId=370200000000, city=青岛市,
61 | districtId=370203000000, district=市北区,
62 | streetId=370203030000, street=水清沟街道,
63 | townId=null, town=null,
64 | villageId=null, village=null,
65 | road=九江路,
66 | roadNum=20号,
67 | buildingNum=3号楼2单元1303,
68 | text=大都会
69 | )
70 | ```
71 |
72 | ```java
73 | >> 输入: 上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)
74 | >> 输出:
75 | Address(
76 | provinceId=310000000000, province=上海,
77 | cityId=310100000000, city=上海市,
78 | districtId=310113000000, district=宝山区,
79 | streetId=null, street=null,
80 | townId=null, town=null,
81 | villageId=null, village=null,
82 | road=新沪路,
83 | roadNum=58弄,
84 | buildingNum=11-802,
85 | text=水韵华庭水韵华庭附近
86 | )
87 | ```
88 |
89 | * 返回的对象解释
90 | * province相关: 省
91 | * city相关: 市
92 | * district相关: 区、县
93 | * street相关: 街道
94 | * town相关: 乡镇
95 | * village相关: 村
96 | * road: 道路
97 | * roadNum: 路号
98 | * buildingNum: 建筑物号
99 | * text: 标准化后未匹配的地址。一般包含小区, 商场名称等信息
100 |
101 | > 注: 如果对text的结果不是很满意, 比如出现重复或不准确, 可以通过分词的手段解决
102 |
103 | ## 1.2 相似度
104 |
105 | ```java
106 | >> 输入:
107 | 浙江金华义乌市南陈小区8幢2号
108 | 浙江金华义乌市稠城街道浙江省义乌市宾王路99号后面南陈小区8栋2号
109 | >> 输出:
110 | 0.8451542547285166
111 | ```
112 |
113 | ```java
114 | >> 输入:
115 | 山东省沂水县四十里堡镇东艾家庄村206号
116 | 浙江金华义乌市南陈小区8幢2号
117 | >> 输出:
118 | 0.0
119 | ```
120 |
121 | ## 1.3 自定义地址设置
122 |
123 | ```kotlin
124 | // 100000000000 代表中国的ID
125 | Geocoding.addRegionEntry(88888888, 100000000000, "尼玛省", RegionType.Province)
126 | Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City)
127 | Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District)
128 |
129 | >> 输入: 中国尼玛省尼玛市泥煤市泥煤大道888号xxx
130 | >> 输出:
131 | Address(
132 | provinceId=88888888, province=尼玛省,
133 | cityId=8888888, city=尼玛市,
134 | districtId=888888, district=泥煤市,
135 | streetId=null, street=null,
136 | townId=null, town=null,
137 | villageId=null, village=null,
138 | road=泥煤大道,
139 | roadNum=888号,
140 | buildingNum=null,
141 | text=xxx
142 | )
143 | ```
144 |
145 | > Tips: 可以从「国家标准地址库」中获取「父级城市ID」
146 |
147 | # 2. 说明
148 |
149 | ## 2.1 标准地址库
150 | 项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用[国家的标准地址库][2] (对应的github库, [中国5级行政区域mysql库][3]).
151 |
152 | ### 导入中国5级行政区域mysql库注意事项
153 |
154 | [参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md)
155 |
156 | ## 2.2 标准化
157 | 1. 首先基于正则提取出道路、建筑物号等信息
158 | 2. 省市区等匹配
159 | 1. 将标准的地址库建立**倒排索引**
160 | 2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条
161 | 3. 对所有匹配结果进行标准行政区域从属关系校验
162 |
163 | ## 2.3 相似度计算
164 | 1. 对输入的两个地址进行标准化
165 | 2. 对省市区等信息分配不同的权重
166 | 3. 对道路号, 建筑号进行语义处理, 分配权重
167 | 4. 对剩余文本(text)使用**IK Analyzer**进行分词
168 | 5. 对两个结果集使用**余弦相似度算法**计算相似度
169 |
170 |
171 | 项目参考[address-semantic-search][4],简化了流程,修复了各种不规则错误,使得使用更加方便。
172 |
173 | ## 感谢
174 |
175 | * Python封装库:[casuallyName/Geocoding](https://github.com/casuallyName/Geocoding)
176 |
177 |
178 | ## Release Log
179 |
180 | * 1.1.3
181 | * 新增自定义地址设置
182 | * 1.1.4
183 | * 修复一些匹配错误的bug
184 | * 1.1.6
185 | * 升级地址库和包版本, 修复一些匹配错误的地址
186 | * 1.2.0
187 | - geocoding项目转移到组织:bitlap/geocoding
188 |
189 | [1]:https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do
190 | [2]:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html
191 | [3]:https://github.com/kakuilan/china_area_mysql
192 | [4]:https://github.com/liuzhibin-cn/address-semantic-search
193 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.bitlap
8 | geocoding
9 | 1.3.1
10 |
11 | geocoding
12 | 地理编码技术,提供地址标准化和相似度计算。
13 | https://github.com/bitlap/geocoding
14 | 2018
15 |
16 |
17 |
18 | MIT
19 | https://opensource.org/licenses/MIT
20 | repo
21 |
22 |
23 |
24 |
25 | scm:git:git://github.com/bitlap/geocoding.git
26 | https://github.com/bitlap/geocoding
27 | HEAD
28 |
29 |
30 |
31 |
32 | IceMimosa
33 | ChenKai
34 | http://patamon.me
35 | chk19940609@gmail.com
36 |
37 |
38 | jxnu-liguobin
39 | 梦境迷离
40 | https://dreamylost.cn
41 | dreamylost@outlook.com
42 |
43 |
44 | overcat
45 | Jun Luo
46 | https://keybase.io/overcat
47 | 4catcode@gmail.com
48 |
49 |
50 | cheese8
51 | cheese8
52 | https://github.com/cheese8
53 |
54 |
55 |
56 |
57 | UTF-8
58 | UTF-8
59 | 1.6.10
60 | 1.6.0
61 | 8.5.2
62 | 1.3
63 | 2.9.0
64 | 2012_u6
65 | true
66 | 1.8
67 |
68 |
69 |
70 |
71 | org.jetbrains.kotlin
72 | kotlin-stdlib
73 | ${kotlin.version}
74 |
75 |
76 | com.google.code.gson
77 | gson
78 | ${gson.version}
79 |
80 |
81 |
82 | commons-io
83 | commons-io
84 | 2.11.0
85 | test
86 |
87 |
88 | com.google.guava
89 | guava
90 | 31.1-jre
91 | test
92 |
93 |
94 |
95 | org.jetbrains.kotlin
96 | kotlin-test-junit
97 | ${kotlin.version}
98 | test
99 |
100 |
101 | junit
102 | junit
103 | 4.13.2
104 | test
105 |
106 |
107 | mysql
108 | mysql-connector-java
109 | 8.0.28
110 | test
111 |
112 |
113 |
114 |
126 |
147 |
148 | com.janeluo
149 | ikanalyzer
150 | ${ik.analyzer.version}
151 |
152 |
153 | org.apache.lucene
154 | lucene-core
155 |
156 |
157 | org.apache.lucene
158 | lucene-queryparser
159 |
160 |
161 | org.apache.lucene
162 | lucene-analyzers-common
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 | org.apache.maven.plugins
172 | maven-compiler-plugin
173 | 3.8.1
174 |
175 | 1.8
176 | 1.8
177 |
178 |
179 |
180 | kotlin-maven-plugin
181 | org.jetbrains.kotlin
182 | ${kotlin.version}
183 |
184 |
185 | compile
186 | process-sources
187 |
188 | compile
189 |
190 |
191 |
192 | test-compile
193 | process-test-sources
194 |
195 | test-compile
196 |
197 |
198 |
199 |
200 |
201 | org.apache.maven.plugins
202 | maven-source-plugin
203 | 3.2.1
204 |
205 |
206 | package
207 |
208 | jar
209 |
210 |
211 |
212 |
213 |
214 | org.apache.maven.plugins
215 | maven-resources-plugin
216 | 3.2.0
217 |
218 |
219 | org.apache.maven.plugins
220 | maven-jar-plugin
221 | 2.4
222 |
223 |
224 | **/logback.xml
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 | release
234 |
235 |
236 | central
237 | Maven Snapshots
238 | https://s01.oss.sonatype.org/content/repositories/snapshots/
239 |
240 |
241 | central
242 | Maven Releases
243 | https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/
244 |
245 |
246 |
247 |
248 |
249 | org.jetbrains.dokka
250 | dokka-maven-plugin
251 | ${dokka.version}
252 |
253 |
254 | package
255 |
256 | dokka
257 | javadocJar
258 |
259 |
260 |
261 |
262 |
263 |
264 | org.jetbrains.dokka
265 | kotlin-as-java-plugin
266 | ${dokka.version}
267 |
268 |
269 |
270 |
271 |
272 | org.apache.maven.plugins
273 | maven-gpg-plugin
274 | 3.0.1
275 |
276 |
277 | sign-artifacts
278 | verify
279 |
280 | sign
281 |
282 |
283 |
284 |
285 |
286 | org.apache.maven.plugins
287 | maven-deploy-plugin
288 | 2.8.2
289 |
290 | true
291 |
292 |
293 |
294 | org.sonatype.plugins
295 | nexus-staging-maven-plugin
296 | 1.6.13
297 | true
298 |
299 | central
300 | https://s01.oss.sonatype.org/
301 | true
302 |
303 |
304 |
305 |
306 |
307 |
308 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/Geocoding.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding;
2 |
3 | import org.bitlap.geocoding.core.Context
4 | import org.bitlap.geocoding.model.Address
5 | import org.bitlap.geocoding.model.RegionEntity
6 | import org.bitlap.geocoding.model.RegionType
7 | import org.bitlap.geocoding.similarity.Document
8 | import org.bitlap.geocoding.similarity.MatchedResult
9 |
10 | /**
11 | * Desc: Main service facade; every method delegates to the shared [DEFAULT] instance.
12 | * Mail: chk19940609@gmail.com
13 | * Created by IceMimosa
14 | * Date: 2017/1/12
15 | */
16 | object Geocoding {
17 |
18 | @JvmField
19 | val DEFAULT = GeocodingX()
20 |
21 | /**
22 | * Normalizes an address: cleans a non-standard address into the standard address format.
23 | */
24 | @JvmStatic
25 | fun normalizing(address: String): Address? {
26 | return DEFAULT.normalizing(address)
27 | }
28 |
29 | /**
30 | * Segments an address into a [Document]; both overloads below forward to [DEFAULT].
31 | */
32 | @JvmStatic
33 | fun analyze(address: String): Document? {
34 | return DEFAULT.analyze(address)
35 | }
36 | @JvmStatic
37 | fun analyze(address: Address?): Document? {
38 | return DEFAULT.analyze(address)
39 | }
40 |
41 | /**
42 | * Computes the similarity of two addresses (raw text or already-normalized [Address] values).
43 | */
44 | @JvmStatic
45 | fun similarity(address1: String, address2: String): Double {
46 | return DEFAULT.similarity(address1, address2)
47 | }
48 | @JvmStatic
49 | fun similarity(address1: Address?, address2: Address?): Double {
50 | return DEFAULT.similarity(address1, address2)
51 | }
52 |
53 | /**
54 | * Computes the address similarity and returns the full [MatchedResult], including all matched results.
55 | */
56 | @JvmStatic
57 | fun similarityWithResult(address1: String, address2: String): MatchedResult {
58 | return DEFAULT.similarityWithResult(address1, address2)
59 | }
60 | @JvmStatic
61 | fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
62 | return DEFAULT.similarityWithResult(address1, address2)
63 | }
64 |
65 | /**
66 | * Depth-first match of the address information that fits [text]; returns the matched regions.
67 | */
68 | @JvmStatic
69 | fun match(text: String): List<RegionEntity> {
70 | return DEFAULT.match(text)
71 | }
72 |
73 | @JvmStatic
74 | fun getContext(): Context = DEFAULT.ctx
75 |
76 | /**
77 | * Registers a custom address on [DEFAULT] and returns this object so calls can be chained.
78 | *
79 | * @param id ID of the address
80 | * @param parentId parent ID of the address; must already exist
81 | * @param name name of the address
82 | * @param type address type, see [RegionType]
83 | * @param alias alias of the address
84 | * @param replace whether to replace an old address; it is replaced when the fields other than [id] are equal
85 | */
86 | @JvmStatic
87 | fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): Geocoding {
88 | DEFAULT.addRegionEntry(id, parentId, name, type, alias, replace)
89 | return this
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/GeocodingX.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.core.Context
4 | import org.bitlap.geocoding.model.Address
5 | import org.bitlap.geocoding.model.RegionEntity
6 | import org.bitlap.geocoding.model.RegionType
7 | import org.bitlap.geocoding.similarity.Document
8 | import org.bitlap.geocoding.similarity.MatchedResult
9 |
10 |
11 | /**
12 | * Geocoding service bound to a caller-supplied [Context]; use it to work with a custom region data file.
13 | */
14 | open class GeocodingX(val ctx: Context) {
15 |
16 | constructor(): this(false)
17 | constructor(strict: Boolean): this("core/region.dat", strict)
18 | constructor(dataClassPath: String): this(dataClassPath, false)
19 |
20 | /**
21 | * @param dataClassPath classpath location of the custom address data file
22 | * @param strict parsing mode, false by default. When no province or city is found and exactly one parent candidate matches, the match succeeds.
23 | * * true: strict mode - when no province or city is found and more than one parent candidate matches, null is returned
24 | * * false: lenient mode - when no province or city is found and more than one parent candidate matches, a random candidate province/city is used
25 | */
26 | constructor(dataClassPath: String, strict: Boolean): this(Context(dataClassPath, strict))
27 |
28 | /**
29 | * Normalizes an address: cleans a non-standard address into the standard address format.
30 | */
31 | fun normalizing(address: String): Address? {
32 | return Address.build(ctx.interpreter.interpret(address))
33 | }
34 |
35 | /**
36 | * Segments an address into a [Document]; returns null when there is no address to analyze.
37 | */
38 | fun analyze(address: String): Document? {
39 | val add = normalizing(address) ?: return null
40 | return ctx.computer.analyze(add)
41 | }
42 | fun analyze(address: Address?): Document? {
43 | address ?: return null
44 | return ctx.computer.analyze(address)
45 | }
46 |
47 | /**
48 | * Computes the similarity of two addresses; the String overload normalizes both inputs first.
49 | */
50 | fun similarity(address1: String, address2: String): Double {
51 | val compute = ctx.computer.compute(
52 | normalizing(address1),
53 | normalizing(address2)
54 | )
55 | return compute.similarity
56 | }
57 | fun similarity(address1: Address?, address2: Address?): Double {
58 | val compute = ctx.computer.compute(address1, address2)
59 | return compute.similarity
60 | }
61 |
62 | /**
63 | * Computes the address similarity and returns the full [MatchedResult], including all matched results.
64 | */
65 | fun similarityWithResult(address1: String, address2: String): MatchedResult {
66 | return ctx.computer.compute(
67 | normalizing(address1),
68 | normalizing(address2)
69 | )
70 | }
71 | fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
72 | return ctx.computer.compute(address1, address2)
73 | }
74 |
75 | /**
76 | * Depth-first match of the address information that fits [text]; items without a value are dropped.
77 | */
78 | fun match(text: String): List<RegionEntity> {
79 | val terms = ctx.interpreter.getTermIndexBuilder().fullMatch(text) ?: emptyList()
80 | return terms.mapNotNull { it.value }
81 | }
82 |
83 | /**
84 | * Registers a custom address and returns this instance so calls can be chained.
85 | * Throws IllegalArgumentException when the parent region does not exist or [name] is blank.
86 | * @param id ID of the address
87 | * @param parentId parent ID of the address; must already exist
88 | * @param name name of the address
89 | * @param type address type, see [RegionType]
90 | * @param alias alias of the address
91 | * @param replace whether to replace an old address; it is replaced when the fields other than [id] are equal
92 | */
93 | fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): GeocodingX {
94 | ctx.persister.getRegion(parentId) ?: throw IllegalArgumentException("Parent Address is not exists, parentId is $parentId")
95 | if (name.isBlank()) {
96 | throw IllegalArgumentException("name should not be blank.")
97 | }
98 | // Build the region object
99 | val region = RegionEntity()
100 | region.id = id
101 | region.parentId = parentId
102 | region.name = name
103 | region.alias = alias
104 | region.type = type
105 | // For now, initialize the list of child administrative divisions here
106 | region.children = arrayListOf()
107 | // 1. Add to cache (id -> Region)
108 | ctx.persister.addRegionEntity(region)
109 | // 2. Build term index
110 | val indexBuilder = ctx.interpreter.getTermIndexBuilder()
111 | indexBuilder.indexRegions(listOf(region), replace)
112 | return this
113 | }
114 |
115 | fun save(path: String) { // forwards to the context's persister
116 | ctx.persister.save(path)
117 | }
118 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/AddressInterpreter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | import org.bitlap.geocoding.index.TermIndexBuilder
4 | import org.bitlap.geocoding.model.AddressEntity
5 |
6 | /**
7 | * Desc: Address interpretation.
8 | * Parses the province, city, district, street, town, road and other address components out of address text.
9 | * Mail: chk19940609@gmail.com
10 | * Created by IceMimosa
11 | * Date: 2017/1/12
12 | */
13 | interface AddressInterpreter {
14 |
15 | /**
16 | * Normalizes a `dirty` address and parses it into an [AddressEntity].
17 | */
18 | fun interpret(address: String?): AddressEntity?
19 |
20 |
21 | /**
22 | * Returns the [TermIndexBuilder].
23 | */
24 | fun getTermIndexBuilder(): TermIndexBuilder
25 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/AddressPersister.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | import org.bitlap.geocoding.model.RegionEntity
4 |
5 | /**
6 | * Desc: Persistence-layer operations for addresses; for now this only handles the standard address library.
7 | * Normalized addresses are not stored in a database for the time being.
8 | * Mail: chk19940609@gmail.com
9 | * Created by IceMimosa
10 | * Date: 2017/1/12
11 | */
12 | interface AddressPersister {
13 |
14 | /**
15 | * Returns the tree structure of administrative-division addresses.
16 | */
17 | fun getRootRegion(): RegionEntity
18 |
19 | /**
20 | * Looks up a region by id.
21 | */
22 | fun getRegion(id: Long): RegionEntity?
23 |
24 | /**
25 | * Adds a new region entry.
26 | */
27 | fun addRegionEntity(entity: RegionEntity)
28 |
29 | /**
30 | * Saves a new dat file.
31 | */
32 | fun save(path: String)
33 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/Computer.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | import org.bitlap.geocoding.model.Address
4 | import org.bitlap.geocoding.similarity.Document
5 | import org.bitlap.geocoding.similarity.MatchedResult
6 |
7 | /**
8 | * Desc: Logic related to the similarity algorithm.
9 | * Mail: chk19940609@gmail.com
10 | * Created by IceMimosa
11 | * Date: 2017/2/5
12 | */
13 | interface Computer {
14 |
15 | /**
16 | * Converts a standard address into a document object.
17 | * 1. Segment the text into words
18 | * 2. Compute the IDF for each part
19 | */
20 | fun analyze(address: Address): Document
21 |
22 | /**
23 | * Computes the similarity of two standard addresses.
24 | * 1. Turn both addresses into a Document
25 | * 2. Assign a weight to every Term of each Document
26 | * 3. Compute the cosine similarity of the two term groups
27 | */
28 | fun compute(addr1: Address?, addr2: Address?): MatchedResult
29 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/Context.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | import org.bitlap.geocoding.core.impl.DefaultAddressInterpreter
4 | import org.bitlap.geocoding.core.impl.DefaultAddressPersister
5 | import org.bitlap.geocoding.core.impl.DefaultRegionCache
6 | import org.bitlap.geocoding.core.impl.SimilarityComputer
7 |
8 | /**
9 | * Desc: Context bundling the persister, interpreter and computer; each defaults to the built-in implementation.
10 | * Mail: chk19940609@gmail.com
11 | * Created by IceMimosa
12 | * Date: 2017/1/12
13 | */
14 | open class Context(
15 | val dataClassPath: String,
16 | val strict: Boolean,
17 | val persister: AddressPersister = DefaultAddressPersister(DefaultRegionCache(dataClassPath)),
18 | val interpreter: AddressInterpreter = DefaultAddressInterpreter(persister, strict),
19 | val computer: Computer = SimilarityComputer(),
20 | ) {
21 |
22 |
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/RegionCache.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | import org.bitlap.geocoding.model.RegionEntity
4 |
5 | /**
6 | * Desc: Abstraction for obtaining region entities.
7 | * By default they are read from region.dat; they could also come from, for example, a database.
8 | * Mail: chk19940609@gmail.com
9 | * Created by IceMimosa
10 | * Date: 2017/1/12
11 | */
12 | interface RegionCache {
13 |
14 | /**
15 | * Loads the full region list and builds the tree structure according to the administrative divisions.
16 | */
17 | fun get(): RegionEntity
18 |
19 |
20 | /**
21 | * Loads the region map: the key is the region id, the value is the region entity.
22 | */
23 | fun getCache(): Map<Long, RegionEntity>
24 |
25 | /**
26 | * Adds a new region entry.
27 | */
28 | fun addRegionEntity(entity: RegionEntity)
29 |
30 | /**
31 | * Saves a new dat file.
32 | */
33 | fun save(path: String)
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/Segmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core
2 |
3 | /**
4 | * Desc: Segmenter interface; performs word segmentation on text.
5 | * Implementations may be SmartCN, IKAnalyzer, Word and so on.
6 | * Mail: chk19940609@gmail.com
7 | * Created by IceMimosa
8 | * Date: 2017/2/6
9 | */
10 | interface Segmenter {
11 |
12 | /**
13 | * Segments [text] and returns the resulting words.
14 | */
15 | fun segment(text: String): List<String>
16 |
17 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/TermIndexVisitor.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core;
2 |
3 | import org.bitlap.geocoding.index.TermIndexEntry;
4 | import org.bitlap.geocoding.model.Division
5 |
6 | /**
7 | * Desc: Visitor used when searching the term inverted index.
8 | * Mail: chk19940609@gmail.com
9 | * Created by IceMimosa
10 | * Date: 2017/1/12
11 | */
12 | interface TermIndexVisitor {
13 |
14 | /**
15 | * Starts a round of term matching.
16 | */
17 | fun startRound()
18 |
19 | /**
20 | * Called when an index entry has been matched; the visitor decides whether it is an acceptable match.
21 | * The items under index entry [entry] always contain one or more index objects.
22 | *
23 | * @return true if the entry is acceptable, false otherwise. For an accepted entry, [endVisit] is called to finish the visit
24 | */
25 | fun visit(entry: TermIndexEntry, text: String, pos: Int): Boolean
26 |
27 | /**
28 | * Current match pointer position after [visit] has accepted an index item.
29 | */
30 | fun position(): Int
31 |
32 | /**
33 | * Finishes the visit of an index entry.
34 | */
35 | fun endVisit(entry: TermIndexEntry, text: String, pos: Int)
36 |
37 | /**
38 | * Ends a round of term matching.
39 | */
40 | fun endRound()
41 |
42 | /**
43 | * Whether a result has been matched.
44 | */
45 | fun hasResult(): Boolean
46 |
47 | /**
48 | * Returns the final match result after visiting.
49 | */
50 | fun devision(): Division
51 |
52 | fun matchCount(): Int
53 | fun fullMatchCount(): Int
54 |
55 | /**
56 | * Returns the end position of the final match result.
57 | */
58 | fun endPosition(): Int
59 |
60 | /**
61 | * Resets the state.
62 | */
63 | fun reset()
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/impl/DefaultAddressInterpreter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.impl
2 |
3 | import org.bitlap.geocoding.core.AddressInterpreter
4 | import org.bitlap.geocoding.core.AddressPersister
5 | import org.bitlap.geocoding.core.TermIndexVisitor
6 | import org.bitlap.geocoding.index.TermIndexBuilder
7 | import org.bitlap.geocoding.index.TermType
8 | import org.bitlap.geocoding.model.AddressEntity
9 | import org.bitlap.geocoding.model.RegionEntity
10 | import org.bitlap.geocoding.utils.head
11 | import org.bitlap.geocoding.utils.remove
12 | import org.bitlap.geocoding.utils.removeRepeatNum
13 | import org.bitlap.geocoding.utils.tail
14 | import org.bitlap.geocoding.utils.take
15 | import java.util.regex.Pattern
16 |
17 | /**
18 | * Desc: 地址解析操作
19 | * 从地址文本中解析出省、市、区、街道、乡镇、道路等地址组成部分
20 | * Mail: chk19940609@gmail.com
21 | * Created by IceMimosa
22 | * Date: 2017/1/17
23 | */
24 | open class DefaultAddressInterpreter(val persister: AddressPersister, val strict: Boolean) : AddressInterpreter {
25 |
// Region names that are handed to the index builder as names to ignore
// (placeholder names used by e-commerce platforms and other data sources).
private val ignoringRegionNames = mutableListOf(
    // JD, Tmall
    "其它区", "其他地区", "其它地区", "全境", "城区", "城区以内", "城区以外", "郊区", "县城内", "内环以内", "开发区", "经济开发区", "经济技术开发区",
    // ehaier (from TMall or HP)
    "省直辖", "省直辖市县",
    // Others
    "地区", "市区"
)

// Term index, built once from the persister's root region.
// The declared type stays nullable because the rest of the class dereferences it with `!!`.
private val indexBuilder: TermIndexBuilder? = TermIndexBuilder(persister.getRootRegion(), ignoringRegionNames)
40 |
41 |
companion object {
    // Special characters, set 1: whitespace, punctuation and separator characters
    private val specialChars1 = " \r\n\t,,。·..;;::、!@$%*^`~=+&'\"|_-\\/".toCharArray()
    // Special characters, set 2: wrapping bracket / quote characters
    private val specialChars2 = "{}【】〈〉<>[]「」“”()()".toCharArray()

    // Separator characters accepted between the parts of a building number.
    // The hyphen-like character between 一 and — is written as an explicit escape (U+FF0D, full-width
    // hyphen): a bare '-' at that position forms the descending range 一..— which Pattern.compile rejects.
    private const val PART_SEPARATOR = "[\\#\\-一\uFF0D—/\\\\]"

    /**
     * Matches building numbers where the road number may be missing, e.g.
     * xx路xx号楼
     * xx路xx-xx
     */
    private val P_BUILDING_NUM0 = Pattern.compile(
        "((路|街|巷)[0-9]+号([0-9A-Z一二三四五六七八九十]$PART_SEPARATOR|楼)?)?([0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|楼|\\#楼?)){0,1}([一二三四五六七八九十东西南北甲乙丙0-9]+($PART_SEPARATOR|单元|门|梯|层|座|组))?([0-9]+($PART_SEPARATOR|室|房|层|楼|号|户)?)?([0-9]+号?)?"
    )

    /**
     * Standard building pattern: xx栋xx单元xxx.
     * Note 1: for "山东青岛市南区宁夏路118号4号楼6单元202", if the pattern did not start with
     * (路[0-9]+号)?, the first qualifying match would be "118号4"; the match and everything after it
     * would then be treated as the building, giving "118号4号楼6单元202".
     *
     * Therefore (路[0-9]+号)? has to be matched first.
     */
    private val P_BUILDING_NUM1 = Pattern.compile(
        "((路|街|巷)[0-9]+号)?([0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|\\#楼?)){0,1}([一二三四五六七八九十东西南北甲乙丙0-9]+(单元|门|梯|层|座))?([0-9]+(室|房)?)?"
    )

    /**
     * Validation pattern for buildings. The building patterns above can match plain digits and other
     * unsuitable text; this pattern is used to reject those.
     */
    private val P_BUILDING_NUM_V = Pattern.compile(
        "(栋|幢|橦|号楼|号|\\#|\\#楼|单元|室|房|门)+"
    )

    /**
     * Building pattern: 12-2-302 (i.e. 12栋3单元302 written with separators).
     * The separator class uses an explicit U+FF0D for the same reason as [PART_SEPARATOR]:
     * a bare '-' between 一 and / is an illegal descending range.
     */
    private val P_BUILDING_NUM2 = Pattern.compile(
        "[A-Za-z0-9]+([\\#\\-一\uFF0D/\\\\]+[A-Za-z0-9]+)+"
    )

    /**
     * Building pattern: 10组21号 (rural addresses).
     */
    private val P_BUILDING_NUM3 = Pattern.compile(
        "[0-9]+(组|通道)[A-Z0-9\\-一]+号?"
    )

    // Simple bracket matching; the bracketed text is read through the named group "bracket".
    private val BRACKET_PATTERN = Pattern.compile(
        "(?<bracket>([\\((\\{\\<〈\\[【「][^\\))\\}\\>〉\\]】」]*[\\))\\}\\>〉\\]】」]))"
    )

    // Road information; read through the named groups "road", "ex" and "roadnum".
    private val P_ROAD = Pattern.compile(
        "^(?<road>([\u4e00-\u9fa5]{2,6}(路|街坊|街|道|大街|大道)))(?<ex>[甲乙丙丁])?(?<roadnum>[0-90123456789一二三四五六七八九十]+(号院|号楼|号大院|号|號|巷|弄|院|区|条|\\#院|\\#))?"
    )
    // Building information left over in the road text
    private val P_ROAD_BUILDING = Pattern.compile(
        "[0-9A-Z一二三四五六七八九十]+(栋|橦|幢|座|号楼|号|\\#楼?)"
    )

    // Town / township / village information; read through the named groups "z", "x" and "c".
    private val P_TOWN1 = Pattern.compile("^((?<z>[\u4e00-\u9fa5]{2,2}(镇|乡))(?<c>[\u4e00-\u9fa5]{1,3}村)?)")
    private val P_TOWN2 = Pattern.compile("^((?<z>[\u4e00-\u9fa5]{1,3}镇)?(?<x>[\u4e00-\u9fa5]{1,3}乡)?(?<c>[\u4e00-\u9fa5]{1,3}村(?!(村|委|公路|(东|西|南|北)?(大街|大道|路|街))))?)")
    private val P_TOWN3 = Pattern.compile("^(?<c>[\u4e00-\u9fa5]{1,3}村(?!(村|委|公路|(东|西|南|北)?(大街|大道|路|街))))?")

    // Text fragments that, when they directly follow a candidate town name, invalidate the match.
    private val invalidTownFollowings: MutableSet<String> = mutableSetOf(
        "政府", "大街", "大道", "社区", "小区", "小学", "中学", "医院", "银行", "中心",
        "卫生", "一小", "一中", "政局", "企局"
    )

    // Candidate names that are never accepted as a town / township / village.
    private val invalidTown: MutableSet<String> = mutableSetOf(
        // ...村
        "新村", "外村", "大村", "后村", "东村", "南村", "北村", "西村", "上村", "下村",
        "一村", "二村", "三村", "四村", "五村", "六村", "七村", "八村", "九村", "十村",
        "中村", "街村", "头村", "店村", "桥村", "楼村", "老村", "户村", "山村", "才村",
        "子村", "旧村", "文村", "全村", "和村", "湖村", "甲村", "乙村", "丙村", "邻村",
        "乡村", "村二村", "中关村",
        // ...乡
        "城乡", "县乡", "头乡", "牌乡", "茶乡", "水乡", "港乡", "巷乡", "七乡", "站乡",
        "西乡", "宝乡", "还乡",
        // ...镇
        "古镇", "小镇", "街镇", "城镇", "环镇", "湾镇", "岗镇", "镇镇", "场镇", "新镇",
        "乡镇", "屯镇", "大镇", "南镇", "店镇", "铺镇", "关镇", "口镇", "和镇", "建镇",
        "集镇", "庙镇", "河镇", "村镇"
    )
}
205 |
/**
 * Normalizes a "dirty" address and parses it into an [AddressEntity],
 * using a fresh [RegionInterpreterVisitor] for region matching.
 */
override fun interpret(address: String?): AddressEntity? =
    interpret(address, RegionInterpreterVisitor(persister, strict))
212 |
// Unit / floor / room fragments (e.g. "3单元", "12楼", "A#") removed from the remaining text.
// They must be Regex objects: String.replace(String, String) performs a literal replacement,
// so passing the pattern as a plain string never matches anything.
private val alnumUnitFragment = Regex("[0-9A-Za-z\\#]+(单元|楼|室|层|米|户|\\#)")
private val chineseNumUnitFragment = Regex("[一二三四五六七八九十]+(单元|楼|室|层|米|户)")

/**
 * Runs the full parsing pipeline on [address] with the given [visitor].
 *
 * @return the parsed entity, or null when [address] is null or blank
 */
private fun interpret(address: String?, visitor: TermIndexVisitor): AddressEntity? {
    if (address.isNullOrBlank()) return null

    val entity = AddressEntity(address)

    // Clean junk at the head of the text (user-entered data)
    prepare(entity)
    // Extract the building number
    extractBuildingNum(entity)
    // Remove special characters
    removeSpecialChars(entity)
    // Extract bracketed content, then strip the bracket characters themselves
    var brackets = extractBrackets(entity)
    brackets = brackets.remove(specialChars2)
    removeBrackets(entity)
    // Extract the standard administrative regions
    extractRegion(entity, visitor)
    // Tidy up repeated province / city / district / street matches
    removeRedundancy(entity, visitor)
    // Extract road information
    extractRoad(entity)
    // Town / village extraction is currently disabled:
    // extractTownVillage(entity)

    val remaining = entity.text.orEmpty()
    entity.text = remaining
        .replace(alnumUnitFragment, "")
        .replace(chineseNumUnitFragment, "")
    if (brackets.isNotEmpty()) {
        entity.text = entity.text + brackets
        // If no road was found, it may be inside the bracketed content
        if (entity.road.isNullOrBlank()) extractRoad(entity)
    }

    return entity
}
248 |
// Characters trimmed from the head of the raw text: digits, ASCII letters, whitespace and punctuation.
private val leadingNoise = Pattern.compile("[ \\da-zA-Z\r\n\t,,。·..;;::、!@$%*^`~=+&'\"|_\\-\\/]")

// Dash look-alikes that are normalized to '-'.
// The full-width hyphen is written as an explicit escape (U+FF0D): a bare '-' between '_' and '—'
// forms the range U+005F..U+2014, which also rewrites lowercase letters and many other characters.
// The ASCII '-' must stay out of the class so that the "--" alternative can still collapse a pair.
private val dashLookalikes = Regex("[ー_\uFF0D—/]|(--)")

// Cleans junk at the head of the text and normalizes dash-like characters.
private fun prepare(entity: AddressEntity) {
    if (entity.text.isNullOrBlank()) return

    // Drop leading digits, letters, whitespace and punctuation
    entity.text = entity.text?.trimStart {
        leadingNoise.matcher("$it").find()
    }

    // Replace dash look-alikes in the address with '-'
    entity.text = entity.text?.replace(dashLookalikes, "-")
}
262 |
// Extracts the building number from entity.text into entity.buildingNum.
// The patterns are tried in order (P_BUILDING_NUM0, 1, 2, 3); the first one that yields a
// building wins. Returns true when a building number was extracted.
private fun extractBuildingNum(entity: AddressEntity): Boolean {
    if (entity.text.isNullOrBlank()) return false

    return extractStructuredBuilding(entity, P_BUILDING_NUM0, true)
        || extractStructuredBuilding(entity, P_BUILDING_NUM1, false)
        // xx-xx-xx (xx栋xx单元xxx)
        || extractFirstMatchAsBuilding(entity, P_BUILDING_NUM2)
        // xx组xx号, xx通道xx号
        || extractFirstMatchAsBuilding(entity, P_BUILDING_NUM3)
}

// Scans the matches of [pattern]; the first non-empty match that passes P_BUILDING_NUM_V and has
// more than 3 non-null groups (counted over indices 0 until groupCount) becomes the building.
// A leading `路/街/巷 ... 号` prefix is cut off the building; when [cutAtRoadForBlock] is set and
// the candidate contains "号楼", the cut is made right after "路" instead of after the first "号".
private fun extractStructuredBuilding(entity: AddressEntity, pattern: Pattern, cutAtRoadForBlock: Boolean): Boolean {
    val matcher = pattern.matcher(entity.text)
    while (matcher.find()) {
        if (matcher.start() == matcher.end()) continue
        var candidate = entity.text!!.take(matcher.start(), matcher.end() - 1)
        val matchedGroups = (0 until matcher.groupCount()).count { matcher.group(it) != null }
        if (matchedGroups <= 3 || !P_BUILDING_NUM_V.matcher(candidate).find()) continue

        var cut = matcher.start()
        if (candidate.startsWith("路") || candidate.startsWith("街") || candidate.startsWith("巷")) {
            cut += if (cutAtRoadForBlock && candidate.contains("号楼")) {
                candidate.indexOf("路") + 1
            } else {
                candidate.indexOf("号") + 1
            }
            candidate = entity.text!!.take(cut, matcher.end() - 1)
        }
        entity.buildingNum = candidate
        entity.text = entity.text.head(cut) + entity.text!!.take(matcher.end())
        return true
    }
    return false
}

// Takes the first match of [pattern] as the building number and removes it from the text.
private fun extractFirstMatchAsBuilding(entity: AddressEntity, pattern: Pattern): Boolean {
    val matcher = pattern.matcher(entity.text)
    if (!matcher.find()) return false
    entity.buildingNum = entity.text!!.take(matcher.start(), matcher.end() - 1)
    entity.text = entity.text.head(matcher.start()) + entity.text!!.take(matcher.end())
    return true
}
342 |
// Removes special characters from the text and from the already extracted building number.
private fun removeSpecialChars(entity: AddressEntity) {
    val rawText = entity.text
    if (rawText.isNullOrBlank()) return

    // 1. Drop the characters of specialChars1 (the original author notes this is 10~20x faster
    //    than replaceAll in simple cases).
    // 2. Drop digit runs via removeRepeatNum(6). TODO (from the original): such runs may be
    //    legitimate; this is a provisional treatment.
    entity.text = rawText.remove(specialChars1).removeRepeatNum(6)

    // Same cleanup for the building number; the second argument to remove is passed through
    // unchanged from the original code.
    val rawBuilding = entity.buildingNum
    if (rawBuilding.isNullOrBlank()) return
    entity.buildingNum = rawBuilding.remove(specialChars1, "-一-_#").removeRepeatNum(6)
}
362 |
// Removes the wrapping characters listed in specialChars2 from the text.
private fun removeBrackets(entity: AddressEntity) {
    val current = entity.text
    if (current.isNullOrBlank()) return
    entity.text = current.remove(specialChars2)
}
368 |
// Collects the content of every bracketed span in the text and removes those spans from it.
// Returns the concatenated inner content, or "" when nothing non-empty was bracketed.
private fun extractBrackets(entity: AddressEntity): String {
    if (entity.text.isNullOrBlank()) return ""

    val matcher = BRACKET_PATTERN.matcher(entity.text)
    val collected = StringBuilder()
    var sawContent = false
    while (matcher.find()) {
        val wrapped = matcher.group("bracket")
        // Only spans with something between the two bracket characters count
        if (wrapped.length > 2) {
            collected.append(wrapped.take(1, wrapped.length - 2))
            sawContent = true
        }
    }
    if (!sawContent) return ""

    // Remove every bracketed span (including empty ones) from the text
    entity.text = matcher.replaceAll("")
    return collected.toString()
}
391 |
392 |
// Extracts the standard administrative regions from the head of the text.
// Copies the visitor's division into the entity, drops the text up to and including the
// visitor's end position, and returns whether the visitor reported a result.
private fun extractRegion(entity: AddressEntity, visitor: TermIndexVisitor): Boolean {
    if (entity.text.isNullOrBlank()) return false

    visitor.reset()
    indexBuilder!!.deepMostQuery(entity.text, visitor)

    val matched = visitor.devision()
    entity.province = matched.province
    entity.city = matched.city
    entity.district = matched.district
    entity.street = matched.street
    entity.town = matched.town
    entity.village = matched.village

    entity.text = entity.text!!.take(visitor.endPosition() + 1)
    return visitor.hasResult()
}
409 |
410 |
// Removes region names that are repeated inside the remaining text and uses them to fill in
// missing district / street / town / village fields of the entity.
// Only runs when the entity already has both a province and a city.
// Returns true when at least one repeated region run was removed from the text.
private fun removeRedundancy(entity: AddressEntity, visitor: TermIndexVisitor): Boolean {
    if (entity.text.isNullOrBlank() || !entity.hasProvince() || !entity.hasCity()) return false

    var removed = false
    // Match province / city / district in a suffix-array fashion: query from every start offset
    // NOTE(review): the initial bound is length - 2 but after a removal it is reset to length
    // (without the - 2) - confirm whether that difference is intended.
    var endIndex = entity.text!!.length - 2
    var i = 0
    while (i < endIndex) {
        visitor.reset()
        indexBuilder!!.deepMostQuery(entity.text, i, visitor)
        if (visitor.matchCount() < 2 || visitor.fullMatchCount() < 1) {
            // Nothing matched, or fewer than 2 regions matched: treated as an invalid match
            i++
            continue
        }
        // The matched province or prefecture-level city differs from the entity's
        if (entity.province!! != visitor.devision().province || entity.city!! != visitor.devision().city) {
            i++
            continue
        }
        // Valid match: feed the result back into the entity
        val devision = visitor.devision()
        // > Repair district information
        if (!entity.hasDistrict() && devision.hasDistrict() && devision.district!!.parentId == entity.city!!.id)
            entity.district = devision.district
        // > Repair street information
        if (entity.hasDistrict() && !entity.hasStreet()
            && devision.hasStreet() && devision.street!!.parentId == entity.district!!.id) {
            entity.street = devision.street
        }
        // > Repair town information
        if (entity.hasDistrict() && !entity.hasTown()
            && devision.hasTown() && devision.town!!.parentId == entity.district!!.id)
            entity.town = devision.town
        else if (entity.hasDistrict() && entity.hasTown() && entity.town!! == entity.street
            && devision.hasTown()
            && devision.town!! != devision.street
            && devision.town!!.parentId == entity.district!!.id)
            entity.town = devision.town
        // > Repair village information
        if (entity.hasDistrict() && !entity.hasVillage() && devision.hasVillage()
            && devision.village!!.parentId == entity.district!!.id)
            entity.village = devision.village

        // Valid match: drop the text up to and including the visitor's end position,
        // then restart the scan from the beginning of the shortened text
        entity.text = entity.text!!.take(visitor.endPosition() + 1)
        endIndex = entity.text!!.length
        i = 0
        removed = true
    }
    return removed
}
462 |
// Extracts the road name (and road number, if any) from the head of the text.
// Returns true when a road is available on the entity afterwards.
private fun extractRoad(entity: AddressEntity): Boolean {
    if (entity.text.isNullOrBlank()) return false
    // Already extracted earlier
    if (!entity.road.isNullOrEmpty()) return true

    val matcher = P_ROAD.matcher(entity.text)
    if (!matcher.find()) return false

    val roadName = matcher.group("road")
    // Optional 甲/乙/丙/丁 prefix followed by the optional numeric part
    val number = (matcher.group("ex") ?: "") + (matcher.group("roadnum") ?: "")
    val remainder = entity.text!!.take(roadName.length + number.length)
    if (remainder.startsWith("小区")) return false

    entity.road = fixRoad(roadName)
    if (number.length == 1) {
        // A single character (the original comment names 甲乙丙丁) cannot serve as a house
        // number, so it is put back in front of the remaining text
        entity.text = number + remainder
    } else {
        entity.roadNum = number
        entity.text = remainder
    }
    // A building number may still be sitting right after the road
    if (entity.buildingNum.isNullOrBlank()) fixRoadBuilding(entity)
    return true
}
492 |
// Collapses a road name that consists of the same text written twice (e.g. "中山路中山路")
// into a single occurrence; any other input is returned unchanged.
private fun fixRoad(road: String): String {
    val half = road.length / 2
    val isDoubled = road.isNotBlank()
        && road.length % 2 == 0
        && road.substring(0, half) == road.substring(half)
    return if (isDoubled) road.substring(0, half) else road
}
504 |
// Handles a building such as "xx号楼" that is still at the very start of the text after road
// extraction: moves it into entity.buildingNum. Returns true when that happened.
private fun fixRoadBuilding(entity: AddressEntity): Boolean {
    if (entity.text.isNullOrBlank()) return false

    val matcher = P_ROAD_BUILDING.matcher(entity.text)
    // Only a match at the very beginning is handled for now
    val matchesAtStart = matcher.find() && matcher.start() == 0
    if (!matchesAtStart) return false

    entity.buildingNum = entity.text!!.take(matcher.start(), matcher.end() - 1)
    entity.text = entity.text.head(matcher.start()) + entity.text!!.take(matcher.end())
    return true
}
517 |
// Extracts town / village information: P_TOWN1 is tried first; if it did not perform a match
// that counts (result < 0), P_TOWN3 is used when a town is already known, otherwise P_TOWN2.
private fun extractTownVillage(addr: AddressEntity) {
    val firstAttempt = extractTownVillage(addr, P_TOWN1, "z", null, "c")
    if (firstAttempt >= 0) return

    when {
        addr.hasTown() -> extractTownVillage(addr, P_TOWN3, null, null, "c")
        else -> extractTownVillage(addr, P_TOWN2, "z", "x", "c")
    }
}
526 |
// Tries to strip a town / township / village name from the head of addr.text using [pattern].
// gz and gx are the names of the capture groups holding town-level names (null = not used).
// NOTE(review): gc is only tested for null; the group actually read is always the literal "c".
// Return value:
//  1: matching was performed and succeeded
// -1: matching was performed but nothing was accepted
//  0: matching was not performed
// NOTE(review): one early exit below returns `ic` (a text offset) rather than one of these codes.
private fun extractTownVillage(addr: AddressEntity, pattern: Pattern, gz: String?, gx: String?, gc: String?): Int {
    if (addr.text.isNullOrBlank() || !addr.hasDistrict()) return 0

    var result = -1
    val matcher = pattern.matcher(addr.text)

    if (matcher.find()) {
        val text = addr.text!!
        // Village candidate and the offset right after it
        var c: String? = if (gc == null) null else matcher.group("c")
        var ic = if (gc == null) -1 else matcher.end("c")

        if (gz != null) {
            val z = matcher.group(gz)
            val iz = matcher.end(gz)
            if (!z.isNullOrBlank()) { // town (镇)
                if (z.length == 2 && text.startsWith("村", z.length)) {
                    // A 2-character match directly followed by 村 is treated as a village name
                    c = z + "村"
                    ic = iz + 1
                } else if (isAcceptableTownFollowingChars(z, text, z.length)) {
                    if (acceptTown(z, addr.district) >= 0) {
                        addr.text = text.take(iz)
                        result = 1
                    }
                }
            }
        }

        if (gx != null) {
            val x = matcher.group(gx)
            val ix = matcher.end(gx)
            // The original comment says 镇 here as well; in P_TOWN2 the "x" group is the 乡 part
            if (!x.isNullOrBlank()) {
                if (x.length == 2 && text.startsWith("村", x.length)) {
                    c = x + "村"
                    ic = ix + 1
                } else if (isAcceptableTownFollowingChars(x, text, x.length)) {
                    if (acceptTown(x, addr.district) >= 0) {
                        addr.text = text.take(ix)
                        result = 1
                    }
                }
            }
        }

        if (!c.isNullOrBlank()) { // village (村)
            if (c.endsWith("农村")) return result
            var leftString = text.take(ic)
            if (c.endsWith("村村")) {
                // Give the trailing 村 back to the remaining text
                c = c.head(c.length - 1)
                leftString = "村" + leftString
            }
            if (leftString.startsWith("委") || leftString.startsWith("民委员")) {
                // The 村 belongs to 村委 / 村民委员..., so it is put back in front of the remaining text
                leftString = "村" + leftString
            }
            // Names of 4+ characters starting with a compass direction are shortened via tail()
            if (c!!.length >= 4 && (c[0] == '东' || c[0] == '西' || c[0] == '南' || c[0] == '北'))
                c = c.tail(c.length - 1)
            if (c!!.length == 2 && !isAcceptableTownFollowingChars(c, leftString, 0)) return ic
            if (acceptTown(c, addr.district) >= 0) {
                addr.text = leftString
                result = 1
            }
        }
    }
    return result
}
595 |
// Decides whether the characters of [text] starting at [start] may follow the matched town
// name [matched]. Returns true when there is nothing after the match.
private fun isAcceptableTownFollowingChars(matched: String, text: String?, start: Int): Boolean {
    if (text == null || start >= text.length) return true

    // A 4-character match must not be followed directly by a region / road suffix character
    if (matched.length == 4) {
        val followedBySuffix = when (text[start]) {
            '区', '县', '乡', '镇', '村', '街', '路' -> true
            else -> false
        }
        if (followedBySuffix) return false
    }

    // Check the two fragments take(start, start + 1) and take(start, start + 2), in that order,
    // against the set of invalid followers
    for (lastIndex in start + 1..start + 2) {
        if (invalidTownFollowings.contains(text.take(start, lastIndex))) return false
    }
    return true
}
609 |
// Decides whether [town] is an acceptable town-level name under [district].
// Return value:
// -1: invalid match
//  0: valid match, nothing needs to be added (already indexed under this district)
//  1: valid match that is not yet indexed under this district
private fun acceptTown(town: String?, district: RegionEntity?): Int {
    if (town.isNullOrBlank() || district == null) return -1
    if (town in invalidTown) return -1

    // Already known to the index as a town / street / village of this district
    val indexed = indexBuilder!!.fullMatch(town)
    if (indexed != null) {
        for (candidate in indexed) {
            val isTownLevel = when (candidate.type) {
                TermType.Town, TermType.Street, TermType.Village -> true
                else -> false
            }
            if (isTownLevel && (candidate.value as RegionEntity).parentId == district.id) return 0
        }
    }

    // Exclude special cases such as 草滩街镇, 西乡街镇
    return if (town.length == 4 && town[2] == '街') -1 else 1
}
634 |
/**
 * Returns the [TermIndexBuilder] used by this interpreter.
 */
override fun getTermIndexBuilder(): TermIndexBuilder = indexBuilder!!
641 | }
642 |
643 |
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/impl/DefaultAddressPersister.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.impl
2 |
3 | import org.bitlap.geocoding.core.AddressPersister
4 | import org.bitlap.geocoding.core.RegionCache
5 | import org.bitlap.geocoding.model.RegionEntity
6 |
7 | /**
8 | * Desc: 地址持久层的操作, 这边暂时只是对标准地址库的处理.
9 | * 暂时不将标准化后的地址存储在数据出中。
10 | * Mail: chk19940609@gmail.com
11 | * Created by IceMimosa
12 | * Date: 2017/1/17
13 | */
open class DefaultAddressPersister(
    // Cache of the standard administrative region data; every operation delegates to it
    private val regionCache: RegionCache
) : AddressPersister {

    /**
     * Returns the root of the administrative region tree.
     */
    override fun getRootRegion(): RegionEntity = regionCache.get()

    /**
     * Looks up a region by [id] in the cache map; null when the id is not present.
     */
    override fun getRegion(id: Long): RegionEntity? = regionCache.getCache()[id]

    /**
     * Adds a new region.
     */
    override fun addRegionEntity(entity: RegionEntity) {
        regionCache.addRegionEntity(entity)
    }

    /**
     * Saves the region data as a new dat file at [path].
     */
    override fun save(path: String) {
        regionCache.save(path)
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/impl/DefaultRegoinCache.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.impl
2 |
3 | import com.google.gson.Gson
4 | import org.bitlap.geocoding.core.RegionCache
5 | import org.bitlap.geocoding.model.RegionEntity
6 | import org.bitlap.geocoding.model.RegionType
7 | import java.io.ByteArrayInputStream
8 | import java.io.ByteArrayOutputStream
9 | import java.io.FileOutputStream
10 | import java.util.*
11 | import java.util.zip.GZIPInputStream
12 | import java.util.zip.GZIPOutputStream
13 | import kotlin.text.Charsets.UTF_8
14 |
15 | /**
16 | * Desc: 默认 [RegionEntity] 获取的缓存类
17 | * 默认从 region.dat 中获取
18 | * Mail: chk19940609@gmail.com
19 | * Created by IceMimosa
20 | * Date: 2017/1/12
21 | */
open class DefaultRegionCache(dataClassPath: String) : RegionCache {

    // Root of the region tree, loaded once in init from the classpath resource
    private val regions: RegionEntity
    // Region id -> region entity, filled while walking the tree
    private val REGION_CACHE = hashMapOf<Long, RegionEntity>()

    init {
        // Load the region data; the resource stream is closed once it has been read
        val stream = this.javaClass.classLoader.getResourceAsStream(dataClassPath)
            ?: throw IllegalArgumentException("Geocoding data file [$dataClassPath] does not exist.")
        val encoded = stream.use { it.readBytes() }
        regions = Gson().fromJson(decode(String(encoded, UTF_8)), RegionEntity::class.java)
            ?: throw IllegalArgumentException("行政规划区域数据加载失败!")

        // Fill the id cache
        REGION_CACHE[regions.id] = regions
        loadChildrenInCache(regions)
    }

    // Recursively registers the children of [parent] in the id cache.
    private fun loadChildrenInCache(parent: RegionEntity?) {
        // Lowest level reached: stop
        if (parent == null || parent.type == RegionType.Street ||
            parent.type == RegionType.Village ||
            parent.type == RegionType.PlatformL4 ||
            parent.type == RegionType.Town) return

        parent.children?.forEach {
            REGION_CACHE[it.id] = it
            this.loadChildrenInCache(it)
        }
    }

    /**
     * Decompresses the data: MIME Base64 decode, then gunzip.
     * The result is decoded as UTF-8, the charset [save] writes, rather than the platform default.
     */
    private fun decode(dat: String): String {
        val compressed = Base64.getMimeDecoder().decode(dat)
        return GZIPInputStream(ByteArrayInputStream(compressed)).use { String(it.readBytes(), UTF_8) }
    }

    /**
     * Returns the root of the region tree.
     */
    override fun get(): RegionEntity {
        return regions
    }

    /**
     * Returns the region map; the key is the region id, the value the region entity.
     */
    override fun getCache(): Map<Long, RegionEntity> {
        return REGION_CACHE
    }

    /**
     * Adds a new region: registers it (and its children) in the cache and appends it to the
     * children of its parent when the parent is cached and has a children list.
     */
    override fun addRegionEntity(entity: RegionEntity) {
        this.loadChildrenInCache(entity)
        this.REGION_CACHE[entity.id] = entity
        this.REGION_CACHE[entity.parentId]?.children?.add(entity)
    }

    /**
     * Saves the region tree as a new dat file at [path]: JSON -> gzip -> MIME Base64.
     */
    override fun save(path: String) {
        val gzip = ByteArrayOutputStream()
        GZIPOutputStream(gzip, 8192).use { gzipos ->
            gzipos.write(Gson().toJson(regions, RegionEntity::class.java).toByteArray(UTF_8))
        }
        val dat = Base64.getMimeEncoder().encode(gzip.toByteArray())
        // Close the file stream even when the write fails
        FileOutputStream(path).use { it.write(dat) }
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/impl/RegionInterpreterVisitor.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.impl
2 |
3 | import org.bitlap.geocoding.core.AddressPersister
4 | import org.bitlap.geocoding.core.TermIndexVisitor
5 | import org.bitlap.geocoding.index.TermIndexEntry
6 | import org.bitlap.geocoding.index.TermIndexItem
7 | import org.bitlap.geocoding.index.TermType
8 | import org.bitlap.geocoding.model.Division
9 | import org.bitlap.geocoding.model.RegionEntity
10 | import org.bitlap.geocoding.model.RegionType.City
11 | import org.bitlap.geocoding.model.RegionType.CityLevelDistrict
12 | import org.bitlap.geocoding.model.RegionType.District
13 | import org.bitlap.geocoding.model.RegionType.PlatformL4
14 | import org.bitlap.geocoding.model.RegionType.Province
15 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity1
16 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity2
17 | import org.bitlap.geocoding.model.RegionType.Street
18 | import org.bitlap.geocoding.model.RegionType.Town
19 | import org.bitlap.geocoding.model.RegionType.Village
20 |
21 | import java.util.ArrayDeque
22 |
23 | /**
24 | * Desc: 基于倒排索引搜索匹配省市区行政区划的访问者
25 | * Mail: chk19940609@gmail.com
26 | * Created by IceMimosa
27 | * Date: 2017/1/12
28 | */
29 | open class RegionInterpreterVisitor (
30 | // 地址持久层对象
31 | val persister: AddressPersister,
32 | val strict: Boolean
33 | ) : TermIndexVisitor {
34 |
// Incremented by startRound()
private var currentLevel = 0
// NOTE(review): the deepMost* fields are not updated in the visible part of the class; presumably
// they snapshot the state of the deepest match seen so far - confirm.
private var deepMostLevel = 0
// End position of the most recently accepted entry (set in visit())
private var currentPos = -1
private var deepMostPos = -1

// Number of accepted entries that were matched by their full name (incremented in visit())
private var fullMatchCount = 0
private var deepMostFullMatchCount = 0

private val deepMostDivision = Division()
// Regions matched so far in the current traversal
private val curDivision = Division()
// Accepted index items, pushed in visit(). The element type is spelled out because
// ArrayDeque() without a type argument cannot be inferred.
private val stack = ArrayDeque<TermIndexItem>()

companion object {
    private val ambiguousChars = mutableListOf('市', '县', '区', '镇', '乡')
}
50 |
/**
 * Starts one round of term matching by moving one level deeper.
 */
override fun startRound() {
    currentLevel += 1
}
57 |
/**
 * Called when an index entry has been matched; decides whether it is an acceptable match.
 * The items under the index entry [entry] always contain one or more indexed objects.
 *
 * @return true if the entry is accepted, false otherwise. For an accepted entry, [endVisit] is
 * called to finish the visit.
 */
override fun visit(entry: TermIndexEntry, text: String, pos: Int): Boolean {
    // Pick the best matching indexed object; without one the entry is rejected
    val accepted = findAcceptableItem(entry, text, pos) ?: return false

    // The accepted item may be of type TermType.Ignore, whose value is not a RegionEntity,
    // so the region may be null here
    val matchedRegion = accepted.value as? RegionEntity

    // Update the current state, in this order:
    // push the accepted item,
    stack.push(accepted)
    // count terms matched by their full name,
    if (isFullMatch(entry, matchedRegion)) {
        fullMatchCount++
    }
    // record where the current match ends,
    currentPos = positioning(matchedRegion, entry, text, pos)
    // and refresh the regions matched so far
    updateCurrentDivisionState(matchedRegion, entry)

    return true
}
81 |
82 | private fun findAcceptableItem(entry: TermIndexEntry, text: String, pos: Int): TermIndexItem? {
83 | var mostPriority = -1
84 | var acceptableItem: TermIndexItem? = null
85 |
86 | // 每个 被索引对象循环,找出最匹配的
87 | loop@ for (item in entry.items) {
88 | // 仅处理省市区类型的 被索引对象,忽略其它类型的
89 | if (!isAcceptableItemType(item.type!!)) continue
90 |
91 | //省市区中的特殊名称
92 | if (item.type == TermType.Ignore) {
93 | if (acceptableItem == null) {
94 | mostPriority = 4
95 | acceptableItem = item
96 | }
97 | continue
98 | }
99 |
100 | val region = item.value as RegionEntity
101 | // 从未匹配上任何一个省市区,则从全部被索引对象中找出一个级别最高的
102 | if (!curDivision.hasProvince()) {
103 |
104 | // 在为匹配上任务省市区情况下, 由于 `xx路` 的xx是某县区/市区/省的别名, 如江苏路, 绍兴路等等, 导致错误的匹配。
105 | // 如 延安路118号, 错误匹配上了延安县
106 | if (!isFullMatch(entry, region) && pos + 1 <= text.length - 1) {
107 | if (region.type == Province
108 | || region.type == City
109 | || region.type == CityLevelDistrict || region.type == District
110 | || region.type == Street || region.type == PlatformL4
111 | || region.type == Town) { // 县区或街道
112 |
113 | // 如果是某某路, 街等
114 | when (text[pos + 1]) {
115 | '路', '街', '巷', '道' -> continue@loop
116 | }
117 | }
118 | }
119 |
120 | if (mostPriority == -1) {
121 | mostPriority = region.type.value
122 | acceptableItem = item
123 | }
124 | if (region.type.value < mostPriority) {
125 | mostPriority = region.type.value
126 | acceptableItem = item
127 | }
128 | continue
129 | }
130 |
131 | // 对于省市区全部匹配, 并且当前term属于非完全匹配的时候
132 | // 需要忽略掉当前term, 以免污染已经匹配的省市区
133 | if (!isFullMatch(entry, region) && hasThreeDivision()) {
134 | when (region.type) {
135 | Province -> {
136 | if (region.id != curDivision.province!!.id) {
137 | continue@loop
138 | }
139 | }
140 | City, CityLevelDistrict -> {
141 | if (region.id != curDivision.city!!.id) {
142 | continue@loop
143 | }
144 | }
145 | District -> {
146 | if (region.id != curDivision.district!!.id) {
147 | continue@loop
148 | }
149 | }
150 | else -> { }
151 | }
152 | }
153 |
154 | // 已经匹配上部分省市区,按下面规则判断最匹配项
155 | // 高优先级的排除情况
156 | if (!isFullMatch(entry, region) && pos + 1 <= text.length - 1) { // 使用别名匹配,并且后面还有一个字符
157 | // 1. 湖南益阳沅江市万子湖乡万子湖村
158 | // 错误匹配方式:提取省市区时,将【万子湖村】中的字符【万子湖】匹配成【万子湖乡】,剩下一个【村】。
159 | // 2. 广东广州白云区均和街新市镇
160 | // 白云区下面有均和街道,街道、乡镇使用别名匹配时,后续字符不能是某些行政区域和道路关键字符
161 | if (region.type == Province
162 | || region.type == City
163 | || region.type in listOf(CityLevelDistrict, District)
164 | || region.type == Street
165 | || region.type == Town) { //街道、乡镇
166 | when (text[pos + 1]) {
167 | '区', '县', '乡', '镇', '村', '街', '路' -> continue@loop
168 | '大' -> if (pos + 2 <= text.length - 1) {
169 | val c = text[pos + 2]
170 | if (c == '街' || c == '道') continue@loop
171 | }
172 | }
173 | }
174 | }
175 |
176 | // 1. 匹配度最高的情况,正好是下一级行政区域
177 | if (region.parentId == curDivision.leastRegion().id) {
178 | acceptableItem = item
179 | break
180 | }
181 |
182 | // 2. 中间缺一级的情况。
183 | if (mostPriority == -1 || mostPriority > 2) {
184 | val parent = persister.getRegion(region.parentId)
185 | // 2.1 缺地级市
186 | if (!curDivision.hasCity() && curDivision.hasProvince() && region.type == District
187 | && curDivision.province!!.id == parent!!.parentId) {
188 | mostPriority = 2
189 | acceptableItem = item
190 | continue
191 | }
192 | // 2.2 缺区县
193 | if (!curDivision.hasDistrict() && curDivision.hasCity()
194 | && (region.type == Street || region.type == Town
195 | || region.type == PlatformL4 || region.type == Village)
196 | && curDivision.city!!.id == parent!!.parentId) {
197 | mostPriority = 2
198 | acceptableItem = item
199 | continue
200 | }
201 | }
202 |
203 | // 3. 地址中省市区重复出现的情况
204 | if (mostPriority == -1 || mostPriority > 3) {
205 | if (curDivision.hasProvince() && curDivision.province!!.id == region.id ||
206 | curDivision.hasCity() && curDivision.city!!.id == region.id ||
207 | curDivision.hasDistrict() && curDivision.district!!.id == region.id ||
208 | curDivision.hasStreet() && curDivision.street!!.id == region.id ||
209 | curDivision.hasTown() && curDivision.town!!.id == region.id ||
210 | curDivision.hasVillage() && curDivision.village!!.id == region.id) {
211 | mostPriority = 3
212 | acceptableItem = item
213 | continue
214 | }
215 | }
216 |
217 | // 4. 容错
218 | if (mostPriority == -1 || mostPriority > 4) {
219 | // 4.1 新疆阿克苏地区阿拉尔市
220 | // 到目前为止,新疆下面仍然有地级市【阿克苏地区】
221 | //【阿拉尔市】是县级市,以前属于地级市【阿克苏地区】,目前已变成新疆的省直辖县级行政区划
222 | // 即,老的行政区划关系为:新疆->阿克苏地区->阿拉尔市
223 | // 新的行政区划关系为:
224 | // 新疆->阿克苏地区
225 | // 新疆->阿拉尔市
226 | // 错误匹配方式:新疆 阿克苏地区 阿拉尔市,会导致在【阿克苏地区】下面无法匹配到【阿拉尔市】
227 | // 正确匹配结果:新疆 阿拉尔市
228 | if (region.type == CityLevelDistrict
229 | && curDivision.hasProvince() && curDivision.province!!.id == region.parentId) {
230 | mostPriority = 4
231 | acceptableItem = item
232 | continue
233 | }
234 | // 4.2 地级市-区县从属关系错误,但区县对应的省份正确,则将使用区县的地级市覆盖已匹配的地级市
235 | // 主要是地级市的管辖范围有调整,或者由于外部系统地级市与区县对应关系有调整导致
236 | if (region.type == District // 必须是普通区县
237 | && curDivision.hasCity() && curDivision.hasProvince()
238 | && isFullMatch(entry, region) // 使用的全名匹配
239 | && curDivision.city!!.id != region.parentId) {
240 | val city = persister.getRegion(region.parentId)!! // 区县的地级市
241 | if (city.parentId == curDivision.province!!.id && !hasThreeDivision()) {
242 | mostPriority = 4
243 | acceptableItem = item
244 | continue
245 | }
246 | }
247 | }
248 |
249 | // 5. 街道、乡镇,且不符合上述情况
250 | if (region.type == Street || region.type == Town
251 | || region.type == Village || region.type == PlatformL4) {
252 | if (!curDivision.hasDistrict()) {
253 | var parent = persister.getRegion(region.parentId) // parent为区县
254 | parent = persister.getRegion(parent!!.parentId) // parent为地级市
255 | if (curDivision.hasCity() && curDivision.city!!.id == parent!!.id) {
256 | mostPriority = 5
257 | acceptableItem = item
258 | continue
259 | }
260 | } else if (region.parentId == curDivision.district!!.id) {
261 | //已经匹配上区县
262 | mostPriority = 5
263 | acceptableItem = item
264 | continue
265 | }
266 | }
267 | }
268 | return acceptableItem
269 | }
270 |
271 |     /** Whether [entry] matched [region] by its full name; an "xx街道" street hit through a key one character shorter ("xx镇" / "xx乡") counts as well. */
272 |     private fun isFullMatch(entry: TermIndexEntry, region: RegionEntity?): Boolean {
273 |         val target = region ?: return false
274 |         val keyLength = entry.key!!.length
275 |         if (keyLength == target.name.length) return true
276 |         return target.type == Street && target.name.endsWith("街道") && target.name.length == keyLength + 1
277 |     }
278 |
279 |     /**
280 |      * Tells whether an index item of the given [type] is one this visitor accepts.
281 |      *
282 |      * Accepted: province, city, district, street, town, village and ignorable terms;
283 |      * every other term type is rejected.
284 |      */
285 |     private fun isAcceptableItemType(type: TermType): Boolean =
286 |         type == TermType.Province || type == TermType.City || type == TermType.District ||
287 |             type == TermType.Street || type == TermType.Town || type == TermType.Village ||
288 |             type == TermType.Ignore
289 |
290 |     /**
291 |      * Whether province, city and district are all matched and form a consistent parent chain
292 |      * (the city hangs under the province and the district hangs under the city).
293 |      */
294 |     private fun hasThreeDivision(): Boolean {
295 |         if (!curDivision.hasProvince() || !curDivision.hasCity() || !curDivision.hasDistrict()) return false
296 |         return curDivision.city!!.parentId == curDivision.province!!.id && curDivision.district!!.parentId == curDivision.city!!.id
297 |     }
298 |
299 |     /**
300 |      * Decides where the text pointer stands after [acceptedRegion] was accepted through [entry].
301 |      *
302 |      * Example: in "山东泰安肥城市桃园镇桃园镇山东省泰安市肥城县桃园镇东伏村" the key "肥城" would match
303 |      * "肥城市" inside "肥城县" and leave a dangling "县" behind.
304 |      *
305 |      * @return `pos + 1` when a non-full-name match of a city, district or street is followed by an
306 |      * ambiguous character that no child region name starts with; otherwise [pos]
307 |      */
308 |     private fun positioning(acceptedRegion: RegionEntity?, entry: TermIndexEntry, text: String, pos: Int): Int {
309 |         val region = acceptedRegion ?: return pos
310 |         val adjustable = region.type == City || region.type == District || region.type == Street
311 |         if (!adjustable || isFullMatch(entry, region) || pos + 1 > text.length - 1) return pos
312 |
313 |         val next = text[pos + 1]
314 |         if (ambiguousChars.contains(next)) {
315 |             // Keep the pointer when the character may start a child region's name, otherwise swallow it.
316 |             val startsChild = (region.children ?: arrayListOf()).any { it.name[0] == next }
317 |             return if (startsChild) pos else pos + 1
318 |         }
319 |         // A town or street is already matched and the next character is not ambiguous: names such as
320 |         // "xx小区" / "xx苑" are often derived from the town name, so deepMostPos is pinned to currentPos.
321 |         if (curDivision.hasTown() || curDivision.hasStreet()) deepMostPos = currentPos
322 |         return pos
323 |     }
324 |
325 |     /**
326 |      * Folds a newly accepted [region] into the division matched so far ([curDivision]).
327 |      *
328 |      * Upper levels that are still missing are looked up through [persister] from the region's
329 |      * parent chain. A region that is already part of [curDivision] is a repeat and is ignored.
330 |      *
331 |      * @param region the accepted region; `null` is ignored
332 |      * @param entry the index entry that produced [region]; in strict mode the city and province are
333 |      * only derived from a street, town or village when the entry holds exactly one item
334 |      */
335 |     private fun updateCurrentDivisionState(region: RegionEntity?, entry: TermIndexEntry) {
336 |         if (region == null) return
337 |         val repeated = region == curDivision.province || region == curDivision.city ||
338 |             region == curDivision.district || region == curDivision.street ||
339 |             region == curDivision.town || region == curDivision.village
340 |         if (repeated) return
341 |         // Non-strict mode, or the entry carries a single item (and therefore a single parent).
342 |         val needUpdateCityAndProvince = !strict || entry.items.size == 1
343 |         when (region.type) {
344 |             Province, ProvinceLevelCity1 -> {
345 |                 // Taking a province also drops the city matched before it.
346 |                 curDivision.province = region
347 |                 curDivision.city = null
348 |             }
349 |             City, ProvinceLevelCity2 -> {
350 |                 curDivision.city = region
351 |                 if (!curDivision.hasProvince()) {
352 |                     curDivision.province = persister.getRegion(region.parentId)
353 |                 }
354 |             }
355 |             CityLevelDistrict -> {
356 |                 // Occupies both the city slot and the district slot.
357 |                 curDivision.city = region
358 |                 curDivision.district = region
359 |                 if (!curDivision.hasProvince()) {
360 |                     curDivision.province = persister.getRegion(region.parentId)
361 |                 }
362 |             }
363 |             District -> {
364 |                 curDivision.district = region
365 |                 // A matched district always forces the city to be the district's own parent.
366 |                 curDivision.city = persister.getRegion(curDivision.district!!.parentId)
367 |                 if (!curDivision.hasProvince()) {
368 |                     curDivision.province = persister.getRegion(curDivision.city!!.parentId)
369 |                 }
370 |             }
371 |             Street, PlatformL4, Town, Village -> {
372 |                 // Fill only the slot of the region's own level, never overwriting an earlier match.
373 |                 when (region.type) {
374 |                     Town -> if (!curDivision.hasTown()) curDivision.town = region
375 |                     Village -> if (!curDivision.hasVillage()) curDivision.village = region
376 |                     else -> if (!curDivision.hasStreet()) curDivision.street = region
377 |                 }
378 |                 if (!curDivision.hasDistrict()) {
379 |                     curDivision.district = persister.getRegion(region.parentId)
380 |                 }
381 |                 if (needUpdateCityAndProvince) updateCityAndProvince(curDivision.district)
382 |             }
383 |             else -> Unit
384 |         }
385 |     }
386 |
387 |     /**
388 |      * Derives the city and, when it is still unknown, the province from the parents of [distinct].
389 |      * Nothing happens when [distinct] is `null` or a city is already matched.
390 |      */
391 |     private fun updateCityAndProvince(distinct: RegionEntity?) {
392 |         if (distinct == null || curDivision.hasCity()) return
393 |         val city = persister.getRegion(distinct.parentId)
394 |         if (city != null && !curDivision.hasProvince()) curDivision.province = persister.getRegion(city.parentId)
395 |         curDivision.city = city
396 |     }
397 |
398 |     /**
399 |      * Current text pointer, i.e. the position reached after [visit] has accepted an index item.
400 |      *
401 |      * @return the current value of `currentPos`
402 |      */
403 |     override fun position(): Int = currentPos
404 |
405 | /**
406 | * Ends the visit of an index entry: pops the visited item off the stack, restores the position pointer and rolls [curDivision] back to what the remaining stack items still justify.
407 | */
408 | override fun endVisit(entry: TermIndexEntry, text: String, pos: Int) {
409 | this.checkDeepMost()
410 |
411 | val indexTerm = stack.pop() // pop the index item that is currently being visited
412 | currentPos = pos - entry.key!!.length // restore the current position pointer
413 | val region = indexTerm.value as? RegionEntity
414 | if (isFullMatch(entry, region)) fullMatchCount++ // update the number of full-name matches
415 | if (indexTerm.type == TermType.Ignore) return // an ignored item never changed the matched province/city/district state
416 |
417 | // Scan the stack once for the street, town and village, plus `least`: meant to be the lowest-level province/city/district (the first such item in stack iteration order)
418 | var least: RegionEntity? = null
419 | var street: RegionEntity? = null
420 | var town: RegionEntity? = null
421 | var village: RegionEntity? = null
422 | stack.forEach {
423 | if (it.type == TermType.Ignore) return@forEach
424 | val r = it.value as RegionEntity
425 | when (r.type) {
426 | Street, PlatformL4 -> {
427 | street = r
428 | return@forEach
429 | }
430 | Town -> {
431 | town = r
432 | return@forEach
433 | }
434 | Village -> {
435 | village = r
436 | return@forEach
437 | }
438 | else -> { }
439 | }
440 | if (least == null) {
441 | least = r
442 | return@forEach
443 | }
444 | }
445 | if (street == null) curDivision.street = null // no street left among the remaining matches
446 | if (town == null) curDivision.town = null // no town left among the remaining matches
447 | if (village == null) curDivision.village = null // no village left among the remaining matches
448 | // Province/city/district are only cleared once street, town and village are all gone
449 | if (curDivision.hasStreet() || curDivision.hasTown() || curDivision.hasVillage()) return
450 | if (least != null) {
451 | when (least!!.type) {
452 | Province, ProvinceLevelCity1 -> {
453 | curDivision.city = null
454 | curDivision.district = null
455 | return
456 | }
457 | City, ProvinceLevelCity2 -> {
458 | curDivision.district = null
459 | return
460 | }
461 | else -> return
462 | }
463 | }
464 | // least == null: the stack holds no province/city/district item any more, so all three are cleared
465 | curDivision.province = null
466 | curDivision.city = null
467 | curDivision.district = null
468 | }
469 |
470 |     /**
471 |      * Closes one round of term matching: records the deepest match if needed, then steps one level up.
472 |      */
473 |     override fun endRound() {
474 |         checkDeepMost()
475 |         currentLevel -= 1
476 |     }
477 |
478 |     /** Records the current state as the best match when the stack is deeper than any depth recorded so far. */
479 |     private fun checkDeepMost() {
480 |         if (stack.size <= deepMostLevel) return
481 |         deepMostLevel = stack.size
482 |         deepMostPos = currentPos
483 |         deepMostFullMatchCount = fullMatchCount
484 |         deepMostDivision.village = curDivision.village
485 |         deepMostDivision.town = curDivision.town
486 |         deepMostDivision.street = curDivision.street
487 |         deepMostDivision.district = curDivision.district
488 |         deepMostDivision.city = curDivision.city
489 |         deepMostDivision.province = curDivision.province
490 |     }
491 |
492 |     /**
493 |      * Whether the traversal produced a usable result: the deepest match must end at a position
494 |      * greater than 0 and must include a district.
495 |      */
496 |     override fun hasResult(): Boolean =
497 |         deepMostPos > 0 && deepMostDivision.hasDistrict()
498 |
499 |     /**
500 |      * The division captured for the deepest match found during the traversal.
501 |      *
502 |      * @return the `deepMostDivision` instance itself, not a copy
503 |      */
504 |     override fun devision(): Division = deepMostDivision
505 |
506 |     /** Depth of the deepest match, i.e. the stack size stored the last time `checkDeepMost` recorded one. */
507 |     override fun matchCount(): Int =
508 |         deepMostLevel
509 |
510 |     /** Full-name match counter as it stood when the deepest match was recorded. */
511 |     override fun fullMatchCount(): Int =
512 |         deepMostFullMatchCount
513 |
514 |     /**
515 |      * End position of the final (deepest) match.
516 |      *
517 |      * @return `deepMostPos`, which [reset] sets to -1 until a match is recorded
518 |      */
519 |     override fun endPosition(): Int = deepMostPos
520 |
521 |     /**
522 |      * Puts the visitor back into its initial state.
523 |      *
524 |      * Levels and full-match counters go back to 0, both positions to -1, and every slot of the
525 |      * best-match division and of the working division is cleared. The stack is not touched here.
526 |      */
527 |     override fun reset() {
528 |         currentLevel = 0
529 |         deepMostLevel = 0
530 |         currentPos = -1
531 |         deepMostPos = -1
532 |         fullMatchCount = 0
533 |         deepMostFullMatchCount = 0
534 |         // The same clearing applies to the best-match division and to the working division.
535 |         for (division in listOf(deepMostDivision, curDivision)) {
536 |             division.province = null
537 |             division.city = null
538 |             division.district = null
539 |             division.street = null
540 |             division.town = null
541 |             division.village = null
542 |         }
543 |     }
544 |
545 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/impl/SimilarityComputer.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.impl
2 |
3 | import org.bitlap.geocoding.core.Computer
4 | import org.bitlap.geocoding.core.segment.AsciiSegmenter
5 | import org.bitlap.geocoding.core.segment.IKAnalyzerSegmenter
6 | import org.bitlap.geocoding.model.Address
7 | import org.bitlap.geocoding.similarity.Document
8 | import org.bitlap.geocoding.similarity.MatchedResult
9 | import org.bitlap.geocoding.similarity.MatchedTerm
10 | import org.bitlap.geocoding.similarity.Term
11 | import org.bitlap.geocoding.similarity.Term.TermType
12 | import org.bitlap.geocoding.similarity.Term.TermType.Building
13 | import org.bitlap.geocoding.similarity.Term.TermType.City
14 | import org.bitlap.geocoding.similarity.Term.TermType.District
15 | import org.bitlap.geocoding.similarity.Term.TermType.Province
16 | import org.bitlap.geocoding.similarity.Term.TermType.Road
17 | import org.bitlap.geocoding.similarity.Term.TermType.RoadNum
18 | import org.bitlap.geocoding.similarity.Term.TermType.Street
19 | import org.bitlap.geocoding.similarity.Term.TermType.Text
20 | import org.bitlap.geocoding.similarity.Term.TermType.Town
21 | import org.bitlap.geocoding.similarity.Term.TermType.Village
22 | import org.bitlap.geocoding.utils.isAsciiChars
23 | import org.bitlap.geocoding.utils.isNumericChars
24 |
25 | /**
26 | * Desc: 相似度算法相关逻辑
27 | *
28 | * * 关于 TF-IDF
29 | * * TC: 词数 Term Count, 某个词在文档中出现的次数
30 | * * TF: 词频 Term Frequency, 某个词在文档中出现的频率. TF = 该词在文档中出现的次数 / 该文档的总词数
31 | * * IDF: 逆文档词频 Inverse Document Frequency. IDF = log( 语料库文档总数 / ( 包含该词的文档数 + 1 ) ). 分母加1是为了防止分母出现0的情况
32 | * * TF-IDF: 词条的特征值, TF-IDF = TF * IDF
33 | *
34 | * Mail: chk19940609@gmail.com
35 | * Created by IceMimosa
36 | * Date: 2017/2/5
37 | */
38 | open class SimilarityComputer : Computer {
39 |
40 | private val segmenter = IKAnalyzerSegmenter() // text的分词, 默认 ik 分词器
41 |
42 | // private val simpleSegmenter = SimpleSegmenter() // 暂时用于处理 building 的分词
43 | private val simpleSegmenter = AsciiSegmenter() // 暂时用于处理 building 的分词
44 |
45 | // 中文数字字符
46 | private val NUMBER_CN = arrayOf('一', '二', '三', '四', '五', '六', '七', '八', '九', '0', '1', '2' ,'3' ,'4' ,'5' ,'6' ,'7' ,'8' ,'9')
47 |
48 | // 权重值常量
49 | private val BOOST_M = 1.0 // 正常权重
50 | private val BOOST_L = 2.0 // 加权高值
51 | private val BOOST_XL = 4.0 // 加权高值
52 | private val BOOST_S = 0.5 // 降权
53 | private val BOOST_XS = 0.25 // 降权
54 |
55 |     /**
56 |      * Converts a standardized address into a [Document] of typed terms that carry an IDF value:
57 |      * 1. segments the text left over after address parsing;
58 |      * 2. turns town (or street), village, road, road number and building number into typed terms;
59 |      * 3. appends the tokens not already covered by those terms as plain text terms;
60 |      * 4. assigns every term its IDF value.
61 |      */
62 |     override fun analyze(address: Address): Document {
63 |         val doc = Document()
64 |
65 |         // 1. Segment the text that remained after address parsing.
66 |         var tokens: List<String> = emptyList()
67 |         if (!address.text.isNullOrBlank()) {
68 |             tokens = segmenter.segment(address.text!!)
69 |         }
70 |
71 |         // 2. Build the typed terms.
72 |         val terms = arrayListOf<Term>()
73 |         // 2.1 town - the street stands in when the address has no town
74 |         val town = if (!address.town.isNullOrBlank()) address.town else address.street
75 |         if (!town.isNullOrBlank()) {
76 |             doc.town = Term(Town, town)
77 |             terms.add(doc.town!!)
78 |         }
79 |         // 2.2 village
80 |         val village = address.village
81 |         if (!village.isNullOrBlank()) {
82 |             doc.village = Term(Village, village)
83 |             terms.add(doc.village!!)
84 |         }
85 |         // 2.3 road
86 |         val road = address.road
87 |         if (!road.isNullOrBlank()) {
88 |             doc.road = Term(Road, road)
89 |             terms.add(doc.road!!)
90 |         }
91 |         // 2.4 road number - keeps its numeric value and a reference to the road term
92 |         val roadNum = address.roadNum
93 |         if (!roadNum.isNullOrBlank()) {
94 |             val roadNumTerm = Term(RoadNum, roadNum)
95 |             doc.roadNum = roadNumTerm
96 |             doc.roadNumValue = translateRoadNum(roadNum)
97 |             roadNumTerm.ref = doc.road
98 |             terms.add(doc.roadNum!!)
99 |         }
100 |         // 2.5 building number - may yield several terms
101 |         val buildingNum = address.buildingNum
102 |         if (!buildingNum.isNullOrBlank()) {
103 |             translateBuilding(buildingNum).forEach {
104 |                 terms.add(Term(Building, it))
105 |             }
106 |         }
107 |
108 |         // 3. Append the tokens that are not already covered by a typed term
109 |         //    (nor equal to the town, village or road text).
110 |         val termTexts = terms.map(Term::text)
111 |         tokens.forEach {
112 |             if (!termTexts.contains(it) && town != it && village != it && road != it) {
113 |                 terms.add(Term(Text, it))
114 |             }
115 |         }
116 |
117 |         // 4. Set the IDF of every term. TF-IDF carries little meaning for address
118 |         //    similarity, so putIdfs only assigns fixed values.
119 |         putIdfs(terms)
120 |
121 |         doc.terms = terms
122 |         return doc
123 |     }
124 |
125 |     /**
126 |      * Computes the similarity of two standardized addresses.
127 |      *
128 |      * Both addresses are turned into documents with [analyze], compared in both directions,
129 |      * and the result with the lower similarity is returned (the second direction on a tie).
130 |      *
131 |      * @return a fresh [MatchedResult] when either address is `null` or when their province,
132 |      * city or district ids differ
133 |      */
134 |     override fun compute(addr1: Address?, addr2: Address?): MatchedResult {
135 |         if (addr1 == null || addr2 == null) return MatchedResult()
136 |
137 |         // Addresses outside the same province / city / district are never the same address.
138 |         val sameDivision = addr1.provinceId == addr2.provinceId &&
139 |             addr1.cityId == addr2.cityId &&
140 |             addr1.districtId == addr2.districtId
141 |         if (!sameDivision) return MatchedResult()
142 |
143 |         // Build the term documents of both addresses.
144 |         val doc1 = analyze(addr1)
145 |         val doc2 = analyze(addr2)
146 |
147 |         // computeSimilarity iterates the terms of its first argument only, so run it both ways.
148 |         val forward = computeSimilarity(doc1, doc2)
149 |         val backward = computeSimilarity(doc2, doc1)
150 |
151 |         // For now, report the smaller of the two results.
152 |         return if (forward.similarity < backward.similarity) forward else backward
153 |     }
154 |
155 |
156 |     /**
157 |      * Extracts the number contained in a road/house number such as "40号" or "一号院".
158 |      *
159 |      * ASCII digits, full-width digits (U+FF10..U+FF19) and the Chinese numerals 一..九 are
160 |      * collected in order of appearance; every other character is skipped. "十" is expanded by
161 |      * context: "十" -> 10, "十三" -> 13, "二十" -> 20, "二十三" -> 23.
162 |      *
163 |      * @return the parsed number, or 0 when [roadNum] is blank, holds no digit, or the collected
164 |      * digits do not fit into an [Int]
165 |      */
166 |     private fun translateRoadNum(roadNum: String?): Int {
167 |         if (roadNum.isNullOrBlank()) return 0
168 |
169 |         // Value of a single digit character, or -1 when the character is not a digit.
170 |         fun digitOf(c: Char): Int = when (c) {
171 |             in '0'..'9' -> c - '0'
172 |             in '\uFF10'..'\uFF19' -> c - '\uFF10' // full-width 0-9, written as escapes on purpose
173 |             else -> {
174 |                 val index = "一二三四五六七八九".indexOf(c)
175 |                 if (index >= 0) index + 1 else -1
176 |             }
177 |         }
178 |
179 |         val digits = StringBuilder()
180 |         var pendingTen = false // the previous character was "十" and has not been expanded yet
181 |         for (c in roadNum) {
182 |             val value = digitOf(c)
183 |             if (pendingTen) {
184 |                 val hasTens = digits.isNotEmpty() // "二十…" as opposed to a leading "十…"
185 |                 val hasUnits = value >= 0 // "…十三" as opposed to "…十号"
186 |                 when {
187 |                     hasTens && !hasUnits -> digits.append('0')
188 |                     !hasTens && hasUnits -> digits.append('1')
189 |                     !hasTens && !hasUnits -> digits.append("10")
190 |                 }
191 |                 pendingTen = false
192 |             }
193 |             if (c == '十') {
194 |                 pendingTen = true
195 |             } else if (value >= 0) {
196 |                 digits.append(value)
197 |             }
198 |         }
199 |         // A trailing "十" closes the number: "二十" -> 20, "十" -> 10.
200 |         if (pendingTen) digits.append(if (digits.isNotEmpty()) "0" else "10")
201 |
202 |         // toIntOrNull covers the empty buffer as well as digit runs too long for an Int,
203 |         // which would make Integer.parseInt throw.
204 |         return digits.toString().toIntOrNull() ?: 0
205 |     }
224 |
225 |     /**
226 |      * Splits a building number into its digit and letter tokens using [simpleSegmenter].
227 |      * Unlike a road number, a building number may contain several numbers.
228 |      */
229 |     private fun translateBuilding(building: String?): List<String> {
230 |         if (building.isNullOrBlank()) return emptyList()
231 |         return simpleSegmenter.segment(building)
232 |     }
233 |
234 |     /**
235 |      * Assigns an IDF value to every term of [terms].
236 |      *
237 |      * Simplified implementation (TODO: no corpus statistics are collected yet): a term whose
238 |      * text satisfies `isNumericChars()` or `isAsciiChars()` gets 2.0, any other term gets 4.0.
239 |      */
240 |     private fun putIdfs(terms: List<Term>) {
241 |         for (term in terms) {
242 |             val key = term.text
243 |             // With corpus statistics this would be Math.log(docs / (tdocs + 1)).
244 |             val numericOrAscii = key.isNumericChars() || key.isAsciiChars()
245 |             term.idf = if (numericOrAscii) 2.0 else 4.0
246 |         }
247 |     }
248 |
249 | /**
250 | * Computes the cosine similarity of two documents; only the terms of [doc1] are iterated, each one looked up in [doc2].
251 | */
252 | private fun computeSimilarity(doc1: Document, doc2: Document): MatchedResult {
253 |
254 | // 1. Match rate of the Text-type terms
255 | var qTextTermCount = 0 // number of Text terms in doc1
256 | var dTextTermMatchCount = 0 // number of those that are matched by a Text term of doc2
257 | // Index span inside doc2.terms that is covered by the matched terms
258 | var matchStart = -1
259 | var matchEnd = -1
260 | for (term1 in doc1.terms ?: emptyList()) {
261 | if (term1.type != TermType.Text) continue
262 | qTextTermCount++
263 | for ((i, term2) in (doc2.terms ?: emptyList()).withIndex()) {
264 | if (term2.type != TermType.Text) continue
265 | if (term1.text == term2.text) {
266 | dTextTermMatchCount++
267 | if (matchStart == -1) {
268 | matchEnd = i
269 | matchStart = matchEnd
270 | break
271 | }
272 | if (i > matchEnd)
273 | matchEnd = i
274 | else if (i < matchStart)
275 | matchStart = i
276 | break
277 | }
278 | }
279 | }
280 |
281 | // 1.1 Match rate (coordination factor)
282 | var termCoord = 1.0
283 | if (qTextTermCount > 0) {
284 | // Math.sqrt( matched term count / number of Text terms in doc1 ) * 0.5 + 0.5
285 | termCoord = Math.sqrt(dTextTermMatchCount * 1.0 / qTextTermCount) * 0.5 + 0.5
286 | }
287 | // 1.2 Density of the matches
288 | var termDensity = 1.0
289 | if (qTextTermCount >= 2 && dTextTermMatchCount >= 2) {
290 | // Math.sqrt( matched term count / span between the matched terms in doc2 ) * 0.5 + 0.5
291 | termDensity = Math.sqrt(dTextTermMatchCount * 1.0 / (matchEnd - matchStart + 1)) * 0.5 + 0.5
292 | }
293 |
294 | // 2. Compute the (non-standard) TF-IDF values and the intermediate sums of the cosine similarity
295 | val result = MatchedResult()
296 | result.doc1 = doc1
297 | result.doc2 = doc2
298 |
299 | // Intermediate sums of the cosine similarity
300 | var sumQD = 0.0
301 | var sumQQ = 0.0
302 | var sumDD = 0.0
303 | for (qterm in doc1.terms ?: emptyList()) {
304 | val qboost = getBoostValue(false, doc1, qterm, doc2, null)
305 | val q_TF_IDF = qboost * qterm.idf!!
306 | // The corresponding term of doc2
307 | var dterm = doc2.getTerm(qterm.text)
308 | if (dterm == null && RoadNum == qterm.type) {
309 | // Fall back to doc2's road-number term when both documents refer to the same road
310 | if (doc2.roadNum != null && doc2.road != null && doc2.road == qterm.ref)
311 | dterm = doc2.roadNum
312 | }
313 |
314 | val dboost = if (dterm == null) 0.0 else getBoostValue(true, doc1, qterm, doc2, dterm)
315 | val coord = if (dterm != null && Text == dterm.type) termCoord else 1.0
316 | val density = if (dterm != null && Text == dterm.type) termDensity else 1.0
317 | val d_TF_IDF = (if (dterm != null) dterm.idf else qterm.idf)!! * dboost * coord * density
318 |
319 | // Record the matched term and its factors (-1.0 marks density/coord as not applicable for non-Text terms)
320 | if (dterm != null) {
321 | val matchedTerm = MatchedTerm(dterm)
322 | matchedTerm.boost = dboost
323 | matchedTerm.tfidf = d_TF_IDF
324 | if (Text == dterm.type) {
325 | matchedTerm.density = density
326 | matchedTerm.coord = coord
327 | } else {
328 | matchedTerm.density = -1.0
329 | matchedTerm.coord = -1.0
330 | }
331 | result.terms.add(matchedTerm)
332 | }
333 |
334 | sumQQ += q_TF_IDF * q_TF_IDF
335 | sumQD += q_TF_IDF * d_TF_IDF
336 | sumDD += d_TF_IDF * d_TF_IDF
337 | }
338 |
339 | if (sumDD == 0.0 || sumQQ == 0.0) return result
340 |
341 | // Cosine similarity
342 | result.similarity = sumQD / Math.sqrt(sumQQ * sumDD)
343 |
344 | return result
345 | }
346 |
347 | /**
348 | * Returns the weight (boost) to apply to a term, depending on its type.
349 | * [forDoc]
350 | * > true: the weight is computed for [ddoc]; [qdoc], [qterm], [ddoc] and [dterm] are all non-null
351 | * > false: the weight is computed for [qdoc]; [qdoc], [qterm] and [ddoc] are non-null, [dterm] is null
352 | */
353 | private fun getBoostValue(forDoc: Boolean, qdoc: Document, qterm: Term, ddoc: Document, dterm: Term?): Double {
354 |
355 | val termType = if (forDoc) dterm!!.type else qterm.type
356 | // The weight
357 | var boost = BOOST_M
358 | when (termType) {
359 | // Province, city and district occur very often, so their IDF is low, yet they matter most: give them a high boost
360 | Province, City, District -> boost = BOOST_XL
361 | // People have a vague notion of urban street (sub-district) boundaries and tend to pick one arbitrarily: lower the weight
362 | Street -> boost = BOOST_XS
363 | // Town and village
364 | Town, Village -> {
365 | boost = BOOST_XS
366 | // Town
367 | if (Town == termType) {
368 | // Both documents have a town: boost the town. Note that the towns may be equal or different.
369 | // > Same town: the query document and the address-library document both get BOOST_L, which raises the similarity
370 | // > Different towns: only the query term gets BOOST_L, the library term cannot be matched and never reaches this function. This widens the similarity gap
371 | if (qdoc.town != null && ddoc.town != null) boost = BOOST_L
372 | }
373 | // Village
374 | else {
375 | // Boost the village when both documents have a village and the query document has a town
376 | // As with towns above, the villages may be equal or different
377 | if (qdoc.village != null && ddoc.village != null && qdoc.town != null) {
378 | if (qdoc.town == ddoc.town) { // same town
379 | if (qdoc.village == ddoc.village) boost = BOOST_XL
380 | else boost = BOOST_L
381 | } else if (ddoc.town != null) { // different towns
382 | if (!forDoc) boost = BOOST_L
383 | else boost = BOOST_S
384 | }
385 | }
386 | }
387 | }
388 | // Road information
389 | Road, RoadNum, Building -> {
390 | // When both a town and a village are present, road and road number are not boosted any further
391 | if (qdoc.town == null || qdoc.village == null) {
392 | // Road
393 | if (Road == termType) {
394 | if (qdoc.road != null && ddoc.road != null) boost = BOOST_L
395 | }
396 | // Road number. Note: the road numbers of both the query document and the library document pass through here, unlike Road, Town and Village.
397 | // TODO: building currently shares the road-number weighting; it should get its own handling later
398 | else {
399 | if (qdoc.roadNumValue > 0 && ddoc.roadNumValue > 0 && qdoc.road != null && qdoc.road == ddoc.road) {
400 | if (qdoc.roadNumValue == ddoc.roadNumValue)
401 | boost = 3.0
402 | else
403 | boost = if (forDoc)
404 | 1 / Math.sqrt(Math.sqrt((Math.abs(qdoc.roadNumValue - ddoc.roadNumValue) + 1).toDouble())) * BOOST_L
405 | else
406 | 3.0
407 | }
408 | }
409 | }
410 | }
411 | Text -> boost = BOOST_M
412 | else -> boost = BOOST_M
413 | }
414 |
415 | return boost
416 | }
417 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/segment/AsciiSegmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.segment
2 |
3 | /**
4 |  * Desc: Simple segmenter that keeps only runs of consecutive digits or English letters,
5 |  * each run as one token; every other character is dropped.
6 |  * Mail: chk19940609@gmail.com
7 |  * Created by IceMimosa
8 |  * Date: 2017/2/28
9 |  */
10 | class AsciiSegmenter : SimpleSegmenter() {
11 |
12 |     /**
13 |      * Segments [text], discarding everything that is not a digit or an English letter.
14 |      */
15 |     override fun segment(text: String): List<String> {
16 |         return super.segment(text, true)
17 |     }
18 |
19 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/segment/IKAnalyzerSegmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.segment
2 |
3 | import org.bitlap.geocoding.core.Segmenter
4 | import org.wltea.analyzer.core.IKSegmenter
5 | import org.wltea.analyzer.core.Lexeme
6 | import java.io.StringReader
7 |
8 | /**
9 |  * Desc: Segmenter backed by the IK analyzer.
10 |  * Mail: chk19940609@gmail.com
11 |  * Created by IceMimosa
12 |  * Date: 2017/2/12
13 |  */
14 | class IKAnalyzerSegmenter : Segmenter {
15 |
16 |     /**
17 |      * Segments [text] with IK and returns the lexeme texts in the order IK emits them.
18 |      */
19 |     override fun segment(text: String): List<String> {
20 |         val segs = arrayListOf<String>()
21 |         // `use` closes the reader even when IK throws while tokenizing.
22 |         StringReader(text).use { reader ->
23 |             val ik = IKSegmenter(reader, true) // true: IK's smart segmentation mode
24 |             var lexeme: Lexeme? = ik.next()
25 |             while (lexeme != null) {
26 |                 segs.add(lexeme.lexemeText)
27 |                 lexeme = ik.next()
28 |             }
29 |         }
30 |         return segs
31 |     }
32 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/segment/SimpleSegmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.segment
2 |
3 | import org.bitlap.geocoding.core.Segmenter
4 | import org.bitlap.geocoding.utils.take
5 |
6 | /**
7 |  * Desc: Simple segmenter: every character is its own token, except that a run of consecutive digits or of consecutive English letters forms one token.
8 |  * Mail: chk19940609@gmail.com
9 |  * Created by IceMimosa
10 |  * Date: 2017/2/6
11 |  */
12 | open class SimpleSegmenter : Segmenter {
13 |
14 |     /**
15 |      * Segments [text], keeping every character (non-alphanumeric ones as single-character tokens).
16 |      */
17 |     override fun segment(text: String): List<String> {
18 |         return segment(text, false)
19 |     }
20 |
21 |     /**
22 |      * Segments [text]; when [remove] is `true` only the digit runs and English-letter runs are kept.
23 |      */
24 |     protected fun segment(text: String, remove: Boolean): List<String> {
25 |         val segs = arrayListOf<String>()
26 |         if (text.isBlank()) {
27 |             return segs
28 |         }
29 |         var digitNum = 0 // length of the digit run that ends right before the current index
30 |         var ansiCharNum = 0 // length of the letter run that ends right before the current index
31 |         for (i in text.indices) {
32 |             val c = text[i]
33 |             // Digit: extends the digit run
34 |             if (c in '0'..'9') {
35 |                 // a pending letter run ends here: cut it out
36 |                 if (ansiCharNum > 0) {
37 |                     segs.add(text.take(i - ansiCharNum, i - 1))
38 |                     ansiCharNum = 0
39 |                 }
40 |                 digitNum++
41 |                 continue
42 |             }
43 |             // Letter: extends the letter run
44 |             if (c in 'A'..'Z' || c in 'a'..'z') {
45 |                 // a pending digit run ends here: cut it out
46 |                 if (digitNum > 0) {
47 |                     segs.add(text.take(i - digitNum, i - 1))
48 |                     digitNum = 0
49 |                 }
50 |                 ansiCharNum++
51 |                 continue
52 |             }
53 |             // Any other character ends the pending run (at most one of the two counters is non-zero)
54 |             if (digitNum > 0 || ansiCharNum > 0) {
55 |                 segs.add(text.take(i - digitNum - ansiCharNum, i - 1))
56 |                 ansiCharNum = 0
57 |                 digitNum = 0
58 |             }
59 |             if (!remove) segs.add(c.toString())
60 |         }
61 |         // Cut out the run that reaches the end of the text (at most one counter is non-zero)
62 |         if (digitNum > 0 || ansiCharNum > 0) {
63 |             segs.add(text.take(text.length - digitNum - ansiCharNum))
64 |             // The counters are deliberately not reset here:
65 |             // nothing reads them after this point.
66 |         }
67 |         return segs
68 |     }
69 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/segment/SmartCNSegmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.segment
2 |
3 | import org.bitlap.geocoding.core.Segmenter
4 | //import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer
5 | //import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
6 |
7 | /**
8 |  * Desc: Segmenter for Lucene's SmartCN analyzer; the Lucene calls are commented out, so no tokens are produced.
9 |  * Mail: chk19940609@gmail.com
10 |  * Created by IceMimosa
11 |  * Date: 2017/2/6
12 |  */
13 | open class SmartCNSegmenter : Segmenter {
14 |
15 | //    private val ANALYZER = SmartChineseAnalyzer()
16 |
17 |     /**
18 |      * Returns an empty list for any [text] while the SmartCN tokenization below stays commented out.
19 |      */
20 |     override fun segment(text: String): List<String> {
21 |         val segs = arrayListOf<String>()
22 |         // Disabled tokenization:
23 | //        val ts = ANALYZER.tokenStream("text", text)
24 | //        ts.reset()
25 | //        while (ts.incrementToken()) {
26 | //            val attr = ts.getAttribute(CharTermAttribute::class.java)
27 | //            segs.add(attr.toString())
28 | //        }
29 | //        ts.end()
30 | //        ts.close()
31 |         return segs
32 |     }
33 |
34 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/core/segment/WordSegmenter.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.core.segment
2 |
3 | import org.bitlap.geocoding.core.Segmenter
4 |
5 | /**
6 |  * Desc: Segmenter for the "word" library, @see https://github.com/ysc/word; the library call is commented out, so no tokens are produced.
7 |  * Mail: chk19940609@gmail.com
8 |  * Created by IceMimosa
9 |  * Date: 2017/2/6
10 |  */
11 | open class WordSegmenter : Segmenter {
12 |
13 |     /**
14 |      * Returns an empty list for any [text] while the segmentation call below stays commented out.
15 |      */
16 |     override fun segment(text: String): List<String> {
17 |         val segs = arrayListOf<String>()
18 |         // Disabled: segmentation with stop words removed
19 | //        WordSegmenter.segWithStopWords(text).forEach {
20 | //            segs.add(it.text)
21 | //        }
22 |         return segs
23 |     }
24 |
25 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/index/TermIndexBuilder.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.index
2 |
3 | import org.bitlap.geocoding.core.TermIndexVisitor
4 | import org.bitlap.geocoding.model.RegionEntity
5 | import org.bitlap.geocoding.model.RegionType.City
6 | import org.bitlap.geocoding.model.RegionType.CityLevelDistrict
7 | import org.bitlap.geocoding.model.RegionType.Country
8 | import org.bitlap.geocoding.model.RegionType.District
9 | import org.bitlap.geocoding.model.RegionType.PlatformL4
10 | import org.bitlap.geocoding.model.RegionType.Province
11 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity1
12 | import org.bitlap.geocoding.model.RegionType.ProvinceLevelCity2
13 | import org.bitlap.geocoding.model.RegionType.Street
14 | import org.bitlap.geocoding.model.RegionType.Town
15 | import org.bitlap.geocoding.model.RegionType.Village
16 | import org.bitlap.geocoding.utils.head
17 |
18 | /**
19 | * Desc: 行政区划建立倒排索引
20 | * Mail: chk19940609@gmail.com
21 | * Created by IceMimosa
22 | * Date: 2017/1/17
23 | */
24 | open class TermIndexBuilder(
25 | rootRegion: RegionEntity,
26 | ignoringRegionNames: List
27 | ) {
28 |
29 | private val indexRoot = TermIndexEntry()
30 |
31 |     init { // index every region below the root first, then the names to ignore
32 |         indexRegions(rootRegion.children ?: emptyList())
33 |         indexIgnoring(ignoringRegionNames)
34 |     }
35 |
36 |     /**
37 |      * Builds the inverted index for administrative regions (the standard address library),
38 |      * descending recursively into the children of every region.
39 |      * @param replace passed to every `buildIndex` call, for the children as well
40 |      */
41 |     @Synchronized
42 |     fun indexRegions(regions: List<RegionEntity>, replace: Boolean = false) {
43 |         for (region in regions) {
44 |             val indexItem = TermIndexItem(convertRegionType(region), region)
45 |             for (alias in region.orderedNames ?: emptyList()) {
46 |                 indexRoot.buildIndex(alias, 0, indexItem, replace)
47 |             }
48 |
49 |             // Generated aliases: "xx街道" -> "xx镇" and "xx乡"; "xx镇" -> "xx乡"; "xx乡" -> "xx镇".
50 |             val rName = region.name
51 |             var autoAlias = rName.length <= 5 && region.alias.isEmpty() &&
52 |                 (region.isTown() || rName.endsWith("街道"))
53 |             if (autoAlias && rName.length == 5) {
54 |                 when (rName[2]) { // five-character names with one of these as third character get no alias
55 |                     '路', '街', '门', '镇', '村', '区' -> autoAlias = false
56 |                 }
57 |             }
58 |             if (autoAlias) {
59 |                 // Drop one trailing character for towns, two ("街道") otherwise.
60 |                 val suffixLength = if (region.isTown()) 1 else 2
61 |                 val shortName = rName.head(rName.length - suffixLength) ?: ""
62 |                 if (shortName.length >= 2) {
63 |                     indexRoot.buildIndex(shortName, 0, indexItem, replace)
64 |                 }
65 |                 if (rName.endsWith("街道") || rName.endsWith("镇")) {
66 |                     indexRoot.buildIndex(shortName + "乡", 0, indexItem, replace)
67 |                 }
68 |                 if (rName.endsWith("街道") || rName.endsWith("乡")) {
69 |                     indexRoot.buildIndex(shortName + "镇", 0, indexItem, replace)
70 |                 }
71 |             }
72 |
73 |             // Recurse with the same `replace` flag so that a replacing re-index covers the subtree too.
74 |             val children = region.children
75 |             if (children != null && children.isNotEmpty()) {
76 |                 indexRegions(children, replace)
77 |             }
78 |         }
79 |     }
80 |
81 | /**
82 | * 为忽略列表建立倒排索引
83 | */
84 | @Synchronized
85 | fun indexIgnoring(ignoringRegionNames: List, replace: Boolean = false) {
86 | if (ignoringRegionNames.isEmpty()) return
87 | for (ignore in ignoringRegionNames) {
88 | indexRoot.buildIndex(ignore, 0, TermIndexItem(TermType.Ignore, null), replace)
89 | }
90 | }
91 |
92 | // 获取 region 的类型
93 | private fun convertRegionType(region: RegionEntity): TermType =
94 | when (region.type) {
95 | Country -> TermType.Country
96 | Province, ProvinceLevelCity1 -> TermType.Province
97 | City, ProvinceLevelCity2 -> TermType.City
98 | District, CityLevelDistrict -> TermType.District
99 | PlatformL4 -> TermType.Street
100 | Town -> TermType.Town
101 | Village -> TermType.Village
102 | Street -> if (region.isTown()) TermType.Town else TermType.Street
103 | else -> TermType.Undefined
104 | }
105 |
106 | /**
107 | * 深度优先匹配词条
108 | */
109 | fun deepMostQuery(text: String?, visitor: TermIndexVisitor) {
110 | if (text == null || text.isEmpty()) return
111 | // 判断是否有中国开头
112 | var p = 0
113 | if (text.startsWith("中国") || text.startsWith("天朝")) {
114 | p += 2
115 | }
116 | this.deepMostQuery(text, p, visitor)
117 | }
118 |
119 | fun deepMostQuery(text: String?, pos: Int, visitor: TermIndexVisitor) {
120 | if (text == null || text.isEmpty()) return
121 | // 开始匹配
122 | visitor.startRound()
123 | this.deepFirstQueryRound(text, pos, indexRoot.children ?: emptyMap(), visitor)
124 | visitor.endRound()
125 | }
126 |
127 | private fun deepFirstQueryRound(text: String, pos: Int, entries: Map, visitor: TermIndexVisitor) {
128 | // 获取索引对象
129 | if (pos > text.length - 1) return
130 | val entry = entries[text[pos]] ?: return
131 |
132 | if (entry.children != null && pos + 1 <= text.length - 1) {
133 | this.deepFirstQueryRound(text, pos + 1, entry.children ?: emptyMap(), visitor)
134 | }
135 | if (entry.hasItem()) {
136 | if (visitor.visit(entry, text, pos)) {
137 | // 给访问者一个调整当前指针的机会
138 | val p = visitor.position()
139 | if (p + 1 <= text.length - 1) {
140 | deepMostQuery(text, p + 1, visitor)
141 | }
142 | visitor.endVisit(entry, text, p)
143 | }
144 | }
145 | }
146 |
147 | fun fullMatch(text: String?): List? {
148 | if (text == null || text.isEmpty()) return null
149 | return fullMatch(text, 0, indexRoot.children)
150 | }
151 |
152 | private fun fullMatch(text: String, pos: Int, entries: Map?): List? {
153 | if (entries == null) return null
154 | val c = text[pos]
155 | val entry = entries[c] ?: return null
156 | if (pos == text.length - 1) return entry.items
157 | return fullMatch(text, pos + 1, entry.children)
158 | }
159 | }
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/index/TermIndexEntry.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.index
2 |
3 | import org.bitlap.geocoding.utils.head
4 |
/**
 * Desc: A node of the term index trie
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
open class TermIndexEntry {
    // Key of this node: the prefix of the indexed text that leads here
    var key: String? = null
    // Index items stored at this node
    var items = mutableListOf<TermIndexItem>()
    // Child nodes, keyed by the next character
    var children = hashMapOf<Char, TermIndexEntry>()

    /** Appends [item] to this node and returns the node itself. */
    fun addItem(item: TermIndexItem): TermIndexEntry = apply { items.add(item) }

    /** True when at least one item is stored at this node. */
    fun hasItem(): Boolean = items.isNotEmpty()

    /**
     * Inserts [item] under the characters of [text] from index [pos] to the end,
     * creating missing nodes on the way.
     *
     * When [replace] is true and the item carries a region, existing items at the final
     * node whose region matches via `equalsWithoutId` are removed before the item is added.
     * Null/blank text or an out-of-range [pos] is ignored.
     */
    fun buildIndex(text: String?, pos: Int, item: TermIndexItem, replace: Boolean) {
        if (text.isNullOrBlank() || pos < 0 || pos >= text.length) return

        // Walk (and extend) the trie one character at a time instead of recursing.
        var node = this
        for (i in pos until text.length) {
            node = node.children.getOrPut(text[i]) {
                TermIndexEntry().also { created -> created.key = text.head(i + 1) }
            }
        }

        // `node` is now the entry of the last character.
        if (replace && item.value != null) {
            node.items.removeIf { existing -> item.value.equalsWithoutId(existing.value) }
        }
        node.addItem(item)
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/index/TermIndexItem.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.index
2 |
3 | import org.bitlap.geocoding.model.RegionEntity
4 |
/**
 * Desc: An item stored in the term index
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/16
 */
data class TermIndexItem(
    // Kind of term this item represents
    val type: TermType,
    // Region attached to the item; may be null
    val value: RegionEntity?
)
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/index/TermType.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.index
2 |
3 |
/**
 * Desc: Type of a term.
 *   An address is not strictly structured text, but its format follows some rules, e.g.
 *   province/city/district, road/house number, community/building/room.
 *   The term type marks which part of the address a term belongs to; it is mainly used to
 *   weight the different parts differently when computing similarity.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
enum class TermType(val type: Char) {

    Undefined('0'),
    // Country
    Country('C'),
    // Province
    Province('1'),
    // Prefecture-level city
    City('2'),
    // District / county
    District('3'),
    // Street (sub-district)
    Street('4'),
    // Town / township
    Town('T'),
    // Village
    Village('V'),
    // Road
    Road('R'),
    // House number
    RoadNum('N'),
    // Other address text
    Text('X'),
    // Ignored term
    Ignore('I');

    /**
     * Returns the constant whose [TermType.type] code equals [type], or [Undefined] when
     * no constant matches. The receiver is not used; kept for existing callers.
     * Prefer [fromType], which does not need an instance.
     */
    fun toEnum(type: Char): TermType = fromType(type)

    companion object {
        /**
         * Returns the constant whose code equals [type], or [Undefined] when none matches.
         */
        fun fromType(type: Char): TermType =
            TermType.values().firstOrNull { it.type == type } ?: TermType.Undefined
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/model/Address.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.model
2 |
3 | import java.io.Serializable
4 |
/**
 * Desc: Address entity
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/18
 */
open class Address : Serializable {

    // Province
    var provinceId: Long? = null
    var province: String? = null
    // City
    var cityId: Long? = null
    var city: String? = null
    // District
    var districtId: Long? = null
    var district: String? = null
    // Street
    var streetId: Long? = null
    var street: String? = null
    // Town
    var townId: Long? = null
    var town: String? = null
    // Village
    var villageId: Long? = null
    var village: String? = null
    // Road
    var road: String? = null
    // Road number
    var roadNum: String? = null
    // Building information
    var buildingNum: String? = null
    // Remaining address text that was not parsed into any field above
    var text: String? = null

    companion object {
        /**
         * Builds an [Address] from [entity].
         *
         * @return null when [entity] is null or has no province; otherwise an address whose
         *         ids and names are copied from the entity's regions
         */
        fun build(entity: AddressEntity?): Address? {
            if (entity == null || !entity.hasProvince()) return null
            return Address().apply {
                provinceId = entity.province?.id
                province = entity.province?.name
                cityId = entity.city?.id
                city = entity.city?.name
                districtId = entity.district?.id
                district = entity.district?.name
                streetId = entity.street?.id
                street = entity.street?.name
                townId = entity.town?.id
                town = entity.town?.name
                villageId = entity.village?.id
                village = entity.village?.name
                road = entity.road
                roadNum = entity.roadNum
                buildingNum = entity.buildingNum
                text = entity.text
            }
        }
    }

    constructor()

    constructor(provinceId: Long?, province: String?, cityId: Long?, city: String?, districtId: Long?, district: String?, streetId: Long?, street: String?, townId: Long?, town: String?, villageId: Long?, village: String?, road: String?, roadNum: String?, buildingNum: String?, text: String?) {
        this.provinceId = provinceId
        this.province = province
        this.cityId = cityId
        this.city = city
        this.districtId = districtId
        this.district = district
        this.streetId = streetId
        this.street = street
        this.townId = townId
        this.town = town
        this.villageId = villageId
        this.village = village
        this.road = road
        this.roadNum = roadNum
        this.buildingNum = buildingNum
        this.text = text
    }

    /** Multi-line rendering: one tab-indented line per field group. */
    override fun toString(): String =
        listOf(
            "provinceId=$provinceId, province=$province",
            "cityId=$cityId, city=$city",
            "districtId=$districtId, district=$district",
            "streetId=$streetId, street=$street",
            "townId=$townId, town=$town",
            "villageId=$villageId, village=$village",
            "road=$road",
            "roadNum=$roadNum",
            "buildingNum=$buildingNum",
            "text=$text"
        ).joinToString(separator = ", \n\t", prefix = "Address(\n\t", postfix = "\n)")

    /** Two addresses are equal when all sixteen fields are equal. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is Address) return false
        return provinceId == other.provinceId &&
            province == other.province &&
            cityId == other.cityId &&
            city == other.city &&
            districtId == other.districtId &&
            district == other.district &&
            streetId == other.streetId &&
            street == other.street &&
            townId == other.townId &&
            town == other.town &&
            villageId == other.villageId &&
            village == other.village &&
            road == other.road &&
            roadNum == other.roadNum &&
            buildingNum == other.buildingNum &&
            text == other.text
    }

    /** 31-based hash over all sixteen fields, in declaration order; null counts as 0. */
    override fun hashCode(): Int =
        listOf<Any?>(
            provinceId, province,
            cityId, city,
            districtId, district,
            streetId, street,
            townId, town,
            villageId, village,
            road, roadNum, buildingNum, text
        ).fold(0) { acc, part -> 31 * acc + (part?.hashCode() ?: 0) }

}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/model/AddressEntity.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.model
2 |
3 | import java.io.Serializable
4 |
/**
 * Desc: Standard address entity
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/17
 */
open class AddressEntity() : Division(), Serializable {

    /**
     * Address text left over after parsing.
     * Assigned values are trimmed; assigning null stores an empty string.
     */
    var text: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed road information.
     * Assigned values are trimmed; assigning null stores an empty string.
     */
    var road: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed road number.
     * Assigned values are trimmed; assigning null stores an empty string.
     */
    var roadNum: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Parsed building information.
     * Assigned values are trimmed; assigning null stores an empty string.
     */
    var buildingNum: String? = null
        set(value) {
            field = value?.trim() ?: ""
        }

    /**
     * Hash of the source address, reserved for uniqueness handling.
     */
    var hash: Int? = null

    /**
     * The original source address, kept as-is.
     */
    var address: String? = null

    /** Creates an entity whose source address and remaining text both start as [address]. */
    constructor(address: String?) : this() {
        this.address = address
        this.text = address
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/model/Division.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.model
2 |
3 | import org.bitlap.geocoding.model.RegionType.PlatformL4
4 | import org.bitlap.geocoding.model.RegionType.Street
5 | import org.bitlap.geocoding.model.RegionType.Town
6 |
/**
 * Desc: Administrative division entity
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/13
 */
open class Division {

    // Province
    var province: RegionEntity? = null
    // City
    var city: RegionEntity? = null
    // District
    var district: RegionEntity? = null
    // Street
    var street: RegionEntity? = null
    // Town.
    // Reading falls back to `street` when no town is stored and the street is itself a town.
    // Writing stores Town-typed regions here, routes Street/PlatformL4-typed regions to
    // `street`, and ignores null and every other type.
    var town: RegionEntity? = null
        get() = field ?: street?.takeIf { it.isTown() }
        set(value) {
            when (value?.type) {
                Town -> field = value
                Street, PlatformL4 -> street = value
                else -> {
                }
            }
        }
    // Village
    var village: RegionEntity? = null


    fun hasProvince(): Boolean = province != null
    fun hasCity(): Boolean = city != null
    fun hasDistrict(): Boolean = district != null
    fun hasStreet(): Boolean = street != null
    fun hasTown(): Boolean = town != null
    fun hasVillage(): Boolean = village != null

    /**
     * Returns the lowest-level region that is set, checking village, town, street,
     * district, city and finally province. Throws when none of them is set.
     */
    fun leastRegion(): RegionEntity =
        village ?: town ?: street ?: district ?: city ?: province!!
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/model/RegionEntity.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.model
2 |
3 | import org.bitlap.geocoding.model.RegionType.Street
4 | import org.bitlap.geocoding.model.RegionType.Town
5 | import java.io.Serializable
6 | import java.util.*
7 |
/**
 * Desc: Region entity, 4-level standard address library (region.dat from Taobao, JD)
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
open class RegionEntity : Serializable {

    var id: Long = 0
    var parentId: Long = 0
    var name: String = ""
    var alias = ""
    var type: RegionType = RegionType.Undefined
    var zip = ""
    var children: ArrayList<RegionEntity>? = null
    // Name plus aliases; built on first read (under this object's lock) unless a value
    // was assigned explicitly.
    var orderedNames: List<String>? = null
        get() = synchronized(this) {
            field ?: buildOrderedNames().also { field = it }
        }

    // Collects the name and the non-blank ';'-separated aliases, longest first.
    // When the alias is blank the result is just the name.
    private fun buildOrderedNames(): List<String> {
        val names = mutableListOf(name)
        if (alias.isBlank()) return names
        alias.split(";").filterTo(names) { it.isNotBlank() }
        // Longest names first
        names.sortByDescending { it.length }
        return names
    }

    /**
     * Whether this region is a town: always for type Town; for type Street only when the
     * name is non-blank, at most 4 characters long and ends with '镇' or '乡'.
     */
    fun isTown(): Boolean = when (type) {
        Town -> true
        Street -> name.isNotBlank() && name.length <= 4 && (name.endsWith('镇') || name.endsWith('乡'))
        else -> false
    }


    /** Equal only to another instance of exactly this class with the same id. */
    override fun equals(other: Any?): Boolean =
        other != null &&
            other.javaClass == RegionEntity::class.java &&
            id == (other as RegionEntity).id

    override fun hashCode(): Int = id.hashCode()

    /** Compares parentId, name, alias, type and zip, ignoring the id. */
    fun equalsWithoutId(other: Any?): Boolean {
        if (other == null || other.javaClass != RegionEntity::class.java) return false
        val that = other as RegionEntity
        return parentId == that.parentId &&
            name == that.name &&
            alias == that.alias &&
            type == that.type &&
            zip == that.zip
    }

    override fun toString(): String =
        "RegionEntity(id=$id, parentId=$parentId, name='$name', alias='$alias', type=$type, zip='$zip')"
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/model/RegionType.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.model
2 |
/**
 * Desc: Region type
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/12
 */
enum class RegionType(val value: Int) {
    /** Undefined region type */
    Undefined(0),

    /** Country */
    Country(10),

    /** Province */
    Province(100),

    /** Municipality, at the same level as a province */
    ProvinceLevelCity1(150),

    /** Municipality, at the same level as a city */
    ProvinceLevelCity2(151),

    /** Prefecture-level city */
    City(200),

    /** County-level city directly under a province */
    CityLevelDistrict(250),

    /** County / district */
    District(300),

    /** Street / town level */
    Street(450),

    /** Platform-specific level-4 address */
    PlatformL4(460),

    /** Additional: town */
    Town(400),

    /** Additional: village */
    Village(410)
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/similarity/Document.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.similarity
2 |
3 | import java.io.Serializable
4 |
/**
 * Desc: Document object
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/5
 */
open class Document : Serializable {

    // All terms of the document, in document order, not de-duplicated
    var terms: List<Term>? = null
    // Lookup cache Term.text -> Term, built on the first getTerm() call
    var termsMap: HashMap<String, Term>? = null

    // Town-related terms
    var town: Term? = null
    var village: Term? = null

    // Road information
    var road: Term? = null
    var roadNum: Term? = null
    var roadNumValue = 0

    /**
     * Returns the term whose text equals [text].
     *
     * The lookup cache is built once from [terms]; terms with null or blank text are
     * skipped, and for duplicate texts the last term wins.
     *
     * @return the matching term, or null when there are no terms or no term has that text
     */
    fun getTerm(text: String?): Term? {
        val allTerms = this.terms
        if (allTerms == null || allTerms.isEmpty()) return null

        val cache = this.termsMap ?: synchronized(this) {
            // Re-check under the lock: another thread may have built the cache meanwhile.
            this.termsMap ?: HashMap<String, Term>().also { built ->
                for (term in allTerms) {
                    val key = term.text
                    if (!key.isNullOrBlank()) {
                        built[key] = term
                    }
                }
                // Publish only after the map is fully populated, so a caller taking the
                // unsynchronized fast path never sees a half-filled cache.
                this.termsMap = built
            }
        }
        return cache[text]
    }

    override fun toString(): String {
        return "Document(terms=$terms, town=$town, village=$village, road=$road, roadNum=$roadNum, roadNumValue=$roadNumValue)"
    }

}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/similarity/MatchedResult.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.similarity
2 |
3 | import java.io.Serializable
4 |
/**
 * Desc: Result of a similarity match
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/7
 */
open class MatchedResult : Serializable {

    // Documents analysed from the two addresses
    var doc1: Document? = null
    var doc2: Document? = null

    // Matched term information
    var terms: ArrayList<MatchedTerm> = arrayListOf()

    // Similarity value
    var similarity = 0.0

    /** Multi-line rendering: one tab-indented line per field. */
    override fun toString(): String =
        listOf("doc1=$doc1", "doc2=$doc2", "terms=$terms", "similarity=$similarity")
            .joinToString(separator = ", \n\t", prefix = "MatchedResult(\n\t", postfix = "\n)")
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/similarity/MatchedTerm.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.similarity
2 |
3 | import java.io.Serializable
4 |
/**
 * Desc: Information about a matched term
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/7
 */
open class MatchedTerm(term: Term) : Serializable {

    // The matched term
    var term: Term? = term

    // Match rate
    var coord: Double = 0.0

    // Density
    var density: Double = 0.0

    // Weight
    var boost: Double = 0.0

    // Feature value (TF-IDF)
    var tfidf: Double = 0.0
}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/similarity/Term.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.similarity
2 |
3 | import org.bitlap.geocoding.similarity.Term.TermType.City
4 | import org.bitlap.geocoding.similarity.Term.TermType.District
5 | import org.bitlap.geocoding.similarity.Term.TermType.Ignore
6 | import org.bitlap.geocoding.similarity.Term.TermType.Province
7 | import org.bitlap.geocoding.similarity.Term.TermType.Street
8 | import org.bitlap.geocoding.similarity.Term.TermType.Town
9 | import java.io.Serializable
10 |
/**
 * Desc: Term
 * Mail: chk19940609@gmail.com
 * Created by
 * Date: 2017/2/5
 */
open class Term : Serializable {
    // Term content
    var text: String? = null

    // Term type
    var type: TermType? = null

    // Inverse Document Frequency.
    // Reads as a fixed 0.0 for Province/City/District and 1.0 for Street; every other
    // type (Town, Village, Road, RoadNum, Text, ...) returns the stored value.
    var idf: Double? = null
        get() = when (type) {
            Province, City, District -> 0.0
            Street -> 1.0
            else -> field
        }

    // Reference to a related term
    var ref: Term? = null

    /**
     * Creates a term of the given [type].
     * For Province, City, District, Street, Town and Ignore a non-null [text] is interned;
     * for every other type it is stored as given.
     */
    constructor(type: TermType, text: String?) {
        this.type = type
        this.text = when (type) {
            Province, City, District, Street, Town, Ignore -> text?.intern()
            else -> text
        }
    }

    /** Equal only to another instance of exactly this class with the same text. */
    override fun equals(other: Any?): Boolean {
        if (other == null || other.javaClass != Term::class.java) return false
        return this.text == (other as Term).text
    }

    override fun hashCode(): Int = this.text?.hashCode() ?: 0

    override fun toString(): String = "Term($text)"


    // Term type, mainly used to weight each part of an address
    enum class TermType(val value: Char) {
        Undefined('0'),
        // Province
        Province('1'),
        // Prefecture-level city
        City('2'),
        // District / county
        District('3'),
        // Street (sub-district)
        Street('4'),
        // Town / township
        Town('T'),
        // Village
        Village('V'),
        // Road
        Road('R'),
        // House number
        RoadNum('N'),
        // Building number
        Building('B'),
        // Other address text
        Text('X'),
        // Ignored term
        Ignore('I');
    }


}
--------------------------------------------------------------------------------
/src/main/java/org/bitlap/geocoding/utils/StringHelper.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.utils
2 |
3 | /**
4 | * Desc: String 一些帮助类
5 | * Mail: chk19940609@gmail.com
6 | * Created by IceMimosa
7 | * Date: 2017/1/17
8 | */
9 |
10 |
/**
 * Returns the first [length] characters of the string, with lenient bounds:
 * a null or blank receiver, or one no longer than [length], is returned unchanged,
 * and a non-positive [length] yields "".
 */
fun String?.head(length: Int): String? {
    if (this.isNullOrBlank()) return this
    return when {
        this.length <= length -> this
        length <= 0 -> ""
        else -> this.substring(0, length)
    }
}
20 |
/**
 * Returns the last [length] characters of the string, with lenient bounds:
 * a null or blank receiver, or one no longer than [length], is returned unchanged,
 * and a non-positive [length] yields "".
 */
fun String?.tail(length: Int): String? {
    if (this.isNullOrBlank()) return this
    return when {
        this.length <= length -> this
        length <= 0 -> ""
        else -> this.substring(this.length - length)
    }
}
30 |
/**
 * Returns the substring starting at [begin] (inclusive), with lenient bounds:
 * a blank receiver or a non-positive [begin] returns the receiver unchanged,
 * and a [begin] past the last index yields "".
 */
fun String.take(begin: Int): String = when {
    this.isBlank() || begin <= 0 -> this
    begin >= this.length -> ""
    else -> this.substring(begin)
}
/**
 * Returns the substring from [begin] to [end], both inclusive, with lenient bounds:
 * [begin] is clamped to 0 and [end] to the last index. A blank receiver is returned
 * unchanged; an empty range yields "".
 */
fun String.take(begin: Int, end: Int): String {
    if (this.isBlank()) return this
    val from = maxOf(begin, 0)
    val to = minOf(end, this.length - 1)
    return when {
        from > to -> ""
        // The clamped range covers the whole string: hand back the receiver itself.
        from == 0 && to == this.length - 1 -> this
        else -> this.substring(from, to + 1)
    }
}
53 |
/**
 * Removes every character that occurs in [array], except characters that also occur
 * in [exclude]. A blank receiver or an empty [array] returns the receiver unchanged.
 */
@JvmOverloads
fun String.remove(array: CharArray, exclude: String = ""): String {
    if (this.isBlank() || array.isEmpty()) return this
    val kept = this.filterNot { ch -> ch in array && ch !in exclude }
    // Nothing was dropped: hand back the receiver itself.
    return if (kept.length == this.length) this else kept
}
72 |
/**
 * Removes every run of consecutive digits that is [length] or more characters long;
 * shorter digit runs and all non-digit characters are kept.
 * A blank receiver, or one shorter than [length], is returned unchanged.
 * [length] : minimum run length that gets removed
 */
fun String.removeRepeatNum(length: Int): String {
    if (this.isBlank() || this.length < length) return this
    val out = StringBuilder(this.length)
    // Index at which the current digit run started, or -1 when not inside a run.
    var runStart = -1
    for ((idx, ch) in this.withIndex()) {
        if (ch in '0'..'9') {
            if (runStart < 0) runStart = idx
            continue
        }
        if (runStart >= 0) {
            // The run just ended; keep it only when it is shorter than the threshold.
            if (idx - runStart < length) {
                out.append(this, runStart, idx)
            }
            runStart = -1
        }
        out.append(ch)
    }
    // A digit run that reaches the end of the string.
    if (runStart >= 0 && this.length - runStart < length) {
        out.append(this, runStart, this.length)
    }
    return out.toString()
}
99 |
/**
 * Whether the string consists solely of the digits '0'..'9'.
 * Null and blank strings are not numeric.
 */
fun String?.isNumericChars(): Boolean =
    !this.isNullOrBlank() && this.all { it in '0'..'9' }
109 |
/**
 * Whether the string consists solely of ASCII letters ('a'..'z', 'A'..'Z').
 * Null and blank strings do not qualify.
 */
fun String?.isAsciiChars(): Boolean =
    !this.isNullOrBlank() && this.all { it in 'a'..'z' || it in 'A'..'Z' }
119 |
--------------------------------------------------------------------------------
/src/main/resources/IKAnalyzer.cfg.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | IK Analyzer扩展配置
5 | dic/region.dic;dic/community.dic
6 | dic/stop.dic;
7 |
--------------------------------------------------------------------------------
/src/main/resources/dic/stop.dic:
--------------------------------------------------------------------------------
1 | 到了
2 | 联系
3 | 附近
4 | 街上
5 | 街
6 | 省
7 | 市
8 | 区
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/resources/word.local.conf:
--------------------------------------------------------------------------------
1 | dic.path=classpath:dic/region.dic,classpath:dic/community.dic
2 | stopwords.path=classpath:dic/stop.dic
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestCustomDatSave.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.model.Address
4 | import org.bitlap.geocoding.model.RegionType
5 | import org.junit.Test
6 | import kotlin.test.assertEquals
7 |
/**
 * Desc: Tests saving a customised region data file
 * Mail: blvyoucan@163.com
 * Created by hechongbin
 * Date: 2023/6/7
 */
class TestCustomDatSave {

    /**
     * Normalizes an address before and after registering the custom district "临平区",
     * then saves the dictionary and checks that a fresh instance loaded from the saved
     * file gives the same result.
     *
     * All assertions pass the expected value first, as kotlin.test's assertEquals
     * requires, so failure messages report expected/actual the right way round.
     */
    @Test
    fun saveAndNomalizing() {
        val geocoding = GeocodingX("region_2021.dat")
        val address = "浙江省杭州市临平区经济开发区新颜路22号501D"

        // Before the custom district "临平区" is added
        assertEquals(
            Address(
                330000000000, "浙江省",
                330100000000, "杭州市",
                330110000000, "余杭区",
                330110001000, "临平街道",
                null, null,
                null, null,
                null, null,
                "501",
                "区经济开发区新颜路22号D"
            ),
            geocoding.normalizing(address)
        )

        // Add the custom district "临平区"
        geocoding.addRegionEntry(330113000000, 330100000000, "临平区", RegionType.District, "", true)

        val addNew = Address(
            330000000000, "浙江省",
            330100000000, "杭州市",
            330113000000, "临平区",
            null, null,
            null, null,
            null, null,
            "新颜路", "22号",
            "501",
            "D"
        )

        assertEquals(addNew, geocoding.normalizing(address))

        // Save the custom dictionary file now that "临平区" has been added
        val filename = "mydata.dat"
        val filePath = "${this.javaClass.classLoader.getResource("").path}/$filename"
        geocoding.save(filePath)

        // Load the custom dictionary file that contains "临平区"
        val myGeocoding = GeocodingX(filename)
        assertEquals(addNew, myGeocoding.normalizing(address))
    }

}
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestNormalizing.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.model.Address
4 | import org.junit.Test
5 | import java.sql.DriverManager
6 | import kotlin.test.assertEquals
7 |
8 | /**
9 | * Desc: 测试地址标准化
10 | * Mail: chk19940609@gmail.com
11 | * Created by IceMimosa
12 | * Date: 2017/1/18
13 | */
14 | class TestNormalizing {
15 |
16 | @Test
17 | fun testNormalizing() {
18 | assertEquals(
19 | Geocoding.normalizing("江苏泰州兴化市昌荣镇【康琴网吧】 (昌荣镇附近)"),
20 | Address(
21 | 320000000000, "江苏省",
22 | 321200000000, "泰州市",
23 | 321281000000, "兴化市",
24 | 321281119000, "昌荣镇",
25 | 321281119000, "昌荣镇",
26 | null, null,
27 | null, null,
28 | null,
29 | "康琴网吧昌荣镇附近"
30 | )
31 | )
32 | assertEquals(
33 | Geocoding.normalizing("中国山东临沂兰山区小埠东社区居委会【绿杨榭公寓31-1-101 】 (绿杨榭公寓附近)"),
34 | Address(
35 | 370000000000, "山东省",
36 | 371300000000, "临沂市",
37 | 371302000000, "兰山区",
38 | null, null,
39 | null, null,
40 | null, null,
41 | null, null,
42 | "31-1-101",
43 | "小埠东社区居委会绿杨榭公寓绿杨榭公寓附近"
44 | )
45 | )
46 | assertEquals(
47 | Geocoding.normalizing("抚顺顺城区将军桥【将军水泥厂住宅4-1-102】 (将军桥附近)"),
48 | Address(
49 | 210000000000, "辽宁省",
50 | 210400000000, "抚顺市",
51 | 210411000000, "顺城区",
52 | null, null,
53 | null, null,
54 | null, null,
55 | null,
56 | null,
57 | "4-1-102",
58 | "将军桥将军水泥厂住宅将军桥附近"
59 | )
60 | )
61 | assertEquals(
62 | Geocoding.normalizing("中国辽宁沈阳辽中县北一路【虹桥商厦西行100米】(邮政储蓄银行北一路支行附近)"),
63 | Address(
64 | 210000000000, "辽宁省",
65 | 210100000000, "沈阳市",
66 | 210122000000, "辽中县",
67 | null, null,
68 | null, null,
69 | null, null,
70 | "北一路",
71 | "",
72 | null,
73 | "虹桥商厦西行100米邮政储蓄银行北一路支行附近"
74 | )
75 | )
76 | assertEquals(
77 | Geocoding.normalizing("辽宁 沈阳 辽中县中国辽宁 沈阳 辽中县虹桥商厦苏宁易购"),
78 | Address(
79 | 210000000000, "辽宁省",
80 | 210100000000, "沈阳市",
81 | 210122000000, "辽中县",
82 | null, null,
83 | null, null,
84 | null, null,
85 | null,
86 | null,
87 | null,
88 | "虹桥商厦苏宁易购"
89 | )
90 | )
91 | assertEquals(
92 | Geocoding.normalizing("辽宁沈阳于洪区沈阳市辽中县县城虹桥商厦西侧三单元外跨楼梯3-2-23-"),
93 | Address(
94 | 210000000000, "辽宁省",
95 | 210100000000, "沈阳市",
96 | 210114000000, "于洪区",
97 | null, null,
98 | null, null,
99 | null, null,
100 | null,
101 | null,
102 | "3-2-23",
103 | "辽中县县城虹桥商厦西侧三单元外跨楼梯"
104 | )
105 | )
106 | assertEquals(
107 | Geocoding.normalizing("山东济宁任城区金宇路【杨柳国际新城K8栋3单元1302】(杨柳国际新城·丽宫附近)"),
108 | Address(
109 | 370000000000, "山东省",
110 | 370800000000, "济宁市",
111 | 370811000000, "任城区",
112 | null, null,
113 | null, null,
114 | null, null,
115 | "金宇路",
116 | "",
117 | "K8栋3单元1302",
118 | "杨柳国际新城杨柳国际新城丽宫附近"
119 | )
120 | )
121 | assertEquals(
122 | Geocoding.normalizing("上海宝山区杨行镇宝山区江杨北路98号农贸批发市场蔬菜二区7通道A16号"),
123 | Address(
124 | 310000000000, "上海",
125 | 310100000000, "上海市",
126 | 310113000000, "宝山区",
127 | 310113103000, "杨行镇",
128 | 310113103000, "杨行镇",
129 | null, null,
130 | "江杨北路",
131 | "98号",
132 | "7通道A16号",
133 | "农贸批发市场蔬菜二区"
134 | )
135 | )
136 | assertEquals(
137 | Geocoding.normalizing("上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)"),
138 | Address(
139 | 310000000000, "上海",
140 | 310100000000, "上海市",
141 | 310113000000, "宝山区",
142 | null, null,
143 | null, null,
144 | null, null,
145 | "新沪路",
146 | "58弄",
147 | "11-802",
148 | "水韵华庭水韵华庭附近"
149 | )
150 | )
151 | // 精确度缺失
152 | assertEquals(
153 | Geocoding.normalizing("赤城街道赤城大厦10E"),
154 | Address(
155 | 330000000000, "浙江省",
156 | 331000000000, "台州市",
157 | 331023000000, "天台县",
158 | 331023001000, "赤城街道",
159 | null, null,
160 | null, null,
161 | null,
162 | null,
163 | null,
164 | "大厦10E"
165 | )
166 | )
167 | assertEquals(
168 | Geocoding.normalizing("上海黄浦区内环以内黄浦区小东门聚奎街43号"),
169 | Address(
170 | 310000000000, "上海",
171 | 310100000000, "上海市",
172 | 310101000000, "黄浦区",
173 | null, null,
174 | null, null,
175 | null, null,
176 | "小东门聚奎街",
177 | "43号",
178 | null,
179 | ""
180 | )
181 | )
182 | assertEquals(
183 | Geocoding.normalizing("河南信阳平桥区王岗镇【镇上】 (王岗乡(大杨墩)附近)"),
184 | Address(
185 | 410000000000, "河南省",
186 | 411500000000, "信阳市",
187 | 411503000000, "平桥区",
188 | 411503209000, "王岗乡",
189 | 411503209000, "王岗乡",
190 | null, null,
191 | null,
192 | null,
193 | null,
194 | "附近镇上王岗乡大杨墩"
195 | )
196 | )
197 | // fix 若干电话号码
198 | assertEquals(
199 | Geocoding.normalizing("四川自贡贡井区四川省自贡市贡井区莲花镇四川自贡贡井区莲花镇黄桷村7组22号13298213121/15609000090/18681337139"),
200 | Address(
201 | 510000000000, "四川省",
202 | 510300000000, "自贡市",
203 | 510303000000, "贡井区",
204 | 510303107000, "莲花镇",
205 | 510303107000, "莲花镇",
206 | null, null,
207 | null,
208 | null,
209 | "7组22号",
210 | "黄桷村"
211 | )
212 | )
213 | // fix 大云小区, 大云是镇名称的情
214 | assertEquals(
215 | Geocoding.normalizing("浙江嘉兴嘉善县浙江省嘉兴市嘉善县大云镇大云镇大云小区公寓楼1号302室"),
216 | Address(
217 | 330000000000, "浙江省",
218 | 330400000000, "嘉兴市",
219 | 330421000000, "嘉善县",
220 | 330421102000, "大云镇",
221 | 330421102000, "大云镇",
222 | null, null,
223 | null,
224 | null,
225 | "1号302室",
226 | "大云小区公寓楼"
227 | )
228 | )
229 | // fix xx路xx号楼
230 | assertEquals(
231 | Geocoding.normalizing("辽宁沈阳铁西区中国辽宁沈阳沈阳市铁西区南十一西路12号楼472 (第九医院(沈阳)附近)"),
232 | Address(
233 | 210000000000, "辽宁省",
234 | 210100000000, "沈阳市",
235 | 210106000000, "铁西区",
236 | null, null,
237 | null, null,
238 | null, null,
239 | "南十一西路",
240 | "",
241 | "12号楼472",
242 | "附近第九医院沈阳"
243 | )
244 | )
245 | assertEquals(
246 | Geocoding.normalizing("重庆重庆渝北区重庆渝北区两路镇双龙西路236号5-4(交警12支队红绿灯路口渝达商务宾馆楼上5-4)"),
247 | Address(
248 | 500000000000, "重庆",
249 | 500100000000, "重庆市",
250 | 500112000000, "渝北区",
251 | 500112016000, "两路街道",
252 | null, null,
253 | null, null,
254 | "双龙西路",
255 | "236号",
256 | "5-4",
257 | "交警12支队红绿灯路口渝达商务宾馆楼上54"
258 | )
259 | )
260 | assertEquals(
261 | Geocoding.normalizing("山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303"),
262 | Address(
263 | 370000000000, "山东省",
264 | 370200000000, "青岛市",
265 | 370203000000, "市北区",
266 | 370203030000, "水清沟街道",
267 | null, null,
268 | null, null,
269 | "九江路",
270 | "20号",
271 | "3号楼2单元1303",
272 | "大都会"
273 | )
274 | )
275 | assertEquals(
276 | Geocoding.normalizing("中国山东青岛城阳区湘潭路【华胥美邦 到了联系20-1-1402】 (中铁华胥美邦附近)"),
277 | Address(
278 | 370000000000, "山东省",
279 | 370200000000, "青岛市",
280 | 370214000000, "城阳区",
281 | null, null,
282 | null, null,
283 | null, null,
284 | "湘潭路",
285 | "",
286 | "20-1-1402",
287 | "华胥美邦到了联系中铁华胥美邦附近"
288 | )
289 | )
290 | assertEquals(
291 | Geocoding.normalizing("辽宁沈阳沈河区辽宁沈阳市沈河区一环内会武街56号4-3-2"),
292 | Address(
293 | 210000000000, "辽宁省",
294 | 210100000000, "沈阳市",
295 | 210103000000, "沈河区",
296 | null, null,
297 | null, null,
298 | null, null,
299 | "一环内会武街",
300 | "56号",
301 | "4-3-2",
302 | ""
303 | )
304 | )
305 | // fix 辣鸡数据
306 | assertEquals(Geocoding.normalizing("1008中国"), null)
307 | // fix 3层/楼
308 | assertEquals(
309 | Geocoding.normalizing("清徐县中国山西太原清徐县清徐县人民医院附近苹果社区2号楼1单元3层"),
310 | Address(
311 | 140000000000, "山西省",
312 | 140100000000, "太原市",
313 | 140121000000, "清徐县",
314 | null, null,
315 | null, null,
316 | null, null,
317 | null,
318 | null,
319 | "2号楼1单元3层",
320 | "人民医院附近苹果社区"
321 | )
322 | )
323 | // fix 3组
324 | assertEquals(
325 | Geocoding.normalizing("辽宁辽阳宏伟区辽宁省辽阳市宏伟区新村街道龙鼎山小区B区08栋3组401号"),
326 | Address(
327 | 210000000000, "辽宁省",
328 | 211000000000, "辽阳市",
329 | 211004000000, "宏伟区",
330 | 211004003000, "新村街道",
331 | null, null,
332 | null, null,
333 | null,
334 | null,
335 | "08栋3组401号",
336 | "龙鼎山小区B区"
337 | )
338 | )
339 | // fix 3门
340 | assertEquals(
341 | Geocoding.normalizing("北京北京市西城区 白纸坊街道右安门内西街甲10号院11楼3门501"),
342 | Address(
343 | 110000000000, "北京",
344 | 110100000000, "北京市",
345 | 110102000000, "西城区",
346 | 110102019000, "白纸坊街道",
347 | null, null,
348 | null, null,
349 | "右安门内西街",
350 | "甲10号院",
351 | "11楼3门501",
352 | ""
353 | )
354 | )
355 | // fix 延川是县区的情况, 不能将延川路识别成延川县
356 | assertEquals(Geocoding.normalizing("延川路116号绿城城园东区7号楼2单元802户"), null)
357 | // fix 绍兴路匹配上绍兴市的情况
358 | assertEquals(Geocoding.normalizing("绍兴路59号速递易"), null)
359 | // fix 同上, 不能识别成金水区
360 | assertEquals(Geocoding.normalizing("金水路751号1号楼3单元501"), null)
361 | assertEquals(
362 | Geocoding.normalizing("中国上海上海宝山区 顾村镇菊太路777弄24号602室"),
363 | Address(
364 | 310000000000, "上海",
365 | 310100000000, "上海市",
366 | 310113000000, "宝山区",
367 | 310113109000, "顾村镇",
368 | 310113109000, "顾村镇",
369 | null, null,
370 | "菊太路",
371 | "777弄",
372 | "24号602室",
373 | ""
374 | )
375 | )
376 | // fix字符 —
377 | assertEquals(
378 | Geocoding.normalizing("辽宁大连甘井子区辽宁, 大连, 甘井子区, 泡崖街玉境路26号3—2—1"),
379 | Address(
380 | 210000000000, "辽宁省",
381 | 210200000000, "大连市",
382 | 210211000000, "甘井子区",
383 | null, null,
384 | null, null,
385 | null, null,
386 | "泡崖街玉境路",
387 | "26号",
388 | "3-2-1",
389 | ""
390 | )
391 | )
392 | // fix 开发区的影响
393 | assertEquals(
394 | Geocoding.normalizing("山东德州德城区宋官屯街道开发区段庄村"),
395 | Address(
396 | 370000000000, "山东省",
397 | 371400000000, "德州市",
398 | 371402000000, "德城区",
399 | 371402008000, "宋官屯街道",
400 | null, null,
401 | null, null,
402 | null,
403 | null,
404 | null,
405 | "段庄村"
406 | )
407 | )
408 | // fix 只有 1号楼 的情
409 | assertEquals(
410 | Geocoding.normalizing("北京市西城区新康街2号院1号楼北侧楼房"),
411 | Address(
412 | 110000000000, "北京",
413 | 110100000000, "北京市",
414 | 110102000000, "西城区",
415 | null, null,
416 | null, null,
417 | null, null,
418 | "新康街",
419 | "2号院",
420 | "1号楼",
421 | "北侧楼房"
422 | )
423 | )
424 | // Fix issues #10
425 | assertEquals(
426 | Geocoding.normalizing("福建福州鼓楼区六一路111号金三桥大厦"),
427 | Address(
428 | 350000000000, "福建省",
429 | 350100000000, "福州市",
430 | 350102000000, "鼓楼区",
431 | null, null,
432 | null, null,
433 | null, null,
434 | "六一路",
435 | "111号",
436 | null,
437 | "金三桥大厦"
438 | )
439 | )
440 | // Fix issues #8
441 | assertEquals(
442 | Geocoding.normalizing("广东省河源市源城区中山大道16号华怡小区"),
443 | Address(
444 | 440000000000, "广东省",
445 | 441600000000, "河源市",
446 | 441602000000, "源城区",
447 | null, null,
448 | null, null,
449 | null, null,
450 | "中山大道",
451 | "16号",
452 | null,
453 | "华怡小区"
454 | )
455 |
456 | )
457 | assertEquals(
458 | Geocoding.normalizing("广东省河源市中山大道16号华怡小区"),
459 | Address(
460 | 440000000000, "广东省",
461 | 441600000000, "河源市",
462 | null, null,
463 | null, null,
464 | null, null,
465 | null, null,
466 | "中山大道",
467 | "16号",
468 | null,
469 | "华怡小区"
470 | )
471 | )
472 | // Fix issues #9
473 | assertEquals(
474 | Geocoding.normalizing("浙江省杭州市西湖区中国建设银河西湖支行"),
475 | Address(
476 | 330000000000, "浙江省",
477 | 330100000000, "杭州市",
478 | 330106000000, "西湖区",
479 | null, null,
480 | null, null,
481 | null, null,
482 | null,
483 | null,
484 | null,
485 | "中国建设银河西湖支行"
486 | )
487 | )
488 | assertEquals(
489 | Geocoding.normalizing("江西赣州市赣县区王母渡镇"),
490 | Address(
491 | 360000000000, "江西省",
492 | 360700000000, "赣州市",
493 | 360721000000, "赣县区",
494 | 360721101000, "王母渡镇",
495 | 360721101000, "王母渡镇",
496 | null, null,
497 | null,
498 | null,
499 | null,
500 | ""
501 | )
502 | )
503 | // fix 只有父级地址
504 | assertEquals(
505 | Geocoding.normalizing("灵山镇海榆大道4号绿地城.润园11#楼2单元203"),
506 | Address(
507 | 130000000000, "河北省",
508 | 130600000000, "保定市",
509 | 130634000000, "曲阳县",
510 | 130634101000, "灵山镇",
511 | 130634101000, "灵山镇",
512 | null, null,
513 | "海榆大道",
514 | "4号",
515 | "11#楼2单元203",
516 | "绿地城润园"
517 | )
518 | )
519 | }
520 |
521 |
522 | /**
523 | * 将测试数据解析, 载入到数据库, 便于观察
524 | *
525 | * 表结构在 sql/creat.sql
526 | *
527 | * 注意: 自行修改数据库连接地址
528 | */
529 | // @Test
530 | fun testImport() {
531 | Class.forName("com.mysql.jdbc.Driver")
532 | val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/geocoding", "root", "anywhere")
533 | val statement = connection.prepareStatement(
534 | "INSERT INTO `addr_address` (`province`, `city`, `district`, `street`, `text`, `town`, `village`, `road`, `road_num`, `building_num`, `raw_text`) VALUES (?,?,?,?,?,?,?,?,?,?,?)"
535 | )
536 | TestNormalizing::class.java.classLoader.getResourceAsStream("address.txt").reader().readLines().forEach {
537 | val address = Geocoding.normalizing(it)
538 | statement.setLong(1, address?.provinceId ?: 0)
539 | statement.setLong(2, address?.cityId ?: 0)
540 | statement.setLong(3, address?.districtId ?: 0)
541 | statement.setLong(4, address?.streetId ?: 0)
542 | statement.setString(5, address?.text ?: "")
543 | statement.setString(6, address?.town ?: "")
544 | statement.setString(7, address?.village ?: "")
545 | statement.setString(8, address?.road ?: "")
546 | statement.setString(9, address?.roadNum ?: "")
547 | statement.setString(10, address?.buildingNum ?: "")
548 | statement.setString(11, it)
549 |
550 | statement.execute()
551 | }
552 |
553 | statement.close()
554 | connection.close()
555 | }
556 |
557 | }
558 |
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestNormalizingAddRegionEntry.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.model.Address
4 | import org.bitlap.geocoding.model.RegionType
5 | import org.junit.Test
6 | import kotlin.test.assertEquals
7 |
/**
 * Desc: tests address normalization after registering custom region entries
 *       through [Geocoding.addRegionEntry].
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/18
 *
 * All assertions follow the `assertEquals(expected, actual)` argument order of kotlin.test,
 * so that failure messages label the two values correctly.
 */
class TestNormalizingAddRegionEntry {

    /**
     * Registers a custom district under an existing city, then a complete custom
     * province / city / district chain, and checks that both are resolved.
     */
    @Test
    fun testNormalizing() {
        Geocoding.addRegionEntry(888888, 321200000000, "泥煤市", RegionType.District)
        assertEquals(
            Address(
                320000000000, "江苏省",
                321200000000, "泰州市",
                888888, "泥煤市",
                null, null,
                null, null,
                null, null,
                "泥煤大道", "888号",
                null,
                ""
            ),
            Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号")
        )

        Geocoding.addRegionEntry(88888888, 100000000000, "尼玛省", RegionType.Province)
        Geocoding.addRegionEntry(8888888, 88888888, "尼玛市", RegionType.City)
        Geocoding.addRegionEntry(888888, 8888888, "泥煤市", RegionType.District)
        assertEquals(
            Address(
                88888888, "尼玛省",
                8888888, "尼玛市",
                888888, "泥煤市",
                null, null,
                null, null,
                null, null,
                "泥煤大道", "888号",
                null,
                "xxx"
            ),
            Geocoding.normalizing("中国尼玛省尼玛市泥煤市泥煤大道888号xxx")
        )
    }

    /**
     * Registers the same district name twice under the same parent with different ids
     * and expects the id of the later registration to be returned.
     */
    @Test
    fun testNormalizingReplace() {
        Geocoding.addRegionEntry(888888, 321200000000, "泥煤市", RegionType.District)
        assertEquals(
            Address(
                320000000000, "江苏省",
                321200000000, "泰州市",
                888888, "泥煤市",
                null, null,
                null, null,
                null, null,
                "泥煤大道", "888号",
                null,
                ""
            ),
            Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号")
        )

        Geocoding.addRegionEntry(888889, 321200000000, "泥煤市", RegionType.District)
        assertEquals(
            Address(
                320000000000, "江苏省",
                321200000000, "泰州市",
                888889, "泥煤市",
                null, null,
                null, null,
                null, null,
                "泥煤大道", "888号",
                null,
                ""
            ),
            Geocoding.normalizing("江苏泰州泥煤市泥煤大道888号")
        )
    }
}
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestNormalizingCustom.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.model.Address
4 | import org.junit.Test
5 | import kotlin.test.assertEquals
6 |
/**
 * Desc: tests address normalization with [GeocodingX], both with the custom region data
 *       file "region_2021.dat" and with the strict / non-strict constructor flag.
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/1/18
 *
 * The arguments of each `Address(...)` literal are, in order:
 * provinceId, province, cityId, city, districtId, district, streetId, street,
 * townId, town, villageId, village, road, roadNum, buildingNum, text.
 */
class TestNormalizingCustom {

    /**
     * Normalizes [input] with [geocoding] and compares the result with [expected].
     *
     * kotlin.test's `assertEquals` takes `(expected, actual)`; routing every case through
     * this helper keeps that order right so failure messages label the values correctly.
     */
    private fun assertNormalized(geocoding: GeocodingX, input: String, expected: Address?) {
        assertEquals(expected, geocoding.normalizing(input))
    }

    @Test
    fun testNormalizing() {
        val geocoding = GeocodingX("region_2021.dat")

        assertNormalized(
            geocoding, "江苏泰州兴化市昌荣镇【康琴网吧】 (昌荣镇附近)",
            Address(
                320000000000, "江苏省", 321200000000, "泰州市", 321281000000, "兴化市",
                321281119000, "昌荣镇", null, null, null, null,
                null, null, null,
                "康琴网吧昌荣镇附近"
            )
        )
        assertNormalized(
            geocoding, "中国山东临沂兰山区小埠东社区居委会【绿杨榭公寓31-1-101 】 (绿杨榭公寓附近)",
            Address(
                370000000000, "山东省", 371300000000, "临沂市", 371302000000, "兰山区",
                null, null, null, null, null, null,
                null, null, "31-1-101",
                "小埠东社区居委会绿杨榭公寓绿杨榭公寓附近"
            )
        )
        assertNormalized(
            geocoding, "抚顺顺城区将军桥【将军水泥厂住宅4-1-102】 (将军桥附近)",
            Address(
                210000000000, "辽宁省", 210400000000, "抚顺市", 210411000000, "顺城区",
                null, null, null, null, null, null,
                null, null, "4-1-102",
                "将军桥将军水泥厂住宅将军桥附近"
            )
        )
        assertNormalized(
            geocoding, "辽宁沈阳于洪区沈阳市辽中县县城虹桥商厦西侧三单元外跨楼梯3-2-23-",
            Address(
                210000000000, "辽宁省", 210100000000, "沈阳市", 210114000000, "于洪区",
                null, null, null, null, null, null,
                null, null, "3-2-23",
                "辽中县县城虹桥商厦西侧三单元外跨楼梯"
            )
        )
        assertNormalized(
            geocoding, "山东济宁任城区金宇路【杨柳国际新城K8栋3单元1302】(杨柳国际新城·丽宫附近)",
            Address(
                370000000000, "山东省", 370800000000, "济宁市", 370811000000, "任城区",
                null, null, null, null, null, null,
                "金宇路", "", "K8栋3单元1302",
                "杨柳国际新城杨柳国际新城丽宫附近"
            )
        )
        assertNormalized(
            geocoding, "上海宝山区杨行镇宝山区江杨北路98号农贸批发市场蔬菜二区7通道A16号",
            Address(
                310000000000, "上海市", 310100000000, "直辖区", 310113000000, "宝山区",
                310113103000, "杨行镇", null, null, null, null,
                "江杨北路", "98号", "7通道A16号",
                "农贸批发市场蔬菜二区"
            )
        )
        assertNormalized(
            geocoding, "上海上海宝山区宝山区【新沪路58弄11-802 水韵华庭 】 (水韵华庭附近)",
            Address(
                310000000000, "上海市", 310100000000, "直辖区", 310113000000, "宝山区",
                null, null, null, null, null, null,
                "新沪路", "58弄", "11-802",
                "水韵华庭水韵华庭附近"
            )
        )
        // Loss of precision
        assertNormalized(
            geocoding, "赤城街道赤城大厦10E",
            Address(
                330000000000, "浙江省", 331000000000, "台州市", 331023000000, "天台县",
                331023001000, "赤城街道", null, null, null, null,
                null, null, null,
                "大厦10E"
            )
        )
        assertNormalized(
            geocoding, "上海黄浦区内环以内黄浦区小东门聚奎街43号",
            Address(
                310000000000, "上海市", 310100000000, "直辖区", 310101000000, "黄浦区",
                null, null, null, null, null, null,
                "小东门聚奎街", "43号", null,
                ""
            )
        )
        // fix: several trailing phone numbers
        assertNormalized(
            geocoding, "四川自贡贡井区四川省自贡市贡井区莲花镇四川自贡贡井区莲花镇黄桷村7组22号13298213121/15609000090/18681337139",
            Address(
                510000000000, "四川省", 510300000000, "自贡市", 510303000000, "贡井区",
                510303107000, "莲花镇", null, null, null, null,
                null, null, "7组22号",
                "黄桷村"
            )
        )
        // fix: 大云小区, where 大云 is also the name of a town
        assertNormalized(
            geocoding, "浙江嘉兴嘉善县浙江省嘉兴市嘉善县大云镇大云镇大云小区公寓楼1号302室",
            Address(
                330000000000, "浙江省", 330400000000, "嘉兴市", 330421000000, "嘉善县",
                330421102000, "大云镇", null, null, null, null,
                null, null, "1号302室",
                "大云小区公寓楼"
            )
        )
        // fix: the "xx路xx号楼" pattern
        assertNormalized(
            geocoding, "辽宁沈阳铁西区中国辽宁沈阳沈阳市铁西区南十一西路12号楼472 (第九医院(沈阳)附近)",
            Address(
                210000000000, "辽宁省", 210100000000, "沈阳市", 210106000000, "铁西区",
                null, null, null, null, null, null,
                "南十一西路", "", "12号楼472",
                "附近第九医院沈阳"
            )
        )
        assertNormalized(
            geocoding, "重庆重庆渝北区重庆渝北区两路镇双龙西路236号5-4(交警12支队红绿灯路口渝达商务宾馆楼上5-4)",
            Address(
                500000000000, "重庆市", 500100000000, "直辖区", 500112000000, "渝北区",
                500112016000, "两路街道", null, null, null, null,
                "双龙西路", "236号", "5-4",
                "交警12支队红绿灯路口渝达商务宾馆楼上54"
            )
        )
        assertNormalized(
            geocoding, "山东青岛市北区山东省青岛市市北区水清沟街道九江路20号大都会3号楼2单元1303",
            Address(
                370000000000, "山东省", 370200000000, "青岛市", 370203000000, "市北区",
                370203030000, "水清沟街道", null, null, null, null,
                "九江路", "20号", "3号楼2单元1303",
                "大都会"
            )
        )
        assertNormalized(
            geocoding, "中国山东青岛城阳区湘潭路【华胥美邦 到了联系20-1-1402】 (中铁华胥美邦附近)",
            Address(
                370000000000, "山东省", 370200000000, "青岛市", 370214000000, "城阳区",
                null, null, null, null, null, null,
                "湘潭路", "", "20-1-1402",
                "华胥美邦到了联系中铁华胥美邦附近"
            )
        )
        assertNormalized(
            geocoding, "辽宁沈阳沈河区辽宁沈阳市沈河区一环内会武街56号4-3-2",
            Address(
                210000000000, "辽宁省", 210100000000, "沈阳市", 210103000000, "沈河区",
                null, null, null, null, null, null,
                "一环内会武街", "56号", "4-3-2",
                ""
            )
        )
        // fix: garbage input
        assertNormalized(geocoding, "1008中国", null)
        // fix: the "3层" / "3楼" suffix
        assertNormalized(
            geocoding, "清徐县中国山西太原清徐县清徐县人民医院附近苹果社区2号楼1单元3层",
            Address(
                140000000000, "山西省", 140100000000, "太原市", 140121000000, "清徐县",
                null, null, null, null, null, null,
                null, null, "2号楼1单元3层",
                "人民医院附近苹果社区"
            )
        )
        // fix: the "3门" suffix
        assertNormalized(
            geocoding, "北京北京市西城区 白纸坊街道右安门内西街甲10号院11楼3门501",
            Address(
                110000000000, "北京市", 110100000000, "直辖区", 110102000000, "西城区",
                110102019000, "白纸坊街道", null, null, null, null,
                "右安门内西街", "甲10号院", "11楼3门501",
                ""
            )
        )
        // fix: 延川 is a county, but 延川路 must not be recognized as 延川县
        assertNormalized(geocoding, "延川路116号绿城城园东区7号楼2单元802户", null)
        // fix: same as above, must not be recognized as 金水区
        assertNormalized(geocoding, "金水路751号1号楼3单元501", null)
        assertNormalized(
            geocoding, "中国上海上海宝山区 顾村镇菊太路777弄24号602室",
            Address(
                310000000000, "上海市", 310100000000, "直辖区", 310113000000, "宝山区",
                310113109000, "顾村镇", null, null, null, null,
                "菊太路", "777弄", "24号602室",
                ""
            )
        )
        // fix: the "—" character
        assertNormalized(
            geocoding, "辽宁大连甘井子区辽宁, 大连, 甘井子区, 泡崖街道玉境路26号3—2—1",
            Address(
                210000000000, "辽宁省", 210200000000, "大连市", 210211000000, "甘井子区",
                210211007000, "泡崖街道", null, null, null, null,
                "玉境路", "26号", "3-2-1",
                ""
            )
        )
        // fix: only "1号楼" is present
        assertNormalized(
            geocoding, "北京市西城区新康街2号院1号楼北侧楼房",
            Address(
                110000000000, "北京市", 110100000000, "直辖区", 110102000000, "西城区",
                null, null, null, null, null, null,
                "新康街", "2号院", "1号楼",
                "北侧楼房"
            )
        )
        // Fix issues #10
        assertNormalized(
            geocoding, "福建福州鼓楼区六一路111号金三桥大厦",
            Address(
                350000000000, "福建省", 350100000000, "福州市", 350102000000, "鼓楼区",
                null, null, null, null, null, null,
                "六一路", "111号", null,
                "金三桥大厦"
            )
        )
        // Fix issues #8
        assertNormalized(
            geocoding, "广东省河源市源城区中山大道16号华怡小区",
            Address(
                440000000000, "广东省", 441600000000, "河源市", 441602000000, "源城区",
                null, null, null, null, null, null,
                "中山大道", "16号", null,
                "华怡小区"
            )
        )
        assertNormalized(
            geocoding, "广东省河源市中山大道16号华怡小区",
            Address(
                440000000000, "广东省", 441600000000, "河源市", null, null,
                null, null, null, null, null, null,
                "中山大道", "16号", null,
                "华怡小区"
            )
        )
        // Fix issues #9
        assertNormalized(
            geocoding, "浙江省杭州市西湖区中国建设银河西湖支行",
            Address(
                330000000000, "浙江省", 330100000000, "杭州市", 330106000000, "西湖区",
                null, null, null, null, null, null,
                null, null, null,
                "中国建设银河西湖支行"
            )
        )
        assertNormalized(
            geocoding, "江西赣州市赣县区王母渡镇",
            Address(
                360000000000, "江西省", 360700000000, "赣州市", 360704000000, "赣县区",
                360704101000, "王母渡镇", null, null, null, null,
                null, null, null,
                ""
            )
        )
    }

    @Test
    fun testNormalizingWithStrict() {
        val input = "灵山镇海榆大道4号绿地城.润园11#楼2单元203"

        // Strict mode: this input is expected to be rejected.
        assertNormalized(GeocodingX(true), input, null)

        // Non-strict mode: the same input is expected to resolve up to province level.
        assertNormalized(
            GeocodingX(false), input,
            Address(
                130000000000, "河北省", 130600000000, "保定市", 130634000000, "曲阳县",
                130634101000, "灵山镇", 130634101000, "灵山镇", null, null,
                "海榆大道", "4号", "11#楼2单元203",
                "绿地城润园"
            )
        )
    }
}
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestSegments.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.bitlap.geocoding.core.segment.IKAnalyzerSegmenter
4 | import org.bitlap.geocoding.core.segment.SimpleSegmenter
5 | import org.bitlap.geocoding.core.segment.SmartCNSegmenter
6 | import org.bitlap.geocoding.core.segment.WordSegmenter
7 | import org.junit.Test
8 |
/**
 * Desc: prints the output of the available segmenters for a sample text
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/6
 */
class TestSegments {

    private val simple = SimpleSegmenter()
    private val smart = SmartCNSegmenter()
    private val word = WordSegmenter()
    private val ik = IKAnalyzerSegmenter()

    /**
     * Runs the simple and IK segmenters on one sample and prints each result
     * below a heading. Nothing is asserted; the output is meant to be read by eye.
     */
    @Test
    fun test_segments() {
        val sample = "7号楼1单元102室"
        // Alternative sample: "九鼎2期B7号楼东数新都商贸购物中心附近"

        // Heading -> segmentation call, executed in order.
        // The smart and word segmenters are left disabled here:
        //   ">>> smart 分词: " -> smart.segment(sample)
        //   ">>> word 分词: "  -> word.segment(sample)
        val runs: List<Pair<String, () -> Any?>> = listOf(
            ">>> simple 分词: " to { simple.segment(sample) },
            ">>> ik 分词: " to { ik.segment(sample) }
        )

        for ((heading, segment) in runs) {
            println(heading)
            println(segment())
        }
    }
}
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/TestSimilarity.kt:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding
2 |
3 | import org.junit.Test
4 | import java.util.concurrent.Callable
5 | import java.util.concurrent.Executors
6 |
/**
 * Desc: tests address similarity
 * Mail: chk19940609@gmail.com
 * Created by IceMimosa
 * Date: 2017/2/7
 */
open class TestSimilarity {

    /**
     * Normalizes two addresses and prints the similarity analysis for manual inspection.
     */
    @Test
    fun test_similarity() {
        // Other sample pairs that can be swapped in by hand:
        //   plain match:
        //     "山东省沂水县四十里堡镇东艾家庄村205号"
        //     "山东省沂水县四十里堡镇东艾家庄村206号"
        //   match including a building part:
        //     "湖南衡阳常宁市湖南省衡阳市常宁市泉峰街道泉峰街道消防大队南园小区A栋1单元601"
        //     "湖南衡阳常宁市湖南省衡阳市常宁市泉峰街道泉峰街道消防大队南园小区A栋2单元601"

        // Special case (the pair actually used)
        val text1 = "山东青岛李沧区延川路116号绿城城园东区7号楼2单元802户"
        val text2 = "山东青岛李沧区延川路绿城城园东区7-2-802"

        // Normalize
        val addr1 = Geocoding.normalizing(text1)
        val addr2 = Geocoding.normalizing(text2)
        println("addr1 >>>> $addr1")
        println(">>>>>>>>>>>>>>>>>")
        println("addr2 >>>> $addr2")

        println("相似度结果分析 >>>>>>>>> " + Geocoding.similarityWithResult(addr1, addr2))
    }

    /**
     * Runs the similarity analysis on a second pair and prints the result.
     * Note that text1 is normalized into addr2 and text2 into addr1.
     */
    @Test
    fun test_fix_null_test() {
        // Plain match
        val text1 = "中国湖南郴州宜章县梅田镇【梅田镇】(梅田镇附近)"
        val text2 = "湖南省郴州市宜章县梅田镇上寮村2组"

        // Normalize
        val addr2 = Geocoding.normalizing(text1)
        val addr1 = Geocoding.normalizing(text2)
        println("addr1 >>>> $addr1")
        println(">>>>>>>>>>>>>>>>>")
        println("addr2 >>>> $addr2")

        println("相似度结果分析 >>>>>>>>> " + Geocoding.similarityWithResult(addr1, addr2))
    }

    /**
     * Computes the same similarity 1000 times on a pool of 10 threads and checks that
     * every call returns the same value.
     */
    @Test
    fun test_similarity_threads() {
        val pool = Executors.newFixedThreadPool(10)

        val addr1 = "中国湖南郴州宜章县梅田镇【梅田镇】(梅田镇附近)"
        val addr2 = "湖南省郴州市宜章县梅田镇上寮村2组"

        try {
            (1..1000).map {
                pool.submit(Callable {
                    Geocoding.similarity(addr1, addr2)
                })
            }.forEach { future ->
                // Kotlin's `assert` is a no-op unless JVM assertions are enabled (-ea);
                // kotlin.test's assertEquals always checks and reports both values.
                kotlin.test.assertEquals(0.8164965809277261, future.get())
            }
        } finally {
            // Shut the pool down even if one of the tasks or assertions fails.
            pool.shutdown()
        }
    }
}
73 |
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/Main.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region;
2 |
3 | import java.io.IOException;
4 |
5 | import org.bitlap.geocoding.region.model.RegionEntity;
6 | import org.bitlap.geocoding.region.util.OutUtil;
7 |
8 | public class Main {
9 |
10 | // 导入数据库成功后,执行china.sql,插入数据项:【中国】
11 | public static void main(String[] args) throws IOException {
12 | long start = System.currentTimeMillis();
13 | String pathname = "/tmp/cnarea" + 20210707 + ".dat";
14 | RegionDatFileHelper.writeDatFile(pathname);
15 | long end = System.currentTimeMillis();
16 | OutUtil.info(String.format("cost %s ms", end - start));
17 | RegionEntity regionEntity = RegionDatFileHelper.readDatFile(pathname);
18 | OutUtil.info(regionEntity.toString());
19 | }
20 | }
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://github.com/IceMimosa/geocoding/actions/workflows/java8.yml)
3 |
4 | # 介绍
5 | 项目目前采用的是 [淘宝物流4级地址](https://lsp.wuliu.taobao.com/locationservice/addr/output_address_town.do)的标准地址库,即`classpath:src/main/resources/core/region.dat`中的数据,
6 | 本package下代码可将 [中国5级行政区域](https://github.com/kakuilan/china_area_mysql) 处理为兼容geocoding的标准地址库。
7 |
8 | ### 使用步骤
9 |
10 | 1. 成功导入china_area_mysql到数据库
11 | 2. 执行本package下sql/china.sql插入`中国`数据
12 | 3. 修改本package下util/JdbcUtil.java中的jdbc相关参数
13 | 4. 执行本package下Main类中main方法
14 | 5. 将生成的dat文件改名为region.dat并放入`classpath:src/main/resources/core/`
15 |
16 | ### 注意事项
17 | 本测试配置基于Server version: 8.0.21 MySQL Community Server - GPL环境,其它可能略有差异,可通过下面两个SQL确认配置是否OK
18 |
19 | ```
20 | show variables like '%CHARACTER%';
21 | show variables like '%max_allowed_packet%';
22 | ```
23 |
24 | 1. 设置max_allowed_packet,[mysqld]下max_allowed_packet = 2000M,[mysqldump]下max_allowed_packet = 2000M
25 | 2. 设置字符集,[client]下default-character-set=utf8mb4,[mysqld]下character-set-server=utf8mb4和init_connect='SET NAMES utf8mb4',[mysql]下default-character-set=utf8mb4
26 |
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/RegionDatFileHelper.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region;
2 |
3 | import java.io.ByteArrayInputStream;
4 | import java.io.ByteArrayOutputStream;
5 | import java.io.File;
6 | import java.io.IOException;
7 | import java.sql.Connection;
8 | import java.util.Base64;
9 | import java.util.List;
10 | import java.util.zip.GZIPInputStream;
11 | import java.util.zip.GZIPOutputStream;
12 |
13 | import org.apache.commons.io.IOUtils;
14 |
15 | import com.google.common.collect.Lists;
16 | import com.google.common.io.Files;
17 | import com.google.gson.Gson;
18 |
19 | import org.bitlap.geocoding.model.RegionType;
20 | import org.bitlap.geocoding.region.model.RegionEntity;
21 | import org.bitlap.geocoding.region.util.JdbcUtil;
22 | import kotlin.text.Charsets;
23 |
24 | public class RegionDatFileHelper {
25 |
26 | final static List provinceLevelCity1 = Lists.newArrayList("北京市", "天津市", "上海市", "重庆市");
27 |
28 | public static void writeDatFile(String pathname) throws IOException {
29 | write(pathname, "");
30 | Connection conn = JdbcUtil.getConnection();
31 | if (conn == null) return;
32 | List china = Lists.newArrayList();
33 | List provinces = RegionSqlHelper.findProvinces(conn);
34 | for (int i = 0; i < provinces.size(); i++) {
35 | RegionEntity province = provinces.get(i);
36 | List list = RegionSqlHelper.findByProvince(conn, province.getShortName() + "%");
37 | if (i == 0) {
38 | List tree = parseProvince(list);
39 | china.add(tree.get(0));
40 | } else {
41 | List tree = parseProvince(list);
42 | china.get(0).getChildren().add(tree.get(0));
43 | }
44 | }
45 | JdbcUtil.free(conn);
46 | Gson gson = new Gson();
47 |
48 | byte[] context = encode(gson.toJson(china.get(0)));
49 | write(pathname, new String(context, Charsets.UTF_8));
50 | }
51 |
52 | private static List parseProvince(List list) {
53 | List province = Lists.newArrayList();
54 |
55 | for (RegionEntity entity : list) {
56 | if (entity.getParentId().equals(0L)) {
57 | if (entity.getChildren() == null) entity.setChildren(Lists.newArrayList());
58 | entity.setType(of(entity.getId(), entity.getLevel(), entity.getName()));
59 | province.add(entity);
60 | }
61 | }
62 |
63 | for (RegionEntity item : province) {
64 | item = recursive(item, list, province.size());
65 | }
66 |
67 | return province;
68 | }
69 |
70 | private static RegionEntity recursive(RegionEntity parent, List list, int j) {
71 | for (int i = j; i < list.size(); i++) {
72 | RegionEntity entity = list.get(i);
73 | if (parent.getId().equals(entity.getParentId())) {
74 | entity = recursive(entity, list, i + 1);
75 | entity.setType(of(entity.getId(), entity.getLevel(), entity.getName()));
76 | if (parent.getChildren() == null) parent.setChildren(Lists.newArrayList());
77 | parent.getChildren().add(entity);
78 | }
79 | }
80 | return parent;
81 | }
82 |
83 | private static void write(final String fileName, final String contents) throws IOException {
84 | File file = new File(fileName);
85 | // file.deleteOnExit();
86 | if (!file.exists()) {
87 | Files.createParentDirs(file);
88 | file.createNewFile();
89 | }
90 | if (contents != null && !contents.trim().isEmpty()) {
91 | Files.write(contents.getBytes(), file);
92 | }
93 | }
94 |
95 | private static RegionType of(Long id, int level, String name) {
96 | if (id.equals(100000000000L)) return RegionType.Country;
97 | if (level == 0) {
98 | if (provinceLevelCity1.contains(name)) return RegionType.ProvinceLevelCity1;
99 | return RegionType.Province;
100 | }
101 | if (level == 1) {
102 | if ("直辖区".equalsIgnoreCase(name)) return RegionType.ProvinceLevelCity2;
103 | if ("直辖县".equalsIgnoreCase(name)) return RegionType.CityLevelDistrict;
104 | return RegionType.City;
105 | }
106 | if (level == 2) return RegionType.District;
107 | if (level == 3) {
108 | if (name.matches("乡$")) return RegionType.Town;
109 | if (name.matches("镇$")) return RegionType.Town;
110 | return RegionType.PlatformL4;
111 | }
112 | if (level == 4) return RegionType.Village;
113 | return RegionType.Undefined;
114 | }
115 |
116 | public static RegionEntity readDatFile(String file) throws IOException {
117 | byte[] byteArray = Files.toByteArray(new File(file));
118 | String json = new String(byteArray);
119 | return new Gson().fromJson(decode(json), RegionEntity.class);
120 | }
121 |
122 | private static String decode(String str) throws IOException {
123 | byte decodedByteArray[] = Base64.getMimeDecoder().decode(str);
124 | GZIPInputStream gzipis = new GZIPInputStream(new ByteArrayInputStream(decodedByteArray));
125 | return new String(IOUtils.toByteArray(gzipis), Charsets.UTF_8);
126 | }
127 |
128 | private static byte[] encode(String str) throws IOException {
129 | ByteArrayOutputStream out = new ByteArrayOutputStream();
130 | GZIPOutputStream gzipos = new GZIPOutputStream(out);
131 | gzipos.write(str.getBytes(Charsets.UTF_8));
132 | gzipos.close();
133 | return Base64.getMimeEncoder().encode(out.toByteArray());
134 | }
135 | }
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/RegionSqlHelper.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region;
2 |
3 | import java.sql.Connection;
4 | import java.sql.PreparedStatement;
5 | import java.sql.ResultSet;
6 | import java.sql.SQLException;
7 | import java.util.List;
8 |
9 | import com.google.common.collect.Lists;
10 |
11 | import org.bitlap.geocoding.region.model.RegionEntity;
12 | import org.bitlap.geocoding.region.util.JdbcUtil;
13 | import org.bitlap.geocoding.region.util.OutUtil;
14 |
15 | public class RegionSqlHelper {
16 |
17 | private static final String sqlFindAllProvinces = "select `level`, area_code as id, parent_code as parentId, "
18 | + "`name` as `name`, short_name as shortName, merger_name as `alias`, zip_code as zip "
19 | + "from cnarea_2020 where parent_code = 0 order by area_code";
20 |
21 | private static final String sqlFindByProvince = "select `level`, area_code as id, parent_code as parentId, "
22 | + "`name` as `name`, short_name as shortName, merger_name as `alias`, zip_code as zip "
23 | + "from cnarea_2020 where merger_name like ? order by `level`, parent_code, area_code";
24 |
25 | public static List findProvinces(Connection conn) {
26 | PreparedStatement pstmt = null;
27 | ResultSet rs = null;
28 | try {
29 | pstmt = conn.prepareStatement(sqlFindAllProvinces);
30 | rs = pstmt.executeQuery();
31 | OutUtil.info(sqlFindAllProvinces);
32 | return convert(rs);
33 | } catch (SQLException sqle) {
34 | OutUtil.err("Exception: RegionEntityHelper.findProvinces " + sqle.getMessage());
35 | }finally {
36 | JdbcUtil.free(rs, pstmt);
37 | }
38 | return Lists.newArrayList();
39 | }
40 |
41 |
42 | public static List findByProvince(Connection conn, String name) {
43 | PreparedStatement pstmt = null;
44 | ResultSet rs = null;
45 | try {
46 | pstmt = conn.prepareStatement(sqlFindByProvince);
47 | pstmt.setString(1, name);
48 | rs = pstmt.executeQuery();
49 | OutUtil.info(sqlFindByProvince.replace("?", "'" + name + "'"));
50 | return convert(rs);
51 | } catch (SQLException sqle) {
52 | OutUtil.err("Exception: RegionEntityHelper.findByProvince " + sqle.getMessage());
53 | } finally {
54 | JdbcUtil.free(rs, pstmt);
55 | }
56 | return Lists.newArrayList();
57 | }
58 |
59 | private static List convert(ResultSet rs) throws SQLException {
60 | List list = Lists.newArrayList();
61 | while (rs != null && rs.next()) {
62 | RegionEntity regionEntity = new RegionEntity();
63 | regionEntity.setAlias(rs.getString("alias"));
64 | regionEntity.setId(rs.getLong("id"));
65 | regionEntity.setLevel(rs.getInt("level"));
66 | regionEntity.setName(rs.getString("name"));
67 | regionEntity.setParentId(rs.getLong("parentId"));
68 | regionEntity.setShortName(rs.getString("shortName"));
69 | regionEntity.setZip(rs.getString("zip"));
70 | list.add(regionEntity);
71 | }
72 | return list;
73 | }
74 | }
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/model/RegionEntity.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region.model;
2 |
3 | import com.google.gson.annotations.Expose;
4 | import org.bitlap.geocoding.model.RegionType;
5 |
6 | import java.io.Serializable;
7 | import java.util.List;
8 |
9 | public class RegionEntity implements Serializable{
10 |
11 | private static final long serialVersionUID = 1L;
12 |
13 | private Long id = 0L;
14 | private Long parentId = 0L;
15 | @Expose(serialize = false, deserialize = false)
16 | private Integer level = 0;
17 | private String name = "";
18 | @Expose(serialize = false, deserialize = false)
19 | private String shortName = "";
20 | private String alias = "";
21 | private RegionType type = RegionType.Undefined;
22 | private String zip = "";
23 | private List children = null;
24 | private List orderedNames = null;
25 |
26 | public Long getId() {
27 | return id;
28 | }
29 |
30 | public void setId(Long id) {
31 | this.id = id;
32 | }
33 |
34 | public Long getParentId() {
35 | return parentId;
36 | }
37 |
38 | public void setParentId(Long parentId) {
39 | this.parentId = parentId;
40 | }
41 |
42 | public Integer getLevel() {
43 | return level;
44 | }
45 |
46 | public void setLevel(Integer level) {
47 | this.level = level;
48 | }
49 |
50 | public String getName() {
51 | return name;
52 | }
53 |
54 | public void setName(String name) {
55 | this.name = name;
56 | }
57 |
58 | public String getShortName() {
59 | return shortName;
60 | }
61 |
62 | public void setShortName(String shortName) {
63 | this.shortName = shortName;
64 | }
65 |
66 | public String getAlias() {
67 | return alias;
68 | }
69 |
70 | public void setAlias(String alias) {
71 | this.alias = alias;
72 | }
73 |
74 | public RegionType getType() {
75 | return type;
76 | }
77 |
78 | public void setType(RegionType type) {
79 | this.type = type;
80 | }
81 |
82 | public String getZip() {
83 | return zip;
84 | }
85 |
86 | public void setZip(String zip) {
87 | this.zip = zip;
88 | }
89 |
90 | public List getChildren() {
91 | return children;
92 | }
93 |
94 | public void setChildren(List children) {
95 | this.children = children;
96 | }
97 |
98 | public List getOrderedNames() {
99 | return orderedNames;
100 | }
101 |
102 | public void setOrderedNames(List orderedNames) {
103 | this.orderedNames = orderedNames;
104 | }
105 | }
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/sql/china.sql:
--------------------------------------------------------------------------------
-- Seed row for the country-level root region in cnarea_2020.
-- Its area_code (100000000000) is the id the Java region helper maps to the
-- Country type; parent_code 0 marks it as a top-level row.
INSERT INTO `cnarea_2020` (
    `level`,
    `parent_code`,
    `area_code`,
    `zip_code`,
    `city_code`,
    `name`,
    `short_name`,
    `merger_name`,
    `pinyin`,
    `lng`,
    `lat`
) VALUES (
    0,
    0,
    100000000000,
    000000,
    '',
    '中国',
    '中国',
    '中国',
    'ZHONGGUO',
    0.000000,
    0.000000
);
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/util/JdbcUtil.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region.util;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.ResultSet;
6 | import java.sql.SQLException;
7 | import java.sql.Statement;
8 |
9 | public class JdbcUtil {
10 |
11 | private static final String driver_class = "com.mysql.cj.jdbc.Driver"; // com.mysql.jdbc.Driver
12 |
13 | private static final String db_url = "jdbc:mysql://localhost:3306/cnarea";
14 |
15 | private static final String db_userid = "root";
16 |
17 | private static final String db_password = "12345678";
18 |
19 | public static Connection getConnection() {
20 | Connection conn = null;
21 | try {
22 | Class.forName(driver_class);
23 | } catch (ClassNotFoundException cnfe) {
24 | OutUtil.err("Exception: JdbcUtil.getConnection driver_class not found");
25 | return null;
26 | }
27 | try {
28 | conn = DriverManager.getConnection(db_url, db_userid, db_password);
29 | } catch (SQLException sqle) {
30 | OutUtil.err("Exception: JdbcUtil.getConnection get connection failed");
31 | return null;
32 | }
33 | return conn;
34 | }
35 |
36 | public static void free(ResultSet rs, Statement stmt) {
37 | free(stmt);
38 | free(rs);
39 | }
40 |
41 | public static void free(ResultSet rs) {
42 | if (rs == null) return;
43 | try {
44 | rs.close();
45 | } catch (SQLException sqle) {}
46 | }
47 |
48 | public static void free(Statement stmt) {
49 | if (stmt == null) return;
50 | try {
51 | stmt.close();
52 | } catch (SQLException sqle) {}
53 | }
54 |
55 | public static void free(Connection conn) {
56 | if (conn == null) return;
57 | try {
58 | conn.close();
59 | } catch (SQLException sqle) {}
60 | }
61 | }
--------------------------------------------------------------------------------
/src/test/java/org/bitlap/geocoding/region/util/OutUtil.java:
--------------------------------------------------------------------------------
1 | package org.bitlap.geocoding.region.util;
2 |
3 | public class OutUtil {
4 |
5 | public static void err(String str) {
6 | System.err.println(str);
7 | }
8 |
9 | public static void info(String str) {
10 | System.out.println(str);
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/src/test/resources/sql/create.sql:
--------------------------------------------------------------------------------
-- Recreates the addr_address table from scratch (any existing table is dropped).
-- Each row stores one address: the ids of its province/city/district/street plus
-- the textual parts of the address.
DROP TABLE IF EXISTS `addr_address`;

CREATE TABLE `addr_address` (
    -- identity and region ids
    `id`           BIGINT(11)   NOT NULL AUTO_INCREMENT       COMMENT 'Address Record ID',
    `province`     BIGINT(11)   NOT NULL DEFAULT '0'          COMMENT 'Province ID',
    `city`         BIGINT(11)   NOT NULL DEFAULT '0'          COMMENT 'City ID',
    `district`     BIGINT(11)   NOT NULL DEFAULT '0'          COMMENT 'District ID',
    `street`       BIGINT(11)   NOT NULL DEFAULT '0'          COMMENT 'Street ID',

    -- address text and its parts
    `text`         VARCHAR(100) NOT NULL DEFAULT ''           COMMENT 'Address Text',
    `town`         VARCHAR(20)  NOT NULL DEFAULT ''           COMMENT '镇',                   -- town
    `village`      VARCHAR(5)   NOT NULL DEFAULT ''           COMMENT '村',                   -- village
    `road`         VARCHAR(8)   NOT NULL DEFAULT ''           COMMENT '道路',                 -- road
    `road_num`     VARCHAR(10)  NOT NULL DEFAULT ''           COMMENT '道路号码',             -- road number
    `building_num` VARCHAR(20)  NOT NULL DEFAULT ''           COMMENT '几号楼+几单元+房间号', -- building + unit + room number
    `hash`         INT(11)      NOT NULL DEFAULT '0'          COMMENT 'Address Text Hash Code',
    `raw_text`     VARCHAR(150) NOT NULL DEFAULT ''           COMMENT 'Original Address Text',

    -- extension fields
    `prop1`        VARCHAR(20)  NOT NULL DEFAULT ''           COMMENT '扩展字段:订单号',      -- extension field: order number
    `prop2`        VARCHAR(20)  NOT NULL DEFAULT ''           COMMENT '扩展字段:片区ID',      -- extension field: area ID
    `create_time`  DATE         NOT NULL DEFAULT '1900-01-01',

    PRIMARY KEY (`id`),
    KEY `ix_hash` (`hash`),                                   -- lookup by address-text hash
    KEY `ix_pid_cid_did` (`province`, `city`, `district`)     -- lookup by region ids
) ENGINE=InnoDB AUTO_INCREMENT=10001 DEFAULT CHARSET=utf8;
--------------------------------------------------------------------------------