├── LICENSE
├── README.md
├── baidu_aoi_spider
│   ├── middlewares.py
│   ├── settings.py
│   └── spiders
│       ├── baidu_aoi.py
│       └── examples.py
├── data
│   ├── AOI_example1
│   │   ├── AOI_example1.cpg
│   │   ├── AOI_example1.dbf
│   │   ├── AOI_example1.prj
│   │   ├── AOI_example1.shp
│   │   └── AOI_example1.shx
│   ├── AOI_example2
│   │   ├── AOI_example2.cpg
│   │   ├── AOI_example2.dbf
│   │   ├── AOI_example2.prj
│   │   ├── AOI_example2.shp
│   │   └── AOI_example2.shx
│   ├── POI_example1.csv
│   └── POI_example2.csv
├── env.yaml
├── images
│   ├── AOI_Peking_University.png
│   ├── running_process.png
│   └── similarity_problem.png
├── processor
│   ├── __init__.py
│   ├── aoi_container.py
│   ├── api_handler.py
│   ├── counter.py
│   ├── file_operator.py
│   ├── logger.py
│   ├── repository.py
│   └── validator.py
├── scrapy.cfg
└── spatial
    ├── coords.py
    └── geometry.py
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BaiduAOISpider
2 |
3 | ## 项目简介
4 |
5 | 基于 Scrapy 框架的百度地图 AOI(Area of Interest)[^1]/ 矢量边界数据的 Python 多线程爬虫。根据给定 POI 数据(即包含名称 + 经纬度的数据),按一定条件匹配合适的 AOI 数据,将结果保存为 csv 格式和 shp (ESRI shapefile)[^2]格式。
6 |
7 | ### 项目特点
8 |
9 | * 需要提前准备目标 POI 数据, POI 获取方法可参考其他项目例如 [AMapPoi](https://github.com/Civitasv/AMapPoi)
10 |
11 | * 项目内可以选用一些简单反爬手段,包括:随机 user-agent、随机 cookie、使用随机代理[^3]
12 |
13 | * 爬取结果和爬取状态直接保存在原始 POI 数据 csv 上,可以随时中断、继续爬取和重新爬取
14 |
15 | * 提供了一些 AOI 的匹配和筛选条件,可根据需求自行修改,便于获取更精准的结果
16 |
17 | ### 应用场景
18 |
19 | 相较于直接爬取整块空间区域内的所有 AOI 的方法(例如项目 [poi](https://github.com/liujiao111/poi)),本项目有如下的使用场景:
20 |
21 | * 因为 POI 数据获取更容易,当你手上已经有一份筛选好的 POI 数据,只需要匹配每个 POI 对应的 AOI
22 |
23 | * POI 是根据别的来源确定的,例如爬取的带有经纬度坐标的签到数据、点评数据和房价数据等等。这时你不可能提前知道需要的 AOI 名单,因而必须先确定 POI、然后再去找匹配的 AOI
24 |
25 | * 当你只需要一部分而不需要全量 AOI 数据时,想节约成本和时间(是的,百度 AOI 爬取 “基本上” 是付费的,详见 [API 参数配置](#api-参数配置)中关于 AK 的内容)
26 |
27 | ### 工作流程
28 |
29 | 爬虫的完整流程如下,即对 `baidu_aoi.py` 的说明:
30 |
31 | * 初始化环境,包括:
32 |
33 | * 导入全局配置和局部配置的各参数,并核验是否合法
34 |
35 | * 导入 POI csv 文件,并核验是否合法
36 |
37 | * 添加:爬取状态、最终匹配的 AOI 的名称[^4]、WGS-84 坐标系[^5]下的经纬度、AOI 几何形状共 5 列字段
38 |
39 | * 将原始 csv 内的坐标统一到 WGS-84 坐标系
40 |
41 | * 启动和初始化计数器和 AOI 容器
42 |
43 | * 将对应参数填入百度圆形区域地点检索 API 的 url[^6]:`https://api.map.baidu.com/place/v2/search?query=名称&location=纬度,经度&radius=搜索半径&ak=您的密钥&output=json&coord_type=1`
44 |
45 | * 该 API 根据地名返回 `json` 格式的可能 POI 的列表,每个 POI 中包含名称、经纬度、其他附加信息,以及 `uid`(如果存在的话),每一个 uid 对应一个几何形状,即我们想要的 AOI
46 |
47 | * 下文中为了将上述返回结果中的 POI 和原始 POI csv 区分开来,**将检索结果中的 POI 以 uid 代称**,事实上这一步骤的核心就是获取所有可能的 uid
48 |
49 | * 如果想在搜索中限定 POI 的种类,还可以添加 `prim_ind`([百度标准](https://lbsyun.baidu.com/index.php?title=open/poitags)的 POI 一级行业分类)和 `sec_ind`(二级行业分类)两个参数[^7]
50 |
51 | * 如果这一步没有检索到任何 uid,那么将爬取状态设置为 `No Uid`
52 |
53 | * 将 uid 填入查询 AOI 的 url:`https://map.baidu.com/?newmap=1&qt=ext&uid=查询到的uid&ext_ver=new&ie=utf-8&l=11`,然后提取几何形状,如果该几何形状满足 AOI 的筛选条件(例如其四至范围[^8]必须包含 POI 的经纬度等等),将该几何形状存入 AOI 容器中。这样对于每个原始 POI 能获取到一个可能的 AOI 列表
54 |
55 | * 如果这一步没有获取到任何几何形状,或者所有 AOI 都不满足筛选条件,那么将爬取状态设置为 `No Geometry`
56 |
57 | * 根据 `settings.py` 中 `FILTER_RULES` 的设置,计算 AOI 列表的综合排序值,将排序第一位的 AOI 作为最终匹配的 AOI
58 |
59 | * 提供了以下几种排序方法:(1)上一步的百度检索排序、(2)AOI 面积排序、(3)和原 POI 距离排序、(4)文本相似度排序,综合排序是其中被设置开启的方法的算术平均值的再排序
60 |
61 | * 例如,假设有 3 个可能的 AOI,并且四种方法都开启:
62 |
63 | * 四种排序的结果为:(1)1、2、3;(2)2、3、1;(3)3、1、2;(4)3、2、1
64 |
65 | * 那么三个 AOI 的算术平均值依次为:9/4、8/4、7/4,平均值越小排名越靠前,因此再排序为:3、2、1
66 |
67 | * 因此最终匹配的 AOI 为第三个
68 |
69 | * 匹配到 AOI 后,将爬取状态设置为 `Matched`
70 |
71 | 总结整个工作流程如下:
72 |
73 | ```text
74 | POI csv 读取 → 每个 POI 信息拼接 url → 一个 url 返回一组 json 格式的 uid 列表
75 |
76 | → 用每个 uid 查询 AOI → 得到每个 POI 的可能 AOI 列表 → 综合排序,取第一位的 AOI
77 | ```
78 |
79 | #### 流程案例说明
80 |
81 | 例如对 `北京大学, 116.30420708125263, 39.991595084258336` 进行爬取:
82 |
83 | * 拼接出地点检索 url:
84 |
85 | * 返回结果如下,提取出所有的 uid:
86 |
87 | ```json
88 | {
89 | // some information
90 | // ...
91 | "results": [
92 | {
93 | "name": "北京大学",
94 | "location": {
95 | "lat": 39.998877,
96 | "lng": 116.316833,
97 | },
98 | "address": "北京市海淀区颐和园路5号",
99 | "province": "北京市",
100 | "city": "北京市",
101 | "area": "海淀区",
102 | "telephone": "(010)62752114",
103 | "detail": 1,
104 | "uid": "ddfd7c2d8db36cf39ee3219e"
105 | "detail_info":{
106 | "tag":"教育培训;高等院校",
107 | // ...
108 | },
109 | // more information
110 | // ...
111 | },
112 | // more uids
113 | // ...
114 | ]
115 | }
116 | ```
117 |
118 | * 对每一个 uid,拼接并访问 AOI 查询 url,例如:`https://map.baidu.com/?newmap=1&qt=ext&uid=ddfd7c2d8db36cf39ee3219e&ext_ver=new&ie=utf-8&l=11`,返回结果形如:
119 |
120 | ```json
121 | {
122 | // some information
123 | // ...
124 | "content": {
125 | "geo": "4|12946839.266068,4837125.446178;12949751.777560,4839020.969541|1-12948599.7094790,4837127.8547043,...,12948599.7094790,4837127.8547043;",
126 | "uid": "ddfd7c2d8db36cf39ee3219e"
127 | },
128 | // some information
129 | // ...
130 | }
131 | ```
132 |
133 | * 提取每一个可能的 AOI,进行综合排序并获取最后匹配的 AOI,结果就是下图中蓝色虚线的几何形状:
134 |
135 | ![AOI_Peking_University](images/AOI_Peking_University.png)
136 |
137 | ## 项目运行
138 |
139 | * 下载代码
140 |
141 | ```bash
142 | # 或者直接手动下载
143 | git clone git@github.com:Prufrok/BaiduAOISpider.git
144 | ```
145 |
146 | * 安装环境(基于 anaconda)
147 |
148 | ```bash
149 | # 创建虚拟环境并安装依赖
150 | conda env create -n BaiduAOISpider -f env.yaml
151 |
152 | # 激活新环境
153 | conda activate BaiduAOISpider
154 | ```
155 |
156 | * 阅读[配置说明](#配置说明),按照实际需求修改 `settings.py`
157 |
158 | ```Python
159 | # BaiduAOISpider/settings.py
160 | # 修改百度地图 API 密钥
161 | # 在百度地图开放平台注册、登陆并申请服务端密钥:
162 | # https://lbsyun.baidu.com/apiconsole/key
163 | AK_LIST = ['your_baidu_map_aks',]
164 |
165 | # 根据使用需要,修改其他设置,例如:
166 | CONCURRENT_REQUESTS_PER_IP = 20 # 调整每个 ip 的并发请求数
167 | UPDATE_INTERVAL = 100 # 调整更新间隔
168 | ```
169 |
170 | * 注意:**爬取大量数据的情况下,建议使用代理 IP**,具体见[基础设置](#基础设置)中的 `PROXY_ENABLED` 参数
171 |
172 | * 准备 POI 数据
173 |
174 | * POI 数据必须为 csv 格式,放入 `data` 文件夹下。文件名任意,但是需要与 `settings.py` 中的 `POI_CSV_PATH` 匹配
175 |
176 | * POI csv 必须包含:`name`(POI 名称)、`lng`(经度)、`lat`(纬度)三个字段
177 |
178 | * 如果想要在检索中对每个 POI 限定它的类别,还可以添加 `prim_ind` 和 `sec_ind` 两个字段,详见 [API 参数配置](#api-参数配置)
179 |
180 | * 运行爬虫
181 |
182 | ```bash
183 | # 进入项目根目录
184 | cd "your directory/.../BaiduAOISpider"
185 |
186 | # 运行爬虫,spider_name 为爬虫类中的 name 属性,例如:
187 | # BaiduAOISpider 为 BaiduAOI,Example1 为 example1
188 | # 如果中断继爬、重爬,只需再次运行同样的命令即可
189 | scrapy crawl spider_name
190 | ```
191 |
192 | * 运行过程截图(示例 1 的情况)
193 |
194 | * 爬取状态记录格式为 `Matched/No Uid/No Geometry/Total | Crawled (Percentage)`,从左至右含义:匹配到 AOI 的 POI 数量、地名检索中不存在 uid 的 POI 数量、没有返回 AOI 几何信息或所有 AOI 都不符合过滤条件的 POI 数量、总共的 POI 数量、已爬取的 POI 数量(匹配到、无 uid、无 AOI 三种情况的总数)、已爬取的 POI 数量占总数的百分比
195 |
196 |
197 |
198 | ## 配置说明
199 |
200 | ### 配置类别
201 |
202 | 总共有两种配置方法:
203 |
204 | * **全局配置**,即在 `settings.py` 中进行设置
205 |
206 | * **局部配置**,可设置的内容和全局配置完全一样,不过需要在每个爬虫类内的 `updating_settings` 变量中设置。**局部配置的优先级高于全局配置**。局部配置后,使用“深度” `update` 方法,对全局配置中的设定进行更新,例如:
207 |
208 | * 如果在 `settings.py` 中设置:
209 |
210 | ``` python
211 | CONCURRENT_REQUESTS = 25
212 | FILTER_RULES = {
213 | 'min_aoi_area': 0,
214 | 'min_similarity': 0,
215 | }
216 | ```
217 |
218 | * 而在 `updating_settings` 中设置:
219 |
220 | ``` python
221 | CONCURRENT_REQUESTS = 30
222 | FILTER_RULES = {
223 | 'min_similarity': 0.1,
224 | }
225 | ```
226 |
227 | * 最终爬虫使用的设置为:
228 |
229 | ```python
230 | CONCURRENT_REQUESTS = 30
231 | FILTER_RULES = {
232 | 'min_aoi_area': 0,
233 | 'min_similarity': 0.1,
234 | }
235 | ```
236 |
237 | ### 配置内容
238 |
239 | 可设置的内容分为以下两个部分:
240 |
241 | * **默认配置**,不需要修改。包括请求头、开启中间件、日志等级以及scrapy基础设置等等
242 |
243 | * **运行配置**,需要根据实际需求修改,运行配置内又分为四类:[并发配置](#并发配置)、[基础配置](#基础设置)、[API 参数配置](#api-参数配置)和 [AOI 筛选配置](#aoi-筛选配置),下面将分别详细介绍
244 |
245 | #### 并发配置
246 |
247 | 包含 scrapy 框架的三个并发参数:`CONCURRENT_REQUESTS`、`DOWNLOAD_DELAY` 和 `CONCURRENT_REQUESTS_PER_IP`。这三个参数的含义和使用方法可以参考[官方文档](https://docs.scrapy.org/en/latest/topics/settings.html#concurrent-requests-settings)。需要注意的是:
248 |
249 | * **百度地图地点检索 API 的并发上限是 30 QPS**,因此至少 `CONCURRENT_REQUESTS_PER_IP` 的值不应超过 30
250 |
251 | * 按照项目目前的设置,并发峰值在 10 QPS 左右,大约每小时能匹配完 4000 至 5000 条原始 POI(对地点检索结果下 10 个左右 uid 进行抓取的情况下)
252 |
253 | * **适度地设置抓取速度,建议不用追求最大化的并发**,减小爬取行为带来的负担和避免可能关闭 AOI 查询 url 的风险
254 |
255 | #### 基础设置
256 |
257 | * `POI_CSV_PATH`:POI 数据文件路径,同时也是 csv 结果的保存路径
258 |
259 | * `AOI_SHP_PATH`:shp 结果保存路径,如果上级文件夹不存在会自动创建
260 |
261 | * `PROXY_ENABLED`:是否使用随机代理,`1` 代表是,`0` 代表否
262 |
263 | * 如果使用随机代理,需要在 `middlewares.py` 中设置代理池:
264 |
265 | * 本项目默认采用的代理池来自 [proxy_pool](https://github.com/jhao104/proxy_pool) 项目,需要按照该项目进行额外配置
266 |
267 | * 地点检索和 AOI 查询 url 都是 https 协议,因此**使用的随机代理也要支持 https 协议**
268 |
269 | * 如果自己搭建代理池,还需要构造获取一个随机代理和删除某一个代理的 2 个函数,并替换 `middlewares.py` 中的 `get_proxy` 和 `delete_proxy`
270 |
271 | * 是否使用随机代理的可能影响:
272 |
273 | * 地点检索作为百度公开的 API 接口应该没有反爬限制,所以是否使用应该没有影响
274 |
275 | * AOI 查询的 url 应该不算是公开 API,因此如果不使用代理,可能会被封 IP(不保证,个人没有尝试过大批量数据下不开代理爬取)
276 |
277 | * 此外,以前高德网站也有类似的 AOI 查询 url,但是在大量爬取之后,这个 url 最后被关闭了[^9]
278 |
279 | * 再次提醒:**强烈建议如果要爬取大量数据,请开启使用代理,并且请不要过于[频繁爬取](#并发配置)**
280 |
281 | * `UPDATE_INTERVAL`:更新间隔,单位为次
282 |
283 | * 在隔多少次数的总 AOI 访问后进行:(1)文件保存、(2)阶段性爬取状态统计
284 |
285 | * `USE_FIRST_UID`:是否使用第一个 uid,`1` 代表是,`0` 代表否
286 |
287 | * 如果使用,那么把百度地点检索 API 结果中的第一个可用 uid 当作可能的最终匹配 AOI(还需要验证是否符合 AOI 筛选条件),即只采用百度检索结果排序来进行筛选
288 |
289 | * 如果不使用,那么将对 API 检索结果中所有 uid 进行抓取,然后再根据 `FILTER_RULES` 中设立的规则和 AOI 自身的筛选条件进行筛选
290 |
291 | * 开启后,会加快整体爬取速度,但是会造成一定的不匹配和遗漏,因为百度的最佳搜索结果并不一定是准确的
292 |
293 | #### API 参数配置
294 |
295 | * `API_PARAMS`:百度地点检索 API 的参数,包括以下几类:
296 |
297 | * **行业分类参数**(`prim_ind` 和 `sec_ind`)
298 |
299 | * 需要按照[百度标准](https://lbsyun.baidu.com/index.php?title=open/poitags)对类别进行重分类
300 |
301 | * 如果不需要开启,将值设为 `''`,可以只开启一个,也可以同时开启两个
302 |
303 | * 如果需要开启,并且所有 POI 是同一种类的情况下,直接填入对应类别即可,例如:
304 |
305 | ```python
306 | API_PARAMS = {
307 | 'prim_ind': '房地产',
308 | 'sec_ind': '住宅区',
309 | }
310 | ```
311 |
312 | * 如果需要开启,但是 POI 类别不同,需要:(1)将参数值设置为 `VAR`(作为变量输入的意思);(2)在 csv 文件中添加对应的列
313 |
314 | * **搜索半径参数**
315 |
316 | * `radius`:搜索半径大小,单位为米。建议将值设置的稍微大一点避免遗漏,例如 2000(m)
317 |
318 | * `radius_limit`:是否限制结果一定要在搜索半径内,`true` 代表是,`false` 代表否,建议开启
319 |
320 | * **坐标系参数**
321 |
322 | * `crs`:POI csv 中 `lng` 和 `lat` 的坐标系,可以为 `bd09`(百度坐标系)、`gcj02`(火星坐标系)、`wgs84`(GPS 坐标系)
323 |
324 | * 各坐标系含义可以参考[这篇文章](https://www.jianshu.com/p/559029832a67)。简略地说:如果坐标来源于百度地图,那么就是 `bd09`;如果坐标来源于高德地图,那么就是 `gcj02`;如果坐标来源于 GPS,那么就是 `wgs84`
325 |
326 | * 如果你不知道手上数据的坐标系,可以挑选几个点,然后使用例如[地图坐标系转换 - 在线工具](https://tool.lu/coordinate)、[经纬度查询定位](https://www.lddgo.net/convert/position)等等工具查看比对一下
327 |
328 | * `AK_LIST`:百度地点检索 API 的密钥列表,可以有多个但至少提供一个
329 |
330 | * 密钥每天仅可以免费使用地点检索功能 100 次,**按 30 元 / 万次收费,每次最低充值 100 元**
331 |
332 | * 曾经有说法是使用 “多个密钥 + 多 ip” 可以累加每日免费爬取额度[^10],但经测试多个密钥之间是共享使用额度的、并且多 ip 并没有奏效,**因此只提供一个可用密钥即可**
333 |
334 | #### AOI 筛选配置
335 |
336 | `FILTER_RULES` 包括以下几类:
337 |
338 | * **检验 AOI 是否合法的规则**
339 |
340 | * 面积筛选规则
341 |
342 | * `min_aoi_area`:最小 AOI 面积,单位为平方公里,设置为 0 则不限制
343 |
344 | * `max_aoi_area`:最大 AOI 面积,单位为平方公里,设置为很大的正数则不限制,例如 10000(km^2)
345 |
346 | * 文本相似度规则
347 |
348 | * `min_similarity`:最小文本相似度,设置为 0 则不限制,最大值为 1
349 |
350 | * 文本相似度选择使用 Python 自带库 `difflib` 中的 `SequenceMatcher` 计算,计算原理可以参考[这里](https://stackoverflow.com/questions/35517353/how-does-pythons-sequencematcher-work)
351 |
352 | * 这一规则是为了筛除和原始 POI 名称过于不同的 AOI,需要将参数值设置成较小的正数,例如 0.1
353 |
354 | * 然而,会出现一些特殊情况,例如对北京 “北太平庄路 2 号院” 这一小区进行检索,在不限制文本相似度的情况下,会返回 “月华轩小区” 的 AOI。从名称上看,两者毫无干系,然而查看地图会发现,这两者很可能就是同一个小区。因此需要**自行定夺是否开启筛选**
355 |
356 | ![similarity_problem](images/similarity_problem.png)
357 |
358 | * **AOI 合法后,对所有可能的 AOI 的排序规则**
359 |
360 | * `sort_by_search_rank`:是否按照百度地点检索 API 的搜索排名进行排序。`0` 代表否;`1` 代表是,并且排名越高(排名数字越小)越好
361 |
362 | * `sort_by_area`:是否按照 AOI 面积进行排序。`0` 代表否;`1` 代表是,并且面积越小越好;`-1` 代表是,并且面积越大越好(与 `settings.py` 中的注释及示例二保持一致)
363 |
364 | * `sort_by_distance`:是否按照 AOI 到 POI 点的距离进行排序。`0` 代表否;`1` 代表是,并且距离越近越好
365 |
366 | * `sort_by_similarity`:是否按照文本相似度进行排序。`0` 代表否;`1` 代表是,并且相似度越高越好
367 |
368 | ## 项目结构
369 |
370 | ```text
371 | BaiduAOISpider
372 | ├── README.md
373 | ├── baidu_aoi_spider
374 | │ ├── middlewares.py 中间件
375 | │ ├── settings.py 各项设置
376 | │ └── spiders
377 | │ ├── baidu_aoi.py 百度地图爬虫
378 | │ └── examples.py 示例爬虫
379 | ├── data
380 | │ ├── AOI_example1 示例 1 爬取的 shp 格式数据
381 | │ │ ├── AOI_example1.cpg
382 | │ │ ├── AOI_example1.dbf
383 | │ │ ├── AOI_example1.prj
384 | │ │ ├── AOI_example1.shp
385 | │ │ └── AOI_example1.shx
386 | │ ├── AOI_example2 示例 2 爬取的 shp 格式数据
387 | │ │ ├── AOI_example2.cpg
388 | │ │ ├── AOI_example2.dbf
389 | │ │ ├── AOI_example2.prj
390 | │ │ ├── AOI_example2.shp
391 | │ │ └── AOI_example2.shx
392 | │ ├── POI_example1.csv 示例 1 原始数据及爬取的 csv 格式数据
393 | │ └── POI_example2.csv 示例 2 原始数据及爬取的 csv 格式数据
394 | ├── env.yaml conda 环境配置文件
395 | ├── images
396 | │ ├── AOI_Peking_University.png
397 | │ ├── running_process.png
398 | │ └── similarity_problem.png
399 | ├── processor
400 | │ ├── __init__.py
401 | │ ├── aoi_container.py AOI 容器类,用于存储、处理 AOI 数据
402 | │ ├── api_handler.py 百度地图 API 处理类
403 | │ ├── counter.py 计数器类
404 | │ ├── file_operator.py 文件操作类
405 | │ ├── logger.py 日志类
406 | │ ├── repository.py 仓库类,用于存放爬虫用到的各类设置和文件
407 | │ └── validator.py 验证器类
408 | ├── scrapy.cfg
409 | └── spatial
410 | ├── coords.py 坐标处理函数
411 | └── geometry.py 几何处理函数
412 | ```
413 |
414 | ## 示例说明
415 |
416 | 在 `examples.py` 中提供了两个示例 `example1` 和 `example2`,分别展示了两种不同使用情景:
417 |
418 | * **示例一:**
419 |
420 | * 目标是北京的一些住宅小区 AOI
421 |
422 | * 在默认设置上更新了:限制单一 POI 类别(房地产 + 住宅区)、AOI 最大面积 1 平方公里、最小文本相似度 0.1
423 |
424 | ```python
425 | updating_settings = dict(
426 | POI_CSV_PATH = 'data/POI_example1.csv',
427 | AOI_SHP_PATH = 'data/AOI_example1/AOI_example1.shp',
428 | PROXY_ENABLED = False,
429 | UPDATE_INTERVAL = 20,
430 | API_PARAMS = {
431 | 'prim_ind': '房地产',
432 | 'sec_ind': '住宅区',
433 | },
434 | FILTER_RULES = {
435 | 'max_aoi_area': 1,
436 | 'min_similarity': 0.1,
437 | },
438 | )
439 | ```
440 |
441 | * **示例二:**
442 |
443 | * 目标是一些分布在全国的不同类别、较大型的 AOI,例如大学校园、风景区、体育场馆等等,并且坐标来源是百度地图
444 |
445 | * 在默认设置上更新了:在 csv 文件中指定了 POI 的类别、给定坐标系为 `bd09`、最小 AOI 面积为 0.02 平方公里、开启按照面积从大到小排序
446 |
447 | ```python
448 | updating_settings = dict(
449 | POI_CSV_PATH = 'data/POI_example2.csv',
450 | AOI_SHP_PATH = 'data/AOI_example2/AOI_example2.shp',
451 | PROXY_ENABLED = False,
452 | UPDATE_INTERVAL = 20,
453 | API_PARAMS = {
454 | 'prim_ind': 'VAR',
455 | 'sec_ind': 'VAR',
456 | 'crs': 'bd09',
457 | },
458 | FILTER_RULES = {
459 | 'min_aoi_area': 0.02,
460 | 'sort_by_area': -1,
461 | }
462 | )
463 | ```
464 |
465 | ## 参考
466 |
467 | 1. 主体框架部分参考:[ResidentialAreaBoundary](https://github.com/XuCQ/ResidentialAreaBoundary) 项目
468 |
469 | 2. AOI 爬取的一些相关讨论:[《利用名称爬取百度AOI》](https://www.cnblogs.com/zhangqinglan/p/13301425.html)、[《黑科技 | 百度地图抓取地块功能(上)》](https://zhuanlan.zhihu.com/p/111256406)、[《黑科技 | 百度地图获取地块功能属性(下)》](https://zhuanlan.zhihu.com/p/111263995)
470 |
471 | 3. 坐标计算相关函数 `coords.py` 参考 [CoordinatesConverter](https://github.com/dickwxyz/CoordinatesConverter) 项目
472 |
473 | [^1]: 兴趣面(area of interest,简称 AOI),也叫信息面,指的是地图数据中的区域状的地理实体。参考来源:[百度百科](https://baike.baidu.com/item/%E5%85%B4%E8%B6%A3%E9%9D%A2/61305284?fr=aladdin)
474 |
475 | [^2]: ESRI Shapefile(shp),或简称 shapefile,是美国环境系统研究所公司(ESRI)开发的一种空间数据开放格式,该文件格式已经成为了地理信息软件界的一个开放标准。参考来源:[百度百科](https://baike.baidu.com/item/shapefile%E6%96%87%E4%BB%B6/11041662?fr=aladdin)
476 |
477 | [^3]: 随机 user-agent 和 随机 cookie 默认开启使用,前者通过 `scrapy-fake-useragent` 包实现,后者在 `middlewares.py` 中的 `get_cookie` 中实现;随机 IP 代理需要在 `settings.py` 中选择是否开启
478 |
479 | [^4]: 在结果文件中字段叫做 uid 名称,因为每一个 uid 对应一个 AOI 查询的url,进而对应一个几何形状,而获取 AOI 的 url 返回的数据里其实是没有名称的
480 |
481 | [^5]: WGS-84 坐标系(World Geodetic System 一 1984 Coordinate System)是一种国际上采用的地理坐标系。GPS 是以 WGS-84 坐标系为根据的(因此 GPS 的经纬度就是在 WGS84 坐标系下的经纬度)。参考来源:[百度百科](https://baike.baidu.com/item/WGS-84%E5%9D%90%E6%A0%87%E7%B3%BB/730443)
482 |
483 | [^6]: 类似的项目往往采用最基础的 “行政区划区域检索” url,即还需要 POI 所在的市和区县作为参数,这在已经有每个 POI 经纬度定位的条件下是多余的
484 |
485 | [^7]: 如果两级分类都提供,就添加 `&tag=一级行业分类;二级行业分类&scope=2`;如果只提供了其中一级分类,就添加 `&tag=一级/二级行业分类&scope=2`。设置 `&scope=2` 会返回检索结果中每一个 uid 的详细信息包括行业分类信息,进而再次确认每个检索结果的行业分类是一致的
486 |
487 | [^8]: 四至点,即几何形状的左上、右上、右下、左下的四个顶点
488 |
489 | [^9]: 见[《Python 批量爬取高德 AOI 边界数据 + GIS 可视化(超详细)》](https://blog.csdn.net/Smart3S/article/details/88606789)的评论区
490 |
491 | [^10]: 见[《如何绕过百度地图 API 的调用次数限制?》](http://www.site-digger.com/html/articles/20160421/122.html)
492 |
--------------------------------------------------------------------------------
/baidu_aoi_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 | from typing import Optional, Union
4 |
5 | import requests
6 | from scrapy.downloadermiddlewares.retry import RetryMiddleware
7 | from scrapy.http.request import Request
8 | from scrapy.spiders import Spider
9 | from scrapy.utils.python import global_object_name
10 | from scrapy.utils.response import response_status_message
11 |
12 |
class BaiduAOIMiddleware(RetryMiddleware):
    """Retry middleware that gives every (re)try a fresh cookie and proxy.

    Each outgoing request receives a random ``BAIDUID`` cookie and, when
    ``request.meta["proxy_enabled"]`` is truthy, a proxy fetched from a local
    proxy-pool HTTP service. When a request is about to be retried, the proxy
    it used is reported to the pool for deletion and a new cookie/proxy pair
    is assigned before the retry request is built.
    """

    # Base URL of the local proxy-pool service
    # (built with reference to https://github.com/jhao104/proxy_pool).
    PROXY_POOL_URL = "http://127.0.0.1:5000"
    # Seconds to wait for the proxy pool; these calls are blocking, so they
    # must not be allowed to hang forever.
    PROXY_POOL_TIMEOUT = 5

    def get_proxy(self) -> str:
        """
        proxy pool is built with reference to https://github.com/jhao104/proxy_pool

        Returns the proxy as an ``http://host:port`` URL.
        """
        proxy = requests.get(
            f"{self.PROXY_POOL_URL}/get/", timeout=self.PROXY_POOL_TIMEOUT
        ).json()
        return f'http://{proxy["proxy"]}'

    def delete_proxy(self, proxy) -> None:
        """Ask the proxy pool to drop ``proxy``.

        ``proxy`` may be given either as the pool's own entry or as the
        ``http://...`` URL produced by :meth:`get_proxy`.
        """
        # get_proxy() prepends "http://" to the pool's entry, so strip any
        # scheme again; otherwise the pool is asked to delete a key it never
        # stored and the failed proxy stays in rotation.
        address = proxy.split("://", 1)[-1]
        requests.get(
            f"{self.PROXY_POOL_URL}/delete/?proxy={address}",
            timeout=self.PROXY_POOL_TIMEOUT,
        )

    def get_cookie(self) -> str:
        """
        It is observed that `BAIDUID` cookie value
        is made up of 32 random numbers and letters.

        Returns a value of the form ``<32 chars from 0-9A-F>:FG=1``.
        """
        alphabet = string.ascii_uppercase[:6] + string.digits
        bd_id = "".join(random.choices(alphabet, k=32))
        return f"{bd_id}:FG=1"

    def alter_proxy_and_cookie(self, request):
        """Give ``request`` a new cookie and, if proxies are enabled, a new proxy.

        The proxy used so far (if any) is reported to the pool for deletion
        before a replacement is fetched. Returns the same, mutated request.
        """
        request.cookies["BAIDUID"] = self.get_cookie()
        if request.meta.get("proxy_enabled"):
            failed_proxy = request.meta.get("proxy")
            if failed_proxy:
                self.delete_proxy(failed_proxy)
            request.meta["proxy"] = self.get_proxy()
        return request

    def process_request(self, request, spider):
        """Prepare an outgoing request: headers, timeouts, cookie and proxy."""
        request.headers["Connection"] = "close"
        request.meta["dont_redirect"] = True
        request.meta["download_timeout"] = 15
        request.cookies["BAIDUID"] = self.get_cookie()
        if request.meta.get("proxy_enabled"):
            request.meta["proxy"] = self.get_proxy()

    def process_response(self, request, response, spider):
        """Retry responses whose status is in ``retry_http_codes``.

        Returns a new retry request, or the original response when the
        request opted out of retries, the status is not retryable, or the
        retry budget is exhausted.
        """
        if request.meta.get("dont_retry", False):
            return response
        if response.status in self.retry_http_codes:
            request = self.alter_proxy_and_cookie(request)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """Retry requests that failed with one of ``EXCEPTIONS_TO_RETRY``.

        Returns a new retry request, or ``None`` when the exception is not
        retryable, the request opted out, or the retry budget is exhausted.
        """
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) and not request.meta.get(
            "dont_retry", False
        ):
            request = self.alter_proxy_and_cookie(request)
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        """Build the retry request for ``request``, or return ``None``.

        ``get_retry_request`` reports an exhausted retry budget with a message
        string. That string must not leak out of ``process_response`` /
        ``process_exception`` (Scrapy only accepts Request/Response/None from
        them), so anything that is not a ``Request`` is mapped to ``None``.
        """
        max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
        priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
        outcome = get_retry_request(
            request,
            reason=reason,
            spider=spider,
            max_retry_times=max_retry_times,
            priority_adjust=priority_adjust,
        )
        return outcome if isinstance(outcome, Request) else None
80 |
81 |
def get_retry_request(
    request: Request,
    *,
    spider: Spider,
    reason: Union[str, Exception] = "unspecified",
    max_retry_times: Optional[int] = None,
    priority_adjust: Optional[int] = None,
    stats_base_key: str = "retry",
):
    """
    Copied from scrapy source code and made minor changes.

    Build the next retry attempt for ``request``.

    The retry limit is taken from ``max_retry_times``, then from
    ``request.meta["max_retry_times"]``, then from the ``RETRY_TIMES``
    setting. ``priority_adjust`` falls back to ``RETRY_PRIORITY_ADJUST``.

    Returns a copy of ``request`` with ``meta["retry_times"]`` incremented,
    ``dont_filter`` set and the priority adjusted while the limit is not
    exceeded; otherwise returns a "Gave up retrying ..." message string.
    Retry statistics are recorded under ``stats_base_key`` in both cases.
    """
    crawler = spider.crawler
    settings = crawler.settings
    stats = crawler.stats
    attempt = request.meta.get("retry_times", 0) + 1

    # Resolve the retry limit: argument -> request meta -> project settings.
    if max_retry_times is None:
        max_retry_times = request.meta.get("max_retry_times")
    if max_retry_times is None:
        max_retry_times = settings.getint("RETRY_TIMES")

    # Budget exhausted: record it and hand back the explanatory message.
    if attempt > max_retry_times:
        stats.inc_value(f"{stats_base_key}/max_reached")
        return f"Gave up retrying {request} (failed {attempt} times)"

    retry_request: Request = request.copy()
    retry_request.meta["retry_times"] = attempt
    # The retry targets a URL that has already been seen by the dupe filter.
    retry_request.dont_filter = True
    if priority_adjust is None:
        priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
    retry_request.priority = request.priority + priority_adjust

    # Normalise the reason into a string usable as a stats key.
    if callable(reason):
        reason = reason()
    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)

    for stat_key in (
        f"{stats_base_key}/count",
        f"{stats_base_key}/reason_count/{reason}",
    ):
        stats.inc_value(stat_key)
    return retry_request
120 |
--------------------------------------------------------------------------------
/baidu_aoi_spider/settings.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------------------------- #
#                      Default settings, no changes needed                     #
# ---------------------------------------------------------------------------- #

BOT_NAME = "baidu_aoi_spider"
SPIDER_MODULES = ["baidu_aoi_spider.spiders"]
NEWSPIDER_MODULE = "baidu_aoi_spider.spiders"

# Disobey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable cookies
COOKIES_ENABLED = True

# Override the default request headers
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;"
    "q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
}

# Enable or disable downloader middlewares
# The built-in user-agent middleware lives in `scrapy.downloadermiddlewares`
# (plural); a misspelled key is silently ignored and disables nothing.
# NOTE(review): Scrapy's built-in RetryMiddleware stays enabled next to
# BaiduAOIMiddleware (itself a RetryMiddleware subclass) - confirm whether it
# should be disabled here as well.
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 100,
    "baidu_aoi_spider.middlewares.BaiduAOIMiddleware": 200,
}

# Retry settings
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 403, 400, 302, 301]
RETRY_PRIORITY_ADJUST = -1

# Log level settings
LOG_LEVEL = "WARNING"

# Settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# ---------------------------------------------------------------------------- #
#                                Custom Settings                               #
# ---------------------------------------------------------------------------- #

# ------------------------------ 1. Concurrency ------------------------------ #

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Maximum for baidu map is 30 QPS
CONCURRENT_REQUESTS = 25

# Configure a delay for requests for the same website (default: 0)
DOWNLOAD_DELAY = 0.1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 10

# --------------------------------- 2. Basics -------------------------------- #

# File path settings
POI_CSV_PATH = "data/POI.csv"
AOI_SHP_PATH = "data/AOI/AOI.shp"

# Spider settings
PROXY_ENABLED = True
UPDATE_INTERVAL = 150  # how many AOI API calls before updating the output file
USE_FIRST_UID = False

# ---------------------- 3. Baidu Map uid API parameters --------------------- #

# Detailed information can be found at:
# https://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi

# In this project, circular area search is used instead of administrative area search,
# which finds POIs within a certain radius of the input coordinate point.
# The former is more efficient as regional information is not needed.
# However, it may leave out some POIs if the radius is not set appropriately.

API_PARAMS = {
    # 1. Primary/Secondary industry classification (Optional)
    # Please re-classify according to Baidu standard:
    # see https://lbsyun.baidu.com/index.php?title=open/poitags
    "prim_ind": "",  # (1) ''; (2) a string; (3) 'VAR'
    "sec_ind": "",  # same as above
    # 2.1 Retrieval radius
    # To avoid missing out POIs, it is recommended to set it slightly higher, e.g. 2000.
    "radius": 2000,  # unit: meters
    # 2.2 Radius limit
    # It is said that only POIs within retrieval radius will be returned if set to 'true'.
    "radius_limit": "true",  # 'true' or 'false'
    # 3. Coordinate system
    "crs": "wgs84",  # 'gcj02', 'bd09' or 'wgs84'
}

# Baidu access key list, you can get it from https://lbsyun.baidu.com/apiconsole/key
# at least provide one access key
AK_LIST = [
    "your aks",
]

# ---------------------------- 4. AOI filter rules --------------------------- #

FILTER_RULES = {
    # 1. Upper and lower bound of AOI area, in square kilometers
    "min_aoi_area": 0,  # Set to 0 to disable
    "max_aoi_area": 10000,  # Set to a large number to disable, e.g. 10000
    # 2. Lowest name similarity between AOI and POI
    "min_similarity": 0,  # 0 to disable, maximum is 1
    # 3. AOI sorting rules, set to 0 to disable
    "sort_by_search_rank": 1,  # the higher the rank, the more relevant, the better
    "sort_by_area": 0,  # 1 for the smaller the better, -1 for the bigger the better
    "sort_by_distance": 1,  # the closer the AOI's geometry to POI, the better
    "sort_by_similarity": 1,  # the more similar the name of AOI to POI, the better
}
--------------------------------------------------------------------------------
/baidu_aoi_spider/spiders/baidu_aoi.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import signals
3 | from scrapy.http import Request
4 |
5 | from processor import (
6 | AOIContainer,
7 | APIHandler,
8 | Counter,
9 | FileOperator,
10 | Logger,
11 | Repo,
12 | Validator,
13 | )
14 |
15 |
class BaiduAOISpider(scrapy.Spider):
    """Spider that resolves each POI to candidate uids and fetches their AOIs.

    The crawl runs in two stages: ``start_requests``/``parse_uid`` request the
    uid search results for every POI, and ``parse_aoi`` collects the polygon
    geometry for every uid, choosing the best AOI once all AOI requests of a
    POI have been answered.

    Subclasses customise a run by overriding ``name`` and
    ``updating_settings``; the latter is deep-merged over the crawler settings
    in ``__init__``.
    """

    name = "BaiduAOI"
    # (possibly nested) overrides applied on top of the crawler settings
    updating_settings = {}
    allowed_domains = ["map.baidu.com", "api.map.baidu.com"]

    # ------------------------------ initialization ------------------------------ #

    @classmethod
    def from_crawler(cls, crawler):
        """Create the spider from ``crawler`` and hook up ``close_spider``.

        The crawler settings are handed to ``__init__`` as a plain dict, and
        ``close_spider`` is connected to the ``spider_closed`` signal.
        """
        spider = super().from_crawler(crawler, crawler.settings.copy_to_dict())
        crawler.signals.connect(spider.close_spider, signal=signals.spider_closed)
        return spider

    def __init__(self, settings):
        """Prepare settings, input file, counters and the AOI container.

        Args:
            settings: crawler settings as a dict; ``updating_settings`` is
                deep-merged over it before anything else happens.
        """
        # let scrapy.Spider perform its own initialisation (name check, start_urls)
        super().__init__()
        settings = self.deep_update(settings, self.updating_settings)
        # import and validate settings
        Repo.import_settings(settings)
        Validator.validate_settings()
        # load file and validate it
        Repo.load_file()
        Validator.validate_file()
        # prepare file for writing
        FileOperator.add_cols()
        FileOperator.convert_crs_to_wgs84()
        # counter and AOI container initialization
        Counter.boot()
        AOIContainer.mold()

    # -------------------------------- main spider ------------------------------- #

    def start_requests(self):
        """Log the start of the crawl and yield one uid request per (idx, url) pair."""
        Logger.log_start()
        # idx_url_tuples is of the form [(idx1, url1), (idx2, url2), ...]
        idx_url_tuples = APIHandler.assemble_uid_urls()
        for idx, url in idx_url_tuples:
            yield self.request_uid(url, idx=idx)

    def parse_uid(self, response, idx):
        """Handle a uid search response and yield one AOI request per uid found.

        When no uid is found, the row ``idx`` of ``Repo.file`` is marked
        "No Uid". Any exception raised while handling the response is passed
        to ``Logger.log_uid_fail`` instead of propagating.
        """
        try:
            self.check_retry_times(response)
            # uid_name_rank_triples is of the form:
            # [(uid_name1, uid1, search_rank1), (uid_name2, uid2, search_rank2), ...]
            uid_name_rank_triples = APIHandler.extract_uid_name_rank(idx, response)
            if uid_name_rank_triples:
                # record how many uids are available for this POI
                Counter.write_aoi_total_num(idx, len(uid_name_rank_triples))
                # when `USE_FIRST_UID` is on, the list is expected to hold only
                # the first search result (handled upstream - confirm in APIHandler)
                for uid_name, uid, rank in uid_name_rank_triples:
                    url = APIHandler.assemble_aoi_url(uid)
                    yield self.request_aoi(url, idx=idx, uid_name=uid_name, rank=rank)
            else:
                # no uid found, skip this POI
                Repo.file.loc[idx, "status"] = "No Uid"
                Logger.log_progress()
        except Exception as e:
            Logger.log_uid_fail(e, idx)

    def parse_aoi(self, response, idx, uid_name, rank):
        """Handle an AOI response and finalise the POI once all its AOIs are in.

        A truthy geometry is appended to the AOI container. Whether or not the
        response could be handled, the AOI is counted as called; when every AOI
        of the POI has been called, the best AOI is written out, or the row is
        marked "No Geometry" if there is none.
        """
        try:
            self.check_retry_times(response)
            geometry = APIHandler.get_polygon_geometry(response)
            # if geometry exists and is valid,
            # append it to the AOI list of this POI
            if geometry:
                AOIContainer.append(idx, rank, uid_name, geometry)
        except Exception as e:
            Logger.log_aoi_fail(e, idx, uid_name)
        finally:
            # count that one AOI of this POI is called (also after a failure,
            # so the POI can still be completed)
            Counter.count_aoi_called(idx)
            # if all AOIs of this POI are called,
            # find the best AOI and record it if exists
            if Counter.all_aoi_called(idx):
                best_aoi = AOIContainer.get_best_aoi(idx)
                if best_aoi:
                    FileOperator.write_aoi_and_status(idx, best_aoi)
                else:
                    Repo.file.loc[idx, "status"] = "No Geometry"
                Logger.log_progress()
                # update file periodically
                if Counter.reach_update_interval():
                    FileOperator.save_file()
                    Logger.log_update()

    def close_spider(self):
        """Log the end of the crawl and save the file one last time."""
        Logger.log_finish()
        FileOperator.save_file()

    # ---------------------------------- utility --------------------------------- #

    def request(self, url: str, **kwargs) -> Request:
        """Build a request for ``url`` with the options shared by all requests.

        Args:
            url: target URL.
            **kwargs: extra ``scrapy.Request`` arguments (callback, headers, ...).
                Must not contain ``dont_filter`` or ``meta``, which are set here.
        """
        return scrapy.Request(
            url=url,
            # never drop a request as a duplicate
            dont_filter=True,
            # expose the proxy switch to downstream components via request meta
            meta={"proxy_enabled": Repo._proxy_enabled},
            **kwargs,
        )

    def request_uid(self, url: str, **kwargs) -> Request:
        """Build a uid search request; ``kwargs`` are forwarded to ``parse_uid``."""
        params = dict(
            callback=self.parse_uid,
            headers={"Host": "api.map.baidu.com"},
            cb_kwargs=dict(**kwargs),
        )
        return self.request(url, **params)

    def request_aoi(self, url: str, **kwargs) -> Request:
        """Build an AOI request; ``kwargs`` are forwarded to ``parse_aoi``."""
        params = dict(
            callback=self.parse_aoi,
            headers={"Host": "map.baidu.com"},
            cb_kwargs=dict(**kwargs),
        )
        return self.request(url, **params)

    def check_retry_times(self, response) -> None:
        """Raise if ``response`` is a "Gave up retrying..." message string.

        Presumably such a string is substituted for the response once retries
        are exhausted (verify against the project's middlewares).

        Raises:
            Exception: carrying the message, when ``response`` is a string
                starting with "Gave up retrying".
        """
        if isinstance(response, str) and response.startswith("Gave up retrying"):
            raise Exception(response)

    def deep_update(self, base_dict: dict, updating_dict: dict) -> dict:
        """Return a copy of ``base_dict`` recursively updated with ``updating_dict``.

        Dict values in ``updating_dict`` are merged key by key into the
        corresponding dict of ``base_dict``; every other value replaces the
        base value. Neither argument is modified.

        Args:
            base_dict: the dict providing the default values.
            updating_dict: the (possibly nested) overrides.

        Returns:
            A new dict holding the merged result.
        """
        updated_dict = base_dict.copy()
        for k, v in updating_dict.items():
            if isinstance(v, dict):
                current = updated_dict.get(k)
                # merge only when the base value is a dict too; a non-dict base
                # value (None, number, string, ...) is simply replaced
                base = current if isinstance(current, dict) else {}
                updated_dict[k] = self.deep_update(base, v)
            else:
                updated_dict[k] = v
        return updated_dict
146 |
--------------------------------------------------------------------------------
/baidu_aoi_spider/spiders/examples.py:
--------------------------------------------------------------------------------
1 | from baidu_aoi_spider.spiders.baidu_aoi import BaiduAOISpider
2 |
3 |
class Example1(BaiduAOISpider):
    """Example spider run on ``data/POI_example1.csv``.

    Overrides the input/output paths, turns the proxy off, sets the update
    interval to 20, fixes the ``prim_ind``/``sec_ind`` API parameters and
    tightens the AOI filter (maximum area 1, minimum name similarity 0.1).
    """

    name = "example1"
    updating_settings = {
        "POI_CSV_PATH": "data/POI_example1.csv",
        "AOI_SHP_PATH": "data/AOI_example1/AOI_example1.shp",
        "PROXY_ENABLED": False,
        "UPDATE_INTERVAL": 20,
        "API_PARAMS": dict(prim_ind="房地产", sec_ind="住宅区"),
        "FILTER_RULES": dict(max_aoi_area=1, min_similarity=0.1),
    }
20 |
21 |
class Example2(BaiduAOISpider):
    """Example spider run on ``data/POI_example2.csv``.

    Overrides the input/output paths, turns the proxy off, sets the update
    interval to 20, sets ``prim_ind``/``sec_ind`` to "VAR" with the "bd09"
    coordinate system, and filters AOIs with a minimum area of 0.02 while
    preferring bigger AOIs (``sort_by_area`` of -1).
    """

    name = "example2"
    updating_settings = {
        "POI_CSV_PATH": "data/POI_example2.csv",
        "AOI_SHP_PATH": "data/AOI_example2/AOI_example2.shp",
        "PROXY_ENABLED": False,
        "UPDATE_INTERVAL": 20,
        "API_PARAMS": dict(prim_ind="VAR", sec_ind="VAR", crs="bd09"),
        "FILTER_RULES": dict(min_aoi_area=0.02, sort_by_area=-1),
    }
39 |
--------------------------------------------------------------------------------
/data/AOI_example1/AOI_example1.cpg:
--------------------------------------------------------------------------------
1 | UTF-8
--------------------------------------------------------------------------------
/data/AOI_example1/AOI_example1.dbf:
--------------------------------------------------------------------------------
1 | { Q name C P lng N lat N status C P uid_name C P lng_wgs84 N lat_wgs84 N
弘善家园 116.444730045491994 39.869354135514598Matched 弘善家园 116.444730045491994 39.869354135514598 望京西园一区 116.469525721766004 40.002457242763597Matched 望京西园(一区) 116.469525721766004 40.002457242763597 沿海赛洛城七期 116.498423360228003 39.896920374397297Matched 沿海赛洛城 116.498423360228003 39.896920374397297 马甸南村 116.371405804933005 39.965353871917898Matched 马甸南村小区 116.371405804933005 39.965353871917898 农光东里 116.463734222306002 39.881733030113701Matched 农光东里 116.463734222306002 39.881733030113701 国美第一城3号院 116.509456647947005 39.932530647759499Matched 国美第一城-3号院 116.509456647947005 39.932530647759499
--------------------------------------------------------------------------------
/data/AOI_example1/AOI_example1.prj:
--------------------------------------------------------------------------------
1 | GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
--------------------------------------------------------------------------------
/data/AOI_example1/AOI_example1.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/data/AOI_example1/AOI_example1.shp
--------------------------------------------------------------------------------
/data/AOI_example1/AOI_example1.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/data/AOI_example1/AOI_example1.shx
--------------------------------------------------------------------------------
/data/AOI_example2/AOI_example2.cpg:
--------------------------------------------------------------------------------
1 | UTF-8
--------------------------------------------------------------------------------
/data/AOI_example2/AOI_example2.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/data/AOI_example2/AOI_example2.dbf
--------------------------------------------------------------------------------
/data/AOI_example2/AOI_example2.prj:
--------------------------------------------------------------------------------
1 | GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
--------------------------------------------------------------------------------
/data/AOI_example2/AOI_example2.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/data/AOI_example2/AOI_example2.shp
--------------------------------------------------------------------------------
/data/AOI_example2/AOI_example2.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/data/AOI_example2/AOI_example2.shx
--------------------------------------------------------------------------------
/data/POI_example1.csv:
--------------------------------------------------------------------------------
1 | name,lng,lat,status,uid_name,lng_wgs84,lat_wgs84,geometry
2 | 北三环中路,116.377422166388,39.9664223675039,No Geometry,,116.377422166388,39.9664223675039,
3 | 农光里,116.459305231953,39.8806912902182,No Geometry,,116.459305231953,39.8806912902182,
4 | 弘善家园,116.444730045492,39.8693541355146,Matched,弘善家园,116.444730045492,39.8693541355146,"POLYGON ((116.44078337745441 39.870237012705616, 116.44070423122652 39.86986988571904, 116.44068218927289 39.86977684984584, 116.44068220173138 39.869687854958045, 116.44071228930112 39.869597916333944, 116.44075551501588 39.86947000447372, 116.4409269738944 39.86925433787994, 116.44193362808304 39.86849218135793, 116.44227453868723 39.86823384527016, 116.44250214598202 39.86808928854748, 116.4425502729412 39.86807138171991, 116.4425853647234 39.86806444934341, 116.4426314842854 39.868063537796516, 116.44267359270633 39.868068618264765, 116.44273875837463 39.86809274195313, 116.44278586929651 39.86817382780052, 116.44282895717046 39.86834490081993, 116.44283898055535 39.86836391900073, 116.4428610352439 39.868382960316, 116.44288208918373 39.868389000461946, 116.44291216749129 39.86838905832518, 116.44294425177549 39.86838412035889, 116.44343955933056 39.86830798125391, 116.44471796217253 39.868302378659756, 116.44473912026591 39.86829442078959, 116.44476017882182 39.86828146302139, 116.44477321564266 39.868269489399744, 116.4447892644203 39.868230523270825, 116.44480532587147 39.86810256229265, 116.44480536272383 39.86784357725441, 116.44471317897742 39.86738842175318, 116.44471318196594 39.867367422963675, 116.44471318808502 39.86732442544307, 116.44473325047039 39.86726546834056, 116.44476133605079 39.86719552768758, 116.44477738226661 39.86717456051443, 116.4448064644901 39.867147619390906, 116.44600280070054 39.866608927302565, 116.44644404043876 39.86646382155762, 116.44757335903908 39.86605703535708, 116.44808182560682 39.8658829875232, 116.44946893902171 39.86572277469004, 116.44949300886053 39.86572372503045, 116.44951206383898 39.865726764760765, 116.44953412600731 39.86574081015946, 116.44956320432071 39.86578486852598, 116.44957122351055 39.865813883642105, 116.44961124068313 39.8665079270809, 116.4496320641765 39.86746591495154, 116.44962183753915 
39.86884081354086, 116.44960677047602 39.869002772538465, 116.44961378790481 39.86902378603219, 116.44962381538971 39.86903580636219, 116.44963484651875 39.86904282909026, 116.4496518954719 39.869047864561765, 116.45100598156009 39.86903453151988, 116.45103807661897 39.86903659995648, 116.45108320976409 39.86904369600117, 116.45112533332227 39.86905578537534, 116.45114639369112 39.86907182949672, 116.45116945717349 39.869107876754946, 116.4511814889793 39.86913590087879, 116.45118336598286 39.870033252810664, 116.45118135857426 39.87004324793402, 116.45116330339609 39.8700532087131, 116.4511402347468 39.87005315935528, 116.44959077778098 39.870024079743615, 116.44790031859192 39.87002137487737, 116.44629582298994 39.870024517777885, 116.44502273917851 39.87003398014537, 116.44443234891227 39.87004241700428, 116.44346839122963 39.87003593781707, 116.44342126745919 39.870034846590585, 116.44340422323197 39.87003081381761, 116.44338918513405 39.87002078528123, 116.443374148744 39.869998757444854, 116.44313063856502 39.86909133904469, 116.44310257098222 39.86905028725302, 116.44308653033944 39.8690412568342, 116.4430624680566 39.8690372106706, 116.44302938154514 39.86903814684413, 116.44218417348927 39.86917861890757, 116.44217815578911 39.86919460652176, 116.44220019829777 39.869294942790454, 116.44242265413955 39.870126019771256, 116.4424226509002 39.87014901844165, 116.44241162048738 39.870162996550626, 116.4423965814326 39.87016496769191, 116.44096081136205 39.87030824101216, 116.44090266559628 39.87030923194317, 116.44086958282584 39.8703091699602, 116.4408314889816 39.8702990991964, 116.44080843402888 39.87028005712858, 116.44078337745441 39.870237012705616))"
5 | 望京西园一区,116.469525721766,40.0024572427636,Matched,望京西园(一区),116.469525721766,40.0024572427636,"POLYGON ((116.46732626190584 40.003529262061946, 116.4673166318597 40.00350363731869, 116.46731342443728 40.00347642980118, 116.46731342866083 40.003446030990006, 116.46732588751001 40.003334868590386, 116.46732789706726 40.00331647467368, 116.46733191384779 40.00329648618414, 116.46734074698593 40.003279710443174, 116.46735118489354 40.0032689387616, 116.46735439633244 40.00326724741197, 116.46737085501336 40.00325819176646, 116.46739534100458 40.003254257400414, 116.46741862247379 40.003252719738526, 116.46744511488502 40.003253190609975, 116.46772167915825 40.00326793148776, 116.46775660089229 40.003270825179335, 116.46778148788273 40.00327129203514, 116.46780376592595 40.00327055194462, 116.46781982223841 40.00326959514966, 116.46783326970377 40.00326663142402, 116.46784731966918 40.00326086943245, 116.46785695490819 40.00324989577154, 116.46786297797789 40.00323551252757, 116.46786659361904 40.00321392308558, 116.46787101312555 40.00318473609888, 116.46787422974931 40.003145946246036, 116.46787624160484 40.00311115299902, 116.46787664756295 40.00307835535442, 116.46787790316334 40.002708373018265, 116.46788031814967 40.00266118134238, 116.46788353305628 40.002634791013364, 116.46789317122085 40.00260281818481, 116.46791003356645 40.00257886449054, 116.46792850098281 40.0025589149707, 116.46795900968041 40.00254539763718, 116.46799192638866 40.0025350866964, 116.46804491309332 40.00252722979428, 116.46809629300759 40.00252816831019, 116.46849689954756 40.00252215180671, 116.46876985806529 40.002530792586676, 116.4693333404094 40.0025572291599, 116.4697733900095 40.00259913546061, 116.46980309524075 40.00259841724759, 116.469823166609 40.0025960725954, 116.46983882290709 40.00258971594741, 116.46985407823163 40.00258015832292, 116.46986973630784 40.002561002175135, 116.46988740210308 40.0025374517327, 116.46990466637223 40.00251470016302, 116.47007569826353 
40.002326379052725, 116.47008693997937 40.00231281060554, 116.47009376610175 40.00229882998282, 116.47009778265112 40.002282041706415, 116.47009939071769 40.00226484679332, 116.47011066835472 40.001991288232716, 116.47011107303744 40.00196769023947, 116.47011107513825 40.00195249081806, 116.47010585879549 40.00193687699903, 116.47009743107043 40.001921254311455, 116.47008338254457 40.00191201586421, 116.47006331190984 40.00190836058745, 116.47003481108224 40.001907081966614, 116.46937287002783 40.00190446221306, 116.46862865238161 40.00187283420113, 116.46761389852814 40.00185249733471, 116.46758339234464 40.00184961567498, 116.46755368948746 40.001843136339815, 116.46752880380251 40.00183427002877, 116.46750191184977 40.00182019857211, 116.46748545623305 40.00180695504124, 116.46747140958627 40.001789718123696, 116.46746177941473 40.00176489333015, 116.46745656421062 40.0017432802267, 116.46744934256006 40.00171766191351, 116.46744854282397 40.001695660641204, 116.46744694307309 40.00165365801753, 116.46745263730318 40.00111849442461, 116.46745826210459 40.00108131094902, 116.46746308332382 40.00104972510164, 116.46746991065602 40.001024544369805, 116.46747834286778 40.00100456772652, 116.46749078926003 40.00098340187638, 116.46750684771027 40.000966245541605, 116.4675249130613 40.00094989455652, 116.46755903571558 40.00092478693696, 116.4676060040441 40.00089411400343, 116.46764012614298 40.0008734062988, 116.46767344479224 40.000857496287104, 116.46773285578567 40.000833656656994, 116.46778102509704 40.000827386249, 116.4678532784862 40.000822780590134, 116.46793034817863 40.00082298787322, 116.47117339953915 40.00087628299308, 116.47118744923088 40.000879122151765, 116.4711994911647 40.00088635553898, 116.47120310216951 40.00089996512136, 116.47120229693829 40.00091716222329, 116.47117533219902 40.0014170681101, 116.47117814028704 40.00143107543492, 116.47118656873869 40.00144269855948, 116.47120302691276 40.001446344432395, 116.47123032373584 
40.00145002062267, 116.4712451763084 40.00145286205661, 116.4712604289404 40.001465704243856, 116.47126444072225 40.00148411478028, 116.47123788359055 40.001939223575704, 116.4712008998406 40.00231750616146, 116.47120410831855 40.00233911433632, 116.47121614883646 40.00235674735789, 116.47123059904521 40.0023663874184, 116.47125227546675 40.00237284781983, 116.47136547896473 40.00237556463484, 116.47163323372085 40.002385915379996, 116.47182712487572 40.00240086009686, 116.4720687832922 40.00244833953183, 116.47209166512238 40.002449204088016, 116.47211173684235 40.00245046070961, 116.47212618794599 40.002455301340824, 116.4721350184829 40.00246332598793, 116.47214063674966 40.00247694136257, 116.47214143732297 40.00249374301406, 116.47214143529362 40.002508542472036, 116.47210962340236 40.00322562676488, 116.4721080104162 40.00327842031537, 116.47210680385598 40.00329481631996, 116.47210479496785 40.00330721020284, 116.47209837031818 40.00331919163141, 116.47208632444303 40.00333915690458, 116.47206223566157 40.00335748824551, 116.47202771072244 40.00336699047135, 116.4718952358908 40.00337141678111, 116.47181815916218 40.00337839944013, 116.47178122652528 40.003382295340444, 116.47173947559044 40.00339417743872, 116.47171378121179 40.003412504508184, 116.47169691534236 40.003453255612136, 116.47168004398937 40.003534005269614, 116.47166558629868 40.00357796304537, 116.47163989096462 40.003603489908265, 116.47160375867544 40.00362658755732, 116.4715563887784 40.003631254354346, 116.47148493191675 40.00364305337532, 116.47139500966999 40.00365160090534, 116.47081934983056 40.0037089903592, 116.4700012293264 40.003817318026556, 116.46964957897805 40.003856348028805, 116.46844090620353 40.00394624581784, 116.46841481440774 40.003947375085254, 116.46839313851622 40.00394571643906, 116.46837628010005 40.003940071002035, 116.4683626333497 40.00393163437504, 116.46835340232424 40.00392160976598, 116.46834818627751 40.00390519626875, 116.46834457597737 40.003887987149106, 
116.46834216984968 40.003871181273375, 116.46834056759144 40.00384677786213, 116.46833777151986 40.00374717407133, 116.46833938081117 40.00372077942772, 116.46833777982897 40.0036871763709, 116.46833377139087 40.0036463670733, 116.46831291298594 40.00353911471549, 116.46737603282642 40.003548994354084, 116.46734392151536 40.003543708704065, 116.46732626190584 40.003529262061946))"
6 | 沿海赛洛城七期,116.498423360228,39.8969203743973,Matched,沿海赛洛城,116.498423360228,39.8969203743973,"POLYGON ((116.49688014577212 39.894008165826065, 116.49686729782077 39.89400252728083, 116.49669705075914 39.89402041342518, 116.49668420250272 39.894017174771776, 116.49667135485184 39.89400913626605, 116.49666734126258 39.89399632455596, 116.49670530588728 39.892233294091184, 116.49670691357059 39.892220499344155, 116.49671253585632 39.892212516556846, 116.49672779422886 39.892207762737115, 116.49792031036105 39.892195350477806, 116.49849248030947 39.892155065532585, 116.4986940412785 39.89216686766323, 116.49886227505739 39.892186969410204, 116.49887432001046 39.892191805205655, 116.49887994017452 39.89220062169369, 116.49888154217749 39.892233425408115, 116.49888635910688 39.89224383944581, 116.49890081336706 39.89224708247814, 116.5012199733449 39.89223475966231, 116.50123201826548 39.89223879500167, 116.50123763827806 39.89224841122715, 116.50123924266059 39.892262015490665, 116.5012827313544 39.897680361128394, 116.50128112071624 39.89771635518418, 116.50123933314593 39.89795222438052, 116.5012369207958 39.89797781642693, 116.50124080296689 39.89901779294575, 116.50123914410717 39.89942897425264, 116.5012343242832 39.89944175965159, 116.50122227801536 39.89944812401889, 116.50120621711096 39.89945047671288, 116.4992452047587 39.89947027605924, 116.49923315940052 39.899468640284766, 116.4992251299541 39.899461416633855, 116.49921951054878 39.899447000381905, 116.49921710308874 39.89943419363188, 116.49924122198591 39.899219072296546, 116.49924604175801 39.89920708701806, 116.49925728490331 39.899202320612474, 116.4998121911447 39.89914237043241, 116.49982905592219 39.899134420693954, 116.49984029965604 39.8991248543394, 116.49997763433613 39.899003665277235, 116.49998807564299 39.8989892966769, 116.49999289601301 39.8989725115035, 116.49999129200025 39.89895650727583, 116.49979621135395 39.89851274353456, 116.49978737980987 39.89849831781116, 116.49977292650298 
39.89848787528141, 116.499751245463 39.89848061120467, 116.49915459338102 39.89844603937005, 116.4991401396275 39.89843879657981, 116.49913050512697 39.89842356838869, 116.49912729492863 39.898408359319525, 116.49910246376224 39.89791710113652, 116.49909684454687 39.89790108491767, 116.49908881560494 39.89788986136806, 116.49907516467975 39.897884220897, 116.49827373331594 39.897870629434536, 116.49825767179863 39.89787618121937, 116.49824964006545 39.89788655686892, 116.49824883476646 39.89790415390599, 116.49829209727775 39.89869465838422, 116.4982904893428 39.89870905312212, 116.49828326033281 39.8987218311039, 116.49827041086643 39.898728192480235, 116.49825756181279 39.89873135395596, 116.4972047739659 39.89873539696389, 116.49719272874955 39.89873216084946, 116.49718469936218 39.898724136955586, 116.49717988255652 39.898712922818206, 116.49714065680361 39.897756034346614, 116.497135036951 39.89774481778766, 116.49712540137386 39.89773758902608, 116.49711174986766 39.897735948009746, 116.4969398972316 39.89774823042944, 116.49692785159961 39.89774819416263, 116.49691660952422 39.89774416043672, 116.49691259565172 39.89773374866926, 116.49675540149715 39.89615652348967, 116.49672496831656 39.89551005161601, 116.49689056671706 39.89415539265354, 116.49689058207416 39.8940337964799, 116.49688817487005 39.89401858969825, 116.49688014577212 39.894008165826065))"
7 | 新外大街10号院,116.366213572074,39.959638744187,No Geometry,,116.366213572074,39.959638744187,
8 | 北太平庄路2号院,116.365300377415,39.9733513807324,No Geometry,,116.365300377415,39.9733513807324,
9 | 马甸南村,116.371405804933,39.9653538719179,Matched,马甸南村小区,116.371405804933,39.9653538719179,"POLYGON ((116.36944451228285 39.96628662447733, 116.36953263083635 39.96600315418367, 116.3695376255959 39.96599234991537, 116.36954521698217 39.965981343130885, 116.36955550470917 39.96597143366189, 116.36956689082372 39.965963023044345, 116.36957937532753 39.96595611127872, 116.36959255871962 39.96595089874413, 116.36960634113315 39.96594738553855, 116.36962042294748 39.96594567194732, 116.37000772074083 39.965944697742515, 116.37002310099572 39.965944182993404, 116.37003668392276 39.9659407701363, 116.37004787018111 39.96593535969508, 116.37005795809839 39.96592795042056, 116.37006654813882 39.96591894267454, 116.3700732408008 39.96590843683245, 116.37007793615113 39.96589693296198, 116.37008043442101 39.965884631243256, 116.37008274291153 39.96578943427264, 116.37008044736424 39.96577693716294, 116.37007435665858 39.9657647436769, 116.37006616844647 39.96575425211053, 116.37005648198316 39.96574526190086, 116.37004539711864 39.965737972943124, 116.3700333133402 39.96573238485427, 116.37002053021642 39.96572889732578, 116.37000734736156 39.96572751006989, 116.3696978497587 39.96571500953644, 116.36968376820873 39.96571412324178, 116.36967078578954 39.965708636140384, 116.36966279704663 39.96570164428029, 116.36965680590703 39.9656929505765, 116.36965321179909 39.965682954618615, 116.36965241411619 39.965672355978846, 116.36984054414319 39.9650530080972, 116.36984474022447 39.96504040474379, 116.36984913602464 39.965028001186404, 116.36985373156571 39.96501559743675, 116.36985862670866 39.96500329339293, 116.36986362166293 39.96499148922595, 116.3698703143121 39.96498058337087, 116.36987880446802 39.96497107570515, 116.36988879248426 39.96496326650158, 116.36990007859806 39.9649573559442, 116.36991226330272 39.96495354440737, 116.36992484722083 39.9649520323622, 116.36993763062321 39.964952720007446, 116.3701606416391 39.964970004900955, 116.37017602185406 
39.964970690143794, 116.3701911026445 39.9649698757635, 116.37020498509774 39.96496806258922, 116.37021976674819 39.96496344871648, 116.37023195194269 39.96495703742928, 116.37024343819434 39.96494932688547, 116.37025402573195 39.964940517264445, 116.37026361469384 39.9649305086653, 116.37027210517063 39.964919601166685, 116.37027939728765 39.964907794862626, 116.37038509212177 39.96466350770565, 116.37039168521265 39.96465130211141, 116.3703997762354 39.964640495016155, 116.37040966482067 39.96463098614149, 116.37042205010843 39.96462257483714, 116.37043523423411 39.96461546270591, 116.37044911730017 39.964609849832065, 116.37046349956171 39.964605736407485, 116.370478281134 39.964603222520275, 116.37049326228353 39.96460220836728, 116.3705082432377 39.96460289412605, 116.3706566550319 39.964610453331076, 116.37067123654249 39.964611639505776, 116.3706840201785 39.96461402731531, 116.37069660390424 39.96461781524032, 116.37070828850153 39.96462380389585, 116.37071797544533 39.96463119435927, 116.37072616408105 39.96464018614796, 116.37073255479488 39.96465067954871, 116.3707368480351 39.96466207487615, 116.37073894395985 39.96467407224011, 116.37073874272994 39.964686371749856, 116.37070045690913 39.96496779228316, 116.370699656386 39.96498049233704, 116.37069995485481 39.96499009152631, 116.37070314932541 39.965002687820956, 116.3707085411561 39.9650142821009, 116.37071453259662 39.965022875982115, 116.37072671624212 39.965031664021176, 116.37073969903629 39.965039251378734, 116.37075328125466 39.965045438254116, 116.37076756276205 39.96505032454755, 116.37078214408466 39.965053710648256, 116.37079702522222 39.96505559655519, 116.37081200642497 39.965055982456896, 116.37085794896915 39.96505613931667, 116.37087283043503 39.96505572538081, 116.37088721266728 39.965054211978924, 116.3708998971704 39.9650515002401, 116.37091208256693 39.9650465890938, 116.37092346917159 39.965039978792504, 116.37093385721747 39.965031769518035, 116.37094284717784 39.96502216163192, 
116.3709504390268 39.96501135512192, 116.37095623323535 39.96499955035069, 116.3709603296546 39.96498694721201, 116.37098361358693 39.964879831334656, 116.37098780989957 39.96486712811111, 116.37099390378157 39.96485502308145, 116.37100219475664 39.96484441591818, 116.37101318222886 39.96483510616549, 116.37102526818721 39.96482709531918, 116.371038252868 39.96482048356078, 116.37105183661012 39.96481557115419, 116.37106601942538 39.964812258106065, 116.37108040179862 39.96481064478329, 116.37201456661154 39.96479858623793, 116.3720692012801 39.96479973664593, 116.37208538192918 39.96480032196109, 116.37210196206749 39.96480130689948, 116.37211754329886 39.964803192700025, 116.37213222546661 39.96480627925402, 116.37214660784419 39.964810666013925, 116.3721604906804 39.96481625316511, 116.3721705780074 39.96482204373769, 116.372183761377 39.96483023138726, 116.37219384872739 39.964835921971954, 116.37220733200951 39.96484200947417, 116.37222241347867 39.964847495572876, 116.37223769484267 39.96485198155211, 116.37225327599555 39.96485536732729, 116.37226905704341 39.96485775298399, 116.37228493811749 39.96485903861688, 116.37230091920584 39.96485932422094, 116.3728227104577 39.964852358792776, 116.37283729369567 39.96485164591655, 116.37284967963605 39.96484963506114, 116.37286236520167 39.96484792392782, 116.37287465150108 39.964844013271204, 116.37288334229918 39.964837205954765, 116.37289243294025 39.96482789842211, 116.37290052485612 39.964817591828826, 116.37290741826286 39.9648063863444, 116.3729130132385 39.96479458204088, 116.37291720989762 39.964782179005745, 116.37292000821495 39.96476937722788, 116.37294270575387 39.96457256785761, 116.3729468024143 39.9645611648603, 116.3729531962919 39.96455095977033, 116.37296148782256 39.964542152932175, 116.37297137729344 39.96453524458261, 116.3729824651401 39.964530435065214, 116.37299435178943 39.96452802471692, 116.37300633802991 39.9645281141488, 116.37350726939742 39.9645308759268, 116.37352035468287 
39.964531164556135, 116.37353333981152 39.96453375315288, 116.37354562550509 39.96453824225815, 116.37355711184865 39.96454483194665, 116.37356729945931 39.96455302267753, 116.37357419041527 39.96456411610992, 116.37357928329993 39.96457591106306, 116.37358337726738 39.96458800686588, 116.37358587326358 39.96459810416005, 116.37358687061088 39.96461090260681, 116.37358746840518 39.9646237013995, 116.37358776653456 39.964636500451554, 116.37358776499902 39.96464929976285, 116.37358736390966 39.964662099420316, 116.37358676304427 39.96467489925092, 116.37358576262586 39.96468769942747, 116.37353205058021 39.96528931360718, 116.37353045083877 39.96530211430639, 116.37352825176973 39.965314915526335, 116.3735246543759 39.965326818006645, 116.37352015799144 39.96533872126659, 116.37351656056371 39.965350923731684, 116.37351406186816 39.96536342522779, 116.3735125620283 39.96537612584698, 116.37351216095617 39.965388825513834, 116.37351275876402 39.9654015243142, 116.37352643256334 39.96549220759574, 116.37352503280239 39.96550330821431, 116.37352103603813 39.96551371112173, 116.3735147420064 39.965522816091834, 116.37350635053345 39.965530222973065, 116.37349656088533 39.96553553118137, 116.37348207676568 39.965538743579316, 116.37319949527455 39.96554608953122, 116.37306594759842 39.965545906696356, 116.3730507649188 39.965546320023314, 116.37303558207088 39.965548233275435, 116.37302069873515 39.96555144619968, 116.3730061149115 39.965555958796344, 116.37299492704666 39.96556176833758, 116.37298463792823 39.96556947698709, 116.37297564714456 39.96557868441508, 116.37296805461588 39.96558909054744, 116.37296216001914 39.96560049513006, 116.37295786349314 39.96561269826162, 116.37295556461211 39.965625399605116, 116.37295214598008 39.96581369249027, 116.37295154513797 39.965826492331864, 116.37295064463935 39.96583929243753, 116.37294934459999 39.96585209289634, 116.37294764502062 39.96586489370762, 116.37294564579737 39.96587759478814, 116.37294324709335 
39.96588979624853, 116.37293785194271 39.96590110040039, 116.37293045924615 39.96591110638535, 116.37292126878924 39.96591971403328, 116.37291068017268 39.965926423020846, 116.37289909296304 39.96593103300537, 116.37288680682806 39.96593344372897, 116.37287432120694 39.965933554757065, 116.37148896598201 39.96592260535258, 116.37147368458264 39.96592291940963, 116.37146259772719 39.9659261294497, 116.37145250937004 39.96593173844207, 116.37144391893636 39.96593944594248, 116.37143722598408 39.96594885160186, 116.37143283008237 39.96595945507821, 116.37143440321427 39.96616734231892, 116.37143140553302 39.96617854447429, 116.37142611076779 39.966188848797565, 116.37141871872389 39.9661978551268, 116.37140952910835 39.96620496321984, 116.37139904133676 39.96620997262697, 116.37138775484826 39.96621248291071, 116.37092922074498 39.96621510900884, 116.37091423934932 39.96621562300239, 116.37089935765916 39.966217636827935, 116.3708847754137 39.96622125029027, 116.37087348894444 39.966225860615474, 116.37086170270882 39.96623377123282, 116.37085201380017 39.966242179858824, 116.370843623129 39.966251787204854, 116.37083673045727 39.96626249308674, 116.37083143566984 39.96627419741638, 116.3708278387014 39.96628640012566, 116.370826039426 39.96629910112134, 116.37082583858519 39.966308200812804, 116.37082393943476 39.96632090190242, 116.3708215413639 39.9663298036678, 116.37081734545923 39.96633940708224, 116.37080995341998 39.966350113438395, 116.37080076381392 39.9663593215656, 116.3707901761649 39.966366831100764, 116.37077839023823 39.96637254186333, 116.37076590544687 39.96637615340165, 116.37075302142652 39.96637756543945, 116.3695039246555 39.96636686209377, 116.3694883451672 39.96636617732067, 116.36947706037927 39.966362788513926, 116.36946677457581 39.96635699886811, 116.369457987038 39.966349307867574, 116.36945099735976 39.96633981521332, 116.36944620495456 39.96632902048733, 116.3694428110977 39.96631502457288, 116.36944211349505 39.96630272593174, 
116.3694436129825 39.9662906251346, 116.36944451228285 39.96628662447733))"
10 | 农光东里,116.463734222306,39.8817330301137,Matched,农光东里,116.463734222306,39.8817330301137,"POLYGON ((116.46515633226267 39.88184081372272, 116.46517701542618 39.882473338480466, 116.4651862071859 39.88276024933524, 116.4651852019621 39.88277304614255, 116.46518289227403 39.88278573956431, 116.465179278166 39.882798029614165, 116.46517445998042 39.88280991655304, 116.465168337389 39.88282130012673, 116.46516111112257 39.88283188086905, 116.46515217896201 39.882842757168085, 116.46514164143308 39.882852629345194, 116.46513000030215 39.88286119871815, 116.46511745626994 39.88286836581379, 116.46510410971118 39.882873930903905, 116.46509016132767 39.88287779451461, 116.46507581180964 39.88287995716847, 116.46506136217423 39.88288051964123, 116.46367483085352 39.882882154376375, 116.46366128486397 39.882882219868144, 116.46364793985414 39.88288018597549, 116.46363509684357 39.88287605346355, 116.46362295647099 39.88287012282968, 116.46361192011163 39.882862295099, 116.46360218839054 39.882852970762364, 116.46359606870156 39.88284525554581, 116.46359045107728 39.88283494172886, 116.46358663965054 39.88282413253087, 116.46358473460587 39.88281392815707, 116.46358383332831 39.8828011264566, 116.46358178811228 39.88235734189838, 116.46180801120525 39.882374282815555, 116.46179356299818 39.88237354697976, 116.46177921536912 39.88237111148216, 116.46176536967488 39.8823668773231, 116.46175202588621 39.88236104449228, 116.46173958534526 39.88235361398561, 116.4617281483699 39.88234468604425, 116.46172032355157 39.882335167094745, 116.46171520789648 39.88232485490297, 116.46171199889213 39.882312547535584, 116.46171109738715 39.88230174581816, 116.4617116008547 39.882288947680344, 116.46174998206556 39.88190966111716, 116.4617471815355 39.88184645722021, 116.46176354618541 39.88034677102023, 116.46176456160134 39.880261077770896, 116.4617655900305 39.88008298913442, 116.46176733525905 39.87980240739454, 116.46176959333108 39.87944283093173, 116.46177959657325 
39.87894648065426, 116.46177949803128 39.87893378104937, 116.46178120551035 39.87892118591896, 116.46178451831382 39.87890889475409, 116.46178953673271 39.87889720778936, 116.46179606007132 39.878886324517865, 116.46180408829954 39.878876444931095, 116.46181342072164 39.878867768521076, 116.46182345522543 39.878860993760185, 116.46183549634357 39.87885492394736, 116.46184984501258 39.878850059806254, 116.46186399274858 39.878847095077866, 116.46187904335504 39.87884523254301, 116.46189469584048 39.87884437145974, 116.46191105057042 39.878844312089704, 116.46192760590853 39.87884475320134, 116.46201018178614 39.87884875831302, 116.462743840237 39.878890590640715, 116.46284257228194 39.8788956386698, 116.46351604548725 39.87892053996286, 116.46390937837408 39.878936740725955, 116.4646214018312 39.87896176610001, 116.4648306166765 39.87896660588777, 116.46484707289936 39.878967348392095, 116.46486342870179 39.87896869061722, 116.46487978442568 39.87897063282158, 116.46489603971452 39.87897327474159, 116.46491219456789 39.87897661637693, 116.46492814867098 39.87898045747666, 116.46494400233804 39.878984998292154, 116.46495483879839 39.87898972611642, 116.46496828387578 39.87899676058822, 116.46498122712582 39.87900459372709, 116.4649936685218 39.879013425524285, 116.4650055077601 39.87902295573284, 116.46501353407548 39.87903147611479, 116.46502135934044 39.8790425958527, 116.46502818108436 39.87905431296311, 116.46503409966337 39.87906652770998, 116.46503901473537 39.87907923983339, 116.46504132125736 39.87908914533224, 116.46504292495206 39.87910204886403, 116.4650627441612 39.87945098340036, 116.46506193971408 39.87946308073183, 116.46505812512986 39.879473870322016, 116.4650520027882 39.87948355398197, 116.46504387380756 39.87949153252373, 116.46503413959198 39.879497606998, 116.46502320158717 39.87950127847148, 116.46500794919271 39.879502138905245, 116.46407515926553 39.8795298354729, 116.46406382028212 39.87953260632246, 116.46405338403805 39.87953777936895, 
116.46404435226835 39.87954515590424, 116.46403722673492 39.879554337230715, 116.46403230853068 39.879564824141355, 116.46402979839556 39.87957621716753, 116.46406789296823 39.8805424680451, 116.46407130299671 39.88055377623252, 116.46407702102198 39.88056409037602, 116.46408474606172 39.880573109720956, 116.4640942775318 39.88058013378876, 116.4641050133986 39.880585061043504, 116.4641165523233 39.88058769046983, 116.46412839264087 39.88058772079613, 116.46506077812298 39.880593023882106, 116.46507251837531 39.8805930543327, 116.46508395724881 39.88059558388832, 116.46509469338201 39.88060051151218, 116.46510412473641 39.88060763565004, 116.46511174964634 39.88061655501795, 116.46511736747908 39.88062686911304, 116.46512067725746 39.88063817717168, 116.46512845521227 39.88098808093298, 116.46513494971192 39.88118768846918, 116.46515633226267 39.88184081372272))"
11 | 国美第一城3号院,116.509456647947,39.9325306477595,Matched,国美第一城-3号院,116.509456647947,39.9325306477595,"POLYGON ((116.5077424392518 39.93076636588794, 116.50775689383855 39.93075920668728, 116.50927932266961 39.9307498472942, 116.5101497251202 39.930742652629995, 116.51071259025221 39.930731401110684, 116.51073266339807 39.930733856095934, 116.51074872116574 39.93074109983054, 116.51075594570086 39.93075471902196, 116.51073406861266 39.93216019340664, 116.51074037773856 39.93297457269506, 116.51074920815589 39.932988196295746, 116.51076285698898 39.93299623338006, 116.51100534747664 39.93297129951518, 116.51101979996737 39.932973739000495, 116.51103184297708 39.93298097165818, 116.51104308203874 39.93299540177459, 116.51104709401535 39.93301461186851, 116.51103018309851 39.93336574915381, 116.51100768577508 39.933473682475196, 116.51099644277697 39.93348725103337, 116.51098600361045 39.93349442209077, 116.51097235348637 39.9334959846074, 116.51041109953042 39.93348964384128, 116.5091866013738 39.93346626313141, 116.50898827057263 39.933462512988825, 116.50776856311305 39.93344311351056, 116.50775330791276 39.9334334712599, 116.50774528005032 39.93341984942646, 116.50774447897207 39.93340624782461, 116.50774315229216 39.93284367058925, 116.5077384225741 39.93077915404618, 116.5077424392518 39.93076636588794))"
12 |
--------------------------------------------------------------------------------
/data/POI_example2.csv:
--------------------------------------------------------------------------------
1 | name,lng,lat,prim_ind,sec_ind,status,uid_name,lng_wgs84,lat_wgs84,geometry
2 | 北京大学,116.316833,39.998877,教育培训,高等院校,Matched,北京大学,116.30420708125263,39.991595084258336,"POLYGON ((116.30534997611336 39.98519860640878, 116.30672207361795 39.98523645355179, 116.30756022737081 39.9852422019728, 116.30851171740028 39.98526453929614, 116.30869610857013 39.985257815745875, 116.30982558790869 39.985272618872315, 116.30987667153887 39.985288527425084, 116.3099117891656 39.985317263818914, 116.30992774795145 39.98535883376207, 116.30989169605905 39.986361257151245, 116.3102211576016 39.98636467002522, 116.31133005561142 39.9864386772356, 116.31132842887153 39.98666667063236, 116.31131764096321 39.98675828624736, 116.31131164290727 39.98684509341504, 116.31131360427607 39.987101879027655, 116.31129403175213 39.9872267090371, 116.31111044615548 39.98721984017583, 116.31111321052413 39.987441025789266, 116.31112796916021 39.987502196600886, 116.31200438637104 39.98754540957364, 116.31302846079345 39.987601137593074, 116.31338844098659 39.98762447495214, 116.31385455169286 39.98785080492698, 116.31501987823258 39.98800543084862, 116.3153893943768 39.98830872511517, 116.31561527778518 39.98831790038951, 116.31559365015958 39.98888191560562, 116.31551374615889 39.98951563686684, 116.3155568393092 39.98957795301521, 116.31552484222773 39.99009318934612, 116.31550565426788 39.99032681453155, 116.31544811088261 39.9908772968285, 116.31531314904815 39.99139352572732, 116.3151758365143 39.99158577400235, 116.31498962870401 39.99182871097796, 116.31473995553372 39.99214656218262, 116.31430175902473 39.99211457976594, 116.31362330299154 39.99212223621705, 116.31372004188422 39.99093571367458, 116.30987593426924 39.990904287744215, 116.31000389150947 39.98904174341449, 116.30959840316902 39.98901526556053, 116.3095823916263 39.98937687807957, 116.30945448491246 39.99085363865544, 116.30944169432216 39.99100105459228, 116.31224017896967 39.99110162122741, 116.31213935298514 39.993021713622866, 116.312892838725 39.993059135163804, 116.31288243967286 
39.99322994591876, 116.31285011223974 39.993236404869364, 116.3128049556148 39.9936828657698, 116.31171462915063 39.993670453615536, 116.31166189674396 39.9940641298547, 116.31162671463404 39.99453457029899, 116.31178874603017 39.99454867583719, 116.31211999266101 39.99457087274035, 116.31214632943727 39.99459762345926, 116.31215070835792 39.994682411252775, 116.31215669027334 39.99471679864167, 116.31222532087999 39.994821468386384, 116.31222210398333 39.99500546501226, 116.31379132518413 39.995087379709105, 116.31401640981984 39.99510456200633, 116.31391736633529 39.99562631901924, 116.3137608976442 39.99581779882689, 116.31361880839243 39.99591645638404, 116.31336098354551 39.99600812729086, 116.31314706801179 39.99602692004762, 116.31272722483477 39.996011691218285, 116.31192504894116 39.99598595581668, 116.31132161780597 39.995965450109644, 116.31104305022747 39.995943753734316, 116.31067986307475 39.99599700413797, 116.31061920120774 39.99598671352473, 116.31046195353825 39.99600299458923, 116.310386123114 39.996012729878295, 116.31015466161774 39.99587795057143, 116.3100804301396 39.995866083696555, 116.30950493279092 39.9957879118201, 116.30902281425568 39.99577596600376, 116.30885738139578 39.99736257677807, 116.30817830589208 39.99729477414821, 116.30746190738816 39.997224028879, 116.30680736521056 39.997151168268054, 116.30604405669278 39.99706848808184, 116.30591423306542 39.99793866613914, 116.305869487869 39.998286924763256, 116.30528677147235 39.998225524642685, 116.30430972215873 39.99802639436295, 116.30367111855595 39.99789307805479, 116.30321531022851 39.997798647825434, 116.30298860781797 39.997703032030444, 116.30263577980635 39.99755642838048, 116.30243542201427 39.997431968298905, 116.30226460046961 39.99730825852463, 116.30213848330666 39.997185274103565, 116.30191578408612 39.99694965509629, 116.30175454655344 39.99677393071381, 116.30156616624588 39.996593450996166, 116.30147117687808 39.99650801201974, 116.30134545342955 
39.99641062416798, 116.30112153862379 39.99629019885533, 116.30084372588178 39.996249057505814, 116.30001346771681 39.99614482126348, 116.29949933864197 39.99607366163182, 116.29915285652672 39.996026226070065, 116.29841198106945 39.99592502800284, 116.29821238640905 39.99592214951095, 116.29757208035542 39.99590557807057, 116.29673375832861 39.995874917884045, 116.29585549685902 39.99582351377348, 116.29486063854222 39.99576428450866, 116.29476322838067 39.995746838242106, 116.29462589967008 39.9956862564341, 116.29457001351166 39.9956303465875, 116.29452530672476 39.99556641954866, 116.29450779336246 39.99510726772342, 116.29449027972143 39.99465051557199, 116.29447515226171 39.99427375594424, 116.29448635816041 39.99403694881452, 116.29448878052995 39.993803355241326, 116.29448800533328 39.99360176523085, 116.29420454374205 39.99363260722214, 116.29420133250602 39.99378300570833, 116.2938188545689 39.993828400532855, 116.29377652978624 39.99387166459767, 116.2936301838136 39.99407028382437, 116.29348423349343 39.994294701057235, 116.29344330865493 39.99431176392898, 116.29338940955951 39.994316847465925, 116.292758396664 39.99426442793603, 116.2925364108366 39.99424877168018, 116.29207127211362 39.99422668969204, 116.29164365586283 39.99422054702558, 116.29151009942774 39.994218151925935, 116.29150091681146 39.99421216625518, 116.2914943334328 39.994171178098355, 116.29147578731202 39.99399341410444, 116.29145782494689 39.99394944349652, 116.29137398608509 39.993870975227125, 116.29131809784994 39.99377906466742, 116.29129115816653 39.99367751018281, 116.29122848285434 39.993578410228906, 116.29122848511525 39.9935584110671, 116.2912474547549 39.99352338353158, 116.29126442618855 39.993504358374, 116.29127940326327 39.99346733702126, 116.29127800948012 39.993434740523085, 116.29125964707184 39.993396370221156, 116.29117940350115 39.993296697089605, 116.29112231605109 39.99321338781114, 116.29082844494626 39.99321463624892, 116.2902598616713 39.99319950177551, 
116.28965692354461 39.99321721404263, 116.28966651372679 39.993154802123264, 116.28980870190033 39.992817001052416, 116.28991254218938 39.99261125234988, 116.29010583168609 39.99233737073295, 116.29035278948865 39.99237479454232, 116.29064606817222 39.99236054924788, 116.29106391801913 39.99235991228846, 116.29123261797609 39.99231965619753, 116.29177166571584 39.9920772404232, 116.29231589520488 39.991826413769786, 116.29275789956627 39.991625339873075, 116.29303819222861 39.99149371158413, 116.29332087334636 39.99140167710993, 116.29351371347826 39.9913773786441, 116.2939508883093 39.99136509874496, 116.29433256302771 39.99135370346371, 116.29450223862516 39.99135303813071, 116.29487572033915 39.9913608525852, 116.29535479583826 39.991344100328575, 116.29553744154008 39.99133781285667, 116.29578595726176 39.99133442089595, 116.29598217410194 39.99132751110849, 116.29618417747433 39.99132879138084, 116.2962799867238 39.99134823878633, 116.29650034987688 39.991368288448136, 116.29677340771312 39.99138385395211, 116.29679497156775 39.99132782200967, 116.29683717706166 39.99057298628475, 116.29687516018352 39.99007894610586, 116.29697001823006 39.989687611002616, 116.29709697216826 39.98963381096155, 116.2972335231849 39.98944000106749, 116.29755767221913 39.98946668234411, 116.29795926315748 39.98948763846673, 116.29816684247072 39.98950490463943, 116.29833530029882 39.98951823337567, 116.29888378354615 39.98954054864504, 116.2989590919587 39.98902664787507, 116.29900003970167 39.98876759213316, 116.29904999405186 39.988301929956194, 116.2990915332605 39.988103470646564, 116.29921296042599 39.987482498535314, 116.29929444334473 39.98707438241201, 116.29931362628284 39.98689195834649, 116.29937435595461 39.98644867685122, 116.29942834507253 39.98645098929285, 116.30017590234317 39.98648257317188, 116.30032439605291 39.98647433130992, 116.30034675210543 39.98645509555776, 116.30033718097536 39.9863815139791, 116.3003452076286 39.98602951420163, 116.30035958375731 
39.98598149254706, 116.30036918061566 39.98584548199117, 116.3003691865126 39.985797483779116, 116.30039322270422 39.98509987041755, 116.30475516784682 39.98518221717691, 116.30534997611336 39.98519860640878))"
3 | 西湖,120.12792,30.228932,旅游景点,风景区,Matched,杭州西湖风景名胜区-九溪十八涧,120.116752075048,30.224861696630555,"POLYGON ((120.09134549079016 30.219900926087597, 120.09148648684243 30.21887699248457, 120.09175557507332 30.21792985102652, 120.0922168926102 30.21616356390148, 120.09298560856314 30.215139642474035, 120.0941130252463 30.21462773331543, 120.09512520957523 30.214013462377824, 120.09591959983159 30.213744775223866, 120.09746999019742 30.213706627180294, 120.09823884996636 30.21342519777375, 120.09913592196078 30.212785440001856, 120.10064832555862 30.21067396682879, 120.10139181277168 30.20904868416758, 120.10226345680995 30.20782023844583, 120.10313510870098 30.206898999085702, 120.10446825509234 30.20590117441573, 120.10600667337427 30.20410994174594, 120.10708364063605 30.202785726866068, 120.10763494759183 30.20229323120338, 120.10782087537685 30.201941345545958, 120.10783372595297 30.201640567531438, 120.10752611832847 30.200891646023027, 120.10745567377428 30.200238842249018, 120.10751982704525 30.19970770103324, 120.10791737184644 30.198504765650057, 120.10802641625199 30.197781657600903, 120.10888570617689 30.194620672080752, 120.1090011765185 30.193769575081213, 120.10920065920551 30.19213675995978, 120.10920687201937 30.192109164710445, 120.10921729066656 30.19209277139551, 120.10950759808696 30.19178355278857, 120.10952062060369 30.19177316084281, 120.10953584633617 30.191767369993308, 120.10837963514237 30.19101094722905, 120.10859882055722 30.19065988258105, 120.10863928809675 30.19063590606072, 120.10872584143574 30.19049276025456, 120.10898790450325 30.190092924488244, 120.10940142346465 30.189651580557342, 120.10941585082541 30.189613190541728, 120.11158829767227 30.190027317400457, 120.11171014467637 30.18965941345251, 120.11196423972687 30.18912680521959, 120.11145696860127 30.188954477687805, 120.11162369082591 30.188608201529384, 120.11178881225722 30.188251525708875, 120.11221995726589 30.188407008347927, 120.11282906917712 30.18812424004007, 
120.11286192897259 30.18811586340273, 120.11290560695709 30.188121893772653, 120.11292844695878 30.188133909297342, 120.11350624854113 30.188630299339856, 120.11369417899927 30.18877282939043, 120.11372984215046 30.188796854336026, 120.11404600442216 30.189008277416328, 120.114202283934 30.1891119889583, 120.11493520865301 30.189591323399938, 120.11496646566833 30.18961134661685, 120.1161695147331 30.19011067868946, 120.11647329191484 30.19023412222331, 120.11649613269483 30.190273339225637, 120.11651255924406 30.19033175000794, 120.11650734629671 30.19036214422771, 120.11649331531656 30.190402130688877, 120.115986671392 30.19108968324651, 120.11560188707764 30.191588550947223, 120.11546078738886 30.1919148235465, 120.11536775701155 30.192481122891927, 120.11535490350009 30.192788297828287, 120.11561130552025 30.193697255112486, 120.11566256739268 30.194074876940157, 120.11557916070453 30.194567586145133, 120.11521998284776 30.195546455223024, 120.11451454539304 30.19667865880174, 120.1143093172426 30.197145681629813, 120.11418104766881 30.19746557020741, 120.1141161176302 30.19756471724302, 120.11402634549073 30.197627048096674, 120.11391173042173 30.197663762430736, 120.11336672867057 30.197701768402226, 120.11285378171898 30.197893399142387, 120.11243059656034 30.198174695207566, 120.11085329478179 30.199645595799836, 120.11048140640378 30.200189338995525, 120.11010951108688 30.200861080248718, 120.10987865480135 30.201552107749183, 120.10995544315259 30.20293448063654, 120.10976291467155 30.204969458013725, 120.10977332562146 30.20503186067383, 120.1097821322705 30.205110261519703, 120.1097636956183 30.205168647554956, 120.10973965022694 30.205221430639504, 120.10967954406752 30.205282992302294, 120.10912820948681 30.2055386632153, 120.10882686909423 30.20602486849732, 120.10874348628181 30.20649199665749, 120.1089292616969 30.207720830084607, 120.10931376660368 30.208924176698186, 120.11023680470478 30.20987185882523, 120.11036497873769 30.210281511433745, 
120.11068548566853 30.210640084685352, 120.11205740998636 30.21117212489196, 120.11234589888353 30.211383503668042, 120.11278181917008 30.21194696431271, 120.11294206039744 30.2124014465836, 120.11309583146533 30.21343188954339, 120.11352532614723 30.214264139438388, 120.1137849694664 30.214577905325438, 120.11437794694446 30.21573666505132, 120.11457027192152 30.216066387205203, 120.11541655991084 30.217132564630614, 120.11684010811524 30.21718490531497, 120.11853316393787 30.21644400983357, 120.11921296247566 30.216342231734924, 120.12221439221524 30.217356298049744, 120.1250429022617 30.2188376891387, 120.12653733382098 30.22067613354973, 120.12672973805856 30.221085941373648, 120.12676821063788 30.22125237803278, 120.12671687388973 30.221431505001902, 120.12635755977901 30.222275821153836, 120.1258571480527 30.22290238843352, 120.12385574070589 30.22382165014701, 120.12180315773266 30.224753813073892, 120.11995589812713 30.226236695755592, 120.11777519106518 30.228820138842625, 120.11613983464282 30.2302650652185, 120.11539273625202 30.230946021761337, 120.11531258050812 30.230981156522574, 120.11520357308912 30.230987471071387, 120.11397252033755 30.230480976635352, 120.11355578408552 30.230186295056633, 120.1130877718944 30.229795590581386, 120.11261337443436 30.22923210194791, 120.11240182495663 30.228950378015572, 120.11220949539435 30.228835056686123, 120.11204280207929 30.22882854590687, 120.11103619819694 30.229403852800576, 120.110707628715 30.22949324099766, 120.11068198521822 30.229492425186926, 120.11065313760864 30.22947800841078, 120.11027652943679 30.22918579961674, 120.11010024675832 30.229051303397547, 120.10992718799362 30.22873762323431, 120.10953938757605 30.228302227805084, 120.10727354787255 30.226496333395204, 120.10537648839721 30.22462677322718, 120.10409276321035 30.22360470685451, 120.10406993277141 30.22359149867766, 120.10404429773376 30.223585888851208, 120.10165954697356 30.223409441346714, 120.10073679753413 30.2234091571228, 
120.09949770508996 30.223586805377753, 120.09948088548443 30.223588001033004, 120.09946406630552 30.223584797019253, 120.09867218908593 30.22333202596851, 120.09865877436135 30.223329223138645, 120.09864415795047 30.223330219765188, 120.09490849519682 30.223916765972515, 120.09489288066835 30.223918164312533, 120.09487826725446 30.223917562909357, 120.09353469097394 30.223754274044975, 120.09352328154736 30.223751073700086, 120.09351207267203 30.2237436736831, 120.09222391481981 30.22267511752697, 120.09221630970762 30.222665518202486, 120.0922099057861 30.22265311908824, 120.0912942018774 30.220476887540077, 120.09134549079016 30.219900926087597))"
4 | 上海市同济医院,121.437529,31.272966,医疗,综合医院,Matched,上海市同济医院,121.42648733832867,31.2684804357839,"POLYGON ((121.42568275319336 31.268540728226217, 121.42568256421906 31.268422177794907, 121.42581514282269 31.2676107773113, 121.42655346934175 31.26774071619794, 121.42821621014876 31.268033374475582, 121.4283649518671 31.268075047814882, 121.4282705642695 31.268407386137785, 121.42822297725841 31.26849688733764, 121.42748788425916 31.268447476088763, 121.42742855846194 31.26876566757506, 121.42738748824786 31.26881009743633, 121.42719420858216 31.26954424723371, 121.42699589645306 31.269510915281725, 121.42695721583851 31.269711982926395, 121.4267182429243 31.269671705630703, 121.4265541829489 31.269703091604793, 121.42595463760966 31.2698927860293, 121.42590618231526 31.2696977097227, 121.42568322919193 31.268799420166246, 121.42568275319336 31.268540728226217))"
5 | 广州天河体育中心,113.331575,23.143232,运动健身,体育场馆,Matched,广州市天河体育中心,113.31961950247712,23.14006529498788,"POLYGON ((113.3164345722021 23.137832229647067, 113.31639831509409 23.13665945959413, 113.31641738201158 23.1365533659084, 113.31645989598046 23.136527996565334, 113.31667176012562 23.13652310270523, 113.31810201653435 23.13646154301406, 113.3195535252966 23.13641181857542, 113.32016107355385 23.136380276924076, 113.32231783027136 23.136299688643405, 113.32271022322442 23.136274645525592, 113.32278207301545 23.136316091159443, 113.32278844404433 23.136611162328713, 113.32278362193803 23.13720613675966, 113.322780413868 23.13747803572334, 113.32277015012134 23.139041540432974, 113.32280419932597 23.14051359815851, 113.3228073798955 23.14074710073346, 113.322810524128 23.141636347409495, 113.32279451436088 23.142400880187967, 113.32277149012064 23.1436708306453, 113.32267308866803 23.143736394291373, 113.32255672604879 23.14376860532899, 113.31914045787575 23.143735137653255, 113.31826795608266 23.14368600588769, 113.31765092746532 23.14364568336417, 113.31658372352763 23.14359280421199, 113.31651616571017 23.143535552583025, 113.31645300293434 23.143411018730948, 113.31646253222439 23.142601514521168, 113.31648130109349 23.140804678197796, 113.31643283252944 23.140268278004072, 113.3164345722021 23.137832229647067))"
6 | 武汉天河国际机场,114.217379,30.776258,交通设施,飞机场,Matched,武汉天河国际机场,114.20538229712714,30.772593566122364,"POLYGON ((114.19153304013852 30.76382951177008, 114.19152632209814 30.763837196666337, 114.19152150863701 30.763846184063432, 114.19151869991315 30.76385607431145, 114.1915181975957 30.763866268234068, 114.1915198007922 30.763876365596293, 114.19152360961598 30.763885867000333, 114.19152922337908 30.76389437185653, 114.19612454004937 30.768313374391802, 114.196130353589 30.768321979241367, 114.19613416255292 30.768331680401026, 114.19613596654469 30.768341877979555, 114.19613546416312 30.768352271854237, 114.19613295621551 30.768362262687955, 114.1961283439785 30.768371650447854, 114.1961218268886 30.768379735835435, 114.19559947839582 30.768804977960755, 114.19559296087567 30.76881316324856, 114.19558824852739 30.76882245088837, 114.19558554110286 30.768832641283147, 114.1955849385244 30.768843034923126, 114.19558654105296 30.76885343215821, 114.19559024969561 30.76886323310479, 114.19559596288157 30.768872037750405, 114.19680847117846 30.77009514915348, 114.19681408480915 30.7701038536361, 114.1968175925136 30.7701136541501, 114.19681909509792 30.770123851357557, 114.19681849291598 30.770134245043636, 114.19681568422479 30.770144235419206, 114.19681087226297 30.77015342303392, 114.19680435524806 30.77016140849543, 114.196061951591 30.770738424563607, 114.19605252716676 30.770746505332365, 114.19604139952344 30.770752084541495, 114.19602926845718 30.77075476370157, 114.1960168373802 30.77075414412747, 114.19600490790693 30.77075052682226, 114.1959942824064 30.77074401310995, 114.19598566125843 30.770735003901027, 114.19513019677004 30.769842886111185, 114.19512147582616 30.769835975583447, 114.19511145106954 30.76983086195057, 114.19510062384302 30.76982794599651, 114.19508949610824 30.769827228361443, 114.19507836813723 30.769828709601356, 114.19506784054323 30.7698324906426, 114.19505821602527 30.769838172189516, 114.19469436800144 30.770114240638865, 
114.19468474373237 30.77011972214593, 114.19467431673989 30.770123203473425, 114.19466348926619 30.77012438532634, 114.19465246132735 30.770123467927277, 114.19464193451827 30.77012015255078, 114.19463231067613 30.77011483964046, 114.19462399058891 30.77010772982197, 114.19450089207785 30.769964004173648, 114.19449247084437 30.769956894150013, 114.19448274671697 30.769951481210068, 114.19447222010506 30.769948165832055, 114.19446119223952 30.769947048441075, 114.19445026460349 30.76994823011718, 114.19443973737818 30.769951611245183, 114.19443011241961 30.769957092758855, 114.19410616450314 30.770196142461113, 114.19409844352076 30.770202826410905, 114.19409272877358 30.77021131277376, 114.19408951902369 30.770221002519197, 114.19408901724154 30.770231196433198, 114.19409112163515 30.7702411947289, 114.19409583253227 30.77025019772806, 114.1941027498543 30.770257705123818, 114.19427617753807 30.770401313265054, 114.19428309495399 30.770408720706488, 114.19428780606677 30.77041782367893, 114.19428991056397 30.77042782191935, 114.19428940793468 30.77043801589062, 114.19428619909358 30.770447705619578, 114.19428058378045 30.770456192144916, 114.19427296407403 30.77046297624785, 114.19389326841684 30.770734417553502, 114.1938854483171 30.7707412012775, 114.19387973210502 30.770749787540844, 114.19387652356724 30.770759577183338, 114.19387582037673 30.77076987077471, 114.1938778246582 30.770780068773448, 114.19388243536808 30.770789271524002, 114.1938892525024 30.77079707863457, 114.19470315237598 30.771557310353728, 114.1947090665538 30.771566015408688, 114.19471297561535 30.771575816677785, 114.19471487917754 30.771586214465398, 114.19471457698519 30.771596708529167, 114.19471217010229 30.771606999288135, 114.19470765807155 30.771616487071785, 114.19470134100294 30.771624972488922, 114.19383817885418 30.772285226764502, 114.19383035736907 30.77229211044534, 114.19382464263231 30.772300896558253, 114.19382123288658 30.77231068592045, 114.19382053031671 
30.772321179352097, 114.19382243375972 30.77233137725361, 114.19382694471192 30.772340879689185, 114.19383356065214 30.772348886342456, 114.1968649157172 30.77532134715694, 114.19687413833104 30.775328358220918, 114.19688446272941 30.77533327213483, 114.19689559057753 30.775336288358652, 114.19690711900142 30.77533700631515, 114.19691854695596 30.77533552536098, 114.19692937429866 30.775331944459975, 114.1969394002993 30.775326363235628, 114.19727735521363 30.775115409480904, 114.19728788196745 30.775110128915315, 114.19729921026051 30.77510704847309, 114.19731093918904 30.77510596767523, 114.19732266736897 30.775107185683783, 114.19733399570386 30.77511050192795, 114.1973444202483 30.77511581575095, 114.19735384321962 30.775123026973713, 114.20462091586289 30.78239521031007, 114.20462662817529 30.78240401397477, 114.20463023525963 30.78241381411153, 114.20463183798064 30.782424111091075, 114.2046313355461 30.782434504987123, 114.20462862897281 30.782444595921003, 114.20462391692483 30.78245398430979, 114.2046175020146 30.782462170847474, 114.20387647387605 30.7830313059465, 114.20386985797656 30.783039492133163, 114.20386514572859 30.78304878044974, 114.20386243918156 30.783058971259617, 114.20386183630328 30.783069365011116, 114.20386333871828 30.78307976185564, 114.20386704629567 30.78308956215901, 114.20387265838914 30.78309846573783, 114.20444710609904 30.78367559885604, 114.20445532411831 30.783685005770238, 114.2044656459915 30.783691917095346, 114.20447737228396 30.78369623175253, 114.20448980047841 30.783697448995614, 114.20450222870465 30.783695767711695, 114.20451375478382 30.783690986799716, 114.204523778452 30.783683605028937, 114.20529054876117 30.783112195100752, 114.20530037186373 30.783106411992676, 114.20531109606588 30.783102629172387, 114.20532242119731 30.783101046054156, 114.20533384696866 30.783101661880405, 114.20534487128218 30.783104476000446, 114.20535519369226 30.783109388137913, 114.2053644132415 30.78311619762765, 114.21070405617374 
30.78830302643026, 114.21380516209437 30.791365504957596, 114.2138118747747 30.79137350830391, 114.21381628276856 30.791383008234078, 114.21381818484147 30.791393305032766, 114.21381748288469 30.791403798653384, 114.21381417534445 30.79141368978034, 114.21380846338558 30.791422478650833, 114.21380074785685 30.791429566200996, 114.21364593503978 30.791549827795198, 114.21363811879826 30.791556715267163, 114.21363240755893 30.791565504113997, 114.21362909996664 30.7915754951593, 114.21362839695777 30.79158588887772, 114.21363029993746 30.791596185640483, 114.21363480833881 30.791605685750426, 114.21364152024199 30.791613689165718, 114.21943485807611 30.79730405313758, 114.21944297149157 30.797313055559442, 114.21945318815666 30.797319561231063, 114.21946480807162 30.797323269620843, 114.21947692897697 30.79732388009002, 114.21948885010671 30.79732129208098, 114.21949966965217 30.797315904607384, 114.21950878631046 30.797307817063416, 114.2202618412363 30.796716492228395, 114.22027075666574 30.79670840426494, 114.22028157603349 30.796703016401004, 114.22029339671725 30.796700627797385, 114.22030551729769 30.796701337793007, 114.2203169365677 30.796705145517045, 114.22032705307477 30.796711750668383, 114.22033506578772 30.79672075269697, 114.22122567995349 30.79761681584008, 114.2212336925779 30.7976259174727, 114.22124380853903 30.79763252217101, 114.22125532812156 30.79763632952726, 114.22126734762668 30.79763703896471, 114.22127926778492 30.797634749822137, 114.22129008665144 30.797629361590477, 114.22129920256769 30.79762147320832, 114.22167345262609 30.797331430258826, 114.22168276917373 30.797325440854905, 114.22169298584778 30.797321451140046, 114.22170390421483 30.797319660789764, 114.22171492229701 30.797320069304817, 114.22172564034567 30.797322776404346, 114.22173565572126 30.797327581741403, 114.22174436994445 30.79733428504291, 114.22180366188056 30.79739499944889, 114.22181277605732 30.79740220276842, 114.22182309288868 30.797407408090777, 
114.22183421076227 30.797410615146376, 114.22184572974999 30.797411623711334, 114.22185724830821 30.797410533416787, 114.22186836748075 30.797407243927974, 114.22187858465445 30.79740185487724, 114.22225192125308 30.797151080739866, 114.22225933419195 30.797143590517337, 114.22226474370656 30.797134499555238, 114.22226774952063 30.797124407336355, 114.22226825178814 30.797113813379305, 114.22226604887902 30.79710351723004, 114.22226154260522 30.797093918938124, 114.22225473256275 30.7970858180084, 114.22216519499041 30.79699819587806, 114.22215858509607 30.796990095088578, 114.2221541780571 30.796980696742366, 114.22215227650565 30.796970500739004, 114.22215297828664 30.796960106800483, 114.22215638491443 30.796950214784594, 114.22216209525544 30.79694152387385, 114.22216980827866 30.79693463356808, 114.22253773558808 30.796654666336213, 114.22254715124154 30.796648876650924, 114.2225574678507 30.79664518645243, 114.22256838590326 30.796643695506067, 114.22257940376929 30.796644503444327, 114.22259002059647 30.796647509869167, 114.22259973553713 30.796652614444763, 114.22260824889884 30.796659617189306, 114.22279252668795 30.79686384603114, 114.2228023414984 30.796872148886138, 114.22281305778472 30.796879253142606, 114.22282467618741 30.796884758803778, 114.22283699530354 30.796888366103, 114.22284971564062 30.796890174588242, 114.22286253655204 30.796889884338373, 114.22287525702541 30.796887794954962, 114.22340731869458 30.79676585062457, 114.22342094130917 30.796762262354665, 114.22343426288872 30.796757774379316, 114.22344738376486 30.796752686560446, 114.22346020451248 30.79674669901186, 114.22347262512731 30.79674001155523, 114.22348464477484 30.796732724112648, 114.22349626423149 30.796724636814545, 114.22578765699008 30.795032979478364, 114.22579717132871 30.79502528952747, 114.22580828866576 30.795020199136896, 114.22582040702073 30.795018107800228, 114.22583262495769 30.795019214752752, 114.2258442418188 30.795023319724923, 114.2258543567367 
30.79503012231719, 114.2258624673418 30.795039322393446, 114.22690829499744 30.796131954932033, 114.22691640550484 30.796140954716922, 114.22692651985604 30.79614765684237, 114.22693793559326 30.79615156124621, 114.22695005314547 30.796152367741534, 114.22696197037985 30.796150175735505, 114.22697298699634 30.79614498478635, 114.2269822010801 30.796137194251322, 114.22945553856027 30.794188125123313, 114.2369032740998 30.78835847401276, 114.2378434712439 30.78762942730617, 114.23920254293043 30.786531998133587, 114.23921395436345 30.7865246033514, 114.23922616540035 30.78651860786421, 114.23923897756069 30.786514011745485, 114.23925218985232 30.786511014780324, 114.23926570249643 30.786509516981315, 114.2392793145056 30.786509718274164, 114.23929282633148 30.786511418809784, 114.23943615407973 30.786524526178564, 114.23944976595484 30.78652632657038, 114.23946317779273 30.786529426227222, 114.23947628901927 30.7865338250612, 114.23948879970622 30.786539523212515, 114.23950080935738 30.786546320698193, 114.23951201903608 30.78655431743926, 114.23952242767187 30.78656331359868, 114.24147169773721 30.788501282006628, 114.24148000373229 30.788510277354554, 114.24149031169789 30.788516874121317, 114.24150202076795 30.788520572383952, 114.2415142297756 30.788521272362786, 114.24152624030722 30.788518774131024, 114.24153734909518 30.78851347734038, 114.24154665786835 30.788505481938902, 114.24248174487388 30.787791784255177, 114.24248825001166 30.787783888528452, 114.24249295423492 30.787774793478604, 114.24249555724339 30.78776479898408, 114.2424959582722 30.787754604508475, 114.24249415848429 30.787744510035434, 114.24249035660597 30.78773501524163, 114.24248445258277 30.78772661982591, 114.24054747854099 30.785843685423433, 114.24053907286901 30.785834489892267, 114.24053166732553 30.785824394916407, 114.24052566329259 30.785813500483382, 114.2405209603221 30.785801906498985, 114.24051775879482 30.78578981278575, 114.2405160579989 30.78577741944736, 114.2405158590157 
30.785764926215737, 114.24052337689506 30.785633997753706, 114.24052517983178 30.785621504628065, 114.24052838349633 30.785609311459304, 114.24053308861237 30.785597618088424, 114.24053919481086 30.785586624482434, 114.24054660163046 30.78557643043444, 114.24055521040495 30.7855671360598, 114.24056491914975 30.785559140978485, 114.24191710413785 30.784513963900633, 114.24192350931959 30.78450586846274, 114.24192821376556 30.784496573537368, 114.24193081681771 30.7844865790685, 114.24193131785886 30.784476184719175, 114.24192961783346 30.78446599018746, 114.24192591623087 30.784456295426583, 114.24192021180194 30.784447699964677, 114.23554345022329 30.778176649299194, 114.23553293949371 30.77816775149326, 114.23552162858579 30.77815985293892, 114.23550961576032 30.778153253552478, 114.23549700260277 30.77814795323826, 114.23548388971037 30.77814385225353, 114.23547037525627 30.778141150319016, 114.23545676042339 30.778139847698952, 114.23423822288359 30.778070256599428, 114.2342246083617 30.778068853377437, 114.23421119317281 30.778066150928787, 114.2341981786047 30.77806194933116, 114.23418556493418 30.778056548506203, 114.2341736515793 30.778049848544264, 114.23416243953783 30.778041949452863, 114.2341521285782 30.778032951145764, 114.23280217239902 30.776661243126, 114.22854743785203 30.77259113047986, 114.22362536016317 30.76777773754257, 114.21508521641442 30.759460300789588, 114.21507630096193 30.759450096274705, 114.21506868747466 30.75943889371957, 114.2150624763014 30.759426793198436, 114.21505776878061 30.759414194490375, 114.21505546445756 30.759405396511166, 114.21505326186715 30.75939210094586, 114.2150527618439 30.75937860733206, 114.21511216003923 30.758288830521188, 114.2151114601157 30.758275236706126, 114.21510935677908 30.75826184132116, 114.21510565123322 30.75824874395879, 114.2151005423381 30.758236144908516, 114.21509403103879 30.758224243848215, 114.21508611735229 30.75821314097972, 114.2150770010394 30.758203036196797, 114.20863126583872 
30.751740376867957, 114.20862224730598 30.751733568563512, 114.20861202652446 30.751728557648082, 114.20860100322876 30.751725644661747, 114.20858967938338 30.75172493013048, 114.20857845565493 30.7517265145091, 114.20856773274733 30.751730198462422, 114.20855801173953 30.751735982644128, 114.20715884786364 30.75279326348665, 114.20714722277894 30.75280094365986, 114.20713489491591 30.752807323601157, 114.20712206688336 30.75281260339687, 114.20710863828367 30.75281658307566, 114.20709500893554 30.752819263152166, 114.20708107843025 30.752820643409503, 114.20706714799597 30.752820624468093, 114.20701804323272 30.752818558520225, 114.20700431362891 30.752817340400487, 114.20699068449946 30.75281482309753, 114.20697745616565 30.75281100696508, 114.2069647290438 30.752805792262805, 114.20695250312421 30.752799378821734, 114.20694097922647 30.752791766995042, 114.2069302568514 30.752783056811506, 114.20437454641102 30.75037901164495, 114.20436612818312 30.75037010411958, 114.20435580514219 30.750363492757135, 114.20434407925157 30.750359877737544, 114.20433175202473 30.750359460203345, 114.20431982497223 30.750362141712763, 114.2043090001357 30.750367923123566, 114.20429997965526 30.750376305832784, 114.2033763304223 30.75114247103608, 114.20336981525453 30.75115065726859, 114.20336520341849 30.751160045706246, 114.20336269674253 30.751170136837096, 114.20336229528854 30.751180530921467, 114.20336409865043 30.75119072826061, 114.20336810688764 30.751200329112493, 114.2033740196947 30.751208933308614, 114.2072266173355 30.75488108089859, 114.20723613656934 30.75489088886058, 114.207244554092 30.754901694710995, 114.20725166817887 30.754913298421958, 114.20725747973786 30.75492569989833, 114.20726178794314 30.754938599141838, 114.20726399192868 30.754947497567162, 114.20726549398165 30.75495649494156, 114.20726629500172 30.754965491449138, 114.20729832308152 30.755445987813314, 114.20729942466224 30.755455184535236, 114.2073011277653 30.7554641822135, 114.2073034314937 
30.755473080819225, 114.20730633764809 30.755481780289706, 114.20731174795831 30.7554944810424, 114.20731826088607 30.755506583658352, 114.20732607632019 30.755517888449976, 114.20733489485596 30.755528494996792, 114.20797319832654 30.756235591273626, 114.20797830904934 30.756244693437072, 114.20798151498256 30.756254792493937, 114.20798261614526 30.75626518866484, 114.20798161340595 30.75627558198783, 114.20797860622403 30.75628567276027, 114.20797369493712 30.756294961431436, 114.20796708029518 30.75630304840335, 114.20553214242508 30.758158050598745, 114.2055227212292 30.758165933233087, 114.20551159599506 30.758171214868504, 114.20549946955079 30.758173496593482, 114.20548714159061 30.7581726797114, 114.2054754163789 30.758168865131246, 114.20546509456422 30.758162153976016, 114.20545677695438 30.758153046919222, 114.20257753852276 30.755302349645042, 114.20256921990725 30.755293342012234, 114.20255889658138 30.75528663016723, 114.20254727064639 30.755282814868472, 114.20253494195104 30.75528219697141, 114.20252301425008 30.75528467797589, 114.2025119887545 30.755290158825108, 114.20250286697528 30.75529834107801, 114.19688975881421 30.7597681999642, 114.19153304013852 30.76382951177008))"
7 | 盐田港,114.271928,22.57838,交通设施,港口,Matched,盐田港,114.26061202453415,22.575154713013056,"POLYGON ((114.26213618835881 22.579701773314717, 114.26555349126096 22.57617994632241, 114.25925684876195 22.570539329231334, 114.25576186582524 22.573983699997346, 114.26054916706937 22.578256385465092, 114.26213618835881 22.579701773314717))"
8 | 鹅岭公园,106.543065,29.555566,旅游景点,公园,Matched,鹅岭公园,106.53282784971917,29.55227294189109,"POLYGON ((106.5319943007464 29.551425567457347, 106.5319939036206 29.55135439268277, 106.53199671064631 29.551345603105776, 106.53200432877777 29.551339224946922, 106.5320183617153 29.551336062008588, 106.53208331324981 29.55133702771121, 106.53209373767163 29.55133505507832, 106.5321009548877 29.55132787616679, 106.53210496477197 29.5513178900952, 106.53215188923504 29.55103491439618, 106.53216191316614 29.55102414396874, 106.53217674791824 29.551020583155385, 106.53233551643389 29.551038581386496, 106.5328667437976 29.55109670878301, 106.53288318157885 29.55109954932752, 106.53289480806386 29.551105976368955, 106.53290493067739 29.55111999680094, 106.5329079370594 29.55113050052857, 106.53290853787188 29.55114139802731, 106.53288437159395 29.55134146311443, 106.53288527314292 29.551351561674117, 106.53288858027328 29.551361066538004, 106.53289409254795 29.551369477383805, 106.53290160952588 29.551376393850582, 106.53291042961872 29.551381214383035, 106.53292355975249 29.551384046546726, 106.53329010158613 29.551409661836853, 106.5337246916371 29.551487923686214, 106.53373792140897 29.55149555398099, 106.53374353368496 29.55150556433776, 106.53374513667376 29.551518363638117, 106.5337163643399 29.551647644085037, 106.53371516098655 29.551659336773803, 106.53371596220067 29.551671134441136, 106.5337188682355 29.551682537521028, 106.53372377887362 29.551693345836338, 106.53373049368237 29.551703059068366, 106.53373881221442 29.55171147678797, 106.53374833356382 29.551718398065, 106.53400851708794 29.55188408695251, 106.53416967966669 29.55194406626355, 106.53417729651481 29.551952082267295, 106.5341805030779 29.55196568525036, 106.53415223036188 29.55213255371012, 106.53415182885227 29.552144148460563, 106.53415353211786 29.552155648482128, 106.5341574404089 29.55216665417139, 106.53416325306199 29.552176764926557, 106.53417097009874 29.5521855808924, 106.53418009039511 
29.55219280093099, 106.5341904135085 29.552198224617317, 106.53448196579667 29.552333498654967, 106.53450241157341 29.552340346781353, 106.53452887100039 29.55234521050704, 106.53458620029919 29.552343752888245, 106.53460103396218 29.552338991315544, 106.5346138635157 29.552326227710157, 106.53466588686466 29.552205000680928, 106.53467180064351 29.55219581865414, 106.53467951840295 29.55218814053089, 106.53468863922478 29.552182265210977, 106.53469886241491 29.55217849183992, 106.53470968683534 29.552177019106836, 106.5347205111388 29.55217774556322, 106.5347309344139 29.552180870146774, 106.53512009926509 29.55232937367222, 106.53513062251591 29.552334697552656, 106.53514014343106 29.552341818312026, 106.53514816090862 29.55235043483158, 106.53515447450543 29.552360446658554, 106.5351589840243 29.552371353730784, 106.53516148903468 29.55238285566795, 106.53516178910398 29.552394652087543, 106.53513511582942 29.5526624887168, 106.53513351158979 29.552675280106463, 106.53513110556875 29.552687869602227, 106.53512799798366 29.552700357413737, 106.53512428906424 29.55271264382245, 106.53511987859201 29.55272462861937, 106.5351147665596 29.55273641176722, 106.53510905319239 29.552747993510973, 106.53500851863684 29.552934978259092, 106.53500160256304 29.552946357102062, 106.53499378449294 29.55295723390978, 106.53498526487954 29.5529675092083, 106.53497594350343 29.5529770827887, 106.53496592058791 29.552985954896442, 106.5349552963617 29.552994025813273, 106.53494417104986 29.553001295785034, 106.5347533366894 29.553116683440663, 106.53470442549691 29.553139754294644, 106.53466273108958 29.553150847263467, 106.53445666419124 29.553195121243576, 106.53437648237711 29.553207717857962, 106.53371497649381 29.553270046700998, 106.53327396510294 29.553302527710166, 106.533135245637 29.553313374172983, 106.53307590909331 29.55331242479328, 106.53301737458231 29.55330587936383, 106.53294681259902 29.553292106048527, 106.53233580869515 29.553073834742502, 106.53104801211458 
29.552609897260382, 106.53100070149137 29.552584983930522, 106.53096622189241 29.552547308448847, 106.5309485819835 29.55251287537683, 106.53093174470861 29.552464849362526, 106.53091510874995 29.552401329549546, 106.5309112001625 29.55238892397068, 106.53090648965842 29.5523768162039, 106.53090087699775 29.552365005989685, 106.53089446240851 29.55235369351284, 106.530887346133 29.552342779070994, 106.53087942792631 29.55233236236484, 106.5308708080166 29.552322643582468, 106.53042296772351 29.55187084550063, 106.53041474862134 29.551861727437913, 106.53040793292598 29.551851513433608, 106.53040252061984 29.551840503380355, 106.5303986119337 29.55182889746419, 106.53039630709925 29.551816795912288, 106.53039570633575 29.55180459883712, 106.53039690988172 29.551792306500577, 106.5304337082506 29.55159887366943, 106.53043761815854 29.551588187796032, 106.53044363303933 29.55157830710923, 106.53045135189915 29.551569930307195, 106.53046067448454 29.551563257051598, 106.53047119983061 29.551558486225073, 106.53048232648428 29.551555916149994, 106.53049385396939 29.55155554630322, 106.53087355599574 29.551561930504285, 106.53088789001085 29.551561267908173, 106.53090222406234 29.55155980560115, 106.53091635767603 29.551557543063705, 106.53093039109567 29.551554380591547, 106.53094422407177 29.551550517853258, 106.53095785661706 29.55154575492152, 106.53097108824427 29.551540291204873, 106.5310336378241 29.55151186361217, 106.53107854515166 29.551491187418307, 106.53115232048067 29.551472984867917, 106.53122689705413 29.551463580978652, 106.53155807786358 29.55144444149028, 106.53197545546574 29.551449110559787, 106.53198668188257 29.551445540602497, 106.53199189450707 29.551437157029422, 106.5319943007464 29.551425567457347))"
9 |
--------------------------------------------------------------------------------
/env.yaml:
--------------------------------------------------------------------------------
1 | name: geoscrapy
2 | channels:
3 | - conda-forge
4 | - defaults
5 | dependencies:
6 | - attrs=22.1.0=pyh71513ae_1
7 | - blosc=1.21.2=h1d6ff8b_0
8 | - boost-cpp=1.78.0=h1cb353e_1
9 | - branca=0.6.0=pyhd8ed1ab_0
10 | - brotli=1.0.9=h1a8c8d9_8
11 | - brotli-bin=1.0.9=h1a8c8d9_8
12 | - brotlipy=0.7.0=py310h8e9501a_1005
13 | - bzip2=1.0.8=h3422bc3_4
14 | - c-ares=1.18.1=h3422bc3_0
15 | - ca-certificates=2022.12.7=h4653dfc_0
16 | - cairo=1.16.0=h73a0509_1014
17 | - cfitsio=4.2.0=h2f961c4_0
18 | - charset-normalizer=2.1.1=pyhd8ed1ab_0
19 | - click=8.1.3=unix_pyhd8ed1ab_2
20 | - click-plugins=1.1.1=py_0
21 | - cligj=0.7.2=pyhd8ed1ab_1
22 | - contourpy=1.0.6=py310h2887b22_0
23 | - cryptography=38.0.4=py310hfc83b78_0
24 | - curl=7.86.0=h1c293e1_1
25 | - cycler=0.11.0=pyhd8ed1ab_0
26 | - expat=2.5.0=hb7217d7_0
27 | - fiona=1.8.22=py310hcdcf461_2
28 | - folium=0.13.0=pyhd8ed1ab_0
29 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
30 | - font-ttf-inconsolata=3.000=h77eed37_0
31 | - font-ttf-source-code-pro=2.038=h77eed37_0
32 | - font-ttf-ubuntu=0.83=hab24e00_0
33 | - fontconfig=2.14.1=h82840c6_0
34 | - fonts-conda-ecosystem=1=0
35 | - fonts-conda-forge=1=0
36 | - fonttools=4.38.0=py310h8e9501a_1
37 | - freetype=2.12.1=hd633e50_1
38 | - freexl=1.0.6=h1a8c8d9_1
39 | - gdal=3.5.3=py310h0b34360_8
40 | - geopandas=0.12.1=pyhd8ed1ab_1
41 | - geopandas-base=0.12.1=pyha770c72_1
42 | - geos=3.11.1=hb7217d7_0
43 | - geotiff=1.7.1=h90559a4_4
44 | - gettext=0.21.1=h0186832_0
45 | - giflib=5.2.1=h27ca646_2
46 | - hdf4=4.2.15=h1a38d6a_5
47 | - hdf5=1.12.2=nompi_h33dac16_100
48 | - icu=70.1=h6b3803e_0
49 | - idna=3.4=pyhd8ed1ab_0
50 | - jinja2=3.1.2=pyhd8ed1ab_1
51 | - joblib=1.2.0=pyhd8ed1ab_0
52 | - jpeg=9e=he4db4b2_2
53 | - json-c=0.16=hc449e50_0
54 | - kealib=1.5.0=hfd766a6_0
55 | - kiwisolver=1.4.4=py310h2887b22_1
56 | - krb5=1.19.3=he492e65_0
57 | - lcms2=2.14=h8193b64_0
58 | - lerc=4.0.0=h9a09cb3_0
59 | - libblas=3.9.0=16_osxarm64_openblas
60 | - libbrotlicommon=1.0.9=h1a8c8d9_8
61 | - libbrotlidec=1.0.9=h1a8c8d9_8
62 | - libbrotlienc=1.0.9=h1a8c8d9_8
63 | - libcblas=3.9.0=16_osxarm64_openblas
64 | - libcurl=7.86.0=h1c293e1_1
65 | - libcxx=14.0.6=h2692d47_0
66 | - libdap4=3.20.6=h8510809_2
67 | - libdeflate=1.14=h1a8c8d9_0
68 | - libedit=3.1.20191231=hc8eb9b7_2
69 | - libev=4.33=h642e427_1
70 | - libffi=3.4.2=h3422bc3_5
71 | - libgdal=3.5.3=h58c1bc0_8
72 | - libgfortran=5.0.0=11_3_0_hd922786_26
73 | - libgfortran5=11.3.0=hdaf2cc0_26
74 | - libglib=2.74.1=h4646484_1
75 | - libiconv=1.17=he4db4b2_0
76 | - libkml=1.3.0=h41464e4_1015
77 | - liblapack=3.9.0=16_osxarm64_openblas
78 | - libnetcdf=4.8.1=nompi_h2510be2_106
79 | - libnghttp2=1.47.0=h519802c_1
80 | - libopenblas=0.3.21=openmp_hc731615_3
81 | - libpng=1.6.39=h76d750c_0
82 | - libpq=15.1=h998ac43_1
83 | - librttopo=1.1.0=h844f84d_12
84 | - libspatialindex=1.9.3=hbdafb3b_4
85 | - libspatialite=5.0.1=h558e587_22
86 | - libsqlite=3.40.0=h76d750c_0
87 | - libssh2=1.10.0=h7a5bd25_3
88 | - libtiff=4.4.0=hfa0b094_4
89 | - libwebp-base=1.2.4=h57fd34a_0
90 | - libxcb=1.13=h9b22ae9_1004
91 | - libxml2=2.10.3=h87b0503_0
92 | - libzip=1.9.2=h76ab92c_1
93 | - libzlib=1.2.13=h03a7124_4
94 | - llvm-openmp=15.0.6=h7cfbb63_0
95 | - lz4-c=1.9.3=hbdafb3b_1
96 | - mapclassify=2.4.3=pyhd8ed1ab_0
97 | - markupsafe=2.1.1=py310h8e9501a_2
98 | - matplotlib-base=3.6.2=py310h78c5c2f_0
99 | - munch=2.5.0=py_0
100 | - munkres=1.1.4=pyh9f0ad1d_0
101 | - ncurses=6.3=h07bb92c_1
102 | - networkx=2.8.8=pyhd8ed1ab_0
103 | - nspr=4.35=hb7217d7_0
104 | - nss=3.78=h1483a63_0
105 | - numpy=1.23.5=py310h5d7c261_0
106 | - openjpeg=2.5.0=h5d4e404_1
107 | - openssl=3.0.7=h03a7124_1
108 | - packaging=22.0=pyhd8ed1ab_0
109 | - pandas=1.5.2=py310h2b830bf_0
110 | - pcre2=10.40=hb34f9b4_0
111 | - pillow=9.2.0=py310h9337a76_3
112 | - pip=22.3.1=pyhd8ed1ab_0
113 | - pixman=0.40.0=h27ca646_0
114 | - poppler=22.11.0=hae7f5f0_0
115 | - poppler-data=0.4.11=hd8ed1ab_0
116 | - postgresql=15.1=ha48369c_1
117 | - proj=9.1.0=h3bdf472_0
118 | - pthread-stubs=0.4=h27ca646_1001
119 | - pycparser=2.21=pyhd8ed1ab_0
120 | - pyopenssl=22.1.0=pyhd8ed1ab_0
121 | - pyparsing=3.0.9=pyhd8ed1ab_0
122 | - pyproj=3.4.0=py310hce8d790_2
123 | - pysocks=1.7.1=pyha2e5f31_6
124 | - python=3.10.4=h14b404e_0_cpython
125 | - python-dateutil=2.8.2=pyhd8ed1ab_0
126 | - python_abi=3.10=3_cp310
127 | - pytz=2022.6=pyhd8ed1ab_0
128 | - readline=8.1.2=h46ed386_0
129 | - requests=2.28.1=pyhd8ed1ab_1
130 | - rtree=1.0.1=py310ha3239f5_1
131 | - scikit-learn=1.2.0=py310ha00a7cd_0
132 | - scipy=1.9.3=py310ha0d8a01_2
133 | - setuptools=65.5.1=pyhd8ed1ab_0
134 | - shapely=1.8.5=py310h9356385_2
135 | - six=1.16.0=pyh6c4a22f_0
136 | - snappy=1.1.9=h17c5cce_2
137 | - sqlite=3.40.0=h2229b38_0
138 | - threadpoolctl=3.1.0=pyh8a188c0_0
139 | - tiledb=2.11.3=h9bd36d0_1
140 | - tk=8.6.12=he1e0b03_0
141 | - tzcode=2022g=h1a8c8d9_0
142 | - tzdata=2022g=h191b570_0
143 | - unicodedata2=15.0.0=py310h8e9501a_0
144 | - urllib3=1.26.13=pyhd8ed1ab_0
145 | - wheel=0.38.4=pyhd8ed1ab_0
146 | - xerces-c=3.2.4=h627aa08_1
147 | - xorg-libxau=1.0.9=h27ca646_0
148 | - xorg-libxdmcp=1.1.3=h27ca646_0
149 | - xyzservices=2022.9.0=pyhd8ed1ab_0
150 | - xz=5.2.6=h57fd34a_0
151 | - zlib=1.2.13=h03a7124_4
152 | - zstd=1.5.2=h8128057_4
153 | - pip:
154 | - automat==22.10.0
155 | - certifi==2022.12.7
156 | - cffi==1.15.1
157 | - constantly==15.1.0
158 | - cssselect==1.2.0
159 | - fake-useragent==1.1.1
160 | - faker==15.3.4
161 | - filelock==3.8.2
162 | - hyperlink==21.0.0
163 | - incremental==22.10.0
164 | - itemadapter==0.7.0
165 | - itemloaders==1.0.6
166 | - jmespath==1.0.1
167 | - lxml==4.9.1
168 | - parsel==1.7.0
169 | - protego==0.2.1
170 | - pyasn1==0.4.8
171 | - pyasn1-modules==0.2.8
172 | - pydispatcher==2.0.6
173 | - queuelib==1.6.2
174 | - requests-file==1.5.1
175 | - scrapy==2.7.1
176 | - scrapy-fake-useragent==1.4.4
177 | - service-identity==21.1.0
178 | - tldextract==3.4.0
179 | - twisted==22.10.0
180 | - typing-extensions==4.4.0
181 | - w3lib==2.1.1
182 | - zope-interface==5.5.2
183 | prefix: /Users/prufrock/opt/miniconda3/envs/geoscrapy
184 |
--------------------------------------------------------------------------------
/images/AOI_Peking_University.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/images/AOI_Peking_University.png
--------------------------------------------------------------------------------
/images/running_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/images/running_process.png
--------------------------------------------------------------------------------
/images/similarity_problem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeshuoShu/BaiduAOISpider/0783b6f931d4efcbbc72061375cacc571bba6b90/images/similarity_problem.png
--------------------------------------------------------------------------------
/processor/__init__.py:
--------------------------------------------------------------------------------
1 | # import all modules
2 | from processor.aoi_container import AOIContainer
3 | from processor.api_handler import APIHandler
4 | from processor.counter import Counter
5 | from processor.file_operator import FileOperator
6 | from processor.logger import Logger
7 | from processor.repository import Repo
8 | from processor.validator import Validator
9 |
--------------------------------------------------------------------------------
/processor/aoi_container.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from difflib import SequenceMatcher
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from numpy.typing import NDArray
7 | from shapely.geometry import Point, Polygon
8 |
9 | from processor.repository import Repo
10 | from spatial.geometry import wgs84_to_wgs84utm50n
11 |
12 |
class AOI(object):
    """A single candidate Area of Interest (AOI) returned for a POI search.

    Attributes
    ----------
    uid_name : str
        Name attached to the AOI's uid.
    geometry : Polygon
        AOI polygon.
    search_rank : int
        Position of the AOI in the search results.
    area : float
        Area of the polygon in square kilometers.

    The ``similarity`` attribute read by `_not_too_different` is not set
    here; it is attached later, and only when similarity sorting is enabled.
    """

    def __init__(self, rank: int, uid_name: str, geometry: Polygon) -> None:
        self.uid_name = uid_name
        self.geometry = geometry
        self.search_rank = rank
        self.area = self._area() / 1000000  # convert to square kilometers

    def _area(self) -> float:
        """Return the area of the projected polygon (unit: square meters)."""
        return wgs84_to_wgs84utm50n(self.geometry).area

    def _not_too_big_or_too_small(self) -> bool:
        """Return True if the area lies within the configured [min, max] range."""
        return Repo._min_aoi_area <= self.area <= Repo._max_aoi_area

    def _not_too_different(self) -> bool:
        """
        Return True either `sort_by_similarity` is disabled,
        or the similarity between the names of the AOI and the POI
        is above the threshold when similarity sorting is enabled.
        """
        # Use truthiness rather than `== 0`: a missing/None setting also means
        # "disabled", and in that case `self.similarity` was never computed,
        # so it must not be read.
        if not Repo._sortings.get("sort_by_similarity"):
            return True
        return self.similarity >= Repo._min_similarity
35 |
36 |
class AOI_list(object):
    """Candidate AOIs collected for one POI, plus the logic to pick the best one."""

    def __init__(self, idx: int) -> None:
        # POI attributes are read once from the shared input table.
        self.poi_name = Repo.file.loc[idx, "name"]
        self.p_lng = Repo.file.loc[idx, "lng_wgs84"]
        self.p_lat = Repo.file.loc[idx, "lat_wgs84"]
        self.aoi_list = []

    def _append(self, aoi: "AOI") -> None:
        """Attach POI-related properties to `aoi` and keep it if it is valid."""
        aoi = self._add_poi_related_property(aoi)
        if self._validate_aoi(aoi):
            self.aoi_list.append(aoi)

    def _validate_aoi(self, aoi: "AOI") -> bool:
        """Return True if `aoi` passes the bounding-box, area and similarity checks."""

        def bbox_contains_poi() -> bool:
            """
            Check if the `bounding box` of AOI contains the corresponding POI.
            """
            lng1, lat1, lng2, lat2 = aoi.geometry.bounds
            return bool(lng1 <= self.p_lng <= lng2 and lat1 <= self.p_lat <= lat2)

        return (
            bbox_contains_poi()
            and aoi._not_too_big_or_too_small()
            and aoi._not_too_different()
        )

    def _get_best_aoi(self) -> "AOI | None":
        """Return the AOI with the lowest weighted rank, or None if there is none."""
        if not self.aoi_list:
            return None
        best_aoi_idx = int(np.argmin(self._weighted_rank()))
        return self.aoi_list[best_aoi_idx]

    def _sort_by_search_rank(self) -> NDArray:
        return self._get_rank(lambda aoi: aoi.search_rank)

    def _sort_by_area(self) -> NDArray:
        """
        Sort in ascending order if `sort_by_area` is 1, in descending order if -1.
        """
        direction = Repo._sortings.get("sort_by_area")
        return self._get_rank(lambda aoi: direction * aoi.area)

    def _sort_by_distance(self) -> NDArray:
        return self._get_rank(lambda aoi: aoi.distance)

    def _sort_by_similarity(self) -> NDArray:
        return self._get_rank(lambda aoi: -aoi.similarity)  # descending

    def _get_rank(self, func: callable) -> NDArray:
        """
        Return an rank array [r1, r2, ... rn] for an AOI property list
        [aoi_1.property, aoi_2.property, ... aoi_n.property],
        such that aoi_i.property is the ri-th smallest element.
        """
        return np.argsort(np.argsort([func(aoi) for aoi in self.aoi_list]))

    @staticmethod
    def _combine_ranks(ranks, values) -> NDArray:
        """Return the weighted sum of `ranks`, weighting by the magnitude of `values`.

        The sign of a sorting value only selects the sort direction, which the
        individual `_sort_by_*` methods already apply. Using the signed value
        as a weight as well would flip that direction back, so only the
        absolute value is used here.
        """
        weights = np.abs(np.asarray(values, dtype=float))
        weights = weights / weights.sum()
        return sum(rank * weight for rank, weight in zip(ranks, weights))

    def _weighted_rank(self) -> NDArray:
        """Combine the rank arrays of every enabled (non-zero) sorting."""
        active = {name: value for name, value in Repo._sortings.items() if value}
        if not active:
            # No sorting enabled: all candidates tie, so the first one wins.
            return np.zeros(len(self.aoi_list))
        ranks = [getattr(self, f"_{name}")() for name in active]
        return self._combine_ranks(ranks, list(active.values()))

    def _add_poi_related_property(self, aoi: "AOI") -> "AOI":
        """Set `distance` / `similarity` on `aoi` when the matching sorting is enabled."""

        def cal_distance(aoi: "AOI") -> float:
            """
            Plane distance between the POI and the AOI
            in Wgs84-Utm50N projection.
            """
            geometry = wgs84_to_wgs84utm50n(aoi.geometry)
            # Point takes (x, y) = (lng, lat), the same axis order as the AOI
            # geometry (see the `bounds` unpacking in `_validate_aoi`).
            point = wgs84_to_wgs84utm50n(Point(self.p_lng, self.p_lat))
            return geometry.distance(point)

        def cal_similarity(aoi: "AOI") -> float:
            """
            Text similarity is calculated using difflib `SequenceMatcher`. For its algorithm,
            see https://stackoverflow.com/questions/35517353/how-does-pythons-sequencematcher-work
            """
            return SequenceMatcher(None, aoi.uid_name, self.poi_name).ratio()

        if Repo._sortings.get("sort_by_distance"):
            aoi.distance = cal_distance(aoi)
        if Repo._sortings.get("sort_by_similarity"):
            aoi.similarity = cal_similarity(aoi)
        return aoi
126 |
127 |
class AOIContainer(object):
    """Class-level registry that maps each POI index to its `AOI_list`."""

    @classmethod
    def mold(cls) -> None:
        """Create an empty `AOI_list` for every POI index in `Repo.file`."""
        registry = {}
        for poi_idx in Repo.file.index:
            registry[poi_idx] = AOI_list(poi_idx)
        cls._dict = registry
        logging.warning("(6/6) AOIContainer is ready.")

    @classmethod
    def append(cls, idx: int, rank: int, uid_name: str, geometry: Polygon) -> None:
        """Conditionally add an AOI to the `AOI_list` of its POI.

        The AOI is kept only when all of the following hold:
        - The bounding box of the AOI contains the corresponding POI.
        - The AOI's area is not too big or too small.
        - The AOI's name is not too different from the POI's name.

        Parameters
        ----------
        idx : int
            POI index.
        rank : int
            AOI search rank.
        uid_name : str
            AOI uid_name.
        geometry : Polygon
            AOI geometry.
        """
        candidate = AOI(rank, uid_name, geometry)
        poi_candidates = cls._dict[idx]
        poi_candidates._append(candidate)

    @classmethod
    def get_best_aoi(cls, idx: int) -> AOI:
        """Return the best AOI collected for the POI with index `idx`.

        Parameters
        ----------
        idx : int
            POI index.

        Returns
        -------
        AOI
            The best AOI of the POI with index `idx`.
        """
        poi_candidates = cls._dict[idx]
        return poi_candidates._get_best_aoi()
173 |
--------------------------------------------------------------------------------
/processor/api_handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | from typing import List, Tuple
4 |
5 | import pandas as pd
6 | from scrapy.http import Response
7 | from shapely.geometry import Polygon
8 |
9 | from processor.repository import Repo
10 | from spatial.coords import bd09ll_to_wgs84, bd09mc_to_wgs84
11 | from spatial.geometry import points_to_polygon, within_distance
12 |
13 |
14 | class APIHandler(object):
@classmethod
def assemble_uid_urls(cls) -> List[Tuple[int, str]]:
    """
    Construct `Baidu uid` circular area search urls (POIs that are already queried are skipped) using following parameters, and return a list of `(DataFrame_idx, url)` tuples:
    - ak (str): a random Baidu API key
    - name (str): POI's name (percent-encoded before it is put in the url)
    - lng/lat (float): POI's longitude/latitude (wgs84 CRS)
    - radius (int): area search radius, in meters
    - radius_limit (str): 'true' or 'false', whether to limit the search radius
    - prim_ind (str): primary industry category
    - sec_ind (str): secondary industry category
    - scope (int): search scope, equals 2 if `prim_ind` and `sec_ind` are specified, otherwise equals 1
    """
    # Local import keeps this method self-contained.
    from urllib.parse import quote

    urls = []
    df = Repo.file.copy()
    # store industry parameter in a column
    if Repo._prim_ind != "VAR":
        df["prim_ind"] = Repo._prim_ind
    if Repo._sec_ind != "VAR":
        df["sec_ind"] = Repo._sec_ind
    # concatenate urls
    for idx in df.index:
        # skip POIs that are already queried
        if not pd.isna(df.loc[idx, "status"]):
            continue
        # Escape the name: characters such as `&`, `#`, `=` or `+` would
        # otherwise be read as url syntax and corrupt the query.
        name = quote(str(df.loc[idx, "name"]), safe="")
        lng, lat = df.loc[idx, "lng_wgs84"], df.loc[idx, "lat_wgs84"]
        prim_ind, sec_ind = df.loc[idx, "prim_ind"], df.loc[idx, "sec_ind"]
        url = (
            f"https://api.map.baidu.com/place/v2/search?"
            f"query={name}"
            f"&location={lat},{lng}"
            f"&radius={Repo._radius}"
            f"&radius_limit={Repo._radius_limit}"
            f"&ak={random.choice(Repo._ak_list)}"
            f"&output=json&coord_type=1"
        )
        url += cls._industry_url_segment(prim_ind, sec_ind)
        urls.append((idx, url))
    return urls
55 |
@classmethod
def extract_uid_name_rank(
    cls, idx: int, response: Response
) -> List[Tuple[str, str, int]]:
    """
    Parse the `Baidu uid` response, filter the results,
    and return a list of `(uid_name, uid, search_rank)` triples.
    If `USE_FIRST_UID` is on, only the first result is returned.

    Filter Rules:
    -----
    1. `name`, `uid`, and `geo-location` must exist.
    2. The search result must be within the radius.
    3. (Optional, if any of `prim_ind` and `sec_ind` are specified) The industry category must be consistent.

    Raises
    -----
    Exception
        Raised by `_check_status` when the response status is not 0.

    Json Response Example
    -----
    ```
    # Suppose we search Peking University, the list extracted is of the form:
    # [('北京大学', 'ddfd7c2d8db36cf39ee3219e', 1), ...]
    {
        # some information
        # ...
        "results": [
            {
                "name": "北京大学",
                "location": {
                    "lat": 39.998877,
                    "lng": 116.316833,
                },
                "address": "北京市海淀区颐和园路5号",
                "province": "北京市",
                "city": "北京市",
                "area": "海淀区",
                "telephone": "(010)62752114",
                "detail": 1,
                "uid": "ddfd7c2d8db36cf39ee3219e",
                "detail_info":{
                    "tag":"教育培训;高等院校",
                    # ...
                },
                # more information
                # ...
            },
            # more uids
            # ...
        ]
    }
    ```
    """
    # Decode the body once and read both fields from the same object.
    payload = json.loads(response.text)
    cls._check_status(payload.get("status"))
    results = payload.get("results")
    name_uid_rank = []
    if not results:
        return name_uid_rank
    # background POI property
    p_property = cls._get_poi_property(Repo.file, idx)
    # search ranks are 1-based
    for rank, result in enumerate(results, start=1):
        # extract uid's property
        u_property = cls._get_uid_property(result)
        # keep the result if it passes all the rules
        if cls._pass_filter_rules(**p_property, **u_property):
            name_uid_rank.append((result["name"], result["uid"], rank))
        # stop at the first accepted result when only one is wanted
        if Repo._use_first_uid and name_uid_rank:
            break
    return name_uid_rank
124 |
125 | @staticmethod
126 | def assemble_aoi_url(uid: str) -> str:
127 | """
128 | Construct a `Baidu AOI` url with this AOI's `uid`.
129 | """
130 | return (
131 | f"https://map.baidu.com/?newmap=1&qt=ext&"
132 | f"uid={uid}&ext_ver=new&ie=utf-8&l=11"
133 | )
134 |
@staticmethod
def get_polygon_geometry(response: Response) -> Polygon | None:
    """
    Extract the AOI polygon from a `Baidu AOI` json response.

    Returns `None` when the response carries no (or an empty) `content.geo`.

    Json Response Example
    -----
    The `geo` string has the layout
    `4|some_other_x, some_other_y...|1-x1, y1, x2, y2,..., xn, yn;`
    and only the `x1, y1,..., xn, yn` part is used.

    ```
    # Response for the uid of Peking University:
    {
        # ...
        "content": {
            "geo": "4|12946839.266068,4837125.446178;12949751.777560,4839020.969541|1-12948599.7094790,4837127.8547043,...,12948599.7094790,4837127.8547043;",
            "uid": "ddfd7c2d8db36cf39ee3219e"
        },
        # ...
    }
    ```
    """
    payload = json.loads(response.text)
    geo = payload.get("content", {}).get("geo")
    if not geo:
        return None
    # third "|"-separated field; strip its 2-character prefix ("1-")
    # and its last character (";") before splitting the coordinates
    boundary = geo.split("|")[2]
    flat_coords = boundary[2:-1].split(",")
    # flat_coords is [x1, y1, x2, y2, ...]: pair even-indexed x with odd-indexed y
    xs, ys = flat_coords[::2], flat_coords[1::2]
    vertices = [bd09mc_to_wgs84(float(x), float(y)) for x, y in zip(xs, ys)]
    return points_to_polygon(vertices)
171 |
172 | @staticmethod
173 | def _industry_url_segment(prim_ind: str, sec_ind: str) -> str:
174 | if prim_ind and sec_ind:
175 | return f"&tag={prim_ind};{sec_ind}&scope=2"
176 | elif prim_ind or sec_ind:
177 | return f"&tag={prim_ind + sec_ind}&scope=2"
178 | return "&scope=1"
179 |
180 | @staticmethod
181 | def _check_status(status: int) -> None:
182 | """
183 | For more status code information, please refer to
184 | https://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi
185 | """
186 | if status == 0:
187 | return
188 | elif status // 100 == 2:
189 | raise Exception(f"API Parameter Invalid: {status}.")
190 | elif status // 100 == 3:
191 | raise Exception(f"API Verify Failure: {status}.")
192 | elif status // 100 == 4:
193 | raise Exception(f"API Quota Failure: {status}.")
194 | elif status // 100 == 5:
195 | raise Exception(f"API AK Failure: {status}.")
196 | else:
197 | raise Exception(f"API Error: {status}.")
198 |
@staticmethod
def _get_poi_property(df: pd.DataFrame, idx: int) -> dict:
    """
    Collect the properties of the POI at row `idx` used for filtering uids.

    Returns a dict with the keys `p_lng`, `p_lat` (read from the
    `lng_wgs84`/`lat_wgs84` columns), `radius` (`Repo._radius / 1000`)
    and `p_prim_ind`, `p_sec_ind`.
    """

    def resolve(setting: str, column: str):
        # "VAR" means the industry is taken from the POI's own row
        return df.loc[idx, column] if setting == "VAR" else setting

    radius_km = Repo._radius / 1000  # convert to km
    return {
        "p_lng": df.loc[idx, "lng_wgs84"],
        "p_lat": df.loc[idx, "lat_wgs84"],
        "radius": radius_km,
        "p_prim_ind": resolve(Repo._prim_ind, "prim_ind"),
        "p_sec_ind": resolve(Repo._sec_ind, "sec_ind"),
    }
218 |
219 | @staticmethod
220 | def _get_uid_property(result: dict) -> dict:
221 | return dict(
222 | uid=result.get("uid"),
223 | uid_name=result.get("name"),
224 | u_lng=result.get("location", {}).get("lng"), # of bd09ll CRS
225 | u_lat=result.get("location", {}).get("lat"),
226 | u_tag=result.get("detail_info", {}).get("tag"),
227 | )
228 |
229 | @staticmethod
230 | def _pass_filter_rules(
231 | p_lng: float,
232 | p_lat: float,
233 | radius: float,
234 | p_prim_ind: str,
235 | p_sec_ind: str,
236 | uid: str,
237 | uid_name: str,
238 | u_lng: float,
239 | u_lat: float,
240 | u_tag: str | None,
241 | ) -> bool:
242 | # 1. check if key information exists
243 | if not (uid and uid_name and u_lng and u_lat):
244 | return False
245 | # 2. should not be outside the radius
246 | u_lng, u_lat = bd09ll_to_wgs84(u_lng, u_lat) # re-project to wgs84 CRS
247 | if not within_distance(u_lng, u_lat, p_lng, p_lat, distance=radius):
248 | return False
249 | # 3. check if industry category is consistent
250 | if u_tag is None:
251 | return True # if no industry category is provided, pass
252 | elif ";" in u_tag:
253 | u_prim_ind, u_sec_ind = u_tag.split(";")
254 | return (p_prim_ind == u_prim_ind) and (p_sec_ind == u_sec_ind)
255 | else:
256 | return (p_prim_ind in u_tag) or (p_sec_ind in u_tag)
257 |
--------------------------------------------------------------------------------
/processor/counter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from typing import Tuple
4 |
5 | from processor.repository import Repo
6 |
7 |
class Counter(object):
    """
    Class-level bookkeeping of the crawling progress.

    All state is stored on the class itself; `boot` creates the attributes
    the other methods read, so it has to run first.
    """

    @classmethod
    def boot(cls) -> None:
        """
        Initialise the counters from the current content of `Repo.file`.
        """
        cls._poi_num = len(Repo.file)
        # status counts present before this run starts
        cls._init_status = cls._count_status()
        cls._status = ()
        # per-POI call table; columns missing from `Repo.file` are filled with 0
        cls._df = Repo.file.reindex(
            columns=["poi_aoi_total", "poi_aoi_called"], fill_value=0
        )
        cls._init_time = time.time()
        cls._time = cls._init_time
        cls._poi_to_crawl = cls._poi_num - sum(cls._init_status)
        logging.warning("(5/6) Counter booted.")

    @classmethod
    def write_aoi_total_num(cls, idx: int, total_num: int) -> None:
        """
        Write the total number of AOIs of a POI into the `Counter`.
        """
        cls._df.loc[idx, "poi_aoi_total"] = total_num

    @classmethod
    def count_aoi_called(cls, idx: int) -> None:
        """
        Count when an AOI url of a POI is called.
        """
        cls._df.loc[idx, "poi_aoi_called"] += 1

    @classmethod
    def all_aoi_called(cls, idx: int) -> bool:
        """
        Determine if all AOIs of a POI are called.
        """
        total_num = cls._df.loc[idx, "poi_aoi_total"]
        called_num = cls._df.loc[idx, "poi_aoi_called"]
        return called_num == total_num

    @classmethod
    def reach_update_interval(cls) -> bool:
        """
        Determine if the `UPDATE_INTERVAL` is reached.

        When it is, the reference time used for speed statistics is
        refreshed as a side effect.
        """
        total_called_times = cls._df.poi_aoi_called.sum()
        if total_called_times % Repo._update_interval == 0:
            cls._time = time.time()
            return True
        return False

    @staticmethod
    def _count_status() -> Tuple[int, int, int]:
        """
        Count the rows of `Repo.file` per final status.

        Returns:
            tuple(int, int, int): (matched, no_uid, no_geometry)
        """

        def count(status: str) -> int:
            return Repo.file.status.eq(status).sum()

        matched = count("Matched")
        no_uid = count("No Uid")
        no_geometry = count("No Geometry")
        return matched, no_uid, no_geometry

    @classmethod
    def _count_missing(cls) -> int:
        """
        Number of POIs that have none of the three final statuses.
        """
        return cls._poi_num - sum(cls._count_status())

    @classmethod
    def _cal_speed_xTime(cls) -> Tuple[str, str]:
        """
        Compute the average crawling speed and the expected remaining time.

        Returns:
            tuple(str, str): (formatted average speed, formatted remaining time)
        """
        # average crawling speed
        poi_crawled = sum(cls._count_status()) - sum(cls._init_status)
        time_elapsed = cls._time - cls._init_time
        if time_elapsed == 0:
            return "nan/s (nan/h)", "nan"
        avg_speed = poi_crawled / time_elapsed
        # expected remaining time
        poi_remaining = cls._poi_to_crawl - poi_crawled
        if avg_speed == 0:
            xTime = "Inf"
        else:
            xTime = cls._format_time(poi_remaining / avg_speed)
        # format average speed
        avg_speed = f"{avg_speed:.2f}/s ({avg_speed*3600:.0f}/h)"
        return avg_speed, xTime

    @classmethod
    def _total_time(cls) -> str:
        """
        Formatted time between `boot` and the last recorded update.
        """
        return cls._format_time(cls._time - cls._init_time)

    @staticmethod
    def _format_time(time: float) -> str:
        """
        Render a duration in seconds as `Ns`, `NminMs`, `NhMmin` or `>24h`.
        """
        if time != time:  # NaN cannot be rounded
            return "nan"
        if time > 24 * 60 * 60:
            return ">24h"
        # Round to whole seconds BEFORE splitting into units, otherwise the
        # remainder itself can round up and print e.g. "1min60s".
        seconds = int(round(time))
        if seconds >= 60 * 60:
            hours, rest = divmod(seconds, 3600)
            return f"{hours}h{rest // 60}min"
        if seconds >= 60:
            minutes, secs = divmod(seconds, 60)
            return f"{minutes}min{secs}s"
        return f"{seconds}s"
103 |
--------------------------------------------------------------------------------
/processor/file_operator.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import geopandas as gpd
4 | import pandas as pd
5 |
6 | from processor.aoi_container import AOI
7 | from processor.repository import Repo
8 | from spatial.coords import bd09ll_to_wgs84, gcj02_to_wgs84
9 | from spatial.geometry import wkt_to_geometry
10 |
11 |
class FileOperator(object):
    """
    Column preparation, CRS conversion and saving of `Repo.file`.
    """

    @staticmethod
    def add_cols() -> None:
        """
        In the output `AOI csv`, five additional columns will be added:
        - status (str): 'Matched', 'No Uid' or 'No Geometry'
        - uid_name (str): name of the uid whose geometry is chosen
        - lng_wgs84 (float)/lat_wgs84 (float): longitude/latitude in wgs84 CRS
        - geometry (`wkt`, well known text): AOI polygon geometry
        """
        for col in ["status", "uid_name", "lng_wgs84", "lat_wgs84", "geometry"]:
            if col not in Repo.file.columns:
                Repo.file[col] = None
        # the 'geometry' column will be saved as wkt in csv
        # convert it to shapely geometry when re-crawling
        Repo.file.geometry = Repo.file.geometry.apply(
            lambda x: wkt_to_geometry(x) if isinstance(x, str) else x
        )
        logging.warning("(3/6) Additional columns appended.")

    @classmethod
    def convert_crs_to_wgs84(cls) -> None:
        """
        Convert longitude and latitude columns
        from `gcj02` or `bd09ll` CRS to `wgs84`,
        and save as new columns: `lng_wgs84` and `lat_wgs84`.
        """
        # if the CRS is already wgs84, copy the original columns
        if Repo._crs == "wgs84":
            cls._transform_crs(lambda x, y: (x, y))
            logging.warning("(4/6) CRS is already wgs84.")
            return
        # only support gcj02 and bd09ll conversion; the accepted setting
        # values are "gcj02" and "bd09" (see the settings validation)
        if Repo._crs == "gcj02":
            cls._transform_crs(gcj02_to_wgs84)
        elif Repo._crs == "bd09":
            cls._transform_crs(bd09ll_to_wgs84)
        logging.warning("(4/6) CRS converted to wgs84.")

    @staticmethod
    def write_aoi_and_status(idx: int, best_aoi: AOI) -> None:
        """
        Write the best AOI geometry and crawling status into the file.
        """
        Repo.file.loc[idx, "status"] = "Matched"
        Repo.file.loc[idx, "geometry"] = best_aoi.geometry
        Repo.file.loc[idx, "uid_name"] = best_aoi.uid_name

    @classmethod
    def save_file(cls) -> None:
        """
        Save the file as csv and shp (if any geometry exists).
        """
        cls._save_as_csv()
        cls._save_as_shp()

    @staticmethod
    def _transform_crs(func: callable) -> None:
        """
        Apply `func(lng, lat) -> (lng, lat)` to every row and store the
        result in the `lng_wgs84` / `lat_wgs84` columns of `Repo.file`.
        """
        Repo.file[["lng_wgs84", "lat_wgs84"]] = pd.DataFrame(
            Repo.file.apply(lambda x: func(x.lng, x.lat), axis=1).tolist(),
            index=Repo.file.index,
        )

    @staticmethod
    def _save_as_csv() -> None:
        """
        Write `Repo.file` to `Repo._poi_csv_path` (utf-8, no index column).
        """
        Repo.file.to_csv(Repo._poi_csv_path, encoding="utf-8", index=False)

    @staticmethod
    def _save_as_shp() -> None:
        """
        Write the rows that have a geometry to `Repo._aoi_shp_path`.
        """
        df = Repo.file.dropna(subset=["geometry"])
        # export to shp only when there is at least one geometry
        if len(df):
            gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="epsg:4326")
            gdf.to_file(Repo._aoi_shp_path, encoding="utf-8")
86 |
--------------------------------------------------------------------------------
/processor/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from processor.counter import Counter
4 |
5 |
class Logger(object):
    """
    Progress and result messages of the crawl, emitted at WARNING/ERROR level.
    """

    @staticmethod
    def log_progress() -> None:
        """
        Log the status counts, but only if they changed since the last call.
        """
        current = Counter._count_status()
        if Counter._status == current:
            return
        Counter._status = current
        matched, no_uid, no_geometry = current
        done = sum(current)
        logging.warning(
            f"{matched}/{no_uid}/{no_geometry}/{Counter._poi_num} | "
            f"{done} ({done/Counter._poi_num:.2%})"
        )

    @classmethod
    def log_start(cls) -> None:
        """
        Log the start banner, the POI totals and the initial progress.
        """
        for message in (
            "# ---------- Crawling Started ---------- #",
            f"-- POI total number: {Counter._poi_num}.",
            f"-- POIs to crawl: {Counter._poi_to_crawl}.",
        ):
            logging.warning(message)
        cls.log_progress()

    @staticmethod
    def log_uid_fail(exception: Exception, idx: int) -> None:
        """
        Log that the uid response of POI `idx` could not be parsed.
        """
        message = f"POI index {idx} failed to parse uid. Reason: {exception}"
        logging.error(message)

    @staticmethod
    def log_aoi_fail(exception: Exception, idx: int, uid_name: str) -> None:
        """
        Log that the AOI response of `uid_name` (POI `idx`) could not be parsed.
        """
        message = (
            f"{uid_name} of POI index {idx} failed to parse AOI. Reason: {exception}"
        )
        logging.error(message)

    @staticmethod
    def log_update() -> None:
        """
        Log the average speed and the expected remaining time.
        """
        avg_speed, xTime = Counter._cal_speed_xTime()
        logging.warning(f"-- Updated. Avg speed: {avg_speed}. Time remaining: {xTime}.")

    @staticmethod
    def log_finish() -> None:
        """
        Log the end banner and the final matched/missing summary.
        """
        avg_speed, _ = Counter._cal_speed_xTime()
        total_time = Counter._total_time()
        poi_missing = Counter._count_missing()
        poi_matched = Counter._count_status()[0]
        missing_prop = poi_missing / Counter._poi_num
        matched_prop = poi_matched / Counter._poi_num
        logging.warning("# ---------- Crawling Ended ---------- #")
        logging.warning(
            f"-- Avg speed: {avg_speed}. Total crawling time: {total_time}."
        )
        logging.warning(f"-- {poi_matched} ({matched_prop:.2%}) POIs are matched.")
        if not poi_missing:
            logging.warning("-- All POIs are crawled. Re-crawling is not needed.")
            return
        logging.warning(
            f"-- {poi_missing} ({missing_prop:.2%}) POIs are missing. "
            f"Re-crawling is recommended."
        )
61 |
--------------------------------------------------------------------------------
/processor/repository.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pandas as pd
4 |
5 |
class Repo(object):
    """
    Class-level store for the imported settings and the loaded POI table.
    """

    @classmethod
    def import_settings(cls, settings: dict) -> None:
        """
        Copy the values of `settings` onto the class, then log the banner.
        """
        cls._import_settings(settings)
        logging.warning("# ---------- Initialization ---------- #")

    @classmethod
    def load_file(cls) -> None:
        """
        Read the csv at `_poi_csv_path` (utf-8) into `Repo.file`.
        """
        cls.file = pd.read_csv(cls._poi_csv_path, encoding="utf-8")

    @classmethod
    def _import_settings(cls, settings: dict) -> None:
        """
        Map settings keys to private class attributes; absent keys become `None`.
        """
        # Spider, file path and AK settings live at the top level
        top_level = (
            ("_proxy_enabled", "PROXY_ENABLED"),
            ("_update_interval", "UPDATE_INTERVAL"),
            ("_use_first_uid", "USE_FIRST_UID"),
            ("_poi_csv_path", "POI_CSV_PATH"),
            ("_aoi_shp_path", "AOI_SHP_PATH"),
            ("_ak_list", "AK_LIST"),
        )
        for attribute, key in top_level:
            setattr(cls, attribute, settings.get(key))

        # Baidu API settings: attribute name is "_" + key
        api_params = settings.get("API_PARAMS", {})
        for key in ("prim_ind", "sec_ind", "radius", "radius_limit", "crs"):
            setattr(cls, f"_{key}", api_params.get(key))

        # AOI filter settings
        filter_rules = settings.get("FILTER_RULES", {})
        for key in ("min_aoi_area", "max_aoi_area", "min_similarity"):
            setattr(cls, f"_{key}", filter_rules.get(key))
        sorting_keys = (
            "sort_by_search_rank",
            "sort_by_area",
            "sort_by_distance",
            "sort_by_similarity",
        )
        cls._sortings = {key: filter_rules.get(key) for key in sorting_keys}
45 |
--------------------------------------------------------------------------------
/processor/validator.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from processor.repository import Repo
5 |
6 |
7 | class Validator(object):
8 | @classmethod
9 | def validate_settings(cls) -> None:
10 | cls._validate_spider_settings()
11 | cls._validate_path_settings()
12 | cls._validate_api_settings()
13 | cls._validate_aoi_filter_settings()
14 | logging.warning("(1/6) Settings validation complete.")
15 |
16 | @classmethod
17 | def validate_file(cls) -> None:
18 | """
19 | `POI csv` should have the following columns:
20 | - Compulsory:
21 | - name (str): POI name
22 | - lng (float): POI's longitude
23 | - lat (float): POI's latitude
24 | - Optional:
25 | - prim_ind (str): POI's primary industry classification
26 | - sec_ind (str): POI's secondary industry classification
27 | """
28 | for col in ["name", "lng", "lat"]:
29 | if col not in Repo.file.columns:
30 | raise ValueError(f'Column "{col}" is missing.')
31 | cls._check_optional_col("prim_ind", Repo._prim_ind)
32 | cls._check_optional_col("sec_ind", Repo._sec_ind)
33 | logging.warning("(2/6) POI csv file validation complete.")
34 |
35 | @classmethod
36 | def _validate_spider_settings(cls) -> None:
37 | # PROXY_ENABLED, USE_FIRST_UID is bool type
38 | cls._verify_value_type(Repo._proxy_enabled, "PROXY_ENABLED", bool)
39 | cls._verify_value_type(Repo._use_first_uid, "USE_FIRST_UID", bool)
40 | # UPDATE_INTERVAL must be a positive number
41 | cls._verify_non_negative_num(Repo._update_interval, "UPDATE_INTERVAL")
42 |
43 | @staticmethod
44 | def _validate_path_settings() -> None:
45 | poi_dir = Repo._poi_csv_path
46 | aoi_dir = Repo._aoi_shp_path
47 | aoi_parent_dir = os.path.dirname(aoi_dir)
48 | # POI directory existence
49 | if not os.path.exists(poi_dir):
50 | raise FileNotFoundError(f'POI_CSV_PATH not found: "{poi_dir}".')
51 | if not poi_dir.endswith(".csv"):
52 | raise ValueError(f'"{poi_dir}" must be a csv file.')
53 | # If AOI shp parent directory does not exist, create it
54 | if not aoi_dir.endswith(".shp"):
55 | raise ValueError(f'"{aoi_dir}" must be a shp file.')
56 | if not os.path.exists(aoi_parent_dir):
57 | os.makedirs(aoi_parent_dir)
58 | logging.warning("(0/6) AOI_SHP_PATH parent directory created.")
59 |
60 | @classmethod
61 | def _validate_api_settings(cls) -> None:
62 | # AK_LIST must be a list of strings
63 | cls._verify_value_type(Repo._ak_list, "AK_LIST", list)
64 | if not Repo._ak_list:
65 | raise ValueError("AK_LIST must not be empty.")
66 | for ak in Repo._ak_list:
67 | cls._verify_value_type(ak, "AK", str)
68 | # API_PARAMS rules:
69 | # industry parameter must be a string
70 | cls._verify_value_type(Repo._prim_ind, "prim_ind", str)
71 | cls._verify_value_type(Repo._sec_ind, "sec_ind", str)
72 | # radius parameter must be a positive number
73 | cls._verify_non_negative_num(Repo._radius, "radius")
74 | # radius_limit must be one of 'true' or 'false'
75 | if Repo._radius_limit not in ["true", "false"]:
76 | raise ValueError('"radius_limit" must be "true" or "false".')
77 | # crs must be one of 'gcj02', 'bd09' or 'wgs84'
78 | if Repo._crs not in ["gcj02", "bd09", "wgs84"]:
79 | raise ValueError('"crs" must be "gcj02", "bd09" or "wgs84".')
80 |
81 | @classmethod
82 | def _validate_aoi_filter_settings(cls) -> None:
83 | # area limit must be a positive number
84 | cls._verify_non_negative_num(Repo._min_aoi_area, "min_aoi_area")
85 | cls._verify_non_negative_num(Repo._max_aoi_area, "max_aoi_area")
86 | # minimum similarity must be smaller than 1
87 | cls._verify_value_type(Repo._min_similarity, "min_similarity", float | int)
88 | if Repo._min_similarity >= 1:
89 | raise ValueError('"min_similarity" must not be more than 1.')
90 | # sorting values must be one of 0 or 1 (or -1 for 'sort_by_area')
91 | for sorting_type, value in Repo._sortings.items():
92 | if sorting_type == "sort_by_area":
93 | if value not in [0, 1, -1]:
94 | raise ValueError(f'"{sorting_type}" must be 0 or ±1.')
95 | elif value not in [0, 1]:
96 | raise ValueError(f'"{sorting_type}" must be 0 or 1.')
97 | # at least one kind of sorting must be enabled
98 | if not any(Repo._sortings.values()):
99 | raise ValueError("Sorting values must not be all 0.")
100 |
101 | @staticmethod
102 | def _check_optional_col(name: str, value: str) -> None:
103 | if value == "VAR":
104 | if name not in Repo.file.columns:
105 | raise ValueError(f'Column "{name}" is missing.')
106 |
107 | @classmethod
108 | def _verify_non_negative_num(cls, value: any, name: str) -> None:
109 | cls._verify_value_type(value, name, float | int)
110 | if value < 0:
111 | raise ValueError(f'"{name}" must be a non-negative number.')
112 |
113 | @staticmethod
114 | def _verify_value_type(
115 | value: any, name: str, type: bool | int | str | list | dict
116 | ) -> None:
117 | if not isinstance(value, type):
118 | raise TypeError(f'"{name}" must be a {str(type).replace("|", "or")}.')
119 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = baidu_aoi_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = BaiduAOISpider
12 |
--------------------------------------------------------------------------------
/spatial/coords.py:
--------------------------------------------------------------------------------
1 | # References:
2 | # https://github.com/dickwxyz/CoordinatesConverter
3 |
4 | import math
5 | from math import asin, cos, sin, sqrt
6 | from typing import Tuple
7 |
# Basic Parameters:
# x_pi is the angular factor used by the gcj02 <-> bd09ll conversions
x_pi = 3.14159265358979324 * 3000.0 / 180.0
pi = 3.1415926535897932384626  # π
# Ellipsoid semi-major axis (metres) used by the wgs84 <-> gcj02 offsets.
# NOTE(review): 6378245.0 looks like the Krasovsky 1940 value rather than
# WGS-84's 6378137.0 (the radius `cal_distance` uses) — confirm.
a = 6378245.0
# NOTE(review): used as the squared-eccentricity term `1 - ee * sin(lat)**2`
# in the gcj02 formulas, so this is presumably e², not the flattening — confirm.
ee = 0.00669342162296594323

# Baidu Mercator Projection Parameters:
# MC_BAND holds y thresholds in descending order; `bd09mc_to_bd09ll` takes the
# first threshold that y exceeds and uses the MC2LL row at the same index.
MC_BAND = [12890594.86, 8362377.87, 5591021, 3481989.83, 1678043.12, 0]
# Each MC2LL row: [0], [1] are the intercept/slope applied to x;
# [2]..[8] are polynomial coefficients (degree 0..6) in y / row[9].
MC2LL = [
    [
        1.410526172116255e-8,
        0.00000898305509648872,
        -1.9939833816331,
        200.9824383106796,
        -187.2403703815547,
        91.6087516669843,
        -23.38765649603339,
        2.57121317296198,
        -0.03801003308653,
        17337981.2,
    ],
    [
        -7.435856389565537e-9,
        0.000008983055097726239,
        -0.78625201886289,
        96.32687599759846,
        -1.85204757529826,
        -59.36935905485877,
        47.40033549296737,
        -16.50741931063887,
        2.28786674699375,
        10260144.86,
    ],
    [
        -3.030883460898826e-8,
        0.00000898305509983578,
        0.30071316287616,
        59.74293618442277,
        7.357984074871,
        -25.38371002664745,
        13.45380521110908,
        -3.29883767235584,
        0.32710905363475,
        6856817.37,
    ],
    [
        -1.981981304930552e-8,
        0.000008983055099779535,
        0.03278182852591,
        40.31678527705744,
        0.65659298677277,
        -4.44255534477492,
        0.85341911805263,
        0.12923347998204,
        -0.04625736007561,
        4482777.06,
    ],
    [
        3.09191371068437e-9,
        0.000008983055096812155,
        0.00006995724062,
        23.10934304144901,
        -0.00023663490511,
        -0.6321817810242,
        -0.00663494467273,
        0.03430082397953,
        -0.00466043876332,
        2555164.4,
    ],
    [
        2.890871144776878e-9,
        0.000008983055095805407,
        -3.068298e-8,
        7.47137025468032,
        -0.00000353937994,
        -0.02145144861037,
        -0.00001234426596,
        0.00010322952773,
        -0.00000323890364,
        826088.5,
    ],
]
90 |
91 |
def wgs84_to_gcj02(lng: float, lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `wgs84` to `gcj02`.

    Points for which `outside_of_china` is true are returned unchanged.

    Args:
        lng (float): wgs84 CRS longitude
        lat (float): wgs84 CRS latitude

    Returns:
        tuple(float, float): (gcj02_lng, gcj02_lat)
    """
    if outside_of_china(lng, lat):
        return lng, lat

    raw_lat_shift = transform_lat(lng - 105.0, lat - 35.0)
    raw_lng_shift = transform_lng(lng - 105.0, lat - 35.0)

    lat_rad = lat / 180.0 * pi
    sin_lat = math.sin(lat_rad)
    w = 1 - ee * sin_lat * sin_lat
    sqrt_w = math.sqrt(w)

    # scale the raw shifts into degrees on the ellipsoid
    lat_shift = (raw_lat_shift * 180.0) / ((a * (1 - ee)) / (w * sqrt_w) * pi)
    lng_shift = (raw_lng_shift * 180.0) / (a / sqrt_w * math.cos(lat_rad) * pi)
    return lng + lng_shift, lat + lat_shift
119 |
120 |
def gcj02_to_wgs84(lng: float, lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `gcj02` to `wgs84`.

    The forward (wgs84 -> gcj02) shift is evaluated at the given point and
    subtracted from it. Points for which `outside_of_china` is true are
    returned unchanged.

    Args:
        lng (float): gcj02 CRS longitude
        lat (float): gcj02 CRS latitude

    Returns:
        tuple(float, float): (wgs84_lng, wgs84_lat)
    """
    if outside_of_china(lng, lat):
        return lng, lat

    raw_lat_shift = transform_lat(lng - 105.0, lat - 35.0)
    raw_lng_shift = transform_lng(lng - 105.0, lat - 35.0)

    lat_rad = lat / 180.0 * pi
    sin_lat = math.sin(lat_rad)
    w = 1 - ee * sin_lat * sin_lat
    sqrt_w = math.sqrt(w)

    lat_shift = (raw_lat_shift * 180.0) / ((a * (1 - ee)) / (w * sqrt_w) * pi)
    lng_shift = (raw_lng_shift * 180.0) / (a / sqrt_w * math.cos(lat_rad) * pi)
    shifted_lng = lng + lng_shift
    shifted_lat = lat + lat_shift
    # reflect the shifted point about the input: p - shift == 2p - (p + shift)
    return lng * 2 - shifted_lng, lat * 2 - shifted_lat
148 |
149 |
def transform_lat(lng: float, lat: float) -> float:
    """
    Raw latitude shift term of the wgs84 <-> gcj02 conversion.

    The callers in this module pass coordinates already offset by
    (105.0, 35.0).
    """
    polynomial = (
        -100.0
        + 2.0 * lng
        + 3.0 * lat
        + 0.2 * lat * lat
        + 0.1 * lng * lat
        + 0.2 * math.sqrt(math.fabs(lng))
    )
    lng_wave = (
        (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
    )
    lat_wave_short = (
        (20.0 * math.sin(lat * pi) + 40.0 * math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
    )
    lat_wave_long = (
        (160.0 * math.sin(lat / 12.0 * pi) + 320 * math.sin(lat * pi / 30.0))
        * 2.0
        / 3.0
    )
    return polynomial + lng_wave + lat_wave_short + lat_wave_long
172 |
173 |
def transform_lng(lng: float, lat: float) -> float:
    """
    Raw longitude shift term of the wgs84 <-> gcj02 conversion.

    The callers in this module pass coordinates already offset by
    (105.0, 35.0).
    """
    polynomial = (
        300.0
        + lng
        + 2.0 * lat
        + 0.1 * lng * lng
        + 0.1 * lng * lat
        + 0.1 * math.sqrt(math.fabs(lng))
    )
    wave_fast = (
        (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
    )
    wave_mid = (20.0 * math.sin(lng * pi) + 40.0 * math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
    wave_slow = (
        (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * math.sin(lng / 30.0 * pi))
        * 2.0
        / 3.0
    )
    return polynomial + wave_fast + wave_mid + wave_slow
196 |
197 |
def outside_of_china(lng: float, lat: float) -> bool:
    """
    Determine whether the point lies outside the bounding box
    72.004..137.8347 (longitude) by 0.8293..55.8271 (latitude).

    Args:
        lng (float): longitude in any of the CRS `wgs84`, `gcj02`, `bd09ll`
        lat (float): latitude in any of above CRS

    Returns:
        bool: True for outside of the box, False otherwise
    """
    lng_out = lng < 72.004 or lng > 137.8347
    lat_out = lat < 0.8293 or lat > 55.8271
    return lng_out or lat_out
214 |
215 |
def gcj02_to_bd09ll(lng: float, lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `gcj02` to `bd09ll`.

    Args:
        lng (float): gcj02 CRS longitude
        lat (float): gcj02 CRS latitude

    Returns:
        tuple(float, float): (bd09ll_lng, bd09ll_lat)
    """
    # perturb the point in polar form, then shift by the fixed offsets
    radius = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * x_pi)
    angle = math.atan2(lat, lng) + 0.000003 * math.cos(lng * x_pi)
    return radius * math.cos(angle) + 0.0065, radius * math.sin(angle) + 0.006
232 |
233 |
def bd09ll_to_gcj02(bd_lon: float, bd_lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `bd09ll` to `gcj02`.

    Args:
        bd_lon (float): bd09ll CRS longitude
        bd_lat (float): bd09ll CRS latitude

    Returns:
        tuple(float, float): (gcj02_lng, gcj02_lat)
    """
    # remove the fixed offsets, then undo the polar perturbation
    dx = bd_lon - 0.0065
    dy = bd_lat - 0.006
    radius = math.sqrt(dx * dx + dy * dy) - 0.00002 * math.sin(dy * x_pi)
    angle = math.atan2(dy, dx) - 0.000003 * math.cos(dx * x_pi)
    return radius * math.cos(angle), radius * math.sin(angle)
252 |
253 |
def wgs84_to_bd09ll(lon: float, lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `wgs84` to `bd09ll`, going through `gcj02`.

    Args:
        lon (float): wgs84 CRS longitude
        lat (float): wgs84 CRS latitude

    Returns:
        tuple(float, float): (bd09ll_lng, bd09ll_lat)
    """
    gcj_lng, gcj_lat = wgs84_to_gcj02(lon, lat)
    return gcj02_to_bd09ll(gcj_lng, gcj_lat)
269 |
270 |
def bd09ll_to_wgs84(lon: float, lat: float) -> Tuple[float, float]:
    """
    Re-project the point from `bd09ll` to `wgs84`, going through `gcj02`.

    Args:
        lon (float): bd09ll CRS longitude
        lat (float): bd09ll CRS latitude

    Returns:
        tuple(float, float): (wgs84_lng, wgs84_lat)
    """
    gcj_lng, gcj_lat = bd09ll_to_gcj02(lon, lat)
    return gcj02_to_wgs84(gcj_lng, gcj_lat)
286 |
287 |
def bd09mc_to_bd09ll(x1: float, y1: float) -> Tuple[float, float]:
    """
    Re-project the point from `bd09mc` to `bd09ll`.

    The coefficient row is chosen by the band `|y1|` falls into; results
    for `y1 > 0` are unchanged from the band lookup on `y1` itself.

    Args:
        x1 (float): bd09mc CRS x coordinate
        y1 (float): bd09mc CRS y coordinate

    Returns:
        tuple(float, float): (bd09ll_lng, bd09ll_lat)
    """
    y_abs = abs(y1)
    # Default to the lowest band so that y1 == 0 (which exceeds no
    # threshold) still gets coefficients instead of an unbound variable.
    coeffs = MC2LL[-1]
    for threshold, row in zip(MC_BAND, MC2LL):
        if y_abs > threshold:
            coeffs = row
            break
    lng = coeffs[0] + coeffs[1] * x1
    ratio = y_abs / coeffs[9]
    lat = (
        coeffs[2]
        + coeffs[3] * ratio
        + coeffs[4] * ratio**2
        + coeffs[5] * ratio**3
        + coeffs[6] * ratio**4
        + coeffs[7] * ratio**5
        + coeffs[8] * ratio**6
    )
    # NOTE(review): negative y is handled by mirroring the result, assuming
    # the projection is symmetric about y == 0 — confirm against Baidu's
    # reference conversion.
    if y1 < 0:
        lat = -lat
    return lng, lat
315 |
316 |
def bd09mc_to_wgs84(x1: float, y1: float) -> Tuple[float, float]:
    """
    Re-project the point from `bd09mc` to `bd09ll`,
    then from `bd09ll` to `wgs84`.

    Args:
        x1 (float): bd09mc CRS x coordinate
        y1 (float): bd09mc CRS y coordinate
    Returns:
        tuple(float, float): (wgs84_lng, wgs84_lat)
    """
    bd_lng, bd_lat = bd09mc_to_bd09ll(x1, y1)
    return bd09ll_to_wgs84(bd_lng, bd_lat)
331 |
332 |
def cal_distance(lon1: float, lat1: float, lon2: float, lat2: float) -> float:
    """
    Calculate the `spherical` (haversine) distance between two points,
    on a sphere of radius 6378.137 km.

    Args:
        lon1 (float): longitude of point 1, in degrees
        lat1 (float): latitude of point 1, in degrees
        lon2 (float): longitude of point 2, in degrees
        lat2 (float): latitude of point 2, in degrees

    Returns:
        float: spherical distance, in `kilometers`
    """
    lat1_rad = lat1 / 180.0 * pi
    lat2_rad = lat2 / 180.0 * pi
    d_lat = abs(lat1_rad - lat2_rad)
    d_lon = abs(lon1 / 180.0 * pi - lon2 / 180.0 * pi)
    haversine = sin(d_lat / 2) * sin(d_lat / 2) + cos(lat1_rad) * cos(
        lat2_rad
    ) * sin(d_lon / 2) * sin(d_lon / 2)
    # Rounding can push the term marginally above 1 for (near-)antipodal
    # points, which would make asin() raise a math domain error.
    haversine = min(1.0, haversine)
    return 2 * 6378.137 * asin(sqrt(haversine))
353 |
--------------------------------------------------------------------------------
/spatial/geometry.py:
--------------------------------------------------------------------------------
1 | import pyproj
2 | from shapely import wkt
3 | from shapely.geometry import LineString, Polygon
4 | from shapely.geometry.base import BaseGeometry
5 | from shapely.ops import transform
6 |
7 | from spatial.coords import cal_distance
8 |
9 |
def within_distance(
    lng1: float, lat1: float, lng2: float, lat2: float, distance: int = 1
) -> bool:
    """
    Tell whether `(lng1, lat1)` and `(lng2, lat2)` are at most `distance`
    apart, measured as spherical distance (unit: `kilometers`).
    """
    separation = cal_distance(lng2, lat2, lng1, lat1)
    return separation <= distance
18 |
19 |
def points_to_polygon(points: list) -> Polygon:
    """
    Build a `shapely` polygon whose shell is the line through `points`.
    """
    shell = LineString(points)
    return Polygon(shell)
25 |
26 |
def wkt_to_geometry(wkt_str: str) -> BaseGeometry:
    """
    Parse a `wkt` (well known text) string into a `shapely` geometry.
    """
    geometry = wkt.loads(wkt_str)
    return geometry
32 |
33 |
def wgs84_to_wgs84utm50n(geometry: BaseGeometry) -> BaseGeometry:
    """
    Re-project a geometry from `wgs84` (EPSG:4326)
    to `wgs84_utm50n` (EPSG:32650).
    """
    source_crs = pyproj.CRS("EPSG:4326")
    target_crs = pyproj.CRS("EPSG:32650")
    # always_xy=True: coordinates are passed as (x, y) for both CRSs
    transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
    return transform(transformer.transform, geometry)
42 |
--------------------------------------------------------------------------------