├── LICENSE
├── README.md
├── blocking_functions.py
├── data_loading_helper
├── __init__.py
├── data_loader.py
├── feature_extraction.py
└── magellan_modified_feature_generation.py
├── datasets
├── fodors_zagats
│ ├── fodors.csv
│ ├── matches_fodors_zagats.csv
│ ├── metadata.txt
│ └── zagats.csv
└── fodors_zagats_single
│ ├── fz.csv
│ ├── matches_fodors_zagats.csv
│ └── metadata.txt
├── environment.yml
├── model.py
├── utils.py
└── zeroer.py
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ZeroER
2 | Implementation for the paper [ZeroER: Entity Resolution using Zero Labeled Examples.](https://arxiv.org/abs/1908.06049)
3 |
4 | ## Set up the environment
5 | conda env create -f environment.yml
6 | conda activate ZeroER
7 |
8 | ## How to use
9 | 1. Put your dataset into the folder `datasets`. You should have a file `metadata.txt` in your data folder that specifies the file name of the table (and possibly right table and ground-truth table). For two-table record linkage, you can refer to `datasets/fodors_zagats/metadata.txt`.
10 | For single table deduplication, you can refer to `datasets/fodors_zagats_single/metadata.txt`.
11 | 2. Write a blocking function for your dataset and put it in `blocking_functions.py`.
12 | You can have a look at the blocking functions we wrote in that file to get some ideas of how to write your own blocking function.
13 | We use Magellan to do blocking, so you can also refer to its [documentation](https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching).
14 |
15 | 3. **Two-table record linkage**.
16 | To run the code, for example you are using the fodors_zagats dataset:
17 |
18 | `python zeroer.py fodors_zagats`
19 |
20 | If you want to incorporate the transitivity constraint, use arg `--run_transitivity`:
21 |
22 | `python zeroer.py fodors_zagats --run_transitivity`
23 |
24 | *Note this will generate features for self-join of the two tables (LxL and RxR) when arg `--LR_dup_free` is not present, which can take some time.
25 |
26 | If you know that your left table and right table are duplicate free, you can incorporate this information by using arg `--run_transitivity --LR_dup_free`:
27 |
28 | `python zeroer.py fodors_zagats --run_transitivity --LR_dup_free`
29 |
30 | **Single-table deduplication**.
31 | You must explicitly tell the system that you are doing single-table deduplication with the arg `--LR_identical`:
32 |
33 | `python zeroer.py fodors_zagats_single --LR_identical`
34 |
35 | If you want to incorporate the transitivity constraint, add arg `--run_transitivity`:
36 |
37 | `python zeroer.py fodors_zagats_single --LR_identical --run_transitivity`
38 |
39 | 4. Final result for matches and unmatches is the file `pred.csv` that is saved to your dataset folder.
40 |
41 | ## Citation
42 | If you use our work or found it useful, please cite our paper:
43 | ```
44 | @inproceedings{wu2020zeroer,
45 | author = {Renzhi Wu and Sanya Chaba and Saurabh Sawlani and Xu Chu and Saravanan Thirumuruganathan},
46 | title = {ZeroER: Entity Resolution using Zero Labeled Examples},
47 | booktitle = {Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data},
48 | pages = {1149–1164},
49 | year = {2020}
50 | }
51 | ```
--------------------------------------------------------------------------------
/blocking_functions.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | from pandas import merge
4 | import py_entitymatching as em
5 |
6 |
7 | """ This Python file contains blocking functions
8 | specific to table pairs. For example, block_fodors_zagats
9 | is the blocking function for the tables fodors and zagats.
10 | Functionality: creates the initial set of tuple pairs for
11 | two tables.
12 | """
13 |
14 |
def verify_blocking_ground_truth(A, B, block_df, duplicates_df, objectify=False):
    """Print how much blocking pruned and how many true duplicates it lost.

    Args:
        A: left table; only its length is used.
        B: right table; only its length is used.
        block_df: candidate set with ``_id``, ``ltable_id`` and ``rtable_id``
            columns.
        duplicates_df: two-column frame of ground-truth (left id, right id)
            pairs. It is not modified.
        objectify: cast the ground-truth ids to ``object`` before joining.
            Sometimes pandas / Magellan stores id columns as objects instead
            of numeric/string, and the join needs matching dtypes.

    Returns:
        None. Two summary lines are printed to stdout.
    """
    # Rename on a copy so the caller's frame keeps its own column names.
    duplicates_df = duplicates_df.copy()
    duplicates_df.columns = ["ltable_id", "rtable_id"]
    if objectify:
        duplicates_df = duplicates_df.astype(object)

    # An outer join keeps ground-truth pairs that are absent from block_df;
    # those rows get NaN in "_id". len() counts every row while count() skips
    # NaN, so the difference is the number of duplicates blocking missed.
    merged_df = block_df.merge(duplicates_df, on=["ltable_id", "rtable_id"], how='outer')
    num_duplicates_missed = len(merged_df) - merged_df["_id"].count()
    total_duplicates = len(duplicates_df)

    # With an empty table the cross product is empty and the ratio is
    # undefined; report 0.0 instead of dividing by zero.
    total_pairs = len(A) * len(B)
    ratio_saved = 1.0 - float(len(block_df)) / float(total_pairs) if total_pairs else 0.0

    print("Ratio saved=", ratio_saved)
    print("Totally missed:", num_duplicates_missed, " out of ", total_duplicates)
32 |
33 |
def blocking_for_citeseer_dblp(A, B):
    """Block two citation tables with an overlap blocker on ``title``.

    Uses word-level overlap with ``overlap_size=2`` and carries the listed
    bibliographic columns of both tables into the candidate set.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    output_columns = ['id', 'title', 'authors', 'journal', 'month', 'year', 'publication_type']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=2, show_progress=True,
        l_output_attrs=output_columns, r_output_attrs=output_columns,
    )
44 |
# fodors.csv and zagats.csv
def block_fodors_zagats(A, B):
    """Block the fodors/zagats tables with an overlap blocker on ``name``.

    Uses ``overlap_size=1`` and keeps the name, address, city and phone
    columns of both tables in the candidate set.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = ['name', 'addr', 'city', 'phone']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'name', 'name',
        l_output_attrs=kept_columns, r_output_attrs=kept_columns,
        overlap_size=1, show_progress=False,
    )
51 |
52 |
# babies_r_us.csv and buy_buy_baby.csv
def block_baby_products(A, B):
    """Block the baby-product tables with an overlap blocker on ``title``.

    Uses word-level overlap with ``overlap_size=4``; a reduced set of product
    columns is carried into the candidate set.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    product_columns = ['title', 'price', 'is_discounted', 'category', 'company_struct']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=4, show_progress=True,
        l_output_attrs=product_columns, r_output_attrs=product_columns,
    )
62 |
63 |
# barnes_and_noble.csv and half.csv
def block_books(A, B):
    """Block the book tables with an overlap blocker on ``Title``.

    Assumes some preprocessing has been done, specifically that ``NewPrice``
    in half.csv was renamed to ``Price``. ``Price`` itself is not among the
    output columns.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables`` with
        word-level overlap and ``overlap_size=4``.
    """
    book_columns = ['Title', 'Author', 'ISBN13', 'Publisher', 'Publication_Date', 'Pages', 'Dimensions']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'Title', 'Title',
        word_level=True, overlap_size=4, show_progress=True,
        l_output_attrs=book_columns, r_output_attrs=book_columns,
    )
77 |
78 |
# yellow_pages.csv and yelp.csv
def block_restaurants(A, B):
    """Block the restaurant tables with an overlap blocker on ``name``.

    Uses word-level overlap with ``overlap_size=4`` and keeps the address and
    contact columns of both tables in the candidate set.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    contact_columns = ['name', 'address', 'city', 'state', 'zipcode', 'phone']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'name', 'name',
        word_level=True, overlap_size=4, show_progress=True,
        l_output_attrs=contact_columns, r_output_attrs=contact_columns,
    )
91 |
92 |
# dblp.csv and ACM.csv
def block_dblp_acm(A, B):
    """Block dblp/ACM in two stages: equal ``year``, then ``title`` overlap.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set left after the overlap blocker has filtered the
        same-year candidates.
    """
    paper_columns = ["title", "authors", "venue", "year"]

    # Stage 1: keep only pairs whose "year" values are equal.
    year_blocker = em.AttrEquivalenceBlocker()
    same_year_pairs = year_blocker.block_tables(
        A, B,
        l_block_attr='year', r_block_attr='year',
        l_output_attrs=paper_columns, r_output_attrs=paper_columns,
        allow_missing=False,
    )

    # Stage 2: word-level title overlap on the surviving pairs.
    # Author-reported result: a candidate set of size 46K with 5 missing
    # duplicates out of 2224.
    title_blocker = em.OverlapBlocker()
    return title_blocker.block_candset(
        same_year_pairs, 'title', 'title',
        word_level=True, overlap_size=2, show_progress=True,
    )
102 |
103 |
# dblp.csv and google_scholar.csv
def block_dblp_scholar(A, B):
    """Block dblp/Google Scholar with an overlap blocker on ``title``.

    Author-reported trade-off: overlap_size=3 gave a candidate set of size
    1.2M with 178 missing duplicates out of 5347; the overlap_size=4 used
    here gave 135K candidates with 467 missing duplicates out of 5347.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    paper_columns = ["id", "title", "authors", "venue", "year"]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=4, show_progress=True,
        l_output_attrs=paper_columns, r_output_attrs=paper_columns,
    )
113 |
def block_rotten_imdb(A, B):
    """Block the rotten/imdb tables with an overlap blocker on ``Name``.

    The output attributes are the columns present in both tables, excluding
    ``id``. They are listed in A's column order so the layout of the
    candidate set is the same on every run; iterating a set of strings gives
    no such guarantee.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables`` with
        word-level overlap and ``overlap_size=2``.
    """
    right_columns = set(B.columns)
    attributes = [col for col in A.columns if col != "id" and col in right_columns]

    ob = em.OverlapBlocker()
    return ob.block_tables(
        A, B, 'Name', 'Name',
        word_level=True, overlap_size=2, show_progress=True,
        l_output_attrs=attributes, r_output_attrs=attributes,
    )
124 |
125 |
# abt.csv and buy.csv
def block_abt_buy(A, B):
    """Block the abt/buy tables with an overlap blocker on ``name``.

    Before blocking, B's ``manufacturer`` is appended to its ``description``
    (separated by a space). This happens in place on the caller's B.

    Args:
        A: left table.
        B: right table; its ``description`` column is modified in place when
            both ``description`` and ``manufacturer`` can be concatenated.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    # NOTE(review): the concatenation is in place, so calling this function
    # again with the same B appends the manufacturer a second time.
    try:
        B["description"] = B["description"] + " " + B["manufacturer"]
    except (KeyError, TypeError):
        # Best effort: a missing column or non-string data skips the
        # enrichment. Only these errors are swallowed, so Ctrl-C and
        # unrelated failures still propagate.
        print()

    ob = em.OverlapBlocker()
    # Author-reported result: a candidate set of size 164K with 6 missing
    # duplicates out of 1097.
    output_columns = ["name", "description", "price"]
    return ob.block_tables(
        A, B, "name", "name",
        word_level=True, overlap_size=1,
        l_output_attrs=output_columns, r_output_attrs=output_columns,
        show_progress=True, allow_missing=False,
    )
137 |
138 |
# walmart.csv and amazon.csv
def block_walmart_amazon_(A, B):
    """Block walmart/amazon on ``title`` using the full renamed column set.

    Assumes some preprocessing has been done on amazon.csv:
    a. pcategory2 => groupname,
    b. {proddescrshort, proddescrlong} => shortdescr, longdescr.

    Author-reported trade-off: overlap_size=2 gave a candidate set of size
    1.1M with 20 missing duplicates out of 1154; the overlap_size=3 used here
    gave 278K candidates with 84 missing duplicates out of 1154.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    product_columns = [
        'brand', 'groupname', 'title', 'price', 'shortdescr',
        'longdescr', 'imageurl', 'modelno', 'shipweight', 'dimensions',
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=3,
        l_output_attrs=product_columns, r_output_attrs=product_columns,
        show_progress=True, allow_missing=True,
    )
156 |
# walmart.csv and amazon.csv
def block_walmart_amazon(A, B):
    """Block walmart/amazon with an overlap blocker on ``title``.

    The two tables name the short-description column differently
    (``shortdescr`` on the left, ``proddescrshort`` on the right). When one
    side lacks its expected columns, e.g. because A and B are the same
    dataset, that side reuses the other side's column list.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables`` with
        word-level overlap and ``overlap_size=2``.
    """
    blocker = em.OverlapBlocker()

    right_columns = ["title", "proddescrshort", "brand", "price", "dimensions", "shipweight"]
    left_columns = ["title", "shortdescr", "brand", "price", "dimensions", "shipweight"]

    # Fix in case A and B are the same dataset: fall back to the other
    # side's column list. The right side is resolved first, so the left-side
    # fallback sees the already adjusted right list.
    right_is_complete = set(right_columns).issubset(B.columns)
    if not right_is_complete:
        right_columns = left_columns
    left_is_complete = set(left_columns).issubset(A.columns)
    if not left_is_complete:
        left_columns = right_columns

    return blocker.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=2,
        l_output_attrs=left_columns, r_output_attrs=right_columns,
        show_progress=True, allow_missing=True,
    )
180 |
def block_wa(A, B):
    """Block the ``wa`` dataset with an overlap blocker on ``title``.

    Both tables use the same output columns. The original carried two
    fallback branches that swapped the left and right column lists, but the
    lists were identical, so the branches could never change anything and
    have been removed.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables`` with
        word-level overlap and ``overlap_size=2``.
    """
    ob = em.OverlapBlocker()
    attributes = ["title", "category", "brand", "modelno", "price"]
    return ob.block_tables(
        A, B, 'title', 'title',
        word_level=True, overlap_size=2,
        l_output_attrs=attributes, r_output_attrs=attributes,
        show_progress=True, allow_missing=True,
    )
203 |
# amazon.csv and GoogleProducts.csv
def block_amazon_googleproducts(A, B):
    """Block amazon/GoogleProducts with an overlap blocker on ``title``.

    Author-reported result: a candidate set of size 400K with 6 missing
    duplicates out of 1300.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables`` with
        word-level overlap and ``overlap_size=1``.
    """
    product_columns = ["title", "description", "manufacturer", "price"]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, "title", "title",
        word_level=True, overlap_size=1,
        l_output_attrs=product_columns, r_output_attrs=product_columns,
        show_progress=True, allow_missing=False,
    )
210 |
def block_songs(A, B):
    """Block the song tables with an overlap blocker on ``title``.

    Uses word-level overlap with ``overlap_size=1`` and passes ``n_jobs=8``
    to the blocker.

    Args:
        A: left table.
        B: right table.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    song_columns = [
        "title", "release", "artist_name", "duration",
        "artist_familiarity", "artist_hotttnesss", "year",
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A, B, "title", "title",
        word_level=True, overlap_size=1,
        l_output_attrs=song_columns, r_output_attrs=song_columns,
        show_progress=True, allow_missing=False, n_jobs=8,
    )
219 |
def generic_blocking_func(A, B):
    """Build the full cross product of A and B as a candidate set.

    Columns of A are prefixed with ``ltable_`` and columns of B with
    ``rtable_``. The cross product comes from joining both frames on a
    constant helper column, which is dropped afterwards. The row index of
    the result becomes the ``_id`` column.

    Args:
        A: left table (DataFrame).
        B: right table (DataFrame).

    Returns:
        A DataFrame with ``_id`` followed by the prefixed columns of A and B.
        The list of resulting column names is printed as a side effect.
    """
    left = A.add_prefix('ltable_').assign(key=1)
    right = B.add_prefix('rtable_').assign(key=1)
    pairs = (
        merge(left, right, on='key', suffixes=('', ''))
        .drop(columns=['key'])
        .reset_index()
        .rename(columns={'index': '_id'})
    )
    print(list(pairs))
    return pairs
231 |
232 |
# Registry: dataset name -> blocking function used for that dataset.
# NOTE(review): because this is a defaultdict(str), looking up an unregistered
# dataset name yields '' rather than raising KeyError - confirm callers
# handle that.
blocking_functions_mapping = defaultdict(str)
blocking_functions_mapping.update({
    "fodors_zagats": block_fodors_zagats,
    "fodors_zagats_single": block_fodors_zagats,
    "abt_buy": block_abt_buy,
    "dblp_acm": block_dblp_acm,
    "dblp_scholar": block_dblp_scholar,
    "amazon_googleproducts": block_amazon_googleproducts,
    "walmart_amazon": block_walmart_amazon,
    "songs": block_songs,
    "citations": blocking_for_citeseer_dblp,

    "dblp_citeseer": generic_blocking_func,
    "imdb_omdb": generic_blocking_func,
    "rotten_imdb": block_rotten_imdb,

    "cora": generic_blocking_func,
    "synthetic": generic_blocking_func,
    "books": block_books,
    "baby_products": block_baby_products,
    "restaurants": block_restaurants,
    'wa': block_wa,
})
--------------------------------------------------------------------------------
/data_loading_helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chu-data-lab/zeroer/4c1ea6acd2c214b32b2e52cca9b8d50afc180220/data_loading_helper/__init__.py
--------------------------------------------------------------------------------
/data_loading_helper/data_loader.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas import merge
3 | import py_entitymatching as em
4 |
def load_data(left_file_name, right_file_name, label_file_name, blocking_fn, include_self_join=False):
    """Load both tables and the optional labels, then run blocking.

    Args:
        left_file_name: path of the left table CSV (read with key ``id`` and
            ``iso-8859-1`` encoding).
        right_file_name: path of the right table CSV (same settings).
        label_file_name: path of the ground-truth CSV; it may be missing or
            unreadable.
        blocking_fn: callable ``(left, right) -> candidate set``.
        include_self_join: also block each table against itself.

    Returns:
        ``(A, B, G, C)``, or ``(A, B, G, C, C_A, C_B)`` when
        ``include_self_join`` is true. ``G`` is ``None`` when the label file
        could not be read.
    """
    A = em.read_csv_metadata(left_file_name, key="id", encoding='iso-8859-1')
    B = em.read_csv_metadata(right_file_name, key="id", encoding='iso-8859-1')

    # Labels are optional. Only I/O and parse failures are treated as "no
    # labels", so Ctrl-C and programming errors are no longer swallowed.
    # FileNotFoundError is an OSError; pandas' parser, empty-data and decode
    # errors derive from ValueError.
    try:
        G = pd.read_csv(label_file_name)
    except (OSError, ValueError):
        G = None

    C = blocking_fn(A, B)
    if include_self_join:
        C_A = blocking_fn(A, A)
        C_B = blocking_fn(B, B)
        return A, B, G, C, C_A, C_B
    return A, B, G, C
19 |
--------------------------------------------------------------------------------
/data_loading_helper/feature_extraction.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import py_entitymatching as em
4 | from .magellan_modified_feature_generation import get_features
5 |
6 |
# Given a CANDIDATE SET and the list of ACTUAL duplicates (duplicates_df),
# this function adds the 1/0 labels (column name = gold) to the candset dataframe
def add_labels_to_candset(duplicates_df, candset_df, ltable_df, rtable_df):
    """Return the candidate set with a 1/0 ``gold`` label column added.

    Args:
        duplicates_df: two-column frame of ground-truth (left id, right id)
            pairs. It is not modified.
        candset_df: candidate set with ``_id``, ``ltable_id`` and
            ``rtable_id`` columns.
        ltable_df: left base table, registered as the result's ``ltable``.
        rtable_df: right base table, registered as the result's ``rtable``.

    Returns:
        A new DataFrame: ``candset_df`` plus ``gold`` (1 when the pair is in
        the ground truth, else 0), with Magellan key and foreign-key
        metadata set.
    """
    # Rename on a copy so the caller's frame keeps its column names, and drop
    # repeated pairs: the left merge would otherwise emit one row per
    # repetition and duplicate values of the '_id' key.
    gold_pairs = duplicates_df.copy()
    gold_pairs.columns = ["ltable_id", "rtable_id"]
    gold_pairs = gold_pairs.drop_duplicates()

    # The indicator column records where each row came from; on a left merge
    # 'both' means the candidate pair is also a ground-truth pair.
    df_with_gold = pd.merge(candset_df, gold_pairs, on=['ltable_id', 'rtable_id'], how='left', indicator='gold')
    df_with_gold['gold'] = np.where(df_with_gold['gold'] == 'both', 1, 0)

    # The merge result is a fresh frame, so the Magellan metadata has to be
    # registered again.
    em.set_key(df_with_gold, '_id')
    em.set_property(df_with_gold, 'ltable', ltable_df)
    em.set_property(df_with_gold, 'rtable', rtable_df)
    em.set_property(df_with_gold, 'fk_ltable', "ltable_id")
    em.set_property(df_with_gold, 'fk_rtable', "rtable_id")

    return df_with_gold
27 |
def get_features_for_type(column_type):
    """Get the features to be generated for an attribute type.

    Args:
        column_type: one of ``'str_eq_1w'``, ``'str_bt_1w_5w'``,
            ``'str_bt_5w_10w'``, ``'str_gt_10w'``, ``'numeric'``,
            ``'boolean'`` or ``'un_determined'``.

    Returns:
        A new list of feature specs. Each spec is either a similarity
        function name (a plain string) or a
        ``(sim_function, left_tokenizer, right_tokenizer)`` tuple.

    Raises:
        TypeError: if ``column_type`` is not one of the known types.
    """
    # The table is rebuilt on every call, so callers get lists they may
    # mutate freely.
    lookup_table = {
        'str_eq_1w': ['lev_dist', 'lev_sim', 'jaro',
                      'jaro_winkler',
                      'exact_match',
                      ('jaccard', 'qgm_3', 'qgm_3')],
        # dlm_dc0 is the concrete space tokenizer
        'str_bt_1w_5w': [('jaccard', 'qgm_3', 'qgm_3'),
                         ('cosine', 'dlm_dc0', 'dlm_dc0'),
                         ('jaccard', 'dlm_dc0', 'dlm_dc0'),
                         'monge_elkan', 'lev_dist', 'lev_sim',
                         'needleman_wunsch',
                         'smith_waterman'],
        'str_bt_5w_10w': [('jaccard', 'qgm_3', 'qgm_3'),
                          ('cosine', 'dlm_dc0', 'dlm_dc0'),
                          'monge_elkan', 'lev_dist', 'lev_sim'],
        'str_gt_10w': [('jaccard', 'qgm_3', 'qgm_3'),
                       ('cosine', 'dlm_dc0', 'dlm_dc0')],
        'numeric': ['exact_match', 'abs_norm', 'lev_dist', 'lev_sim'],
        'boolean': ['exact_match'],
        'un_determined': [],
    }

    # Look up by value. The original compared with `is`, which tests object
    # identity and fails for equal strings that are not the same object
    # (e.g. type names built at runtime).
    try:
        return lookup_table[column_type]
    except KeyError:
        raise TypeError('Unknown type') from None
86 |
87 |
def extract_features(ltable_df, rtable_df, candset_df):
    """Generate similarity feature vectors for every candidate pair.

    Args:
        ltable_df: Left input table.
        rtable_df: Right input table.
        candset_df: Candidate set carrying a ``gold`` column, which is
            appended after the generated features.

    Returns:
        The DataFrame produced by ``em.extract_feature_vecs`` with missing
        values replaced by 0.
    """
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    # When the two sides of a correspondence were inferred as different
    # types, both sides are set to the higher-ranked type so that
    # get_features does not skip the pair.
    attribute_type_rank = {'boolean': 1, 'numeric': 2, 'str_eq_1w': 3,
                           'str_bt_1w_5w': 4, 'str_bt_5w_10w': 5,
                           'str_gt_10w': 6, 'un_determined': 7}
    for corres in correspondences['corres']:
        left_attr, right_attr = corres[0], corres[1]
        left_type = left_attr_types[left_attr]
        right_type = right_attr_types[right_attr]
        if left_type == right_type:
            continue
        if attribute_type_rank[left_type] < attribute_type_rank[right_type]:
            left_attr_types[left_attr] = right_type
        else:
            right_attr_types[right_attr] = left_type

    feature_records = get_features(ltable_df, rtable_df, left_attr_types,
                                   right_attr_types, correspondences,
                                   tokenizers, sim_functions)
    # Remove all features based on id - they are often useless
    feature_records = feature_records[
        feature_records.left_attribute != 'id'].reset_index(drop=True)

    # Drop distance and non-normalized functions.
    # NOTE(review): this is a substring match on the whole feature name, so
    # an attribute whose name contains one of these markers loses all of its
    # features - confirm that this is acceptable.
    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    excluded_markers = distance_functions + non_normalized_functions
    keep_features = [
        not any(marker in feature_name for marker in excluded_markers)
        for feature_name in feature_records["feature_name"]
    ]
    feature_records = feature_records.loc[keep_features, :]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(
        candset_df, feature_table=feature_records, attrs_after='gold',
        show_progress=True, n_jobs=-1)
    candset_features_df.fillna(value=0, inplace=True)

    return candset_features_df
124 |
125 |
126 |
def extract_features_auto(ltable_df, rtable_df, candset_df):
    """Generate feature vectors using Magellan's automatic feature table.

    Args:
        ltable_df: Left input table.
        rtable_df: Right input table.
        candset_df: Candidate set carrying a ``gold`` column, which is
            appended after the generated features.

    Returns:
        The DataFrame produced by ``em.extract_feature_vecs`` with missing
        values replaced by 0.
    """
    all_features = em.get_features_for_matching(
        ltable_df, rtable_df, validate_inferred_attr_types=False)

    # Features computed on the id attribute are often useless, so they are
    # left out of the feature table.
    built_on_id = all_features.left_attribute == 'id'
    usable_features = all_features[~built_on_id]

    print("\n\nExtracting the full set of features:")
    feature_vectors = em.extract_feature_vecs(
        candset_df,
        feature_table=usable_features,
        attrs_after='gold',
        show_progress=True,
    )
    feature_vectors.fillna(value=0, inplace=True)
    return feature_vectors
137 |
138 |
139 | #High level function which just adds labels and the complete set of features to candset
def gather_features_and_labels(ltable_df, rtable_df, labels_df, candset_df):
    """Label the candidate set and compute its full feature vectors.

    All id columns are cast to ``str`` in place on the frames passed in, and
    the columns of ``labels_df`` are renamed in place to ``ltable_id`` /
    ``rtable_id``.

    Args:
        ltable_df: Left input table with an ``id`` column.
        rtable_df: Right input table with an ``id`` column.
        labels_df: Two-column DataFrame of known duplicate pairs.
        candset_df: Candidate pairs with ``ltable_id`` / ``rtable_id``.

    Returns:
        The feature DataFrame returned by ``extract_features`` for the
        labelled candidate set.
    """
    pair_columns = ("ltable_id", "rtable_id")
    labels_df.columns = list(pair_columns)

    # Cast every id column to str so the labels, the candidate set and the
    # two tables all carry ids of one type.
    id_columns_per_frame = (
        (labels_df, pair_columns),
        (candset_df, pair_columns),
        (ltable_df, ("id",)),
        (rtable_df, ("id",)),
    )
    for frame, id_columns in id_columns_per_frame:
        for column in id_columns:
            frame[column] = frame[column].astype(str)

    labelled_candset = add_labels_to_candset(
        labels_df, candset_df, ltable_df, rtable_df)
    return extract_features(ltable_df, rtable_df, labelled_candset)
152 |
153 |
154 | #Filter out bad features (non similarity, non distance, singular valued)
def gather_similarity_features(candset_features_df, avged = False):
    """Keep only the informative similarity features of a feature frame.

    Columns are dropped when their name contains a distance or
    non-normalized function marker, when they are one of the bookkeeping
    columns (``gold``, ``_id``, ``ltable_id``, ``rtable_id``), or when they
    hold exactly one distinct value.

    Args:
        candset_features_df: Feature frame containing the bookkeeping columns
            listed above.
        avged: When true, return one column per attribute holding the mean of
            that attribute's remaining features. The attribute of a feature
            is the text before the first underscore of its column name.

    Returns:
        The filtered feature DataFrame, or the per-attribute averaged
        DataFrame (columns in first-seen order) when ``avged`` is true.
    """
    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    excluded_markers = distance_functions + non_normalized_functions

    non_similarity_cols = [
        col for col in candset_features_df.columns
        if any(marker in col for marker in excluded_markers)
    ]
    similarity_features_df = candset_features_df.drop(
        non_similarity_cols, axis=1)
    similarity_features_df = similarity_features_df.drop(
        ['gold', '_id', 'ltable_id', 'rtable_id'], axis=1)

    # Dropping columns that have only one value
    distinct_counts = similarity_features_df.nunique()
    single_valued_cols = [
        col for col in similarity_features_df.columns
        if distinct_counts[col] == 1
    ]
    similarity_features_df = similarity_features_df.drop(
        single_valued_cols, axis=1)

    if not avged:
        return similarity_features_df

    # Group feature columns by the same prefix used to name the attribute.
    # A substring test would let a short attribute name (e.g. "a") absorb the
    # features of every other attribute containing that text. A dict keeps
    # first-seen order, so the output column order is reproducible.
    columns_by_attribute = {}
    for header in similarity_features_df.columns.values:
        attribute = header.split("_")[0]
        columns_by_attribute.setdefault(attribute, []).append(header)

    n_rows = candset_features_df.shape[0]
    avged_df = pd.DataFrame()
    for attribute, feature_columns in columns_by_attribute.items():
        total = np.zeros(n_rows)
        for header in feature_columns:
            total = np.add(total, candset_features_df[header].values)
        avged_df[attribute] = total / len(feature_columns)

    return avged_df
--------------------------------------------------------------------------------
/data_loading_helper/magellan_modified_feature_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains functions for auto feature generation.
3 | """
4 | import logging
5 |
6 | import pandas as pd
7 | import six
8 | from py_entitymatching.utils.validation_helper import validate_object_type
9 |
10 | from IPython.display import display
11 |
12 | import py_entitymatching as em
13 | import py_entitymatching.feature.attributeutils as au
14 | import py_entitymatching.feature.simfunctions as sim
15 | import py_entitymatching.feature.tokenizers as tok
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
def get_features(ltable, rtable, l_attr_types, r_attr_types,
                 attr_corres, tok_funcs, sim_funcs):
    """
    This function will automatically generate a set of features based on the
    attributes of the input tables.

    Specifically, this function will go through the attribute
    correspondences between the input tables. For each correspondence,
    it examines the types of the involved attributes, then applies the
    appropriate tokenizers and sim functions to generate all appropriate
    features for this correspondence. Correspondences whose two attribute
    types differ are skipped (an info message is logged).

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features must be generated.
        l_attr_types,r_attr_types (dictionary): The attribute types for the
            input DataFrames. Typically this is generated using the
            function 'get_attr_types'.
        attr_corres (dictionary): The attribute correspondences between the
            input DataFrames.
        tok_funcs (dictionary): A Python dictionary containing tokenizer
            functions.
        sim_funcs (dictionary): A Python dictionary containing similarity
            functions.

    Returns:
        A pandas DataFrame containing automatically generated features.
        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', 'is_auto_generated'.
        If no feature could be generated, the DataFrame is empty but still
        has these columns.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_attr_types` is not of type
            python dictionary.
        AssertionError: If `r_attr_types` is not of type
            python dictionary.
        AssertionError: If `attr_corres` is not of type
            python dictionary.
        AssertionError: If `sim_funcs` is not of type
            python dictionary.
        AssertionError: If `tok_funcs` is not of type
            python dictionary.
        AssertionError: If the `ltable` and `rtable` order is not the same
            as mentioned in the `l_attr_types`/`r_attr_types` and
            `attr_corres`.

    Examples:

        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_t = em.get_tokenizers_for_matching()
        >>> match_s = em.get_sim_funs_for_matching()
        >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step
        >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step
        >>> match_c = em.get_attr_corres(A, B)
        >>> match_f = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s)

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_blocking`,
     :meth:`py_entitymatching.get_tokenizers_for_blocking`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`,
     :meth:`py_entitymatching.get_tokenizers_for_matching`


    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1)function, and (2)
        is_auto_generated. The function, points to the actual python function
        that implements feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input ltable')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input rtable')

    # # We expect the l_attr_types to be of type python dictionary
    validate_object_type(l_attr_types, dict, 'Input l_attr_types')

    # # We expect the r_attr_types to be of type python dictionary
    validate_object_type(r_attr_types, dict, 'Input r_attr_types')

    # # We expect the attr_corres to be of type python dictionary
    validate_object_type(attr_corres, dict, 'Input attr_corres')

    # # We expect the tok_funcs to be of type python dictionary
    validate_object_type(tok_funcs, dict, 'Input tok_funcs')

    # # We expect the sim_funcs to be of type python dictionary
    validate_object_type(sim_funcs, dict, 'Input sim_funcs')

    # We expect the table order to be same in l/r_attr_types and attr_corres
    if not _check_table_order(ltable, rtable,
                              l_attr_types, r_attr_types, attr_corres):
        order_error = ('Table order is different than what is mentioned '
                       'in l/r attr_types and attr_corres')
        logger.error(order_error)
        raise AssertionError(order_error)

    # Initialize output feature dictionary list
    feature_dict_list = []

    # Generate features for each attr. correspondence
    for ac in attr_corres['corres']:
        l_attr_type = l_attr_types[ac[0]]
        r_attr_type = r_attr_types[ac[1]]

        # Generate a feature only if the attribute types are same
        if l_attr_type != r_attr_type:
            # Arguments are passed separately so the message is only
            # formatted when the INFO level is enabled.
            logger.info('py_entitymatching types: %s type (%s) and %s type (%s) '
                        'are different.'
                        'If you want to set them to be same and '
                        'generate features, '
                        'update output from get_attr_types and '
                        'use get_features command.\n.',
                        ac[0], l_attr_type, ac[1], r_attr_type)
            continue

        # Generate features
        features = _get_features_for_type(l_attr_type)

        # Convert features to function objects
        fn_objs = _conv_func_objs(features, ac, tok_funcs, sim_funcs)
        # Add the function object to a feature list.
        feature_dict_list.append(fn_objs)

    # Create a feature table restricted to the necessary columns. Passing
    # `columns` to the constructor (instead of projecting afterwards) keeps
    # this working when no feature was generated at all: projecting an empty,
    # column-less frame would raise a KeyError.
    feature_table = pd.DataFrame(
        flatten_list(feature_dict_list),
        columns=['feature_name', 'left_attribute',
                 'right_attribute', 'left_attr_tokenizer',
                 'right_attr_tokenizer',
                 'simfunction', 'function',
                 'function_source', 'is_auto_generated'])
    # Return the feature table.
    return feature_table
173 |
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres):
    """
    Tell whether ltable/rtable are the very objects recorded in
    l_attr_types, r_attr_types and attr_corres.

    The comparison is by object identity, not by content: a table passes
    only if the dictionaries reference the exact same DataFrame object.
    Returns True when all four references match, otherwise logs an error and
    returns False at the first mismatch.
    """
    # Both tables must be pandas DataFrames.
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')

    # (expected table, dictionary holding the reference, key, error message),
    # checked in this order. The lookup happens inside the loop so a
    # dictionary is only read once the earlier checks have passed.
    identity_checks = (
        (ltable, l_attr_types, '_table',
         'ltable is not the same as table mentioned in left attr types'),
        (rtable, r_attr_types, '_table',
         'rtable is not the same as table mentioned in right attr types'),
        (ltable, attr_corres, 'ltable',
         'ltable is not the same as table mentioned in attr correspondence'),
        (rtable, attr_corres, 'rtable',
         'rtable is not the same as table mentioned in attr correspondence'),
    )
    for expected_table, holder, key, message in identity_checks:
        if holder[key] is not expected_table:
            logger.error(message)
            return False

    return True
221 |
222 |
223 | # get look up table to generate features
def _get_feat_lkp_tbl():
    """
    Build the table that maps an attribute-type key to the features that
    should be generated for attributes of that type.

    Each feature is either a similarity-function name (str) or a
    (sim_function, left_tokenizer, right_tokenizer) tuple. A new table with
    new lists is built on every call.
    """
    def tokenized(sim_funcs, tokenizers):
        # One feature per (sim function, tokenizer), same tokenizer on both
        # sides, grouped by sim function.
        return [(sim, tok, tok) for sim in sim_funcs for tok in tokenizers]

    leading_token_sims = ('cosine', 'dice')
    trailing_token_sims = ('monge_elkan', 'overlap_coeff', 'jaccard')
    # hamming_dist / hamming_sim are currently disabled.
    whole_string_sims = ['lev_dist', 'lev_sim', 'jaro', 'jaro_winkler',
                         'exact_match', 'smith_waterman', 'needleman_wunsch']

    def short_string_features(tokenizers):
        return (['affine']
                + tokenized(leading_token_sims, tokenizers)
                + whole_string_sims
                + tokenized(trailing_token_sims, tokenizers))

    def long_string_features():
        # dlm_dc0 is the concrete space tokenizer
        return tokenized(leading_token_sims + trailing_token_sims,
                         ('dlm_dc0', 'qgm_3'))

    return {
        'STR_EQ_1W': short_string_features(('qgm_2', 'qgm_3')),
        'STR_BT_1W_5W': short_string_features(('dlm_dc0', 'qgm_3')),
        'STR_BT_5W_10W': long_string_features(),
        'STR_GT_10W': long_string_features(),
        'NUM': ['exact_match', 'abs_norm', 'lev_dist', 'lev_sim', 'rel_diff'],
        'BOOL': ['exact_match'],
        'UN_DETERMINED': [],
    }
309 |
310 |
def _get_features_for_type(column_type):
    """
    Get features to be generated for a type

    Args:
        column_type: One of 'str_eq_1w', 'str_bt_1w_5w', 'str_bt_5w_10w',
            'str_gt_10w', 'numeric', 'boolean' or 'un_determined'.

    Returns:
        The list of feature specifications that the lookup table holds for
        that type.

    Raises:
        TypeError: If `column_type` is not one of the known types.
    """
    # First get the look up table
    lookup_table = _get_feat_lkp_tbl()

    # Attribute type name -> key of the lookup table.
    table_key_by_type = {
        'str_eq_1w': 'STR_EQ_1W',
        'str_bt_1w_5w': 'STR_BT_1W_5W',
        'str_bt_5w_10w': 'STR_BT_5W_10W',
        'str_gt_10w': 'STR_GT_10W',
        'numeric': 'NUM',
        'boolean': 'BOOL',
        'un_determined': 'UN_DETERMINED',
    }

    # Match the type by equality. Comparing with `is` only succeeds when the
    # argument happens to be the same interned object as the literal.
    try:
        table_key = table_key_by_type[column_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are unknown types too.
        raise TypeError('Unknown type')
    return lookup_table[table_key]
337 |
338 | # convert features from look up table to function objects
def _conv_func_objs(features, attributes,
                    tokenizer_functions, similarity_functions):
    """
    Convert features from look up table to function objects

    Args:
        features: Feature specifications, each a similarity-function name or
            a (sim_function, left_tokenizer, right_tokenizer) tuple.
        attributes: The (left attribute, right attribute) correspondence the
            features are generated for.
        tokenizer_functions (dictionary): Tokenizer name -> function.
        similarity_functions (dictionary): Similarity name -> function.

    Returns:
        The list of feature dictionaries built by `conv_fn_str_to_obj`.
    """
    # We need to check whether the features have allowed tokenizers and
    # similarity functions.

    # # First get the tokenizer and similarity functions list.
    tokenizer_list = tokenizer_functions.keys()
    similarity_functions_list = similarity_functions.keys()

    # # Second get the features that uses only valid tokenizers and
    # similarity functions. check_valid_tok_sim takes the similarity
    # functions before the tokenizers.
    valid_list = [check_valid_tok_sim(feature, similarity_functions_list,
                                      tokenizer_list)
                  for feature in features]

    # Get function as a string and other meta data; finally we will get a
    # list of tuples
    function_tuples = [get_fn_str(valid_feature, attributes)
                       for valid_feature in valid_list]

    # Convert the function string into a function object
    function_objects = conv_fn_str_to_obj(function_tuples, tokenizer_functions,
                                          similarity_functions)

    return function_objects
366 |
367 |
368 | # check whether tokenizers and simfunctions are allowed
369 | # inp is of the form ('jaccard', 'qgm_3', 'qgm_3') or ('lev')
def check_valid_tok_sim(inp, simlist, toklist):
    """
    Return the feature configuration if it names at least one known
    similarity function or tokenizer, otherwise None.

    inp is either a bare similarity-function name or a sequence of length
    1 or 3 such as ('jaccard', 'qgm_3', 'qgm_3'). A bare name is returned
    wrapped in a one-element list.
    """
    if isinstance(inp, six.string_types):
        inp = [inp]
    assert len(inp) == 1 or len(
        inp) == 3, 'len of feature config should be 1 or 3'

    config_names = set(inp)
    # Accept when any entry is a known similarity function ...
    if not config_names.isdisjoint(simlist):
        return inp
    # ... or a known tokenizer.
    if not config_names.isdisjoint(toklist):
        return inp
    return None
382 |
383 |
384 | # get function string for a feature
def get_fn_str(inp, attrs):
    """
    Build the function string and meta data for one feature configuration.

    Returns None for an empty/None configuration; otherwise returns whatever
    fill_fn_template returns for the attributes followed by the entries of
    the configuration.
    """
    if not inp:
        return None

    # A bare similarity-function name counts as a one-element configuration.
    config = [inp] if isinstance(inp, six.string_types) == True else inp

    template_args = list(attrs)
    template_args.extend(config)
    # fill function string from a template
    return fill_fn_template(*template_args)
396 |
397 |
398 | # fill function template
def fill_fn_template(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Build the Python source of one feature function.

    The generated function takes (ltuple, rtuple) and returns
    sim_func applied to ltuple[attr1] and rtuple[attr2]. When both
    tokenizers are given, each value is passed through its tokenizer first.

    Returns:
        The tuple (fn_name, attr1, attr2, tok_func_1, tok_func_2, sim_func,
        source), where fn_name comes from get_fn_name and source is the
        generated code including its two wildcard import lines.

    NOTE(review): attr1/attr2 are spliced into the source between double
    quotes without escaping, so an attribute name containing a double quote
    would produce broken code.
    """
    # construct function string
    s = 'from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers import *\n'
    # get the function name
    fn_name = get_fn_name(attr1, attr2, sim_func, tok_func_1, tok_func_2)
    # proceed with function construction
    fn_st = 'def ' + fn_name + '(ltuple, rtuple):'
    s += fn_st
    s += '\n'

    # indent the single-statement body of the generated function
    s += ' '
    fn_body = 'return '
    # Tokenizers are only applied when BOTH are given; a lone tokenizer is
    # ignored and the raw attribute values are compared.
    if tok_func_1 is not None and tok_func_2 is not None:
        fn_body = fn_body + sim_func + '(' + tok_func_1 + '(' + 'ltuple["' + attr1 + '"]'
        fn_body += '), '
        fn_body = fn_body + tok_func_2 + '(' + 'rtuple["' + attr2 + '"]'
        fn_body = fn_body + ')) '
    else:
        fn_body = fn_body + sim_func + '(' + 'ltuple["' + attr1 + '"], rtuple["' + attr2 + '"])'
    s += fn_body

    return fn_name, attr1, attr2, tok_func_1, tok_func_2, sim_func, s
422 |
423 |
424 | # construct function name from attrs, tokenizers and sim funcs
425 |
426 | # sim_fn_names=['jaccard', 'lev', 'cosine', 'monge_elkan',
427 | # 'needleman_wunsch', 'smith_waterman', 'jaro', 'jaro_winkler',
428 | # 'exact_match', 'rel_diff', 'abs_norm']
def get_fn_name(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None):
    """
    Construct the feature function name from attrs, tokenizers and sim func.

    The name is "<attr1>_<attr2>_<sim>[_<tok1>][_<tok2>]": whitespace runs in
    the attribute names become single underscores, known names are replaced
    by their abbreviation, and empty/None parts are left out.
    """
    abbreviations = {
        "jaccard": "jac",
        "lev_dist": "lev_dist",
        "lev_sim": "lev_sim",
        "cosine": "cos",
        "monge_elkan": "mel",
        "needleman_wunsch": "nmw",
        "smith_waterman": "sw",
        "jaro": "jar",
        "jaro_winkler": "jwn",
        "exact_match": "exm",
        "abs_norm": "anm",
        "rel_diff": "rdf",
        "1": "1",
        "2": "2",
        "3": "3",
        "4": "4",
        "tok_whitespace": "wsp",
        "tok_qgram": "qgm",
        "tok_delim": "dlm",
    }

    attribute_part = '_'.join(
        '_'.join(attribute.split()) for attribute in (attr1, attr2))

    # Names without an abbreviation are used unchanged.
    function_part = '_'.join(
        abbreviations.get(part, part)
        for part in (sim_func, tok_func_1, tok_func_2)
        if part
    )
    return attribute_part + '_' + function_part
458 |
459 |
460 | # conv function string to function object and return with meta data
def conv_fn_str_to_obj(fn_tup, tok, sim_funcs):
    """
    Convert function strings to function objects and return them with their
    meta data.

    Args:
        fn_tup: Iterable of 7-tuples (name, left attribute, right attribute,
            left tokenizer, right tokenizer, sim function, source code).
            None entries (features rejected earlier) are skipped.
        tok (dictionary): Tokenizer name -> function.
        sim_funcs (dictionary): Similarity name -> function.

    Returns:
        A list with one dictionary per feature, holding the keys 'function',
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function_source' and 'is_auto_generated'.
    """
    # Namespace the generated code is executed in; the generated functions
    # end up in it under their own name.
    d_orig = {}
    d_orig.update(tok)
    d_orig.update(sim_funcs)
    d_ret_list = []
    for f in fn_tup:
        # get_fn_str yields None for a feature that failed validation;
        # indexing it would raise a TypeError, so it is skipped instead.
        if f is None:
            continue
        name, attr1, attr2, tok_1, tok_2, simfunction, source = f[:7]

        # NOTE(review): this executes generated source code. Attribute names
        # are embedded in that source, so they must come from trusted tables.
        six.exec_(source, d_orig)

        d_ret = {}
        d_ret['function'] = d_orig[name]
        d_ret['feature_name'] = name
        d_ret['left_attribute'] = attr1
        d_ret['right_attribute'] = attr2
        d_ret['left_attr_tokenizer'] = tok_1
        d_ret['right_attr_tokenizer'] = tok_2
        d_ret['simfunction'] = simfunction
        d_ret['function_source'] = source
        d_ret['is_auto_generated'] = True

        d_ret_list.append(d_ret)
    return d_ret_list
488 |
489 |
def flatten_list(inp_list):
    """
    Flatten one level of nesting: return a new list holding the items of
    every element of inp_list, in order.
    """
    flattened = []
    for sublist in inp_list:
        flattened.extend(sublist)
    return flattened
--------------------------------------------------------------------------------
/datasets/fodors_zagats/fodors.csv:
--------------------------------------------------------------------------------
1 | id,name,addr,city,phone,type,class
2 | 534,'arnie morton\'s of chicago','435 s. la cienega blv.','los angeles',310/246-1501,american,0
3 | 535,'art\'s delicatessen','12224 ventura blvd.','studio city',818/762-1221,american,1
4 | 536,'hotel bel-air','701 stone canyon rd.','bel air',310/472-1211,californian,2
5 | 537,'cafe bizou','14016 ventura blvd.','sherman oaks',818/788-3536,french,3
6 | 538,campanile,'624 s. la brea ave.','los angeles',213/938-1447,american,4
7 | 539,'chinois on main','2709 main st.','santa monica',310/392-9025,french,5
8 | 540,citrus,'6703 melrose ave.','los angeles',213/857-0034,californian,6
9 | 541,fenix,'8358 sunset blvd. west',hollywood,213/848-6677,american,7
10 | 542,granita,'23725 w. malibu rd.',malibu,310/456-0488,californian,8
11 | 543,'grill on the alley','9560 dayton way','los angeles',310/276-0615,american,9
12 | 544,'restaurant katsu','1972 n. hillhurst ave.','los angeles',213/665-1891,asian,10
13 | 545,'l\'orangerie','903 n. la cienega blvd.','los angeles',310/652-9770,french,11
14 | 546,'le chardonnay','8284 melrose ave.','los angeles',213/655-8880,french,12
15 | 547,'locanda veneta','3rd st.','los angeles',310/274-1893,italian,13
16 | 548,matsuhisa,'129 n. la cienega blvd.','beverly hills',310/659-9639,asian,14
17 | 549,'the palm','9001 santa monica blvd.','los angeles',310/550-8811,american,15
18 | 550,patina,'5955 melrose ave.','los angeles',213/467-1108,californian,16
19 | 551,'philippe\'s the original','1001 n. alameda st.','los angeles',213/628-3781,american,17
20 | 552,'pinot bistro','12969 ventura blvd.','los angeles',818/990-0500,french,18
21 | 553,'rex il ristorante','617 s. olive st.','los angeles',213/627-2300,italian,19
22 | 554,spago,'1114 horn ave.','los angeles',310/652-4025,californian,20
23 | 555,valentino,'3115 pico blvd.','santa monica',310/829-4313,italian,21
24 | 556,'yujean kang\'s gourmet chinese cuisine','67 n. raymond ave.','los angeles',818/585-0855,asian,22
25 | 557,'21 club','21 w. 52nd st.','new york',212/582-7200,american,23
26 | 558,aquavit,'13 w. 54th st.','new york',212/307-7311,continental,24
27 | 559,aureole,'34 e. 61st st.','new york','212/ 319-1660',american,25
28 | 560,'cafe lalo','201 w. 83rd st.','new york',212/496-6031,'coffee bar',26
29 | 561,'cafe des artistes','1 w. 67th st.','new york',212/877-3500,continental,27
30 | 562,'carmine\'s','2450 broadway between 90th and 91st sts.','new york',212/362-2200,italian,28
31 | 563,'carnegie deli','854 7th ave. between 54th and 55th sts.','new york',212/757-2245,delicatessen,29
32 | 564,chanterelle,'2 harrison st. near hudson st.','new york',212/966-6960,american,30
33 | 565,daniel,'20 e. 76th st.','new york',212/288-0033,french,31
34 | 566,dawat,'210 e. 58th st.','new york',212/355-7555,asian,32
35 | 567,felidia,'243 e. 58th st.','new york',212/758-1479,italian,33
36 | 568,'four seasons grill room','99 e. 52nd st.','new york',212/754-9494,american,34
37 | 569,'gotham bar & grill','12 e. 12th st.','new york',212/620-4020,american,35
38 | 570,'gramercy tavern','42 e. 20th st. between park ave. s and broadway','new york',212/477-0777,american,36
39 | 571,'island spice','402 w. 44th st.','new york',212/765-1737,'tel caribbean',37
40 | 572,'jo jo','160 e. 64th st.','new york',212/223-5656,american,38
41 | 573,'la caravelle','33 w. 55th st.','new york',212/586-4252,french,39
42 | 574,'la cote basque','60 w. 55th st. between 5th and 6th ave.','new york',212/688-6525,french,40
43 | 575,'le bernardin','155 w. 51st st.','new york',212/489-1515,french,41
44 | 576,'les celebrites','160 central park s','new york',212/484-5113,french,42
45 | 577,lespinasse,'2 e. 55th st.','new york',212/339-6719,american,43
46 | 578,lutece,'249 e. 50th st.','new york',212/752-2225,french,44
47 | 579,'manhattan ocean club','57 w. 58th st.','new york','212/ 371-7777',seafood,45
48 | 580,march,'405 e. 58th st.','new york',212/754-6272,american,46
49 | 581,'mesa grill','102 5th ave. between 15th and 16th sts.','new york',212/807-7400,american,47
50 | 582,'mi cocina','57 jane st. off hudson st.','new york',212/627-8273,mexican,48
51 | 583,montrachet,'239 w. broadway between walker and white sts.','new york','212/ 219-2777',french,49
52 | 584,oceana,'55 e. 54th st.','new york',212/759-5941,seafood,50
53 | 585,'park avenue cafe','100 e. 63rd st.','new york',212/644-1900,american,51
54 | 586,petrossian,'182 w. 58th st.','new york',212/245-2214,french,52
55 | 587,picholine,'35 w. 64th st.','new york',212/724-8585,mediterranean,53
56 | 588,pisces,'95 ave. a at 6th st.','new york',212/260-6660,seafood,54
57 | 589,'rainbow room','30 rockefeller plaza','new york',212/632-5000,'or 212/632-5100 american',55
58 | 590,'river cafe','1 water st. at the east river',brooklyn,718/522-5200,american,56
59 | 591,'san domenico','240 central park s','new york',212/265-5959,italian,57
60 | 592,'second avenue deli','156 2nd ave. at 10th st.','new york',212/677-0606,delicatessen,58
61 | 593,seryna,'11 e. 53rd st.','new york',212/980-9393,asian,59
62 | 594,'shun lee west','43 w. 65th st.','new york',212/371-8844,asian,60
63 | 595,'sign of the dove','1110 3rd ave. at 65th st.','new york',212/861-8080,american,61
64 | 596,'smith & wollensky','201 e. 49th st.','new york',212/753-1530,american,62
65 | 597,'tavern on the green','in central park at 67th st.','new york',212/873-3200,american,63
66 | 598,'uncle nick\'s','747 9th ave. between 50th and 51st sts.','new york',212/315-1726,mediterranean,64
67 | 599,'union square cafe','21 e. 16th st.','new york',212/243-4020,american,65
68 | 600,'virgil\'s','152 w. 44th st.','new york','212/ 921-9494',american,66
69 | 601,'chin\'s','3200 las vegas blvd. s','las vegas',702/733-8899,asian,67
70 | 602,'coyote cafe','3799 las vegas blvd. s','las vegas',702/891-7349,southwestern,68
71 | 603,'le montrachet','3000 w. paradise rd.','las vegas',702/732-5111,continental,69
72 | 604,'palace court','3570 las vegas blvd. s','las vegas',702/731-7547,continental,70
73 | 605,'second street grille','200 e. fremont st.','las vegas',702/385-3232,seafood,71
74 | 606,'steak house','2880 las vegas blvd. s','las vegas',702/734-0410,'steak houses',72
75 | 607,tillerman,'2245 e. flamingo rd.','las vegas',702/731-4036,seafood,73
76 | 608,abruzzi,'2355 peachtree rd. peachtree battle shopping center',atlanta,404/261-8186,italian,74
77 | 609,bacchanalia,'3125 piedmont rd. near peachtree rd.',atlanta,404/365-0410,international,75
78 | 610,'bone\'s','3130 piedmont road',atlanta,404/237-2663,american,76
79 | 611,'brasserie le coze','3393 peachtree rd. lenox square mall near neiman marcus',atlanta,404/266-1440,french,77
80 | 612,'buckhead diner','3073 piedmont road',atlanta,404/262-3336,american,78
81 | 613,ciboulette,'1529 piedmont ave.',atlanta,404/874-7600,french,79
82 | 614,delectables,'1 margaret mitchell sq.',atlanta,404/681-2909,american,80
83 | 615,'georgia grille','2290 peachtree rd. peachtree square shopping center',atlanta,404/352-3517,american,81
84 | 616,'hedgerose heights inn','490 e. paces ferry rd.',atlanta,404/233-7673,international,82
85 | 617,'heera of india','595 piedmont ave. rio shopping mall',atlanta,404/876-4408,asian,83
86 | 618,'indigo coastal grill','1397 n. highland ave.',atlanta,404/876-0676,caribbean,84
87 | 619,'la grotta','2637 peachtree rd. peachtree house condominium',atlanta,404/231-1368,italian,85
88 | 620,'mary mac\'s tea room','224 ponce de leon ave.',atlanta,404/876-1800,southern,86
89 | 621,'nikolai\'s roof','255 courtland st. at harris st.',atlanta,404/221-6362,continental,87
90 | 622,'pano\'s and paul\'s','1232 w. paces ferry rd.',atlanta,404/261-3662,international,88
91 | 623,'cafe ritz-carlton buckhead','3434 peachtree rd.',atlanta,404/237-2700,'ext 6108 international',89
92 | 624,'dining room ritz-carlton buckhead','3434 peachtree rd.',atlanta,404/237-2700,international,90
93 | 625,'restaurant ritz-carlton atlanta','181 peachtree st.',atlanta,404/659-0400,continental,91
94 | 626,toulouse,'b peachtree rd.',atlanta,404/351-9533,french,92
95 | 627,'veni vidi vici','41 14th st.',atlanta,404/875-8424,italian,93
96 | 628,'alain rondelli','126 clement st.','san francisco',415/387-0408,french,94
97 | 629,aqua,'252 california st.','san francisco',415/956-9662,seafood,95
98 | 630,boulevard,'1 mission st.','san francisco',415/543-6084,american,96
99 | 631,'cafe claude','7 claude la.','san francisco',415/392-3505,french,97
100 | 632,'campton place','340 stockton st.','san francisco',415/955-5555,american,98
101 | 633,'chez michel','804 northpoint','san francisco',415/775-7036,french,99
102 | 634,'fleur de lys','777 sutter st.','san francisco',415/673-7779,french,100
103 | 635,fringale,'570 4th st.','san francisco',415/543-0573,french,101
104 | 636,'hawthorne lane','22 hawthorne st.','san francisco',415/777-9779,american,102
105 | 637,'khan toke thai house','5937 geary blvd.','san francisco',415/668-6654,asian,103
106 | 638,'la folie','2316 polk st.','san francisco',415/776-5577,french,104
107 | 639,lulu,'816 folsom st.','san francisco',415/495-5775,mediterranean,105
108 | 640,'masa\'s','648 bush st.','san francisco',415/989-7154,french,106
109 | 641,'mifune japan center kintetsu building','1737 post st.','san francisco',415/922-0337,asian,107
110 | 642,'plumpjack cafe','3201 fillmore st.','san francisco',415/563-4755,mediterranean,108
111 | 643,postrio,'545 post st.','san francisco',415/776-7825,american,109
112 | 644,'ritz-carlton restaurant and dining room','600 stockton st.','san francisco',415/296-7465,american,110
113 | 645,'rose pistola','532 columbus ave.','san francisco',415/399-0499,italian,111
114 | 646,bolo,'23 e. 22nd st.','new york',212/228-2200,mediterranean,191
115 | 647,'il nido','251 e. 53rd st.','new york',212/753-8450,italian,267
116 | 648,remi,'145 w. 53rd st.','new york',212/581-4242,italian,334
117 | 649,'adriano\'s ristorante','2930 beverly glen circle','los angeles',310/475-9807,italian,112
118 | 650,'barney greengrass','9570 wilshire blvd.','beverly hills',310/777-5877,american,113
119 | 651,beaurivage,'26025 pacific coast hwy.',malibu,310/456-5733,french,114
120 | 652,'bistro garden','176 n. canon dr.','los angeles',310/550-3900,californian,115
121 | 653,'border grill','4th st.','los angeles',310/451-1655,mexican,116
122 | 654,'broadway deli','3rd st. promenade','santa monica',310/451-0616,american,117
123 | 655,'ca\'brea','346 s. la brea ave.','los angeles',213/938-2863,italian,118
124 | 656,'ca\'del sol','4100 cahuenga blvd.','los angeles',818/985-4669,italian,119
125 | 657,'cafe pinot','700 w. fifth st.','los angeles',213/239-6500,californian,120
126 | 658,'california pizza kitchen','207 s. beverly dr.','los angeles',310/275-1101,californian,121
127 | 659,'canter\'s','419 n. fairfax ave.','los angeles',213/651-2030.,american,122
128 | 660,cava,'3rd st.','los angeles',213/658-8898,mediterranean,123
129 | 661,'cha cha cha','656 n. virgil ave.','los angeles',213/664-7723,caribbean,124
130 | 662,'chan dara','310 n. larchmont blvd.','los angeles',213/467-1052,asian,125
131 | 663,'clearwater cafe','168 w. colorado blvd.','los angeles',818/356-0959,'health food',126
132 | 664,'dining room','9500 wilshire blvd.','los angeles',310/275-5200,californian,127
133 | 665,dive!,'10250 santa monica blvd.','los angeles',310/788-,'dive american',128
134 | 666,drago,'2628 wilshire blvd.','santa monica',310/828-1585,italian,129
135 | 667,'drai\'s','730 n. la cienega blvd.','los angeles',310/358-8585,french,130
136 | 668,'dynasty room','930 hilgard ave.','los angeles',310/208-8765,continental,131
137 | 669,eclipse,'8800 melrose ave.','los angeles',310/724-5959,californian,132
138 | 670,'ed debevic\'s','134 n. la cienega','los angeles',310/659-1952,american,133
139 | 671,'el cholo','1121 s. western ave.','los angeles',213/734-2773,mexican,134
140 | 672,'gilliland\'s','2424 main st.','santa monica',310/392-3901,american,135
141 | 673,'gladstone\'s','4 fish 17300 pacific coast hwy. at sunset blvd.','pacific palisades',310/454-3474,american,136
142 | 674,'hard rock cafe','8600 beverly blvd.','los angeles',310/276-7605,american,137
143 | 675,'harry\'s bar & american grill','2020 ave. of the stars','los angeles',310/277-2333,italian,138
144 | 676,'il fornaio cucina italiana','301 n. beverly dr.','los angeles',310/550-8330,italian,139
145 | 677,'jack sprat\'s grill','10668 w. pico blvd.','los angeles',310/837-6662,'health food',140
146 | 678,'jackson\'s farm','439 n. beverly drive','los angeles',310/273-5578,californian,141
147 | 679,'jimmy\'s','201 moreno dr.','los angeles',310/552-2394,continental,142
148 | 680,joss,'9255 sunset blvd.','los angeles',310/276-1886,asian,143
149 | 681,'le colonial','8783 beverly blvd.','los angeles',310/289-0660,asian,144
150 | 682,'le dome','8720 sunset blvd.','los angeles',310/659-6919,french,145
151 | 683,'louise\'s trattoria','4500 los feliz blvd.','los angeles',213/667-0777,italian,146
152 | 684,'mon kee seafood restaurant','679 n. spring st.','los angeles',213/628-6717,asian,147
153 | 685,'morton\'s','8764 melrose ave.','los angeles',310/276-5205,american,148
154 | 686,'nate \'n\' al\'s','414 n. beverly dr.','los angeles',310/274-0101,american,149
155 | 687,nicola,'601 s. figueroa st.','los angeles',213/485-0927,american,150
156 | 688,'ocean avenue','1401 ocean ave.','santa monica',310/394-5669,american,151
157 | 689,orleans,'11705 national blvd.','los angeles',310/479-4187,cajun,152
158 | 690,'pacific dining car','6th st.','los angeles',213/483-6000,american,153
159 | 691,'paty\'s','10001 riverside dr.','toluca lake',818/761-9126,american,154
160 | 692,'pinot hollywood','1448 n. gower st.','los angeles',213/461-8800,californian,155
161 | 693,posto,'14928 ventura blvd.','sherman oaks',818/784-4400,italian,156
162 | 694,prego,'362 n. camden dr.','los angeles',310/277-7346,italian,157
163 | 695,'rj\'s the rib joint','252 n. beverly dr.','los angeles',310/274-7427,american,158
164 | 696,remi,'3rd st. promenade','santa monica',310/393-6545,italian,159
165 | 697,'restaurant horikawa','111 s. san pedro st.','los angeles',213/680-9355,asian,160
166 | 698,'roscoe\'s house of chicken \'n\' waffles','1514 n. gower st.','los angeles',213/466-9329,american,161
167 | 699,'schatzi on main','3110 main st.','los angeles',310/399-4800,continental,162
168 | 700,sofi,'3rd st.','los angeles',213/651-0346,mediterranean,163
169 | 701,swingers,'8020 beverly blvd.','los angeles',213/653-5858,american,164
170 | 702,'tavola calda','7371 melrose ave.','los angeles',213/658-6340,italian,165
171 | 703,'the mandarin','430 n. camden dr.','los angeles',310/859-0926,asian,166
172 | 704,'tommy tang\'s','7313 melrose ave.','los angeles',213/937-5733,asian,167
173 | 705,'tra di noi','3835 cross creek rd.','los angeles',310/456-0169,italian,168
174 | 706,'trader vic\'s','9876 wilshire blvd.','los angeles',310/276-6345,asian,169
175 | 707,vida,'1930 north hillhurst ave.','los feliz',213/660-4446,american,170
176 | 708,'west beach cafe','60 n. venice blvd.','los angeles',310/823-5396,american,171
177 | 709,'20 mott','20 mott st. between bowery and pell st.','new york',212/964-0380,asian,172
178 | 710,'9 jones street','9 jones st.','new york',212/989-1220,american,173
179 | 711,adrienne,'700 5th ave. at 55th st.','new york',212/903-3918,french,174
180 | 712,agrotikon,'322 e. 14 st. between 1st and 2nd aves.','new york',212/473-2602,mediterranean,175
181 | 713,aja,'937 broadway at 22nd st.','new york',212/473-8388,american,176
182 | 714,alamo,'304 e. 48th st.','new york','212/ 759-0590',mexican,177
183 | 715,'alley\'s end','311 w. 17th st.','new york',212/627-8899,american,178
184 | 716,'ambassador grill','1 united nations plaza at 44th st.','new york',212/702-5014,american,179
185 | 717,'american place','2 park ave. at 32nd st.','new york',212/684-2122,american,180
186 | 718,'anche vivolo','222 e. 58th st. between 2nd and 3rd aves.','new york',212/308-0112,italian,181
187 | 719,arizona,'206 206 e. 60th st.','new york',212/838-0440,american,182
188 | 720,'arturo\'s','106 w. houston st. off thompson st.','new york',212/677-3820,italian,183
189 | 721,'au mandarin','200-250 vesey st. world financial center','new york',212/385-0313,asian,184
190 | 722,'bar anise','1022 3rd ave. between 60th and 61st sts.','new york',212/355-1112,mediterranean,185
191 | 723,barbetta,'321 w. 46th st.','new york',212/246-9171,italian,186
192 | 724,'ben benson\'s','123 w. 52nd st.','new york',212/581-8888,american,187
193 | 725,'big cup','228 8th ave. between 21st and 22nd sts.','new york',212/206-0059,'coffee bar',188
194 | 726,'billy\'s','948 1st ave. between 52nd and 53rd sts.','new york',212/753-1870,american,189
195 | 727,'boca chica','13 1st ave. near 1st st.','new york',212/473-0108,'latin american',190
196 | 728,boonthai,'1393a 2nd ave. between 72nd and 73rd sts.','new york',212/249-8484,asian,192
197 | 729,bouterin,'420 e. 59th st. off 1st ave.','new york',212/758-0323,french,193
198 | 730,'brothers bar-b-q','225 varick st. at clarkston st.','new york',212/727-2775,american,194
199 | 731,bruno,'240 e. 58th st.','new york',212/688-4190,italian,195
200 | 732,'bryant park grill roof restaurant and bp cafe','25 w. 40th st. between 5th and 6th aves.','new york',212/840-6500,american,196
201 | 733,c3,'103 waverly pl. near washington sq.','new york',212/254-1200,american,197
202 | 734,ct,'111 e. 22nd st. between park ave. s and lexington ave.','new york',212/995-8500,french,198
203 | 735,'cafe bianco','1486 2nd ave. between 77th and 78th sts.','new york',212/988-2655,'coffee bar',199
204 | 736,'cafe botanica','160 central park s','new york',212/484-5120,french,200
205 | 737,'cafe la fortuna','69 w. 71st st.','new york',212/724-5846,'coffee bar',201
206 | 738,'cafe luxembourg','200 w. 70th st.','new york',212/873-7411,french,202
207 | 739,'cafe pierre','2 e. 61st st.','new york',212/940-8185,french,203
208 | 740,'cafe centro','200 park ave. between 45th st. and vanderbilt ave.','new york',212/818-1222,french,204
209 | 741,'cafe fes','246 w. 4th st. at charles st.','new york',212/924-7653,mediterranean,205
210 | 742,'caffe dante','81 macdougal st. between houston and bleeker sts.','new york',212/982-5275,'coffee bar',206
211 | 743,'caffe dell\'artista','46 greenwich ave.','new york',212/645-4431,'coffee bar',207
212 | 744,'caffe lure','169 sullivan st. between houston and bleecker sts.','new york',212/473-2642,french,208
213 | 745,'caffe reggio','119 macdougal st. between 3rd and bleecker sts.','new york',212/475-9557,'coffee bar',209
214 | 746,'caffe roma','385 broome st. at mulberry','new york',212/226-8413,'coffee bar',210
215 | 747,'caffe vivaldi','32 jones st. at bleecker st.','new york',212/691-7538,'coffee bar',211
216 | 748,'caffe bondi ristorante','7 w. 20th st.','new york',212/691-8136,italian,212
217 | 749,'capsouto freres','451 washington st. near watts st.','new york',212/966-4900,french,213
218 | 750,'captain\'s table','860 2nd ave. at 46th st.','new york',212/697-9538,seafood,214
219 | 751,'casa la femme','150 wooster st. between houston and prince sts.','new york',212/505-0005,'middle eastern',215
220 | 752,'cendrillon asian grill & marimba bar','45 mercer st. between broome and grand sts.','new york',212/343-9012,asian,216
221 | 753,'chez jacqueline','72 macdougal st. between w. houston and bleecker sts.','new york',212/505-0727,french,217
222 | 754,chiam,'160 e. 48th st.','new york',212/371-2323,asian,218
223 | 755,'china grill','60 w. 53rd st.','new york',212/333-7788,american,219
224 | 756,cite,'120 w. 51st st.','new york',212/956-7100,french,220
225 | 757,'coco pazzo','23 e. 74th st.','new york',212/794-0205,italian,221
226 | 758,'columbus bakery','53rd sts.','new york',212/421-0334,'coffee bar',222
227 | 759,'corrado cafe','1013 3rd ave. between 60th and 61st sts.','new york',212/753-5100,'coffee bar',223
228 | 760,'cupcake cafe','522 9th ave. at 39th st.','new york',212/465-1530,'coffee bar',224
229 | 761,'da nico','164 mulberry st. between grand and broome sts.','new york',212/343-1212,italian,225
230 | 762,'dean & deluca','121 prince st.','new york',212/254-8776,'coffee bar',226
231 | 763,diva,'341 w. broadway near grand st.','new york',212/941-9024,italian,227
232 | 764,'dix et sept','181 w. 10th st.','new york',212/645-8023,french,228
233 | 765,docks,'633 3rd ave. at 40th st.','new york','212/ 986-8080',seafood,229
234 | 766,'duane park cafe','157 duane st. between w. broadway and hudson st.','new york',212/732-5555,american,230
235 | 767,'el teddy\'s','219 w. broadway between franklin and white sts.','new york',212/941-7070,mexican,231
236 | 768,'emily\'s','1325 5th ave. at 111th st.','new york',212/996-1212,american,232
237 | 769,'empire korea','6 e. 32nd st.','new york',212/725-1333,asian,233
238 | 770,'ernie\'s','2150 broadway between 75th and 76th sts.','new york',212/496-1588,american,234
239 | 771,'evergreen cafe','1288 1st ave. at 69th st.','new york',212/744-3266,asian,235
240 | 772,'f. ille ponte ristorante','39 desbrosses st. near west st.','new york',212/226-4621,italian,236
241 | 773,felix,'340 w. broadway at grand st.','new york',212/431-0021,french,237
242 | 774,ferrier,'29 e. 65th st.','new york',212/772-9000,french,238
243 | 775,'fifty seven fifty seven','57 e. 57th st.','new york',212/758-5757,american,239
244 | 776,'film center cafe','635 9th ave. between 44th and 45th sts.','new york','212/ 262-2525',american,240
245 | 777,'fiorello\'s roman cafe','1900 broadway between 63rd and 64th sts.','new york',212/595-5330,italian,241
246 | 778,firehouse,'522 columbus ave. between 85th and 86th sts.','new york',212/595-3139,american,242
247 | 779,first,'87 1st ave. between 5th and 6th sts.','new york',212/674-3823,american,243
248 | 780,'fishin eddie','73 w. 71st st.','new york',212/874-3474,seafood,244
249 | 781,'fleur de jour','348 e. 62nd st.','new york',212/355-2020,'coffee bar',245
250 | 782,flowers,'21 west 17th st. between 5th and 6th aves.','new york',212/691-8888,american,246
251 | 783,follonico,'6 w. 24th st.','new york',212/691-6359,italian,247
252 | 784,'fraunces tavern','54 pearl st. at broad st.','new york',212/269-0144,american,248
253 | 785,'french roast','458 6th ave. at 11th st.','new york',212/533-2233,french,249
254 | 786,'french roast cafe','2340 broadway at 85th st.','new york',212/799-1533,'coffee bar',250
255 | 787,'frico bar','402 w. 43rd st. off 9th ave.','new york',212/564-7272,italian,251
256 | 788,'fujiyama mama','467 columbus ave. between 82nd and 83rd sts.','new york',212/769-1144,asian,252
257 | 789,'gabriela\'s','685 amsterdam ave. at 93rd st.','new york',212/961-0574,mexican,253
258 | 790,'gallagher\'s','228 w. 52nd st.','new york',212/245-5336,american,254
259 | 791,'gianni\'s','15 fulton st.','new york',212/608-7300,seafood,255
260 | 792,girafe,'208 e. 58th st. between 2nd and 3rd aves.','new york',212/752-3054,italian,256
261 | 793,global,'33 93 2nd ave. between 5th and 6th sts.','new york',212/477-8427,american,257
262 | 794,'golden unicorn','18 e. broadway at catherine st.','new york','212/ 941-0911',asian,258
263 | 795,'grand ticino','228 thompson st. between w. 3rd and bleecker sts.','new york',212/777-5922,italian,259
264 | 796,halcyon,'151 w. 54th st. in the rihga royal hotel','new york',212/468-8888,american,260
265 | 797,'hard rock cafe','221 w. 57th st.','new york',212/489-6565,american,261
266 | 798,'hi-life restaurant and lounge','1340 1st ave. at 72nd st.','new york',212/249-3600,american,262
267 | 799,home,'20 cornelia st. between bleecker and w. 4th st.','new york',212/243-9579,american,263
268 | 800,'hudson river club','4 world financial center','new york',212/786-1500,american,264
269 | 801,'i trulli','122 e. 27th st. between lexington and park aves.','new york',212/481-7372,italian,265
270 | 802,'il cortile','125 mulberry st. between canal and hester sts.','new york',212/226-6060,italian,266
271 | 803,'inca grill','492 broome st. near w. broadway','new york',212/966-3371,'latin american',268
272 | 804,indochine,'430 lafayette st. between 4th st. and astor pl.','new york',212/505-5111,asian,269
273 | 805,'internet cafe','82 e. 3rd st. between 1st and 2nd aves.','new york','212/ 614-0747','coffee bar',270
274 | 806,ipanema,'13 w. 46th st.','new york',212/730-5848,'latin american',271
275 | 807,'jean lafitte','68 w. 58th st.','new york',212/751-2323,french,272
276 | 808,'jewel of india','15 w. 44th st.','new york',212/869-5544,asian,273
277 | 809,'jimmy sung\'s','219 e. 44th st. between 2nd and 3rd aves.','new york',212/682-5678,asian,274
278 | 810,'joe allen','326 w. 46th st.','new york',212/581-6464,american,275
279 | 811,'judson grill','152 w. 52nd st.','new york',212/582-5252,american,276
280 | 812,'l\'absinthe','227 e. 67th st.','new york',212/794-4950,french,277
281 | 813,'l\'auberge','1191 1st ave. between 64th and 65th sts.','new york',212/288-8791,'middle eastern',278
282 | 814,'l\'auberge du midi','310 w. 4th st. between w. 12th and bank sts.','new york',212/242-4705,french,279
283 | 815,'l\'udo','432 lafayette st. near astor pl.','new york',212/388-0978,french,280
284 | 816,'la reserve','4 w. 49th st.','new york',212/247-2993,french,281
285 | 817,'lanza restaurant','168 1st ave. between 10th and 11th sts.','new york',212/674-7014,italian,282
286 | 818,'lattanzi ristorante','361 w. 46th st.','new york',212/315-0980,italian,283
287 | 819,layla,'211 w. broadway at franklin st.','new york',212/431-0700,'middle eastern',284
288 | 820,'le chantilly','106 e. 57th st.','new york',212/751-2931,french,285
289 | 821,'le colonial','149 e. 57th st.','new york','212/ 752-0808',asian,286
290 | 822,'le gamin','50 macdougal st. between houston and prince sts.','new york',212/254-4678,'coffee bar',287
291 | 823,'le jardin','25 cleveland pl. near spring st.','new york',212/343-9599,french,288
292 | 824,'le madri','168 w. 18th st.','new york',212/727-8022,italian,289
293 | 825,'le marais','150 w. 46th st.','new york',212/869-0900,american,290
294 | 826,'le perigord','405 e. 52nd st.','new york',212/755-6244,french,291
295 | 827,'le select','507 columbus ave. between 84th and 85th sts.','new york',212/875-1993,american,292
296 | 828,'les halles','411 park ave. s between 28th and 29th sts.','new york',212/679-4111,french,293
297 | 829,'lincoln tavern','51 w. 64th st.','new york',212/721-8271,american,294
298 | 830,lola,'30 west 22nd st. between 5th and 6th ave.','new york',212/675-6700,american,295
299 | 831,'lucky strike','59 grand st. between wooster st. and w. broadway','new york',212/941-0479,'or 212/941-0772 american',296
300 | 832,'mad fish','2182 broadway between 77th and 78th sts.','new york',212/787-0202,seafood,297
301 | 833,'main street','446 columbus ave. between 81st and 82nd sts.','new york',212/873-5025,american,298
302 | 834,'mangia e bevi','800 9th ave. at 53rd st.','new york',212/956-3976,italian,299
303 | 835,'manhattan cafe','1161 1st ave. between 63rd and 64th sts.','new york',212/888-6556,american,300
304 | 836,'manila garden','325 e. 14th st. between 1st and 2nd aves.','new york',212/777-6314,asian,301
305 | 837,marichu,'342 e. 46th st. between 1st and 2nd aves.','new york',212/370-1866,french,302
306 | 838,'marquet patisserie','15 e. 12th st. between 5th ave. and university pl.','new york',212/229-9313,'coffee bar',303
307 | 839,match,'160 mercer st. between houston and prince sts.','new york',212/906-9173,american,304
308 | 840,'matthew\'s','1030 3rd ave. at 61st st.','new york',212/838-4343,american,305
309 | 841,'mavalli palace','46 e. 29th st.','new york',212/679-5535,asian,306
310 | 842,'milan cafe and coffee bar','120 w. 23rd st.','new york',212/807-1801,'coffee bar',307
311 | 843,'monkey bar','60 e. 54th st.','new york',212/838-2600,american,308
312 | 844,montien,'1134 1st ave. between 62nd and 63rd sts.','new york',212/421-4433,asian,309
313 | 845,'morton\'s','551 5th ave. at 45th st.','new york',212/972-3315,american,310
314 | 846,'motown cafe','104 w. 57th st. near 6th ave.','new york',212/581-8030,american,311
315 | 847,'new york kom tang soot bul house','32 w. 32nd st.','new york','212/ 947-8482',asian,312
316 | 848,'new york noodletown','28 1/2 bowery at bayard st.','new york',212/349-0923,asian,313
317 | 849,newsbar,'2 w. 19th st.','new york',212/255-3996,'coffee bar',314
318 | 850,odeon,'145 w. broadway at thomas st.','new york',212/233-0507,american,315
319 | 851,orso,'322 w. 46th st.','new york',212/489-7212,italian,316
320 | 852,'osteria al droge','142 w. 44th st.','new york',212/944-3643,italian,317
321 | 853,otabe,'68 e. 56th st.','new york',212/223-7575,asian,318
322 | 854,pacifica,'138 lafayette st. between canal and howard sts.','new york',212/941-4168,asian,319
323 | 855,palio,'151 w. 51st. st.','new york',212/245-4850,italian,320
324 | 856,pamir,'1065 1st ave. at 58th st.','new york',212/644-9258,'middle eastern',321
325 | 857,'parioli romanissimo','24 e. 81st st.','new york',212/288-2391,italian,322
326 | 858,patria,'250 park ave. s at 20th st.','new york',212/777-6211,'latin american',323
327 | 859,'peacock alley','301 park ave. between 49th and 50th sts.','new york',212/872-4895,french,324
328 | 860,'pen & pencil','205 e. 45th st.','new york',212/682-8660,american,325
329 | 861,'penang soho','109 spring st. between greene and mercer sts.','new york',212/274-8883,asian,326
330 | 862,persepolis,'1423 2nd ave. between 74th and 75th sts.','new york',212/535-1100,'middle eastern',327
331 | 863,'planet hollywood','140 w. 57th st.','new york',212/333-7827,american,328
332 | 864,pomaire,'371 w. 46th st. off 9th ave.','new york','212/ 956-3055','latin american',329
333 | 865,'popover cafe','551 amsterdam ave. between 86th and 87th sts.','new york',212/595-8555,american,330
334 | 866,'post house','28 e. 63rd st.','new york',212/935-2888,american,331
335 | 867,rain,'100 w. 82nd st.','new york',212/501-0776,asian,332
336 | 868,'red tulip','439 e. 75th st.','new york',212/734-4893,'eastern european',333
337 | 869,republic,'37a union sq. w between 16th and 17th sts.','new york',212/627-7172,asian,335
338 | 870,'roettelle a. g','126 e. 7th st. between 1st ave. and ave. a','new york',212/674-4140,continental,336
339 | 871,'rosa mexicano','1063 1st ave. at 58th st.','new york',212/753-7407,mexican,337
340 | 872,'ruth\'s chris','148 w. 51st st.','new york',212/245-9600,american,338
341 | 873,s.p.q.r,'133 mulberry st. between hester and grand sts.','new york',212/925-3120,italian,339
342 | 874,'sal anthony\'s','55 irving pl.','new york',212/982-9030,italian,340
343 | 875,'sammy\'s roumanian steak house','157 chrystie st. at delancey st.','new york',212/673-0330,'east european',341
344 | 876,'san pietro','18 e. 54th st.','new york',212/753-9015,italian,342
345 | 877,'sant ambroeus','1000 madison ave. between 77th and 78th sts.','new york',212/570-2211,'coffee bar',343
346 | 878,'sarabeth\'s kitchen','423 amsterdam ave. between 80th and 81st sts.','new york',212/496-6280,american,344
347 | 879,'sea grill','19 w. 49th st.','new york',212/332-7610,seafood,345
348 | 880,serendipity,'3 225 e. 60th st.','new york',212/838-3531,american,346
349 | 881,'seventh regiment mess and bar','643 park ave. at 66th st.','new york',212/744-4107,american,347
350 | 882,sfuzzi,'58 w. 65th st.','new york',212/873-3700,american,348
351 | 883,shaan,'57 w. 48th st.','new york','212/ 977-8400',asian,349
352 | 884,'sofia fabulous pizza','1022 madison ave. near 79th st.','new york',212/734-2676,italian,350
353 | 885,'spring street natural restaurant & bar','62 spring st. at lafayette st.','new york',212/966-0290,american,351
354 | 886,'stage deli','834 7th ave. between 53rd and 54th sts.','new york',212/245-7850,delicatessen,352
355 | 887,stingray,'428 amsterdam ave. between 80th and 81st sts.','new york',212/501-7515,seafood,353
356 | 888,'sweet\'n\'tart cafe','76 mott st. at canal st.','new york',212/334-8088,asian,354
357 | 889,'t salon','143 mercer st. at prince st.','new york',212/925-3700,'coffee bar',355
358 | 890,'tang pavillion','65 w. 55th st.','new york',212/956-6888,asian,356
359 | 891,tapika,'950 8th ave. at 56th st.','new york','212/ 397-3737',american,357
360 | 892,'teresa\'s','103 1st ave. between 6th and 7th sts.','new york',212/228-0604,'east european',358
361 | 893,terrace,'400 w. 119th st. between amsterdam and morningside aves.','new york',212/666-9490,continental,359
362 | 894,'the coffee pot','350 9th ave. at 49th st.','new york',212/265-3566,'coffee bar',360
363 | 895,'the savannah club','2420 broadway at 89th st.','new york',212/496-1066,american,361
364 | 896,'trattoria dell\'arte','900 7th ave. between 56th and 57th sts.','new york',212/245-9800,italian,362
365 | 897,triangolo,'345 e. 83rd st.','new york',212/472-4488,italian,363
366 | 898,'tribeca grill','375 greenwich st. near franklin st.','new york',212/941-3900,american,364
367 | 899,'trois jean','154 e. 79th st. between lexington and 3rd aves.','new york',212/988-4858,'coffee bar',365
368 | 900,'tse yang','34 e. 51st st.','new york',212/688-5447,asian,366
369 | 901,'turkish kitchen','386 3rd ave. between 27th and 28th sts.','new york',212/679-1810,'middle eastern',367
370 | 902,'two two two','222 w. 79th st.','new york',212/799-0400,american,368
371 | 903,'veniero\'s pasticceria','342 e. 11th st. near 1st ave.','new york',212/674-7264,'coffee bar',369
372 | 904,verbena,'54 irving pl. at 17th st.','new york',212/260-5454,american,370
373 | 905,'victor\'s cafe','52 236 w. 52nd st.','new york',212/586-7714,'latin american',371
374 | 906,'vince & eddie\'s','70 w. 68th st.','new york',212/721-0068,american,372
375 | 907,vong,'200 e. 54th st.','new york',212/486-9592,american,373
376 | 908,'water club','500 e. 30th st.','new york',212/683-3333,american,374
377 | 909,west,'63rd street steakhouse 44 w. 63rd st.','new york',212/246-6363,american,375
378 | 910,xunta,'174 1st ave. between 10th and 11th sts.','new york',212/614-0620,mediterranean,376
379 | 911,'zen palate','34 union sq. e at 16th st.','new york',212/614-9291,'and 212/614-9345 asian',377
380 | 912,zoe,'90 prince st. between broadway and mercer st.','new york',212/966-6722,american,378
381 | 913,abbey,'163 ponce de leon ave.',atlanta,404/876-8532,international,379
382 | 914,'aleck\'s barbecue heaven','783 martin luther king jr. dr.',atlanta,404/525-2062,barbecue,380
383 | 915,'annie\'s thai castle','3195 roswell rd.',atlanta,404/264-9546,asian,381
384 | 916,anthonys,'3109 piedmont rd. just south of peachtree rd.',atlanta,404/262-7379,american,382
385 | 917,'atlanta fish market','265 pharr rd.',atlanta,404/262-3165,american,383
386 | 918,'beesley\'s of buckhead','260 e. paces ferry road',atlanta,404/264-1334,continental,384
387 | 919,'bertolini\'s','3500 peachtree rd. phipps plaza',atlanta,404/233-2333,italian,385
388 | 920,bistango,'1100 peachtree st.',atlanta,404/724-0901,mediterranean,386
389 | 921,'cafe renaissance','7050 jimmy carter blvd. norcross',atlanta,770/441--0291,american,387
390 | 922,'camille\'s','1186 n. highland ave.',atlanta,404/872-7203,italian,388
391 | 923,cassis,'3300 peachtree rd. grand hyatt',atlanta,404/365-8100,mediterranean,389
392 | 924,'city grill','50 hurt plaza',atlanta,404/524-2489,international,390
393 | 925,'coco loco','40 buckhead crossing mall on the sidney marcus blvd.',atlanta,404/364-0212,caribbean,391
394 | 926,'colonnade restaurant','1879 cheshire bridge rd.',atlanta,404/874-5642,southern,392
395 | 927,'dante\'s down the hatch buckhead','3380 peachtree rd.',atlanta,404/266-1600,continental,393
396 | 928,'dante\'s down the hatch','underground underground mall underground atlanta',atlanta,404/577-1800,continental,394
397 | 929,'fat matt\'s rib shack','1811 piedmont ave. near cheshire bridge rd.',atlanta,404/607-1622,barbecue,395
398 | 930,'french quarter food shop','923 peachtree st. at 8th st.',atlanta,404/875-2489,southern,396
399 | 931,'holt bros. bar-b-q','6359 jimmy carter blvd. at buford hwy. norcross',atlanta,770/242-3984,barbecue,397
400 | 932,'horseradish grill','4320 powers ferry rd.',atlanta,404/255-7277,southern,398
401 | 933,'hsu\'s gourmet','192 peachtree center ave. at international blvd.',atlanta,404/659-2788,asian,399
402 | 934,'imperial fez','2285 peachtree rd. peachtree battle condominium',atlanta,404/351-0870,mediterranean,400
403 | 935,kamogawa,'3300 peachtree rd. grand hyatt',atlanta,404/841-0314,asian,401
404 | 936,'la grotta at ravinia dunwoody rd.','holiday inn/crowne plaza at ravinia dunwoody',atlanta,770/395-9925,italian,402
405 | 937,'little szechuan','c buford hwy. northwoods plaza doraville',atlanta,770/451-0192,asian,403
406 | 938,'lowcountry barbecue','6301 roswell rd. sandy springs plaza sandy springs',atlanta,404/255-5160,barbecue,404
407 | 939,'luna si','1931 peachtree rd.',atlanta,404/355-5993,continental,405
408 | 940,'mambo restaurante cubano','1402 n. highland ave.',atlanta,404/874-2626,caribbean,406
409 | 941,'mckinnon\'s louisiane','3209 maple dr.',atlanta,404/237-1313,southern,407
410 | 942,'mi spia dunwoody rd.','park place across from perimeter mall dunwoody',atlanta,770/393-1333,italian,408
411 | 943,'nickiemoto\'s: a sushi bar','247 buckhead ave. east village sq.',atlanta,404/842-0334,fusion,409
412 | 944,palisades,'1829 peachtree rd.',atlanta,404/350-6755,continental,410
413 | 945,'pleasant peasant','555 peachtree st. at linden ave.',atlanta,404/874-3223,american,411
414 | 946,pricci,'500 pharr rd.',atlanta,404/237-2941,italian,412
415 | 947,'r.j.\'s uptown kitchen & wine bar','870 n. highland ave.',atlanta,404/875-7775,american,413
416 | 948,'rib ranch','25 irby ave.',atlanta,404/233-7644,barbecue,414
417 | 949,'sa tsu ki','3043 buford hwy.',atlanta,404/325-5285,asian,415
418 | 950,'sato sushi and thai','6050 peachtree pkwy. norcross',atlanta,770/449-0033,asian,416
419 | 951,'south city kitchen','1144 crescent ave.',atlanta,404/873-7358,southern,417
420 | 952,'south of france','2345 cheshire bridge rd.',atlanta,404/325-6963,french,418
421 | 953,'stringer\'s fish camp and oyster bar','3384 shallowford rd. chamblee',atlanta,770/458-7145,southern,419
422 | 954,'sundown cafe','2165 cheshire bridge rd.',atlanta,404/321-1118,american,420
423 | 955,'taste of new orleans','889 w. peachtree st.',atlanta,404/874-5535,southern,421
424 | 956,tomtom,'3393 peachtree rd.',atlanta,404/264-1163,continental,422
425 | 957,'antonio\'s','3700 w. flamingo','las vegas',702/252-7737,italian,423
426 | 958,'bally\'s big kitchen','3645 las vegas blvd. s','las vegas',702/739-4111,buffets,424
427 | 959,'bamboo garden','4850 flamingo rd.','las vegas',702/871-3262,asian,425
428 | 960,'battista\'s hole in the wall','4041 audrie st. at flamingo rd.','las vegas',702/732-1424,italian,426
429 | 961,'bertolini\'s','3570 las vegas blvd. s','las vegas',702/735-4663,italian,427
430 | 962,'binion\'s coffee shop','128 fremont st.','las vegas',702/382-1600,'coffee shops/diners',428
431 | 963,bistro,'3400 las vegas blvd. s','las vegas',702/791-7111,continental,429
432 | 964,broiler,'4111 boulder hwy.','las vegas',702/432-7777,american,430
433 | 965,'bugsy\'s diner','3555 las vegas blvd. s','las vegas',702/733-3111,'coffee shops/diners',431
434 | 966,'cafe michelle','1350 e. flamingo rd.','las vegas',702/735-8686,american,432
435 | 967,'cafe roma','3570 las vegas blvd. s','las vegas',702/731-7547,'coffee shops/diners',433
436 | 968,'capozzoli\'s','3333 s. maryland pkwy.','las vegas',702/731-5311,italian,434
437 | 969,'carnival world','3700 w. flamingo rd.','las vegas',702/252-7777,buffets,435
438 | 970,'center stage plaza hotel','1 main st.','las vegas',702/386-2512,american,436
439 | 971,'circus circus','2880 las vegas blvd. s','las vegas',702/734-0410,buffets,437
440 | 972,'empress court','3570 las vegas blvd. s','las vegas',702/731-7888,asian,438
441 | 973,feast,'2411 w. sahara ave.','las vegas',702/367-2411,buffets,439
442 | 974,'golden nugget hotel','129 e. fremont st.','las vegas',702/385-7111,buffets,440
443 | 975,'golden steer','308 w. sahara ave.','las vegas',702/384-4470,'steak houses',441
444 | 976,'lillie langtry\'s','129 e. fremont st.','las vegas',702/385-7111,asian,442
445 | 977,'mandarin court','1510 e. flamingo rd.','las vegas',702/737-1234,asian,443
446 | 978,'margarita\'s mexican cantina','3120 las vegas blvd. s','las vegas',702/794-8200,mexican,444
447 | 979,'mary\'s diner','5111 w. boulder hwy.','las vegas',702/454-8073,'coffee shops/diners',445
448 | 980,mikado,'3400 las vegas blvd. s','las vegas',702/791-7111,asian,446
449 | 981,pamplemousse,'400 e. sahara ave.','las vegas',702/733-2066,continental,447
450 | 982,'ralph\'s diner','3000 las vegas blvd. s','las vegas',702/732-6330,'coffee shops/diners',448
451 | 983,'the bacchanal','3570 las vegas blvd. s','las vegas',702/731-7525,'only in las vegas',449
452 | 984,venetian,'3713 w. sahara ave.','las vegas',702/876-4190,italian,450
453 | 985,'viva mercado\'s','6182 w. flamingo rd.','las vegas',702/871-8826,mexican,451
454 | 986,'yolie\'s','3900 paradise rd.','las vegas',702/794-0700,'steak houses',452
455 | 987,2223,'2223 market st.','san francisco',415/431-0692,american,453
456 | 988,acquarello,'1722 sacramento st.','san francisco',415/567-5432,italian,454
457 | 989,'bardelli\'s','243 o\'farrell st.','san francisco',415/982-0243,'old san francisco',455
458 | 990,betelnut,'2030 union st.','san francisco',415/929-8855,asian,456
459 | 991,'bistro roti','155 steuart st.','san francisco',415/495-6500,french,457
460 | 992,bix,'56 gold st.','san francisco',415/433-6300,american,458
461 | 993,bizou,'598 fourth st.','san francisco',415/543-2222,french,459
462 | 994,'buca giovanni','800 greenwich st.','san francisco',415/776-7766,italian,460
463 | 995,'cafe adriano','3347 fillmore st.','san francisco',415/474-4180,italian,461
464 | 996,'cafe marimba','2317 chestnut st.','san francisco',415/776-1506,'mexican/latin american/spanish',462
465 | 997,'california culinary academy','625 polk st.','san francisco',415/771-3500,french,463
466 | 998,'capp\'s corner','1600 powell st.','san francisco',415/989-2589,italian,464
467 | 999,carta,'1772 market st.','san francisco',415/863-3516,american,465
468 | 1000,chevys,'4th and howard sts.','san francisco',415/543-8060,'mexican/latin american/spanish',466
469 | 1001,'cypress club','500 jackson st.','san francisco',415/296-8555,american,467
470 | 1002,'des alpes','732 broadway','san francisco',415/788-9900,french,468
471 | 1003,faz,'161 sutter st.','san francisco',415/362-0404,'greek and middle eastern',469
472 | 1004,'fog city diner','1300 battery st.','san francisco',415/982-2000,american,470
473 | 1005,'garden court','market and new montgomery sts.','san francisco',415/546-5011,'old san francisco',471
474 | 1006,'gaylord\'s','ghirardelli sq.','san francisco',415/771-8822,asian,472
475 | 1007,'grand cafe hotel monaco','501 geary st.','san francisco',415/292-0101,american,473
476 | 1008,greens,'bldg. a fort mason','san francisco',415/771-6222,vegetarian,474
477 | 1009,'harbor village','4 embarcadero center','san francisco',415/781-8833,asian,475
478 | 1010,'harris\'','2100 van ness ave.','san francisco',415/673-1888,'steak houses',476
479 | 1011,'harry denton\'s','161 steuart st.','san francisco',415/882-1333,american,477
480 | 1012,'hayes street grill','320 hayes st.','san francisco',415/863-5545,seafood,478
481 | 1013,helmand,'430 broadway','san francisco',415/362-0641,'greek and middle eastern',479
482 | 1014,'hong kong flower lounge','5322 geary blvd.','san francisco',415/668-8998,asian,480
483 | 1015,'hong kong villa','2332 clement st.','san francisco',415/752-8833,asian,481
484 | 1016,'hyde street bistro','1521 hyde st.','san francisco',415/441-7778,italian,482
485 | 1017,'il fornaio levi\'s plaza','1265 battery st.','san francisco',415/986-0100,italian,483
486 | 1018,'izzy\'s steak & chop house','3345 steiner st.','san francisco',415/563-0487,'steak houses',484
487 | 1019,'jack\'s','615 sacramento st.','san francisco',415/986-9854,'old san francisco',485
488 | 1020,'kabuto sushi','5116 geary blvd.','san francisco',415/752-5652,asian,486
489 | 1021,'katia\'s','600 5th ave.','san francisco',415/668-9292,'',487
490 | 1022,'kuleto\'s','221 powell st.','san francisco',415/397-7720,italian,488
491 | 1023,'kyo-ya. sheraton palace hotel','2 new montgomery st. at market st.','san francisco',415/546-5000,asian,489
492 | 1024,'l\'osteria del forno','519 columbus ave.','san francisco',415/982-1124,italian,490
493 | 1025,'le central','453 bush st.','san francisco',415/391-2233,french,491
494 | 1026,'le soleil','133 clement st.','san francisco',415/668-4848,asian,492
495 | 1027,'macarthur park','607 front st.','san francisco',415/398-5700,american,493
496 | 1028,manora,'3226 mission st.','san francisco',415/861-6224,asian,494
497 | 1029,maykadeh,'470 green st.','san francisco',415/362-8286,'greek and middle eastern',495
498 | 1030,'mccormick & kuleto\'s','ghirardelli sq.','san francisco',415/929-1730,seafood,496
499 | 1031,millennium,'246 mcallister st.','san francisco',415/487-9800,vegetarian,497
500 | 1032,'moose\'s','1652 stockton st.','san francisco',415/989-7800,mediterranean,498
501 | 1033,'north india','3131 webster st.','san francisco',415/931-1556,asian,499
502 | 1034,'one market','1 market st.','san francisco',415/777-5577,american,500
503 | 1035,oritalia,'1915 fillmore st.','san francisco',415/346-1333,italian,501
504 | 1036,'pacific pan pacific hotel','500 post st.','san francisco',415/929-2087,french,502
505 | 1037,'palio d\'asti','640 sacramento st.','san francisco',415/395-9800,italian,503
506 | 1038,'pane e vino','3011 steiner st.','san francisco',415/346-2111,italian,504
507 | 1039,pastis,'1015 battery st.','san francisco',415/391-2555,french,505
508 | 1040,'perry\'s','1944 union st.','san francisco',415/922-9022,american,506
509 | 1041,'r&g lounge','631 b kearny st.','san francisco',415/982-7877,'or 415/982-3811 asian',507
510 | 1042,rubicon,'558 sacramento st.','san francisco',415/434-4100,american,508
511 | 1043,rumpus,'1 tillman pl.','san francisco',415/421-2300,american,509
512 | 1044,sanppo,'1702 post st.','san francisco',415/346-3486,asian,510
513 | 1045,'scala\'s bistro','432 powell st.','san francisco',415/395-8555,italian,511
514 | 1046,'south park cafe','108 south park','san francisco',415/495-7275,french,512
515 | 1047,'splendido embarcadero',4,'san francisco',415/986-3222,mediterranean,513
516 | 1048,stars,'150 redwood alley','san francisco',415/861-7827,american,514
517 | 1049,'stars cafe','500 van ness ave.','san francisco',415/861-4344,american,515
518 | 1050,'stoyanof\'s cafe','1240 9th ave.','san francisco',415/664-3664,'greek and middle eastern',516
519 | 1051,'straits cafe','3300 geary blvd.','san francisco',415/668-1783,asian,517
520 | 1052,suppenkuche,'601 hayes st.','san francisco',415/252-9289,russian/german,518
521 | 1053,'tadich grill','240 california st.','san francisco',415/391-2373,seafood,519
522 | 1054,'the heights','3235 sacramento st.','san francisco',415/474-8890,french,520
523 | 1055,thepin,'298 gough st.','san francisco',415/863-9335,asian,521
524 | 1056,'ton kiang','3148 geary blvd.','san francisco',415/752-4440,asian,522
525 | 1057,vertigo,'600 montgomery st.','san francisco',415/433-7250,mediterranean,523
526 | 1058,'vivande porta via','2125 fillmore st.','san francisco',415/346-4430,italian,524
527 | 1059,'vivande ristorante','670 golden gate ave.','san francisco',415/673-9245,italian,525
528 | 1060,'world wrapps','2257 chestnut st.','san francisco',415/563-9727,american,526
529 | 1061,'wu kong','101 spear st.','san francisco',415/957-9300,asian,527
530 | 1062,'yank sing','427 battery st.','san francisco',415/541-4949,asian,528
531 | 1063,'yaya cuisine','1220 9th ave.','san francisco',415/566-6966,'greek and middle eastern',529
532 | 1064,'yoyo tsumami bistro','1611 post st.','san francisco',415/922-7788,french,530
533 | 1065,zarzuela,'2000 hyde st.','san francisco',415/346-0800,'mexican/latin american/spanish',531
534 | 1066,'zuni cafe & grill','1658 market st.','san francisco',415/552-2522,mediterranean,532
535 |
--------------------------------------------------------------------------------
/datasets/fodors_zagats/matches_fodors_zagats.csv:
--------------------------------------------------------------------------------
1 | fodors_id,zagats_id
2 | 534,219
3 | 535,220
4 | 536,221
5 | 537,222
6 | 538,223
7 | 539,224
8 | 540,225
9 | 541,226
10 | 542,227
11 | 543,228
12 | 544,229
13 | 545,230
14 | 546,231
15 | 547,232
16 | 548,233
17 | 549,234
18 | 550,235
19 | 551,236
20 | 552,237
21 | 553,238
22 | 554,239
23 | 555,240
24 | 556,241
25 | 557,242
26 | 558,243
27 | 559,244
28 | 560,245
29 | 561,246
30 | 562,247
31 | 563,248
32 | 564,249
33 | 565,250
34 | 566,251
35 | 567,252
36 | 568,253
37 | 569,254
38 | 570,255
39 | 571,256
40 | 572,257
41 | 573,258
42 | 574,259
43 | 575,260
44 | 576,261
45 | 577,262
46 | 578,263
47 | 579,264
48 | 580,265
49 | 581,266
50 | 582,267
51 | 583,268
52 | 584,269
53 | 585,270
54 | 586,271
55 | 587,272
56 | 588,273
57 | 589,274
58 | 590,275
59 | 591,276
60 | 592,277
61 | 593,278
62 | 594,279
63 | 595,280
64 | 596,281
65 | 597,282
66 | 598,283
67 | 599,284
68 | 600,285
69 | 601,286
70 | 602,287
71 | 603,288
72 | 604,289
73 | 605,290
74 | 606,291
75 | 607,292
76 | 608,293
77 | 609,294
78 | 610,295
79 | 611,296
80 | 612,297
81 | 613,298
82 | 614,299
83 | 615,300
84 | 616,301
85 | 617,302
86 | 618,303
87 | 619,304
88 | 620,305
89 | 621,306
90 | 622,307
91 | 623,308
92 | 624,309
93 | 625,310
94 | 626,311
95 | 627,312
96 | 628,313
97 | 629,314
98 | 630,315
99 | 631,316
100 | 632,317
101 | 633,318
102 | 634,319
103 | 635,320
104 | 636,321
105 | 637,322
106 | 638,323
107 | 639,324
108 | 640,325
109 | 641,326
110 | 642,327
111 | 643,328
112 | 644,329
113 | 645,330
114 |
--------------------------------------------------------------------------------
/datasets/fodors_zagats/metadata.txt:
--------------------------------------------------------------------------------
1 | fodors.csv
2 | zagats.csv
3 | matches_fodors_zagats.csv
4 |
--------------------------------------------------------------------------------
/datasets/fodors_zagats/zagats.csv:
--------------------------------------------------------------------------------
1 | id,name,addr,city,phone,type,class
2 | 1,'apple pan the','10801 w. pico blvd.','west la',310-475-3585,american,534
3 | 2,'asahi ramen','2027 sawtelle blvd.','west la',310-479-2231,'noodle shops',535
4 | 3,'baja fresh','3345 kimber dr.','westlake village',805-498-4049,mexican,536
5 | 4,'belvedere the','9882 little santa monica blvd.','beverly hills',310-788-2306,'pacific new wave',537
6 | 5,'benita\'s frites','1433 third st. promenade','santa monica',310-458-2889,'fast food',538
7 | 6,'bernard\'s','515 s. olive st.','los angeles',213-612-1580,continental,539
8 | 7,'bistro 45','45 s. mentor ave.',pasadena,818-795-2478,californian,540
9 | 8,'brent\'s deli','19565 parthenia ave.',northridge,818-886-5679,delis,541
10 | 9,'brighton coffee shop','9600 brighton way','beverly hills',310-276-7732,'coffee shops',542
11 | 10,'bristol farms market cafe','1570 rosecrans ave. s.',pasadena,310-643-5229,californian,543
12 | 11,'bruno\'s','3838 centinela ave.','mar vista',310-397-5703,italian,544
13 | 12,'cafe \'50s','838 lincoln blvd.',venice,310-399-1955,american,545
14 | 13,'cafe blanc','9777 little santa monica blvd.','beverly hills',310-888-0108,'pacific new wave',546
15 | 14,'cassell\'s','3266 w. sixth st.',la,213-480-8668,hamburgers,547
16 | 15,'chez melange','1716 pch','redondo beach',310-540-1222,eclectic,548
17 | 16,diaghilev,'1020 n. san vicente blvd.','w. hollywood',310-854-1111,russian,549
18 | 17,'don antonio\'s','1136 westwood blvd.',westwood,310-209-1422,italian,550
19 | 18,'duke\'s','8909 sunset blvd.','w. hollywood',310-652-3100,'coffee shops',551
20 | 19,'falafel king','1059 broxton ave.',westwood,310-208-4444,'middle eastern',552
21 | 20,'feast from the east','1949 westwood blvd.','west la',310-475-0400,chinese,553
22 | 21,'gumbo pot the','6333 w. third st.',la,213-933-0358,cajun/creole,554
23 | 22,'hollywood hills coffee shop','6145 franklin ave.',hollywood,213-467-7678,'coffee shops',555
24 | 23,'indo cafe','10428 1/2 national blvd.',la,310-815-1290,indonesian,556
25 | 24,'jan\'s family restaurant','8424 beverly blvd.',la,213-651-2866,'coffee shops',557
26 | 25,jiraffe,'502 santa monica blvd','santa monica',310-917-6671,californian,558
27 | 26,'jody maroni\'s sausage kingdom','2011 ocean front walk',venice,310-306-1995,'hot dogs',559
28 | 27,'joe\'s','1023 abbot kinney blvd.',venice,310-399-5811,'american (new)',560
29 | 28,'john o\'groats','10516 w. pico blvd.','west la',310-204-0692,'coffee shops',561
30 | 29,'johnnie\'s pastrami','4017 s. sepulveda blvd.','culver city',310-397-6654,delis,562
31 | 30,'johnny reb\'s southern smokehouse','4663 long beach blvd.','long beach',310-423-7327,southern/soul,563
32 | 31,'johnny rockets (la)','7507 melrose ave.',la,213-651-3361,american,564
33 | 32,'killer shrimp','4000 colfax ave.','studio city',818-508-1570,seafood,565
34 | 33,'kokomo cafe','6333 w. third st.',la,213-933-0773,american,566
35 | 34,'koo koo roo','8393 w. beverly blvd.',la,213-655-9045,chicken,567
36 | 35,'la cachette','10506 little santa monica blvd.','century city',310-470-4992,'french (new)',568
37 | 36,'la salsa (la)','22800 pch',malibu,310-456-6299,mexican,569
38 | 37,'la serenata de garibaldi','1842 e. first','st. boyle hts.',213-265-2887,mexican/tex-mex,570
39 | 38,'langer\'s','704 s. alvarado st.',la,213-483-8050,delis,571
40 | 39,'local nochol','30869 thousand oaks blvd.','westlake village',818-706-7706,'health food',572
41 | 40,'main course the','10509 w. pico blvd.','rancho park',310-475-7564,american,573
42 | 41,'mani\'s bakery & espresso bar','519 s. fairfax ave.',la,213-938-8800,desserts,574
43 | 42,'martha\'s','22nd street grill 25 22nd','st. hermosa beach',310-376-7786,american,575
44 | 43,'maxwell\'s cafe','13329 washington blvd.','marina del rey',310-306-7829,american,576
45 | 44,'michael\'s (los angeles)','1147 third st.','santa monica',310-451-0843,californian,577
46 | 45,mishima,'8474 w. third st.',la,213-782-0181,'noodle shops',578
47 | 46,'mo better meatty meat','7261 melrose ave.',la,213-935-5280,hamburgers,579
48 | 47,'mulberry st.','17040 ventura blvd.',encino,818-906-8881,pizza,580
49 | 48,'ocean park cafe','3117 ocean park blvd.','santa monica',310-452-5728,american,581
50 | 49,'ocean star','145 n. atlantic blvd.','monterey park',818-308-2128,seafood,582
51 | 50,'original pantry bakery','875 s. figueroa st. downtown',la,213-627-6879,diners,583
52 | 51,'parkway grill','510 s. arroyo pkwy.',pasadena,818-795-1001,californian,584
53 | 52,'pho hoa','642 broadway',chinatown,213-626-5530,vietnamese,585
54 | 53,'pink\'s famous chili dogs','709 n. la brea ave.',la,213-931-4223,'hot dogs',586
55 | 54,'poquito mas','2635 w. olive ave.',burbank,818-563-2252,mexican,587
56 | 55,r-23,'923 e. third st.','los angeles',213-687-7178,japanese,588
57 | 56,'rae\'s','2901 pico blvd.','santa monica',310-828-7937,diners,589
58 | 57,'rubin\'s red hots','15322 ventura blvd.',encino,818-905-6515,'hot dogs',590
59 | 58,'ruby\'s (la)','45 s. fair oaks ave.',pasadena,818-796-7829,diners,591
60 | 59,'russell\'s burgers','1198 pch','seal beach',310-596-9556,hamburgers,592
61 | 60,'ruth\'s chris steak house (los angeles)','224 s. beverly dr.','beverly hills',310-859-8744,steakhouses,593
62 | 61,shiro,'1505 mission st. s.',pasadena,818-799-4774,'pacific new wave',594
63 | 62,'sushi nozawa','11288 ventura blvd.','studio city',818-508-7017,japanese,595
64 | 63,'sweet lady jane','8360 melrose ave.',la,213-653-7145,desserts,596
65 | 64,taiko,'11677 san vicente blvd.',brentwood,310-207-7782,'noodle shops',597
66 | 65,'tommy\'s','2575 beverly blvd.',la,213-389-9060,hamburgers,598
67 | 66,'uncle bill\'s pancake house','1305 highland ave.','manhattan beach',310-545-5177,diners,599
68 | 67,'water grill','544 s. grand ave.','los angeles',213-891-0900,seafood,600
69 | 68,'zankou chicken','1415 e. colorado st.',glendale,818-244-1937,'middle eastern',601
70 | 69,'afghan kebab house','764 ninth ave.','new york city',212-307-1612,afghan,602
71 | 70,arcadia,'21 e. 62nd st.','new york city',212-223-2900,'american (new)',603
72 | 71,'benny\'s burritos','93 ave. a','new york city',212-254-2054,mexican,604
73 | 72,'cafe con leche','424 amsterdam ave.','new york city',212-595-7000,cuban,605
74 | 73,'corner bistro','331 w. fourth st.','new york city',212-242-9502,hamburgers,606
75 | 74,'cucina della fontana','368 bleecker st.','new york city',212-242-0636,italian,607
76 | 75,'cucina di pesce','87 e. fourth st.','new york city',212-260-6800,seafood,608
77 | 76,darbar,'44 w. 56th st.','new york city',212-432-7227,indian,609
78 | 77,'ej\'s luncheonette','432 sixth ave.','new york city',212-473-5555,diners,610
79 | 78,'edison cafe','228 w. 47th st.','new york city',212-840-5000,diners,611
80 | 79,'elias corner','24-02 31st st.',queens,718-932-1510,greek,612
81 | 80,'good enough to eat','483 amsterdam ave.','new york city',212-496-0163,american,613
82 | 81,'gray\'s papaya','2090 broadway','new york city',212-799-0243,'hot dogs',614
83 | 82,'il mulino','86 w. third st.','new york city',212-673-3783,italian,615
84 | 83,'jackson diner','37-03 74th st.',queens,718-672-1232,indian,616
85 | 84,'joe\'s shanghai','9 pell st.',queens,718-539-3838,chinese,617
86 | 85,'john\'s pizzeria','48 w. 65th st.','new york city',212-721-7001,pizza,618
87 | 86,'kelley & ping','127 greene st.','new york city',212-228-1212,pan-asian,619
88 | 87,kiev,'117 second ave.','new york city',212-674-4040,ukrainian,620
89 | 88,'kuruma zushi','2nd fl.','new york city',212-317-2802,japanese,621
90 | 89,'la caridad','2199 broadway','new york city',212-874-2780,cuban,622
91 | 90,'la grenouille','3 e. 52nd st.','new york city',212-752-1495,'french (classic)',623
92 | 91,'lemongrass grill','61a seventh ave.',brooklyn,718-399-7100,thai,624
93 | 92,'lombardi\'s','32 spring st.','new york city',212-941-7994,pizza,625
94 | 93,'marnie\'s noodle shop','466 hudson st.','new york city',212-741-3214,asian,626
95 | 94,menchanko-tei,'39 w. 55th st.','new york city',212-247-1585,japanese,627
96 | 95,'mitali east-west','296 bleecker st.','new york city',212-989-1367,indian,628
97 | 96,'monsoon (ny)','435 amsterdam ave.','new york city',212-580-8686,thai,629
98 | 97,moustache,'405 atlantic ave.',brooklyn,718-852-5555,'middle eastern',630
99 | 98,nobu,'105 hudson st.','new york city',212-219-0500,japanese,631
100 | 99,'one if by land tibs','17 barrow st.','new york city',212-228-0822,continental,632
101 | 100,'oyster bar','lower level','new york city',212-490-6650,seafood,633
102 | 101,palm,'837 second ave.','new york city',212-687-2953,steakhouses,634
103 | 102,'palm too','840 second ave.','new york city',212-697-5198,steakhouses,635
104 | 103,'patsy\'s pizza','19 old fulton st.',brooklyn,718-858-4300,pizza,636
105 | 104,'peter luger steak house','178 broadway',brooklyn,718-387-7400,steakhouses,637
106 | 105,'rose of india','308 e. sixth st.','new york city',212-533-5011,indian,638
107 | 106,'sam\'s noodle shop','411 third ave.','new york city',212-213-2288,chinese,639
108 | 107,'sarabeth\'s','1295 madison ave.','new york city',212-410-7335,american,640
109 | 108,'sparks steak house','210 e. 46th st.','new york city',212-687-4855,steakhouses,641
110 | 109,'stick to your ribs','5-16 51st ave.',queens,718-937-3030,bbq,642
111 | 110,sushisay,'38 e. 51st st.','new york city',212-755-1780,japanese,643
112 | 111,'sylvia\'s','328 lenox ave.','new york city',212-996-0660,southern/soul,644
113 | 112,'szechuan hunan cottage','1588 york ave.','new york city',212-535-5223,chinese,645
114 | 113,'szechuan kitchen','1460 first ave.','new york city',212-249-4615,chinese,646
115 | 114,'teresa\'s','80 montague st.',queens,718-520-2910,polish,647
116 | 115,'thai house cafe','151 hudson st.','new york city',212-334-1085,thai,648
117 | 116,'thailand restaurant','106 bayard st.','new york city',212-349-3132,thai,649
118 | 117,veselka,'144 second ave.','new york city',212-228-9682,ukrainian,650
119 | 118,'westside cottage','689 ninth ave.','new york city',212-245-0800,chinese,651
120 | 119,'windows on the world','107th fl.','new york city',212-524-7000,eclectic,652
121 | 120,'wollensky\'s grill','205 e. 49th st.','new york city',212-753-0444,steakhouses,653
122 | 121,yama,'122 e. 17th st.','new york city',212-475-0969,japanese,654
123 | 122,zarela,'953 second ave.','new york city',212-644-6740,mexican,655
124 | 123,'andre\'s french restaurant','401 s. 6th st.','las vegas',702-385-5016,'french (classic)',656
125 | 124,'buccaneer bay club','3300 las vegas blvd. s.','las vegas',702-894-7350,continental,657
126 | 125,'buzio\'s in the rio','3700 w. flamingo rd.','las vegas',702-252-7697,seafood,658
127 | 126,'emeril\'s new orleans fish house','3799 las vegas blvd. s.','las vegas',702-891-7374,seafood,659
128 | 127,'fiore rotisserie & grille','3700 w. flamingo rd.','las vegas',702-252-7702,italian,660
129 | 128,'hugo\'s cellar','202 e. fremont st.','las vegas',702-385-4011,continental,661
130 | 129,'madame ching\'s','3300 las vegas blvd. s.','las vegas',702-894-7111,asian,662
131 | 130,'mayflower cuisinier','4750 w. sahara ave.','las vegas',702-870-8432,chinese,663
132 | 131,'michael\'s (las vegas)','3595 las vegas blvd. s.','las vegas',702-737-7111,continental,664
133 | 132,'monte carlo','3145 las vegas blvd. s.','las vegas',702-733-4524,'french (new)',665
134 | 133,moongate,'3400 las vegas blvd. s.','las vegas',702-791-7352,chinese,666
135 | 134,'morton\'s of chicago (las vegas)','3200 las vegas blvd. s.','las vegas',702-893-0703,steakhouses,667
136 | 135,'nicky blair\'s','3925 paradise rd.','las vegas',702-792-9900,italian,668
137 | 136,'piero\'s restaurant','355 convention center dr.','las vegas',702-369-2305,italian,669
138 | 137,'spago (las vegas)','3500 las vegas blvd. s.','las vegas',702-369-6300,californian,670
139 | 138,'steakhouse the','128 e. fremont st.','las vegas',702-382-1600,steakhouses,671
140 | 139,'stefano\'s','129 fremont st.','las vegas',702-385-7111,italian,672
141 | 140,'sterling brunch','3645 las vegas blvd. s.','las vegas',702-739-4651,eclectic,673
142 | 141,'tre visi','3799 las vegas blvd. s.','las vegas',702-891-7331,italian,674
143 | 142,'103 west','103 w. paces ferry rd.',atlanta,404-233-5993,continental,675
144 | 143,'alon\'s at the terrace','659 peachtree st.',atlanta,404-724-0444,sandwiches,676
145 | 144,'baker\'s cajun cafe','1134 euclid ave.',atlanta,404-223-5039,cajun/creole,677
146 | 145,'barbecue kitchen','1437 virginia ave.',atlanta,404-766-9906,bbq,678
147 | 146,'bistro the','56 e. andrews dr. nw',atlanta,404-231-5733,'french bistro',679
148 | 147,'bobby & june\'s kountry kitchen','375 14th st.',atlanta,404-876-3872,southern/soul,680
149 | 148,'bradshaw\'s restaurant','2911 s. pharr court',atlanta,404-261-7015,southern/soul,681
150 | 149,'brookhaven cafe','4274 peachtree rd.',atlanta,404-231-5907,vegetarian,682
151 | 150,'cafe sunflower','5975 roswell rd.',atlanta,404-256-1675,'health food',683
152 | 151,canoe,'4199 paces ferry rd.',atlanta,770-432-2663,'american (new)',684
153 | 152,'carey\'s','1021 cobb pkwy. se',marietta,770-422-8042,hamburgers,685
154 | 153,'carey\'s corner','1215 powers ferry rd.',marietta,770-933-0909,hamburgers,686
155 | 154,chops,'70 w. paces ferry rd.',atlanta,404-262-2675,steakhouses,687
156 | 155,chopstix,'4279 roswell rd.',atlanta,404-255-4868,chinese,688
157 | 156,'deacon burton\'s soulfood restaurant','1029 edgewood ave. se',atlanta,404-523-1929,southern/soul,689
158 | 157,eats,'600 ponce de leon ave.',atlanta,404-888-9149,italian,690
159 | 158,'flying biscuit the','1655 mclendon ave.',atlanta,404-687-8888,eclectic,691
160 | 159,frijoleros,'1031 peachtree st. ne',atlanta,404-892-8226,tex-mex,692
161 | 160,'greenwood\'s','1087 green st.',roswell,770-992-5383,southern/soul,693
162 | 161,'harold\'s barbecue','171 mcdonough blvd.',atlanta,404-627-9268,bbq,694
163 | 162,'havana sandwich shop','2905 buford hwy.',atlanta,404-636-4094,cuban,695
164 | 163,'house of chan','2469 cobb pkwy.',smyrna,770-955-9444,chinese,696
165 | 164,'indian delights','3675 satellite blvd.',duluth,100-813-8212,indian,697
166 | 165,'java jive','790 ponce de leon ave.',atlanta,404-876-6161,'coffee shops',698
167 | 166,'johnny rockets (at)','2970 cobb pkwy.',atlanta,770-955-6068,american,699
168 | 167,'kalo\'s coffee house','1248 clairmont rd.',decatur,404-325-3733,coffeehouses,700
169 | 168,'la fonda latina','4427 roswell rd.',atlanta,404-303-8201,spanish,701
170 | 169,'lettuce souprise you (at)','3525 mall blvd.',duluth,770-418-9969,cafeterias,702
171 | 170,majestic,'1031 ponce de leon ave.',atlanta,404-875-0276,diners,703
172 | 171,'morton\'s of chicago (atlanta)','303 peachtree st. ne',atlanta,404-577-4366,steakhouses,704
173 | 172,'my thai','1248 clairmont rd.',atlanta,404-636-4280,thai,705
174 | 173,nava,'3060 peachtree rd.',atlanta,404-240-1984,southwestern,706
175 | 174,'nuevo laredo cantina','1495 chattahoochee ave. nw',atlanta,404-352-9009,mexican,707
176 | 175,'original pancake house (at)','4330 peachtree rd.',atlanta,404-237-4116,american,708
177 | 176,'palm the (atlanta)','3391 peachtree rd. ne',atlanta,404-814-1955,steakhouses,709
178 | 177,'rainbow restaurant','2118 n. decatur rd.',decatur,404-633-3538,vegetarian,710
179 | 178,riviera,'519 e. paces ferry rd.',atlanta,404-262-7112,mediterranean,712
180 | 179,'silver skillet the','200 14th st. nw',atlanta,404-874-1388,'coffee shops',713
181 | 180,soto,'3330 piedmont rd.',atlanta,404-233-2005,japanese,714
182 | 181,'thelma\'s kitchen','764 marietta st. nw',atlanta,404-688-5855,cafeterias,715
183 | 182,tortillas,'774 ponce de leon ave. ne',atlanta,404-892-0193,tex-mex,716
184 | 183,'van gogh\'s restaurant & bar','70 w. crossville rd.',roswell,770-993-1156,'american (new)',717
185 | 184,veggieland,'220 sandy springs circle',atlanta,404-231-3111,vegetarian,718
186 | 185,'white house restaurant','3172 peachtree rd. ne',atlanta,404-237-7601,diners,719
187 | 186,zab-e-lee,'4837 old national hwy.','college park',404-768-2705,thai,720
188 | 187,'bill\'s place','2315 clement st.','san francisco',415-221-5262,hamburgers,721
189 | 188,'cafe flore','2298 market st.','san francisco',415-621-8579,californian,722
190 | 189,'caffe greco','423 columbus ave.','san francisco',415-397-6261,continental,723
191 | 190,'campo santo','240 columbus ave.','san francisco',415-433-9623,mexican,724
192 | 191,'cha cha cha\'s','1805 haight st.','san francisco',415-386-5758,caribbean,725
193 | 192,'doidge\'s','2217 union st.','san francisco',415-921-2149,american,726
194 | 193,'dottie\'s true blue cafe','522 jones st.','san francisco',415-885-2767,diners,727
195 | 194,'dusit thai','3221 mission st.','san francisco',415-826-4639,thai,728
196 | 195,ebisu,'1283 ninth ave.','san francisco',415-566-1770,japanese,729
197 | 196,'emerald garden restaurant','1550 california st.','san francisco',415-673-1155,vietnamese,730
198 | 197,'eric\'s chinese restaurant','1500 church st.','san francisco',415-282-0919,chinese,731
199 | 198,'hamburger mary\'s','1582 folsom st.','san francisco',415-626-1985,hamburgers,732
200 | 199,'kelly\'s on trinity','333 bush st.','san francisco',415-362-4454,californian,733
201 | 200,'la cumbre','515 valencia st.','san francisco',415-863-8205,mexican,734
202 | 201,'la mediterranee','288 noe st.','san francisco',415-431-7210,mediterranean,735
203 | 202,'la taqueria','2889 mission st.','san francisco',415-285-7117,mexican,736
204 | 203,'mario\'s bohemian cigar store cafe','2209 polk st.','san francisco',415-776-8226,italian,737
205 | 204,'marnee thai','2225 irving st.','san francisco',415-665-9500,thai,738
206 | 205,'mel\'s drive-in','3355 geary st.','san francisco',415-387-2244,hamburgers,739
207 | 206,'mo\'s burgers','1322 grant st.','san francisco',415-788-3779,hamburgers,740
208 | 207,'phnom penh cambodian restaurant','631 larkin st.','san francisco',415-775-5979,cambodian,741
209 | 208,'roosevelt tamale parlor','2817 24th st.','san francisco',415-550-9213,mexican,742
210 | 209,'sally\'s cafe & bakery','300 de haro st.','san francisco',415-626-6006,american,743
211 | 210,'san francisco bbq','1328 18th st.','san francisco',415-431-8956,thai,744
212 | 211,'slanted door','584 valencia st.','san francisco',415-861-8032,vietnamese,745
213 | 212,'swan oyster depot','1517 polk st.','san francisco',415-673-1101,seafood,746
214 | 213,'thep phanom','400 waller st.','san francisco',415-431-2526,thai,747
215 | 214,'ti couz','3108 16th st.','san francisco',415-252-7373,french,748
216 | 215,'trio cafe','1870 fillmore st.','san francisco',415-563-2248,american,749
217 | 216,'tu lan','8 sixth st.','san francisco',415-626-0927,vietnamese,750
218 | 217,'vicolo pizzeria','201 ivy st.','san francisco',415-863-2382,pizza,751
219 | 218,'wa-ha-ka oaxaca mexican grill','2141 polk st.','san francisco',415-775-1055,mexican,752
220 | 219,'arnie morton\'s of chicago','435 s. la cienega blvd.','los angeles',310-246-1501,steakhouses,0
221 | 220,'art\'s deli','12224 ventura blvd.','studio city',818-762-1221,delis,1
222 | 221,'bel-air hotel','701 stone canyon rd.','bel air',310-472-1211,californian,2
223 | 222,'cafe bizou','14016 ventura blvd.','sherman oaks',818-788-3536,'french bistro',3
224 | 223,campanile,'624 s. la brea ave.','los angeles',213-938-1447,californian,4
225 | 224,'chinois on main','2709 main st.','santa monica',310-392-9025,'pacific new wave',5
226 | 225,citrus,'6703 melrose ave.','los angeles',213-857-0034,californian,6
227 | 226,'fenix at the argyle','8358 sunset blvd.','w. hollywood',213-848-6677,'french (new)',7
228 | 227,granita,'23725 w. malibu rd.',malibu,310-456-0488,californian,8
229 | 228,'grill the','9560 dayton way','beverly hills',310-276-0615,'american (traditional)',9
230 | 229,katsu,'1972 hillhurst ave.','los feliz',213-665-1891,japanese,10
231 | 230,'l\'orangerie','903 n. la cienega blvd.','w. hollywood',310-652-9770,'french (classic)',11
232 | 231,'le chardonnay (los angeles)','8284 melrose ave.','los angeles',213-655-8880,'french bistro',12
233 | 232,'locanda veneta','8638 w. third st.','los angeles',310-274-1893,italian,13
234 | 233,matsuhisa,'129 n. la cienega blvd.','beverly hills',310-659-9639,seafood,14
235 | 234,'palm the (los angeles)','9001 santa monica blvd.','w. hollywood',310-550-8811,steakhouses,15
236 | 235,patina,'5955 melrose ave.','los angeles',213-467-1108,californian,16
237 | 236,'philippe the original','1001 n. alameda st.',chinatown,213-628-3781,cafeterias,17
238 | 237,'pinot bistro','12969 ventura blvd.','studio city',818-990-0500,'french bistro',18
239 | 238,'rex il ristorante','617 s. olive st.','los angeles',213-627-2300,'nuova cucina italian',19
240 | 239,'spago (los angeles)','8795 sunset blvd.','w. hollywood',310-652-4025,californian,20
241 | 240,valentino,'3115 pico blvd.','santa monica',310-829-4313,italian,21
242 | 241,'yujean kang\'s','67 n. raymond ave.',pasadena,818-585-0855,chinese,22
243 | 242,'21 club','21 w. 52nd st.','new york city',212-582-7200,'american (new)',23
244 | 243,aquavit,'13 w. 54th st.','new york city',212-307-7311,scandinavian,24
245 | 244,aureole,'34 e. 61st st.','new york city',212-319-1660,'american (new)',25
246 | 245,'cafe lalo','201 w. 83rd st.','new york city',212-496-6031,coffeehouses,26
247 | 246,'cafe des artistes','1 w. 67th st.','new york city',212-877-3500,'french (classic)',27
248 | 247,'carmine\'s','2450 broadway','new york city',212-362-2200,italian,28
249 | 248,'carnegie deli','854 seventh ave.','new york city',212-757-2245,delis,29
250 | 249,chanterelle,'2 harrison st.','new york city',212-966-6960,'french (new)',30
251 | 250,daniel,'20 e. 76th st.','new york city',212-288-0033,'french (new)',31
252 | 251,dawat,'210 e. 58th st.','new york city',212-355-7555,indian,32
253 | 252,felidia,'243 e. 58th st.','new york city',212-758-1479,italian,33
254 | 253,'four seasons','99 e. 52nd st.','new york city',212-754-9494,'american (new)',34
255 | 254,'gotham bar & grill','12 e. 12th st.','new york city',212-620-4020,'american (new)',35
256 | 255,'gramercy tavern','42 e. 20th st.','new york city',212-477-0777,'american (new)',36
257 | 256,'island spice','402 w. 44th st.','new york city',212-765-1737,caribbean,37
258 | 257,'jo jo','160 e. 64th st.','new york city',212-223-5656,'french bistro',38
259 | 258,'la caravelle','33 w. 55th st.','new york city',212-586-4252,'french (classic)',39
260 | 259,'la cote basque','60 w. 55th st.','new york city',212-688-6525,'french (classic)',40
261 | 260,'le bernardin','155 w. 51st st.','new york city',212-489-1515,seafood,41
262 | 261,'les celebrites','155 w. 58th st.','new york city',212-484-5113,'french (classic)',42
263 | 262,'lespinasse (new york city)','2 e. 55th st.','new york city',212-339-6719,asian,43
264 | 263,lutece,'249 e. 50th st.','new york city',212-752-2225,'french (classic)',44
265 | 264,'manhattan ocean club','57 w. 58th st.','new york city',212-371-7777,seafood,45
266 | 265,march,'405 e. 58th st.','new york city',212-754-6272,'american (new)',46
267 | 266,'mesa grill','102 fifth ave.','new york city',212-807-7400,southwestern,47
268 | 267,'mi cocina','57 jane st.','new york city',212-627-8273,mexican,48
269 | 268,montrachet,'239 w. broadway','new york city',212-219-2777,'french bistro',49
270 | 269,oceana,'55 e. 54th st.','new york city',212-759-5941,seafood,50
271 | 270,'park avenue cafe (new york city)','100 e. 63rd st.','new york city',212-644-1900,'american (new)',51
272 | 271,petrossian,'182 w. 58th st.','new york city',212-245-2214,russian,52
273 | 272,picholine,'35 w. 64th st.','new york city',212-724-8585,mediterranean,53
274 | 273,pisces,'95 ave. a','new york city',212-260-6660,seafood,54
275 | 274,'rainbow room','30 rockefeller plaza','new york city',212-632-5000,'american (new)',55
276 | 275,'river cafe','1 water st.',brooklyn,718-522-5200,'american (new)',56
277 | 276,'san domenico','240 central park s.','new york city',212-265-5959,italian,57
278 | 277,'second avenue deli','156 second ave.','new york city',212-677-0606,delis,58
279 | 278,seryna,'11 e. 53rd st.','new york city',212-980-9393,japanese,59
280 | 279,'shun lee palace','155 e. 55th st.','new york city',212-371-8844,chinese,60
281 | 280,'sign of the dove','1110 third ave.','new york city',212-861-8080,'american (new)',61
282 | 281,'smith & wollensky','797 third ave.','new york city',212-753-1530,steakhouses,62
283 | 282,'tavern on the green','central park west','new york city',212-873-3200,'american (new)',63
284 | 283,'uncle nick\'s','747 ninth ave.','new york city',212-245-7992,greek,64
285 | 284,'union square cafe','21 e. 16th st.','new york city',212-243-4020,'american (new)',65
286 | 285,'virgil\'s real bbq','152 w. 44th st.','new york city',212-921-9494,bbq,66
287 | 286,'chin\'s','3200 las vegas blvd. s.','las vegas',702-733-8899,chinese,67
288 | 287,'coyote cafe (las vegas)','3799 las vegas blvd. s.','las vegas',702-891-7349,southwestern,68
289 | 288,'le montrachet bistro','3000 paradise rd.','las vegas',702-732-5651,'french bistro',69
290 | 289,'palace court','3570 las vegas blvd. s.','las vegas',702-731-7110,'french (new)',70
291 | 290,'second street grill','200 e. fremont st.','las vegas',702-385-6277,'pacific rim',71
292 | 291,'steak house the','2880 las vegas blvd. s.','las vegas',702-734-0410,steakhouses,72
293 | 292,'tillerman the','2245 e. flamingo rd.','las vegas',702-731-4036,steakhouses,73
294 | 293,abruzzi,'2355 peachtree rd. ne',atlanta,404-261-8186,italian,74
295 | 294,bacchanalia,'3125 piedmont rd.',atlanta,404-365-0410,californian,75
296 | 295,'bone\'s restaurant','3130 piedmont rd. ne',atlanta,404-237-2663,steakhouses,76
297 | 296,'brasserie le coze','3393 peachtree rd.',atlanta,404-266-1440,'french bistro',77
298 | 297,'buckhead diner','3073 piedmont rd.',atlanta,404-262-3336,'american (new)',78
299 | 298,'ciboulette restaurant','1529 piedmont ave.',atlanta,404-874-7600,'french (new)',79
300 | 299,delectables,'1 margaret mitchell sq.',atlanta,404-681-2909,cafeterias,80
301 | 300,'georgia grille','2290 peachtree rd.',atlanta,404-352-3517,southwestern,81
302 | 301,'hedgerose heights inn the','490 e. paces ferry rd. ne',atlanta,404-233-7673,continental,82
303 | 302,'heera of india','595 piedmont ave.',atlanta,404-876-4408,indian,83
304 | 303,'indigo coastal grill','1397 n. highland ave.',atlanta,404-876-0676,eclectic,84
305 | 304,'la grotta','2637 peachtree rd. ne',atlanta,404-231-1368,italian,85
306 | 305,'mary mac\'s tea room','224 ponce de leon ave.',atlanta,404-876-1800,southern/soul,86
307 | 306,'nikolai\'s roof','255 courtland st.',atlanta,404-221-6362,continental,87
308 | 307,'pano\'s & paul\'s','1232 w. paces ferry rd.',atlanta,404-261-3662,'american (new)',88
309 | 308,'ritz-carlton cafe (buckhead)','3434 peachtree rd. ne',atlanta,404-237-2700,'american (new)',89
310 | 309,'ritz-carlton dining room (buckhead)','3434 peachtree rd. ne',atlanta,404-237-2700,'american (new)',90
311 | 310,'ritz-carlton restaurant','181 peachtree st.',atlanta,404-659-0400,'french (classic)',91
312 | 311,toulouse,'293-b peachtree rd.',atlanta,404-351-9533,'french (new)',92
313 | 312,'veni vidi vici','41 14th st.',atlanta,404-875-8424,italian,93
314 | 313,'alain rondelli','126 clement st.','san francisco',415-387-0408,'french (new)',94
315 | 314,aqua,'252 california st.','san francisco',415-956-9662,'american (new)',95
316 | 315,boulevard,'1 mission st.','san francisco',415-543-6084,'american (new)',96
317 | 316,'cafe claude','7 claude ln.','san francisco',415-392-3505,'french bistro',97
318 | 317,'campton place','340 stockton st.','san francisco',415-955-5555,'american (new)',98
319 | 318,'chez michel','804 north point st.','san francisco',415-775-7036,californian,99
320 | 319,'fleur de lys','777 sutter st.','san francisco',415-673-7779,'french (new)',100
321 | 320,fringale,'570 fourth st.','san francisco',415-543-0573,'french bistro',101
322 | 321,'hawthorne lane','22 hawthorne st.','san francisco',415-777-9779,californian,102
323 | 322,'khan toke thai house','5937 geary blvd.','san francisco',415-668-6654,thai,103
324 | 323,'la folie','2316 polk st.','san francisco',415-776-5577,'french (new)',104
325 | 324,'lulu restaurant-bis-cafe','816 folsom st.','san francisco',415-495-5775,mediterranean,105
326 | 325,'masa\'s','648 bush st.','san francisco',415-989-7154,'french (new)',106
327 | 326,mifune,'1737 post st.','san francisco',415-922-0337,japanese,107
328 | 327,'plumpjack cafe','3127 fillmore st.','san francisco',415-563-4755,'american (new)',108
329 | 328,postrio,'545 post st.','san francisco',415-776-7825,californian,109
330 | 329,'ritz-carlton dining room (san francisco)','600 stockton st.','san francisco',415-296-7465,'french (new)',110
331 | 330,'rose pistola','532 columbus ave.','san francisco',415-399-0499,italian,111
332 | 331,'ritz-carlton cafe (atlanta)','181 peachtree st.',atlanta,404-659-0400,'american (new)',711
333 |
--------------------------------------------------------------------------------
/datasets/fodors_zagats_single/matches_fodors_zagats.csv:
--------------------------------------------------------------------------------
1 | l_id,r_id
2 | 534,219
3 | 535,220
4 | 536,221
5 | 537,222
6 | 538,223
7 | 539,224
8 | 540,225
9 | 541,226
10 | 542,227
11 | 543,228
12 | 544,229
13 | 545,230
14 | 546,231
15 | 547,232
16 | 548,233
17 | 549,234
18 | 550,235
19 | 551,236
20 | 552,237
21 | 553,238
22 | 554,239
23 | 555,240
24 | 556,241
25 | 557,242
26 | 558,243
27 | 559,244
28 | 560,245
29 | 561,246
30 | 562,247
31 | 563,248
32 | 564,249
33 | 565,250
34 | 566,251
35 | 567,252
36 | 568,253
37 | 569,254
38 | 570,255
39 | 571,256
40 | 572,257
41 | 573,258
42 | 574,259
43 | 575,260
44 | 576,261
45 | 577,262
46 | 578,263
47 | 579,264
48 | 580,265
49 | 581,266
50 | 582,267
51 | 583,268
52 | 584,269
53 | 585,270
54 | 586,271
55 | 587,272
56 | 588,273
57 | 589,274
58 | 590,275
59 | 591,276
60 | 592,277
61 | 593,278
62 | 594,279
63 | 595,280
64 | 596,281
65 | 597,282
66 | 598,283
67 | 599,284
68 | 600,285
69 | 601,286
70 | 602,287
71 | 603,288
72 | 604,289
73 | 605,290
74 | 606,291
75 | 607,292
76 | 608,293
77 | 609,294
78 | 610,295
79 | 611,296
80 | 612,297
81 | 613,298
82 | 614,299
83 | 615,300
84 | 616,301
85 | 617,302
86 | 618,303
87 | 619,304
88 | 620,305
89 | 621,306
90 | 622,307
91 | 623,308
92 | 624,309
93 | 625,310
94 | 626,311
95 | 627,312
96 | 628,313
97 | 629,314
98 | 630,315
99 | 631,316
100 | 632,317
101 | 633,318
102 | 634,319
103 | 635,320
104 | 636,321
105 | 637,322
106 | 638,323
107 | 639,324
108 | 640,325
109 | 641,326
110 | 642,327
111 | 643,328
112 | 644,329
113 | 645,330
114 | 219,534
115 | 220,535
116 | 221,536
117 | 222,537
118 | 223,538
119 | 224,539
120 | 225,540
121 | 226,541
122 | 227,542
123 | 228,543
124 | 229,544
125 | 230,545
126 | 231,546
127 | 232,547
128 | 233,548
129 | 234,549
130 | 235,550
131 | 236,551
132 | 237,552
133 | 238,553
134 | 239,554
135 | 240,555
136 | 241,556
137 | 242,557
138 | 243,558
139 | 244,559
140 | 245,560
141 | 246,561
142 | 247,562
143 | 248,563
144 | 249,564
145 | 250,565
146 | 251,566
147 | 252,567
148 | 253,568
149 | 254,569
150 | 255,570
151 | 256,571
152 | 257,572
153 | 258,573
154 | 259,574
155 | 260,575
156 | 261,576
157 | 262,577
158 | 263,578
159 | 264,579
160 | 265,580
161 | 266,581
162 | 267,582
163 | 268,583
164 | 269,584
165 | 270,585
166 | 271,586
167 | 272,587
168 | 273,588
169 | 274,589
170 | 275,590
171 | 276,591
172 | 277,592
173 | 278,593
174 | 279,594
175 | 280,595
176 | 281,596
177 | 282,597
178 | 283,598
179 | 284,599
180 | 285,600
181 | 286,601
182 | 287,602
183 | 288,603
184 | 289,604
185 | 290,605
186 | 291,606
187 | 292,607
188 | 293,608
189 | 294,609
190 | 295,610
191 | 296,611
192 | 297,612
193 | 298,613
194 | 299,614
195 | 300,615
196 | 301,616
197 | 302,617
198 | 303,618
199 | 304,619
200 | 305,620
201 | 306,621
202 | 307,622
203 | 308,623
204 | 309,624
205 | 310,625
206 | 311,626
207 | 312,627
208 | 313,628
209 | 314,629
210 | 315,630
211 | 316,631
212 | 317,632
213 | 318,633
214 | 319,634
215 | 320,635
216 | 321,636
217 | 322,637
218 | 323,638
219 | 324,639
220 | 325,640
221 | 326,641
222 | 327,642
223 | 328,643
224 | 329,644
225 | 330,645
226 |
--------------------------------------------------------------------------------
/datasets/fodors_zagats_single/metadata.txt:
--------------------------------------------------------------------------------
1 | fz.csv
2 | matches_fodors_zagats.csv
3 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ZeroER
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - pip=21.0.1
7 | - python=3.6.12
8 | - setuptools=52.0.0
9 | - tk=8.6.10
10 | - pip:
11 | - backcall==0.2.0
12 | - chardet==4.0.0
13 | - cloudpickle==1.6.0
14 | - cycler==0.10.0
15 | - decorator==4.4.2
16 | - idna==2.10
17 | - ipython==7.16.1
18 | - ipython-genutils==0.2.0
19 | - jedi==0.18.0
20 | - joblib==1.0.1
21 | - kiwisolver==1.3.1
22 | - matplotlib==3.3.4
23 | - numpy==1.19.5
24 | - pandas==1.1.5
25 | - parso==0.8.1
26 | - pexpect==4.8.0
27 | - pickleshare==0.7.5
28 | - pillow==8.1.0
29 | - prompt-toolkit==3.0.16
30 | - ptyprocess==0.7.0
31 | - py-entitymatching==0.4.0
32 | - py-stringmatching==0.4.2
33 | - py-stringsimjoin==0.3.2
34 | - pygments==2.8.0
35 | - pyparsing==2.4.7
36 | - pyprind==2.11.2
37 | - python-dateutil==2.8.1
38 | - pytz==2021.1
39 | - requests==2.25.1
40 | - scikit-learn==0.24.1
41 | - scipy==1.5.4
42 | - six==1.15.0
43 | - threadpoolctl==2.1.0
44 | - tqdm==4.57.0
45 | - traitlets==4.3.3
46 | - urllib3==1.26.3
47 | - wcwidth==0.2.5
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import pandas as pd
3 | from collections import Counter
4 | from scipy.optimize import newton
5 | import numpy as np
6 | from scipy.stats import norm, multivariate_normal
7 | from sklearn.metrics import precision_score, recall_score, f1_score
8 | from sklearn.mixture import GaussianMixture
9 | from tqdm import tqdm
10 | from collections import defaultdict
11 | from sklearn.preprocessing import MinMaxScaler
12 |
def getScaledSum(similarity_features):
    """Sum the features of every row and min-max scale the sums.

    Args:
        similarity_features: 2-D numpy array, one row per candidate pair.

    Returns:
        Array of shape (n_rows, 1) holding the row sums rescaled by
        ``MinMaxScaler`` (default feature range).
    """
    row_totals = np.sum(similarity_features, axis=1).reshape(-1, 1)
    return MinMaxScaler().fit_transform(row_totals)
18 |
19 |
def get_y_init_given_threshold(similarity_features_df, threshold=0.8):
    """Derive initial 0/1 labels by thresholding scaled similarity sums.

    Every feature column is min-max scaled, the scaled features are summed
    per row, and the sums are min-max scaled again (see ``getScaledSum``).
    Rows whose final score is strictly greater than ``threshold`` get label 1.

    Args:
        similarity_features_df: DataFrame of similarity features, one row per
            candidate pair.
        threshold: Cut-off applied to the scaled row score.

    Returns:
        A list of Python ints (0 or 1), one per row.
    """
    x_scaled = MinMaxScaler().fit_transform(similarity_features_df.values)
    scaled_sum = getScaledSum(x_scaled)
    # getScaledSum returns an (n, 1) column. Flatten it before converting so we
    # never call int() on a 1-element array (deprecated in recent NumPy).
    return (np.ravel(scaled_sum) > threshold).astype(int).tolist()
28 |
29 |
# Tiny positive offset added to denominators and log() arguments throughout
# this module to keep them away from exactly zero.
DEL = 1e-300
31 |
def _get_results(true_labels, predicted_labels):
    """Return ``(precision, recall, f1)`` of the predictions against the truth."""
    scorers = (precision_score, recall_score, f1_score)
    p, r, f1 = [score(true_labels, predicted_labels) for score in scorers]
    return p, r, f1
37 |
def bay_coeff(a, b, u):
    """Overlap score computed as ``exp(-(log(a/b + b/a + 2) + u/(a + b)))``.

    ``m_step`` calls this with ``a``/``b`` set to the per-feature variances of
    the two classes and ``u`` to the squared difference of their means.
    NOTE(review): the name suggests a Bhattacharyya-style coefficient; confirm
    the intended constant factors against the reference formula.

    ``DEL`` is added to every denominator so zero variances do not divide by 0.
    """
    variance_term = np.log(a / (b + DEL) + b / (a + DEL) + 2)
    mean_term = u / (a + b + DEL)
    return np.exp(-(variance_term + mean_term))
40 |
41 |
class ConvergenceMeter:
    """Track a sequence of values and report when its changes have settled.

    Args:
        num_converged: Number of most recent differences to average.
        rate_threshold: Convergence is reported once that average is at or
            below this value.
        diff_fn: Callable ``(new, previous) -> difference``; defaults to the
            absolute difference.
    """

    def __init__(self, num_converged, rate_threshold,
                 diff_fn=lambda a, b: abs(a - b)):
        self._num_converged = num_converged
        self._rate_threshold = rate_threshold
        self._diff_fn = diff_fn
        self._diff_history = []
        self._last_val = None

    def offer(self, val):
        """Record ``val``; from the second value on, store its difference to the previous one."""
        previous = self._last_val
        if previous is not None:
            self._diff_history.append(self._diff_fn(val, previous))
        self._last_val = val

    @property
    def is_converged(self):
        """Whether the mean of the last ``num_converged`` differences is within the threshold."""
        window = self._num_converged
        if len(self._diff_history) < window:
            # Not enough differences collected yet.
            return False
        recent = self._diff_history[-window:]
        return np.mean(recent) <= self._rate_threshold
66 |
67 |
68 |
class ZeroerModel:
    """Two-class (match ``M`` / unmatch ``U``) Gaussian model fitted with EM steps."""

    class Gaussian:
        """Univariate normal distribution described by a mean and a standard deviation."""

        def __init__(self, mu, std):
            self.mu = mu
            # DEL is added so the stored scale is not exactly zero.
            self.std = std + DEL

        def plot(self, axis):
            """Draw the density over 1000 evenly spaced points in [0, 1] on ``axis``."""
            grid = np.linspace(0, 1, 1000)
            density = norm.pdf(grid, self.mu, self.std)
            axis.plot(grid, density, linewidth=4)

        def pdf(self, s):
            """Probability density at ``s``."""
            return norm(loc=self.mu, scale=self.std).pdf(s)

        def logpdf(self, s):
            """Log of the probability density at ``s``."""
            return norm(loc=self.mu, scale=self.std).logpdf(s)
85 |
86 |
87 | def __init__(self, similarity_matrix, feature_names, y,id_df, c_bay,pi_M=None, hard=False):
88 | self.c_bay = c_bay
89 | self.y = get_y_init_given_threshold(pd.DataFrame(similarity_matrix))
90 | self.X = np.array(similarity_matrix)
91 | self.id_tuple_to_index = {}
92 | if id_df is not None:
93 | self.ids = id_df.values
94 | for i in range(self.ids.shape[0]):
95 | self.id_tuple_to_index[(self.ids[i,0],self.ids[i,1])] = i
96 | self.id_tuple_to_index[(self.ids[i,1], self.ids[i,0])] = i
97 |
98 | Mu_all = np.mean(self.X,axis=0)
99 | self.Cov_all = np.dot(np.transpose(self.X - Mu_all),(self.X - Mu_all))/self.X.shape[0]
100 | self.corr = pd.DataFrame(similarity_matrix).corr().values
101 | self.sigma = np.zeros_like(self.corr)
102 | for i in range(self.corr.shape[0]):
103 | self.sigma[i,i] = np.std(self.X[:,i])
104 | self.P_M = np.zeros(self.X.shape[0]) # M is class 1
105 | self.Q_avg = 0
106 | self.feature_names = feature_names
107 |
108 | self.col_index_2_group_name = []
109 | self.group_name_2_col_indices = defaultdict(list)
110 | for i_col,name in enumerate(feature_names):
111 | self.col_index_2_group_name.append(name.split("_")[0])
112 | self.group_name_2_col_indices[self.col_index_2_group_name[-1]].append(i_col)
113 | self.group_names = list(set(self.col_index_2_group_name))
114 |
115 | if pi_M is None:
116 | pi_M = Counter(list(y))[1] / float(len(y))
117 |
118 | self._hard = hard
119 | self._num_rows = self.X.shape[0]
120 | self._num_cols = self.X.shape[1]
121 | self._labels = list(sorted(np.unique(y)))
122 | self.y_step = y
123 |
124 | self.pi_M = pi_M
125 | self.pi_M_l = pi_M
126 | self.pi_M_r = pi_M
127 | self.params = []
128 | self.Mu_M = np.zeros((self._num_cols,))
129 | self.Mu_U = np.zeros((self._num_cols,))
130 | self.Cov_M = np.zeros((self._num_cols,self._num_cols))
131 | self.Cov_U = np.zeros((self._num_cols,self._num_cols))
132 | for i in range(self._num_cols):
133 | self.params.append(self.fit_conditional_parameters(i))
134 | self.Mu_U[i] = self.params[-1][0].mu
135 | self.Mu_M[i] = self.params[-1][1].mu
136 | self.Cov_U[i,i] = self.params[-1][0].std**2
137 | self.Cov_M[i,i] = self.params[-1][1].std**2
138 | self.P_M_2_dimen = None
139 | self.log_P_M_2_dimen = None
140 | self.log_P_U_2_dimen = None
141 |
142 |
143 | def get_class_wise_scores(self, i_cols):
144 | class_wise_scores = dict()
145 | for label in self._labels:
146 | class_wise_scores[label] = \
147 | self.X[np.where(self.y == label), i_cols]
148 |
149 | return class_wise_scores
150 |
151 |
152 | def fit_conditional_parameters(self, i):
153 | class_wise_scores = self.get_class_wise_scores(i)
154 |
155 | class_wise_parameters = dict()
156 | for label in self._labels:
157 | gmm = GaussianMixture(n_components=1)
158 | gmm.fit(class_wise_scores[label].reshape(-1, 1))
159 |
160 | class_wise_parameters[label] = \
161 | self.Gaussian(mu=gmm.means_.flatten()[0],
162 | std=np.sqrt(gmm.covariances_.flatten()[0]))
163 |
164 | return class_wise_parameters
165 |
166 |
167 | def e_step(self, model_l = None,model_r = None):
168 | self.model_l = model_l
169 | self.model_r = model_r
170 | N = self._num_rows
171 | M = self._num_cols
172 |
173 | reg_cov = 1e-8 * np.identity(len(self.X[0]))
174 | self.Cov_M += reg_cov
175 | self.Cov_U += reg_cov
176 |
177 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_M)))
178 | if min_eig < 0:
179 | self.Cov_M -= 10 * min_eig * np.eye(*self.Cov_M.shape)
180 | #self.Cov_M += reg_cov
181 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_U)))
182 | if min_eig < 0:
183 | self.Cov_U -= 10 * min_eig * np.eye(*self.Cov_U.shape)
184 | #self.Cov_U += reg_cov
185 | log_prods_dup = multivariate_normal.logpdf(self.X, mean=self.Mu_M, cov=self.Cov_M,allow_singular=True)
186 | log_prods_non_dup = multivariate_normal.logpdf(self.X, mean=self.Mu_U, cov=self.Cov_U,allow_singular=True)
187 |
188 | pi_M = self.pi_M
189 | pi_U = 1 - pi_M
190 |
191 | prob_non_dup_over_dup = np.exp(np.clip(log_prods_non_dup - log_prods_dup, -500, 500))
192 |
193 | self.Q_M = log_prods_dup
194 | self.Q_U = log_prods_non_dup
195 |
196 |
197 | self.P_M = pi_M/ (pi_M + pi_U * prob_non_dup_over_dup)
198 | self.P_U = 1-self.P_M
199 | if self._hard:
200 | self.P_M = np.round(np.clip(self.P_M, 0., 1.))
201 |
202 | def free_energy(self):
203 | return self.P_M*(np.log(self.pi_M+DEL)-np.log(self.P_M+DEL)+self.Q_M)+self.P_U*(np.log(1-self.pi_M+DEL)-np.log(self.P_U+DEL)+self.Q_U)
204 |
205 | def predict_PM(self,X_test):
206 | reg_cov = 1e-8 * np.identity(len(self.X[0]))
207 | self.Cov_M += reg_cov
208 | self.Cov_U += reg_cov
209 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_M)))
210 | if min_eig < 0:
211 | self.Cov_M -= 10 * min_eig * np.eye(*self.Cov_M.shape)
212 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_U)))
213 | if min_eig < 0:
214 | self.Cov_U -= 10 * min_eig * np.eye(*self.Cov_U.shape)
215 | log_prods_dup = multivariate_normal.logpdf(X_test, mean=self.Mu_M, cov=self.Cov_M)
216 | log_prods_non_dup = multivariate_normal.logpdf(X_test, mean=self.Mu_U, cov=self.Cov_U)
217 |
218 | pi_M = self.pi_M
219 | pi_U = 1 - pi_M
220 |
221 | prob_non_dup_over_dup = np.exp(np.clip(log_prods_non_dup - log_prods_dup, -500, 500))
222 |
223 |
224 | P_M_test = pi_M / (pi_M + pi_U * prob_non_dup_over_dup)
225 | P_M_test = np.round(np.clip(P_M_test, 0., 1.))
226 | return P_M_test
227 |
228 | def enforce_transitivity(self, P_M, ids, id_tuple_to_index, model_l, model_r,LR_dup_free=False,LR_identical=False):
229 | model_l_P_M=None
230 | model_r_P_M=None
231 | if model_l is not None:
232 | model_l_P_M = model_l.P_M
233 | model_r_P_M = model_r.P_M
234 | id_tuple_to_index_l = model_l.id_tuple_to_index
235 | id_tuple_to_index_r = model_r.id_tuple_to_index
236 | P_M = P_M.copy()
237 | pred_tuples = []
238 |
239 | for i in range(P_M.shape[0]):
240 | if P_M[i]>0.5:
241 | pred_tuples.append((ids[i,0],ids[i,1]))
242 | pred_tuples = sorted(pred_tuples)
243 |
244 | for i in range(len(pred_tuples)):
245 | for j in range(i+1, len(pred_tuples)):
246 | if pred_tuples[j][0] == pred_tuples[i][0]:
247 | p1 = P_M[id_tuple_to_index[pred_tuples[i]]]
248 | p2 = P_M[id_tuple_to_index[pred_tuples[j]]]
249 | p_r = 0
250 | id1 = id_tuple_to_index[pred_tuples[i]]
251 | id2 = id_tuple_to_index[pred_tuples[j]]
252 | if LR_dup_free:
253 | p_r = 0
254 | idr = -1
255 | elif LR_identical:
256 | if (pred_tuples[i][1], pred_tuples[j][1]) not in id_tuple_to_index:
257 | p_r = 0
258 | idr = -1
259 | else:
260 | p_r = P_M[id_tuple_to_index[(pred_tuples[i][1],pred_tuples[j][1])]]
261 | idr = id_tuple_to_index[(pred_tuples[i][1],pred_tuples[j][1])]
262 | elif model_r_P_M is not None:
263 | if (pred_tuples[i][1], pred_tuples[j][1]) not in id_tuple_to_index_r:
264 | p_r = 0
265 | idr = -1
266 | else:
267 | p_r = model_r_P_M[id_tuple_to_index_r[(pred_tuples[i][1],pred_tuples[j][1])]]
268 | idr = id_tuple_to_index_r[(pred_tuples[i][1],pred_tuples[j][1])]
269 |
270 | if p1*p2 > p_r:
271 | delta_ls = [self.delta_L(p_r/p2,id1),self.delta_L(p_r/p1,id2)]
272 | if idr != -1:
273 | if LR_identical:
274 | delta_ls.append(self.delta_L(p1 * p2, idr))
275 | else:
276 | delta_ls.append(model_r.delta_L(p1 * p2, idr))
277 | i_max = np.argmax(delta_ls)
278 | if delta_ls[i_max]>-1e100:
279 | if i_max == 0:
280 | P_M[id1] = p_r / p2
281 | elif i_max == 1:
282 | P_M[id2] = p_r / p1
283 | elif i_max == 2:
284 | if LR_identical:
285 | P_M[idr] = p1 * p2
286 | else:
287 | model_r_P_M[idr] = p1*p2
288 | else:
289 | break
290 |
291 | pred_tuples = sorted(pred_tuples,key=lambda x:(x[1],x[0]))
292 | for i in range(len(pred_tuples)):
293 | for j in range(i+1, len(pred_tuples)):
294 | if pred_tuples[j][1] == pred_tuples[i][1]:
295 | p1 = P_M[id_tuple_to_index[pred_tuples[i]]]
296 | p2 = P_M[id_tuple_to_index[pred_tuples[j]]]
297 | p_l=0
298 | id1 = id_tuple_to_index[pred_tuples[i]]
299 | id2 = id_tuple_to_index[pred_tuples[j]]
300 | if LR_dup_free:
301 | p_l = 0
302 | idl = -1
303 | elif LR_identical:
304 | if (pred_tuples[i][0], pred_tuples[j][0]) not in id_tuple_to_index:
305 | p_l = 0
306 | idl = -1
307 | else:
308 | p_l = P_M[id_tuple_to_index[(pred_tuples[i][0],pred_tuples[j][0])]]
309 | idl = id_tuple_to_index[(pred_tuples[i][0],pred_tuples[j][0])]
310 | elif model_l_P_M is not None:
311 | if (pred_tuples[i][0], pred_tuples[j][0]) not in id_tuple_to_index_l:
312 | p_l = 0
313 | idl = -1
314 | else:
315 | p_l = model_l_P_M[id_tuple_to_index_l[(pred_tuples[i][0],pred_tuples[j][0])]]
316 | idl = id_tuple_to_index_l[(pred_tuples[i][0],pred_tuples[j][0])]
317 | #p_l = 0
318 | #idl = -1
319 | if p1*p2 > p_l:
320 | delta_ls = [self.delta_L(p_l / p2, id1), self.delta_L(p_l / p1, id2)]
321 | if idl != -1:
322 | if LR_identical:
323 | delta_ls.append(self.delta_L(p1 * p2, idl))
324 | else:
325 | delta_ls.append(model_l.delta_L(p1 * p2, idl))
326 | i_max = np.argmax(delta_ls)
327 | if delta_ls[i_max]>-1e100:
328 | if i_max == 0:
329 | P_M[id1] = p_l / p2
330 | elif i_max == 1:
331 | P_M[id2] = p_l / p1
332 | elif i_max == 2:
333 | if LR_identical:
334 | P_M[idl] = p1*p2
335 | else:
336 | model_l_P_M[idl] = p1 * p2
337 | else:
338 | break
339 | if model_r_P_M is not None:
340 | model_l.P_M = model_l_P_M
341 | model_r.P_M = model_r_P_M
342 | return P_M
343 |
    def m_step(self):
        """M-step: re-estimate the prior, class means and class covariances.

        Uses the current posteriors ``self.P_M`` as row weights for class M and
        ``1 - self.P_M`` for class U. Updates ``self.pi_M``, ``self.Mu_M``,
        ``self.Mu_U``, ``self.Cov_M`` and ``self.Cov_U``.
        """
        N = self._num_rows
        M = self._num_cols

        X = self.X
        P_M = self.P_M
        P_U = 1. - P_M

        # Hard mode: work with integer (truncated) assignments.
        if self._hard:
            P_M = P_M.astype(int)
            P_U = P_U.astype(int)

        # Effective number of rows in each class.
        N_M = np.sum(P_M, axis=0)
        N_U = N - N_M

        self.pi_M = N_M / N


        # Column vectors so they broadcast against the rows of X.
        P_M = P_M.reshape(N, 1)
        P_U = P_U.reshape(N, 1)

        # Weighted means; DEL guards against an empty class.
        self.Mu_M = np.sum(P_M * X, axis=0) / (N_M + DEL)
        self.Mu_U = np.sum(P_U * X, axis=0) / (N_U + DEL)

        smooth_factor = abs((self.Mu_M - self.Mu_U))**2

        # Weighted per-feature standard deviations of each class.
        std_M = (np.sqrt(np.sum(
            P_M * ((X - np.tile(self.Mu_M, (N, 1))) ** 2), axis=0) / (N_M + DEL))) + 1e-100
        std_U = (np.sqrt(np.sum(
            P_U * ((X - np.tile(self.Mu_U, (N, 1))) ** 2), axis=0) / (N_U + DEL))) + 1e-100

        # Weighted full covariance matrices of each class.
        Cov_M = np.dot(np.transpose(self.X - self.Mu_M),P_M*(self.X - self.Mu_M))/(N_M + DEL)
        Cov_U = np.dot(np.transpose(self.X - self.Mu_U),P_U*(self.X - self.Mu_U))/(N_U + DEL)

        # Per-feature variances and squared mean gap, the inputs of bay_coeff.
        a = np.diag(Cov_M)
        b = np.diag(Cov_U)
        u = (self.Mu_M - self.Mu_U)**2
        c=0.15

        # Target overlap: current bay_coeff raised by c_bay. Where that would
        # reach 1 or more, use the midpoint between the current value and 1.
        c_bay = self.c_bay
        bay_ori = bay_coeff(a,b,u)
        target_bay =bay_ori + c_bay
        target_bay[target_bay>=1] = bay_ori[target_bay>=1]/2+0.5
        # Root of this function is the variance increment x (per feature) that,
        # added to both classes' variances, yields the target overlap.
        def bay_coeff_equ(x):
            return bay_coeff(a + x, b + x, u) - target_bay
        # Starting points for scipy's newton (no derivative is supplied).
        x0=c*smooth_factor
        x1 = np.zeros_like(x0)
        kappas = newton(bay_coeff_equ,x0=x0,x1=x1,maxiter=5,tol=1)
        # Keep increments within [0, 1]; non-finite results become 0.
        kappas[kappas<0] = 0
        kappas[kappas>1] = 1
        kappas = np.nan_to_num(kappas,posinf=0,neginf=0)
        self.Cov_M = np.zeros_like(Cov_M)
        self.Cov_U = np.zeros_like(Cov_U)

        # Only entries whose two columns belong to the same feature group are
        # filled in; entries across groups stay 0.
        for g_name in self.group_names:
            i_cols = self.group_name_2_col_indices[g_name]

            for col_1 in i_cols:
                for col_2 in i_cols:
                    if col_2 == col_1:
                        # Diagonal: weighted variance plus the increment kappa.
                        self.Cov_M[col_1, col_2] = Cov_M[col_1, col_2]+kappas[col_1]
                        self.Cov_U[col_1, col_2] = Cov_U[col_1, col_2]+kappas[col_1]
                    else:
                        # Off-diagonal: global correlation (self.corr) scaled by
                        # the class-specific standard deviations.
                        self.Cov_M[col_1, col_2] = self.corr[col_1,col_2]*std_M[col_1]*std_M[col_2]
                        self.Cov_U[col_1, col_2] = self.corr[col_1,col_2]*std_U[col_1]*std_U[col_2]
409 | def L(self,q,i):
410 | return q*(np.log(self.pi_M+DEL) + self.Q_M[i] - np.log(q+DEL)) +(1-q)*(np.log(1-self.pi_M+DEL)+self.Q_U[i]-np.log(1-q+DEL))
411 |
412 | def delta_L(self,q,i):
413 | delta = self.L(q,i) - self.L(self.P_M[i],i)
414 | if delta > 0.00001:
415 | return -1e200
416 | return delta
417 |
418 | def save_model(self, filepath):
419 | pickle.dump(self, open(filepath, 'wb'))
420 |
421 | @staticmethod
422 | def load_model(filepath):
423 | return pickle.load(open(filepath, 'rb'))
424 |
425 | @classmethod
426 | def run_em(cls, similarity_matrixs, feature_names, y_inits,id_dfs,LR_dup_free,LR_identical,run_trans,
427 | c_bay=0.015,
428 | y_true=None,
429 | pi_M=None,
430 | hard=False,
431 | max_iter=40):
432 | sims, sims_l, sims_r = similarity_matrixs
433 | y_init,y_init_l,y_init_r = y_inits
434 | id_df, id_df_l, id_df_r = id_dfs
435 | model = cls(sims, feature_names,y_init,id_df,pi_M=pi_M, hard=hard,c_bay=c_bay)
436 | if run_trans and LR_dup_free==False and LR_identical==False:
437 | model_l = cls(sims_l, feature_names,y_init_l,id_df_l,c_bay=c_bay)
438 | model_r = cls(sims_r, feature_names,y_init_r,id_df_r,c_bay=c_bay)
439 |
440 | convergence = ConvergenceMeter(10, 0.01, diff_fn=lambda a, b: np.linalg.norm(a - b))
441 |
442 | with tqdm(range(max_iter)) as pbar:
443 | for i in pbar:
444 | model.e_step()
445 | if run_trans:
446 | if LR_dup_free==False and LR_identical==False:
447 | model_r.e_step()
448 | model_l.e_step()
449 | for i in range(4):
450 | if LR_dup_free == False and LR_identical==False:
451 | model_l.P_M = model_l.enforce_transitivity(model_l.P_M, model_l.ids, model_l.id_tuple_to_index, model_l, model_l)
452 | model_r.P_M = model_r.enforce_transitivity(model_r.P_M, model_r.ids, model_r.id_tuple_to_index, model_r, model_r)
453 | model.P_M = model.enforce_transitivity(model.P_M, model.ids, model.id_tuple_to_index, model_l, model_r)
454 | else:
455 | model.P_M = model.enforce_transitivity(model.P_M, model.ids, model.id_tuple_to_index, None, None,LR_dup_free,LR_identical)
456 | model.m_step()
457 | if run_trans and LR_dup_free == False and LR_identical==False:
458 | model_r.m_step()
459 | model_l.m_step()
460 |
461 | convergence.offer(model.free_energy())
462 | if convergence.is_converged:
463 | break
464 | if y_true is not None:
465 | y_pred = np.round(np.clip(model.P_M + DEL, 0., 1.)).astype(int) \
466 | if not hard else model.P_M.astype(int)
467 | p, r, f1 = _get_results(y_true, y_pred)
468 | result_str = (
469 | "norm: {:0.2f}, "
470 | "F1: {:0.2f}, "
471 | "Precision: {:0.2f}, "
472 | "Recall: {:0.2f}".format(
473 | np.linalg.norm(model.P_M),
474 | f1, p, r))
475 | pbar.set_description_str(result_str)
476 |
477 | return model, model.P_M
478 |
479 |
# Running this module directly does nothing; the command-line entry point
# lives in zeroer.py.
if __name__ == '__main__':
    pass
482 |
483 |
484 |
485 |
486 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import precision_score, recall_score, f1_score
3 |
4 | from model import get_y_init_given_threshold,ZeroerModel
5 |
# Tiny offset added to the EM output before it is clipped and rounded in
# run_zeroer.
DEL = 1e-300
7 |
8 |
def get_results(true_labels, predicted_labels):
    """Return ``(precision, recall, f1)`` of the predictions against the truth."""
    metrics = (precision_score, recall_score, f1_score)
    p, r, f1 = [metric(true_labels, predicted_labels) for metric in metrics]
    return p, r, f1
14 |
15 |
def run_zeroer(similarity_features_df, similarity_features_lr,id_dfs,true_labels,LR_dup_free,LR_identical,run_trans,
               c_bay=0.1):
    """Run ZeroER's EM on a candidate set and return match probabilities.

    Args:
        similarity_features_df: DataFrame of similarity features for the
            candidate pairs between the two tables.
        similarity_features_lr: Pair ``(left, right)`` of feature DataFrames
            for the within-table candidate sets; when the first element is
            ``None`` no side matrices are passed on.
        id_dfs: ``(main, left, right)`` id frames, forwarded to
            ``ZeroerModel.run_em``.
        true_labels: Optional ground truth; when given, F1/precision/recall
            of the final prediction are printed.
        LR_dup_free, LR_identical, run_trans: Forwarded to
            ``ZeroerModel.run_em``.
        c_bay: Forwarded to ``ZeroerModel.run_em``. Defaults to ``0.1``, the
            value that was previously hard-coded here.

    Returns:
        The match probabilities (``P_M``) returned by ``ZeroerModel.run_em``.
    """
    similarity_matrixs = [similarity_features_df.values, None, None]
    y_inits = [get_y_init_given_threshold(similarity_features_df), None, None]
    if similarity_features_lr[0] is not None:
        for slot, side_features in enumerate(similarity_features_lr[:2], start=1):
            similarity_matrixs[slot] = side_features.values
        y_inits[1] = get_y_init_given_threshold(similarity_features_lr[0])
        y_inits[2] = get_y_init_given_threshold(similarity_features_lr[1])
    feature_names = similarity_features_df.columns

    model, y_pred = ZeroerModel.run_em(
        similarity_matrixs, feature_names, y_inits, id_dfs,
        LR_dup_free, LR_identical, run_trans,
        y_true=true_labels, hard=False, c_bay=c_bay)
    if true_labels is not None:
        hard_pred = np.round(np.clip(y_pred + DEL, 0., 1.)).astype(int)
        p, r, f1 = get_results(true_labels, hard_pred)
        print("Results after EM:")
        print("F1: {:0.2f}, Precision: {:0.2f}, Recall: {:0.2f}".format(f1, p, r))
    return y_pred
36 |
--------------------------------------------------------------------------------
/zeroer.py:
--------------------------------------------------------------------------------
1 | from data_loading_helper.data_loader import load_data
2 | from data_loading_helper.feature_extraction import *
3 | from utils import run_zeroer
4 | from blocking_functions import *
5 | from os.path import join
6 | import argparse
def _str2bool(value):
    """Convert a command-line token such as ``true``/``false`` to a bool.

    ``type=bool`` must not be used for this: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so ``--flag False`` would
    silently enable the flag.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string was already falsy under ``type=bool``; keep that.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value))


# Command-line interface: a positional dataset name plus three optional
# boolean switches. Each switch may be given bare (``--flag`` -> True) or
# with an explicit value (``--flag false``); it defaults to False when absent.
parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=str)
parser.add_argument("--run_transitivity", type=_str2bool, default=False, nargs="?", const=True,
                    help="whether to enforce transitivity constraint")
parser.add_argument("--LR_dup_free", type=_str2bool, default=False, nargs="?", const=True,
                    help="are the left table and right table duplicate-free?")
parser.add_argument("--LR_identical", type=_str2bool, default=False, nargs="?", const=True,
                    help="are the left table and right table identical?")

# Directory that holds one sub-directory per dataset.
data_path = "datasets"
14 |
# Script entry point.
#   1. Load cached candidate-set features from the dataset directory, or
#      generate them (and write the cache) when a feature file is missing.
#   2. Derive similarity features and, when transitivity is requested,
#      the id tables handed to the model.
#   3. Run ZeroER and write the predictions to <dataset>/pred.csv.
if __name__ == '__main__':
    args = parser.parse_args()
    LR_dup_free = args.LR_dup_free
    run_trans = args.run_transitivity
    LR_identical = args.LR_identical
    dataset_name = args.dataset
    dataset_path = join(data_path, dataset_name)
    try:
        blocking_func = blocking_functions_mapping[dataset_name]
    except KeyError:
        # Report an unknown dataset as a usage error instead of a bare KeyError.
        parser.error("no blocking function is registered for dataset '{}'".format(dataset_name))

    # The left/right self-join candidate sets are only involved when
    # transitivity is enforced and the tables are neither duplicate-free
    # nor identical.
    use_self_joins = bool(run_trans) and not LR_dup_free and not LR_identical
    id_columns = ["ltable_id", "rtable_id"]

    try:
        candset_features_df = pd.read_csv(join(dataset_path, "candset_features_df.csv"), index_col=0)
        candset_features_df.reset_index(drop=True, inplace=True)
        if run_trans:
            id_df = candset_features_df[id_columns].reset_index(drop=True)
            if use_self_joins:
                candset_features_df_l = pd.read_csv(join(dataset_path, "candset_features_df_l.csv"), index_col=0)
                candset_features_df_l.reset_index(drop=True, inplace=True)
                candset_features_df_r = pd.read_csv(join(dataset_path, "candset_features_df_r.csv"), index_col=0)
                candset_features_df_r.reset_index(drop=True, inplace=True)
                id_df_l = candset_features_df_l[id_columns].reset_index(drop=True)
                id_df_r = candset_features_df_r[id_columns].reset_index(drop=True)
        print(
            "Features already generated, reading from file: " + dataset_path + "/candset_features_df.csv")

    except FileNotFoundError:
        # Any missing feature file triggers a full regeneration.
        print("Generating features and storing in: " + dataset_path + "/candset_features_df.csv")

        # metadata.txt names the input files one per line. When the tables are
        # identical the right-table line is not read, so the line following
        # the left table is taken as the duplicates file.
        with open(join(dataset_path, 'metadata.txt'), "r") as f:
            LEFT_FILE = join(dataset_path, f.readline().strip())
            if LR_identical:
                RIGHT_FILE = LEFT_FILE
            else:
                RIGHT_FILE = join(dataset_path, f.readline().strip())
            DUPLICATE_TUPLES = join(dataset_path, f.readline().strip())

        if use_self_joins:
            ltable_df, rtable_df, duplicates_df, candset_df, candset_df_l, candset_df_r = load_data(
                LEFT_FILE, RIGHT_FILE, DUPLICATE_TUPLES,
                blocking_func,
                include_self_join=True)
        else:
            ltable_df, rtable_df, duplicates_df, candset_df = load_data(
                LEFT_FILE, RIGHT_FILE, DUPLICATE_TUPLES,
                blocking_func,
                include_self_join=False)
            if LR_identical:
                print("removing self matches")
                # Copy the filtered rows so the column assignment below writes
                # to an independent frame rather than a slice of the original.
                candset_df = candset_df.loc[candset_df.ltable_id != candset_df.rtable_id, :].copy()
                candset_df.reset_index(inplace=True, drop=True)
                candset_df['_id'] = candset_df.index
        if duplicates_df is None:
            duplicates_df = pd.DataFrame(columns=["ltable_id", "rtable_id"])
        candset_features_df = gather_features_and_labels(ltable_df, rtable_df, duplicates_df, candset_df)
        candset_features_df.to_csv(join(dataset_path, "candset_features_df.csv"))
        id_df = candset_df[id_columns]

        if use_self_joins:
            # For the self-joins, every record is paired with itself in the
            # duplicates table passed to feature generation.
            duplicates_df_r = pd.DataFrame()
            duplicates_df_r['l_id'] = rtable_df["id"]
            duplicates_df_r['r_id'] = rtable_df["id"]
            candset_features_df_r = gather_features_and_labels(rtable_df, rtable_df, duplicates_df_r, candset_df_r)
            candset_features_df_r.to_csv(join(dataset_path, "candset_features_df_r.csv"))

            duplicates_df_l = pd.DataFrame()
            duplicates_df_l['l_id'] = ltable_df["id"]
            duplicates_df_l['r_id'] = ltable_df["id"]
            candset_features_df_l = gather_features_and_labels(ltable_df, ltable_df, duplicates_df_l, candset_df_l)
            candset_features_df_l.to_csv(join(dataset_path, "candset_features_df_l.csv"))

            id_df_l = candset_df_l[id_columns]
            id_df_r = candset_df_r[id_columns]
            id_df_l.to_csv(join(dataset_path, "id_tuple_df_l.csv"))
            id_df_r.to_csv(join(dataset_path, "id_tuple_df_r.csv"))

    similarity_features_df = gather_similarity_features(candset_features_df)
    similarity_features_lr = (None, None)
    id_dfs = (None, None, None)
    if run_trans:
        id_dfs = (id_df, None, None)
        if use_self_joins:
            similarity_features_df_l = gather_similarity_features(candset_features_df_l)
            similarity_features_df_r = gather_similarity_features(candset_features_df_r)
            # Keep only the features present in all three sets, in a fixed
            # (sorted) column order shared by the three frames.
            features = set(similarity_features_df.columns)
            features = features.intersection(set(similarity_features_df_l.columns))
            features = features.intersection(set(similarity_features_df_r.columns))
            features = sorted(features)
            similarity_features_df = similarity_features_df[features]
            similarity_features_df_l = similarity_features_df_l[features]
            similarity_features_df_r = similarity_features_df_r[features]
            similarity_features_lr = (similarity_features_df_l, similarity_features_df_r)
            id_dfs = (id_df, id_df_l, id_df_r)

    true_labels = candset_features_df.gold.values
    # A gold column that sums to zero is treated as "no labels available".
    if np.sum(true_labels) == 0:
        true_labels = None
    y_pred = run_zeroer(similarity_features_df, similarity_features_lr, id_dfs,
                        true_labels, LR_dup_free, LR_identical, run_trans)
    # Copy before adding the prediction column so the assignment does not
    # target a slice of candset_features_df.
    pred_df = candset_features_df[id_columns].copy()
    pred_df['pred'] = y_pred
    pred_df.to_csv(join(dataset_path, "pred.csv"))
117 |
--------------------------------------------------------------------------------