├── LICENSE ├── README.md ├── blocking_functions.py ├── data_loading_helper ├── __init__.py ├── data_loader.py ├── feature_extraction.py └── magellan_modified_feature_generation.py ├── datasets ├── fodors_zagats │ ├── fodors.csv │ ├── matches_fodors_zagats.csv │ ├── metadata.txt │ └── zagats.csv └── fodors_zagats_single │ ├── fz.csv │ ├── matches_fodors_zagats.csv │ └── metadata.txt ├── environment.yml ├── model.py ├── utils.py └── zeroer.py /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZeroER 2 | Implementation for the paper [ZeroER: Entity Resolution using Zero Labeled Examples.](https://arxiv.org/abs/1908.06049) 3 | 4 | ## Setup enviroment 5 | conda env create -f environment.yml 6 | conda activate ZeroER 7 | 8 | ## How to use 9 | 1. Put you dataset into the folder `datasets`. You should have a file `metadata.txt` in your data folder that specifies the file name of the table (and possibly right table and ground-truth table). For two table record linkage, you can refer to `datasets/fodors_zagats/metadata.txt`. 10 | For single table deduplication, you can refer to `datasets/fodors_zagats_single/metadata.txt`. 11 | 2. Write a blocking function for your dataset and put it in `blocking_functions.py`. 12 | You can have a look at the blocking functions we wrote in that file to get some ideas of how to write your own blocking function. 13 | We use Magellan to do blocking so you can also refer to its [documentations](https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching). 14 | 15 | 3. **Two-table record linkage**.
16 | To run the code, for example you are using the fodors_zagats dataset: 17 | 18 | `python zeroer.py fodors_zagats` 19 | 20 | If you want to incorporate the transitivity constraint, use arg `--run_transitivity`: 21 | 22 | `python zeroer.py fodors_zagats --run_transitivity` 23 | 24 | *Note this will generate features for self-join of the two tables (LxL and RxR) when arg `--LR_dup_free` is not present, which can take some time. 25 | 26 | If you know that your left table and right table are duplicate free, you can incorporate this information by using arg `--run_transitivity --LR_dup_free`: 27 | 28 | `python zeroer.py fodors_zagats --run_transitivity --LR_dup_free` 29 | 30 | **Single-table deduplication**.
31 | You must explictly tell the system that you are doing single table deduplication by arg `--LR_identical`: 32 | 33 | `python zeroer.py fodors_zagats_single --LR_identical` 34 | 35 | If you want to incorporate the transitivity constraint, add arg `--run_transitivity`: 36 | 37 | `python zeroer.py fodors_zagats_single --LR_identical --run_transitivity` 38 | 39 | 4. Final result for matches and unmatches is the file `pred.csv` that is saved to your dataset folder. 40 | 41 | ## Citation 42 | If you use our work or found it useful, please cite our paper: 43 | ``` 44 | @inproceedings{wu2020zeroer, 45 | author = {Renzhi Wu and Sanya Chaba and Saurabh Sawlani and Xu Chu and Saravanan Thirumuruganathan}, 46 | title = {ZeroER: Entity Resolution using Zero Labeled Examples}, 47 | booktitle = {Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data}, 48 | pages = {1149–1164}, 49 | year = {2020} 50 | } 51 | ``` -------------------------------------------------------------------------------- /blocking_functions.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from pandas import merge 4 | import py_entitymatching as em 5 | 6 | 7 | """ This python file contains blocking functions 8 | specific to table pairs. For example block_fodors_zagat 9 | is the blocking function for Tables fodors and zagat. 10 | Functionality: creates initial set of tuple pairs for 11 | two tables. 12 | """ 13 | 14 | 15 | def verify_blocking_ground_truth(A, B, block_df, duplicates_df, objectify=False): 16 | num_duplicates_missed = 0 17 | duplicates_df.columns = ["ltable_id", "rtable_id"] 18 | # Sometimes pandas / Magellan puts some columns as objects instead of numeric/string. In this case, we will force this to join appropriately 19 | if objectify: 20 | duplicates_df = duplicates_df.astype(object) 21 | 22 | # Intuition: merge function joints two data frames. The outer option creates a number of NaN rows when 23 | # some duplicates are missing in the blocked_df 24 | # we leverage the fact that len gives all rows while count gives non-NaN to compute the missing options 25 | merged_df = block_df.merge(duplicates_df, left_on=["ltable_id", "rtable_id"], right_on=["ltable_id", "rtable_id"], 26 | how='outer') 27 | num_duplicates_missed = len(merged_df) - merged_df["_id"].count() 28 | total_duplicates = len(duplicates_df) 29 | 30 | print("Ratio saved=", 1.0 - float(len(block_df)) / float(len(A) * len(B))) 31 | print("Totally missed:", num_duplicates_missed, " out of ", total_duplicates) 32 | 33 | 34 | def blocking_for_citeseer_dblp(A,B): 35 | #A = em.read_csv_metadata("citeseer_sample.csv", key="id", encoding='utf-8') 36 | #B = em.read_csv_metadata("dblp_sample.csv", key="id", encoding='utf-8') 37 | attributes = ['id', 'title', 'authors', 'journal', 'month', 'year', 'publication_type'] 38 | 39 | ob = em.OverlapBlocker() 40 | C1 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=2, show_progress=True, 41 | l_output_attrs=attributes, r_output_attrs=attributes) 42 | return C1 43 | #verify_blocking_ground_truth(A, B, C1, matches_df_head) 44 | 45 | #fodors.csv and zagats.csv 46 | def block_fodors_zagats(A, B): 47 | ob = em.OverlapBlocker() 48 | C = ob.block_tables(A, B, 'name', 'name', l_output_attrs=['name', 'addr', 'city', 'phone'], r_output_attrs=['name', 'addr', 'city', 'phone'], 49 | overlap_size=1, show_progress=False) 50 | return C 51 | 52 | 53 | #babies_r_us.csv and buy_buy_baby.csv 54 | def block_baby_products(A, B): 55 | ob = em.OverlapBlocker() 56 | # attributes = ['title', 'price', 'category', 'company_struct', 'brand', 'weight', 'length', 'width', 'height', 'fabrics', 'colors', 'materials'] 57 | attributes = ['title', 'price', 'is_discounted', 'category', 'company_struct'] 58 | # C = ob.block_tables(A, B, 'title', 'title', l_output_attrs=attributes, r_output_attrs=attributes, 59 | # overlap_size=3, show_progress=False) 60 | C = ob.block_tables(A, B, 'title', 'title', word_level = True, overlap_size = 4, show_progress = True, l_output_attrs = attributes, r_output_attrs = attributes) 61 | return C 62 | 63 | 64 | #barnes_and_noble.csv and half.csv 65 | def block_books(A, B): 66 | #assumes some preprocessing is done: 67 | #Specifically in half.csv : NewPrice => Price 68 | 69 | ob = em.OverlapBlocker() 70 | # attributes = ['Title', 'Price', 'Author', 'ISBN13', 'Publisher', 'Publication_Date', 'Pages', 'Dimensions'] 71 | attributes = ['Title', 'Author', 'ISBN13', 'Publisher', 'Publication_Date', 'Pages', 'Dimensions'] 72 | # C = ob.block_tables(A, B, 'Title', 'Title', l_output_attrs=attributes, r_output_attrs=attributes, 73 | # overlap_size=1, show_progress=False) 74 | C = ob.block_tables(A, B, 'Title', 'Title', word_level=True, overlap_size=4, show_progress=True, 75 | l_output_attrs=attributes, r_output_attrs=attributes) 76 | return C 77 | 78 | 79 | #yellow_pages.csv and yelp.csv 80 | def block_restaurants(A, B): 81 | #assumes some preprocessing is done: 82 | #Specifically in half.csv : NewPrice => Price 83 | 84 | ob = em.OverlapBlocker() 85 | attributes = ['name', 'address', 'city', 'state', 'zipcode', 'phone'] 86 | # C = ob.block_tables(A, B, 'name', 'name', l_output_attrs=attributes, r_output_attrs=attributes, 87 | # overlap_size=1, show_progress=False) 88 | C = ob.block_tables(A, B, 'name', 'name', word_level=True, overlap_size=4, show_progress=True, 89 | l_output_attrs=attributes, r_output_attrs=attributes) 90 | return C 91 | 92 | 93 | #dblp.csv and ACM.csv 94 | def block_dblp_acm(A, B): 95 | ab = em.AttrEquivalenceBlocker() 96 | C = ab.block_tables(A, B, l_block_attr='year', r_block_attr='year', l_output_attrs=["title","authors","venue","year"], 97 | r_output_attrs=["title","authors","venue","year"], allow_missing=False) 98 | ob = em.OverlapBlocker() 99 | #=================>results in a candidate set of size 46K with 5 missing duplicates out of 2224 100 | C2 = ob.block_candset(C, 'title', 'title', word_level=True, overlap_size=2, show_progress=True) 101 | return C2 102 | 103 | 104 | #dblp.csv and google_scholar.csv 105 | def block_dblp_scholar(A, B): 106 | ob = em.OverlapBlocker() 107 | attributes = ["id","title","authors","venue","year"] 108 | #C1 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=3, show_progress=True, l_output_attrs=attributes, r_output_attrs=attributes) 109 | #=================>results in a candidate set of size 1.2M with 178 missing duplicates out of 5347 110 | C2 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=4, show_progress=True, l_output_attrs=attributes, r_output_attrs=attributes) 111 | #=================>results in a candidate set of size 135K with 467 missing duplicates out of 5347 112 | return C2 113 | 114 | def block_rotten_imdb(A, B): 115 | ob = em.OverlapBlocker() 116 | attributes = set(A.columns) 117 | attributes.remove("id") 118 | attributes = list(attributes.intersection(set(B.columns))) 119 | #C1 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=3, show_progress=True, l_output_attrs=attributes, r_output_attrs=attributes) 120 | #=================>results in a candidate set of size 1.2M with 178 missing duplicates out of 5347 121 | C2 = ob.block_tables(A, B, 'Name', 'Name', word_level=True, overlap_size=2, show_progress=True, l_output_attrs=attributes, r_output_attrs=attributes) 122 | #=================>results in a candidate set of size 135K with 467 missing duplicates out of 5347 123 | return C2 124 | 125 | 126 | #abt.csv and buy.csv 127 | def block_abt_buy(A, B): 128 | try: 129 | B["description"] = B["description"] + " " + B["manufacturer"] 130 | except: 131 | print() 132 | ob = em.OverlapBlocker() 133 | #=================>results in a candidate set of size 164K with 6 missing duplicates out of 1097 134 | C = ob.block_tables(A, B, "name", "name", word_level=True, overlap_size=1, 135 | l_output_attrs=["name","description","price"], r_output_attrs=["name","description","price"], show_progress=True, allow_missing=False) 136 | return C 137 | 138 | 139 | #walmart.csv and amazon.csv 140 | def block_walmart_amazon_(A, B): 141 | #assumes some preprocessing is done: 142 | #Specifically in amazon.csv : a. pcategory2 => groupname , b. { proddescrshort,proddescrlong } => shortdescr,longdescr 143 | 144 | ob = em.OverlapBlocker() 145 | 146 | #C1 = ob.block_tables(ltable, rtable, 'title', 'title', word_level=True, overlap_size=2) 147 | #=================>results in a candidate set of size 1.1M with 20 missing duplicates out of 1154 148 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C1) 149 | 150 | attributes = ['brand', 'groupname', 'title', 'price', 'shortdescr', 'longdescr', 'imageurl', 'modelno', 'shipweight', 'dimensions'] 151 | C2 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=3, l_output_attrs=attributes, r_output_attrs=attributes, show_progress=True, allow_missing=True) 152 | #=================>results in a candidate set of size 278K with 84 missing duplicates out of 1154 153 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C2) 154 | 155 | return C2 156 | 157 | #walmart.csv and amazon.csv 158 | def block_walmart_amazon(A, B): 159 | #assumes some preprocessing is done: 160 | #Specifically in amazon.csv : a. pcategory2 => groupname , b. { proddescrshort,proddescrlong } => shortdescr,longdescr 161 | 162 | ob = em.OverlapBlocker() 163 | 164 | #C1 = ob.block_tables(ltable, rtable, 'title', 'title', word_level=True, overlap_size=2) 165 | #=================>results in a candidate set of size 1.1M with 20 missing duplicates out of 1154 166 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C1) 167 | 168 | r_attributes = ["title","proddescrshort","brand","price","dimensions","shipweight"] 169 | l_attributes = ["title","shortdescr","brand","price","dimensions","shipweight"] 170 | 171 | if not set(r_attributes).issubset(B.columns): # fix in case A B are the same dataset 172 | r_attributes = l_attributes 173 | if not set(l_attributes).issubset(A.columns): 174 | l_attributes = r_attributes 175 | #attributes = ['brand', 'groupname', 'title', 'price', 'shortdescr', 'longdescr', 'imageurl', 'modelno', 'shipweight', 'dimensions'] 176 | C2 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=2, l_output_attrs=l_attributes, r_output_attrs=r_attributes, show_progress=True, allow_missing=True) 177 | #=================>results in a candidate set of size 278K with 84 missing duplicates out of 1154 178 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C2) 179 | return C2 180 | 181 | def block_wa(A, B): 182 | #assumes some preprocessing is done: 183 | #Specifically in amazon.csv : a. pcategory2 => groupname , b. { proddescrshort,proddescrlong } => shortdescr,longdescr 184 | 185 | ob = em.OverlapBlocker() 186 | 187 | #C1 = ob.block_tables(ltable, rtable, 'title', 'title', word_level=True, overlap_size=2) 188 | #=================>results in a candidate set of size 1.1M with 20 missing duplicates out of 1154 189 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C1) 190 | 191 | r_attributes = ["title","category","brand","modelno","price"] 192 | l_attributes = ["title","category","brand","modelno","price"] 193 | 194 | if not set(r_attributes).issubset(B.columns): # fix in case A B are the same dataset 195 | r_attributes = l_attributes 196 | if not set(l_attributes).issubset(A.columns): 197 | l_attributes = r_attributes 198 | #attributes = ['brand', 'groupname', 'title', 'price', 'shortdescr', 'longdescr', 'imageurl', 'modelno', 'shipweight', 'dimensions'] 199 | C2 = ob.block_tables(A, B, 'title', 'title', word_level=True, overlap_size=2, l_output_attrs=l_attributes, r_output_attrs=r_attributes, show_progress=True, allow_missing=True) 200 | #=================>results in a candidate set of size 278K with 84 missing duplicates out of 1154 201 | #blocking_utils.verify_blocking_ground_truth(dataset_name, C2) 202 | return C2 203 | 204 | #amazon.csv and GoogleProducts.csv 205 | def block_amazon_googleproducts(A, B): 206 | ob = em.OverlapBlocker() 207 | #=================>results in a candidate set of size 400K with 6 missing duplicates out of 1300 208 | C = ob.block_tables(A, B, "title", "title", word_level=True, overlap_size=1, l_output_attrs=["title","description","manufacturer","price"], r_output_attrs=["title","description","manufacturer","price"], show_progress=True, allow_missing=False) 209 | return C 210 | 211 | def block_songs(A, B): 212 | ob = em.OverlapBlocker() 213 | #=================>results in a candidate set of size 400K with 6 missing duplicates out of 1300 214 | C = ob.block_tables(A, B, "title", "title", word_level=True, overlap_size=1, 215 | l_output_attrs=["title","release","artist_name","duration","artist_familiarity","artist_hotttnesss","year"], 216 | r_output_attrs=["title","release","artist_name","duration","artist_familiarity","artist_hotttnesss","year"], 217 | show_progress=True, allow_missing=False,n_jobs=8) 218 | return C 219 | 220 | def generic_blocking_func(A, B): 221 | A_prefix = A.add_prefix('ltable_') 222 | B_prefix = B.add_prefix('rtable_') 223 | A_prefix['key'] = 1 224 | B_prefix['key'] = 1 225 | final = merge(A_prefix, B_prefix,on='key', suffixes=('', '')) 226 | final = final.drop(columns=['key']) 227 | final = final.reset_index() 228 | final = final.rename(columns={'index': '_id'}) 229 | print (list(final)) 230 | return final 231 | 232 | 233 | blocking_functions_mapping = defaultdict(str) 234 | blocking_functions_mapping["fodors_zagats"] = block_fodors_zagats 235 | blocking_functions_mapping["fodors_zagats_single"] = block_fodors_zagats 236 | blocking_functions_mapping["abt_buy"] = block_abt_buy 237 | blocking_functions_mapping["dblp_acm"] = block_dblp_acm 238 | blocking_functions_mapping["dblp_scholar"] = block_dblp_scholar 239 | blocking_functions_mapping["amazon_googleproducts"] = block_amazon_googleproducts 240 | blocking_functions_mapping["walmart_amazon"] = block_walmart_amazon 241 | blocking_functions_mapping["songs"] = block_songs 242 | blocking_functions_mapping["citations"] = blocking_for_citeseer_dblp 243 | 244 | blocking_functions_mapping["dblp_citeseer"] = generic_blocking_func 245 | blocking_functions_mapping["imdb_omdb"] = generic_blocking_func 246 | blocking_functions_mapping["rotten_imdb"] = block_rotten_imdb 247 | 248 | blocking_functions_mapping["cora"] = generic_blocking_func 249 | blocking_functions_mapping["synthetic"] = generic_blocking_func 250 | blocking_functions_mapping["books"] = block_books 251 | blocking_functions_mapping["baby_products"] = block_baby_products 252 | blocking_functions_mapping["restaurants"] = block_restaurants 253 | blocking_functions_mapping['wa'] = block_wa -------------------------------------------------------------------------------- /data_loading_helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/zeroer/4c1ea6acd2c214b32b2e52cca9b8d50afc180220/data_loading_helper/__init__.py -------------------------------------------------------------------------------- /data_loading_helper/data_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import merge 3 | import py_entitymatching as em 4 | 5 | def load_data(left_file_name, right_file_name, label_file_name, blocking_fn, include_self_join=False): 6 | A = em.read_csv_metadata(left_file_name , key="id", encoding='iso-8859-1') 7 | B = em.read_csv_metadata(right_file_name , key="id", encoding='iso-8859-1') 8 | try: 9 | G = pd.read_csv(label_file_name) 10 | except: 11 | G=None 12 | C = blocking_fn(A, B) 13 | if include_self_join: 14 | C_A = blocking_fn(A, A) 15 | C_B = blocking_fn(B, B) 16 | return A, B, G, C, C_A,C_B 17 | else: 18 | return A, B, G, C 19 | -------------------------------------------------------------------------------- /data_loading_helper/feature_extraction.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import py_entitymatching as em 4 | from .magellan_modified_feature_generation import get_features 5 | 6 | 7 | #Given a CANDIDATE SET and the list of ACTUAL duplicates (duplicates_df), 8 | #this function adds the 1/0 labels (column name = GOLD) to the candset dataframe 9 | def add_labels_to_candset(duplicates_df, candset_df, ltable_df, rtable_df): 10 | #We are overwriting column names - but thats okay as this is not used anywhere else. 11 | duplicates_df.columns = ["ltable_id", "rtable_id"] 12 | 13 | #We merged two DF based on the common attributes. The indicator 'gold' takes three values both, left_only, right_only 14 | df_with_gold = pd.merge(candset_df, duplicates_df, on=['ltable_id', 'rtable_id'], how='left', indicator='gold') 15 | 16 | #If it is present in both, then it is a duplicate and we set it to 1 and 0 otherwise 17 | df_with_gold['gold'] = np.where(df_with_gold.gold == 'both', 1, 0) 18 | 19 | #This is to handle some Magellan issues 20 | em.set_key(df_with_gold, '_id') 21 | em.set_property(df_with_gold,'ltable', ltable_df) 22 | em.set_property(df_with_gold,'rtable', rtable_df) 23 | em.set_property(df_with_gold,'fk_ltable', "ltable_id") 24 | em.set_property(df_with_gold,'fk_rtable', "rtable_id") 25 | 26 | return df_with_gold 27 | 28 | def get_features_for_type(column_type): 29 | """ 30 | Get features to be generated for a type 31 | """ 32 | # First get the look up table 33 | lookup_table = dict() 34 | 35 | # Features for type str_eq_1w 36 | lookup_table['STR_EQ_1W'] = [('lev_dist'), ('lev_sim'), ('jaro'), 37 | ('jaro_winkler'), 38 | ('exact_match'), 39 | ('jaccard', 'qgm_3', 'qgm_3')] 40 | 41 | # Features for type str_bt_1w_5w 42 | lookup_table['STR_BT_1W_5W'] = [('jaccard', 'qgm_3', 'qgm_3'), 43 | ('cosine', 'dlm_dc0', 'dlm_dc0'), 44 | ('jaccard', 'dlm_dc0', 'dlm_dc0'), 45 | ('monge_elkan'), ('lev_dist'), ('lev_sim'), 46 | ('needleman_wunsch'), 47 | ('smith_waterman')] # dlm_dc0 is the concrete space tokenizer 48 | 49 | # Features for type str_bt_5w_10w 50 | lookup_table['STR_BT_5W_10W'] = [('jaccard', 'qgm_3', 'qgm_3'), 51 | ('cosine', 'dlm_dc0', 'dlm_dc0'), 52 | ('monge_elkan'), ('lev_dist'), ('lev_sim')] 53 | 54 | # Features for type str_gt_10w 55 | lookup_table['STR_GT_10W'] = [('jaccard', 'qgm_3', 'qgm_3'), 56 | ('cosine', 'dlm_dc0', 'dlm_dc0')] 57 | 58 | # Features for NUMERIC type 59 | lookup_table['NUM'] = [('exact_match'), ('abs_norm'), ('lev_dist'), 60 | ('lev_sim')] 61 | 62 | # Features for BOOLEAN type 63 | lookup_table['BOOL'] = [('exact_match')] 64 | 65 | # Features for un determined type 66 | lookup_table['UN_DETERMINED'] = [] 67 | # Based on the column type, return the feature functions that should be 68 | # generated. 69 | if column_type is 'str_eq_1w': 70 | features = lookup_table['STR_EQ_1W'] 71 | elif column_type is 'str_bt_1w_5w': 72 | features = lookup_table['STR_BT_1W_5W'] 73 | elif column_type is 'str_bt_5w_10w': 74 | features = lookup_table['STR_BT_5W_10W'] 75 | elif column_type is 'str_gt_10w': 76 | features = lookup_table['STR_GT_10W'] 77 | elif column_type is 'numeric': 78 | features = lookup_table['NUM'] 79 | elif column_type is 'boolean': 80 | features = lookup_table['BOOL'] 81 | elif column_type is 'un_determined': 82 | features = lookup_table['UN_DETERMINED'] 83 | else: 84 | raise TypeError('Unknown type') 85 | return features 86 | 87 | 88 | def extract_features(ltable_df, rtable_df, candset_df): 89 | tokenizers = em.get_tokenizers_for_matching() 90 | sim_functions = em.get_sim_funs_for_matching() 91 | left_attr_types = em.get_attr_types(ltable_df) 92 | right_attr_types = em.get_attr_types(rtable_df) 93 | correspondences = em.get_attr_corres(ltable_df, rtable_df) 94 | 95 | feature_dict_list = [] 96 | attribute_type_rank = {'boolean':1, 'numeric':2, 'str_eq_1w':3, 'str_bt_1w_5w':4, 'str_bt_5w_10w':5, 'str_gt_10w':6, 'un_determined':7} 97 | for c in correspondences['corres']: 98 | if left_attr_types[c[0]] != right_attr_types[c[1]]: 99 | if attribute_type_rank[left_attr_types[c[0]]] < attribute_type_rank[right_attr_types[c[1]]]: 100 | left_attr_types[c[0]] = right_attr_types[c[1]] 101 | else: 102 | right_attr_types[c[1]] = left_attr_types[c[0]] 103 | 104 | feature_records = get_features(ltable_df,rtable_df,left_attr_types, right_attr_types, correspondences, tokenizers, sim_functions) 105 | #Remove all features based on id - they are often useless 106 | feature_records = feature_records[feature_records.left_attribute !='id'] 107 | feature_records.reset_index(inplace=True,drop=True) 108 | 109 | distance_functions = ["lev_dist", "rdf"] 110 | non_normalized_functions = ["aff", "sw", "swn", "nmw"] 111 | keep_features = [True]*feature_records.shape[0] 112 | for i in range(feature_records.shape[0]): 113 | feature = feature_records.loc[i,"feature_name"] 114 | for func in distance_functions + non_normalized_functions: 115 | if func in feature: 116 | keep_features[i] = False 117 | feature_records = feature_records.loc[keep_features,:] 118 | 119 | print("\n\nExtracting the full set of features:") 120 | candset_features_df = em.extract_feature_vecs(candset_df,feature_table=feature_records,attrs_after='gold',show_progress=True,n_jobs=-1) 121 | candset_features_df.fillna(value=0, inplace=True) 122 | 123 | return candset_features_df 124 | 125 | 126 | 127 | def extract_features_auto(ltable_df, rtable_df, candset_df): 128 | feature_list = em.get_features_for_matching(ltable_df,rtable_df,validate_inferred_attr_types=False) 129 | #Remove all features based on id - they are often useless 130 | feature_list = feature_list[feature_list.left_attribute !='id'] 131 | 132 | print("\n\nExtracting the full set of features:") 133 | candset_features_df = em.extract_feature_vecs(candset_df,feature_table=feature_list,attrs_after='gold',show_progress=True) 134 | candset_features_df.fillna(value=0, inplace=True) 135 | 136 | return candset_features_df 137 | 138 | 139 | #High level function which just adds labels and the complete set of features to candset 140 | def gather_features_and_labels(ltable_df, rtable_df, labels_df, candset_df): 141 | labels_df.columns = ["ltable_id", "rtable_id"] 142 | labels_df["ltable_id"] = labels_df["ltable_id"].astype(str) 143 | labels_df["rtable_id"] = labels_df["rtable_id"].astype(str) 144 | candset_df["ltable_id"] = candset_df["ltable_id"].astype(str) 145 | candset_df["rtable_id"] = candset_df["rtable_id"].astype(str) 146 | ltable_df["id"] = ltable_df["id"].astype(str) 147 | rtable_df["id"] = rtable_df["id"].astype(str) 148 | candset_df = add_labels_to_candset(labels_df, candset_df, ltable_df, rtable_df) 149 | candset_features_df = extract_features(ltable_df, rtable_df, candset_df) 150 | 151 | return candset_features_df 152 | 153 | 154 | #Filter out bad features (non similarity, non distance, singular valued) 155 | def gather_similarity_features(candset_features_df, avged = False): 156 | distance_functions = ["lev_dist", "rdf"] 157 | non_normalized_functions = ["aff", "sw", "swn", "nmw"] 158 | 159 | cols = candset_features_df.columns 160 | cols_to_be_dropped = [] 161 | for col in cols: 162 | for func in distance_functions + non_normalized_functions: 163 | if func in col: 164 | cols_to_be_dropped.append(col) 165 | break 166 | 167 | candset_similarity_features_df = candset_features_df.drop(cols_to_be_dropped, axis=1) 168 | similarity_features_df = candset_similarity_features_df.drop(['gold', '_id', 'ltable_id', 'rtable_id'], axis=1) 169 | 170 | # Dropping columns that have only one value 171 | cols_to_be_dropped = [] 172 | col_count_map = similarity_features_df.nunique() 173 | for col in similarity_features_df.columns: 174 | if col_count_map[col] == 1: 175 | cols_to_be_dropped.append(col) 176 | similarity_features_df = similarity_features_df.drop(cols_to_be_dropped, axis=1) 177 | 178 | 179 | if (avged==False): 180 | return similarity_features_df 181 | 182 | 183 | headers= similarity_features_df.columns.values 184 | 185 | attributes = [] 186 | for h in headers: 187 | arr = h.split("_") 188 | attributes.append(arr[0]) 189 | attributes = set(attributes) 190 | 191 | avged_df = pd.DataFrame() 192 | 193 | for attribute in attributes: 194 | #print("\nFeatures for attribute:", attribute) 195 | matches = np.zeros(candset_features_df.shape[0]) 196 | counts = 0 197 | for h in headers: 198 | if attribute in h: 199 | #print(h) 200 | matches = np.add(matches, candset_features_df[h].values) 201 | counts += 1 202 | matches = matches/counts 203 | avged_df[attribute] = matches 204 | 205 | return avged_df -------------------------------------------------------------------------------- /data_loading_helper/magellan_modified_feature_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for auto feature generation. 3 | """ 4 | import logging 5 | 6 | import pandas as pd 7 | import six 8 | from py_entitymatching.utils.validation_helper import validate_object_type 9 | 10 | from IPython.display import display 11 | 12 | import py_entitymatching as em 13 | import py_entitymatching.feature.attributeutils as au 14 | import py_entitymatching.feature.simfunctions as sim 15 | import py_entitymatching.feature.tokenizers as tok 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def get_features(ltable, rtable, l_attr_types, r_attr_types, 21 | attr_corres, tok_funcs, sim_funcs): 22 | """ 23 | This function will automatically generate a set of features based on the 24 | attributes of the input tables. 25 | 26 | Specifically, this function will go through the attribute 27 | correspondences between the input tables. For each correspondence , 28 | it examines the types of the involved attributes, then apply the 29 | appropriate tokenizers and sim functions to generate all appropriate 30 | features for this correspondence. 31 | 32 | Args: 33 | ltable,rtable (DataFrame): The pandas DataFrames for which the 34 | features must be generated. 35 | l_attr_types,r_attr_types (dictionary): The attribute types for the 36 | input DataFrames. Typically this is generated using the 37 | function 'get_attr_types'. 38 | attr_corres (dictionary): The attribute correspondences between the 39 | input DataFrames. 40 | tok_funcs (dictionary): A Python dictionary containing tokenizer 41 | functions. 42 | sim_funcs (dictionary): A Python dictionary containing similarity 43 | functions. 44 | 45 | Returns: 46 | A pandas DataFrame containing automatically generated features. 47 | Specifically, the DataFrame contains the following attributes: 48 | 'feature_name', 'left_attribute', 'right_attribute', 49 | 'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction', 50 | 'function', 'function_source', 'is_auto_generated'. 51 | 52 | Raises: 53 | AssertionError: If `ltable` is not of type pandas 54 | DataFrame. 55 | AssertionError: If `rtable` is not of type pandas 56 | DataFrame. 57 | AssertionError: If `l_attr_types` is not of type 58 | python dictionary. 59 | AssertionError: If `r_attr_types` is not of type 60 | python dictionary. 61 | AssertionError: If `attr_corres` is not of type 62 | python dictionary. 63 | AssertionError: If `sim_funcs` is not of type 64 | python dictionary. 65 | AssertionError: If `tok_funcs` is not of type 66 | python dictionary. 67 | AssertionError: If the `ltable` and `rtable` order is same as mentioned 68 | in the `l_attr_types`/`r_attr_types` and `attr_corres`. 69 | 70 | Examples: 71 | 72 | >>> import py_entitymatching as em 73 | >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') 74 | >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') 75 | >>> match_t = em.get_tokenizers_for_matching() 76 | >>> match_s = em.get_sim_funs_for_matching() 77 | >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step 78 | >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step 79 | >>> match_c = em.get_attr_corres(A, B) 80 | >>> match_f = em.get_features(A, B, atypes1, atype2, match_c, match_t, match_s) 81 | 82 | See Also: 83 | :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`, 84 | :meth:`py_entitymatching.get_sim_funs_for_blocking`, 85 | :meth:`py_entitymatching.get_tokenizers_for_blocking`, 86 | :meth:`py_entitymatching.get_sim_funs_for_matching`, 87 | :meth:`py_entitymatching.get_tokenizers_for_matching` 88 | 89 | 90 | Note: 91 | In the output DataFrame, two 92 | attributes demand some explanation: (1)function, and (2) 93 | is_auto_generated. The function, points to the actual python function 94 | that implements feature. Specifically, the function takes in two 95 | tuples (one from each input table) and returns a numeric value. The 96 | attribute is_auto_generated contains either True or False. The flag 97 | is True only if the feature is automatically generated by py_entitymatching. 98 | This is important because this flag is used to make some assumptions 99 | about the semantics of the similarity function used and use that 100 | information for scaling purposes. 101 | 102 | """ 103 | # Validate input parameters 104 | # # We expect the ltable to be of type pandas DataFrame 105 | validate_object_type(ltable, pd.DataFrame, 'Input ltable') 106 | 107 | # # We expect the rtable to be of type pandas DataFrame 108 | validate_object_type(rtable, pd.DataFrame, 'Input rtable') 109 | 110 | # # We expect the l_attr_types to be of type python dictionary 111 | validate_object_type(l_attr_types, dict, 'Input l_attr_types') 112 | 113 | # # We expect the r_attr_types to be of type python dictionary 114 | validate_object_type(r_attr_types, dict, 'Input r_attr_types') 115 | 116 | # # We expect the attr_corres to be of type python dictionary 117 | validate_object_type(attr_corres, dict, 'Input attr_corres') 118 | 119 | # # We expect the tok_funcs to be of type python dictionary 120 | validate_object_type(tok_funcs, dict, 'Input tok_funcs') 121 | 122 | # # We expect the sim_funcs to be of type python dictionary 123 | validate_object_type(sim_funcs, dict, 'Input sim_funcs') 124 | 125 | # We expect the table order to be same in l/r_attr_types and attr_corres 126 | if not _check_table_order(ltable, rtable, 127 | l_attr_types, r_attr_types, attr_corres): 128 | logger.error('Table order is different than what is mentioned ' 129 | 'in l/r attr_types and attr_corres') 130 | raise AssertionError('Table order is different than what is mentioned ' 131 | 'in l/r attr_types and attr_corres') 132 | 133 | # Initialize output feature dictionary list 134 | feature_dict_list = [] 135 | 136 | # Generate features for each attr. correspondence 137 | for ac in attr_corres['corres']: 138 | l_attr_type = l_attr_types[ac[0]] 139 | r_attr_type = r_attr_types[ac[1]] 140 | 141 | # Generate a feature only if the attribute types are same 142 | if l_attr_type != r_attr_type: 143 | logger.info('py_entitymatching types: %s type (%s) and %s type (%s) ' 144 | 'are different.' 145 | 'If you want to set them to be same and ' 146 | 'generate features, ' 147 | 'update output from get_attr_types and ' 148 | 'use get_features command.\n.' 149 | % (ac[0], l_attr_type, ac[1], r_attr_type)) 150 | # features_1 = _get_features_for_type(l_attr_type) 151 | # features_2 = _get_features_for_type(r_attr_type) 152 | # features = set(features_1).union(features_2) 153 | continue 154 | 155 | # Generate features 156 | features = _get_features_for_type(l_attr_type) 157 | 158 | # Convert features to function objects 159 | fn_objs = _conv_func_objs(features, ac, tok_funcs, sim_funcs) 160 | # Add the function object to a feature list. 161 | feature_dict_list.append(fn_objs) 162 | 163 | # Create a feature table 164 | feature_table = pd.DataFrame(flatten_list(feature_dict_list)) 165 | # Project out only the necessary columns. 166 | feature_table = feature_table[['feature_name', 'left_attribute', 167 | 'right_attribute', 'left_attr_tokenizer', 168 | 'right_attr_tokenizer', 169 | 'simfunction', 'function', 170 | 'function_source', 'is_auto_generated']] 171 | # Return the feature table. 172 | return feature_table 173 | 174 | def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres): 175 | """ 176 | Check whether the order of tables matches with what is mentioned in 177 | l_attr_types, r_attr_type and attr_corres. 178 | """ 179 | # Validate the input parameters 180 | # We expect the input object ltable to be of type pandas DataFrame 181 | validate_object_type(ltable, pd.DataFrame, 'Input left table') 182 | 183 | # # We expect the rtable to be of type pandas DataFrame 184 | validate_object_type(rtable, pd.DataFrame, 'Input right table') 185 | 186 | # Get the ids of the input tables. This is used to validate the order 187 | # of tables present in the given data structures. 188 | # Note: This kind of checking is bit too aggressive, the reason is this 189 | # checking needs the ltable and rtable to point to exact memory location 190 | # across the given dictionaries and the input. Ideally, we just need to 191 | # check whether the contents of those DataFrames are same. 192 | ltable_id = id(ltable) 193 | rtable_id = id(rtable) 194 | 195 | # Check whether ltable id matches with id of table mentioned in l_attr_types 196 | if ltable_id != id(l_attr_types['_table']): 197 | logger.error( 198 | 'ltable is not the same as table mentioned in left attr types') 199 | return False 200 | 201 | # Check whether rtable id matches with id of table mentioned in r_attr_types 202 | if rtable_id != id(r_attr_types['_table']): 203 | logger.error( 204 | 'rtable is not the same as table mentioned in right attr types') 205 | return False 206 | 207 | # Check whether ltable matches with ltable mentioned in attr_corres 208 | if ltable_id != id(attr_corres['ltable']): 209 | logger.error( 210 | 'ltable is not the same as table mentioned in attr correspondence') 211 | return False 212 | 213 | # Check whether rtable matches with rtable mentioned in attr_corres 214 | if rtable_id != id(attr_corres['rtable']): 215 | logger.error( 216 | 'rtable is not the same as table mentioned in attr correspondence') 217 | return False 218 | 219 | # Finally, return True. 220 | return True 221 | 222 | 223 | # get look up table to generate features 224 | def _get_feat_lkp_tbl(): 225 | """ 226 | This function embeds the knowledge of mapping what features to be 227 | generated for what kind of attr. types. 228 | 229 | """ 230 | # Initialize a lookup table 231 | lookup_table = dict() 232 | 233 | # Features for type str_eq_1w 234 | lookup_table['STR_EQ_1W'] = [('affine'), 235 | ('cosine', 'qgm_2', 'qgm_2'), 236 | ('cosine', 'qgm_3', 'qgm_3'), 237 | ('dice', 'qgm_2', 'qgm_2'), 238 | ('dice', 'qgm_3', 'qgm_3'), 239 | #('hamming_dist'), ('hamming_sim'), 240 | ('lev_dist'), ('lev_sim'), ('jaro'), 241 | ('jaro_winkler'), 242 | ('exact_match'), 243 | ('smith_waterman'), 244 | ('needleman_wunsch'), 245 | ('monge_elkan', 'qgm_2', 'qgm_2'), 246 | ('monge_elkan', 'qgm_3', 'qgm_3'), 247 | ('overlap_coeff', 'qgm_2', 'qgm_2'), 248 | ('overlap_coeff', 'qgm_3', 'qgm_3'), 249 | ('jaccard', 'qgm_2', 'qgm_2'), 250 | ('jaccard', 'qgm_3', 'qgm_3')] 251 | 252 | # Features for type str_bt_1w_5w 253 | lookup_table['STR_BT_1W_5W'] = [('affine'), 254 | ('cosine', 'dlm_dc0', 'dlm_dc0'), 255 | ('cosine', 'qgm_3', 'qgm_3'), 256 | ('dice', 'dlm_dc0', 'dlm_dc0'), 257 | ('dice', 'qgm_3', 'qgm_3'), 258 | #('hamming_dist'), ('hamming_sim'), 259 | ('lev_dist'), ('lev_sim'), ('jaro'), 260 | ('jaro_winkler'), 261 | ('exact_match'), 262 | ('smith_waterman'), 263 | ('needleman_wunsch'), 264 | ('monge_elkan', 'dlm_dc0', 'dlm_dc0'), 265 | ('monge_elkan', 'qgm_3', 'qgm_3'), 266 | ('overlap_coeff', 'dlm_dc0', 'dlm_dc0'), 267 | ('overlap_coeff', 'qgm_3', 'qgm_3'), 268 | ('jaccard', 'dlm_dc0', 'dlm_dc0'), 269 | ('jaccard', 'qgm_3', 'qgm_3')] # dlm_dc0 is the concrete space tokenizer 270 | 271 | # Features for type str_bt_5w_10w 272 | lookup_table['STR_BT_5W_10W'] = [('cosine', 'dlm_dc0', 'dlm_dc0'), 273 | ('cosine', 'qgm_3', 'qgm_3'), 274 | ('dice', 'dlm_dc0', 'dlm_dc0'), 275 | ('dice', 'qgm_3', 'qgm_3'), 276 | ('monge_elkan', 'dlm_dc0', 'dlm_dc0'), 277 | ('monge_elkan', 'qgm_3', 'qgm_3'), 278 | ('overlap_coeff', 'dlm_dc0', 'dlm_dc0'), 279 | ('overlap_coeff', 'qgm_3', 'qgm_3'), 280 | ('jaccard', 'dlm_dc0', 'dlm_dc0'), 281 | ('jaccard', 'qgm_3', 'qgm_3')] 282 | 283 | # Features for type str_gt_10w 284 | lookup_table['STR_GT_10W'] = [('cosine', 'dlm_dc0', 'dlm_dc0'), 285 | ('cosine', 'qgm_3', 'qgm_3'), 286 | ('dice', 'dlm_dc0', 'dlm_dc0'), 287 | ('dice', 'qgm_3', 'qgm_3'), 288 | ('monge_elkan', 'dlm_dc0', 'dlm_dc0'), 289 | ('monge_elkan', 'qgm_3', 'qgm_3'), 290 | ('overlap_coeff', 'dlm_dc0', 'dlm_dc0'), 291 | ('overlap_coeff', 'qgm_3', 'qgm_3'), 292 | ('jaccard', 'dlm_dc0', 'dlm_dc0'), 293 | ('jaccard', 'qgm_3', 'qgm_3')] 294 | 295 | # Features for NUMERIC type 296 | lookup_table['NUM'] = [('exact_match'), ('abs_norm'), ('lev_dist'), 297 | ('lev_sim'), 298 | #('hamming_dist'), ('hamming_sim'), 299 | ('rel_diff')] 300 | 301 | # Features for BOOLEAN type 302 | lookup_table['BOOL'] = [('exact_match')] 303 | 304 | # Features for un determined type 305 | lookup_table['UN_DETERMINED'] = [] 306 | 307 | # Finally, return the lookup table 308 | return lookup_table 309 | 310 | 311 | def _get_features_for_type(column_type): 312 | """ 313 | Get features to be generated for a type 314 | """ 315 | # First get the look up table 316 | lookup_table = _get_feat_lkp_tbl() 317 | 318 | # Based on the column type, return the feature functions that should be 319 | # generated. 320 | if column_type is 'str_eq_1w': 321 | features = lookup_table['STR_EQ_1W'] 322 | elif column_type is 'str_bt_1w_5w': 323 | features = lookup_table['STR_BT_1W_5W'] 324 | elif column_type is 'str_bt_5w_10w': 325 | features = lookup_table['STR_BT_5W_10W'] 326 | elif column_type is 'str_gt_10w': 327 | features = lookup_table['STR_GT_10W'] 328 | elif column_type is 'numeric': 329 | features = lookup_table['NUM'] 330 | elif column_type is 'boolean': 331 | features = lookup_table['BOOL'] 332 | elif column_type is 'un_determined': 333 | features = lookup_table['UN_DETERMINED'] 334 | else: 335 | raise TypeError('Unknown type') 336 | return features 337 | 338 | # convert features from look up table to function objects 339 | def _conv_func_objs(features, attributes, 340 | tokenizer_functions, similarity_functions): 341 | """ 342 | Convert features from look up table to function objects 343 | """ 344 | # We need to check whether the features have allowed tokenizers and 345 | # similarity functions. 346 | 347 | # # First get the tokenizer and similarity functions list. 348 | tokenizer_list = tokenizer_functions.keys() 349 | similarity_functions_list = similarity_functions.keys() 350 | 351 | # # Second get the features that uses only valid tokenizers and 352 | # similarity functions 353 | valid_list = [check_valid_tok_sim(feature, tokenizer_list, 354 | similarity_functions_list) 355 | for feature in features] 356 | 357 | # Get function as a string and other meta data; finally we will get a 358 | # list of tuples 359 | function_tuples = [get_fn_str(input, attributes) for input in valid_list] 360 | 361 | # Convert the function string into a function object 362 | function_objects = conv_fn_str_to_obj(function_tuples, tokenizer_functions, 363 | similarity_functions) 364 | 365 | return function_objects 366 | 367 | 368 | # check whether tokenizers and simfunctions are allowed 369 | # inp is of the form ('jaccard', 'qgm_3', 'qgm_3') or ('lev') 370 | def check_valid_tok_sim(inp, simlist, toklist): 371 | if isinstance(inp, six.string_types): 372 | inp = [inp] 373 | assert len(inp) == 1 or len( 374 | inp) == 3, 'len of feature config should be 1 or 3' 375 | # check whether the sim function in features is in simlist 376 | if len(set(inp).intersection(simlist)) > 0: 377 | return inp 378 | # check whether the tokenizer in features is in tok list 379 | if len(set(inp).intersection(toklist)) > 0: 380 | return inp 381 | return None 382 | 383 | 384 | # get function string for a feature 385 | def get_fn_str(inp, attrs): 386 | if inp: 387 | args = [] 388 | args.extend(attrs) 389 | if isinstance(inp, six.string_types) == True: 390 | inp = [inp] 391 | args.extend(inp) 392 | # fill function string from a template 393 | return fill_fn_template(*args) 394 | else: 395 | return None 396 | 397 | 398 | # fill function template 399 | def fill_fn_template(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None): 400 | # construct function string 401 | s = 'from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers import *\n' 402 | # get the function name 403 | fn_name = get_fn_name(attr1, attr2, sim_func, tok_func_1, tok_func_2) 404 | # proceed with function construction 405 | fn_st = 'def ' + fn_name + '(ltuple, rtuple):' 406 | s += fn_st 407 | s += '\n' 408 | 409 | # add 4 spaces 410 | s += ' ' 411 | fn_body = 'return ' 412 | if tok_func_1 is not None and tok_func_2 is not None: 413 | fn_body = fn_body + sim_func + '(' + tok_func_1 + '(' + 'ltuple["' + attr1 + '"]' 414 | fn_body += '), ' 415 | fn_body = fn_body + tok_func_2 + '(' + 'rtuple["' + attr2 + '"]' 416 | fn_body = fn_body + ')) ' 417 | else: 418 | fn_body = fn_body + sim_func + '(' + 'ltuple["' + attr1 + '"], rtuple["' + attr2 + '"])' 419 | s += fn_body 420 | 421 | return fn_name, attr1, attr2, tok_func_1, tok_func_2, sim_func, s 422 | 423 | 424 | # construct function name from attrs, tokenizers and sim funcs 425 | 426 | # sim_fn_names=['jaccard', 'lev', 'cosine', 'monge_elkan', 427 | # 'needleman_wunsch', 'smith_waterman', 'jaro', 'jaro_winkler', 428 | # 'exact_match', 'rel_diff', 'abs_norm'] 429 | def get_fn_name(attr1, attr2, sim_func, tok_func_1=None, tok_func_2=None): 430 | attr1 = '_'.join(attr1.split()) 431 | attr2 = '_'.join(attr2.split()) 432 | fp = '_'.join([attr1, attr2]) 433 | name_lkp = dict() 434 | name_lkp["jaccard"] = "jac" 435 | name_lkp["lev_dist"] = "lev_dist" 436 | name_lkp["lev_sim"] = "lev_sim" 437 | name_lkp["cosine"] = "cos" 438 | name_lkp["monge_elkan"] = "mel" 439 | name_lkp["needleman_wunsch"] = "nmw" 440 | name_lkp["smith_waterman"] = "sw" 441 | name_lkp["jaro"] = "jar" 442 | name_lkp["jaro_winkler"] = "jwn" 443 | name_lkp["exact_match"] = "exm" 444 | name_lkp["abs_norm"] = "anm" 445 | name_lkp["rel_diff"] = "rdf" 446 | name_lkp["1"] = "1" 447 | name_lkp["2"] = "2" 448 | name_lkp["3"] = "3" 449 | name_lkp["4"] = "4" 450 | name_lkp["tok_whitespace"] = "wsp" 451 | name_lkp["tok_qgram"] = "qgm" 452 | name_lkp["tok_delim"] = "dlm" 453 | 454 | arg_list = [sim_func, tok_func_1, tok_func_2] 455 | nm_list = [name_lkp.get(tok, tok) for tok in arg_list if tok] 456 | sp = '_'.join(nm_list) 457 | return '_'.join([fp, sp]) 458 | 459 | 460 | # conv function string to function object and return with meta data 461 | def conv_fn_str_to_obj(fn_tup, tok, sim_funcs): 462 | d_orig = {} 463 | d_orig.update(tok) 464 | d_orig.update(sim_funcs) 465 | d_ret_list = [] 466 | for f in fn_tup: 467 | d_ret = {} 468 | name = f[0] 469 | attr1 = f[1] 470 | attr2 = f[2] 471 | tok_1 = f[3] 472 | tok_2 = f[4] 473 | simfunction = f[5] 474 | # exec(f[6] in d_orig) 475 | six.exec_(f[6], d_orig) 476 | d_ret['function'] = d_orig[name] 477 | d_ret['feature_name'] = name 478 | d_ret['left_attribute'] = attr1 479 | d_ret['right_attribute'] = attr2 480 | d_ret['left_attr_tokenizer'] = tok_1 481 | d_ret['right_attr_tokenizer'] = tok_2 482 | d_ret['simfunction'] = simfunction 483 | d_ret['function_source'] = f[6] 484 | d_ret['is_auto_generated'] = True 485 | 486 | d_ret_list.append(d_ret) 487 | return d_ret_list 488 | 489 | 490 | def flatten_list(inp_list): 491 | return [item for sublist in inp_list for item in sublist] -------------------------------------------------------------------------------- /datasets/fodors_zagats/fodors.csv: -------------------------------------------------------------------------------- 1 | id,name,addr,city,phone,type,class 2 | 534,'arnie morton\'s of chicago','435 s. la cienega blv.','los angeles',310/246-1501,american,0 3 | 535,'art\'s delicatessen','12224 ventura blvd.','studio city',818/762-1221,american,1 4 | 536,'hotel bel-air','701 stone canyon rd.','bel air',310/472-1211,californian,2 5 | 537,'cafe bizou','14016 ventura blvd.','sherman oaks',818/788-3536,french,3 6 | 538,campanile,'624 s. la brea ave.','los angeles',213/938-1447,american,4 7 | 539,'chinois on main','2709 main st.','santa monica',310/392-9025,french,5 8 | 540,citrus,'6703 melrose ave.','los angeles',213/857-0034,californian,6 9 | 541,fenix,'8358 sunset blvd. west',hollywood,213/848-6677,american,7 10 | 542,granita,'23725 w. malibu rd.',malibu,310/456-0488,californian,8 11 | 543,'grill on the alley','9560 dayton way','los angeles',310/276-0615,american,9 12 | 544,'restaurant katsu','1972 n. hillhurst ave.','los angeles',213/665-1891,asian,10 13 | 545,'l\'orangerie','903 n. la cienega blvd.','los angeles',310/652-9770,french,11 14 | 546,'le chardonnay','8284 melrose ave.','los angeles',213/655-8880,french,12 15 | 547,'locanda veneta','3rd st.','los angeles',310/274-1893,italian,13 16 | 548,matsuhisa,'129 n. la cienega blvd.','beverly hills',310/659-9639,asian,14 17 | 549,'the palm','9001 santa monica blvd.','los angeles',310/550-8811,american,15 18 | 550,patina,'5955 melrose ave.','los angeles',213/467-1108,californian,16 19 | 551,'philippe\'s the original','1001 n. alameda st.','los angeles',213/628-3781,american,17 20 | 552,'pinot bistro','12969 ventura blvd.','los angeles',818/990-0500,french,18 21 | 553,'rex il ristorante','617 s. olive st.','los angeles',213/627-2300,italian,19 22 | 554,spago,'1114 horn ave.','los angeles',310/652-4025,californian,20 23 | 555,valentino,'3115 pico blvd.','santa monica',310/829-4313,italian,21 24 | 556,'yujean kang\'s gourmet chinese cuisine','67 n. raymond ave.','los angeles',818/585-0855,asian,22 25 | 557,'21 club','21 w. 52nd st.','new york',212/582-7200,american,23 26 | 558,aquavit,'13 w. 54th st.','new york',212/307-7311,continental,24 27 | 559,aureole,'34 e. 61st st.','new york','212/ 319-1660',american,25 28 | 560,'cafe lalo','201 w. 83rd st.','new york',212/496-6031,'coffee bar',26 29 | 561,'cafe des artistes','1 w. 67th st.','new york',212/877-3500,continental,27 30 | 562,'carmine\'s','2450 broadway between 90th and 91st sts.','new york',212/362-2200,italian,28 31 | 563,'carnegie deli','854 7th ave. between 54th and 55th sts.','new york',212/757-2245,delicatessen,29 32 | 564,chanterelle,'2 harrison st. near hudson st.','new york',212/966-6960,american,30 33 | 565,daniel,'20 e. 76th st.','new york',212/288-0033,french,31 34 | 566,dawat,'210 e. 58th st.','new york',212/355-7555,asian,32 35 | 567,felidia,'243 e. 58th st.','new york',212/758-1479,italian,33 36 | 568,'four seasons grill room','99 e. 52nd st.','new york',212/754-9494,american,34 37 | 569,'gotham bar & grill','12 e. 12th st.','new york',212/620-4020,american,35 38 | 570,'gramercy tavern','42 e. 20th st. between park ave. s and broadway','new york',212/477-0777,american,36 39 | 571,'island spice','402 w. 44th st.','new york',212/765-1737,'tel caribbean',37 40 | 572,'jo jo','160 e. 64th st.','new york',212/223-5656,american,38 41 | 573,'la caravelle','33 w. 55th st.','new york',212/586-4252,french,39 42 | 574,'la cote basque','60 w. 55th st. between 5th and 6th ave.','new york',212/688-6525,french,40 43 | 575,'le bernardin','155 w. 51st st.','new york',212/489-1515,french,41 44 | 576,'les celebrites','160 central park s','new york',212/484-5113,french,42 45 | 577,lespinasse,'2 e. 55th st.','new york',212/339-6719,american,43 46 | 578,lutece,'249 e. 50th st.','new york',212/752-2225,french,44 47 | 579,'manhattan ocean club','57 w. 58th st.','new york','212/ 371-7777',seafood,45 48 | 580,march,'405 e. 58th st.','new york',212/754-6272,american,46 49 | 581,'mesa grill','102 5th ave. between 15th and 16th sts.','new york',212/807-7400,american,47 50 | 582,'mi cocina','57 jane st. off hudson st.','new york',212/627-8273,mexican,48 51 | 583,montrachet,'239 w. broadway between walker and white sts.','new york','212/ 219-2777',french,49 52 | 584,oceana,'55 e. 54th st.','new york',212/759-5941,seafood,50 53 | 585,'park avenue cafe','100 e. 63rd st.','new york',212/644-1900,american,51 54 | 586,petrossian,'182 w. 58th st.','new york',212/245-2214,french,52 55 | 587,picholine,'35 w. 64th st.','new york',212/724-8585,mediterranean,53 56 | 588,pisces,'95 ave. a at 6th st.','new york',212/260-6660,seafood,54 57 | 589,'rainbow room','30 rockefeller plaza','new york',212/632-5000,'or 212/632-5100 american',55 58 | 590,'river cafe','1 water st. at the east river',brooklyn,718/522-5200,american,56 59 | 591,'san domenico','240 central park s','new york',212/265-5959,italian,57 60 | 592,'second avenue deli','156 2nd ave. at 10th st.','new york',212/677-0606,delicatessen,58 61 | 593,seryna,'11 e. 53rd st.','new york',212/980-9393,asian,59 62 | 594,'shun lee west','43 w. 65th st.','new york',212/371-8844,asian,60 63 | 595,'sign of the dove','1110 3rd ave. at 65th st.','new york',212/861-8080,american,61 64 | 596,'smith & wollensky','201 e. 49th st.','new york',212/753-1530,american,62 65 | 597,'tavern on the green','in central park at 67th st.','new york',212/873-3200,american,63 66 | 598,'uncle nick\'s','747 9th ave. between 50th and 51st sts.','new york',212/315-1726,mediterranean,64 67 | 599,'union square cafe','21 e. 16th st.','new york',212/243-4020,american,65 68 | 600,'virgil\'s','152 w. 44th st.','new york','212/ 921-9494',american,66 69 | 601,'chin\'s','3200 las vegas blvd. s','las vegas',702/733-8899,asian,67 70 | 602,'coyote cafe','3799 las vegas blvd. s','las vegas',702/891-7349,southwestern,68 71 | 603,'le montrachet','3000 w. paradise rd.','las vegas',702/732-5111,continental,69 72 | 604,'palace court','3570 las vegas blvd. s','las vegas',702/731-7547,continental,70 73 | 605,'second street grille','200 e. fremont st.','las vegas',702/385-3232,seafood,71 74 | 606,'steak house','2880 las vegas blvd. s','las vegas',702/734-0410,'steak houses',72 75 | 607,tillerman,'2245 e. flamingo rd.','las vegas',702/731-4036,seafood,73 76 | 608,abruzzi,'2355 peachtree rd. peachtree battle shopping center',atlanta,404/261-8186,italian,74 77 | 609,bacchanalia,'3125 piedmont rd. near peachtree rd.',atlanta,404/365-0410,international,75 78 | 610,'bone\'s','3130 piedmont road',atlanta,404/237-2663,american,76 79 | 611,'brasserie le coze','3393 peachtree rd. lenox square mall near neiman marcus',atlanta,404/266-1440,french,77 80 | 612,'buckhead diner','3073 piedmont road',atlanta,404/262-3336,american,78 81 | 613,ciboulette,'1529 piedmont ave.',atlanta,404/874-7600,french,79 82 | 614,delectables,'1 margaret mitchell sq.',atlanta,404/681-2909,american,80 83 | 615,'georgia grille','2290 peachtree rd. peachtree square shopping center',atlanta,404/352-3517,american,81 84 | 616,'hedgerose heights inn','490 e. paces ferry rd.',atlanta,404/233-7673,international,82 85 | 617,'heera of india','595 piedmont ave. rio shopping mall',atlanta,404/876-4408,asian,83 86 | 618,'indigo coastal grill','1397 n. highland ave.',atlanta,404/876-0676,caribbean,84 87 | 619,'la grotta','2637 peachtree rd. peachtree house condominium',atlanta,404/231-1368,italian,85 88 | 620,'mary mac\'s tea room','224 ponce de leon ave.',atlanta,404/876-1800,southern,86 89 | 621,'nikolai\'s roof','255 courtland st. at harris st.',atlanta,404/221-6362,continental,87 90 | 622,'pano\'s and paul\'s','1232 w. paces ferry rd.',atlanta,404/261-3662,international,88 91 | 623,'cafe ritz-carlton buckhead','3434 peachtree rd.',atlanta,404/237-2700,'ext 6108 international',89 92 | 624,'dining room ritz-carlton buckhead','3434 peachtree rd.',atlanta,404/237-2700,international,90 93 | 625,'restaurant ritz-carlton atlanta','181 peachtree st.',atlanta,404/659-0400,continental,91 94 | 626,toulouse,'b peachtree rd.',atlanta,404/351-9533,french,92 95 | 627,'veni vidi vici','41 14th st.',atlanta,404/875-8424,italian,93 96 | 628,'alain rondelli','126 clement st.','san francisco',415/387-0408,french,94 97 | 629,aqua,'252 california st.','san francisco',415/956-9662,seafood,95 98 | 630,boulevard,'1 mission st.','san francisco',415/543-6084,american,96 99 | 631,'cafe claude','7 claude la.','san francisco',415/392-3505,french,97 100 | 632,'campton place','340 stockton st.','san francisco',415/955-5555,american,98 101 | 633,'chez michel','804 northpoint','san francisco',415/775-7036,french,99 102 | 634,'fleur de lys','777 sutter st.','san francisco',415/673-7779,french,100 103 | 635,fringale,'570 4th st.','san francisco',415/543-0573,french,101 104 | 636,'hawthorne lane','22 hawthorne st.','san francisco',415/777-9779,american,102 105 | 637,'khan toke thai house','5937 geary blvd.','san francisco',415/668-6654,asian,103 106 | 638,'la folie','2316 polk st.','san francisco',415/776-5577,french,104 107 | 639,lulu,'816 folsom st.','san francisco',415/495-5775,mediterranean,105 108 | 640,'masa\'s','648 bush st.','san francisco',415/989-7154,french,106 109 | 641,'mifune japan center kintetsu building','1737 post st.','san francisco',415/922-0337,asian,107 110 | 642,'plumpjack cafe','3201 fillmore st.','san francisco',415/563-4755,mediterranean,108 111 | 643,postrio,'545 post st.','san francisco',415/776-7825,american,109 112 | 644,'ritz-carlton restaurant and dining room','600 stockton st.','san francisco',415/296-7465,american,110 113 | 645,'rose pistola','532 columbus ave.','san francisco',415/399-0499,italian,111 114 | 646,bolo,'23 e. 22nd st.','new york',212/228-2200,mediterranean,191 115 | 647,'il nido','251 e. 53rd st.','new york',212/753-8450,italian,267 116 | 648,remi,'145 w. 53rd st.','new york',212/581-4242,italian,334 117 | 649,'adriano\'s ristorante','2930 beverly glen circle','los angeles',310/475-9807,italian,112 118 | 650,'barney greengrass','9570 wilshire blvd.','beverly hills',310/777-5877,american,113 119 | 651,beaurivage,'26025 pacific coast hwy.',malibu,310/456-5733,french,114 120 | 652,'bistro garden','176 n. canon dr.','los angeles',310/550-3900,californian,115 121 | 653,'border grill','4th st.','los angeles',310/451-1655,mexican,116 122 | 654,'broadway deli','3rd st. promenade','santa monica',310/451-0616,american,117 123 | 655,'ca\'brea','346 s. la brea ave.','los angeles',213/938-2863,italian,118 124 | 656,'ca\'del sol','4100 cahuenga blvd.','los angeles',818/985-4669,italian,119 125 | 657,'cafe pinot','700 w. fifth st.','los angeles',213/239-6500,californian,120 126 | 658,'california pizza kitchen','207 s. beverly dr.','los angeles',310/275-1101,californian,121 127 | 659,'canter\'s','419 n. fairfax ave.','los angeles',213/651-2030.,american,122 128 | 660,cava,'3rd st.','los angeles',213/658-8898,mediterranean,123 129 | 661,'cha cha cha','656 n. virgil ave.','los angeles',213/664-7723,caribbean,124 130 | 662,'chan dara','310 n. larchmont blvd.','los angeles',213/467-1052,asian,125 131 | 663,'clearwater cafe','168 w. colorado blvd.','los angeles',818/356-0959,'health food',126 132 | 664,'dining room','9500 wilshire blvd.','los angeles',310/275-5200,californian,127 133 | 665,dive!,'10250 santa monica blvd.','los angeles',310/788-,'dive american',128 134 | 666,drago,'2628 wilshire blvd.','santa monica',310/828-1585,italian,129 135 | 667,'drai\'s','730 n. la cienega blvd.','los angeles',310/358-8585,french,130 136 | 668,'dynasty room','930 hilgard ave.','los angeles',310/208-8765,continental,131 137 | 669,eclipse,'8800 melrose ave.','los angeles',310/724-5959,californian,132 138 | 670,'ed debevic\'s','134 n. la cienega','los angeles',310/659-1952,american,133 139 | 671,'el cholo','1121 s. western ave.','los angeles',213/734-2773,mexican,134 140 | 672,'gilliland\'s','2424 main st.','santa monica',310/392-3901,american,135 141 | 673,'gladstone\'s','4 fish 17300 pacific coast hwy. at sunset blvd.','pacific palisades',310/454-3474,american,136 142 | 674,'hard rock cafe','8600 beverly blvd.','los angeles',310/276-7605,american,137 143 | 675,'harry\'s bar & american grill','2020 ave. of the stars','los angeles',310/277-2333,italian,138 144 | 676,'il fornaio cucina italiana','301 n. beverly dr.','los angeles',310/550-8330,italian,139 145 | 677,'jack sprat\'s grill','10668 w. pico blvd.','los angeles',310/837-6662,'health food',140 146 | 678,'jackson\'s farm','439 n. beverly drive','los angeles',310/273-5578,californian,141 147 | 679,'jimmy\'s','201 moreno dr.','los angeles',310/552-2394,continental,142 148 | 680,joss,'9255 sunset blvd.','los angeles',310/276-1886,asian,143 149 | 681,'le colonial','8783 beverly blvd.','los angeles',310/289-0660,asian,144 150 | 682,'le dome','8720 sunset blvd.','los angeles',310/659-6919,french,145 151 | 683,'louise\'s trattoria','4500 los feliz blvd.','los angeles',213/667-0777,italian,146 152 | 684,'mon kee seafood restaurant','679 n. spring st.','los angeles',213/628-6717,asian,147 153 | 685,'morton\'s','8764 melrose ave.','los angeles',310/276-5205,american,148 154 | 686,'nate \'n\' al\'s','414 n. beverly dr.','los angeles',310/274-0101,american,149 155 | 687,nicola,'601 s. figueroa st.','los angeles',213/485-0927,american,150 156 | 688,'ocean avenue','1401 ocean ave.','santa monica',310/394-5669,american,151 157 | 689,orleans,'11705 national blvd.','los angeles',310/479-4187,cajun,152 158 | 690,'pacific dining car','6th st.','los angeles',213/483-6000,american,153 159 | 691,'paty\'s','10001 riverside dr.','toluca lake',818/761-9126,american,154 160 | 692,'pinot hollywood','1448 n. gower st.','los angeles',213/461-8800,californian,155 161 | 693,posto,'14928 ventura blvd.','sherman oaks',818/784-4400,italian,156 162 | 694,prego,'362 n. camden dr.','los angeles',310/277-7346,italian,157 163 | 695,'rj\'s the rib joint','252 n. beverly dr.','los angeles',310/274-7427,american,158 164 | 696,remi,'3rd st. promenade','santa monica',310/393-6545,italian,159 165 | 697,'restaurant horikawa','111 s. san pedro st.','los angeles',213/680-9355,asian,160 166 | 698,'roscoe\'s house of chicken \'n\' waffles','1514 n. gower st.','los angeles',213/466-9329,american,161 167 | 699,'schatzi on main','3110 main st.','los angeles',310/399-4800,continental,162 168 | 700,sofi,'3rd st.','los angeles',213/651-0346,mediterranean,163 169 | 701,swingers,'8020 beverly blvd.','los angeles',213/653-5858,american,164 170 | 702,'tavola calda','7371 melrose ave.','los angeles',213/658-6340,italian,165 171 | 703,'the mandarin','430 n. camden dr.','los angeles',310/859-0926,asian,166 172 | 704,'tommy tang\'s','7313 melrose ave.','los angeles',213/937-5733,asian,167 173 | 705,'tra di noi','3835 cross creek rd.','los angeles',310/456-0169,italian,168 174 | 706,'trader vic\'s','9876 wilshire blvd.','los angeles',310/276-6345,asian,169 175 | 707,vida,'1930 north hillhurst ave.','los feliz',213/660-4446,american,170 176 | 708,'west beach cafe','60 n. venice blvd.','los angeles',310/823-5396,american,171 177 | 709,'20 mott','20 mott st. between bowery and pell st.','new york',212/964-0380,asian,172 178 | 710,'9 jones street','9 jones st.','new york',212/989-1220,american,173 179 | 711,adrienne,'700 5th ave. at 55th st.','new york',212/903-3918,french,174 180 | 712,agrotikon,'322 e. 14 st. between 1st and 2nd aves.','new york',212/473-2602,mediterranean,175 181 | 713,aja,'937 broadway at 22nd st.','new york',212/473-8388,american,176 182 | 714,alamo,'304 e. 48th st.','new york','212/ 759-0590',mexican,177 183 | 715,'alley\'s end','311 w. 17th st.','new york',212/627-8899,american,178 184 | 716,'ambassador grill','1 united nations plaza at 44th st.','new york',212/702-5014,american,179 185 | 717,'american place','2 park ave. at 32nd st.','new york',212/684-2122,american,180 186 | 718,'anche vivolo','222 e. 58th st. between 2nd and 3rd aves.','new york',212/308-0112,italian,181 187 | 719,arizona,'206 206 e. 60th st.','new york',212/838-0440,american,182 188 | 720,'arturo\'s','106 w. houston st. off thompson st.','new york',212/677-3820,italian,183 189 | 721,'au mandarin','200-250 vesey st. world financial center','new york',212/385-0313,asian,184 190 | 722,'bar anise','1022 3rd ave. between 60th and 61st sts.','new york',212/355-1112,mediterranean,185 191 | 723,barbetta,'321 w. 46th st.','new york',212/246-9171,italian,186 192 | 724,'ben benson\'s','123 w. 52nd st.','new york',212/581-8888,american,187 193 | 725,'big cup','228 8th ave. between 21st and 22nd sts.','new york',212/206-0059,'coffee bar',188 194 | 726,'billy\'s','948 1st ave. between 52nd and 53rd sts.','new york',212/753-1870,american,189 195 | 727,'boca chica','13 1st ave. near 1st st.','new york',212/473-0108,'latin american',190 196 | 728,boonthai,'1393a 2nd ave. between 72nd and 73rd sts.','new york',212/249-8484,asian,192 197 | 729,bouterin,'420 e. 59th st. off 1st ave.','new york',212/758-0323,french,193 198 | 730,'brothers bar-b-q','225 varick st. at clarkston st.','new york',212/727-2775,american,194 199 | 731,bruno,'240 e. 58th st.','new york',212/688-4190,italian,195 200 | 732,'bryant park grill roof restaurant and bp cafe','25 w. 40th st. between 5th and 6th aves.','new york',212/840-6500,american,196 201 | 733,c3,'103 waverly pl. near washington sq.','new york',212/254-1200,american,197 202 | 734,ct,'111 e. 22nd st. between park ave. s and lexington ave.','new york',212/995-8500,french,198 203 | 735,'cafe bianco','1486 2nd ave. between 77th and 78th sts.','new york',212/988-2655,'coffee bar',199 204 | 736,'cafe botanica','160 central park s','new york',212/484-5120,french,200 205 | 737,'cafe la fortuna','69 w. 71st st.','new york',212/724-5846,'coffee bar',201 206 | 738,'cafe luxembourg','200 w. 70th st.','new york',212/873-7411,french,202 207 | 739,'cafe pierre','2 e. 61st st.','new york',212/940-8185,french,203 208 | 740,'cafe centro','200 park ave. between 45th st. and vanderbilt ave.','new york',212/818-1222,french,204 209 | 741,'cafe fes','246 w. 4th st. at charles st.','new york',212/924-7653,mediterranean,205 210 | 742,'caffe dante','81 macdougal st. between houston and bleeker sts.','new york',212/982-5275,'coffee bar',206 211 | 743,'caffe dell\'artista','46 greenwich ave.','new york',212/645-4431,'coffee bar',207 212 | 744,'caffe lure','169 sullivan st. between houston and bleecker sts.','new york',212/473-2642,french,208 213 | 745,'caffe reggio','119 macdougal st. between 3rd and bleecker sts.','new york',212/475-9557,'coffee bar',209 214 | 746,'caffe roma','385 broome st. at mulberry','new york',212/226-8413,'coffee bar',210 215 | 747,'caffe vivaldi','32 jones st. at bleecker st.','new york',212/691-7538,'coffee bar',211 216 | 748,'caffe bondi ristorante','7 w. 20th st.','new york',212/691-8136,italian,212 217 | 749,'capsouto freres','451 washington st. near watts st.','new york',212/966-4900,french,213 218 | 750,'captain\'s table','860 2nd ave. at 46th st.','new york',212/697-9538,seafood,214 219 | 751,'casa la femme','150 wooster st. between houston and prince sts.','new york',212/505-0005,'middle eastern',215 220 | 752,'cendrillon asian grill & marimba bar','45 mercer st. between broome and grand sts.','new york',212/343-9012,asian,216 221 | 753,'chez jacqueline','72 macdougal st. between w. houston and bleecker sts.','new york',212/505-0727,french,217 222 | 754,chiam,'160 e. 48th st.','new york',212/371-2323,asian,218 223 | 755,'china grill','60 w. 53rd st.','new york',212/333-7788,american,219 224 | 756,cite,'120 w. 51st st.','new york',212/956-7100,french,220 225 | 757,'coco pazzo','23 e. 74th st.','new york',212/794-0205,italian,221 226 | 758,'columbus bakery','53rd sts.','new york',212/421-0334,'coffee bar',222 227 | 759,'corrado cafe','1013 3rd ave. between 60th and 61st sts.','new york',212/753-5100,'coffee bar',223 228 | 760,'cupcake cafe','522 9th ave. at 39th st.','new york',212/465-1530,'coffee bar',224 229 | 761,'da nico','164 mulberry st. between grand and broome sts.','new york',212/343-1212,italian,225 230 | 762,'dean & deluca','121 prince st.','new york',212/254-8776,'coffee bar',226 231 | 763,diva,'341 w. broadway near grand st.','new york',212/941-9024,italian,227 232 | 764,'dix et sept','181 w. 10th st.','new york',212/645-8023,french,228 233 | 765,docks,'633 3rd ave. at 40th st.','new york','212/ 986-8080',seafood,229 234 | 766,'duane park cafe','157 duane st. between w. broadway and hudson st.','new york',212/732-5555,american,230 235 | 767,'el teddy\'s','219 w. broadway between franklin and white sts.','new york',212/941-7070,mexican,231 236 | 768,'emily\'s','1325 5th ave. at 111th st.','new york',212/996-1212,american,232 237 | 769,'empire korea','6 e. 32nd st.','new york',212/725-1333,asian,233 238 | 770,'ernie\'s','2150 broadway between 75th and 76th sts.','new york',212/496-1588,american,234 239 | 771,'evergreen cafe','1288 1st ave. at 69th st.','new york',212/744-3266,asian,235 240 | 772,'f. ille ponte ristorante','39 desbrosses st. near west st.','new york',212/226-4621,italian,236 241 | 773,felix,'340 w. broadway at grand st.','new york',212/431-0021,french,237 242 | 774,ferrier,'29 e. 65th st.','new york',212/772-9000,french,238 243 | 775,'fifty seven fifty seven','57 e. 57th st.','new york',212/758-5757,american,239 244 | 776,'film center cafe','635 9th ave. between 44th and 45th sts.','new york','212/ 262-2525',american,240 245 | 777,'fiorello\'s roman cafe','1900 broadway between 63rd and 64th sts.','new york',212/595-5330,italian,241 246 | 778,firehouse,'522 columbus ave. between 85th and 86th sts.','new york',212/595-3139,american,242 247 | 779,first,'87 1st ave. between 5th and 6th sts.','new york',212/674-3823,american,243 248 | 780,'fishin eddie','73 w. 71st st.','new york',212/874-3474,seafood,244 249 | 781,'fleur de jour','348 e. 62nd st.','new york',212/355-2020,'coffee bar',245 250 | 782,flowers,'21 west 17th st. between 5th and 6th aves.','new york',212/691-8888,american,246 251 | 783,follonico,'6 w. 24th st.','new york',212/691-6359,italian,247 252 | 784,'fraunces tavern','54 pearl st. at broad st.','new york',212/269-0144,american,248 253 | 785,'french roast','458 6th ave. at 11th st.','new york',212/533-2233,french,249 254 | 786,'french roast cafe','2340 broadway at 85th st.','new york',212/799-1533,'coffee bar',250 255 | 787,'frico bar','402 w. 43rd st. off 9th ave.','new york',212/564-7272,italian,251 256 | 788,'fujiyama mama','467 columbus ave. between 82nd and 83rd sts.','new york',212/769-1144,asian,252 257 | 789,'gabriela\'s','685 amsterdam ave. at 93rd st.','new york',212/961-0574,mexican,253 258 | 790,'gallagher\'s','228 w. 52nd st.','new york',212/245-5336,american,254 259 | 791,'gianni\'s','15 fulton st.','new york',212/608-7300,seafood,255 260 | 792,girafe,'208 e. 58th st. between 2nd and 3rd aves.','new york',212/752-3054,italian,256 261 | 793,global,'33 93 2nd ave. between 5th and 6th sts.','new york',212/477-8427,american,257 262 | 794,'golden unicorn','18 e. broadway at catherine st.','new york','212/ 941-0911',asian,258 263 | 795,'grand ticino','228 thompson st. between w. 3rd and bleecker sts.','new york',212/777-5922,italian,259 264 | 796,halcyon,'151 w. 54th st. in the rihga royal hotel','new york',212/468-8888,american,260 265 | 797,'hard rock cafe','221 w. 57th st.','new york',212/489-6565,american,261 266 | 798,'hi-life restaurant and lounge','1340 1st ave. at 72nd st.','new york',212/249-3600,american,262 267 | 799,home,'20 cornelia st. between bleecker and w. 4th st.','new york',212/243-9579,american,263 268 | 800,'hudson river club','4 world financial center','new york',212/786-1500,american,264 269 | 801,'i trulli','122 e. 27th st. between lexington and park aves.','new york',212/481-7372,italian,265 270 | 802,'il cortile','125 mulberry st. between canal and hester sts.','new york',212/226-6060,italian,266 271 | 803,'inca grill','492 broome st. near w. broadway','new york',212/966-3371,'latin american',268 272 | 804,indochine,'430 lafayette st. between 4th st. and astor pl.','new york',212/505-5111,asian,269 273 | 805,'internet cafe','82 e. 3rd st. between 1st and 2nd aves.','new york','212/ 614-0747','coffee bar',270 274 | 806,ipanema,'13 w. 46th st.','new york',212/730-5848,'latin american',271 275 | 807,'jean lafitte','68 w. 58th st.','new york',212/751-2323,french,272 276 | 808,'jewel of india','15 w. 44th st.','new york',212/869-5544,asian,273 277 | 809,'jimmy sung\'s','219 e. 44th st. between 2nd and 3rd aves.','new york',212/682-5678,asian,274 278 | 810,'joe allen','326 w. 46th st.','new york',212/581-6464,american,275 279 | 811,'judson grill','152 w. 52nd st.','new york',212/582-5252,american,276 280 | 812,'l\'absinthe','227 e. 67th st.','new york',212/794-4950,french,277 281 | 813,'l\'auberge','1191 1st ave. between 64th and 65th sts.','new york',212/288-8791,'middle eastern',278 282 | 814,'l\'auberge du midi','310 w. 4th st. between w. 12th and bank sts.','new york',212/242-4705,french,279 283 | 815,'l\'udo','432 lafayette st. near astor pl.','new york',212/388-0978,french,280 284 | 816,'la reserve','4 w. 49th st.','new york',212/247-2993,french,281 285 | 817,'lanza restaurant','168 1st ave. between 10th and 11th sts.','new york',212/674-7014,italian,282 286 | 818,'lattanzi ristorante','361 w. 46th st.','new york',212/315-0980,italian,283 287 | 819,layla,'211 w. broadway at franklin st.','new york',212/431-0700,'middle eastern',284 288 | 820,'le chantilly','106 e. 57th st.','new york',212/751-2931,french,285 289 | 821,'le colonial','149 e. 57th st.','new york','212/ 752-0808',asian,286 290 | 822,'le gamin','50 macdougal st. between houston and prince sts.','new york',212/254-4678,'coffee bar',287 291 | 823,'le jardin','25 cleveland pl. near spring st.','new york',212/343-9599,french,288 292 | 824,'le madri','168 w. 18th st.','new york',212/727-8022,italian,289 293 | 825,'le marais','150 w. 46th st.','new york',212/869-0900,american,290 294 | 826,'le perigord','405 e. 52nd st.','new york',212/755-6244,french,291 295 | 827,'le select','507 columbus ave. between 84th and 85th sts.','new york',212/875-1993,american,292 296 | 828,'les halles','411 park ave. s between 28th and 29th sts.','new york',212/679-4111,french,293 297 | 829,'lincoln tavern','51 w. 64th st.','new york',212/721-8271,american,294 298 | 830,lola,'30 west 22nd st. between 5th and 6th ave.','new york',212/675-6700,american,295 299 | 831,'lucky strike','59 grand st. between wooster st. and w. broadway','new york',212/941-0479,'or 212/941-0772 american',296 300 | 832,'mad fish','2182 broadway between 77th and 78th sts.','new york',212/787-0202,seafood,297 301 | 833,'main street','446 columbus ave. between 81st and 82nd sts.','new york',212/873-5025,american,298 302 | 834,'mangia e bevi','800 9th ave. at 53rd st.','new york',212/956-3976,italian,299 303 | 835,'manhattan cafe','1161 1st ave. between 63rd and 64th sts.','new york',212/888-6556,american,300 304 | 836,'manila garden','325 e. 14th st. between 1st and 2nd aves.','new york',212/777-6314,asian,301 305 | 837,marichu,'342 e. 46th st. between 1st and 2nd aves.','new york',212/370-1866,french,302 306 | 838,'marquet patisserie','15 e. 12th st. between 5th ave. and university pl.','new york',212/229-9313,'coffee bar',303 307 | 839,match,'160 mercer st. between houston and prince sts.','new york',212/906-9173,american,304 308 | 840,'matthew\'s','1030 3rd ave. at 61st st.','new york',212/838-4343,american,305 309 | 841,'mavalli palace','46 e. 29th st.','new york',212/679-5535,asian,306 310 | 842,'milan cafe and coffee bar','120 w. 23rd st.','new york',212/807-1801,'coffee bar',307 311 | 843,'monkey bar','60 e. 54th st.','new york',212/838-2600,american,308 312 | 844,montien,'1134 1st ave. between 62nd and 63rd sts.','new york',212/421-4433,asian,309 313 | 845,'morton\'s','551 5th ave. at 45th st.','new york',212/972-3315,american,310 314 | 846,'motown cafe','104 w. 57th st. near 6th ave.','new york',212/581-8030,american,311 315 | 847,'new york kom tang soot bul house','32 w. 32nd st.','new york','212/ 947-8482',asian,312 316 | 848,'new york noodletown','28 1/2 bowery at bayard st.','new york',212/349-0923,asian,313 317 | 849,newsbar,'2 w. 19th st.','new york',212/255-3996,'coffee bar',314 318 | 850,odeon,'145 w. broadway at thomas st.','new york',212/233-0507,american,315 319 | 851,orso,'322 w. 46th st.','new york',212/489-7212,italian,316 320 | 852,'osteria al droge','142 w. 44th st.','new york',212/944-3643,italian,317 321 | 853,otabe,'68 e. 56th st.','new york',212/223-7575,asian,318 322 | 854,pacifica,'138 lafayette st. between canal and howard sts.','new york',212/941-4168,asian,319 323 | 855,palio,'151 w. 51st. st.','new york',212/245-4850,italian,320 324 | 856,pamir,'1065 1st ave. at 58th st.','new york',212/644-9258,'middle eastern',321 325 | 857,'parioli romanissimo','24 e. 81st st.','new york',212/288-2391,italian,322 326 | 858,patria,'250 park ave. s at 20th st.','new york',212/777-6211,'latin american',323 327 | 859,'peacock alley','301 park ave. between 49th and 50th sts.','new york',212/872-4895,french,324 328 | 860,'pen & pencil','205 e. 45th st.','new york',212/682-8660,american,325 329 | 861,'penang soho','109 spring st. between greene and mercer sts.','new york',212/274-8883,asian,326 330 | 862,persepolis,'1423 2nd ave. between 74th and 75th sts.','new york',212/535-1100,'middle eastern',327 331 | 863,'planet hollywood','140 w. 57th st.','new york',212/333-7827,american,328 332 | 864,pomaire,'371 w. 46th st. off 9th ave.','new york','212/ 956-3055','latin american',329 333 | 865,'popover cafe','551 amsterdam ave. between 86th and 87th sts.','new york',212/595-8555,american,330 334 | 866,'post house','28 e. 63rd st.','new york',212/935-2888,american,331 335 | 867,rain,'100 w. 82nd st.','new york',212/501-0776,asian,332 336 | 868,'red tulip','439 e. 75th st.','new york',212/734-4893,'eastern european',333 337 | 869,republic,'37a union sq. w between 16th and 17th sts.','new york',212/627-7172,asian,335 338 | 870,'roettelle a. g','126 e. 7th st. between 1st ave. and ave. a','new york',212/674-4140,continental,336 339 | 871,'rosa mexicano','1063 1st ave. at 58th st.','new york',212/753-7407,mexican,337 340 | 872,'ruth\'s chris','148 w. 51st st.','new york',212/245-9600,american,338 341 | 873,s.p.q.r,'133 mulberry st. between hester and grand sts.','new york',212/925-3120,italian,339 342 | 874,'sal anthony\'s','55 irving pl.','new york',212/982-9030,italian,340 343 | 875,'sammy\'s roumanian steak house','157 chrystie st. at delancey st.','new york',212/673-0330,'east european',341 344 | 876,'san pietro','18 e. 54th st.','new york',212/753-9015,italian,342 345 | 877,'sant ambroeus','1000 madison ave. between 77th and 78th sts.','new york',212/570-2211,'coffee bar',343 346 | 878,'sarabeth\'s kitchen','423 amsterdam ave. between 80th and 81st sts.','new york',212/496-6280,american,344 347 | 879,'sea grill','19 w. 49th st.','new york',212/332-7610,seafood,345 348 | 880,serendipity,'3 225 e. 60th st.','new york',212/838-3531,american,346 349 | 881,'seventh regiment mess and bar','643 park ave. at 66th st.','new york',212/744-4107,american,347 350 | 882,sfuzzi,'58 w. 65th st.','new york',212/873-3700,american,348 351 | 883,shaan,'57 w. 48th st.','new york','212/ 977-8400',asian,349 352 | 884,'sofia fabulous pizza','1022 madison ave. near 79th st.','new york',212/734-2676,italian,350 353 | 885,'spring street natural restaurant & bar','62 spring st. at lafayette st.','new york',212/966-0290,american,351 354 | 886,'stage deli','834 7th ave. between 53rd and 54th sts.','new york',212/245-7850,delicatessen,352 355 | 887,stingray,'428 amsterdam ave. between 80th and 81st sts.','new york',212/501-7515,seafood,353 356 | 888,'sweet\'n\'tart cafe','76 mott st. at canal st.','new york',212/334-8088,asian,354 357 | 889,'t salon','143 mercer st. at prince st.','new york',212/925-3700,'coffee bar',355 358 | 890,'tang pavillion','65 w. 55th st.','new york',212/956-6888,asian,356 359 | 891,tapika,'950 8th ave. at 56th st.','new york','212/ 397-3737',american,357 360 | 892,'teresa\'s','103 1st ave. between 6th and 7th sts.','new york',212/228-0604,'east european',358 361 | 893,terrace,'400 w. 119th st. between amsterdam and morningside aves.','new york',212/666-9490,continental,359 362 | 894,'the coffee pot','350 9th ave. at 49th st.','new york',212/265-3566,'coffee bar',360 363 | 895,'the savannah club','2420 broadway at 89th st.','new york',212/496-1066,american,361 364 | 896,'trattoria dell\'arte','900 7th ave. between 56th and 57th sts.','new york',212/245-9800,italian,362 365 | 897,triangolo,'345 e. 83rd st.','new york',212/472-4488,italian,363 366 | 898,'tribeca grill','375 greenwich st. near franklin st.','new york',212/941-3900,american,364 367 | 899,'trois jean','154 e. 79th st. between lexington and 3rd aves.','new york',212/988-4858,'coffee bar',365 368 | 900,'tse yang','34 e. 51st st.','new york',212/688-5447,asian,366 369 | 901,'turkish kitchen','386 3rd ave. between 27th and 28th sts.','new york',212/679-1810,'middle eastern',367 370 | 902,'two two two','222 w. 79th st.','new york',212/799-0400,american,368 371 | 903,'veniero\'s pasticceria','342 e. 11th st. near 1st ave.','new york',212/674-7264,'coffee bar',369 372 | 904,verbena,'54 irving pl. at 17th st.','new york',212/260-5454,american,370 373 | 905,'victor\'s cafe','52 236 w. 52nd st.','new york',212/586-7714,'latin american',371 374 | 906,'vince & eddie\'s','70 w. 68th st.','new york',212/721-0068,american,372 375 | 907,vong,'200 e. 54th st.','new york',212/486-9592,american,373 376 | 908,'water club','500 e. 30th st.','new york',212/683-3333,american,374 377 | 909,west,'63rd street steakhouse 44 w. 63rd st.','new york',212/246-6363,american,375 378 | 910,xunta,'174 1st ave. between 10th and 11th sts.','new york',212/614-0620,mediterranean,376 379 | 911,'zen palate','34 union sq. e at 16th st.','new york',212/614-9291,'and 212/614-9345 asian',377 380 | 912,zoe,'90 prince st. between broadway and mercer st.','new york',212/966-6722,american,378 381 | 913,abbey,'163 ponce de leon ave.',atlanta,404/876-8532,international,379 382 | 914,'aleck\'s barbecue heaven','783 martin luther king jr. dr.',atlanta,404/525-2062,barbecue,380 383 | 915,'annie\'s thai castle','3195 roswell rd.',atlanta,404/264-9546,asian,381 384 | 916,anthonys,'3109 piedmont rd. just south of peachtree rd.',atlanta,404/262-7379,american,382 385 | 917,'atlanta fish market','265 pharr rd.',atlanta,404/262-3165,american,383 386 | 918,'beesley\'s of buckhead','260 e. paces ferry road',atlanta,404/264-1334,continental,384 387 | 919,'bertolini\'s','3500 peachtree rd. phipps plaza',atlanta,404/233-2333,italian,385 388 | 920,bistango,'1100 peachtree st.',atlanta,404/724-0901,mediterranean,386 389 | 921,'cafe renaissance','7050 jimmy carter blvd. norcross',atlanta,770/441--0291,american,387 390 | 922,'camille\'s','1186 n. highland ave.',atlanta,404/872-7203,italian,388 391 | 923,cassis,'3300 peachtree rd. grand hyatt',atlanta,404/365-8100,mediterranean,389 392 | 924,'city grill','50 hurt plaza',atlanta,404/524-2489,international,390 393 | 925,'coco loco','40 buckhead crossing mall on the sidney marcus blvd.',atlanta,404/364-0212,caribbean,391 394 | 926,'colonnade restaurant','1879 cheshire bridge rd.',atlanta,404/874-5642,southern,392 395 | 927,'dante\'s down the hatch buckhead','3380 peachtree rd.',atlanta,404/266-1600,continental,393 396 | 928,'dante\'s down the hatch','underground underground mall underground atlanta',atlanta,404/577-1800,continental,394 397 | 929,'fat matt\'s rib shack','1811 piedmont ave. near cheshire bridge rd.',atlanta,404/607-1622,barbecue,395 398 | 930,'french quarter food shop','923 peachtree st. at 8th st.',atlanta,404/875-2489,southern,396 399 | 931,'holt bros. bar-b-q','6359 jimmy carter blvd. at buford hwy. norcross',atlanta,770/242-3984,barbecue,397 400 | 932,'horseradish grill','4320 powers ferry rd.',atlanta,404/255-7277,southern,398 401 | 933,'hsu\'s gourmet','192 peachtree center ave. at international blvd.',atlanta,404/659-2788,asian,399 402 | 934,'imperial fez','2285 peachtree rd. peachtree battle condominium',atlanta,404/351-0870,mediterranean,400 403 | 935,kamogawa,'3300 peachtree rd. grand hyatt',atlanta,404/841-0314,asian,401 404 | 936,'la grotta at ravinia dunwoody rd.','holiday inn/crowne plaza at ravinia dunwoody',atlanta,770/395-9925,italian,402 405 | 937,'little szechuan','c buford hwy. northwoods plaza doraville',atlanta,770/451-0192,asian,403 406 | 938,'lowcountry barbecue','6301 roswell rd. sandy springs plaza sandy springs',atlanta,404/255-5160,barbecue,404 407 | 939,'luna si','1931 peachtree rd.',atlanta,404/355-5993,continental,405 408 | 940,'mambo restaurante cubano','1402 n. highland ave.',atlanta,404/874-2626,caribbean,406 409 | 941,'mckinnon\'s louisiane','3209 maple dr.',atlanta,404/237-1313,southern,407 410 | 942,'mi spia dunwoody rd.','park place across from perimeter mall dunwoody',atlanta,770/393-1333,italian,408 411 | 943,'nickiemoto\'s: a sushi bar','247 buckhead ave. east village sq.',atlanta,404/842-0334,fusion,409 412 | 944,palisades,'1829 peachtree rd.',atlanta,404/350-6755,continental,410 413 | 945,'pleasant peasant','555 peachtree st. at linden ave.',atlanta,404/874-3223,american,411 414 | 946,pricci,'500 pharr rd.',atlanta,404/237-2941,italian,412 415 | 947,'r.j.\'s uptown kitchen & wine bar','870 n. highland ave.',atlanta,404/875-7775,american,413 416 | 948,'rib ranch','25 irby ave.',atlanta,404/233-7644,barbecue,414 417 | 949,'sa tsu ki','3043 buford hwy.',atlanta,404/325-5285,asian,415 418 | 950,'sato sushi and thai','6050 peachtree pkwy. norcross',atlanta,770/449-0033,asian,416 419 | 951,'south city kitchen','1144 crescent ave.',atlanta,404/873-7358,southern,417 420 | 952,'south of france','2345 cheshire bridge rd.',atlanta,404/325-6963,french,418 421 | 953,'stringer\'s fish camp and oyster bar','3384 shallowford rd. chamblee',atlanta,770/458-7145,southern,419 422 | 954,'sundown cafe','2165 cheshire bridge rd.',atlanta,404/321-1118,american,420 423 | 955,'taste of new orleans','889 w. peachtree st.',atlanta,404/874-5535,southern,421 424 | 956,tomtom,'3393 peachtree rd.',atlanta,404/264-1163,continental,422 425 | 957,'antonio\'s','3700 w. flamingo','las vegas',702/252-7737,italian,423 426 | 958,'bally\'s big kitchen','3645 las vegas blvd. s','las vegas',702/739-4111,buffets,424 427 | 959,'bamboo garden','4850 flamingo rd.','las vegas',702/871-3262,asian,425 428 | 960,'battista\'s hole in the wall','4041 audrie st. at flamingo rd.','las vegas',702/732-1424,italian,426 429 | 961,'bertolini\'s','3570 las vegas blvd. s','las vegas',702/735-4663,italian,427 430 | 962,'binion\'s coffee shop','128 fremont st.','las vegas',702/382-1600,'coffee shops/diners',428 431 | 963,bistro,'3400 las vegas blvd. s','las vegas',702/791-7111,continental,429 432 | 964,broiler,'4111 boulder hwy.','las vegas',702/432-7777,american,430 433 | 965,'bugsy\'s diner','3555 las vegas blvd. s','las vegas',702/733-3111,'coffee shops/diners',431 434 | 966,'cafe michelle','1350 e. flamingo rd.','las vegas',702/735-8686,american,432 435 | 967,'cafe roma','3570 las vegas blvd. s','las vegas',702/731-7547,'coffee shops/diners',433 436 | 968,'capozzoli\'s','3333 s. maryland pkwy.','las vegas',702/731-5311,italian,434 437 | 969,'carnival world','3700 w. flamingo rd.','las vegas',702/252-7777,buffets,435 438 | 970,'center stage plaza hotel','1 main st.','las vegas',702/386-2512,american,436 439 | 971,'circus circus','2880 las vegas blvd. s','las vegas',702/734-0410,buffets,437 440 | 972,'empress court','3570 las vegas blvd. s','las vegas',702/731-7888,asian,438 441 | 973,feast,'2411 w. sahara ave.','las vegas',702/367-2411,buffets,439 442 | 974,'golden nugget hotel','129 e. fremont st.','las vegas',702/385-7111,buffets,440 443 | 975,'golden steer','308 w. sahara ave.','las vegas',702/384-4470,'steak houses',441 444 | 976,'lillie langtry\'s','129 e. fremont st.','las vegas',702/385-7111,asian,442 445 | 977,'mandarin court','1510 e. flamingo rd.','las vegas',702/737-1234,asian,443 446 | 978,'margarita\'s mexican cantina','3120 las vegas blvd. s','las vegas',702/794-8200,mexican,444 447 | 979,'mary\'s diner','5111 w. boulder hwy.','las vegas',702/454-8073,'coffee shops/diners',445 448 | 980,mikado,'3400 las vegas blvd. s','las vegas',702/791-7111,asian,446 449 | 981,pamplemousse,'400 e. sahara ave.','las vegas',702/733-2066,continental,447 450 | 982,'ralph\'s diner','3000 las vegas blvd. s','las vegas',702/732-6330,'coffee shops/diners',448 451 | 983,'the bacchanal','3570 las vegas blvd. s','las vegas',702/731-7525,'only in las vegas',449 452 | 984,venetian,'3713 w. sahara ave.','las vegas',702/876-4190,italian,450 453 | 985,'viva mercado\'s','6182 w. flamingo rd.','las vegas',702/871-8826,mexican,451 454 | 986,'yolie\'s','3900 paradise rd.','las vegas',702/794-0700,'steak houses',452 455 | 987,2223,'2223 market st.','san francisco',415/431-0692,american,453 456 | 988,acquarello,'1722 sacramento st.','san francisco',415/567-5432,italian,454 457 | 989,'bardelli\'s','243 o\'farrell st.','san francisco',415/982-0243,'old san francisco',455 458 | 990,betelnut,'2030 union st.','san francisco',415/929-8855,asian,456 459 | 991,'bistro roti','155 steuart st.','san francisco',415/495-6500,french,457 460 | 992,bix,'56 gold st.','san francisco',415/433-6300,american,458 461 | 993,bizou,'598 fourth st.','san francisco',415/543-2222,french,459 462 | 994,'buca giovanni','800 greenwich st.','san francisco',415/776-7766,italian,460 463 | 995,'cafe adriano','3347 fillmore st.','san francisco',415/474-4180,italian,461 464 | 996,'cafe marimba','2317 chestnut st.','san francisco',415/776-1506,'mexican/latin american/spanish',462 465 | 997,'california culinary academy','625 polk st.','san francisco',415/771-3500,french,463 466 | 998,'capp\'s corner','1600 powell st.','san francisco',415/989-2589,italian,464 467 | 999,carta,'1772 market st.','san francisco',415/863-3516,american,465 468 | 1000,chevys,'4th and howard sts.','san francisco',415/543-8060,'mexican/latin american/spanish',466 469 | 1001,'cypress club','500 jackson st.','san francisco',415/296-8555,american,467 470 | 1002,'des alpes','732 broadway','san francisco',415/788-9900,french,468 471 | 1003,faz,'161 sutter st.','san francisco',415/362-0404,'greek and middle eastern',469 472 | 1004,'fog city diner','1300 battery st.','san francisco',415/982-2000,american,470 473 | 1005,'garden court','market and new montgomery sts.','san francisco',415/546-5011,'old san francisco',471 474 | 1006,'gaylord\'s','ghirardelli sq.','san francisco',415/771-8822,asian,472 475 | 1007,'grand cafe hotel monaco','501 geary st.','san francisco',415/292-0101,american,473 476 | 1008,greens,'bldg. a fort mason','san francisco',415/771-6222,vegetarian,474 477 | 1009,'harbor village','4 embarcadero center','san francisco',415/781-8833,asian,475 478 | 1010,'harris\'','2100 van ness ave.','san francisco',415/673-1888,'steak houses',476 479 | 1011,'harry denton\'s','161 steuart st.','san francisco',415/882-1333,american,477 480 | 1012,'hayes street grill','320 hayes st.','san francisco',415/863-5545,seafood,478 481 | 1013,helmand,'430 broadway','san francisco',415/362-0641,'greek and middle eastern',479 482 | 1014,'hong kong flower lounge','5322 geary blvd.','san francisco',415/668-8998,asian,480 483 | 1015,'hong kong villa','2332 clement st.','san francisco',415/752-8833,asian,481 484 | 1016,'hyde street bistro','1521 hyde st.','san francisco',415/441-7778,italian,482 485 | 1017,'il fornaio levi\'s plaza','1265 battery st.','san francisco',415/986-0100,italian,483 486 | 1018,'izzy\'s steak & chop house','3345 steiner st.','san francisco',415/563-0487,'steak houses',484 487 | 1019,'jack\'s','615 sacramento st.','san francisco',415/986-9854,'old san francisco',485 488 | 1020,'kabuto sushi','5116 geary blvd.','san francisco',415/752-5652,asian,486 489 | 1021,'katia\'s','600 5th ave.','san francisco',415/668-9292,'',487 490 | 1022,'kuleto\'s','221 powell st.','san francisco',415/397-7720,italian,488 491 | 1023,'kyo-ya. sheraton palace hotel','2 new montgomery st. at market st.','san francisco',415/546-5000,asian,489 492 | 1024,'l\'osteria del forno','519 columbus ave.','san francisco',415/982-1124,italian,490 493 | 1025,'le central','453 bush st.','san francisco',415/391-2233,french,491 494 | 1026,'le soleil','133 clement st.','san francisco',415/668-4848,asian,492 495 | 1027,'macarthur park','607 front st.','san francisco',415/398-5700,american,493 496 | 1028,manora,'3226 mission st.','san francisco',415/861-6224,asian,494 497 | 1029,maykadeh,'470 green st.','san francisco',415/362-8286,'greek and middle eastern',495 498 | 1030,'mccormick & kuleto\'s','ghirardelli sq.','san francisco',415/929-1730,seafood,496 499 | 1031,millennium,'246 mcallister st.','san francisco',415/487-9800,vegetarian,497 500 | 1032,'moose\'s','1652 stockton st.','san francisco',415/989-7800,mediterranean,498 501 | 1033,'north india','3131 webster st.','san francisco',415/931-1556,asian,499 502 | 1034,'one market','1 market st.','san francisco',415/777-5577,american,500 503 | 1035,oritalia,'1915 fillmore st.','san francisco',415/346-1333,italian,501 504 | 1036,'pacific pan pacific hotel','500 post st.','san francisco',415/929-2087,french,502 505 | 1037,'palio d\'asti','640 sacramento st.','san francisco',415/395-9800,italian,503 506 | 1038,'pane e vino','3011 steiner st.','san francisco',415/346-2111,italian,504 507 | 1039,pastis,'1015 battery st.','san francisco',415/391-2555,french,505 508 | 1040,'perry\'s','1944 union st.','san francisco',415/922-9022,american,506 509 | 1041,'r&g lounge','631 b kearny st.','san francisco',415/982-7877,'or 415/982-3811 asian',507 510 | 1042,rubicon,'558 sacramento st.','san francisco',415/434-4100,american,508 511 | 1043,rumpus,'1 tillman pl.','san francisco',415/421-2300,american,509 512 | 1044,sanppo,'1702 post st.','san francisco',415/346-3486,asian,510 513 | 1045,'scala\'s bistro','432 powell st.','san francisco',415/395-8555,italian,511 514 | 1046,'south park cafe','108 south park','san francisco',415/495-7275,french,512 515 | 1047,'splendido embarcadero',4,'san francisco',415/986-3222,mediterranean,513 516 | 1048,stars,'150 redwood alley','san francisco',415/861-7827,american,514 517 | 1049,'stars cafe','500 van ness ave.','san francisco',415/861-4344,american,515 518 | 1050,'stoyanof\'s cafe','1240 9th ave.','san francisco',415/664-3664,'greek and middle eastern',516 519 | 1051,'straits cafe','3300 geary blvd.','san francisco',415/668-1783,asian,517 520 | 1052,suppenkuche,'601 hayes st.','san francisco',415/252-9289,russian/german,518 521 | 1053,'tadich grill','240 california st.','san francisco',415/391-2373,seafood,519 522 | 1054,'the heights','3235 sacramento st.','san francisco',415/474-8890,french,520 523 | 1055,thepin,'298 gough st.','san francisco',415/863-9335,asian,521 524 | 1056,'ton kiang','3148 geary blvd.','san francisco',415/752-4440,asian,522 525 | 1057,vertigo,'600 montgomery st.','san francisco',415/433-7250,mediterranean,523 526 | 1058,'vivande porta via','2125 fillmore st.','san francisco',415/346-4430,italian,524 527 | 1059,'vivande ristorante','670 golden gate ave.','san francisco',415/673-9245,italian,525 528 | 1060,'world wrapps','2257 chestnut st.','san francisco',415/563-9727,american,526 529 | 1061,'wu kong','101 spear st.','san francisco',415/957-9300,asian,527 530 | 1062,'yank sing','427 battery st.','san francisco',415/541-4949,asian,528 531 | 1063,'yaya cuisine','1220 9th ave.','san francisco',415/566-6966,'greek and middle eastern',529 532 | 1064,'yoyo tsumami bistro','1611 post st.','san francisco',415/922-7788,french,530 533 | 1065,zarzuela,'2000 hyde st.','san francisco',415/346-0800,'mexican/latin american/spanish',531 534 | 1066,'zuni cafe & grill','1658 market st.','san francisco',415/552-2522,mediterranean,532 535 | -------------------------------------------------------------------------------- /datasets/fodors_zagats/matches_fodors_zagats.csv: -------------------------------------------------------------------------------- 1 | fodors_id,zagats_id 2 | 534,219 3 | 535,220 4 | 536,221 5 | 537,222 6 | 538,223 7 | 539,224 8 | 540,225 9 | 541,226 10 | 542,227 11 | 543,228 12 | 544,229 13 | 545,230 14 | 546,231 15 | 547,232 16 | 548,233 17 | 549,234 18 | 550,235 19 | 551,236 20 | 552,237 21 | 553,238 22 | 554,239 23 | 555,240 24 | 556,241 25 | 557,242 26 | 558,243 27 | 559,244 28 | 560,245 29 | 561,246 30 | 562,247 31 | 563,248 32 | 564,249 33 | 565,250 34 | 566,251 35 | 567,252 36 | 568,253 37 | 569,254 38 | 570,255 39 | 571,256 40 | 572,257 41 | 573,258 42 | 574,259 43 | 575,260 44 | 576,261 45 | 577,262 46 | 578,263 47 | 579,264 48 | 580,265 49 | 581,266 50 | 582,267 51 | 583,268 52 | 584,269 53 | 585,270 54 | 586,271 55 | 587,272 56 | 588,273 57 | 589,274 58 | 590,275 59 | 591,276 60 | 592,277 61 | 593,278 62 | 594,279 63 | 595,280 64 | 596,281 65 | 597,282 66 | 598,283 67 | 599,284 68 | 600,285 69 | 601,286 70 | 602,287 71 | 603,288 72 | 604,289 73 | 605,290 74 | 606,291 75 | 607,292 76 | 608,293 77 | 609,294 78 | 610,295 79 | 611,296 80 | 612,297 81 | 613,298 82 | 614,299 83 | 615,300 84 | 616,301 85 | 617,302 86 | 618,303 87 | 619,304 88 | 620,305 89 | 621,306 90 | 622,307 91 | 623,308 92 | 624,309 93 | 625,310 94 | 626,311 95 | 627,312 96 | 628,313 97 | 629,314 98 | 630,315 99 | 631,316 100 | 632,317 101 | 633,318 102 | 634,319 103 | 635,320 104 | 636,321 105 | 637,322 106 | 638,323 107 | 639,324 108 | 640,325 109 | 641,326 110 | 642,327 111 | 643,328 112 | 644,329 113 | 645,330 114 | -------------------------------------------------------------------------------- /datasets/fodors_zagats/metadata.txt: -------------------------------------------------------------------------------- 1 | fodors.csv 2 | zagats.csv 3 | matches_fodors_zagats.csv 4 | -------------------------------------------------------------------------------- /datasets/fodors_zagats/zagats.csv: -------------------------------------------------------------------------------- 1 | id,name,addr,city,phone,type,class 2 | 1,'apple pan the','10801 w. pico blvd.','west la',310-475-3585,american,534 3 | 2,'asahi ramen','2027 sawtelle blvd.','west la',310-479-2231,'noodle shops',535 4 | 3,'baja fresh','3345 kimber dr.','westlake village',805-498-4049,mexican,536 5 | 4,'belvedere the','9882 little santa monica blvd.','beverly hills',310-788-2306,'pacific new wave',537 6 | 5,'benita\'s frites','1433 third st. promenade','santa monica',310-458-2889,'fast food',538 7 | 6,'bernard\'s','515 s. olive st.','los angeles',213-612-1580,continental,539 8 | 7,'bistro 45','45 s. mentor ave.',pasadena,818-795-2478,californian,540 9 | 8,'brent\'s deli','19565 parthenia ave.',northridge,818-886-5679,delis,541 10 | 9,'brighton coffee shop','9600 brighton way','beverly hills',310-276-7732,'coffee shops',542 11 | 10,'bristol farms market cafe','1570 rosecrans ave. s.',pasadena,310-643-5229,californian,543 12 | 11,'bruno\'s','3838 centinela ave.','mar vista',310-397-5703,italian,544 13 | 12,'cafe \'50s','838 lincoln blvd.',venice,310-399-1955,american,545 14 | 13,'cafe blanc','9777 little santa monica blvd.','beverly hills',310-888-0108,'pacific new wave',546 15 | 14,'cassell\'s','3266 w. sixth st.',la,213-480-8668,hamburgers,547 16 | 15,'chez melange','1716 pch','redondo beach',310-540-1222,eclectic,548 17 | 16,diaghilev,'1020 n. san vicente blvd.','w. hollywood',310-854-1111,russian,549 18 | 17,'don antonio\'s','1136 westwood blvd.',westwood,310-209-1422,italian,550 19 | 18,'duke\'s','8909 sunset blvd.','w. hollywood',310-652-3100,'coffee shops',551 20 | 19,'falafel king','1059 broxton ave.',westwood,310-208-4444,'middle eastern',552 21 | 20,'feast from the east','1949 westwood blvd.','west la',310-475-0400,chinese,553 22 | 21,'gumbo pot the','6333 w. third st.',la,213-933-0358,cajun/creole,554 23 | 22,'hollywood hills coffee shop','6145 franklin ave.',hollywood,213-467-7678,'coffee shops',555 24 | 23,'indo cafe','10428 1/2 national blvd.',la,310-815-1290,indonesian,556 25 | 24,'jan\'s family restaurant','8424 beverly blvd.',la,213-651-2866,'coffee shops',557 26 | 25,jiraffe,'502 santa monica blvd','santa monica',310-917-6671,californian,558 27 | 26,'jody maroni\'s sausage kingdom','2011 ocean front walk',venice,310-306-1995,'hot dogs',559 28 | 27,'joe\'s','1023 abbot kinney blvd.',venice,310-399-5811,'american (new)',560 29 | 28,'john o\'groats','10516 w. pico blvd.','west la',310-204-0692,'coffee shops',561 30 | 29,'johnnie\'s pastrami','4017 s. sepulveda blvd.','culver city',310-397-6654,delis,562 31 | 30,'johnny reb\'s southern smokehouse','4663 long beach blvd.','long beach',310-423-7327,southern/soul,563 32 | 31,'johnny rockets (la)','7507 melrose ave.',la,213-651-3361,american,564 33 | 32,'killer shrimp','4000 colfax ave.','studio city',818-508-1570,seafood,565 34 | 33,'kokomo cafe','6333 w. third st.',la,213-933-0773,american,566 35 | 34,'koo koo roo','8393 w. beverly blvd.',la,213-655-9045,chicken,567 36 | 35,'la cachette','10506 little santa monica blvd.','century city',310-470-4992,'french (new)',568 37 | 36,'la salsa (la)','22800 pch',malibu,310-456-6299,mexican,569 38 | 37,'la serenata de garibaldi','1842 e. first','st. boyle hts.',213-265-2887,mexican/tex-mex,570 39 | 38,'langer\'s','704 s. alvarado st.',la,213-483-8050,delis,571 40 | 39,'local nochol','30869 thousand oaks blvd.','westlake village',818-706-7706,'health food',572 41 | 40,'main course the','10509 w. pico blvd.','rancho park',310-475-7564,american,573 42 | 41,'mani\'s bakery & espresso bar','519 s. fairfax ave.',la,213-938-8800,desserts,574 43 | 42,'martha\'s','22nd street grill 25 22nd','st. hermosa beach',310-376-7786,american,575 44 | 43,'maxwell\'s cafe','13329 washington blvd.','marina del rey',310-306-7829,american,576 45 | 44,'michael\'s (los angeles)','1147 third st.','santa monica',310-451-0843,californian,577 46 | 45,mishima,'8474 w. third st.',la,213-782-0181,'noodle shops',578 47 | 46,'mo better meatty meat','7261 melrose ave.',la,213-935-5280,hamburgers,579 48 | 47,'mulberry st.','17040 ventura blvd.',encino,818-906-8881,pizza,580 49 | 48,'ocean park cafe','3117 ocean park blvd.','santa monica',310-452-5728,american,581 50 | 49,'ocean star','145 n. atlantic blvd.','monterey park',818-308-2128,seafood,582 51 | 50,'original pantry bakery','875 s. figueroa st. downtown',la,213-627-6879,diners,583 52 | 51,'parkway grill','510 s. arroyo pkwy.',pasadena,818-795-1001,californian,584 53 | 52,'pho hoa','642 broadway',chinatown,213-626-5530,vietnamese,585 54 | 53,'pink\'s famous chili dogs','709 n. la brea ave.',la,213-931-4223,'hot dogs',586 55 | 54,'poquito mas','2635 w. olive ave.',burbank,818-563-2252,mexican,587 56 | 55,r-23,'923 e. third st.','los angeles',213-687-7178,japanese,588 57 | 56,'rae\'s','2901 pico blvd.','santa monica',310-828-7937,diners,589 58 | 57,'rubin\'s red hots','15322 ventura blvd.',encino,818-905-6515,'hot dogs',590 59 | 58,'ruby\'s (la)','45 s. fair oaks ave.',pasadena,818-796-7829,diners,591 60 | 59,'russell\'s burgers','1198 pch','seal beach',310-596-9556,hamburgers,592 61 | 60,'ruth\'s chris steak house (los angeles)','224 s. beverly dr.','beverly hills',310-859-8744,steakhouses,593 62 | 61,shiro,'1505 mission st. s.',pasadena,818-799-4774,'pacific new wave',594 63 | 62,'sushi nozawa','11288 ventura blvd.','studio city',818-508-7017,japanese,595 64 | 63,'sweet lady jane','8360 melrose ave.',la,213-653-7145,desserts,596 65 | 64,taiko,'11677 san vicente blvd.',brentwood,310-207-7782,'noodle shops',597 66 | 65,'tommy\'s','2575 beverly blvd.',la,213-389-9060,hamburgers,598 67 | 66,'uncle bill\'s pancake house','1305 highland ave.','manhattan beach',310-545-5177,diners,599 68 | 67,'water grill','544 s. grand ave.','los angeles',213-891-0900,seafood,600 69 | 68,'zankou chicken','1415 e. colorado st.',glendale,818-244-1937,'middle eastern',601 70 | 69,'afghan kebab house','764 ninth ave.','new york city',212-307-1612,afghan,602 71 | 70,arcadia,'21 e. 62nd st.','new york city',212-223-2900,'american (new)',603 72 | 71,'benny\'s burritos','93 ave. a','new york city',212-254-2054,mexican,604 73 | 72,'cafe con leche','424 amsterdam ave.','new york city',212-595-7000,cuban,605 74 | 73,'corner bistro','331 w. fourth st.','new york city',212-242-9502,hamburgers,606 75 | 74,'cucina della fontana','368 bleecker st.','new york city',212-242-0636,italian,607 76 | 75,'cucina di pesce','87 e. fourth st.','new york city',212-260-6800,seafood,608 77 | 76,darbar,'44 w. 56th st.','new york city',212-432-7227,indian,609 78 | 77,'ej\'s luncheonette','432 sixth ave.','new york city',212-473-5555,diners,610 79 | 78,'edison cafe','228 w. 47th st.','new york city',212-840-5000,diners,611 80 | 79,'elias corner','24-02 31st st.',queens,718-932-1510,greek,612 81 | 80,'good enough to eat','483 amsterdam ave.','new york city',212-496-0163,american,613 82 | 81,'gray\'s papaya','2090 broadway','new york city',212-799-0243,'hot dogs',614 83 | 82,'il mulino','86 w. third st.','new york city',212-673-3783,italian,615 84 | 83,'jackson diner','37-03 74th st.',queens,718-672-1232,indian,616 85 | 84,'joe\'s shanghai','9 pell st.',queens,718-539-3838,chinese,617 86 | 85,'john\'s pizzeria','48 w. 65th st.','new york city',212-721-7001,pizza,618 87 | 86,'kelley & ping','127 greene st.','new york city',212-228-1212,pan-asian,619 88 | 87,kiev,'117 second ave.','new york city',212-674-4040,ukrainian,620 89 | 88,'kuruma zushi','2nd fl.','new york city',212-317-2802,japanese,621 90 | 89,'la caridad','2199 broadway','new york city',212-874-2780,cuban,622 91 | 90,'la grenouille','3 e. 52nd st.','new york city',212-752-1495,'french (classic)',623 92 | 91,'lemongrass grill','61a seventh ave.',brooklyn,718-399-7100,thai,624 93 | 92,'lombardi\'s','32 spring st.','new york city',212-941-7994,pizza,625 94 | 93,'marnie\'s noodle shop','466 hudson st.','new york city',212-741-3214,asian,626 95 | 94,menchanko-tei,'39 w. 55th st.','new york city',212-247-1585,japanese,627 96 | 95,'mitali east-west','296 bleecker st.','new york city',212-989-1367,indian,628 97 | 96,'monsoon (ny)','435 amsterdam ave.','new york city',212-580-8686,thai,629 98 | 97,moustache,'405 atlantic ave.',brooklyn,718-852-5555,'middle eastern',630 99 | 98,nobu,'105 hudson st.','new york city',212-219-0500,japanese,631 100 | 99,'one if by land tibs','17 barrow st.','new york city',212-228-0822,continental,632 101 | 100,'oyster bar','lower level','new york city',212-490-6650,seafood,633 102 | 101,palm,'837 second ave.','new york city',212-687-2953,steakhouses,634 103 | 102,'palm too','840 second ave.','new york city',212-697-5198,steakhouses,635 104 | 103,'patsy\'s pizza','19 old fulton st.',brooklyn,718-858-4300,pizza,636 105 | 104,'peter luger steak house','178 broadway',brooklyn,718-387-7400,steakhouses,637 106 | 105,'rose of india','308 e. sixth st.','new york city',212-533-5011,indian,638 107 | 106,'sam\'s noodle shop','411 third ave.','new york city',212-213-2288,chinese,639 108 | 107,'sarabeth\'s','1295 madison ave.','new york city',212-410-7335,american,640 109 | 108,'sparks steak house','210 e. 46th st.','new york city',212-687-4855,steakhouses,641 110 | 109,'stick to your ribs','5-16 51st ave.',queens,718-937-3030,bbq,642 111 | 110,sushisay,'38 e. 51st st.','new york city',212-755-1780,japanese,643 112 | 111,'sylvia\'s','328 lenox ave.','new york city',212-996-0660,southern/soul,644 113 | 112,'szechuan hunan cottage','1588 york ave.','new york city',212-535-5223,chinese,645 114 | 113,'szechuan kitchen','1460 first ave.','new york city',212-249-4615,chinese,646 115 | 114,'teresa\'s','80 montague st.',queens,718-520-2910,polish,647 116 | 115,'thai house cafe','151 hudson st.','new york city',212-334-1085,thai,648 117 | 116,'thailand restaurant','106 bayard st.','new york city',212-349-3132,thai,649 118 | 117,veselka,'144 second ave.','new york city',212-228-9682,ukrainian,650 119 | 118,'westside cottage','689 ninth ave.','new york city',212-245-0800,chinese,651 120 | 119,'windows on the world','107th fl.','new york city',212-524-7000,eclectic,652 121 | 120,'wollensky\'s grill','205 e. 49th st.','new york city',212-753-0444,steakhouses,653 122 | 121,yama,'122 e. 17th st.','new york city',212-475-0969,japanese,654 123 | 122,zarela,'953 second ave.','new york city',212-644-6740,mexican,655 124 | 123,'andre\'s french restaurant','401 s. 6th st.','las vegas',702-385-5016,'french (classic)',656 125 | 124,'buccaneer bay club','3300 las vegas blvd. s.','las vegas',702-894-7350,continental,657 126 | 125,'buzio\'s in the rio','3700 w. flamingo rd.','las vegas',702-252-7697,seafood,658 127 | 126,'emeril\'s new orleans fish house','3799 las vegas blvd. s.','las vegas',702-891-7374,seafood,659 128 | 127,'fiore rotisserie & grille','3700 w. flamingo rd.','las vegas',702-252-7702,italian,660 129 | 128,'hugo\'s cellar','202 e. fremont st.','las vegas',702-385-4011,continental,661 130 | 129,'madame ching\'s','3300 las vegas blvd. s.','las vegas',702-894-7111,asian,662 131 | 130,'mayflower cuisinier','4750 w. sahara ave.','las vegas',702-870-8432,chinese,663 132 | 131,'michael\'s (las vegas)','3595 las vegas blvd. s.','las vegas',702-737-7111,continental,664 133 | 132,'monte carlo','3145 las vegas blvd. s.','las vegas',702-733-4524,'french (new)',665 134 | 133,moongate,'3400 las vegas blvd. s.','las vegas',702-791-7352,chinese,666 135 | 134,'morton\'s of chicago (las vegas)','3200 las vegas blvd. s.','las vegas',702-893-0703,steakhouses,667 136 | 135,'nicky blair\'s','3925 paradise rd.','las vegas',702-792-9900,italian,668 137 | 136,'piero\'s restaurant','355 convention center dr.','las vegas',702-369-2305,italian,669 138 | 137,'spago (las vegas)','3500 las vegas blvd. s.','las vegas',702-369-6300,californian,670 139 | 138,'steakhouse the','128 e. fremont st.','las vegas',702-382-1600,steakhouses,671 140 | 139,'stefano\'s','129 fremont st.','las vegas',702-385-7111,italian,672 141 | 140,'sterling brunch','3645 las vegas blvd. s.','las vegas',702-739-4651,eclectic,673 142 | 141,'tre visi','3799 las vegas blvd. s.','las vegas',702-891-7331,italian,674 143 | 142,'103 west','103 w. paces ferry rd.',atlanta,404-233-5993,continental,675 144 | 143,'alon\'s at the terrace','659 peachtree st.',atlanta,404-724-0444,sandwiches,676 145 | 144,'baker\'s cajun cafe','1134 euclid ave.',atlanta,404-223-5039,cajun/creole,677 146 | 145,'barbecue kitchen','1437 virginia ave.',atlanta,404-766-9906,bbq,678 147 | 146,'bistro the','56 e. andrews dr. nw',atlanta,404-231-5733,'french bistro',679 148 | 147,'bobby & june\'s kountry kitchen','375 14th st.',atlanta,404-876-3872,southern/soul,680 149 | 148,'bradshaw\'s restaurant','2911 s. pharr court',atlanta,404-261-7015,southern/soul,681 150 | 149,'brookhaven cafe','4274 peachtree rd.',atlanta,404-231-5907,vegetarian,682 151 | 150,'cafe sunflower','5975 roswell rd.',atlanta,404-256-1675,'health food',683 152 | 151,canoe,'4199 paces ferry rd.',atlanta,770-432-2663,'american (new)',684 153 | 152,'carey\'s','1021 cobb pkwy. se',marietta,770-422-8042,hamburgers,685 154 | 153,'carey\'s corner','1215 powers ferry rd.',marietta,770-933-0909,hamburgers,686 155 | 154,chops,'70 w. paces ferry rd.',atlanta,404-262-2675,steakhouses,687 156 | 155,chopstix,'4279 roswell rd.',atlanta,404-255-4868,chinese,688 157 | 156,'deacon burton\'s soulfood restaurant','1029 edgewood ave. se',atlanta,404-523-1929,southern/soul,689 158 | 157,eats,'600 ponce de leon ave.',atlanta,404-888-9149,italian,690 159 | 158,'flying biscuit the','1655 mclendon ave.',atlanta,404-687-8888,eclectic,691 160 | 159,frijoleros,'1031 peachtree st. ne',atlanta,404-892-8226,tex-mex,692 161 | 160,'greenwood\'s','1087 green st.',roswell,770-992-5383,southern/soul,693 162 | 161,'harold\'s barbecue','171 mcdonough blvd.',atlanta,404-627-9268,bbq,694 163 | 162,'havana sandwich shop','2905 buford hwy.',atlanta,404-636-4094,cuban,695 164 | 163,'house of chan','2469 cobb pkwy.',smyrna,770-955-9444,chinese,696 165 | 164,'indian delights','3675 satellite blvd.',duluth,100-813-8212,indian,697 166 | 165,'java jive','790 ponce de leon ave.',atlanta,404-876-6161,'coffee shops',698 167 | 166,'johnny rockets (at)','2970 cobb pkwy.',atlanta,770-955-6068,american,699 168 | 167,'kalo\'s coffee house','1248 clairmont rd.',decatur,404-325-3733,coffeehouses,700 169 | 168,'la fonda latina','4427 roswell rd.',atlanta,404-303-8201,spanish,701 170 | 169,'lettuce souprise you (at)','3525 mall blvd.',duluth,770-418-9969,cafeterias,702 171 | 170,majestic,'1031 ponce de leon ave.',atlanta,404-875-0276,diners,703 172 | 171,'morton\'s of chicago (atlanta)','303 peachtree st. ne',atlanta,404-577-4366,steakhouses,704 173 | 172,'my thai','1248 clairmont rd.',atlanta,404-636-4280,thai,705 174 | 173,nava,'3060 peachtree rd.',atlanta,404-240-1984,southwestern,706 175 | 174,'nuevo laredo cantina','1495 chattahoochee ave. nw',atlanta,404-352-9009,mexican,707 176 | 175,'original pancake house (at)','4330 peachtree rd.',atlanta,404-237-4116,american,708 177 | 176,'palm the (atlanta)','3391 peachtree rd. ne',atlanta,404-814-1955,steakhouses,709 178 | 177,'rainbow restaurant','2118 n. decatur rd.',decatur,404-633-3538,vegetarian,710 179 | 178,riviera,'519 e. paces ferry rd.',atlanta,404-262-7112,mediterranean,712 180 | 179,'silver skillet the','200 14th st. nw',atlanta,404-874-1388,'coffee shops',713 181 | 180,soto,'3330 piedmont rd.',atlanta,404-233-2005,japanese,714 182 | 181,'thelma\'s kitchen','764 marietta st. nw',atlanta,404-688-5855,cafeterias,715 183 | 182,tortillas,'774 ponce de leon ave. ne',atlanta,404-892-0193,tex-mex,716 184 | 183,'van gogh\'s restaurant & bar','70 w. crossville rd.',roswell,770-993-1156,'american (new)',717 185 | 184,veggieland,'220 sandy springs circle',atlanta,404-231-3111,vegetarian,718 186 | 185,'white house restaurant','3172 peachtree rd. ne',atlanta,404-237-7601,diners,719 187 | 186,zab-e-lee,'4837 old national hwy.','college park',404-768-2705,thai,720 188 | 187,'bill\'s place','2315 clement st.','san francisco',415-221-5262,hamburgers,721 189 | 188,'cafe flore','2298 market st.','san francisco',415-621-8579,californian,722 190 | 189,'caffe greco','423 columbus ave.','san francisco',415-397-6261,continental,723 191 | 190,'campo santo','240 columbus ave.','san francisco',415-433-9623,mexican,724 192 | 191,'cha cha cha\'s','1805 haight st.','san francisco',415-386-5758,caribbean,725 193 | 192,'doidge\'s','2217 union st.','san francisco',415-921-2149,american,726 194 | 193,'dottie\'s true blue cafe','522 jones st.','san francisco',415-885-2767,diners,727 195 | 194,'dusit thai','3221 mission st.','san francisco',415-826-4639,thai,728 196 | 195,ebisu,'1283 ninth ave.','san francisco',415-566-1770,japanese,729 197 | 196,'emerald garden restaurant','1550 california st.','san francisco',415-673-1155,vietnamese,730 198 | 197,'eric\'s chinese restaurant','1500 church st.','san francisco',415-282-0919,chinese,731 199 | 198,'hamburger mary\'s','1582 folsom st.','san francisco',415-626-1985,hamburgers,732 200 | 199,'kelly\'s on trinity','333 bush st.','san francisco',415-362-4454,californian,733 201 | 200,'la cumbre','515 valencia st.','san francisco',415-863-8205,mexican,734 202 | 201,'la mediterranee','288 noe st.','san francisco',415-431-7210,mediterranean,735 203 | 202,'la taqueria','2889 mission st.','san francisco',415-285-7117,mexican,736 204 | 203,'mario\'s bohemian cigar store cafe','2209 polk st.','san francisco',415-776-8226,italian,737 205 | 204,'marnee thai','2225 irving st.','san francisco',415-665-9500,thai,738 206 | 205,'mel\'s drive-in','3355 geary st.','san francisco',415-387-2244,hamburgers,739 207 | 206,'mo\'s burgers','1322 grant st.','san francisco',415-788-3779,hamburgers,740 208 | 207,'phnom penh cambodian restaurant','631 larkin st.','san francisco',415-775-5979,cambodian,741 209 | 208,'roosevelt tamale parlor','2817 24th st.','san francisco',415-550-9213,mexican,742 210 | 209,'sally\'s cafe & bakery','300 de haro st.','san francisco',415-626-6006,american,743 211 | 210,'san francisco bbq','1328 18th st.','san francisco',415-431-8956,thai,744 212 | 211,'slanted door','584 valencia st.','san francisco',415-861-8032,vietnamese,745 213 | 212,'swan oyster depot','1517 polk st.','san francisco',415-673-1101,seafood,746 214 | 213,'thep phanom','400 waller st.','san francisco',415-431-2526,thai,747 215 | 214,'ti couz','3108 16th st.','san francisco',415-252-7373,french,748 216 | 215,'trio cafe','1870 fillmore st.','san francisco',415-563-2248,american,749 217 | 216,'tu lan','8 sixth st.','san francisco',415-626-0927,vietnamese,750 218 | 217,'vicolo pizzeria','201 ivy st.','san francisco',415-863-2382,pizza,751 219 | 218,'wa-ha-ka oaxaca mexican grill','2141 polk st.','san francisco',415-775-1055,mexican,752 220 | 219,'arnie morton\'s of chicago','435 s. la cienega blvd.','los angeles',310-246-1501,steakhouses,0 221 | 220,'art\'s deli','12224 ventura blvd.','studio city',818-762-1221,delis,1 222 | 221,'bel-air hotel','701 stone canyon rd.','bel air',310-472-1211,californian,2 223 | 222,'cafe bizou','14016 ventura blvd.','sherman oaks',818-788-3536,'french bistro',3 224 | 223,campanile,'624 s. la brea ave.','los angeles',213-938-1447,californian,4 225 | 224,'chinois on main','2709 main st.','santa monica',310-392-9025,'pacific new wave',5 226 | 225,citrus,'6703 melrose ave.','los angeles',213-857-0034,californian,6 227 | 226,'fenix at the argyle','8358 sunset blvd.','w. hollywood',213-848-6677,'french (new)',7 228 | 227,granita,'23725 w. malibu rd.',malibu,310-456-0488,californian,8 229 | 228,'grill the','9560 dayton way','beverly hills',310-276-0615,'american (traditional)',9 230 | 229,katsu,'1972 hillhurst ave.','los feliz',213-665-1891,japanese,10 231 | 230,'l\'orangerie','903 n. la cienega blvd.','w. hollywood',310-652-9770,'french (classic)',11 232 | 231,'le chardonnay (los angeles)','8284 melrose ave.','los angeles',213-655-8880,'french bistro',12 233 | 232,'locanda veneta','8638 w. third st.','los angeles',310-274-1893,italian,13 234 | 233,matsuhisa,'129 n. la cienega blvd.','beverly hills',310-659-9639,seafood,14 235 | 234,'palm the (los angeles)','9001 santa monica blvd.','w. hollywood',310-550-8811,steakhouses,15 236 | 235,patina,'5955 melrose ave.','los angeles',213-467-1108,californian,16 237 | 236,'philippe the original','1001 n. alameda st.',chinatown,213-628-3781,cafeterias,17 238 | 237,'pinot bistro','12969 ventura blvd.','studio city',818-990-0500,'french bistro',18 239 | 238,'rex il ristorante','617 s. olive st.','los angeles',213-627-2300,'nuova cucina italian',19 240 | 239,'spago (los angeles)','8795 sunset blvd.','w. hollywood',310-652-4025,californian,20 241 | 240,valentino,'3115 pico blvd.','santa monica',310-829-4313,italian,21 242 | 241,'yujean kang\'s','67 n. raymond ave.',pasadena,818-585-0855,chinese,22 243 | 242,'21 club','21 w. 52nd st.','new york city',212-582-7200,'american (new)',23 244 | 243,aquavit,'13 w. 54th st.','new york city',212-307-7311,scandinavian,24 245 | 244,aureole,'34 e. 61st st.','new york city',212-319-1660,'american (new)',25 246 | 245,'cafe lalo','201 w. 83rd st.','new york city',212-496-6031,coffeehouses,26 247 | 246,'cafe des artistes','1 w. 67th st.','new york city',212-877-3500,'french (classic)',27 248 | 247,'carmine\'s','2450 broadway','new york city',212-362-2200,italian,28 249 | 248,'carnegie deli','854 seventh ave.','new york city',212-757-2245,delis,29 250 | 249,chanterelle,'2 harrison st.','new york city',212-966-6960,'french (new)',30 251 | 250,daniel,'20 e. 76th st.','new york city',212-288-0033,'french (new)',31 252 | 251,dawat,'210 e. 58th st.','new york city',212-355-7555,indian,32 253 | 252,felidia,'243 e. 58th st.','new york city',212-758-1479,italian,33 254 | 253,'four seasons','99 e. 52nd st.','new york city',212-754-9494,'american (new)',34 255 | 254,'gotham bar & grill','12 e. 12th st.','new york city',212-620-4020,'american (new)',35 256 | 255,'gramercy tavern','42 e. 20th st.','new york city',212-477-0777,'american (new)',36 257 | 256,'island spice','402 w. 44th st.','new york city',212-765-1737,caribbean,37 258 | 257,'jo jo','160 e. 64th st.','new york city',212-223-5656,'french bistro',38 259 | 258,'la caravelle','33 w. 55th st.','new york city',212-586-4252,'french (classic)',39 260 | 259,'la cote basque','60 w. 55th st.','new york city',212-688-6525,'french (classic)',40 261 | 260,'le bernardin','155 w. 51st st.','new york city',212-489-1515,seafood,41 262 | 261,'les celebrites','155 w. 58th st.','new york city',212-484-5113,'french (classic)',42 263 | 262,'lespinasse (new york city)','2 e. 55th st.','new york city',212-339-6719,asian,43 264 | 263,lutece,'249 e. 50th st.','new york city',212-752-2225,'french (classic)',44 265 | 264,'manhattan ocean club','57 w. 58th st.','new york city',212-371-7777,seafood,45 266 | 265,march,'405 e. 58th st.','new york city',212-754-6272,'american (new)',46 267 | 266,'mesa grill','102 fifth ave.','new york city',212-807-7400,southwestern,47 268 | 267,'mi cocina','57 jane st.','new york city',212-627-8273,mexican,48 269 | 268,montrachet,'239 w. broadway','new york city',212-219-2777,'french bistro',49 270 | 269,oceana,'55 e. 54th st.','new york city',212-759-5941,seafood,50 271 | 270,'park avenue cafe (new york city)','100 e. 63rd st.','new york city',212-644-1900,'american (new)',51 272 | 271,petrossian,'182 w. 58th st.','new york city',212-245-2214,russian,52 273 | 272,picholine,'35 w. 64th st.','new york city',212-724-8585,mediterranean,53 274 | 273,pisces,'95 ave. a','new york city',212-260-6660,seafood,54 275 | 274,'rainbow room','30 rockefeller plaza','new york city',212-632-5000,'american (new)',55 276 | 275,'river cafe','1 water st.',brooklyn,718-522-5200,'american (new)',56 277 | 276,'san domenico','240 central park s.','new york city',212-265-5959,italian,57 278 | 277,'second avenue deli','156 second ave.','new york city',212-677-0606,delis,58 279 | 278,seryna,'11 e. 53rd st.','new york city',212-980-9393,japanese,59 280 | 279,'shun lee palace','155 e. 55th st.','new york city',212-371-8844,chinese,60 281 | 280,'sign of the dove','1110 third ave.','new york city',212-861-8080,'american (new)',61 282 | 281,'smith & wollensky','797 third ave.','new york city',212-753-1530,steakhouses,62 283 | 282,'tavern on the green','central park west','new york city',212-873-3200,'american (new)',63 284 | 283,'uncle nick\'s','747 ninth ave.','new york city',212-245-7992,greek,64 285 | 284,'union square cafe','21 e. 16th st.','new york city',212-243-4020,'american (new)',65 286 | 285,'virgil\'s real bbq','152 w. 44th st.','new york city',212-921-9494,bbq,66 287 | 286,'chin\'s','3200 las vegas blvd. s.','las vegas',702-733-8899,chinese,67 288 | 287,'coyote cafe (las vegas)','3799 las vegas blvd. s.','las vegas',702-891-7349,southwestern,68 289 | 288,'le montrachet bistro','3000 paradise rd.','las vegas',702-732-5651,'french bistro',69 290 | 289,'palace court','3570 las vegas blvd. s.','las vegas',702-731-7110,'french (new)',70 291 | 290,'second street grill','200 e. fremont st.','las vegas',702-385-6277,'pacific rim',71 292 | 291,'steak house the','2880 las vegas blvd. s.','las vegas',702-734-0410,steakhouses,72 293 | 292,'tillerman the','2245 e. flamingo rd.','las vegas',702-731-4036,steakhouses,73 294 | 293,abruzzi,'2355 peachtree rd. ne',atlanta,404-261-8186,italian,74 295 | 294,bacchanalia,'3125 piedmont rd.',atlanta,404-365-0410,californian,75 296 | 295,'bone\'s restaurant','3130 piedmont rd. ne',atlanta,404-237-2663,steakhouses,76 297 | 296,'brasserie le coze','3393 peachtree rd.',atlanta,404-266-1440,'french bistro',77 298 | 297,'buckhead diner','3073 piedmont rd.',atlanta,404-262-3336,'american (new)',78 299 | 298,'ciboulette restaurant','1529 piedmont ave.',atlanta,404-874-7600,'french (new)',79 300 | 299,delectables,'1 margaret mitchell sq.',atlanta,404-681-2909,cafeterias,80 301 | 300,'georgia grille','2290 peachtree rd.',atlanta,404-352-3517,southwestern,81 302 | 301,'hedgerose heights inn the','490 e. paces ferry rd. ne',atlanta,404-233-7673,continental,82 303 | 302,'heera of india','595 piedmont ave.',atlanta,404-876-4408,indian,83 304 | 303,'indigo coastal grill','1397 n. highland ave.',atlanta,404-876-0676,eclectic,84 305 | 304,'la grotta','2637 peachtree rd. ne',atlanta,404-231-1368,italian,85 306 | 305,'mary mac\'s tea room','224 ponce de leon ave.',atlanta,404-876-1800,southern/soul,86 307 | 306,'nikolai\'s roof','255 courtland st.',atlanta,404-221-6362,continental,87 308 | 307,'pano\'s & paul\'s','1232 w. paces ferry rd.',atlanta,404-261-3662,'american (new)',88 309 | 308,'ritz-carlton cafe (buckhead)','3434 peachtree rd. ne',atlanta,404-237-2700,'american (new)',89 310 | 309,'ritz-carlton dining room (buckhead)','3434 peachtree rd. ne',atlanta,404-237-2700,'american (new)',90 311 | 310,'ritz-carlton restaurant','181 peachtree st.',atlanta,404-659-0400,'french (classic)',91 312 | 311,toulouse,'293-b peachtree rd.',atlanta,404-351-9533,'french (new)',92 313 | 312,'veni vidi vici','41 14th st.',atlanta,404-875-8424,italian,93 314 | 313,'alain rondelli','126 clement st.','san francisco',415-387-0408,'french (new)',94 315 | 314,aqua,'252 california st.','san francisco',415-956-9662,'american (new)',95 316 | 315,boulevard,'1 mission st.','san francisco',415-543-6084,'american (new)',96 317 | 316,'cafe claude','7 claude ln.','san francisco',415-392-3505,'french bistro',97 318 | 317,'campton place','340 stockton st.','san francisco',415-955-5555,'american (new)',98 319 | 318,'chez michel','804 north point st.','san francisco',415-775-7036,californian,99 320 | 319,'fleur de lys','777 sutter st.','san francisco',415-673-7779,'french (new)',100 321 | 320,fringale,'570 fourth st.','san francisco',415-543-0573,'french bistro',101 322 | 321,'hawthorne lane','22 hawthorne st.','san francisco',415-777-9779,californian,102 323 | 322,'khan toke thai house','5937 geary blvd.','san francisco',415-668-6654,thai,103 324 | 323,'la folie','2316 polk st.','san francisco',415-776-5577,'french (new)',104 325 | 324,'lulu restaurant-bis-cafe','816 folsom st.','san francisco',415-495-5775,mediterranean,105 326 | 325,'masa\'s','648 bush st.','san francisco',415-989-7154,'french (new)',106 327 | 326,mifune,'1737 post st.','san francisco',415-922-0337,japanese,107 328 | 327,'plumpjack cafe','3127 fillmore st.','san francisco',415-563-4755,'american (new)',108 329 | 328,postrio,'545 post st.','san francisco',415-776-7825,californian,109 330 | 329,'ritz-carlton dining room (san francisco)','600 stockton st.','san francisco',415-296-7465,'french (new)',110 331 | 330,'rose pistola','532 columbus ave.','san francisco',415-399-0499,italian,111 332 | 331,'ritz-carlton cafe (atlanta)','181 peachtree st.',atlanta,404-659-0400,'american (new)',711 333 | -------------------------------------------------------------------------------- /datasets/fodors_zagats_single/matches_fodors_zagats.csv: -------------------------------------------------------------------------------- 1 | l_id,r_id 2 | 534,219 3 | 535,220 4 | 536,221 5 | 537,222 6 | 538,223 7 | 539,224 8 | 540,225 9 | 541,226 10 | 542,227 11 | 543,228 12 | 544,229 13 | 545,230 14 | 546,231 15 | 547,232 16 | 548,233 17 | 549,234 18 | 550,235 19 | 551,236 20 | 552,237 21 | 553,238 22 | 554,239 23 | 555,240 24 | 556,241 25 | 557,242 26 | 558,243 27 | 559,244 28 | 560,245 29 | 561,246 30 | 562,247 31 | 563,248 32 | 564,249 33 | 565,250 34 | 566,251 35 | 567,252 36 | 568,253 37 | 569,254 38 | 570,255 39 | 571,256 40 | 572,257 41 | 573,258 42 | 574,259 43 | 575,260 44 | 576,261 45 | 577,262 46 | 578,263 47 | 579,264 48 | 580,265 49 | 581,266 50 | 582,267 51 | 583,268 52 | 584,269 53 | 585,270 54 | 586,271 55 | 587,272 56 | 588,273 57 | 589,274 58 | 590,275 59 | 591,276 60 | 592,277 61 | 593,278 62 | 594,279 63 | 595,280 64 | 596,281 65 | 597,282 66 | 598,283 67 | 599,284 68 | 600,285 69 | 601,286 70 | 602,287 71 | 603,288 72 | 604,289 73 | 605,290 74 | 606,291 75 | 607,292 76 | 608,293 77 | 609,294 78 | 610,295 79 | 611,296 80 | 612,297 81 | 613,298 82 | 614,299 83 | 615,300 84 | 616,301 85 | 617,302 86 | 618,303 87 | 619,304 88 | 620,305 89 | 621,306 90 | 622,307 91 | 623,308 92 | 624,309 93 | 625,310 94 | 626,311 95 | 627,312 96 | 628,313 97 | 629,314 98 | 630,315 99 | 631,316 100 | 632,317 101 | 633,318 102 | 634,319 103 | 635,320 104 | 636,321 105 | 637,322 106 | 638,323 107 | 639,324 108 | 640,325 109 | 641,326 110 | 642,327 111 | 643,328 112 | 644,329 113 | 645,330 114 | 219,534 115 | 220,535 116 | 221,536 117 | 222,537 118 | 223,538 119 | 224,539 120 | 225,540 121 | 226,541 122 | 227,542 123 | 228,543 124 | 229,544 125 | 230,545 126 | 231,546 127 | 232,547 128 | 233,548 129 | 234,549 130 | 235,550 131 | 236,551 132 | 237,552 133 | 238,553 134 | 239,554 135 | 240,555 136 | 241,556 137 | 242,557 138 | 243,558 139 | 244,559 140 | 245,560 141 | 246,561 142 | 247,562 143 | 248,563 144 | 249,564 145 | 250,565 146 | 251,566 147 | 252,567 148 | 253,568 149 | 254,569 150 | 255,570 151 | 256,571 152 | 257,572 153 | 258,573 154 | 259,574 155 | 260,575 156 | 261,576 157 | 262,577 158 | 263,578 159 | 264,579 160 | 265,580 161 | 266,581 162 | 267,582 163 | 268,583 164 | 269,584 165 | 270,585 166 | 271,586 167 | 272,587 168 | 273,588 169 | 274,589 170 | 275,590 171 | 276,591 172 | 277,592 173 | 278,593 174 | 279,594 175 | 280,595 176 | 281,596 177 | 282,597 178 | 283,598 179 | 284,599 180 | 285,600 181 | 286,601 182 | 287,602 183 | 288,603 184 | 289,604 185 | 290,605 186 | 291,606 187 | 292,607 188 | 293,608 189 | 294,609 190 | 295,610 191 | 296,611 192 | 297,612 193 | 298,613 194 | 299,614 195 | 300,615 196 | 301,616 197 | 302,617 198 | 303,618 199 | 304,619 200 | 305,620 201 | 306,621 202 | 307,622 203 | 308,623 204 | 309,624 205 | 310,625 206 | 311,626 207 | 312,627 208 | 313,628 209 | 314,629 210 | 315,630 211 | 316,631 212 | 317,632 213 | 318,633 214 | 319,634 215 | 320,635 216 | 321,636 217 | 322,637 218 | 323,638 219 | 324,639 220 | 325,640 221 | 326,641 222 | 327,642 223 | 328,643 224 | 329,644 225 | 330,645 226 | -------------------------------------------------------------------------------- /datasets/fodors_zagats_single/metadata.txt: -------------------------------------------------------------------------------- 1 | fz.csv 2 | matches_fodors_zagats.csv 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ZeroER 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - pip=21.0.1 7 | - python=3.6.12 8 | - setuptools=52.0.0 9 | - tk=8.6.10 10 | - pip: 11 | - backcall==0.2.0 12 | - chardet==4.0.0 13 | - cloudpickle==1.6.0 14 | - cycler==0.10.0 15 | - decorator==4.4.2 16 | - idna==2.10 17 | - ipython==7.16.1 18 | - ipython-genutils==0.2.0 19 | - jedi==0.18.0 20 | - joblib==1.0.1 21 | - kiwisolver==1.3.1 22 | - matplotlib==3.3.4 23 | - numpy==1.19.5 24 | - pandas==1.1.5 25 | - parso==0.8.1 26 | - pexpect==4.8.0 27 | - pickleshare==0.7.5 28 | - pillow==8.1.0 29 | - prompt-toolkit==3.0.16 30 | - ptyprocess==0.7.0 31 | - py-entitymatching==0.4.0 32 | - py-stringmatching==0.4.2 33 | - py-stringsimjoin==0.3.2 34 | - pygments==2.8.0 35 | - pyparsing==2.4.7 36 | - pyprind==2.11.2 37 | - python-dateutil==2.8.1 38 | - pytz==2021.1 39 | - requests==2.25.1 40 | - scikit-learn==0.24.1 41 | - scipy==1.5.4 42 | - six==1.15.0 43 | - threadpoolctl==2.1.0 44 | - tqdm==4.57.0 45 | - traitlets==4.3.3 46 | - urllib3==1.26.3 47 | - wcwidth==0.2.5 -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pandas as pd 3 | from collections import Counter 4 | from scipy.optimize import newton 5 | import numpy as np 6 | from scipy.stats import norm, multivariate_normal 7 | from sklearn.metrics import precision_score, recall_score, f1_score 8 | from sklearn.mixture import GaussianMixture 9 | from tqdm import tqdm 10 | from collections import defaultdict 11 | from sklearn.preprocessing import MinMaxScaler 12 | 13 | def getScaledSum(similarity_features): 14 | feature_sums = np.sum(similarity_features, axis=1) 15 | scaler = MinMaxScaler() 16 | scaled = scaler.fit_transform(feature_sums.reshape(-1,1)) 17 | return scaled 18 | 19 | 20 | def get_y_init_given_threshold(similarity_features_df, threshold=0.8): 21 | x = similarity_features_df.values 22 | min_max_scaler = MinMaxScaler() 23 | x_scaled = min_max_scaler.fit_transform(x) 24 | scaled_sum = getScaledSum(x_scaled) 25 | training_labels_ = scaled_sum > threshold 26 | y_init = [int(val) for val in training_labels_] 27 | return y_init 28 | 29 | 30 | DEL = 1e-300 31 | 32 | def _get_results(true_labels, predicted_labels): 33 | p = precision_score(true_labels, predicted_labels) 34 | r = recall_score(true_labels, predicted_labels) 35 | f1 = f1_score(true_labels, predicted_labels) 36 | return p, r, f1 37 | 38 | def bay_coeff(a,b,u): 39 | return np.exp(-(np.log(a/(b+DEL)+b/(a+DEL)+2)+u/(a+b+DEL))) 40 | 41 | 42 | class ConvergenceMeter: 43 | def __init__(self, num_converged, rate_threshold, 44 | diff_fn=lambda a, b: abs(a - b)): 45 | self._num_converged = num_converged 46 | self._rate_threshold = rate_threshold 47 | self._diff_fn = diff_fn 48 | self._diff_history = list() 49 | self._last_val = None 50 | 51 | def offer(self, val): 52 | if self._last_val is not None: 53 | self._diff_history.append( 54 | self._diff_fn(val, self._last_val)) 55 | 56 | self._last_val = val 57 | 58 | @property 59 | def is_converged(self): 60 | if len(self._diff_history) < self._num_converged: 61 | return False 62 | 63 | return np.mean( 64 | self._diff_history[-self._num_converged:]) \ 65 | <= self._rate_threshold 66 | 67 | 68 | 69 | class ZeroerModel: 70 | class Gaussian: 71 | def __init__(self, mu, std): 72 | self.mu = mu 73 | self.std = (std + DEL) 74 | 75 | def plot(self, axis): 76 | x = np.linspace(0, 1, 1000) 77 | pdf = norm.pdf(x, self.mu, self.std) 78 | axis.plot(x, pdf, linewidth=4) 79 | 80 | def pdf(self, s): 81 | return norm.pdf(s, loc=self.mu, scale=self.std) 82 | 83 | def logpdf(self, s): 84 | return norm.logpdf(s, loc=self.mu, scale=self.std) 85 | 86 | 87 | def __init__(self, similarity_matrix, feature_names, y,id_df, c_bay,pi_M=None, hard=False): 88 | self.c_bay = c_bay 89 | self.y = get_y_init_given_threshold(pd.DataFrame(similarity_matrix)) 90 | self.X = np.array(similarity_matrix) 91 | self.id_tuple_to_index = {} 92 | if id_df is not None: 93 | self.ids = id_df.values 94 | for i in range(self.ids.shape[0]): 95 | self.id_tuple_to_index[(self.ids[i,0],self.ids[i,1])] = i 96 | self.id_tuple_to_index[(self.ids[i,1], self.ids[i,0])] = i 97 | 98 | Mu_all = np.mean(self.X,axis=0) 99 | self.Cov_all = np.dot(np.transpose(self.X - Mu_all),(self.X - Mu_all))/self.X.shape[0] 100 | self.corr = pd.DataFrame(similarity_matrix).corr().values 101 | self.sigma = np.zeros_like(self.corr) 102 | for i in range(self.corr.shape[0]): 103 | self.sigma[i,i] = np.std(self.X[:,i]) 104 | self.P_M = np.zeros(self.X.shape[0]) # M is class 1 105 | self.Q_avg = 0 106 | self.feature_names = feature_names 107 | 108 | self.col_index_2_group_name = [] 109 | self.group_name_2_col_indices = defaultdict(list) 110 | for i_col,name in enumerate(feature_names): 111 | self.col_index_2_group_name.append(name.split("_")[0]) 112 | self.group_name_2_col_indices[self.col_index_2_group_name[-1]].append(i_col) 113 | self.group_names = list(set(self.col_index_2_group_name)) 114 | 115 | if pi_M is None: 116 | pi_M = Counter(list(y))[1] / float(len(y)) 117 | 118 | self._hard = hard 119 | self._num_rows = self.X.shape[0] 120 | self._num_cols = self.X.shape[1] 121 | self._labels = list(sorted(np.unique(y))) 122 | self.y_step = y 123 | 124 | self.pi_M = pi_M 125 | self.pi_M_l = pi_M 126 | self.pi_M_r = pi_M 127 | self.params = [] 128 | self.Mu_M = np.zeros((self._num_cols,)) 129 | self.Mu_U = np.zeros((self._num_cols,)) 130 | self.Cov_M = np.zeros((self._num_cols,self._num_cols)) 131 | self.Cov_U = np.zeros((self._num_cols,self._num_cols)) 132 | for i in range(self._num_cols): 133 | self.params.append(self.fit_conditional_parameters(i)) 134 | self.Mu_U[i] = self.params[-1][0].mu 135 | self.Mu_M[i] = self.params[-1][1].mu 136 | self.Cov_U[i,i] = self.params[-1][0].std**2 137 | self.Cov_M[i,i] = self.params[-1][1].std**2 138 | self.P_M_2_dimen = None 139 | self.log_P_M_2_dimen = None 140 | self.log_P_U_2_dimen = None 141 | 142 | 143 | def get_class_wise_scores(self, i_cols): 144 | class_wise_scores = dict() 145 | for label in self._labels: 146 | class_wise_scores[label] = \ 147 | self.X[np.where(self.y == label), i_cols] 148 | 149 | return class_wise_scores 150 | 151 | 152 | def fit_conditional_parameters(self, i): 153 | class_wise_scores = self.get_class_wise_scores(i) 154 | 155 | class_wise_parameters = dict() 156 | for label in self._labels: 157 | gmm = GaussianMixture(n_components=1) 158 | gmm.fit(class_wise_scores[label].reshape(-1, 1)) 159 | 160 | class_wise_parameters[label] = \ 161 | self.Gaussian(mu=gmm.means_.flatten()[0], 162 | std=np.sqrt(gmm.covariances_.flatten()[0])) 163 | 164 | return class_wise_parameters 165 | 166 | 167 | def e_step(self, model_l = None,model_r = None): 168 | self.model_l = model_l 169 | self.model_r = model_r 170 | N = self._num_rows 171 | M = self._num_cols 172 | 173 | reg_cov = 1e-8 * np.identity(len(self.X[0])) 174 | self.Cov_M += reg_cov 175 | self.Cov_U += reg_cov 176 | 177 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_M))) 178 | if min_eig < 0: 179 | self.Cov_M -= 10 * min_eig * np.eye(*self.Cov_M.shape) 180 | #self.Cov_M += reg_cov 181 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_U))) 182 | if min_eig < 0: 183 | self.Cov_U -= 10 * min_eig * np.eye(*self.Cov_U.shape) 184 | #self.Cov_U += reg_cov 185 | log_prods_dup = multivariate_normal.logpdf(self.X, mean=self.Mu_M, cov=self.Cov_M,allow_singular=True) 186 | log_prods_non_dup = multivariate_normal.logpdf(self.X, mean=self.Mu_U, cov=self.Cov_U,allow_singular=True) 187 | 188 | pi_M = self.pi_M 189 | pi_U = 1 - pi_M 190 | 191 | prob_non_dup_over_dup = np.exp(np.clip(log_prods_non_dup - log_prods_dup, -500, 500)) 192 | 193 | self.Q_M = log_prods_dup 194 | self.Q_U = log_prods_non_dup 195 | 196 | 197 | self.P_M = pi_M/ (pi_M + pi_U * prob_non_dup_over_dup) 198 | self.P_U = 1-self.P_M 199 | if self._hard: 200 | self.P_M = np.round(np.clip(self.P_M, 0., 1.)) 201 | 202 | def free_energy(self): 203 | return self.P_M*(np.log(self.pi_M+DEL)-np.log(self.P_M+DEL)+self.Q_M)+self.P_U*(np.log(1-self.pi_M+DEL)-np.log(self.P_U+DEL)+self.Q_U) 204 | 205 | def predict_PM(self,X_test): 206 | reg_cov = 1e-8 * np.identity(len(self.X[0])) 207 | self.Cov_M += reg_cov 208 | self.Cov_U += reg_cov 209 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_M))) 210 | if min_eig < 0: 211 | self.Cov_M -= 10 * min_eig * np.eye(*self.Cov_M.shape) 212 | min_eig = np.min(np.real(np.linalg.eigvals(self.Cov_U))) 213 | if min_eig < 0: 214 | self.Cov_U -= 10 * min_eig * np.eye(*self.Cov_U.shape) 215 | log_prods_dup = multivariate_normal.logpdf(X_test, mean=self.Mu_M, cov=self.Cov_M) 216 | log_prods_non_dup = multivariate_normal.logpdf(X_test, mean=self.Mu_U, cov=self.Cov_U) 217 | 218 | pi_M = self.pi_M 219 | pi_U = 1 - pi_M 220 | 221 | prob_non_dup_over_dup = np.exp(np.clip(log_prods_non_dup - log_prods_dup, -500, 500)) 222 | 223 | 224 | P_M_test = pi_M / (pi_M + pi_U * prob_non_dup_over_dup) 225 | P_M_test = np.round(np.clip(P_M_test, 0., 1.)) 226 | return P_M_test 227 | 228 | def enforce_transitivity(self, P_M, ids, id_tuple_to_index, model_l, model_r,LR_dup_free=False,LR_identical=False): 229 | model_l_P_M=None 230 | model_r_P_M=None 231 | if model_l is not None: 232 | model_l_P_M = model_l.P_M 233 | model_r_P_M = model_r.P_M 234 | id_tuple_to_index_l = model_l.id_tuple_to_index 235 | id_tuple_to_index_r = model_r.id_tuple_to_index 236 | P_M = P_M.copy() 237 | pred_tuples = [] 238 | 239 | for i in range(P_M.shape[0]): 240 | if P_M[i]>0.5: 241 | pred_tuples.append((ids[i,0],ids[i,1])) 242 | pred_tuples = sorted(pred_tuples) 243 | 244 | for i in range(len(pred_tuples)): 245 | for j in range(i+1, len(pred_tuples)): 246 | if pred_tuples[j][0] == pred_tuples[i][0]: 247 | p1 = P_M[id_tuple_to_index[pred_tuples[i]]] 248 | p2 = P_M[id_tuple_to_index[pred_tuples[j]]] 249 | p_r = 0 250 | id1 = id_tuple_to_index[pred_tuples[i]] 251 | id2 = id_tuple_to_index[pred_tuples[j]] 252 | if LR_dup_free: 253 | p_r = 0 254 | idr = -1 255 | elif LR_identical: 256 | if (pred_tuples[i][1], pred_tuples[j][1]) not in id_tuple_to_index: 257 | p_r = 0 258 | idr = -1 259 | else: 260 | p_r = P_M[id_tuple_to_index[(pred_tuples[i][1],pred_tuples[j][1])]] 261 | idr = id_tuple_to_index[(pred_tuples[i][1],pred_tuples[j][1])] 262 | elif model_r_P_M is not None: 263 | if (pred_tuples[i][1], pred_tuples[j][1]) not in id_tuple_to_index_r: 264 | p_r = 0 265 | idr = -1 266 | else: 267 | p_r = model_r_P_M[id_tuple_to_index_r[(pred_tuples[i][1],pred_tuples[j][1])]] 268 | idr = id_tuple_to_index_r[(pred_tuples[i][1],pred_tuples[j][1])] 269 | 270 | if p1*p2 > p_r: 271 | delta_ls = [self.delta_L(p_r/p2,id1),self.delta_L(p_r/p1,id2)] 272 | if idr != -1: 273 | if LR_identical: 274 | delta_ls.append(self.delta_L(p1 * p2, idr)) 275 | else: 276 | delta_ls.append(model_r.delta_L(p1 * p2, idr)) 277 | i_max = np.argmax(delta_ls) 278 | if delta_ls[i_max]>-1e100: 279 | if i_max == 0: 280 | P_M[id1] = p_r / p2 281 | elif i_max == 1: 282 | P_M[id2] = p_r / p1 283 | elif i_max == 2: 284 | if LR_identical: 285 | P_M[idr] = p1 * p2 286 | else: 287 | model_r_P_M[idr] = p1*p2 288 | else: 289 | break 290 | 291 | pred_tuples = sorted(pred_tuples,key=lambda x:(x[1],x[0])) 292 | for i in range(len(pred_tuples)): 293 | for j in range(i+1, len(pred_tuples)): 294 | if pred_tuples[j][1] == pred_tuples[i][1]: 295 | p1 = P_M[id_tuple_to_index[pred_tuples[i]]] 296 | p2 = P_M[id_tuple_to_index[pred_tuples[j]]] 297 | p_l=0 298 | id1 = id_tuple_to_index[pred_tuples[i]] 299 | id2 = id_tuple_to_index[pred_tuples[j]] 300 | if LR_dup_free: 301 | p_l = 0 302 | idl = -1 303 | elif LR_identical: 304 | if (pred_tuples[i][0], pred_tuples[j][0]) not in id_tuple_to_index: 305 | p_l = 0 306 | idl = -1 307 | else: 308 | p_l = P_M[id_tuple_to_index[(pred_tuples[i][0],pred_tuples[j][0])]] 309 | idl = id_tuple_to_index[(pred_tuples[i][0],pred_tuples[j][0])] 310 | elif model_l_P_M is not None: 311 | if (pred_tuples[i][0], pred_tuples[j][0]) not in id_tuple_to_index_l: 312 | p_l = 0 313 | idl = -1 314 | else: 315 | p_l = model_l_P_M[id_tuple_to_index_l[(pred_tuples[i][0],pred_tuples[j][0])]] 316 | idl = id_tuple_to_index_l[(pred_tuples[i][0],pred_tuples[j][0])] 317 | #p_l = 0 318 | #idl = -1 319 | if p1*p2 > p_l: 320 | delta_ls = [self.delta_L(p_l / p2, id1), self.delta_L(p_l / p1, id2)] 321 | if idl != -1: 322 | if LR_identical: 323 | delta_ls.append(self.delta_L(p1 * p2, idl)) 324 | else: 325 | delta_ls.append(model_l.delta_L(p1 * p2, idl)) 326 | i_max = np.argmax(delta_ls) 327 | if delta_ls[i_max]>-1e100: 328 | if i_max == 0: 329 | P_M[id1] = p_l / p2 330 | elif i_max == 1: 331 | P_M[id2] = p_l / p1 332 | elif i_max == 2: 333 | if LR_identical: 334 | P_M[idl] = p1*p2 335 | else: 336 | model_l_P_M[idl] = p1 * p2 337 | else: 338 | break 339 | if model_r_P_M is not None: 340 | model_l.P_M = model_l_P_M 341 | model_r.P_M = model_r_P_M 342 | return P_M 343 | 344 | def m_step(self): 345 | N = self._num_rows 346 | M = self._num_cols 347 | 348 | X = self.X 349 | P_M = self.P_M 350 | P_U = 1. - P_M 351 | 352 | if self._hard: 353 | P_M = P_M.astype(int) 354 | P_U = P_U.astype(int) 355 | 356 | N_M = np.sum(P_M, axis=0) 357 | N_U = N - N_M 358 | 359 | self.pi_M = N_M / N 360 | 361 | 362 | P_M = P_M.reshape(N, 1) 363 | P_U = P_U.reshape(N, 1) 364 | 365 | self.Mu_M = np.sum(P_M * X, axis=0) / (N_M + DEL) 366 | self.Mu_U = np.sum(P_U * X, axis=0) / (N_U + DEL) 367 | 368 | smooth_factor = abs((self.Mu_M - self.Mu_U))**2 369 | 370 | std_M = (np.sqrt(np.sum( 371 | P_M * ((X - np.tile(self.Mu_M, (N, 1))) ** 2), axis=0) / (N_M + DEL))) + 1e-100 372 | std_U = (np.sqrt(np.sum( 373 | P_U * ((X - np.tile(self.Mu_U, (N, 1))) ** 2), axis=0) / (N_U + DEL))) + 1e-100 374 | 375 | Cov_M = np.dot(np.transpose(self.X - self.Mu_M),P_M*(self.X - self.Mu_M))/(N_M + DEL) 376 | Cov_U = np.dot(np.transpose(self.X - self.Mu_U),P_U*(self.X - self.Mu_U))/(N_U + DEL) 377 | 378 | a = np.diag(Cov_M) 379 | b = np.diag(Cov_U) 380 | u = (self.Mu_M - self.Mu_U)**2 381 | c=0.15 382 | 383 | c_bay = self.c_bay 384 | bay_ori = bay_coeff(a,b,u) 385 | target_bay =bay_ori + c_bay 386 | target_bay[target_bay>=1] = bay_ori[target_bay>=1]/2+0.5 387 | def bay_coeff_equ(x): 388 | return bay_coeff(a + x, b + x, u) - target_bay 389 | x0=c*smooth_factor 390 | x1 = np.zeros_like(x0) 391 | kappas = newton(bay_coeff_equ,x0=x0,x1=x1,maxiter=5,tol=1) 392 | kappas[kappas<0] = 0 393 | kappas[kappas>1] = 1 394 | kappas = np.nan_to_num(kappas,posinf=0,neginf=0) 395 | self.Cov_M = np.zeros_like(Cov_M) 396 | self.Cov_U = np.zeros_like(Cov_U) 397 | 398 | for g_name in self.group_names: 399 | i_cols = self.group_name_2_col_indices[g_name] 400 | 401 | for col_1 in i_cols: 402 | for col_2 in i_cols: 403 | if col_2 == col_1: 404 | self.Cov_M[col_1, col_2] = Cov_M[col_1, col_2]+kappas[col_1] 405 | self.Cov_U[col_1, col_2] = Cov_U[col_1, col_2]+kappas[col_1] 406 | else: 407 | self.Cov_M[col_1, col_2] = self.corr[col_1,col_2]*std_M[col_1]*std_M[col_2] 408 | self.Cov_U[col_1, col_2] = self.corr[col_1,col_2]*std_U[col_1]*std_U[col_2] 409 | def L(self,q,i): 410 | return q*(np.log(self.pi_M+DEL) + self.Q_M[i] - np.log(q+DEL)) +(1-q)*(np.log(1-self.pi_M+DEL)+self.Q_U[i]-np.log(1-q+DEL)) 411 | 412 | def delta_L(self,q,i): 413 | delta = self.L(q,i) - self.L(self.P_M[i],i) 414 | if delta > 0.00001: 415 | return -1e200 416 | return delta 417 | 418 | def save_model(self, filepath): 419 | pickle.dump(self, open(filepath, 'wb')) 420 | 421 | @staticmethod 422 | def load_model(filepath): 423 | return pickle.load(open(filepath, 'rb')) 424 | 425 | @classmethod 426 | def run_em(cls, similarity_matrixs, feature_names, y_inits,id_dfs,LR_dup_free,LR_identical,run_trans, 427 | c_bay=0.015, 428 | y_true=None, 429 | pi_M=None, 430 | hard=False, 431 | max_iter=40): 432 | sims, sims_l, sims_r = similarity_matrixs 433 | y_init,y_init_l,y_init_r = y_inits 434 | id_df, id_df_l, id_df_r = id_dfs 435 | model = cls(sims, feature_names,y_init,id_df,pi_M=pi_M, hard=hard,c_bay=c_bay) 436 | if run_trans and LR_dup_free==False and LR_identical==False: 437 | model_l = cls(sims_l, feature_names,y_init_l,id_df_l,c_bay=c_bay) 438 | model_r = cls(sims_r, feature_names,y_init_r,id_df_r,c_bay=c_bay) 439 | 440 | convergence = ConvergenceMeter(10, 0.01, diff_fn=lambda a, b: np.linalg.norm(a - b)) 441 | 442 | with tqdm(range(max_iter)) as pbar: 443 | for i in pbar: 444 | model.e_step() 445 | if run_trans: 446 | if LR_dup_free==False and LR_identical==False: 447 | model_r.e_step() 448 | model_l.e_step() 449 | for i in range(4): 450 | if LR_dup_free == False and LR_identical==False: 451 | model_l.P_M = model_l.enforce_transitivity(model_l.P_M, model_l.ids, model_l.id_tuple_to_index, model_l, model_l) 452 | model_r.P_M = model_r.enforce_transitivity(model_r.P_M, model_r.ids, model_r.id_tuple_to_index, model_r, model_r) 453 | model.P_M = model.enforce_transitivity(model.P_M, model.ids, model.id_tuple_to_index, model_l, model_r) 454 | else: 455 | model.P_M = model.enforce_transitivity(model.P_M, model.ids, model.id_tuple_to_index, None, None,LR_dup_free,LR_identical) 456 | model.m_step() 457 | if run_trans and LR_dup_free == False and LR_identical==False: 458 | model_r.m_step() 459 | model_l.m_step() 460 | 461 | convergence.offer(model.free_energy()) 462 | if convergence.is_converged: 463 | break 464 | if y_true is not None: 465 | y_pred = np.round(np.clip(model.P_M + DEL, 0., 1.)).astype(int) \ 466 | if not hard else model.P_M.astype(int) 467 | p, r, f1 = _get_results(y_true, y_pred) 468 | result_str = ( 469 | "norm: {:0.2f}, " 470 | "F1: {:0.2f}, " 471 | "Precision: {:0.2f}, " 472 | "Recall: {:0.2f}".format( 473 | np.linalg.norm(model.P_M), 474 | f1, p, r)) 475 | pbar.set_description_str(result_str) 476 | 477 | return model, model.P_M 478 | 479 | 480 | if __name__ == '__main__': 481 | pass 482 | 483 | 484 | 485 | 486 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import precision_score, recall_score, f1_score 3 | 4 | from model import get_y_init_given_threshold,ZeroerModel 5 | 6 | DEL = 1e-300 7 | 8 | 9 | def get_results(true_labels, predicted_labels): 10 | p = precision_score(true_labels, predicted_labels) 11 | r = recall_score(true_labels, predicted_labels) 12 | f1 = f1_score(true_labels, predicted_labels) 13 | return p, r, f1 14 | 15 | 16 | def run_zeroer(similarity_features_df, similarity_features_lr,id_dfs,true_labels,LR_dup_free,LR_identical,run_trans): 17 | similarity_matrix = similarity_features_df.values 18 | y_init = get_y_init_given_threshold(similarity_features_df) 19 | similarity_matrixs = [similarity_matrix,None,None] 20 | y_inits = [y_init,None,None] 21 | if similarity_features_lr[0] is not None: 22 | similarity_matrixs[1] = similarity_features_lr[0].values 23 | similarity_matrixs[2] = similarity_features_lr[1].values 24 | y_inits[1] = get_y_init_given_threshold(similarity_features_lr[0]) 25 | y_inits[2] = get_y_init_given_threshold(similarity_features_lr[1]) 26 | feature_names = similarity_features_df.columns 27 | 28 | c_bay = 0.1 29 | model, y_pred = ZeroerModel.run_em(similarity_matrixs, feature_names, y_inits,id_dfs,LR_dup_free,LR_identical, run_trans, y_true=true_labels, 30 | hard=False, c_bay=c_bay) 31 | if true_labels is not None: 32 | p, r, f1 = get_results(true_labels, np.round(np.clip(y_pred + DEL, 0., 1.)).astype(int)) 33 | print("Results after EM:") 34 | print("F1: {:0.2f}, Precision: {:0.2f}, Recall: {:0.2f}".format(f1, p, r)) 35 | return y_pred 36 | -------------------------------------------------------------------------------- /zeroer.py: -------------------------------------------------------------------------------- 1 | from data_loading_helper.data_loader import load_data 2 | from data_loading_helper.feature_extraction import * 3 | from utils import run_zeroer 4 | from blocking_functions import * 5 | from os.path import join 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("dataset",type=str) 9 | parser.add_argument("--run_transitivity",type=bool,default=False,nargs="?",const=True, help="whether to enforce transitivity constraint") 10 | parser.add_argument("--LR_dup_free",type=bool,default=False,nargs="?",const=True, help="are the left table and right table duplicate-free?") 11 | parser.add_argument("--LR_identical",type=bool,default=False,nargs="?",const=True, help="are the left table and right table identical?") 12 | 13 | data_path = "datasets" 14 | 15 | if __name__ == '__main__': 16 | args = parser.parse_args() 17 | LR_dup_free = args.LR_dup_free 18 | run_trans = args.run_transitivity 19 | LR_identical = args.LR_identical 20 | dataset_name = args.dataset 21 | dataset_path = join(data_path,dataset_name) 22 | blocking_func = blocking_functions_mapping[dataset_name] 23 | try: 24 | candset_features_df = pd.read_csv(join(dataset_path,"candset_features_df.csv"), index_col=0) 25 | candset_features_df.reset_index(drop=True,inplace=True) 26 | if run_trans==True: 27 | id_df = candset_features_df[["ltable_id","rtable_id"]] 28 | id_df.reset_index(drop=True,inplace=True) 29 | if LR_dup_free==False and LR_identical==False: 30 | candset_features_df_l = pd.read_csv(join(dataset_path,"candset_features_df_l.csv"), index_col=0) 31 | candset_features_df_l.reset_index(drop=True,inplace=True) 32 | candset_features_df_r = pd.read_csv(join(dataset_path,"candset_features_df_r.csv"), index_col=0) 33 | candset_features_df_r.reset_index(drop=True,inplace=True) 34 | id_df_l = candset_features_df_l[["ltable_id","rtable_id"]] 35 | id_df_l.reset_index(drop=True,inplace=True) 36 | id_df_r = candset_features_df_r[["ltable_id","rtable_id"]] 37 | id_df_r.reset_index(drop=True,inplace=True) 38 | print( 39 | "Features already generated, reading from file: " + dataset_path + "/candset_features_df.csv") 40 | 41 | except FileNotFoundError: 42 | print("Generating features and storing in: " + dataset_path + "/candset_features_df.csv") 43 | 44 | f = open(join(dataset_path, 'metadata.txt'), "r") 45 | LEFT_FILE = join(dataset_path, f.readline().strip()) 46 | if LR_identical: 47 | RIGHT_FILE = LEFT_FILE 48 | else: 49 | RIGHT_FILE = join(dataset_path, f.readline().strip()) 50 | DUPLICATE_TUPLES = join(dataset_path, f.readline().strip()) 51 | f.close() 52 | if run_trans==True and LR_dup_free==False and LR_identical==False: 53 | ltable_df, rtable_df, duplicates_df, candset_df,candset_df_l,candset_df_r = load_data(LEFT_FILE, RIGHT_FILE, DUPLICATE_TUPLES, 54 | blocking_func, 55 | include_self_join=True) 56 | else: 57 | ltable_df, rtable_df, duplicates_df, candset_df = load_data(LEFT_FILE, RIGHT_FILE, DUPLICATE_TUPLES, 58 | blocking_func, 59 | include_self_join=False) 60 | if LR_identical: 61 | print("removing self matches") 62 | candset_df = candset_df.loc[candset_df.ltable_id!=candset_df.rtable_id,:] 63 | candset_df.reset_index(inplace=True,drop=True) 64 | candset_df['_id'] = candset_df.index 65 | if duplicates_df is None: 66 | duplicates_df = pd.DataFrame(columns=["ltable_id", "rtable_id"]) 67 | candset_features_df = gather_features_and_labels(ltable_df, rtable_df, duplicates_df, candset_df) 68 | candset_features_df.to_csv(join(dataset_path,"candset_features_df.csv")) 69 | id_df = candset_df[["ltable_id", "rtable_id"]] 70 | 71 | if run_trans == True and LR_dup_free == False and LR_identical==False: 72 | duplicates_df_r = pd.DataFrame() 73 | duplicates_df_r['l_id'] = rtable_df["id"] 74 | duplicates_df_r['r_id'] = rtable_df["id"] 75 | candset_features_df_r = gather_features_and_labels(rtable_df, rtable_df, duplicates_df_r, candset_df_r) 76 | candset_features_df_r.to_csv(join(dataset_path,"candset_features_df_r.csv")) 77 | 78 | 79 | duplicates_df_l = pd.DataFrame() 80 | duplicates_df_l['l_id'] = ltable_df["id"] 81 | duplicates_df_l['r_id'] = ltable_df["id"] 82 | candset_features_df_l = gather_features_and_labels(ltable_df, ltable_df, duplicates_df_l, candset_df_l) 83 | candset_features_df_l.to_csv(join(dataset_path,"candset_features_df_l.csv")) 84 | 85 | id_df_l = candset_df_l[["ltable_id","rtable_id"]] 86 | id_df_r = candset_df_r[["ltable_id","rtable_id"]] 87 | id_df_l.to_csv(join(dataset_path,"id_tuple_df_l.csv")) 88 | id_df_r.to_csv(join(dataset_path,"id_tuple_df_r.csv")) 89 | 90 | similarity_features_df = gather_similarity_features(candset_features_df) 91 | similarity_features_lr = (None,None) 92 | id_dfs = (None, None, None) 93 | if run_trans == True: 94 | id_dfs = (id_df, None, None) 95 | if LR_dup_free == False and LR_identical==False: 96 | similarity_features_df_l = gather_similarity_features(candset_features_df_l) 97 | similarity_features_df_r = gather_similarity_features(candset_features_df_r) 98 | features = set(similarity_features_df.columns) 99 | features = features.intersection(set(similarity_features_df_l.columns)) 100 | features = features.intersection(set(similarity_features_df_r.columns)) 101 | features = sorted(list(features)) 102 | similarity_features_df = similarity_features_df[features] 103 | similarity_features_df_l = similarity_features_df_l[features] 104 | similarity_features_df_r = similarity_features_df_r[features] 105 | similarity_features_lr = (similarity_features_df_l,similarity_features_df_r) 106 | id_dfs = (id_df, id_df_l, id_df_r) 107 | 108 | true_labels = candset_features_df.gold.values 109 | if np.sum(true_labels)==0: 110 | true_labels = None 111 | y_pred = run_zeroer(similarity_features_df, similarity_features_lr,id_dfs, 112 | true_labels ,LR_dup_free,LR_identical,run_trans) 113 | pred_df = candset_features_df[["ltable_id","rtable_id"]] 114 | pred_df['pred'] = y_pred 115 | pred_df.to_csv(join(dataset_path,"pred.csv")) 116 | 117 | --------------------------------------------------------------------------------