├── .gitignore ├── LICENSE ├── README.md ├── cs_util.py ├── dw_util.py ├── java ├── HiveSerdes │ ├── dependency-reduced-pom.xml │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── cloudera │ │ └── hive │ │ └── serde │ │ └── JSONSerDe.java └── MapReduce │ ├── dependency-reduced-pom.xml │ ├── pom.xml │ └── src │ └── main │ └── java │ └── com │ └── onefold │ └── hadoop │ └── MapReduce │ └── TransformDataMultiOutputFormat.java ├── json ├── generate-schema-mapper.py ├── generate-schema-reducer.py └── transform-data-mapper.py ├── onefold.py └── onefold_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Java / Eclipse 60 | .idea 61 | *.class 62 | *.classpath 63 | .project 64 | **/.settings -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mongo - Google Big Query Connector 2 | 3 | Super-easy way to load your MongoDB collection into Google BigQuery. The code creates Google BigQuery schema automatically by performing a deep inspection of each MongoDB record and deriving the data type of each field. Supports basic data types, nested objects, array of primitive data types and array of objects. 4 | 5 | Nested fields are flattened out into columns. 6 | 7 | Arrays are typically split into a different (child) BigQuery table with parent/child relationship with the root table. 8 | 9 | ## How it works 10 | 11 | 1. Connects to your MongoDB and extract the specified collection into local file which is then copied to Google Cloud Storage. 12 | 2. MapReduce generates schema (a copy is saved back to MongoDB for info). 13 | 3. MapReduce transforms data, breaking the array into multiple files in Google Cloud Storage output folder. 14 | 4. Create BigQuery tables using schema generated in step 2. 15 | 5. Load BigQuery tables using Google Cloud Storage files generated in step 3. 16 | 17 | ## Pre-requisites 18 | 19 | 1. You have a Hadoop cluster. 20 | 2. You can SSH to the master node. 21 | 3. Make sure `hadoop` program is in your `PATH`. 22 | 4. In each node, the following is installed: 23 | * python (2.6+) 24 | * pip 25 | * pymongo 26 | 27 | If not, you can run the following on each node: 28 | ``` 29 | yum -y install epel-release 30 | yum -y install python-pip 31 | pip install pymongo 32 | ``` 33 | 34 | ## Install 35 | 36 | 1. git clone this repo on the master node in your Hadoop cluster. 37 | 2. Run this to compile custom code needed for MapReduce: 38 | 39 | ``` 40 | cd java/MapReduce 41 | mvn package 42 | ``` 43 | 3. Make sure you have gcloud command line utilities installed in your Hadoop master mode. Executables that this program depends on are: 44 | 45 | ``` 46 | gsutil 47 | bq 48 | ``` 49 | 50 | In `onefold.py`, near the top, there are a few configuration that you can customized. Make sure these variables are set correctly before proceeding. 51 | 52 | `TMP_PATH` 53 | 54 | Where the script will store extracted data from MongoDB. 55 | 56 | `CLOUD_STORAGE_PATH` 57 | 58 | Google Cloud Storage Path where it will store files for MapReduce and BigQuery. 59 | 60 | 61 | ## Usage 62 | 63 | ### Simple case 64 | Say you have a MongoDB collection called "test.users", and you have some records in it: 65 | 66 | ``` 67 | > db.users.find(); 68 | { "_id" : ObjectId("5688d0855d53fc2c133f3429"), "mobile" : { "carrier" : "Sprint", "device" : "Samsung" }, "name" : "John Doe", "age" : 24, "utm_campaign" : "Facebook_Offer", "app_version" : "2.4", "address" : { "city" : "Chicago", "zipcode" : 94012 } } 69 | ``` 70 | 71 | To load this into BigQuery, 72 | 73 | ``` 74 | ./onefold.py --mongo mongodb://[mongodb_host]:[mongodb_port] \ 75 | --source_db test \ 76 | --source_collection users \ 77 | --infra_type gcloud \ 78 | --dest_db_name test \ 79 | --dest_table_name users \ 80 | --gcloud_project_id [google_cloud_project_id] \ 81 | --gcloud_storage_bucket_id [google_cloud_storage_bucket_id] 82 | ``` 83 | 84 | Results: 85 | ``` 86 | -- Initializing Google BigQuery module -- 87 | Creating file /tmp/onefold_mongo/users/data/1 88 | Executing command: cat /tmp/onefold_mongo/users/data/1 | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py mongodb://localhost:27017/test/users_schema > /dev/null 89 | Executing command: cat /tmp/onefold_mongo/users/data/1 | json/transform-data-mapper.py mongodb://localhost:27017/test/users_schema,/tmp/onefold_mongo/users/data_transform/output > /dev/null 90 | ... 91 | Executing command: gsutil -m rm -rf gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/ 92 | Removing gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/part-00000#1451806915461000... 93 | copy_from_local: /tmp/onefold_mongo/users/data_transform/output/root/part-00000 onefold_mongo/users/data_transform/output/root/ 94 | Executing command: gsutil -m cp /tmp/onefold_mongo/users/data_transform/output/root/part-00000 gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/part-00000 95 | ... 96 | Executing command: bq --project_id mongo-gbq --format csv ls test 97 | Executing command: bq --project_id mongo-gbq mk --schema users_schema.json test.users 98 | Table 'mongo-gbq:test.users' successfully created. 99 | Loading fragment: root 100 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.users gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/* 101 | Successfully started load mongo-gbq:bqjob_r4d275de6da77baf3_0000015206702df7_1 102 | ------------------- 103 | RUN SUMMARY 104 | ------------------- 105 | Num records extracted 1 106 | Num records rejected 0 107 | Extracted data with _id from 5688d0855d53fc2c133f3429 to 5688d0855d53fc2c133f3429 108 | Extracted files are located at: /tmp/onefold_mongo/users/data/1 109 | Destination Tables: users 110 | Schema is stored in Mongo test.users_schema 111 | ``` 112 | 113 | In Google BigQuery, you can see: 114 | ``` 115 | $ bq show test.users 116 | Table mongo-gbq:test.users 117 | 118 | Last modified Schema Total Rows Total Bytes Expiration 119 | ----------------- --------------------------- ------------ ------------- ------------ 120 | 02 Jan 23:43:12 |- address_city: string 1 141 121 | |- address_zipcode: float 122 | |- age: float 123 | |- app_version: string 124 | |- id_oid: string 125 | |- mobile_carrier: string 126 | |- mobile_device: string 127 | |- name: string 128 | |- utm_campaign: string 129 | |- hash_code: string 130 | 131 | $ bq query "select * from test.users" 132 | Waiting on bqjob_r710f4e875a413367_000001520674ebba_1 ... (0s) Current status: DONE 133 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+ 134 | | address_city | address_zipcode | age | app_version | id_oid | mobile_carrier | mobile_device | name | utm_campaign | hash_code | 135 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+ 136 | | Chicago | 94012.0 | 24.0 | 2.4 | 5688d0855d53fc2c133f3429 | Sprint | Samsung | John Doe | Facebook_Offer | abf9a2ac1ce71feb12418c889b913f8d8361a6d4 | 137 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+ 138 | ``` 139 | 140 | In Mongo, you can see the schema saved in a collection called `users_schema`: 141 | ``` 142 | > db.users_schema.find(); 143 | { "_id" : ObjectId("55426ae6296e827fc79300b1"), "type" : "field", "data_type" : "string-nullable", "key" : "address_city" } 144 | { "_id" : ObjectId("55426ae6296e827fc79300b2"), "type" : "field", "data_type" : "record-nullable", "key" : "address" } 145 | { "_id" : ObjectId("55426ae6296e827fc79300b3"), "type" : "field", "data_type" : "integer-nullable", "key" : "address_zipcode" } 146 | { "_id" : ObjectId("55426ae6296e827fc79300b4"), "type" : "field", "data_type" : "integer-nullable", "key" : "age" } 147 | { "_id" : ObjectId("55426ae6296e827fc79300b5"), "type" : "field", "data_type" : "string-nullable", "key" : "app_version" } 148 | { "_id" : ObjectId("55426ae6296e827fc79300b6"), "type" : "field", "data_type" : "string-nullable", "key" : "id_oid" } 149 | { "_id" : ObjectId("55426ae6296e827fc79300b7"), "type" : "field", "data_type" : "record-nullable", "key" : "id" } 150 | { "_id" : ObjectId("55426ae6296e827fc79300b8"), "type" : "field", "data_type" : "string-nullable", "key" : "mobile_carrier" } 151 | { "_id" : ObjectId("55426ae6296e827fc79300b9"), "type" : "field", "data_type" : "string-nullable", "key" : "mobile_device" } 152 | { "_id" : ObjectId("55426ae6296e827fc79300ba"), "type" : "field", "data_type" : "record-nullable", "key" : "mobile" } 153 | { "_id" : ObjectId("55426ae6296e827fc79300bb"), "type" : "field", "data_type" : "string-nullable", "key" : "name" } 154 | { "_id" : ObjectId("55426ae6296e827fc79300bc"), "type" : "field", "data_type" : "string-nullable", "key" : "utm_campaign" } 155 | { "_id" : ObjectId("55426ae72e2ecef82b7417d1"), "type" : "fragments", "fragments" : [ "root" ] } 156 | ``` 157 | 158 | Notes: 159 | 160 | 1. By default, extracted data is saved in `/tmp/onefold_mongo`. It can be changed by specifying the `tmp_path` parameter. 161 | 2. If `--use_mr` parameter is specified, it will use MapReduce to generate schema and transform data. Otherwise, it runs the mapper and reducer via command line using `cat [input] | mapper | sort | reducer` metaphor. This is handy if you don't have many records and/or just want to get this going quickly. 162 | 3. The generated files are in JSON format. 163 | 4. Nested objects like `mobile` and `address` in the above example are flattened out in the BigQuery table. 164 | 5. `hash_code` column is added. It's basically an SHA1 hash of the object. It's useful later on when we use `hash_code` as parent-child key to represent array in a child table. 165 | 166 | 167 | ### Now let's try a more complex collection. 168 | 169 | In Mongo, create a `complex_users` collection with the following fields: 170 | ``` 171 | > db.complex_users.find() 172 | { "_id" : ObjectId("5688d73c5d53fc2c133f342b"), "hobbies" : [ "reading", "cycling" ], "age" : 34, "work_history" : [ { "to" : "present", "from" : 2013, "name" : "IBM" }, { "to" : 2013, "from" : 2003, "name" : "Bell" } ], "utm_campaign" : "Google", "name" : "Alexander Keith", "app_version" : "2.5", "mobile" : { "device" : "iPhone", "carrier" : "Rogers" }, "address" : { "state" : "Ontario", "zipcode" : "M1K3A5", "street" : "26 Marshall Lane", "city" : "Toronto" } } 173 | ``` 174 | 175 | A new `hobbies` field is added that is a string array. 176 | A new `work_history` field is added that is an array of nested objects. 177 | 178 | Run the following command to load `complex_users` collection into BigQuery: 179 | ``` 180 | ./onefold.py --mongo mongodb://[mongodb_host]:[mongodb_port] \ 181 | --source_db test \ 182 | --source_collection complex_users \ 183 | --infra_type gcloud \ 184 | --dest_db_name test \ 185 | --dest_table_name complex_users \ 186 | --gcloud_project_id [google_cloud_project_id] \ 187 | --gcloud_storage_bucket_id [google_cloud_storage_bucket_id] 188 | ``` 189 | 190 | Results: 191 | ``` 192 | -- Initializing Google BigQuery module -- 193 | Creating file /tmp/onefold_mongo/complex_users/data/1 194 | Executing command: cat /tmp/onefold_mongo/complex_users/data/1 | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py mongodb://localhost:27017/test/complex_users_schema > /dev/null 195 | Executing command: cat /tmp/onefold_mongo/complex_users/data/1 | json/transform-data-mapper.py mongodb://localhost:27017/test/complex_users_schema,/tmp/onefold_mongo/complex_users/data_transform/output > /dev/null 196 | Executing command: rm -rf /tmp/onefold_mongo/complex_users/data_transform/output 197 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/root 198 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 199 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 200 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/work_history 201 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 202 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 203 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/hobbies 204 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 205 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 206 | ... 207 | Executing command: gsutil -m rm -rf gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/ 208 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 onefold_mongo/complex_users/data_transform/output/root/ 209 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/root/part-00000 210 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 [Content-Type=application/octet-stream]... 211 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 onefold_mongo/complex_users/data_transform/output/work_history/ 212 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 213 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 [Content-Type=application/octet-stream]... 214 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 onefold_mongo/complex_users/data_transform/output/hobbies/ 215 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 216 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 [Content-Type=application/octet-stream]... 217 | ... 218 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_schema.json test.complex_users 219 | Table 'mongo-gbq:test.complex_users' successfully created. 220 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_work_history_schema.json test.complex_users_work_history 221 | Table 'mongo-gbq:test.complex_users_work_history' successfully created. 222 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_hobbies_schema.json test.complex_users_hobbies 223 | Table 'mongo-gbq:test.complex_users_hobbies' successfully created. 224 | Loading fragment: root 225 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/root/* 226 | Successfully started load mongo-gbq:bqjob_r4fe3384c09234c1d_00000152068b5e85_1 227 | Loading fragment: work_history 228 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users_work_history gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/work_history/* 229 | Successfully started load mongo-gbq:bqjob_r138de33f6e2058cc_00000152068b62aa_1 230 | Loading fragment: hobbies 231 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users_hobbies gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/hobbies/* 232 | Successfully started load mongo-gbq:bqjob_r361aa8424636d4a0_00000152068b689e_1 233 | ------------------- 234 | RUN SUMMARY 235 | ------------------- 236 | Num records extracted 1 237 | Num records rejected 0 238 | Extracted data with _id from 5688d73c5d53fc2c133f342b to 5688d73c5d53fc2c133f342b 239 | Extracted files are located at: /tmp/onefold_mongo/complex_users/data/1 240 | Destination Tables: complex_users complex_users_work_history complex_users_hobbies 241 | Schema is stored in Mongo test.complex_users_schema 242 | ``` 243 | 244 | In BigQuery, three new tables are created: `complex_users`, `complex_users_hobbies` and `complex_users_work_history` 245 | ``` 246 | $ bq ls test 247 | tableId Type 248 | ---------------------------- ------- 249 | complex_users TABLE 250 | complex_users_hobbies TABLE 251 | complex_users_work_history TABLE 252 | 253 | $ bq show test.complex_users 254 | Table mongo-gbq:test.complex_users 255 | 256 | Last modified Schema Total Rows Total Bytes Expiration 257 | ----------------- ---------------------------- ------------ ------------- ------------ 258 | 03 Jan 00:12:48 |- address_city: string 1 166 259 | |- address_state: string 260 | |- address_street: string 261 | |- address_zipcode: string 262 | |- age: float 263 | |- app_version: string 264 | |- id_oid: string 265 | |- mobile_carrier: string 266 | |- mobile_device: string 267 | |- name: string 268 | |- utm_campaign: string 269 | |- hash_code: string 270 | 271 | $ bq show test.complex_users_hobbies 272 | Table mongo-gbq:test.complex_users_hobbies 273 | 274 | Last modified Schema Total Rows Total Bytes Expiration 275 | ----------------- ----------------------------- ------------ ------------- ------------ 276 | 03 Jan 00:12:49 |- parent_hash_code: string 2 102 277 | |- hash_code: string 278 | |- value: string 279 | 280 | $ bq show test.complex_users_work_history 281 | Table mongo-gbq:test.complex_users_work_history 282 | 283 | Last modified Schema Total Rows Total Bytes Expiration 284 | ----------------- ----------------------------- ------------ ------------- ------------ 285 | 03 Jan 00:12:47 |- parent_hash_code: string 2 212 286 | |- hash_code: string 287 | |- from: float 288 | |- name: string 289 | |- to: string 290 | ``` 291 | 292 | You can join parent and child table like: 293 | ``` 294 | $ bq query "select * from test.complex_users join test.complex_users_hobbies on test.complex_users.hash_code = test.complex_users_hobbies.parent_hash_code" 295 | ``` 296 | 297 | ## Parameters 298 | 299 | `--mongo` 300 | MongoDB connectivity URI, e.g. mongodb://127.0.0.1:27017 301 | 302 | `--source_db` 303 | The MongoDB database name from which to extract data. 304 | 305 | `--source_collection` 306 | The MongoDB collection name from which to extract data. 307 | 308 | `--query` 309 | Optional query users can specify when doing extraction. Useful for filtering out only incremental records. See below for some examples. 310 | 311 | `--tmp_path` 312 | Optional. Path used to store extracted data. Default is `/tmp/onefold_mongo` 313 | 314 | `--schema_db` 315 | Optional. The MongoDB database name to which schema data is written. Default to the same database as source. 316 | 317 | `--schema_collection` 318 | Optional. The MongoDB collection to which schema data is written. Default to `[source_collection]_schema`. 319 | 320 | `--dest_db_name` 321 | Optional. The BigQuery dataset to use. 322 | 323 | `--dest_table_name` 324 | Optional. The BigQuery table name to use. If not specified, it will use source collection name. 325 | 326 | `--use_mr` 327 | If this parameter is specified, the program will use MapReduce to generate schema and transform data. If not, the mapper and reducer will be executed as command line using the `cat [input] | mapper | sort | reducer` metaphore. This is useful for small data set and if you just want to get things up and running quickly. 328 | 329 | `--policy_file` 330 | Use the specified file for policies which you can use to configure required fields, etc. See below for supported policies 331 | 332 | `--infra_type` 333 | Specify `gcloud` for Google BigQuery 334 | 335 | `--gcloud_project_id` 336 | Specify the Google Cloud project id 337 | 338 | `--gcloud_storage_bucket_id` 339 | Specify the bucket ID of the Google Cloud Storage bucket to use for file storage 340 | 341 | 342 | ## Policy Manager 343 | 344 | Policy manager is used to control schema generation. With the policy manager, you can: 345 | 346 | 1. Specify required fields. If the field is missing, the document is rejected. Rejected documents are saved in `[TMP_PATH]/[collection_name]/rejected` folder. 347 | 2. Enforce data type for certain fields. In the example below, `age` is forced to be integer. So if there is a document that contains non-integer, the field will be null. 348 | 349 | Example policy file: 350 | 351 | ``` 352 | [ 353 | { 354 | "key": "last_name", 355 | "required": true 356 | }, 357 | { 358 | "key": "address.zipcode", 359 | "data_type": "integer" 360 | } 361 | ] 362 | ``` 363 | 364 | Save the policy file, and pass the policy file in as command line argument via `--policy_file`. 365 | 366 | 367 | ## Query Examples 368 | 369 | To query for charge_id > 1237489: 370 | ``` 371 | --query '{"charge_id":{"$gt":1237489}}' 372 | ``` 373 | 374 | To query for _id > 55401a60151a4b1a4f000001: 375 | ``` 376 | --query '{"_id": {"$gt":ObjectId("55401a60151a4b1a4f000001")}}' 377 | ``` 378 | 379 | ## Known Issues 380 | 381 | * There is no easy way to capture records that were updated in MongoDB. We are working on capturing oplog and replay inserts and updates. 382 | * The ways in which the data type of a given changes over time is huge. A field can change from an int, to a string, to an array of string, to an array of mix types, to an array of complex objects over time. We haven't tested all the different combinations, but very interested in support as many as we can. Let us know if you have found a case that we don't support well. 383 | * Currently since BigQuery doesn't support alter-table, we can only support `overwrite` mode. 384 | 385 | -------------------------------------------------------------------------------- /cs_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Author: Jorge Chang 5 | # 6 | # Cloud Storage utility - abstraction of all cloud storage related calls. 7 | # Implementation for Hadoop and Google Cloud Storage provided. Basic functionaliy 8 | # like mkdir, rmdir and copy_from_local. 9 | # 10 | 11 | from onefold_util import execute 12 | 13 | class CloudStorage: 14 | 15 | def rmdir(self, path): 16 | return 17 | 18 | def mkdir(self, path): 19 | return 20 | 21 | def copy_from_local(self, source_local_file_path, dest_path): 22 | return 23 | 24 | 25 | # HDFS implementation. 26 | class HDFSStorage(CloudStorage): 27 | 28 | def rmdir(self, path): 29 | execute("hadoop fs -rm -r -f %s" % path, ignore_error=True) 30 | 31 | def mkdir(self, path): 32 | execute("hadoop fs -mkdir -p %s" % path, ignore_error=True) 33 | 34 | def copy_from_local(self, source_local_file_path, dest_path): 35 | execute("hadoop fs -copyFromLocal %s %s/" % (source_local_file_path, dest_path)) 36 | 37 | 38 | # Google Cloud Storage implementation. 39 | class GCloudStorage(CloudStorage): 40 | 41 | project_id = None 42 | bucket_id = None 43 | 44 | def __init__(self, project_id, bucket_id): 45 | self.project_id = project_id 46 | self.bucket_id = bucket_id 47 | 48 | def rmdir(self, path): 49 | 50 | print 'rmdir: %s' % (path) 51 | 52 | if not path.endswith("/"): 53 | path = path + "/" 54 | 55 | command = "gsutil -m rm -rf gs://%s/%s" % (self.bucket_id, path) 56 | execute(command, ignore_error=True) 57 | 58 | def mkdir(self, path): 59 | # nothing to do. there are no folders in google cloud storage 60 | pass 61 | 62 | def copy_from_local(self, source_local_file_path, dest_path): 63 | 64 | print 'copy_from_local: %s %s' % (source_local_file_path, dest_path) 65 | 66 | if not dest_path.endswith("/"): 67 | dest_path = dest_path + "/" 68 | 69 | dest_path = dest_path + source_local_file_path.split("/")[-1] 70 | 71 | command = "gsutil -m cp %s gs://%s/%s" % (source_local_file_path, self.bucket_id, dest_path) 72 | execute(command, ignore_error=False, retry=True) 73 | -------------------------------------------------------------------------------- /dw_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Author: Jorge Chang 5 | # 6 | # See license in LICENSE file. 7 | # 8 | # Data warehouse utility - interface to DataWarehouse + implementation for Hive. 9 | # basic functionaliy like create table, update table, list tables, execute queries / DMLs, etc. 10 | # 11 | 12 | import abc 13 | import json 14 | import pprint 15 | import re 16 | 17 | from onefold_util import execute, execute_and_read 18 | 19 | 20 | class DataWarehouse: 21 | __metaclass__ = abc.ABCMeta 22 | 23 | @abc.abstractmethod 24 | def create_dataset(self, database_name): 25 | return 26 | 27 | @abc.abstractmethod 28 | def delete_dataset(self, database_name): 29 | return 30 | 31 | @abc.abstractmethod 32 | def create_table(self, database_name, table_name, schema_fields, process_array): 33 | return 34 | 35 | @abc.abstractmethod 36 | def update_table(self, database_name, table_name, schema_fields): 37 | return 38 | 39 | @abc.abstractmethod 40 | def delete_table(self, database_name, table_name): 41 | return 42 | 43 | @abc.abstractmethod 44 | def get_num_rows(self, database_name, table_name): 45 | return 46 | 47 | @abc.abstractmethod 48 | def table_exists(self, database_name, table_name): 49 | return 50 | 51 | @abc.abstractmethod 52 | def get_table_schema(self, database_name, table_name): 53 | return 54 | 55 | @abc.abstractmethod 56 | def get_job_state(self, job_id): 57 | return 58 | 59 | @abc.abstractmethod 60 | def list_tables(self, database_name, table_prefix): 61 | return 62 | 63 | @abc.abstractmethod 64 | def load_table(self, table_name, file_path): 65 | return 66 | 67 | @abc.abstractmethod 68 | def query(self, query): 69 | return 70 | 71 | 72 | class Hive(DataWarehouse): 73 | 74 | host = None 75 | port = None 76 | hive_serdes_path = None 77 | 78 | def __init__(self, host, port, hive_serdes_path): 79 | print '-- Initializing Hive Util --' 80 | self.host = host 81 | self.port = port 82 | self.hive_serdes_path = hive_serdes_path 83 | 84 | def execute_sql (self, database_name, sql, fetch_result = False): 85 | import pyhs2 86 | conn = pyhs2.connect(host=self.host, port=self.port, authMechanism="NOSASL", database='default') 87 | 88 | # turn on tez and add serde jar 89 | c = conn.cursor() 90 | c.execute("set hive.execution.engine=tez") 91 | c.execute("set hive.cache.expr.evaluation=false") 92 | c.execute("add jar %s" % self.hive_serdes_path) 93 | 94 | if database_name != None: 95 | c.execute("use %s" % database_name) 96 | 97 | # run actual command command 98 | print "Executing HiveQL: %s" % (sql) 99 | c.execute(sql) 100 | 101 | output = [] 102 | if fetch_result: 103 | rows = c.fetchall() 104 | for row in rows: 105 | output.append(row) 106 | 107 | c.close() 108 | conn.close() 109 | 110 | return output 111 | 112 | def create_dataset(self, database_name): 113 | pass 114 | 115 | def delete_dataset(self, database_name): 116 | pass 117 | 118 | def create_table(self, database_name, table_name, schema_fields, process_array = "child_table"): 119 | 120 | # used to keep track of table_name -> column list 121 | table_columns = {} 122 | 123 | for field in schema_fields: 124 | data_type = None 125 | 126 | if field['data_type'] == 'string': 127 | data_type = 'string' 128 | elif field['data_type'] in ('timestamp', 'boolean'): 129 | data_type = field['type'] 130 | elif field['data_type'] == 'float': 131 | data_type = 'double' 132 | elif field['data_type'] == 'integer': 133 | data_type = 'int' 134 | elif field['data_type'] in ('record'): 135 | # ignore record 136 | pass 137 | else: 138 | raise Exception("Unsupported data type %s for column %s" % (field['data_type'], field['key'])) 139 | 140 | if data_type is not None: 141 | if field['mode'] == 'repeated': 142 | if process_array == "child_table": 143 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower() 144 | column_name = "value" 145 | else: 146 | continue 147 | else: 148 | if "." in field['key']: 149 | if process_array == "child_table": 150 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower() 151 | column_name = field['key'].rsplit(".",1)[1] 152 | print " Child Table column:" + column_name 153 | else: 154 | child_table_name = table_name 155 | column_name = field['key'].split(".",1)[0] 156 | data_type = "string" 157 | print " Inline column:" + column_name 158 | else: 159 | child_table_name = table_name 160 | column_name = field['key'] 161 | 162 | if child_table_name not in table_columns: 163 | table_columns[child_table_name] = set() 164 | if child_table_name != table_name: 165 | table_columns[child_table_name].add("%s %s" % ("parent_hash_code", "string")) 166 | table_columns[child_table_name].add("%s %s" % ("hash_code", "string")) 167 | 168 | table_columns[child_table_name].add("`%s` %s" % (column_name, data_type)) 169 | 170 | for table_name, columns in table_columns.iteritems(): 171 | sql = "create table `%s` (%s) ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe' " % (table_name, ",".join(columns)) 172 | self.execute_sql(database_name, sql) 173 | 174 | return table_columns.keys() 175 | 176 | def update_table(self, database_name, table_name, schema_fields): 177 | 178 | # current columns 179 | table_names = self.list_tables(database_name, table_name) 180 | current_table_columns = {} 181 | for table_name in table_names: 182 | current_columns = {} 183 | current_schema = self.get_table_schema(database_name, table_name) 184 | for field in current_schema: 185 | current_columns[field['key']] = field['data_type'] 186 | current_table_columns[table_name] = current_columns 187 | 188 | # used to keep track of table_name -> column list 189 | new_table_columns = {} 190 | 191 | alter_sqls = [] 192 | modify_instructions = {} 193 | 194 | for field in schema_fields: 195 | 196 | # print "processing field %s" % str(field) 197 | sql_data_type = None 198 | 199 | if field['data_type'] == 'string': 200 | sql_data_type = 'string' 201 | elif field['data_type'] in ('timestamp', 'boolean'): 202 | sql_data_type = field['type'] 203 | elif field['data_type'] == 'float': 204 | sql_data_type = 'double' 205 | elif field['data_type'] == 'integer': 206 | sql_data_type = 'int' 207 | elif field['data_type'] in ('record'): 208 | # ignore record 209 | pass 210 | else: 211 | raise Exception("Unsupported data type %s for column %s" % (field['data_type'], field['key'])) 212 | 213 | if sql_data_type is not None: 214 | 215 | if field['mode'] == 'repeated': 216 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower() 217 | column_name = "value" 218 | else: 219 | if "." in field['key']: 220 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower() 221 | column_name = field['key'].rsplit(".",1)[1] 222 | else: 223 | child_table_name = table_name 224 | column_name = field['key'] 225 | 226 | # print "column name %s" % column_name 227 | if child_table_name in current_table_columns: 228 | current_columns = current_table_columns[child_table_name] 229 | if column_name in current_columns: 230 | print " column %s found in current table schema." % column_name 231 | if field['data_type'].lower() != current_columns[column_name].lower(): 232 | print " but data type is different. new: %s old: %s" % (field['data_type'], current_columns[column_name]) 233 | if child_table_name not in modify_instructions: 234 | modify_instructions[child_table_name] = {} 235 | modify_instructions[child_table_name][column_name] = sql_data_type 236 | else: 237 | print " data type is same.. no-op." 238 | pass 239 | else: 240 | print " column %s not found in current table schema." % column_name 241 | alter_sqls.append ("alter table `%s` add columns (`%s` %s)" % (child_table_name, column_name, sql_data_type)) 242 | 243 | else: 244 | # new table needed 245 | if child_table_name not in new_table_columns: 246 | new_table_columns[child_table_name] = [] 247 | new_table_columns[child_table_name].append("%s %s" % ("parent_hash_code", "string")) 248 | new_table_columns[child_table_name].append("%s %s" % ("hash_code", "string")) 249 | new_table_columns[child_table_name].append("`%s` %s" % (column_name, sql_data_type)) 250 | 251 | # generate sqls to modify column data type 252 | modify_sqls = [] 253 | for child_table_name, modify_columns in modify_instructions.iteritems(): 254 | for modify_column_name, data_type in modify_columns.iteritems(): 255 | modify_sqls.append("alter table `%s` change `%s` `%s` %s" % (child_table_name, modify_column_name, modify_column_name, data_type)) 256 | 257 | # execute alter table to change data type 258 | for sql in modify_sqls: 259 | self.execute_sql(database_name, sql) 260 | 261 | # execute alter table to add columns 262 | for sql in alter_sqls: 263 | self.execute_sql(database_name, sql) 264 | 265 | # create new tables 266 | for child_table_name, columns in new_table_columns.iteritems(): 267 | sql = "create table `%s` (%s) ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe' " % (child_table_name, ",".join(columns)) 268 | self.execute_sql(database_name, sql) 269 | 270 | return table_names + new_table_columns.keys() 271 | 272 | def delete_table(self, database_name, table_name): 273 | sql = "drop table if exists `%s`" % (table_name) 274 | self.execute_sql(database_name, sql, False) 275 | 276 | child_table_names = self.list_tables(database_name, table_name) 277 | for child_table_name in child_table_names: 278 | sql = "drop table if exists `%s`" % (child_table_name) 279 | self.execute_sql(database_name, sql, False) 280 | 281 | def get_num_rows(self, database_name, table_name): 282 | sql = "select count(*) from `%s`" % (table_name) 283 | r = self.execute_sql(database_name, sql, True) 284 | return r[0][0] 285 | 286 | def table_exists(self, database_name, table_name): 287 | r = self.execute_sql(database_name, "show tables", True) 288 | for row in r: 289 | if row[0] == table_name: 290 | return True 291 | 292 | return False 293 | 294 | def get_table_schema(self, database_name, table_name): 295 | 296 | sql = "desc %s" % (table_name) 297 | r = self.execute_sql(database_name, sql, True) 298 | 299 | fields = [] 300 | for row in r: 301 | d = {} 302 | if 'string' in row[1]: 303 | d['data_type'] = 'string' 304 | elif 'float' in row[1] or 'double' in row[1]: 305 | d['data_type'] = 'float' 306 | elif 'int' in row[1] or 'bigint' in row[1]: 307 | d['data_type'] = 'integer' 308 | elif 'timestamp' in row[1]: 309 | d['data_type'] = 'timestamp' 310 | elif 'boolean' in row[1]: 311 | d['data_type'] = 'boolean' 312 | 313 | d['key'] = row[0] 314 | d['mode'] = 'nullable' 315 | fields.append(d) 316 | 317 | return fields 318 | 319 | def get_job_state(self, job_id): 320 | 321 | job_state = None 322 | job_result = None 323 | job_error_message = None 324 | job_error_reason = None 325 | job_output_rows = 0 326 | 327 | return (job_state, job_result, job_error_message, job_error_reason, job_output_rows) 328 | 329 | 330 | def list_tables(self, database_name, table_prefix): 331 | sql = "show tables" 332 | r = self.execute_sql(database_name, sql, True) 333 | output = [] 334 | for row in r: 335 | if row[0].startswith(table_prefix): 336 | output.append(row[0]) 337 | return output 338 | 339 | def load_table(self, database_name, table_name, file_path): 340 | sql = "load data inpath '%s*' into table `%s`" % (file_path, table_name) 341 | self.execute_sql(database_name, sql, fetch_result = False) 342 | 343 | def query(self, database_name, query): 344 | result = self.execute_sql(database_name, query, True) 345 | output = {} 346 | output['rows'] = [] 347 | for r in result: 348 | f = [] 349 | for i in r: 350 | f.append({"v": i}) 351 | output['rows'].append({"f": f}) 352 | 353 | return output 354 | 355 | 356 | # Implementation for Google BigQuery 357 | class GBigQuery(DataWarehouse): 358 | 359 | project_id = None 360 | bucket_id = None 361 | 362 | def __init__(self, project_id, bucket_id): 363 | print '-- Initializing Google BigQuery module --' 364 | self.project_id = project_id 365 | self.bucket_id = bucket_id 366 | 367 | def create_dataset(self, database_name): 368 | command = "bq --project_id %s mk %s" % (self.project_id, database_name) 369 | execute(command, ignore_error=True) 370 | 371 | def delete_dataset(self, database_name): 372 | pass 373 | 374 | def create_table(self, database_name, table_name, schema_fields, process_array = "child_table"): 375 | 376 | table_columns = {} 377 | 378 | for field in schema_fields: 379 | data_type = field['data_type'] 380 | 381 | # ignore record 382 | if field['data_type'] in ('record'): 383 | continue 384 | 385 | if data_type is not None: 386 | if field['mode'] == 'repeated': 387 | if process_array == "child_table": 388 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower() 389 | column_name = "value" 390 | else: 391 | continue 392 | else: 393 | if "." in field['key']: 394 | if process_array == "child_table": 395 | child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower() 396 | column_name = field['key'].rsplit(".",1)[1] 397 | print " Child Table column:" + column_name 398 | else: 399 | child_table_name = table_name 400 | column_name = field['key'].split(".",1)[0] 401 | data_type = "string" 402 | print " Inline column:" + column_name 403 | else: 404 | child_table_name = table_name 405 | column_name = field['key'] 406 | 407 | if child_table_name not in table_columns: 408 | table_columns[child_table_name] = [] 409 | if child_table_name != table_name: 410 | table_columns[child_table_name].append({"name": "parent_hash_code", "type": "string", "mode": "nullable"}) 411 | table_columns[child_table_name].append({"name": "hash_code", "type": "string", "mode": "nullable"}) 412 | 413 | table_columns[child_table_name].append({"name": column_name, "type": data_type, "mode": "nullable"}) 414 | 415 | for table_name, columns in table_columns.iteritems(): 416 | 417 | # create schema file 418 | schema_file_name = table_name + "_schema.json" 419 | schema_json = json.dumps(columns) 420 | schema_file = open(schema_file_name, "w") 421 | schema_file.write(schema_json) 422 | schema_file.close() 423 | 424 | # execute create-table command 425 | command = "bq --project_id %s mk --schema %s %s.%s" % (self.project_id, schema_file_name, 426 | database_name, table_name) 427 | execute(command) 428 | 429 | return table_columns.keys() 430 | 431 | def update_table(self, database_name, table_name, schema_fields): 432 | # Currently BigQuery doesn't support update table 433 | raise Exception("BigQuery doesn't support table update.") 434 | 435 | def delete_table(self, database_name, table_name): 436 | command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, table_name) 437 | execute(command, ignore_error=True) 438 | 439 | child_table_names = self.list_tables(database_name, table_name) 440 | for child_table_name in child_table_names: 441 | command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, child_table_name) 442 | execute(command, ignore_error=True) 443 | # sql = "drop table if exists `%s`" % (child_table_name) 444 | # self.execute_sql(database_name, sql, False) 445 | 446 | def get_num_rows(self, database_name, table_name): 447 | sql = "select count(*) from `%s`" % (table_name) 448 | r = self.execute_sql(database_name, sql, True) 449 | return r[0][0] 450 | 451 | def table_exists(self, database_name, table_name): 452 | 453 | all_tables = self.list_tables(database_name, table_name) 454 | for table in all_tables: 455 | if table == table_name: 456 | return True 457 | 458 | return False 459 | 460 | def get_table_schema(self, database_name, table_name): 461 | pass 462 | 463 | def get_job_state(self, job_id): 464 | job_state = None 465 | job_result = None 466 | job_error_message = None 467 | job_error_reason = None 468 | job_output_rows = 0 469 | 470 | return (job_state, job_result, job_error_message, job_error_reason, job_output_rows) 471 | 472 | def list_tables(self, database_name, table_prefix): 473 | output = [] 474 | (rc, stdout_lines, stderr_lines) = execute_and_read("bq --project_id %s --format csv ls %s" % (self.project_id, database_name)) 475 | for line in stdout_lines: 476 | table_name = line.split(",")[0] 477 | if table_name.startswith(table_prefix): 478 | output.append(table_name) 479 | return output 480 | 481 | def load_table(self, database_name, table_name, file_path): 482 | command = "bq --project_id %s --nosync load --source_format NEWLINE_DELIMITED_JSON %s.%s gs://%s/%s*" % \ 483 | (self.project_id, database_name, table_name, self.bucket_id, file_path) 484 | execute(command) 485 | 486 | def query(self, database_name, query): 487 | pass 488 | -------------------------------------------------------------------------------- /java/HiveSerdes/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cloudera.serde 5 | hive-serdes 6 | hive-serdes 7 | 1.0-SNAPSHOT 8 | http://www.cloudera.com 9 | 10 | 11 | 12 | 13 | maven-compiler-plugin 14 | 2.3.2 15 | 16 | 1.6 17 | 1.6 18 | 19 | 20 | 21 | 22 | 23 | 24 | maven-eclipse-plugin 25 | 2.9 26 | 27 | eclipse-classes 28 | true 29 | false 30 | 31 | 32 | 33 | maven-shade-plugin 34 | 1.7.1 35 | 36 | 37 | package 38 | 39 | shade 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | false 51 | 52 | cloudera 53 | https://repository.cloudera.com/artifactory/cloudera-repos 54 | 55 | 56 | 57 | 58 | junit 59 | junit 60 | 4.8.2 61 | test 62 | 63 | 64 | org.apache.hive 65 | hive-serde 66 | 0.10.0-cdh4.2.0 67 | provided 68 | 69 | 70 | hive-common 71 | org.apache.hive 72 | 73 | 74 | hive-shims 75 | org.apache.hive 76 | 77 | 78 | slf4j-api 79 | org.slf4j 80 | 81 | 82 | slf4j-log4j12 83 | org.slf4j 84 | 85 | 86 | mockito-all 87 | org.mockito 88 | 89 | 90 | libthrift 91 | org.apache.thrift 92 | 93 | 94 | libfb303 95 | org.apache.thrift 96 | 97 | 98 | commons-lang 99 | commons-lang 100 | 101 | 102 | commons-logging 103 | commons-logging 104 | 105 | 106 | commons-logging-api 107 | commons-logging 108 | 109 | 110 | commons-codec 111 | commons-codec 112 | 113 | 114 | avro 115 | org.apache.avro 116 | 117 | 118 | avro-mapred 119 | org.apache.avro 120 | 121 | 122 | 123 | 124 | org.apache.hadoop 125 | hadoop-common 126 | 2.0.0-cdh4.2.0 127 | provided 128 | 129 | 130 | hadoop-annotations 131 | org.apache.hadoop 132 | 133 | 134 | guava 135 | com.google.guava 136 | 137 | 138 | commons-cli 139 | commons-cli 140 | 141 | 142 | commons-math 143 | org.apache.commons 144 | 145 | 146 | xmlenc 147 | xmlenc 148 | 149 | 150 | commons-httpclient 151 | commons-httpclient 152 | 153 | 154 | commons-io 155 | commons-io 156 | 157 | 158 | commons-net 159 | commons-net 160 | 161 | 162 | servlet-api 163 | javax.servlet 164 | 165 | 166 | jetty 167 | org.mortbay.jetty 168 | 169 | 170 | jetty-util 171 | org.mortbay.jetty 172 | 173 | 174 | jersey-core 175 | com.sun.jersey 176 | 177 | 178 | jersey-json 179 | com.sun.jersey 180 | 181 | 182 | jersey-server 183 | com.sun.jersey 184 | 185 | 186 | jasper-compiler 187 | tomcat 188 | 189 | 190 | jasper-runtime 191 | tomcat 192 | 193 | 194 | jsp-api 195 | javax.servlet.jsp 196 | 197 | 198 | commons-el 199 | commons-el 200 | 201 | 202 | log4j 203 | log4j 204 | 205 | 206 | jets3t 207 | net.java.dev.jets3t 208 | 209 | 210 | commons-configuration 211 | commons-configuration 212 | 213 | 214 | jackson-mapper-asl 215 | org.codehaus.jackson 216 | 217 | 218 | kfs 219 | net.sf.kosmosfs 220 | 221 | 222 | protobuf-java 223 | com.google.protobuf 224 | 225 | 226 | hadoop-auth 227 | org.apache.hadoop 228 | 229 | 230 | jsch 231 | com.jcraft 232 | 233 | 234 | zookeeper 235 | org.apache.zookeeper 236 | 237 | 238 | commons-codec 239 | commons-codec 240 | 241 | 242 | commons-logging 243 | commons-logging 244 | 245 | 246 | commons-lang 247 | commons-lang 248 | 249 | 250 | slf4j-api 251 | org.slf4j 252 | 253 | 254 | slf4j-log4j12 255 | org.slf4j 256 | 257 | 258 | mockito-all 259 | org.mockito 260 | 261 | 262 | avro 263 | org.apache.avro 264 | 265 | 266 | 267 | 268 | 269 | eclipse-classes 270 | 2.0.0-cdh4.2.0 271 | 0.10.0-cdh4.2.0 272 | UTF-8 273 | 274 | 275 | 276 | -------------------------------------------------------------------------------- /java/HiveSerdes/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 20 | 4.0.0 21 | 22 | com.cloudera.serde 23 | hive-serdes 24 | 1.0-SNAPSHOT 25 | jar 26 | 27 | hive-serdes 28 | http://www.cloudera.com 29 | 30 | 31 | UTF-8 32 | eclipse-classes 33 | 0.10.0-cdh4.2.0 34 | 2.0.0-cdh4.2.0 35 | 36 | 37 | 38 | 39 | 40 | org.apache.maven.plugins 41 | maven-eclipse-plugin 42 | 2.9 43 | 44 | eclipse-classes 45 | true 46 | false 47 | 48 | 49 | 50 | 51 | org.apache.maven.plugins 52 | maven-shade-plugin 53 | 1.7.1 54 | 55 | 56 | package 57 | 58 | shade 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-compiler-plugin 70 | 2.3.2 71 | 72 | 1.6 73 | 1.6 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | junit 83 | junit 84 | 4.8.2 85 | test 86 | 87 | 88 | 89 | 90 | org.codehaus.jackson 91 | jackson-core-asl 92 | 1.9.8 93 | 94 | 95 | 96 | 97 | org.apache.hive 98 | hive-serde 99 | ${hive.version} 100 | provided 101 | 102 | 103 | org.apache.hadoop 104 | hadoop-common 105 | ${hadoop.version} 106 | provided 107 | 108 | 109 | 110 | 111 | 112 | cloudera 113 | https://repository.cloudera.com/artifactory/cloudera-repos 114 | 115 | true 116 | 117 | 118 | false 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /java/HiveSerdes/src/main/java/com/cloudera/hive/serde/JSONSerDe.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.cloudera.hive.serde; 19 | 20 | import java.util.ArrayList; 21 | import java.util.Arrays; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | import java.util.Properties; 26 | 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.hive.serde.serdeConstants; 29 | import org.apache.hadoop.hive.serde2.SerDe; 30 | import org.apache.hadoop.hive.serde2.SerDeException; 31 | import org.apache.hadoop.hive.serde2.SerDeStats; 32 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 33 | import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; 34 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 35 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 36 | import org.apache.hadoop.hive.serde2.objectinspector.StructField; 37 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 38 | import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; 39 | import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; 40 | import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; 41 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; 42 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; 43 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; 44 | import org.apache.hadoop.io.Text; 45 | import org.apache.hadoop.io.Writable; 46 | import org.codehaus.jackson.map.ObjectMapper; 47 | 48 | /** 49 | * This SerDe can be used for processing JSON data in Hive. It supports 50 | * arbitrary JSON data, and can handle all Hive types except for UNION. 51 | * However, the JSON data is expected to be a series of discrete records, 52 | * rather than a JSON array of objects. 53 | * 54 | * The Hive table is expected to contain columns with names corresponding to 55 | * fields in the JSON data, but it is not necessary for every JSON field to 56 | * have a corresponding Hive column. Those JSON fields will be ignored during 57 | * queries. 58 | * 59 | * Example: 60 | * 61 | * { "a": 1, "b": [ "str1", "str2" ], "c": { "field1": "val1" } } 62 | * 63 | * Could correspond to a table: 64 | * 65 | * CREATE TABLE foo (a INT, b ARRAY, c STRUCT); 66 | * 67 | * JSON objects can also interpreted as a Hive MAP type, so long as the keys 68 | * and values in the JSON object are all of the appropriate types. For example, 69 | * in the JSON above, another valid table declaraction would be: 70 | * 71 | * CREATE TABLE foo (a INT, b ARRAY, c MAP); 72 | * 73 | * Only STRING keys are supported for Hive MAPs. 74 | */ 75 | public class JSONSerDe implements SerDe { 76 | 77 | private StructTypeInfo rowTypeInfo; 78 | private ObjectInspector rowOI; 79 | private List colNames; 80 | private List row = new ArrayList(); 81 | 82 | /** 83 | * An initialization function used to gather information about the table. 84 | * Typically, a SerDe implementation will be interested in the list of 85 | * column names and their types. That information will be used to help perform 86 | * actual serialization and deserialization of data. 87 | */ 88 | @Override 89 | public void initialize(Configuration conf, Properties tbl) 90 | throws SerDeException { 91 | // Get a list of the table's column names. 92 | String colNamesStr = tbl.getProperty(serdeConstants.LIST_COLUMNS); 93 | colNames = Arrays.asList(colNamesStr.split(",")); 94 | 95 | // Get a list of TypeInfos for the columns. This list lines up with 96 | // the list of column names. 97 | String colTypesStr = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); 98 | List colTypes = 99 | TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr); 100 | 101 | rowTypeInfo = 102 | (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(colNames, colTypes); 103 | rowOI = 104 | TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo); 105 | } 106 | 107 | /** 108 | * This method does the work of deserializing a record into Java objects that 109 | * Hive can work with via the ObjectInspector interface. For this SerDe, the 110 | * blob that is passed in is a JSON string, and the Jackson JSON parser is 111 | * being used to translate the string into Java objects. 112 | * 113 | * The JSON deserialization works by taking the column names in the Hive 114 | * table, and looking up those fields in the parsed JSON object. If the value 115 | * of the field is not a primitive, the object is parsed further. 116 | */ 117 | @Override 118 | public Object deserialize(Writable blob) throws SerDeException { 119 | Map root = null; 120 | row.clear(); 121 | try { 122 | ObjectMapper mapper = new ObjectMapper(); 123 | // This is really a Map. For more information about how 124 | // Jackson parses JSON in this example, see 125 | // http://wiki.fasterxml.com/JacksonDataBinding 126 | root = mapper.readValue(blob.toString(), Map.class); 127 | } catch (Exception e) { 128 | throw new SerDeException(e); 129 | } 130 | 131 | // Lowercase the keys as expected by hive 132 | Map lowerRoot = new HashMap(); 133 | for(Map.Entry entry: root.entrySet()) { 134 | lowerRoot.put(((String)entry.getKey()).toLowerCase(), entry.getValue()); 135 | } 136 | root = lowerRoot; 137 | 138 | Object value= null; 139 | for (String fieldName : rowTypeInfo.getAllStructFieldNames()) { 140 | try { 141 | TypeInfo fieldTypeInfo = rowTypeInfo.getStructFieldTypeInfo(fieldName); 142 | value = parseField(root.get(fieldName), fieldTypeInfo); 143 | } catch (Exception e) { 144 | value = null; 145 | } 146 | row.add(value); 147 | } 148 | return row; 149 | } 150 | 151 | /** 152 | * Parses a JSON object according to the Hive column's type. 153 | * 154 | * @param field - The JSON object to parse 155 | * @param fieldTypeInfo - Metadata about the Hive column 156 | * @return - The parsed value of the field 157 | */ 158 | private Object parseField(Object field, TypeInfo fieldTypeInfo) { 159 | switch (fieldTypeInfo.getCategory()) { 160 | case PRIMITIVE: 161 | // Jackson will return the right thing in this case, so just return 162 | // the object 163 | if (field instanceof String) { 164 | field = field.toString().replaceAll("\n", "\\\\n"); 165 | } 166 | return field; 167 | case LIST: 168 | return parseList(field, (ListTypeInfo) fieldTypeInfo); 169 | case MAP: 170 | return parseMap(field, (MapTypeInfo) fieldTypeInfo); 171 | case STRUCT: 172 | return parseStruct(field, (StructTypeInfo) fieldTypeInfo); 173 | case UNION: 174 | // Unsupported by JSON 175 | default: 176 | return null; 177 | } 178 | } 179 | 180 | /** 181 | * Parses a JSON object and its fields. The Hive metadata is used to 182 | * determine how to parse the object fields. 183 | * 184 | * @param field - The JSON object to parse 185 | * @param fieldTypeInfo - Metadata about the Hive column 186 | * @return - A map representing the object and its fields 187 | */ 188 | private Object parseStruct(Object field, StructTypeInfo fieldTypeInfo) { 189 | Map map = (Map)field; 190 | ArrayList structTypes = fieldTypeInfo.getAllStructFieldTypeInfos(); 191 | ArrayList structNames = fieldTypeInfo.getAllStructFieldNames(); 192 | 193 | List structRow = new ArrayList(structTypes.size()); 194 | if (map != null) { 195 | for (int i = 0; i < structNames.size(); i++) { 196 | structRow.add(parseField(map.get(structNames.get(i)), structTypes.get(i))); 197 | } 198 | } 199 | return structRow; 200 | } 201 | 202 | /** 203 | * Parse a JSON list and its elements. This uses the Hive metadata for the 204 | * list elements to determine how to parse the elements. 205 | * 206 | * @param field - The JSON list to parse 207 | * @param fieldTypeInfo - Metadata about the Hive column 208 | * @return - A list of the parsed elements 209 | */ 210 | private Object parseList(Object field, ListTypeInfo fieldTypeInfo) { 211 | ArrayList list = (ArrayList) field; 212 | TypeInfo elemTypeInfo = fieldTypeInfo.getListElementTypeInfo(); 213 | if (list != null) { 214 | for (int i = 0; i < list.size(); i++) { 215 | list.set(i, parseField(list.get(i), elemTypeInfo)); 216 | } 217 | } 218 | return list.toArray(); 219 | } 220 | 221 | /** 222 | * Parse a JSON object as a map. This uses the Hive metadata for the map 223 | * values to determine how to parse the values. The map is assumed to have 224 | * a string for a key. 225 | * 226 | * @param field - The JSON list to parse 227 | * @param fieldTypeInfo - Metadata about the Hive column 228 | * @return 229 | */ 230 | private Object parseMap(Object field, MapTypeInfo fieldTypeInfo) { 231 | Map map = (Map) field; 232 | TypeInfo valueTypeInfo = fieldTypeInfo.getMapValueTypeInfo(); 233 | if (map != null) { 234 | for (Map.Entry entry : map.entrySet()) { 235 | map.put(entry.getKey(), parseField(entry.getValue(), valueTypeInfo)); 236 | } 237 | } 238 | return map; 239 | } 240 | 241 | /** 242 | * Return an ObjectInspector for the row of data 243 | */ 244 | @Override 245 | public ObjectInspector getObjectInspector() throws SerDeException { 246 | return rowOI; 247 | } 248 | 249 | /** 250 | * Unimplemented 251 | */ 252 | @Override 253 | public SerDeStats getSerDeStats() { 254 | return null; 255 | } 256 | 257 | /** 258 | * JSON is just a textual representation, so our serialized class 259 | * is just Text. 260 | */ 261 | @Override 262 | public Class getSerializedClass() { 263 | return Text.class; 264 | } 265 | 266 | /** 267 | * This method takes an object representing a row of data from Hive, and uses 268 | * the ObjectInspector to get the data for each column and serialize it. This 269 | * implementation deparses the row into an object that Jackson can easily 270 | * serialize into a JSON blob. 271 | */ 272 | @Override 273 | public Writable serialize(Object obj, ObjectInspector oi) 274 | throws SerDeException { 275 | Object deparsedObj = deparseRow(obj, oi); 276 | ObjectMapper mapper = new ObjectMapper(); 277 | try { 278 | // Let Jackson do the work of serializing the object 279 | return new Text(mapper.writeValueAsString(deparsedObj)); 280 | } catch (Exception e) { 281 | throw new SerDeException(e); 282 | } 283 | } 284 | 285 | /** 286 | * Deparse a Hive object into a Jackson-serializable object. This uses 287 | * the ObjectInspector to extract the column data. 288 | * 289 | * @param obj - Hive object to deparse 290 | * @param oi - ObjectInspector for the object 291 | * @return - A deparsed object 292 | */ 293 | private Object deparseObject(Object obj, ObjectInspector oi) { 294 | switch (oi.getCategory()) { 295 | case LIST: 296 | return deparseList(obj, (ListObjectInspector)oi); 297 | case MAP: 298 | return deparseMap(obj, (MapObjectInspector)oi); 299 | case PRIMITIVE: 300 | return deparsePrimitive(obj, (PrimitiveObjectInspector)oi); 301 | case STRUCT: 302 | return deparseStruct(obj, (StructObjectInspector)oi, false); 303 | case UNION: 304 | // Unsupported by JSON 305 | default: 306 | return null; 307 | } 308 | } 309 | 310 | /** 311 | * Deparses a row of data. We have to treat this one differently from 312 | * other structs, because the field names for the root object do not match 313 | * the column names for the Hive table. 314 | * 315 | * @param obj - Object representing the top-level row 316 | * @param structOI - ObjectInspector for the row 317 | * @return - A deparsed row of data 318 | */ 319 | private Object deparseRow(Object obj, ObjectInspector structOI) { 320 | return deparseStruct(obj, (StructObjectInspector)structOI, true); 321 | } 322 | 323 | /** 324 | * Deparses struct data into a serializable JSON object. 325 | * 326 | * @param obj - Hive struct data 327 | * @param structOI - ObjectInspector for the struct 328 | * @param isRow - Whether or not this struct represents a top-level row 329 | * @return - A deparsed struct 330 | */ 331 | private Object deparseStruct(Object obj, 332 | StructObjectInspector structOI, 333 | boolean isRow) { 334 | Map struct = new HashMap(); 335 | List fields = structOI.getAllStructFieldRefs(); 336 | for (int i = 0; i < fields.size(); i++) { 337 | StructField field = fields.get(i); 338 | // The top-level row object is treated slightly differently from other 339 | // structs, because the field names for the row do not correctly reflect 340 | // the Hive column names. For lower-level structs, we can get the field 341 | // name from the associated StructField object. 342 | String fieldName = isRow ? colNames.get(i) : field.getFieldName(); 343 | ObjectInspector fieldOI = field.getFieldObjectInspector(); 344 | Object fieldObj = structOI.getStructFieldData(obj, field); 345 | struct.put(fieldName, deparseObject(fieldObj, fieldOI)); 346 | } 347 | return struct; 348 | } 349 | 350 | /** 351 | * Deparses a primitive type. 352 | * 353 | * @param obj - Hive object to deparse 354 | * @param oi - ObjectInspector for the object 355 | * @return - A deparsed object 356 | */ 357 | private Object deparsePrimitive(Object obj, PrimitiveObjectInspector primOI) { 358 | return primOI.getPrimitiveJavaObject(obj); 359 | } 360 | 361 | private Object deparseMap(Object obj, MapObjectInspector mapOI) { 362 | Map map = new HashMap(); 363 | ObjectInspector mapValOI = mapOI.getMapValueObjectInspector(); 364 | Map fields = mapOI.getMap(obj); 365 | for (Map.Entry field : fields.entrySet()) { 366 | Object fieldName = field.getKey(); 367 | Object fieldObj = field.getValue(); 368 | map.put(fieldName, deparseObject(fieldObj, mapValOI)); 369 | } 370 | return map; 371 | } 372 | 373 | /** 374 | * Deparses a list and its elements. 375 | * 376 | * @param obj - Hive object to deparse 377 | * @param oi - ObjectInspector for the object 378 | * @return - A deparsed object 379 | */ 380 | private Object deparseList(Object obj, ListObjectInspector listOI) { 381 | List list = new ArrayList(); 382 | List field = listOI.getList(obj); 383 | ObjectInspector elemOI = listOI.getListElementObjectInspector(); 384 | for (Object elem : field) { 385 | list.add(deparseObject(elem, elemOI)); 386 | } 387 | return list; 388 | } 389 | } 390 | -------------------------------------------------------------------------------- /java/MapReduce/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.onefold.hadoop 5 | MapReduce 6 | MapReduce 7 | 0.0.1-SNAPSHOT 8 | http://www.onefold.io 9 | 10 | 11 | 12 | 13 | maven-compiler-plugin 14 | 2.3.2 15 | 16 | 1.6 17 | 1.6 18 | 19 | 20 | 21 | 22 | 23 | 24 | maven-eclipse-plugin 25 | 2.9 26 | 27 | eclipse-classes 28 | true 29 | false 30 | 31 | 32 | 33 | maven-shade-plugin 34 | 1.7.1 35 | 36 | 37 | package 38 | 39 | shade 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | always 50 | warn 51 | 52 | 53 | false 54 | never 55 | fail 56 | 57 | HDPReleases 58 | HDP Releases 59 | http://repo.hortonworks.com/content/repositories/releases/ 60 | 61 | 62 | 63 | 64 | org.mortbay.jetty 65 | jetty 66 | 6.1.26 67 | provided 68 | 69 | 70 | servlet-api 71 | org.mortbay.jetty 72 | 73 | 74 | 75 | 76 | org.mortbay.jetty 77 | jetty-util 78 | 6.1.26 79 | provided 80 | 81 | 82 | org.apache.hadoop 83 | hadoop-mapreduce-client-core 84 | 2.6.0.2.2.0.0-2041 85 | provided 86 | 87 | 88 | hadoop-yarn-common 89 | org.apache.hadoop 90 | 91 | 92 | protobuf-java 93 | com.google.protobuf 94 | 95 | 96 | avro 97 | org.apache.avro 98 | 99 | 100 | slf4j-api 101 | org.slf4j 102 | 103 | 104 | slf4j-log4j12 105 | org.slf4j 106 | 107 | 108 | hadoop-annotations 109 | org.apache.hadoop 110 | 111 | 112 | guice-servlet 113 | com.google.inject.extensions 114 | 115 | 116 | netty 117 | io.netty 118 | 119 | 120 | 121 | 122 | org.apache.hadoop 123 | hadoop-common 124 | 2.6.0.2.2.0.0-2041 125 | provided 126 | 127 | 128 | guava 129 | com.google.guava 130 | 131 | 132 | commons-cli 133 | commons-cli 134 | 135 | 136 | commons-math3 137 | org.apache.commons 138 | 139 | 140 | xmlenc 141 | xmlenc 142 | 143 | 144 | commons-httpclient 145 | commons-httpclient 146 | 147 | 148 | commons-codec 149 | commons-codec 150 | 151 | 152 | commons-io 153 | commons-io 154 | 155 | 156 | commons-net 157 | commons-net 158 | 159 | 160 | commons-collections 161 | commons-collections 162 | 163 | 164 | servlet-api 165 | javax.servlet 166 | 167 | 168 | jersey-core 169 | com.sun.jersey 170 | 171 | 172 | jersey-json 173 | com.sun.jersey 174 | 175 | 176 | jersey-server 177 | com.sun.jersey 178 | 179 | 180 | jasper-compiler 181 | tomcat 182 | 183 | 184 | jasper-runtime 185 | tomcat 186 | 187 | 188 | jsp-api 189 | javax.servlet.jsp 190 | 191 | 192 | commons-el 193 | commons-el 194 | 195 | 196 | commons-logging 197 | commons-logging 198 | 199 | 200 | log4j 201 | log4j 202 | 203 | 204 | jets3t 205 | net.java.dev.jets3t 206 | 207 | 208 | microsoft-windowsazure-storage-sdk 209 | com.microsoft.windowsazure.storage 210 | 211 | 212 | commons-lang 213 | commons-lang 214 | 215 | 216 | commons-configuration 217 | commons-configuration 218 | 219 | 220 | jackson-mapper-asl 221 | org.codehaus.jackson 222 | 223 | 224 | gson 225 | com.google.code.gson 226 | 227 | 228 | hadoop-auth 229 | org.apache.hadoop 230 | 231 | 232 | jsch 233 | com.jcraft 234 | 235 | 236 | curator-client 237 | org.apache.curator 238 | 239 | 240 | curator-recipes 241 | org.apache.curator 242 | 243 | 244 | jsr305 245 | com.google.code.findbugs 246 | 247 | 248 | htrace-core 249 | org.htrace 250 | 251 | 252 | zookeeper 253 | org.apache.zookeeper 254 | 255 | 256 | commons-compress 257 | org.apache.commons 258 | 259 | 260 | hadoop-annotations 261 | org.apache.hadoop 262 | 263 | 264 | slf4j-api 265 | org.slf4j 266 | 267 | 268 | slf4j-log4j12 269 | org.slf4j 270 | 271 | 272 | avro 273 | org.apache.avro 274 | 275 | 276 | protobuf-java 277 | com.google.protobuf 278 | 279 | 280 | 281 | 282 | org.apache.hive 283 | hive-serde 284 | 0.14.0.2.2.5.1-3 285 | provided 286 | 287 | 288 | hive-common 289 | org.apache.hive 290 | 291 | 292 | hive-shims 293 | org.apache.hive 294 | 295 | 296 | libthrift 297 | org.apache.thrift 298 | 299 | 300 | opencsv 301 | net.sf.opencsv 302 | 303 | 304 | commons-codec 305 | commons-codec 306 | 307 | 308 | commons-lang 309 | commons-lang 310 | 311 | 312 | commons-logging 313 | commons-logging 314 | 315 | 316 | avro 317 | org.apache.avro 318 | 319 | 320 | slf4j-api 321 | org.slf4j 322 | 323 | 324 | slf4j-log4j12 325 | org.slf4j 326 | 327 | 328 | 329 | 330 | org.apache.hive 331 | hive-exec 332 | 0.14.0.2.2.5.1-3 333 | provided 334 | 335 | 336 | hive-ant 337 | org.apache.hive 338 | 339 | 340 | hive-metastore 341 | org.apache.hive 342 | 343 | 344 | antlr-runtime 345 | org.antlr 346 | 347 | 348 | ST4 349 | org.antlr 350 | 351 | 352 | ant 353 | org.apache.ant 354 | 355 | 356 | libfb303 357 | org.apache.thrift 358 | 359 | 360 | groovy-all 361 | org.codehaus.groovy 362 | 363 | 364 | datanucleus-core 365 | org.datanucleus 366 | 367 | 368 | calcite-core 369 | org.apache.calcite 370 | 371 | 372 | calcite-avatica 373 | org.apache.calcite 374 | 375 | 376 | stax-api 377 | stax 378 | 379 | 380 | jline 381 | jline 382 | 383 | 384 | jansi 385 | org.fusesource.jansi 386 | 387 | 388 | hive-shims 389 | org.apache.hive 390 | 391 | 392 | commons-httpclient 393 | commons-httpclient 394 | 395 | 396 | commons-io 397 | commons-io 398 | 399 | 400 | log4j 401 | log4j 402 | 403 | 404 | commons-compress 405 | org.apache.commons 406 | 407 | 408 | zookeeper 409 | org.apache.zookeeper 410 | 411 | 412 | commons-codec 413 | commons-codec 414 | 415 | 416 | commons-logging 417 | commons-logging 418 | 419 | 420 | slf4j-api 421 | org.slf4j 422 | 423 | 424 | slf4j-log4j12 425 | org.slf4j 426 | 427 | 428 | 429 | 430 | 431 | UTF-8 432 | 433 | 434 | 435 | -------------------------------------------------------------------------------- /java/MapReduce/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.onefold.hadoop 5 | MapReduce 6 | 0.0.1-SNAPSHOT 7 | jar 8 | MapReduce 9 | http://maven.apache.org 10 | 11 | UTF-8 12 | 13 | 14 | 15 | 16 | org.mortbay.jetty 17 | jetty 18 | 6.1.26 19 | 20 | 21 | org.mortbay.jetty 22 | jetty-util 23 | 6.1.26 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-mapreduce-client-core 28 | 2.6.0.2.2.0.0-2041 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-common 33 | 2.6.0.2.2.0.0-2041 34 | 35 | 36 | 37 | 38 | 39 | 40 | true 41 | always 42 | warn 43 | 44 | 45 | false 46 | never 47 | fail 48 | 49 | HDPReleases 50 | HDP Releases 51 | http://repo.hortonworks.com/content/repositories/releases/ 52 | default 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /java/MapReduce/src/main/java/com/onefold/hadoop/MapReduce/TransformDataMultiOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.onefold.hadoop.MapReduce; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 6 | 7 | /* 8 | * Copyright 2015, OneFold 9 | * All rights reserved. 10 | * http://www.onefold.io 11 | * 12 | * Author: Jorge Chang 13 | * 14 | * See license in LICENSE file. 15 | * 16 | * Used by transform-data-mapper to write each fragment to its own MapReduce output Folder 17 | * 18 | */ 19 | public class TransformDataMultiOutputFormat extends MultipleTextOutputFormat { 20 | @Override 21 | protected String generateFileNameForKeyValue(Text key, Text value, String leaf) { 22 | return new Path(key.toString(), leaf).toString(); 23 | } 24 | 25 | @Override 26 | protected Text generateActualKey(Text key, Text value) { 27 | return null; 28 | } 29 | } -------------------------------------------------------------------------------- /json/generate-schema-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2015, OneFold 5 | # All rights reserved. 6 | # http://www.onefold.io 7 | # 8 | # Author: Jorge Chang 9 | # 10 | # See license in LICENSE file. 11 | # 12 | # Generate Schema Mapper - takes data from stdin, performs deep inspection and emits 13 | # field-name -> data-type tuples. 14 | # 15 | 16 | import re 17 | import sys 18 | import json 19 | import codecs 20 | 21 | # create utf reader and writer for stdin and stdout 22 | output_stream = codecs.getwriter("utf-8")(sys.stdout) 23 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore") 24 | error_stream = codecs.getwriter("utf-8")(sys.stderr) 25 | 26 | def is_integer(value): 27 | try: 28 | a = int(str(value)) 29 | if a > sys.maxint or a < -sys.maxint - 1: 30 | return False 31 | return True 32 | except: 33 | return False 34 | 35 | def is_float(value): 36 | try: 37 | float(str(value)) 38 | return True 39 | except: 40 | return False 41 | 42 | def process_line(line, line_num, parent=None, seperator="_"): 43 | 44 | # parse the line 45 | try: 46 | data = json.loads(line, encoding='utf-8') 47 | except ValueError: 48 | print >> error_stream, "Line %i: JSON Parse Error. Data: %s" % (line_num, line) 49 | return 50 | 51 | if data: 52 | 53 | for key, value in data.iteritems(): 54 | 55 | k = re.sub("[^0-9a-zA-Z_]", '_', key).lower() 56 | 57 | # BigQuery disallows field to start with non alpha 58 | if ord(k[0]) >= 48 and ord(k[0]) <= 59: 59 | k = "_f" + k 60 | 61 | # Hive disallows field to start with "_" 62 | if k[0] == '_': 63 | k = k.lstrip("_") 64 | 65 | if parent == None: 66 | full_key = k 67 | else: 68 | full_key = parent + seperator + k 69 | 70 | if value is None: 71 | # if data is Null, PASS. 72 | pass 73 | 74 | elif isinstance(value, dict): 75 | 76 | if len(value) > 0: 77 | print >> output_stream, "%s\t%s" % (full_key, "record-nullable") 78 | process_line(json.dumps(value, ensure_ascii=False), line_num, full_key) 79 | else: 80 | print >> error_stream, "Key %s has value of type dict %s which is empty. Ignoring." % (full_key, value) 81 | 82 | elif isinstance(value, list): 83 | 84 | if len(value) > 0: 85 | 86 | for list_value in value: 87 | if isinstance(list_value, dict): 88 | print >> output_stream, "%s\t%s" % (full_key, "record-repeated") 89 | process_line(json.dumps(list_value, ensure_ascii=False), line_num, full_key, ".") 90 | elif isinstance(list_value, bool): 91 | print >> output_stream, "%s\t%s" % (full_key, "boolean-repeated") 92 | elif isinstance(list_value, int): 93 | print >> output_stream, "%s\t%s" % (full_key, "integer-repeated") 94 | elif isinstance(list_value, float): 95 | print >> output_stream, "%s\t%s" % (full_key, "float-repeated") 96 | else: 97 | print >> output_stream, "%s\t%s" % (full_key, "string-repeated") 98 | 99 | else: 100 | print >> error_stream, "Key %s has value of type list %s which is empty. Ignoring." % (full_key, value) 101 | 102 | else: 103 | 104 | if isinstance(value, bool): 105 | print >> output_stream, "%s\t%s" % (full_key, "boolean-nullable") 106 | elif isinstance(value, int): 107 | print >> output_stream, "%s\t%s" % (full_key, "integer-nullable") 108 | elif isinstance(value, float): 109 | print >> output_stream, "%s\t%s" % (full_key, "float-nullable") 110 | else: 111 | print >> output_stream, "%s\t%s" % (full_key, "string-nullable") 112 | 113 | 114 | def main(): 115 | 116 | line_num = 1 117 | for line in input_stream: 118 | try: 119 | process_line(line, line_num, None) 120 | line_num += 1 121 | except Exception: 122 | print >> error_stream, "Line %i: Error. Data: %s" % (line_num, line) 123 | 124 | if __name__ == "__main__": 125 | main() -------------------------------------------------------------------------------- /json/generate-schema-reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2015, OneFold 5 | # All rights reserved. 6 | # http://www.onefold.io 7 | # 8 | # Author: Jorge Chang 9 | # 10 | # See license in LICENSE file. 11 | # 12 | # Generate Schema Reducer - reduces multiple / conflicting data type of a particular 13 | # field into the most general one, e.g. 14 | # input: "zip_code" => (int, string) 15 | # output: "zip_code" => (string) because string > int. 16 | # 17 | 18 | import sys 19 | import codecs 20 | from pymongo import MongoClient 21 | 22 | # create utf reader and writer for stdin and stdout 23 | output_stream = codecs.getwriter("utf-8")(sys.stdout) 24 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore") 25 | error_stream = codecs.getwriter("utf-8")(sys.stderr) 26 | 27 | mongo_schema_collection = None 28 | 29 | 30 | def parse_datatype_mode (datatype_mode): 31 | a = datatype_mode.split("-") 32 | if len(a) >= 2: 33 | return (a[0], a[1]) 34 | else: 35 | raise ValueError('Invalid datatype / mode tuple %s' % datatype_mode) 36 | 37 | 38 | def process_new_field(key, datatype_mode): 39 | 40 | if key is not None and datatype_mode is not None: 41 | # check if key is already in mongodb 42 | orig_field_record = mongo_schema_collection.find_one({"key": key, "type": "field"}) 43 | 44 | # compare orig data type and save schema to mongodb 45 | if orig_field_record is not None: 46 | orig_datatype_mode = orig_field_record['data_type'] + "-" + orig_field_record['mode'] 47 | 48 | forced = False 49 | if 'forced' in orig_field_record and orig_field_record['forced'] == True: 50 | forced = True 51 | 52 | # if 'forced' not in orig_datatype: 53 | if not forced: 54 | new_datatype_mode = max_datatype_mode(orig_datatype_mode, datatype_mode) 55 | 56 | (new_datatype, new_mode) = parse_datatype_mode(new_datatype_mode) 57 | mongo_schema_collection.find_one_and_update({"key": key, "type": "field"}, 58 | {"$set": {"data_type": new_datatype, 59 | "mode": new_mode}}) 60 | 61 | else: 62 | (datatype, mode) = parse_datatype_mode(datatype_mode) 63 | mongo_schema_collection.insert_one({"key": key, 64 | "type": "field", 65 | "data_type": datatype, 66 | "mode": mode}) 67 | 68 | 69 | def max_datatype_mode (datatype_mode_1, datatype_mode_2): 70 | 71 | if datatype_mode_1 == datatype_mode_2: 72 | return datatype_mode_1 73 | 74 | if datatype_mode_1 == 'record-repeated' or datatype_mode_2 == 'record-repeated': 75 | return 'record-repeated' 76 | 77 | if datatype_mode_1 == 'string-repeated' or datatype_mode_2 == 'string-repeated': 78 | return 'string-repeated' 79 | 80 | if datatype_mode_1 == 'repeated-nullable' or datatype_mode_2 == 'repeated-nullable': 81 | return 'repeated-nullable' 82 | 83 | if datatype_mode_1 == 'record-nullable' or datatype_mode_2 == 'record-nullable': 84 | return 'record-nullable' 85 | 86 | if datatype_mode_1 == 'string-nullable' or datatype_mode_2 == 'string-nullable': 87 | return 'string-nullable' 88 | 89 | if datatype_mode_1 == 'float-nullable' and datatype_mode_2 == 'integer-nullable': 90 | return 'float-nullable' 91 | 92 | if datatype_mode_1 == 'integer-nullable' and datatype_mode_2 == 'float-nullable': 93 | return 'float-nullable' 94 | 95 | return 'string-nullable' 96 | 97 | 98 | def usage(): 99 | print "Usage: %s mongodb://[host]:[port]/[db_name]/[schema_collection_name]" % sys.argv[0] 100 | sys.exit(2) 101 | 102 | 103 | def main(argv): 104 | 105 | if len(argv) < 0: 106 | usage() 107 | 108 | try: 109 | 110 | args = argv[0].split("/") 111 | schema_collection_name = args[-1] 112 | schema_db_name = args[-2] 113 | mongo_uri = '/'.join(args[0:-2]) 114 | 115 | client = MongoClient(mongo_uri) 116 | db = client[schema_db_name] 117 | 118 | global mongo_schema_collection 119 | mongo_schema_collection = db[schema_collection_name] 120 | 121 | except: 122 | usage() 123 | 124 | current_key = None 125 | current_datatype_mode = None 126 | key = None 127 | 128 | # input comes from STDIN 129 | for line in input_stream: 130 | 131 | # remove leading and trailing whitespace 132 | line = line.strip() 133 | 134 | # parse the input we got from mapper.py 135 | (key, datatype_mode) = line.split('\t', 1) 136 | 137 | # this IF-switch only works because Hadoop sorts map output 138 | # by key (here: key) before it is passed to the reducer 139 | if current_key == key: 140 | current_datatype_mode = max_datatype_mode(current_datatype_mode, datatype_mode) 141 | else: 142 | if current_key: 143 | process_new_field(current_key, current_datatype_mode) 144 | current_datatype_mode = datatype_mode 145 | current_key = key 146 | 147 | # do not forget to output the last key if needed! 148 | if current_key == key: 149 | process_new_field(current_key, current_datatype_mode) 150 | 151 | 152 | if __name__ == "__main__": 153 | main(sys.argv[1:]) 154 | -------------------------------------------------------------------------------- /json/transform-data-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2015, OneFold 5 | # All rights reserved. 6 | # http://www.onefold.io 7 | # 8 | # Author: Jorge Chang 9 | # 10 | # See license in LICENSE file. 11 | # 12 | # Transform Data Mapper - takes data from stdin, cleans the data based on schema 13 | # generated previously, and split array fields into different files. 14 | # 15 | 16 | import re 17 | import sys 18 | import json 19 | import os 20 | import socket 21 | import subprocess 22 | import codecs 23 | import hashlib 24 | import pprint 25 | from pymongo import MongoClient 26 | 27 | # create utf reader and writer for stdin and stdout 28 | output_stream = codecs.getwriter("utf-8")(sys.stdout) 29 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore") 30 | error_stream = codecs.getwriter("utf-8")(sys.stderr) 31 | 32 | process_array = "child_table" 33 | shard_key = None 34 | 35 | BATCH_SIZE = 10000 36 | BATCH_NUM_LINES = 50000 37 | 38 | # e.g. schema['event'] = 'string-nullable' 39 | mongo_schema_collection = None 40 | schema = {} 41 | shard_values = [] 42 | 43 | # params 44 | tmp_path = None 45 | 46 | # create file descriptors 47 | file_descriptors = {} 48 | 49 | 50 | def clean_data(line, line_num, parent = None, parent_hash_code = None, is_array = False): 51 | new_data = {} 52 | new_data_fragments = {} 53 | 54 | # read each line into a line_hash 55 | try: 56 | data = json.loads(line, encoding="utf-8") 57 | except ValueError: 58 | print >> error_stream, "Line %i: JSON Parse Error. Data: %s" % (line_num, line) 59 | return None 60 | 61 | # create hash code 62 | hash_code = hashlib.sha1(json.dumps(data, sort_keys=True)).hexdigest() 63 | new_data['hash_code'] = hash_code 64 | 65 | if parent_hash_code != None: 66 | new_data['parent_hash_code'] = parent_hash_code 67 | 68 | # determine shard key (only for root level). 69 | if parent == None: 70 | if shard_key is not None: 71 | shard_value = get_shard_value(data, shard_key) 72 | 73 | if shard_value is None: 74 | print >> error_stream, "Line %i: Invalid shard value. Data: %s" % (line_num, line) 75 | return 76 | 77 | new_data_fragments["root/%s" % shard_value] = new_data 78 | shard_values.append(shard_value) 79 | else: 80 | new_data_fragments['root'] = new_data 81 | 82 | else: 83 | new_data_fragments['root'] = new_data 84 | 85 | if data: 86 | 87 | for (key, value) in data.iteritems(): 88 | 89 | k = re.sub("[^0-9a-zA-Z_]", '_', key).lower() 90 | 91 | # BigQuery disallows field to start with number 92 | if ord(k[0]) >= 48 and ord(k[0]) <= 59: 93 | k = "_f" + k 94 | 95 | # Hive disallows field to start with "_" 96 | if k[0] == '_': 97 | k = k.lstrip("_") 98 | 99 | if parent == None: 100 | full_key = k 101 | dict_key = full_key 102 | else: 103 | if is_array: 104 | full_key = parent + "." + k 105 | dict_key = key 106 | else: 107 | full_key = parent + "_" + k 108 | dict_key = full_key 109 | 110 | # check to see if dict is empty - BigQuery doesn't support RECORD data type with no fields 111 | if isinstance(value, dict) and len(value) == 0: 112 | continue 113 | 114 | # check to see if list is empty - BigQuery doesn't support REPEATED data type with no data 115 | if isinstance(value, list) and len(value) == 0: 116 | continue 117 | 118 | # print error if data type is not found for this key! 119 | if full_key not in schema: 120 | print >> error_stream, "Line %i: Couldn't find data type for key %s. Skipping this value. Data: %s" % ( 121 | line_num, full_key, line) 122 | continue 123 | 124 | data_type = schema[full_key]["data_type"] 125 | mode = schema[full_key]["mode"] 126 | 127 | data_type_forced = False 128 | if 'forced' in schema[full_key]: 129 | data_type_forced = schema[full_key]['forced'] 130 | 131 | if data_type == 'record': 132 | 133 | if mode == 'repeated': 134 | if not isinstance(value, list): 135 | print >> error_stream, "Line %i: Expect repeated record but found %s. Data: %s" % (line_num, value, line) 136 | return None 137 | else: 138 | 139 | if process_array == "child_table": 140 | if full_key not in new_data_fragments: 141 | new_data_fragments[full_key] = [] 142 | 143 | for v in value: 144 | t = clean_data(json.dumps(v, ensure_ascii=False), line_num, full_key, hash_code, True) 145 | 146 | for fragment, fragment_content in t.iteritems(): 147 | if fragment == 'root': 148 | new_data_fragments[full_key].append(fragment_content) 149 | else: 150 | fragment_key = re.sub("[^0-9a-zA-Z_]", '_', fragment).lower() 151 | new_data_fragments[fragment_key] = fragment_content 152 | 153 | else: 154 | new_data[dict_key] = json.dumps(value) 155 | 156 | else: 157 | if not isinstance(value, dict): 158 | print >> error_stream, "Line %i: Expect record but found %s. Data: %s" % (line_num, value, line) 159 | return None 160 | else: 161 | t = clean_data(json.dumps(value, ensure_ascii=False), line_num, full_key) 162 | 163 | for fragment, fragment_content in t.iteritems(): 164 | if fragment == 'root': 165 | fragment_content.pop("hash_code", None) 166 | new_data.update(fragment_content) 167 | 168 | if isinstance(fragment_content, list): 169 | new_data_fragments[fragment] = fragment_content 170 | 171 | else: 172 | 173 | if value: 174 | 175 | # check if data type mismatch 176 | if data_type == 'string': 177 | 178 | if mode == 'repeated': 179 | if not isinstance(value, list): 180 | print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % ( 181 | line_num, value, line) 182 | return None 183 | else: 184 | 185 | if process_array == "child_table": 186 | if full_key not in new_data_fragments: 187 | new_data_fragments[full_key] = [] 188 | 189 | for v in value: 190 | cleaned_v = unicode(v) 191 | t = {"value": cleaned_v, "parent_hash_code": hash_code} 192 | new_data_fragments[full_key].append(t) 193 | else: 194 | new_data[dict_key] = json.dumps(value) 195 | 196 | else: 197 | new_data[dict_key] = unicode(value) 198 | 199 | elif data_type == 'float': 200 | 201 | if mode == 'repeated': 202 | if not isinstance(value, list): 203 | print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % ( 204 | line_num, value, line) 205 | return None 206 | else: 207 | 208 | if process_array == "child_table": 209 | if full_key not in new_data_fragments: 210 | new_data_fragments[full_key] = [] 211 | 212 | for v in value: 213 | 214 | cleaned_v = None 215 | 216 | try: 217 | cleaned_v = float(v) 218 | except ValueError: 219 | if not data_type_forced: 220 | print >> error_stream, "Line %i: Couldn't convert %s to float. Data: %s" % ( 221 | line_num, str(value), line) 222 | return None 223 | 224 | t = {"value": cleaned_v, "parent_hash_code": hash_code} 225 | new_data_fragments[full_key].append(t) 226 | else: 227 | new_data[dict_key] = json.dumps(value) 228 | 229 | else: 230 | try: 231 | new_data[dict_key] = float(value) 232 | except ValueError: 233 | if data_type_forced: 234 | new_data[dict_key] = None 235 | else: 236 | print >> error_stream, "Line %i: Couldn't convert %s to float. Data: %s" % ( 237 | line_num, str(value), line) 238 | return None 239 | 240 | elif data_type == 'integer': 241 | 242 | if mode == 'repeated': 243 | if not isinstance(value, list): 244 | print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % ( 245 | line_num, value, line) 246 | return None 247 | else: 248 | 249 | if process_array == "child_table": 250 | if full_key not in new_data_fragments: 251 | new_data_fragments[full_key] = [] 252 | 253 | for v in value: 254 | 255 | cleaned_v = None 256 | 257 | try: 258 | cleaned_v = int(v) 259 | except ValueError: 260 | if not data_type_forced: 261 | print >> error_stream, "Line %i: Couldn't convert %s to int. Data: %s" % ( 262 | line_num, str(value), line) 263 | return None 264 | 265 | t = {"value": cleaned_v, "parent_hash_code": hash_code} 266 | new_data_fragments[full_key].append(t) 267 | else: 268 | new_data[dict_key] = json.dumps(value) 269 | 270 | else: 271 | try: 272 | new_data[dict_key] = int(value) 273 | except ValueError: 274 | if data_type_forced: 275 | new_data[dict_key] = None 276 | else: 277 | print >> error_stream, "Line %i: Couldn't convert %s to int. Data: %s" % (line_num, str(value), line) 278 | return None 279 | 280 | elif data_type == 'boolean': 281 | 282 | if mode == 'repeated': 283 | if not isinstance(value, list): 284 | print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % ( 285 | line_num, value, line) 286 | return None 287 | else: 288 | 289 | if process_array == "child_table": 290 | if full_key not in new_data_fragments: 291 | new_data_fragments[full_key] = [] 292 | 293 | for v in value: 294 | t = {"value": str(v).lower() == 'true', "parent_hash_code": hash_code} 295 | new_data_fragments[full_key].append(t) 296 | else: 297 | new_data[dict_key] = json.dumps(value) 298 | 299 | else: 300 | new_data[dict_key] = (str(value).lower() == 'true') 301 | 302 | else: 303 | 304 | if mode == 'repeated': 305 | if not isinstance(value, list): 306 | print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % ( 307 | line_num, value, line) 308 | return None 309 | else: 310 | 311 | if process_array == "child_table": 312 | if full_key not in new_data_fragments: 313 | new_data_fragments[full_key] = [] 314 | 315 | for v in value: 316 | cleaned_v = unicode(v) 317 | t = {"value": cleaned_v, "parent_hash_code": hash_code} 318 | new_data_fragments[full_key].append(t) 319 | else: 320 | new_data[dict_key] = json.dumps(value) 321 | 322 | else: 323 | new_data[dict_key] = unicode(value) 324 | 325 | else: 326 | new_data[dict_key] = None 327 | 328 | return new_data_fragments 329 | 330 | 331 | def get_shard_value(data, shard_key): 332 | # split shard key by "." 333 | tmp = data 334 | shard_key_parts = shard_key.split(".") 335 | for shard_key_part in shard_key_parts: 336 | if shard_key_part in tmp: 337 | tmp = tmp[shard_key_part] 338 | else: 339 | return None 340 | 341 | if isinstance(tmp, dict): 342 | return None 343 | else: 344 | 345 | shard_value = str(tmp) 346 | 347 | if len(shard_value) > 32 or len(shard_value) <= 0: 348 | return None 349 | 350 | shard_value = re.sub("[^0-9a-zA-Z_]", '_', shard_value).lower() 351 | return shard_value 352 | 353 | 354 | # creating folder and opening file (for local mode) 355 | def create_file_descriptor(fragment_value, shard_value = None): 356 | 357 | path = fragment_value 358 | if shard_value != None: 359 | path = fragment_value + "/" + shard_value 360 | 361 | # creating folder and opening file (for local mode) 362 | execute('mkdir -p %s/%s' % (tmp_path, path), ignore_error=True) 363 | file_name = '%s/%s/part-00000' % (tmp_path, path) 364 | print >> error_stream, "Opening file descriptor %s" % file_name 365 | file = open(file_name, 'w') 366 | file_descriptors[path] = {"file": file, "file_name": file_name} 367 | print >> error_stream, "Opened file descriptor %s" % file_name 368 | 369 | 370 | def process_line(line, line_num): 371 | # clean data 372 | data_fragments = clean_data(line, line_num, None) 373 | 374 | # skip if data is not clean.. 375 | if data_fragments is None or len(data_fragments) == 0: 376 | return 377 | 378 | # handle other fragments 379 | for fragment_value, fragment_content in data_fragments.iteritems(): 380 | 381 | # open local file descriptor for this fragment (for local mode only) 382 | if tmp_path != None: 383 | if fragment_value not in file_descriptors: 384 | create_file_descriptor(fragment_value) 385 | file = file_descriptors[fragment_value]["file"] 386 | 387 | if isinstance(fragment_content, list): 388 | for element in fragment_content: 389 | if tmp_path != None: 390 | # write data to local file 391 | file.write(json.dumps(element)) 392 | file.write('\n') 393 | else: 394 | print >> output_stream, "%s\t%s" % (fragment_value, json.dumps(element)) 395 | else: 396 | if tmp_path != None: 397 | # write data to local file 398 | file.write(json.dumps(fragment_content)) 399 | file.write('\n') 400 | else: 401 | print >> output_stream, "%s\t%s" % (fragment_value, json.dumps(fragment_content)) 402 | 403 | 404 | def execute(command, ignore_error=False): 405 | print >> error_stream, 'Executing command: %s' % command 406 | if subprocess.call(command, shell=True): 407 | # Non-zero return code indicates an error. 408 | if not ignore_error: 409 | raise Exception("Error executing command: %s" % command) 410 | 411 | 412 | def main(argv): 413 | 414 | # parse parameters 415 | global tmp_path, mongo_schema_collection 416 | 417 | args = argv[0].split(",") 418 | schema_arg = args[0] 419 | if len(args) > 1: 420 | tmp_path = args[1] 421 | 422 | schema_args = schema_arg.split("/") 423 | schema_collection_name = schema_args[-1] 424 | schema_db_name = schema_args[-2] 425 | mongo_uri = '/'.join(schema_args[0:-2]) 426 | 427 | client = MongoClient(mongo_uri) 428 | db = client[schema_db_name] 429 | 430 | mongo_schema_collection = db[schema_collection_name] 431 | 432 | # delete temp folder if already exist (only for local mode) 433 | if tmp_path != None: 434 | execute('rm -rf %s' % tmp_path, ignore_error=True) 435 | 436 | # read schema from MongoDB 437 | global schema, process_array, shard_key 438 | 439 | # read schema from mongodb server 440 | schema_fields = mongo_schema_collection.find({"type": "field"}) 441 | schema = dict((schema_field['key'], schema_field) for schema_field in schema_fields) 442 | 443 | # read process_array from redis 444 | # if redis_server.hget('%s/policy' % app_id, "process_array") != None: 445 | # process_array = redis_server.hget('%s/policy' % app_id, "process_array") 446 | # 447 | # # read shard_key from redis 448 | # if redis_server.hget('%s/policy' % app_id, "shard_key") != None: 449 | # shard_key = redis_server.hget('%s/policy' % app_id, "shard_key") 450 | 451 | # process input 452 | line_num = 1 453 | for line in input_stream: 454 | process_line(line, line_num) 455 | line_num += 1 456 | 457 | # print something to stderr and stdout every 1000 lines 458 | if line_num % 1000 == 0: 459 | print >> error_stream, "Processed %i lines." % line_num 460 | 461 | print >> error_stream, "Finished writing to local files." 462 | 463 | # close out the local files 464 | for fragment_value, file_descriptor in file_descriptors.iteritems(): 465 | print >> error_stream, "Closing file descriptor %s" % fragment_value 466 | 467 | # close file 468 | file_descriptor["file"].close() 469 | 470 | # write fragment values to mongodb 471 | print >> error_stream, "Adding fragment value %s to mongodb." % (fragment_value) 472 | mongo_schema_collection.update_one({"type": "fragments"}, {"$addToSet": {"fragments": fragment_value}}, upsert = True); 473 | 474 | for shard_value in shard_values: 475 | # write shard values to mongodb 476 | if shard_key is not None: 477 | print >> error_stream, "Adding shard value %s to mongodb." % (shard_value) 478 | mongo_schema_collection.update_one({"type": "shards"}, {"$addToSet": {"shards": shard_value}}, upsert = True); 479 | 480 | 481 | if __name__ == "__main__": 482 | main(sys.argv[1:]) 483 | -------------------------------------------------------------------------------- /onefold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Author: Jorge Chang 5 | # 6 | # See license in LICENSE file. 7 | # 8 | # This is the main program used to ETL mongodb collections into Hive tables. 9 | # 10 | 11 | from pymongo import MongoClient 12 | import argparse 13 | import os 14 | import glob 15 | from bson.json_util import dumps 16 | import codecs 17 | import pprint 18 | import json 19 | from onefold_util import execute 20 | from dw_util import Hive, GBigQuery 21 | from cs_util import HDFSStorage, GCloudStorage 22 | 23 | 24 | NUM_RECORDS_PER_PART = 100000 25 | TMP_PATH = '/tmp/onefold_mongo' 26 | CLOUD_STORAGE_PATH = 'onefold_mongo' 27 | HADOOP_MAPREDUCE_STREAMING_LIB = "/usr/hdp/current/hadoop-mapreduce-client/hadoop-streaming.jar" 28 | ONEFOLD_MAPREDUCE_JAR = os.getcwd() + "/java/MapReduce/target/MapReduce-0.0.1-SNAPSHOT.jar" 29 | ONEFOLD_HIVESERDES_JAR = os.getcwd() + "/java/HiveSerdes/target/hive-serdes-1.0-SNAPSHOT.jar" 30 | 31 | # default mapreduce params 32 | mapreduce_params = {} 33 | mapreduce_params["mapred.reduce.max.attempts"] = "0" 34 | mapreduce_params["mapred.map.max.attempts"] = "0" 35 | mapreduce_params["mapred.task.timeout"] = "12000000" 36 | MAPREDUCE_PARAMS_STR = ' '.join(["-D %s=%s"%(k,v) for k,v in mapreduce_params.iteritems()]) 37 | 38 | 39 | # helper function to split "[datatype]-[mode]" into datatype and mode 40 | def parse_datatype_mode (datatype_mode): 41 | a = datatype_mode.split("-") 42 | if len(a) >= 2: 43 | return (a[0], a[1]) 44 | else: 45 | raise ValueError('Invalid datatype / mode tuple %s' % datatype_mode) 46 | 47 | # helper function to check if "address.zip_code" is in data by spliting the jsonpath by "." 48 | def jsonpath_get(mydict, path): 49 | elem = mydict 50 | try: 51 | for x in path.split("."): 52 | elem = elem.get(x) 53 | except: 54 | pass 55 | 56 | return elem 57 | 58 | 59 | class Loader: 60 | 61 | # control params 62 | infra_type = None 63 | mongo_uri = None 64 | db_name = None 65 | collection_name = None 66 | collection_sort_by_field = None 67 | extract_query = None 68 | tmp_path = None 69 | schema_db_name = None 70 | schema_collection_name = None 71 | use_mr = False 72 | 73 | hiveserveer_host = None 74 | hiveserver_port = None 75 | 76 | gcloud_project_id = None 77 | gcloud_storage_bucket_id = None 78 | 79 | write_disposition = None 80 | process_array = "child_table" 81 | dw_database_name = None 82 | dw_table_name = None 83 | 84 | policies = None 85 | 86 | # mongo client and schema collection 87 | mongo_client = None 88 | mongo_schema_collection = None 89 | 90 | # runtime variables 91 | extract_file_names = [] 92 | reject_file_names = [] 93 | sort_by_field_min = None 94 | sort_by_field_max = None 95 | dw_table_names = [] 96 | dw = None 97 | cs = None 98 | num_records_extracted = 0 99 | num_records_rejected = 0 100 | 101 | # policy related variables 102 | required_fields = {} 103 | 104 | 105 | def initialize(self): 106 | 107 | # open mongo client 108 | self.mongo_client = MongoClient(self.mongo_uri) 109 | 110 | # open schema collection 111 | mongo_schema_db = self.mongo_client[self.schema_db_name] 112 | self.mongo_schema_collection = mongo_schema_db[self.schema_collection_name] 113 | 114 | # if overwrite, delete schema collection 115 | if self.write_disposition == 'overwrite': 116 | self.mongo_schema_collection.remove({}) 117 | 118 | # create data warehouse object 119 | if self.infra_type == 'hadoop': 120 | self.dw = Hive(self.hiveserveer_host, self.hiveserver_port, ONEFOLD_HIVESERDES_JAR) 121 | self.cs = HDFSStorage() 122 | elif self.infra_type == 'gcloud': 123 | self.dw = GBigQuery(self.gcloud_project_id, self.gcloud_storage_bucket_id) 124 | self.cs = GCloudStorage(self.gcloud_project_id, self.gcloud_storage_bucket_id) 125 | 126 | # turn policies into better data structure for use later (required_fields) 127 | if self.policies != None: 128 | for policy in self.policies: 129 | if 'key' in policy: 130 | if 'required' in policy: 131 | if policy['key'] not in self.required_fields == None: 132 | self.required_fields[policy['key']] = {} 133 | self.required_fields[policy['key']] = policy 134 | 135 | if 'data_type' in policy: 136 | datatype_overwrite = policy['data_type'] 137 | 138 | if 'mode' in policy: 139 | mode_overwrite = policy['mode'] 140 | else: 141 | mode_overwrite = 'nullable' 142 | 143 | self.mongo_schema_collection.update_one( 144 | {"key": policy['key'].replace(".", "_"), "type": "field"}, 145 | {"$set": {"data_type": datatype_overwrite, 146 | "mode": mode_overwrite, 147 | "forced": True}}, 148 | upsert = True) 149 | 150 | 151 | def extract_data(self): 152 | 153 | # create tmp_path folder if necessary 154 | if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'data')): 155 | os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'data')) 156 | 157 | if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'rejected')): 158 | os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'rejected')) 159 | 160 | # delete old tmp files if exists 161 | for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'data', '*')): 162 | print "Deleting old file %s" % (old_file) 163 | os.remove(old_file) 164 | 165 | for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'rejected', '*')): 166 | print "Deleting old file %s" % (old_file) 167 | os.remove(old_file) 168 | 169 | # some state variables 170 | part_num = 0 171 | extract_file = None 172 | 173 | reject_part_num = 0 174 | reject_file = None 175 | 176 | # start mongo client 177 | db = self.mongo_client[self.db_name] 178 | collection = db[self.collection_name] 179 | 180 | # turn query string into json 181 | if self.extract_query is not None: 182 | if 'ObjectId' in self.extract_query: 183 | # kinda hacky.. and dangerous! This is to evaluate an expression 184 | # like {"_id": {$gt:ObjectId("55401a60151a4b1a4f000001")}} 185 | from bson.objectid import ObjectId 186 | extract_query_json = eval(self.extract_query) 187 | else: 188 | extract_query_json = json.loads(self.extract_query) 189 | else: 190 | extract_query_json = None 191 | 192 | # query collection, sort by collection_sort_by_field 193 | for data in collection.find(extract_query_json).sort(self.collection_sort_by_field, 1): 194 | 195 | # track min and max id for auditing.. 196 | if self.sort_by_field_min == None: 197 | self.sort_by_field_min = data[self.collection_sort_by_field] 198 | self.sort_by_field_max = data[self.collection_sort_by_field] 199 | 200 | # open a new file if necessary 201 | if self.num_records_extracted % NUM_RECORDS_PER_PART == 0: 202 | 203 | if extract_file != None: 204 | extract_file.close() 205 | 206 | part_num += 1 207 | extract_file_name = os.path.join(self.tmp_path, self.collection_name, 'data', str(part_num)) 208 | extract_file = open(extract_file_name, "w") 209 | extract_file_codec = codecs.getwriter("utf-8")(extract_file) 210 | self.extract_file_names.append(extract_file_name) 211 | print "Creating file %s" % extract_file_name 212 | 213 | # validate policies 214 | rejected = False 215 | for required_field_name, policy in self.required_fields.iteritems(): 216 | if policy['required'] and jsonpath_get(data, required_field_name) is None: 217 | 218 | # -------------------------------------------------------- 219 | # document found that doesn't contain required fields. 220 | # -------------------------------------------------------- 221 | 222 | # open a new file if necessary 223 | if self.num_records_rejected % NUM_RECORDS_PER_PART == 0: 224 | 225 | if reject_file != None: 226 | reject_file.close() 227 | 228 | reject_part_num += 1 229 | reject_file_name = os.path.join(self.tmp_path, self.collection_name, 'rejected', str(reject_part_num)) 230 | reject_file = open(reject_file_name, "w") 231 | reject_file_codec = codecs.getwriter("utf-8")(reject_file) 232 | self.reject_file_names.append(reject_file_name) 233 | print "Creating reject file %s" % reject_file_name 234 | 235 | self.num_records_rejected += 1 236 | reject_file_codec.write("Rejected. Missing %s. Data: %s" % (required_field_name, dumps(data))) 237 | reject_file_codec.write('\n') 238 | 239 | rejected = True 240 | break 241 | 242 | if not rejected: 243 | self.num_records_extracted += 1 244 | extract_file_codec.write(dumps(data)) 245 | extract_file_codec.write('\n') 246 | 247 | if extract_file != None: 248 | extract_file.close() 249 | 250 | if reject_file != None: 251 | reject_file.close() 252 | 253 | def simple_schema_gen(self): 254 | command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \ 255 | % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name) 256 | execute(command) 257 | 258 | 259 | def mr_schema_gen(self): 260 | 261 | hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name) 262 | hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (CLOUD_STORAGE_PATH, self.collection_name) 263 | 264 | # delete folders 265 | self.cs.rmdir(hdfs_data_folder) 266 | self.cs.rmdir(hdfs_mr_output_folder) 267 | 268 | 269 | # copy extracted files to hdfs data folder 270 | self.cs.mkdir(hdfs_data_folder) 271 | 272 | for extract_file_name in self.extract_file_names: 273 | self.cs.copy_from_local(extract_file_name, hdfs_data_folder) 274 | 275 | hadoop_command = """hadoop jar %s \ 276 | -D mapred.job.name="onefold-mongo-generate-schema" \ 277 | %s \ 278 | -input %s -output %s \ 279 | -mapper 'json/generate-schema-mapper.py' \ 280 | -reducer 'json/generate-schema-reducer.py %s/%s/%s' \ 281 | -file json/generate-schema-mapper.py \ 282 | -file json/generate-schema-reducer.py 283 | """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR, hdfs_data_folder, 284 | hdfs_mr_output_folder, self.mongo_uri, 285 | self.schema_db_name, self.schema_collection_name) 286 | execute(hadoop_command) 287 | 288 | 289 | def simple_data_transform(self): 290 | 291 | hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name) 292 | transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name) 293 | 294 | command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \ 295 | % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, 296 | self.schema_collection_name, transform_data_tmp_path) 297 | execute(command) 298 | 299 | # delete folders 300 | self.cs.rmdir (hdfs_mr_output_folder) 301 | 302 | # manually copy files into hdfs 303 | fragment_values = self.get_fragments() 304 | for fragment_value in fragment_values: 305 | self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value)) 306 | self.cs.copy_from_local("%s/%s/part-00000" % (transform_data_tmp_path, fragment_value), 307 | "%s/%s/" % (hdfs_mr_output_folder, fragment_value)) 308 | 309 | 310 | def mr_data_transform(self): 311 | 312 | hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name) 313 | hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name) 314 | 315 | # delete folders 316 | self.cs.rmdir(hdfs_mr_output_folder) 317 | 318 | hadoop_command = """hadoop jar %s \ 319 | -libjars %s \ 320 | -D mapred.job.name="onefold-mongo-transform-data" \ 321 | -D mapred.reduce.tasks=0 \ 322 | %s \ 323 | -input %s -output %s \ 324 | -mapper 'json/transform-data-mapper.py %s/%s/%s' \ 325 | -file json/transform-data-mapper.py \ 326 | -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat 327 | """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri, 328 | self.schema_db_name, self.schema_collection_name) 329 | execute(hadoop_command) 330 | 331 | 332 | # retrieve schema tree from schema collection 333 | def retrieve_schema_fields(self): 334 | 335 | # read schema from mongodb schema collection 336 | schema_fields = [] 337 | 338 | mongo_schema_fields = self.mongo_schema_collection.find({"type": "field"}) 339 | for mongo_schema_field in mongo_schema_fields: 340 | schema_fields.append(mongo_schema_field) 341 | 342 | # add hash code to field 343 | field = {} 344 | field['key'] = "hash_code" 345 | field['mode'] = "nullable" 346 | field['data_type'] = "string" 347 | schema_fields.append(field) 348 | 349 | return schema_fields 350 | 351 | 352 | def get_fragments(self): 353 | fragment_record = self.mongo_schema_collection.find_one({"type": "fragments"}) 354 | if fragment_record != None: 355 | return fragment_record['fragments'] 356 | else: 357 | return [] 358 | 359 | 360 | def load_table_hive (self, shard_value = None, table_name = None, different_table_per_shard = False, data_import_id = None): 361 | 362 | # if shard_value is None: 363 | # gcs_uri = "%s/data/*" % (self.mr4_output_folder_uri) 364 | # else: 365 | # gcs_uri = "%s/data/%s/*" % (self.mr4_output_folder_uri, shard_value) 366 | 367 | if different_table_per_shard: 368 | full_table_name = "%s_%s" % (table_name, shard_value) 369 | else: 370 | full_table_name = "%s" % (table_name) 371 | 372 | cloud_storage_path = "%s/%s/data_transform/output/%s/" % (CLOUD_STORAGE_PATH, self.collection_name, shard_value) 373 | self.dw.load_table(self.dw_database_name, full_table_name, cloud_storage_path) 374 | 375 | # extract bq_job_id and save to db 376 | return "%s/%s" % (data_import_id, shard_value) 377 | 378 | 379 | def load_dw (self): 380 | 381 | # retrieve schema fields from mongodb schema collection 382 | schema_fields = self.retrieve_schema_fields() 383 | 384 | # create tables 385 | if self.write_disposition == 'overwrite': 386 | if self.dw.table_exists(self.dw_database_name, self.dw_table_name): 387 | self.dw.delete_table(self.dw_database_name, self.dw_table_name) 388 | self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array) 389 | else: 390 | # if append, update table. 391 | if self.dw.table_exists(self.dw_database_name, self.dw_table_name): 392 | self.dw_table_names = self.dw.update_table(self.dw_database_name, self.dw_table_name, schema_fields) 393 | else: 394 | self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array) 395 | 396 | # load data 397 | fragment_values = self.get_fragments() 398 | 399 | if fragment_values == None or len(fragment_values) == 0: 400 | table_name = self.dw_table_name 401 | self.load_table_hive(shard_value = None, table_name = table_name, different_table_per_shard=False, data_import_id=None) 402 | 403 | else: 404 | for fragment_value in fragment_values: 405 | print "Loading fragment: " + fragment_value 406 | if fragment_value == 'root': 407 | table_name = self.dw_table_name 408 | else: 409 | table_name = self.dw_table_name + "_" + fragment_value 410 | 411 | self.load_table_hive(shard_value = fragment_value, table_name = table_name, different_table_per_shard=False, data_import_id=None) 412 | 413 | 414 | def run(self): 415 | # init (start mongo client) 416 | self.initialize() 417 | 418 | # extract data from Mongo 419 | self.extract_data() 420 | 421 | if self.num_records_extracted > 0: 422 | # generate schema and transform data 423 | if self.use_mr: 424 | self.mr_schema_gen() 425 | self.mr_data_transform() 426 | else: 427 | self.simple_schema_gen() 428 | self.simple_data_transform() 429 | 430 | # Create data warehouse tables and load data into them 431 | self.load_dw() 432 | 433 | print '-------------------' 434 | print ' RUN SUMMARY' 435 | print '-------------------' 436 | print 'Num records extracted %s' % self.num_records_extracted 437 | print 'Num records rejected %s' % self.num_records_rejected 438 | print 'Extracted data with %s from %s to %s' % (self.collection_sort_by_field, self.sort_by_field_min, self.sort_by_field_max) 439 | print 'Extracted files are located at: %s' % (' '.join(self.extract_file_names)) 440 | print 'Destination Tables: %s' % (' '.join(self.dw_table_names)) 441 | print 'Schema is stored in Mongo %s.%s' % (self.schema_db_name, self.schema_collection_name) 442 | 443 | def usage(): 444 | # ./onefold.py --mongo mongodb://173.255.115.8:27017 --source_db test --source_collection uber_events --schema_db test --schema_collection uber_events_schema --hiveserver_host 130.211.146.208 --hiveserver_port 10000 445 | # ./onefold.py --mongo mongodb://173.255.115.8:27017 --source_db test --source_collection uber_events --schema_db test --schema_collection uber_events_schema --hiveserver_host 130.211.146.208 --hiveserver_port 10000 --use_mr 446 | pass 447 | 448 | def main(): 449 | 450 | # parse command line 451 | parser = argparse.ArgumentParser(description='Generate schema for MongoDB collections.') 452 | parser.add_argument('--mongo', metavar='mongo', type=str, required=True, help='MongoDB connectivity') 453 | parser.add_argument('--source_db', metavar='source_db', type=str, required=True, help='Source MongoDB database name') 454 | parser.add_argument('--source_collection', metavar='source_collection', type=str, required=True, 455 | help='Source MongoDB collection name') 456 | parser.add_argument('--source_sort_by_field', metavar='source_sort_by_field', type=str, default='_id', 457 | help='Source MongoDB collection name') 458 | parser.add_argument('--query', metavar='query', type=str, help='Mongo Query for filtering') 459 | parser.add_argument('--tmp_path', metavar='tmp_path', type=str, help='Path to store tmp file from extraction.', 460 | default=TMP_PATH) 461 | parser.add_argument('--schema_db', metavar='schema_db', type=str, 462 | help='MongoDB database name to store schema. If not provided, default to source db.') 463 | parser.add_argument('--schema_collection', metavar='schema_collection', type=str, 464 | help='MongoDB collection name to store schema. If not provided, default to [source_collection]_schema') 465 | parser.add_argument('--write_disposition', metavar='write_disposition', type=str, 466 | help='overwrite or append. Default is overwrite', default='overwrite', choices=['overwrite', 'append']) 467 | parser.add_argument('--dest_db_name', metavar='dest_db_name', type=str, 468 | help='Hive database name. If not provided, default to \'default\' hive database.') 469 | parser.add_argument('--dest_table_name', metavar='dest_table_name', type=str, 470 | help='Hive table name. If not provided, default to source collection name.') 471 | parser.add_argument('--use_mr', action='store_true') 472 | parser.add_argument('--policy_file', metavar='policy_file', type=str, 473 | help='Data Policy file name.') 474 | parser.add_argument('--infra_type', metavar='infra_type', type=str, default='hadoop', 475 | help='Infrastructure type. One of hadoop or gcloud') 476 | 477 | # hive related parameters 478 | parser.add_argument('--hiveserver_host', metavar='hiveserver_host', type=str, required=False, help='Hiveserver host') 479 | parser.add_argument('--hiveserver_port', metavar='hiveserver_port', type=str, required=False, help='Hiveserver port') 480 | 481 | # gcloud related parameters 482 | parser.add_argument('--gcloud_project_id', metavar='gcloud_project_id', type=str, required=False, help='GCloud project id') 483 | parser.add_argument('--gcloud_storage_bucket_id', metavar='gcloud_storage_bucket_id', type=str, required=False, help='GCloud storage bucket id') 484 | 485 | args = parser.parse_args() 486 | 487 | # global mongo_uri, db_name, collection_name, extract_query, tmp_path, schema_db_name, schema_collection_name, use_mr 488 | loader = Loader() 489 | loader.infra_type = args.infra_type 490 | loader.mongo_uri = args.mongo 491 | loader.db_name = args.source_db 492 | loader.collection_name = args.source_collection 493 | loader.collection_sort_by_field = args.source_sort_by_field 494 | loader.extract_query = args.query 495 | loader.tmp_path = args.tmp_path 496 | 497 | if args.schema_db != None: 498 | loader.schema_db_name = args.schema_db 499 | else: 500 | loader.schema_db_name = args.source_db 501 | 502 | if args.schema_collection != None: 503 | loader.schema_collection_name = args.schema_collection 504 | else: 505 | loader.schema_collection_name = "%s_schema" % args.source_collection 506 | 507 | if args.infra_type == 'hadoop': 508 | if args.hiveserver_host is None: 509 | raise ValueError("hiveserver_host must be specified for 'hadoop' infrastructure type.") 510 | if args.hiveserver_port is None: 511 | raise ValueError("hiveserver_port must be specified for 'hadoop' infrastructure type.") 512 | 513 | loader.hiveserver_host = args.hiveserver_host 514 | loader.hiveserver_port = args.hiveserver_port 515 | else: 516 | if args.gcloud_project_id is None: 517 | raise ValueError("gcloud_project_id must be specified for 'gcloud' infrastructure type.") 518 | if args.gcloud_storage_bucket_id is None: 519 | raise ValueError("gcloud_storage_bucket_id must be specified for 'gcloud' infrastructure type.") 520 | 521 | loader.gcloud_project_id = args.gcloud_project_id 522 | loader.gcloud_storage_bucket_id = args.gcloud_storage_bucket_id 523 | 524 | loader.write_disposition = args.write_disposition 525 | 526 | if args.dest_table_name != None: 527 | loader.dw_table_name = args.dest_table_name 528 | else: 529 | loader.dw_table_name = args.source_collection 530 | 531 | if args.dest_db_name != None: 532 | loader.dw_database_name = args.dest_db_name 533 | 534 | if args.use_mr: 535 | loader.use_mr = args.use_mr 536 | 537 | if args.policy_file != None: 538 | # open policy file 539 | policy_file = open(args.policy_file, "r") 540 | loader.policies = json.loads(policy_file.read()) 541 | 542 | loader.run() 543 | 544 | 545 | if __name__ == '__main__': 546 | main() 547 | -------------------------------------------------------------------------------- /onefold_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2015, OneFold 5 | # All rights reserved. 6 | # http://www.onefold.io 7 | # 8 | # Author: Jorge Chang 9 | # 10 | # See license in LICENSE file. 11 | # 12 | # OneFold utility functions - mainly for executing shell commands. 13 | # 14 | 15 | import subprocess 16 | import os 17 | import random 18 | import time 19 | 20 | # execute shell command 21 | def execute(command, ignore_error=False, retry=False, subpress_output=False): 22 | 23 | if retry: 24 | num_retries = 5 25 | else: 26 | num_retries = 1 27 | 28 | l = range(0,num_retries) 29 | for n in l: 30 | try: 31 | print 'Executing command: %s' % command 32 | 33 | if subpress_output: 34 | devnull = open(os.devnull, 'w') 35 | rc = subprocess.call(command, shell=True, stdout=devnull, stderr=devnull) 36 | else: 37 | rc = subprocess.call(command, shell=True) 38 | 39 | if rc: 40 | # Non-zero return code indicates an error. 41 | if not ignore_error: 42 | raise Exception("Error executing command: %s" % command) 43 | 44 | # if command ran successfully, return! 45 | return 46 | except: 47 | if retry: 48 | # Apply exponential backoff. 49 | print 'Retry-able. Sleeping...' 50 | time.sleep((2 ** n) + random.randint(0, 1000) / 1000) 51 | else: 52 | raise 53 | 54 | # only reach this point if we've re-tried and still failed. 55 | if retry: 56 | print "Retries exceeded (%s times). Throwing exception.." % num_retries 57 | raise Exception ("Retries exceeded (%s times) when executing this command." % num_retries) 58 | 59 | 60 | def execute_and_read_with_retry(command): 61 | for n in range(0,5): 62 | (return_code, stdout_lines, stderr_lines) = execute_and_read(command) 63 | if return_code == 0: 64 | break 65 | else: 66 | print "Error executing command: %s with return code %s" % (command, return_code) 67 | print 'Retry-able. Sleeping...' 68 | time.sleep((2 ** n) + random.randint(0, 1000) / 1000) 69 | 70 | return (return_code, stdout_lines, stderr_lines) 71 | 72 | 73 | # execute shell command and return stdout as list of strings 74 | def execute_and_read(command): 75 | # run command and read stdout 76 | print 'Executing command: %s' % command 77 | p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 78 | p.wait() 79 | 80 | return_code = p.returncode 81 | 82 | stdout_lines = p.stdout.readlines() 83 | # print stdout 84 | for line in stdout_lines: 85 | print line.strip() 86 | 87 | stderr_lines = p.stderr.readlines() 88 | # print stderr 89 | for line in stderr_lines: 90 | print line.strip() 91 | 92 | return (return_code, stdout_lines, stderr_lines) 93 | 94 | 95 | --------------------------------------------------------------------------------