├── .gitignore
├── LICENSE
├── README.md
├── cs_util.py
├── dw_util.py
├── java
    ├── HiveSerdes
    │   ├── dependency-reduced-pom.xml
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── com
    │   │               └── cloudera
    │   │                   └── hive
    │   │                       └── serde
    │   │                           └── JSONSerDe.java
    └── MapReduce
    │   ├── dependency-reduced-pom.xml
    │   ├── pom.xml
    │   └── src
    │       └── main
    │           └── java
    │               └── com
    │                   └── onefold
    │                       └── hadoop
    │                           └── MapReduce
    │                               └── TransformDataMultiOutputFormat.java
├── json
    ├── generate-schema-mapper.py
    ├── generate-schema-reducer.py
    └── transform-data-mapper.py
├── onefold.py
└── onefold_util.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # Java / Eclipse
60 | .idea
61 | *.class
62 | *.classpath
63 | .project
64 | **/.settings


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     {description}
294 |     Copyright (C) {year}  {fullname}
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   {signature of Ty Coon}, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 
341 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Mongo - Google Big Query Connector
  2 | 
  3 | Super-easy way to load your MongoDB collection into Google BigQuery. The code creates Google BigQuery schema automatically by performing a deep inspection of each MongoDB record and deriving the data type of each field. Supports basic data types, nested objects, array of primitive data types and array of objects.
  4 | 
  5 | Nested fields are flattened out into columns.
  6 | 
  7 | Arrays are typically split into a different (child) BigQuery table with parent/child relationship with the root table.
  8 | 
  9 | ## How it works
 10 | 
 11 | 1. Connects to your MongoDB and extract the specified collection into local file which is then copied to Google Cloud Storage.
 12 | 2. MapReduce generates schema (a copy is saved back to MongoDB for info).
 13 | 3. MapReduce transforms data, breaking the array into multiple files in Google Cloud Storage output folder.
 14 | 4. Create BigQuery tables using schema generated in step 2.
 15 | 5. Load BigQuery tables using Google Cloud Storage files generated in step 3.
 16 | 
 17 | ## Pre-requisites
 18 | 
 19 | 1. You have a Hadoop cluster.
 20 | 2. You can SSH to the master node.
 21 | 3. Make sure `hadoop` program is in your `PATH`.
 22 | 4. In each node, the following is installed:
 23 |   * python (2.6+)
 24 |   * pip
 25 |   * pymongo
 26 | 
 27 | If not, you can run the following on each node:
 28 | ```
 29 | yum -y install epel-release
 30 | yum -y install python-pip
 31 | pip install pymongo
 32 | ```
 33 | 
 34 | ## Install
 35 | 
 36 | 1. git clone this repo on the master node in your Hadoop cluster.
 37 | 2. Run this to compile custom code needed for MapReduce:
 38 | 
 39 |   ```
 40 |   cd java/MapReduce
 41 |   mvn package
 42 |   ```
 43 | 3. Make sure you have gcloud command line utilities installed in your Hadoop master mode. Executables that this program depends on are:
 44 | 
 45 |   ```
 46 |   gsutil
 47 |   bq
 48 |   ```
 49 | 
 50 | In `onefold.py`, near the top, there are a few configuration that you can customized. Make sure these variables are set correctly before proceeding.
 51 | 
 52 | `TMP_PATH`
 53 | 
 54 | Where the script will store extracted data from MongoDB.
 55 | 
 56 | `CLOUD_STORAGE_PATH`
 57 | 
 58 | Google Cloud Storage Path where it will store files for MapReduce and BigQuery.
 59 | 
 60 | 
 61 | ## Usage
 62 | 
 63 | ### Simple case
 64 | Say you have a MongoDB collection called "test.users", and you have some records in it:
 65 | 
 66 | ```
 67 | > db.users.find();
 68 | { "_id" : ObjectId("5688d0855d53fc2c133f3429"), "mobile" : { "carrier" : "Sprint", "device" : "Samsung" }, "name" : "John Doe", "age" : 24, "utm_campaign" : "Facebook_Offer", "app_version" : "2.4", "address" : { "city" : "Chicago", "zipcode" : 94012 } }
 69 | ```
 70 | 
 71 | To load this into BigQuery,
 72 | 
 73 | ```
 74 | ./onefold.py --mongo mongodb://[mongodb_host]:[mongodb_port] \
 75 |              --source_db test \
 76 |              --source_collection users \
 77 |              --infra_type gcloud \
 78 |              --dest_db_name test \
 79 |              --dest_table_name users \
 80 |              --gcloud_project_id [google_cloud_project_id] \
 81 |              --gcloud_storage_bucket_id [google_cloud_storage_bucket_id]
 82 | ```
 83 | 
 84 | Results:
 85 | ```
 86 | -- Initializing Google BigQuery module --
 87 | Creating file /tmp/onefold_mongo/users/data/1
 88 | Executing command: cat /tmp/onefold_mongo/users/data/1 | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py mongodb://localhost:27017/test/users_schema > /dev/null
 89 | Executing command: cat /tmp/onefold_mongo/users/data/1 | json/transform-data-mapper.py mongodb://localhost:27017/test/users_schema,/tmp/onefold_mongo/users/data_transform/output > /dev/null
 90 | ...
 91 | Executing command: gsutil -m rm -rf gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/
 92 | Removing gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/part-00000#1451806915461000...
 93 | copy_from_local: /tmp/onefold_mongo/users/data_transform/output/root/part-00000 onefold_mongo/users/data_transform/output/root/
 94 | Executing command: gsutil -m cp /tmp/onefold_mongo/users/data_transform/output/root/part-00000 gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/part-00000
 95 | ...
 96 | Executing command: bq --project_id mongo-gbq --format csv ls test
 97 | Executing command: bq --project_id mongo-gbq mk --schema users_schema.json test.users
 98 | Table 'mongo-gbq:test.users' successfully created.
 99 | Loading fragment: root
100 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.users gs://mongo-gbq-bucket/onefold_mongo/users/data_transform/output/root/*
101 | Successfully started load mongo-gbq:bqjob_r4d275de6da77baf3_0000015206702df7_1
102 | -------------------
103 |     RUN SUMMARY
104 | -------------------
105 | Num records extracted 1
106 | Num records rejected 0
107 | Extracted data with _id from 5688d0855d53fc2c133f3429 to 5688d0855d53fc2c133f3429
108 | Extracted files are located at: /tmp/onefold_mongo/users/data/1
109 | Destination Tables: users
110 | Schema is stored in Mongo test.users_schema
111 | ```
112 | 
113 | In Google BigQuery, you can see:
114 | ```
115 | $ bq show test.users
116 | Table mongo-gbq:test.users
117 | 
118 |    Last modified             Schema             Total Rows   Total Bytes   Expiration
119 |  ----------------- --------------------------- ------------ ------------- ------------
120 |   02 Jan 23:43:12   |- address_city: string     1            141
121 |                     |- address_zipcode: float
122 |                     |- age: float
123 |                     |- app_version: string
124 |                     |- id_oid: string
125 |                     |- mobile_carrier: string
126 |                     |- mobile_device: string
127 |                     |- name: string
128 |                     |- utm_campaign: string
129 |                     |- hash_code: string
130 | 
131 | $ bq query "select * from test.users"
132 | Waiting on bqjob_r710f4e875a413367_000001520674ebba_1 ... (0s) Current status: DONE
133 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+
134 | | address_city | address_zipcode | age  | app_version |          id_oid          | mobile_carrier | mobile_device |   name   |  utm_campaign  |                hash_code                 |
135 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+
136 | | Chicago      |         94012.0 | 24.0 | 2.4         | 5688d0855d53fc2c133f3429 | Sprint         | Samsung       | John Doe | Facebook_Offer | abf9a2ac1ce71feb12418c889b913f8d8361a6d4 |
137 | +--------------+-----------------+------+-------------+--------------------------+----------------+---------------+----------+----------------+------------------------------------------+
138 | ```
139 | 
140 | In Mongo, you can see the schema saved in a collection called `users_schema`:
141 | ```
142 | > db.users_schema.find();
143 | { "_id" : ObjectId("55426ae6296e827fc79300b1"), "type" : "field", "data_type" : "string-nullable", "key" : "address_city" }
144 | { "_id" : ObjectId("55426ae6296e827fc79300b2"), "type" : "field", "data_type" : "record-nullable", "key" : "address" }
145 | { "_id" : ObjectId("55426ae6296e827fc79300b3"), "type" : "field", "data_type" : "integer-nullable", "key" : "address_zipcode" }
146 | { "_id" : ObjectId("55426ae6296e827fc79300b4"), "type" : "field", "data_type" : "integer-nullable", "key" : "age" }
147 | { "_id" : ObjectId("55426ae6296e827fc79300b5"), "type" : "field", "data_type" : "string-nullable", "key" : "app_version" }
148 | { "_id" : ObjectId("55426ae6296e827fc79300b6"), "type" : "field", "data_type" : "string-nullable", "key" : "id_oid" }
149 | { "_id" : ObjectId("55426ae6296e827fc79300b7"), "type" : "field", "data_type" : "record-nullable", "key" : "id" }
150 | { "_id" : ObjectId("55426ae6296e827fc79300b8"), "type" : "field", "data_type" : "string-nullable", "key" : "mobile_carrier" }
151 | { "_id" : ObjectId("55426ae6296e827fc79300b9"), "type" : "field", "data_type" : "string-nullable", "key" : "mobile_device" }
152 | { "_id" : ObjectId("55426ae6296e827fc79300ba"), "type" : "field", "data_type" : "record-nullable", "key" : "mobile" }
153 | { "_id" : ObjectId("55426ae6296e827fc79300bb"), "type" : "field", "data_type" : "string-nullable", "key" : "name" }
154 | { "_id" : ObjectId("55426ae6296e827fc79300bc"), "type" : "field", "data_type" : "string-nullable", "key" : "utm_campaign" }
155 | { "_id" : ObjectId("55426ae72e2ecef82b7417d1"), "type" : "fragments", "fragments" : [ "root" ] }
156 | ```
157 | 
158 | Notes:
159 | 
160 | 1. By default, extracted data is saved in `/tmp/onefold_mongo`. It can be changed by specifying the `tmp_path` parameter.
161 | 2. If `--use_mr` parameter is specified, it will use MapReduce to generate schema and transform data. Otherwise, it runs the mapper and reducer via command line using `cat [input] | mapper | sort | reducer` metaphor. This is handy if you don't have many records and/or just want to get this going quickly.
162 | 3. The generated files are in JSON format.
163 | 4. Nested objects like `mobile` and `address` in the above example are flattened out in the BigQuery table.
164 | 5. `hash_code` column is added. It's basically an SHA1 hash of the object. It's useful later on when we use `hash_code` as parent-child key to represent array in a child table.
165 | 
166 | 
167 | ### Now let's try a more complex collection.
168 | 
169 | In Mongo, create a `complex_users` collection with the following fields:
170 | ```
171 | > db.complex_users.find()
172 | { "_id" : ObjectId("5688d73c5d53fc2c133f342b"), "hobbies" : [ "reading", "cycling" ], "age" : 34, "work_history" : [ { "to" : "present", "from" : 2013, "name" : "IBM" }, { "to" : 2013, "from" : 2003, "name" : "Bell" } ], "utm_campaign" : "Google", "name" : "Alexander Keith", "app_version" : "2.5", "mobile" : { "device" : "iPhone", "carrier" : "Rogers" }, "address" : { "state" : "Ontario", "zipcode" : "M1K3A5", "street" : "26 Marshall Lane", "city" : "Toronto" } }
173 | ```
174 | 
175 | A new `hobbies` field is added that is a string array.
176 | A new `work_history` field is added that is an array of nested objects.
177 | 
178 | Run the following command to load `complex_users` collection into BigQuery:
179 | ```
180 | ./onefold.py --mongo mongodb://[mongodb_host]:[mongodb_port] \
181 |              --source_db test \
182 |              --source_collection complex_users \
183 |              --infra_type gcloud \
184 |              --dest_db_name test \
185 |              --dest_table_name complex_users \
186 |              --gcloud_project_id [google_cloud_project_id] \
187 |              --gcloud_storage_bucket_id [google_cloud_storage_bucket_id]
188 | ```
189 | 
190 | Results:
191 | ```
192 | -- Initializing Google BigQuery module --
193 | Creating file /tmp/onefold_mongo/complex_users/data/1
194 | Executing command: cat /tmp/onefold_mongo/complex_users/data/1 | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py mongodb://localhost:27017/test/complex_users_schema > /dev/null
195 | Executing command: cat /tmp/onefold_mongo/complex_users/data/1 | json/transform-data-mapper.py mongodb://localhost:27017/test/complex_users_schema,/tmp/onefold_mongo/complex_users/data_transform/output > /dev/null
196 | Executing command: rm -rf /tmp/onefold_mongo/complex_users/data_transform/output
197 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/root
198 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000
199 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000
200 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/work_history
201 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000
202 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000
203 | Executing command: mkdir -p /tmp/onefold_mongo/complex_users/data_transform/output/hobbies
204 | Opening file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000
205 | Opened file descriptor /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000
206 | ...
207 | Executing command: gsutil -m rm -rf gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/
208 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 onefold_mongo/complex_users/data_transform/output/root/
209 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/root/part-00000
210 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/root/part-00000 [Content-Type=application/octet-stream]...
211 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 onefold_mongo/complex_users/data_transform/output/work_history/
212 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/work_history/part-00000
213 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/work_history/part-00000 [Content-Type=application/octet-stream]...
214 | copy_from_local: /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 onefold_mongo/complex_users/data_transform/output/hobbies/
215 | Executing command: gsutil -m cp /tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000
216 | Copying file:///tmp/onefold_mongo/complex_users/data_transform/output/hobbies/part-00000 [Content-Type=application/octet-stream]...
217 | ...
218 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_schema.json test.complex_users
219 | Table 'mongo-gbq:test.complex_users' successfully created.
220 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_work_history_schema.json test.complex_users_work_history
221 | Table 'mongo-gbq:test.complex_users_work_history' successfully created.
222 | Executing command: bq --project_id mongo-gbq mk --schema complex_users_hobbies_schema.json test.complex_users_hobbies
223 | Table 'mongo-gbq:test.complex_users_hobbies' successfully created.
224 | Loading fragment: root
225 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/root/*
226 | Successfully started load mongo-gbq:bqjob_r4fe3384c09234c1d_00000152068b5e85_1
227 | Loading fragment: work_history
228 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users_work_history gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/work_history/*
229 | Successfully started load mongo-gbq:bqjob_r138de33f6e2058cc_00000152068b62aa_1
230 | Loading fragment: hobbies
231 | Executing command: bq --project_id mongo-gbq --nosync load --source_format NEWLINE_DELIMITED_JSON test.complex_users_hobbies gs://mongo-gbq-bucket/onefold_mongo/complex_users/data_transform/output/hobbies/*
232 | Successfully started load mongo-gbq:bqjob_r361aa8424636d4a0_00000152068b689e_1
233 | -------------------
234 |     RUN SUMMARY
235 | -------------------
236 | Num records extracted 1
237 | Num records rejected 0
238 | Extracted data with _id from 5688d73c5d53fc2c133f342b to 5688d73c5d53fc2c133f342b
239 | Extracted files are located at: /tmp/onefold_mongo/complex_users/data/1
240 | Destination Tables: complex_users complex_users_work_history complex_users_hobbies
241 | Schema is stored in Mongo test.complex_users_schema
242 | ```
243 | 
244 | In BigQuery, three new tables are created: `complex_users`, `complex_users_hobbies` and `complex_users_work_history`
245 | ```
246 | $ bq ls test
247 |            tableId             Type
248 |  ---------------------------- -------
249 |   complex_users                TABLE
250 |   complex_users_hobbies        TABLE
251 |   complex_users_work_history   TABLE
252 | 
253 | $ bq show test.complex_users
254 | Table mongo-gbq:test.complex_users
255 | 
256 |    Last modified              Schema             Total Rows   Total Bytes   Expiration
257 |  ----------------- ---------------------------- ------------ ------------- ------------
258 |   03 Jan 00:12:48   |- address_city: string      1            166
259 |                     |- address_state: string
260 |                     |- address_street: string
261 |                     |- address_zipcode: string
262 |                     |- age: float
263 |                     |- app_version: string
264 |                     |- id_oid: string
265 |                     |- mobile_carrier: string
266 |                     |- mobile_device: string
267 |                     |- name: string
268 |                     |- utm_campaign: string
269 |                     |- hash_code: string
270 | 
271 | $ bq show test.complex_users_hobbies
272 | Table mongo-gbq:test.complex_users_hobbies
273 | 
274 |    Last modified              Schema              Total Rows   Total Bytes   Expiration
275 |  ----------------- ----------------------------- ------------ ------------- ------------
276 |   03 Jan 00:12:49   |- parent_hash_code: string   2            102
277 |                     |- hash_code: string
278 |                     |- value: string
279 | 
280 | $ bq show test.complex_users_work_history
281 | Table mongo-gbq:test.complex_users_work_history
282 | 
283 |    Last modified              Schema              Total Rows   Total Bytes   Expiration
284 |  ----------------- ----------------------------- ------------ ------------- ------------
285 |   03 Jan 00:12:47   |- parent_hash_code: string   2            212
286 |                     |- hash_code: string
287 |                     |- from: float
288 |                     |- name: string
289 |                     |- to: string
290 | ```
291 | 
292 | You can join parent and child table like:
293 | ```
294 | $ bq query "select * from test.complex_users join test.complex_users_hobbies on test.complex_users.hash_code = test.complex_users_hobbies.parent_hash_code"
295 | ```
296 | 
297 | ## Parameters
298 | 
299 | `--mongo`
300 | MongoDB connectivity URI, e.g. mongodb://127.0.0.1:27017
301 | 
302 | `--source_db`
303 | The MongoDB database name from which to extract data.
304 | 
305 | `--source_collection`
306 | The MongoDB collection name from which to extract data.
307 | 
308 | `--query`
309 | Optional query users can specify when doing extraction. Useful for filtering out only incremental records. See below for some examples.
310 | 
311 | `--tmp_path`
312 | Optional. Path used to store extracted data. Default is `/tmp/onefold_mongo`
313 | 
314 | `--schema_db`
315 | Optional. The MongoDB database name to which schema data is written. Default to the same database as source.
316 | 
317 | `--schema_collection`
318 | Optional. The MongoDB collection to which schema data is written. Default to `[source_collection]_schema`.
319 | 
320 | `--dest_db_name`
321 | Optional. The BigQuery dataset to use.
322 | 
323 | `--dest_table_name`
324 | Optional. The BigQuery table name to use. If not specified, it will use source collection name.
325 | 
326 | `--use_mr`
327 | If this parameter is specified, the program will use MapReduce to generate schema and transform data. If not, the mapper and reducer will be executed as command line using the `cat [input] | mapper | sort | reducer` metaphore. This is useful for small data set and if you just want to get things up and running quickly.
328 | 
329 | `--policy_file`
330 | Use the specified file for policies which you can use to configure required fields, etc. See below for supported policies
331 | 
332 | `--infra_type`
333 | Specify `gcloud` for Google BigQuery
334 | 
335 | `--gcloud_project_id`
336 | Specify the Google Cloud project id
337 | 
338 | `--gcloud_storage_bucket_id`
339 | Specify the bucket ID of the Google Cloud Storage bucket to use for file storage
340 | 
341 | 
342 | ## Policy Manager
343 | 
344 | Policy manager is used to control schema generation. With the policy manager, you can:
345 | 
346 | 1. Specify required fields. If the field is missing, the document is rejected. Rejected documents are saved in `[TMP_PATH]/[collection_name]/rejected` folder.
347 | 2. Enforce data type for certain fields. In the example below, `age` is forced to be integer. So if there is a document that contains non-integer, the field will be null.
348 | 
349 | Example policy file:
350 | 
351 | ```
352 | [
353 |     {
354 |         "key": "last_name",
355 |         "required": true
356 |     },
357 |     {
358 |         "key": "address.zipcode",
359 |         "data_type": "integer"
360 |     }
361 | ]
362 | ```
363 | 
364 | Save the policy file, and pass the policy file in as command line argument via `--policy_file`.
365 | 
366 | 
367 | ## Query Examples
368 | 
369 | To query for charge_id > 1237489:
370 | ```
371 | --query '{"charge_id":{"$gt":1237489}}'
372 | ```
373 | 
374 | To query for _id > 55401a60151a4b1a4f000001:
375 | ```
376 | --query '{"_id": {"$gt":ObjectId("55401a60151a4b1a4f000001")}}'
377 | ```
378 | 
379 | ## Known Issues
380 | 
381 | * There is no easy way to capture records that were updated in MongoDB. We are working on capturing oplog and replay inserts and updates.
382 | * The ways in which the data type of a given changes over time is huge. A field can change from an int, to a string, to an array of string, to an array of mix types, to an array of complex objects over time. We haven't tested all the different combinations, but very interested in support as many as we can. Let us know if you have found a case that we don't support well.
383 | * Currently since BigQuery doesn't support alter-table, we can only support `overwrite` mode.
384 | 
385 | 


--------------------------------------------------------------------------------
/cs_util.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #
 4 | # Author: Jorge Chang
 5 | #
 6 | # Cloud Storage utility - abstraction of all cloud storage related calls.
 7 | # Implementation for Hadoop and Google Cloud Storage provided. Basic functionaliy
 8 | # like mkdir, rmdir and copy_from_local.
 9 | #
10 | 
11 | from onefold_util import execute
12 | 
13 | class CloudStorage:
14 |     
15 |     def rmdir(self, path):
16 |         return 
17 | 
18 |     def mkdir(self, path):
19 |         return
20 |     
21 |     def copy_from_local(self, source_local_file_path, dest_path):
22 |         return
23 | 
24 | 
25 | # HDFS implementation.
26 | class HDFSStorage(CloudStorage):
27 |     
28 |     def rmdir(self, path):
29 |         execute("hadoop fs -rm -r -f %s" % path, ignore_error=True)
30 | 
31 |     def mkdir(self, path):
32 |         execute("hadoop fs -mkdir -p %s" % path, ignore_error=True)
33 |     
34 |     def copy_from_local(self, source_local_file_path, dest_path):
35 |         execute("hadoop fs -copyFromLocal %s %s/" % (source_local_file_path, dest_path))
36 |         
37 | 
38 | # Google Cloud Storage implementation.
39 | class GCloudStorage(CloudStorage):
40 |     
41 |     project_id = None
42 |     bucket_id = None
43 |     
44 |     def __init__(self, project_id, bucket_id):
45 |         self.project_id = project_id
46 |         self.bucket_id = bucket_id
47 | 
48 |     def rmdir(self, path):
49 |         
50 |         print 'rmdir: %s' % (path)
51 | 
52 |         if not path.endswith("/"):
53 |             path = path + "/"
54 |         
55 |         command = "gsutil -m rm -rf gs://%s/%s" % (self.bucket_id, path)
56 |         execute(command, ignore_error=True)
57 | 
58 |     def mkdir(self, path):
59 |         # nothing to do. there are no folders in google cloud storage
60 |         pass
61 |     
62 |     def copy_from_local(self, source_local_file_path, dest_path):
63 | 
64 |         print 'copy_from_local: %s %s' % (source_local_file_path, dest_path)
65 |         
66 |         if not dest_path.endswith("/"):
67 |             dest_path = dest_path + "/"
68 |             
69 |         dest_path = dest_path + source_local_file_path.split("/")[-1]
70 |         
71 |         command = "gsutil -m cp %s gs://%s/%s" % (source_local_file_path, self.bucket_id, dest_path)
72 |         execute(command, ignore_error=False, retry=True)
73 | 


--------------------------------------------------------------------------------
/dw_util.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Author: Jorge Chang
  5 | #
  6 | # See license in LICENSE file.
  7 | #
  8 | # Data warehouse utility - interface to DataWarehouse + implementation for Hive.
  9 | # basic functionaliy like create table, update table, list tables, execute queries / DMLs, etc.
 10 | #
 11 | 
 12 | import abc
 13 | import json
 14 | import pprint
 15 | import re
 16 | 
 17 | from onefold_util import execute, execute_and_read
 18 | 
 19 | 
 20 | class DataWarehouse:
 21 |   __metaclass__ = abc.ABCMeta
 22 | 
 23 |   @abc.abstractmethod
 24 |   def create_dataset(self, database_name):
 25 |     return
 26 | 
 27 |   @abc.abstractmethod
 28 |   def delete_dataset(self, database_name):
 29 |     return
 30 | 
 31 |   @abc.abstractmethod
 32 |   def create_table(self, database_name, table_name, schema_fields, process_array):
 33 |     return
 34 | 
 35 |   @abc.abstractmethod
 36 |   def update_table(self, database_name, table_name, schema_fields):
 37 |     return
 38 | 
 39 |   @abc.abstractmethod
 40 |   def delete_table(self, database_name, table_name):
 41 |     return
 42 | 
 43 |   @abc.abstractmethod
 44 |   def get_num_rows(self, database_name, table_name):
 45 |     return
 46 | 
 47 |   @abc.abstractmethod
 48 |   def table_exists(self, database_name, table_name):
 49 |     return
 50 | 
 51 |   @abc.abstractmethod
 52 |   def get_table_schema(self, database_name, table_name):
 53 |     return
 54 | 
 55 |   @abc.abstractmethod
 56 |   def get_job_state(self, job_id):
 57 |     return
 58 | 
 59 |   @abc.abstractmethod
 60 |   def list_tables(self, database_name, table_prefix):
 61 |     return
 62 | 
 63 |   @abc.abstractmethod
 64 |   def load_table(self, table_name, file_path):
 65 |     return
 66 | 
 67 |   @abc.abstractmethod
 68 |   def query(self, query):
 69 |     return
 70 | 
 71 | 
 72 | class Hive(DataWarehouse):
 73 | 
 74 |   host = None
 75 |   port = None
 76 |   hive_serdes_path = None
 77 | 
 78 |   def __init__(self, host, port, hive_serdes_path):
 79 |     print '-- Initializing Hive Util --'
 80 |     self.host = host
 81 |     self.port = port
 82 |     self.hive_serdes_path = hive_serdes_path
 83 | 
 84 |   def execute_sql (self, database_name, sql, fetch_result = False):
 85 |     import pyhs2
 86 |     conn = pyhs2.connect(host=self.host, port=self.port, authMechanism="NOSASL", database='default')
 87 | 
 88 |     # turn on tez and add serde jar
 89 |     c = conn.cursor()
 90 |     c.execute("set hive.execution.engine=tez")
 91 |     c.execute("set hive.cache.expr.evaluation=false")
 92 |     c.execute("add jar %s" % self.hive_serdes_path)
 93 | 
 94 |     if database_name != None:
 95 |       c.execute("use %s" % database_name)
 96 | 
 97 |     # run actual command command
 98 |     print "Executing HiveQL: %s" % (sql)
 99 |     c.execute(sql)
100 | 
101 |     output = []
102 |     if fetch_result:
103 |       rows = c.fetchall()
104 |       for row in rows:
105 |         output.append(row)
106 | 
107 |     c.close()
108 |     conn.close()
109 | 
110 |     return output
111 | 
112 |   def create_dataset(self, database_name):
113 |     pass
114 | 
115 |   def delete_dataset(self, database_name):
116 |     pass
117 | 
118 |   def create_table(self, database_name, table_name, schema_fields, process_array = "child_table"):
119 | 
120 |     # used to keep track of table_name -> column list
121 |     table_columns = {}
122 | 
123 |     for field in schema_fields:
124 |       data_type = None
125 | 
126 |       if field['data_type'] == 'string':
127 |         data_type = 'string'
128 |       elif field['data_type'] in ('timestamp', 'boolean'):
129 |         data_type = field['type']
130 |       elif field['data_type'] == 'float':
131 |         data_type = 'double'
132 |       elif field['data_type'] ==  'integer':
133 |         data_type = 'int'
134 |       elif field['data_type'] in ('record'):
135 |         # ignore record
136 |         pass
137 |       else:
138 |         raise Exception("Unsupported data type %s for column %s" % (field['data_type'], field['key']))
139 | 
140 |       if data_type is not None:
141 |         if field['mode'] == 'repeated':
142 |           if process_array == "child_table":
143 |             child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower()
144 |             column_name = "value"
145 |           else:
146 |             continue
147 |         else:
148 |           if "." in field['key']:
149 |             if process_array == "child_table":
150 |               child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower()
151 |               column_name = field['key'].rsplit(".",1)[1]
152 |               print "  Child Table column:" + column_name
153 |             else:
154 |               child_table_name = table_name
155 |               column_name = field['key'].split(".",1)[0]
156 |               data_type = "string"
157 |               print "  Inline column:" + column_name
158 |           else:
159 |             child_table_name = table_name
160 |             column_name = field['key']
161 | 
162 |         if child_table_name not in table_columns:
163 |           table_columns[child_table_name] = set()
164 |           if child_table_name != table_name:
165 |             table_columns[child_table_name].add("%s %s" % ("parent_hash_code", "string"))
166 |             table_columns[child_table_name].add("%s %s" % ("hash_code", "string"))
167 | 
168 |         table_columns[child_table_name].add("`%s` %s" % (column_name, data_type))
169 | 
170 |     for table_name, columns in table_columns.iteritems():
171 |       sql = "create table `%s` (%s) ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe' " % (table_name, ",".join(columns))
172 |       self.execute_sql(database_name, sql)
173 | 
174 |     return table_columns.keys()
175 | 
176 |   def update_table(self, database_name, table_name, schema_fields):
177 | 
178 |     # current columns
179 |     table_names = self.list_tables(database_name, table_name)
180 |     current_table_columns = {}
181 |     for table_name in table_names:
182 |       current_columns = {}
183 |       current_schema = self.get_table_schema(database_name, table_name)
184 |       for field in current_schema:
185 |         current_columns[field['key']] = field['data_type']
186 |       current_table_columns[table_name] = current_columns
187 | 
188 |     # used to keep track of table_name -> column list
189 |     new_table_columns = {}
190 | 
191 |     alter_sqls = []
192 |     modify_instructions = {}
193 | 
194 |     for field in schema_fields:
195 | 
196 |       # print "processing field %s" % str(field)
197 |       sql_data_type = None
198 | 
199 |       if field['data_type'] == 'string':
200 |         sql_data_type = 'string'
201 |       elif field['data_type'] in ('timestamp', 'boolean'):
202 |         sql_data_type = field['type']
203 |       elif field['data_type'] == 'float':
204 |         sql_data_type = 'double'
205 |       elif field['data_type'] ==  'integer':
206 |         sql_data_type = 'int'
207 |       elif field['data_type'] in ('record'):
208 |         # ignore record
209 |         pass
210 |       else:
211 |         raise Exception("Unsupported data type %s for column %s" % (field['data_type'], field['key']))
212 | 
213 |       if sql_data_type is not None:
214 | 
215 |         if field['mode'] == 'repeated':
216 |           child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower()
217 |           column_name = "value"
218 |         else:
219 |           if "." in field['key']:
220 |             child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower()
221 |             column_name = field['key'].rsplit(".",1)[1]
222 |           else:
223 |             child_table_name = table_name
224 |             column_name = field['key']
225 | 
226 |         # print "column name %s" % column_name
227 |         if child_table_name in current_table_columns:
228 |           current_columns = current_table_columns[child_table_name]
229 |           if column_name in current_columns:
230 |             print "  column %s found in current table schema." % column_name
231 |             if field['data_type'].lower() != current_columns[column_name].lower():
232 |               print "  but data type is different. new: %s old: %s" % (field['data_type'], current_columns[column_name])
233 |               if child_table_name not in modify_instructions:
234 |                 modify_instructions[child_table_name] = {}
235 |               modify_instructions[child_table_name][column_name] = sql_data_type
236 |             else:
237 |               print "  data type is same.. no-op."
238 |               pass
239 |           else:
240 |             print "  column %s not found in current table schema." % column_name
241 |             alter_sqls.append ("alter table `%s` add columns (`%s` %s)" % (child_table_name, column_name, sql_data_type))
242 | 
243 |         else:
244 |           # new table needed
245 |           if child_table_name not in new_table_columns:
246 |             new_table_columns[child_table_name] = []
247 |             new_table_columns[child_table_name].append("%s %s" % ("parent_hash_code", "string"))
248 |             new_table_columns[child_table_name].append("%s %s" % ("hash_code", "string"))
249 |           new_table_columns[child_table_name].append("`%s` %s" % (column_name, sql_data_type))
250 | 
251 |     # generate sqls to modify column data type
252 |     modify_sqls = []
253 |     for child_table_name, modify_columns in modify_instructions.iteritems():
254 |       for modify_column_name, data_type in modify_columns.iteritems():
255 |         modify_sqls.append("alter table `%s` change `%s` `%s` %s" % (child_table_name, modify_column_name, modify_column_name, data_type))
256 | 
257 |     # execute alter table to change data type
258 |     for sql in modify_sqls:
259 |       self.execute_sql(database_name, sql)
260 | 
261 |     # execute alter table to add columns
262 |     for sql in alter_sqls:
263 |       self.execute_sql(database_name, sql)
264 | 
265 |     # create new tables
266 |     for child_table_name, columns in new_table_columns.iteritems():
267 |       sql = "create table `%s` (%s) ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe' " % (child_table_name, ",".join(columns))
268 |       self.execute_sql(database_name, sql)
269 | 
270 |     return table_names + new_table_columns.keys()
271 | 
272 |   def delete_table(self, database_name, table_name):
273 |     sql = "drop table if exists `%s`" % (table_name)
274 |     self.execute_sql(database_name, sql, False)
275 | 
276 |     child_table_names = self.list_tables(database_name, table_name)
277 |     for child_table_name in child_table_names:
278 |       sql = "drop table if exists `%s`" % (child_table_name)
279 |       self.execute_sql(database_name, sql, False)
280 | 
281 |   def get_num_rows(self, database_name, table_name):
282 |     sql = "select count(*) from `%s`" % (table_name)
283 |     r = self.execute_sql(database_name, sql, True)
284 |     return r[0][0]
285 | 
286 |   def table_exists(self, database_name, table_name):
287 |     r = self.execute_sql(database_name, "show tables", True)
288 |     for row in r:
289 |       if row[0] == table_name:
290 |         return True
291 | 
292 |     return False
293 | 
294 |   def get_table_schema(self, database_name, table_name):
295 | 
296 |     sql = "desc %s" % (table_name)
297 |     r = self.execute_sql(database_name, sql, True)
298 | 
299 |     fields = []
300 |     for row in r:
301 |       d = {}
302 |       if 'string' in row[1]:
303 |         d['data_type'] = 'string'
304 |       elif 'float' in row[1] or 'double' in row[1]:
305 |         d['data_type'] = 'float'
306 |       elif 'int' in row[1] or 'bigint' in row[1]:
307 |         d['data_type'] = 'integer'
308 |       elif 'timestamp' in row[1]:
309 |         d['data_type'] = 'timestamp'
310 |       elif 'boolean' in row[1]:
311 |         d['data_type'] = 'boolean'
312 | 
313 |       d['key'] = row[0]
314 |       d['mode'] = 'nullable'
315 |       fields.append(d)
316 | 
317 |     return fields
318 | 
319 |   def get_job_state(self, job_id):
320 | 
321 |     job_state = None
322 |     job_result = None
323 |     job_error_message = None
324 |     job_error_reason = None
325 |     job_output_rows = 0
326 | 
327 |     return (job_state, job_result, job_error_message, job_error_reason, job_output_rows)
328 | 
329 | 
330 |   def list_tables(self, database_name, table_prefix):
331 |     sql = "show tables"
332 |     r = self.execute_sql(database_name, sql, True)
333 |     output = []
334 |     for row in r:
335 |       if row[0].startswith(table_prefix):
336 |         output.append(row[0])
337 |     return output
338 | 
339 |   def load_table(self, database_name, table_name, file_path):
340 |     sql = "load data inpath '%s*' into table `%s`" % (file_path, table_name)
341 |     self.execute_sql(database_name, sql, fetch_result = False)
342 | 
343 |   def query(self, database_name, query):
344 |     result = self.execute_sql(database_name, query, True)
345 |     output = {}
346 |     output['rows'] = []
347 |     for r in result:
348 |       f = []
349 |       for i in r:
350 |         f.append({"v": i})
351 |       output['rows'].append({"f": f})
352 | 
353 |     return output
354 | 
355 | 
356 | # Implementation for Google BigQuery
357 | class GBigQuery(DataWarehouse):
358 | 
359 |   project_id = None
360 |   bucket_id = None
361 | 
362 |   def __init__(self, project_id, bucket_id):
363 |     print '-- Initializing Google BigQuery module --'
364 |     self.project_id = project_id
365 |     self.bucket_id = bucket_id 
366 | 
367 |   def create_dataset(self, database_name):
368 |     command = "bq --project_id %s mk %s" % (self.project_id, database_name)
369 |     execute(command, ignore_error=True)
370 | 
371 |   def delete_dataset(self, database_name):
372 |     pass
373 | 
374 |   def create_table(self, database_name, table_name, schema_fields, process_array = "child_table"):
375 |     
376 |     table_columns = {}
377 | 
378 |     for field in schema_fields:
379 |       data_type = field['data_type']
380 | 
381 |       # ignore record
382 |       if field['data_type'] in ('record'):
383 |         continue
384 | 
385 |       if data_type is not None:
386 |         if field['mode'] == 'repeated':
387 |           if process_array == "child_table":
388 |             child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower()
389 |             column_name = "value"
390 |           else:
391 |             continue
392 |         else:
393 |           if "." in field['key']:
394 |             if process_array == "child_table":
395 |               child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower()
396 |               column_name = field['key'].rsplit(".",1)[1]
397 |               print "  Child Table column:" + column_name
398 |             else:
399 |               child_table_name = table_name
400 |               column_name = field['key'].split(".",1)[0]
401 |               data_type = "string"
402 |               print "  Inline column:" + column_name
403 |           else:
404 |             child_table_name = table_name
405 |             column_name = field['key']
406 | 
407 |         if child_table_name not in table_columns:
408 |           table_columns[child_table_name] = []
409 |           if child_table_name != table_name:
410 |             table_columns[child_table_name].append({"name": "parent_hash_code", "type": "string", "mode": "nullable"})
411 |             table_columns[child_table_name].append({"name": "hash_code", "type": "string", "mode": "nullable"})
412 | 
413 |         table_columns[child_table_name].append({"name": column_name, "type": data_type, "mode": "nullable"})
414 | 
415 |     for table_name, columns in table_columns.iteritems():
416 | 
417 |       # create schema file
418 |       schema_file_name = table_name + "_schema.json"
419 |       schema_json = json.dumps(columns)
420 |       schema_file = open(schema_file_name, "w")
421 |       schema_file.write(schema_json)
422 |       schema_file.close()
423 | 
424 |       # execute create-table command
425 |       command = "bq --project_id %s mk --schema %s %s.%s" % (self.project_id, schema_file_name,
426 |                                                              database_name, table_name)
427 |       execute(command)
428 | 
429 |     return table_columns.keys()
430 | 
431 |   def update_table(self, database_name, table_name, schema_fields):
432 |     # Currently BigQuery doesn't support update table
433 |     raise Exception("BigQuery doesn't support table update.")
434 | 
435 |   def delete_table(self, database_name, table_name):
436 |     command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, table_name)
437 |     execute(command, ignore_error=True)
438 |       
439 |     child_table_names = self.list_tables(database_name, table_name)
440 |     for child_table_name in child_table_names:
441 |       command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, child_table_name)
442 |       execute(command, ignore_error=True)
443 |       # sql = "drop table if exists `%s`" % (child_table_name)
444 |       # self.execute_sql(database_name, sql, False)
445 | 
446 |   def get_num_rows(self, database_name, table_name):
447 |     sql = "select count(*) from `%s`" % (table_name)
448 |     r = self.execute_sql(database_name, sql, True)
449 |     return r[0][0]
450 | 
451 |   def table_exists(self, database_name, table_name):
452 |       
453 |     all_tables = self.list_tables(database_name, table_name)
454 |     for table in all_tables:
455 |         if table == table_name:
456 |             return True
457 |     
458 |     return False      
459 | 
460 |   def get_table_schema(self, database_name, table_name):
461 |     pass
462 | 
463 |   def get_job_state(self, job_id):
464 |     job_state = None
465 |     job_result = None
466 |     job_error_message = None
467 |     job_error_reason = None
468 |     job_output_rows = 0
469 | 
470 |     return (job_state, job_result, job_error_message, job_error_reason, job_output_rows)
471 | 
472 |   def list_tables(self, database_name, table_prefix):
473 |     output = []
474 |     (rc, stdout_lines, stderr_lines) = execute_and_read("bq --project_id %s --format csv ls %s" % (self.project_id, database_name))
475 |     for line in stdout_lines:
476 |       table_name = line.split(",")[0]
477 |       if table_name.startswith(table_prefix):
478 |         output.append(table_name)
479 |     return output
480 | 
481 |   def load_table(self, database_name, table_name, file_path):
482 |     command = "bq --project_id %s --nosync load --source_format NEWLINE_DELIMITED_JSON %s.%s gs://%s/%s*" % \
483 |                   (self.project_id, database_name, table_name, self.bucket_id, file_path)
484 |     execute(command)
485 | 
486 |   def query(self, database_name, query):
487 |     pass
488 | 


--------------------------------------------------------------------------------
/java/HiveSerdes/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <groupId>com.cloudera.serde</groupId>
  5 |   <artifactId>hive-serdes</artifactId>
  6 |   <name>hive-serdes</name>
  7 |   <version>1.0-SNAPSHOT</version>
  8 |   <url>http://www.cloudera.com</url>
  9 |   <build>
 10 |     <pluginManagement>
 11 |       <plugins>
 12 |         <plugin>
 13 |           <artifactId>maven-compiler-plugin</artifactId>
 14 |           <version>2.3.2</version>
 15 |           <configuration>
 16 |             <source>1.6</source>
 17 |             <target>1.6</target>
 18 |           </configuration>
 19 |         </plugin>
 20 |       </plugins>
 21 |     </pluginManagement>
 22 |     <plugins>
 23 |       <plugin>
 24 |         <artifactId>maven-eclipse-plugin</artifactId>
 25 |         <version>2.9</version>
 26 |         <configuration>
 27 |           <buildOutputDirectory>eclipse-classes</buildOutputDirectory>
 28 |           <downloadSources>true</downloadSources>
 29 |           <downloadJavadocs>false</downloadJavadocs>
 30 |         </configuration>
 31 |       </plugin>
 32 |       <plugin>
 33 |         <artifactId>maven-shade-plugin</artifactId>
 34 |         <version>1.7.1</version>
 35 |         <executions>
 36 |           <execution>
 37 |             <phase>package</phase>
 38 |             <goals>
 39 |               <goal>shade</goal>
 40 |             </goals>
 41 |           </execution>
 42 |         </executions>
 43 |       </plugin>
 44 |     </plugins>
 45 |   </build>
 46 |   <repositories>
 47 |     <repository>
 48 |       <releases />
 49 |       <snapshots>
 50 |         <enabled>false</enabled>
 51 |       </snapshots>
 52 |       <id>cloudera</id>
 53 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
 54 |     </repository>
 55 |   </repositories>
 56 |   <dependencies>
 57 |     <dependency>
 58 |       <groupId>junit</groupId>
 59 |       <artifactId>junit</artifactId>
 60 |       <version>4.8.2</version>
 61 |       <scope>test</scope>
 62 |     </dependency>
 63 |     <dependency>
 64 |       <groupId>org.apache.hive</groupId>
 65 |       <artifactId>hive-serde</artifactId>
 66 |       <version>0.10.0-cdh4.2.0</version>
 67 |       <scope>provided</scope>
 68 |       <exclusions>
 69 |         <exclusion>
 70 |           <artifactId>hive-common</artifactId>
 71 |           <groupId>org.apache.hive</groupId>
 72 |         </exclusion>
 73 |         <exclusion>
 74 |           <artifactId>hive-shims</artifactId>
 75 |           <groupId>org.apache.hive</groupId>
 76 |         </exclusion>
 77 |         <exclusion>
 78 |           <artifactId>slf4j-api</artifactId>
 79 |           <groupId>org.slf4j</groupId>
 80 |         </exclusion>
 81 |         <exclusion>
 82 |           <artifactId>slf4j-log4j12</artifactId>
 83 |           <groupId>org.slf4j</groupId>
 84 |         </exclusion>
 85 |         <exclusion>
 86 |           <artifactId>mockito-all</artifactId>
 87 |           <groupId>org.mockito</groupId>
 88 |         </exclusion>
 89 |         <exclusion>
 90 |           <artifactId>libthrift</artifactId>
 91 |           <groupId>org.apache.thrift</groupId>
 92 |         </exclusion>
 93 |         <exclusion>
 94 |           <artifactId>libfb303</artifactId>
 95 |           <groupId>org.apache.thrift</groupId>
 96 |         </exclusion>
 97 |         <exclusion>
 98 |           <artifactId>commons-lang</artifactId>
 99 |           <groupId>commons-lang</groupId>
100 |         </exclusion>
101 |         <exclusion>
102 |           <artifactId>commons-logging</artifactId>
103 |           <groupId>commons-logging</groupId>
104 |         </exclusion>
105 |         <exclusion>
106 |           <artifactId>commons-logging-api</artifactId>
107 |           <groupId>commons-logging</groupId>
108 |         </exclusion>
109 |         <exclusion>
110 |           <artifactId>commons-codec</artifactId>
111 |           <groupId>commons-codec</groupId>
112 |         </exclusion>
113 |         <exclusion>
114 |           <artifactId>avro</artifactId>
115 |           <groupId>org.apache.avro</groupId>
116 |         </exclusion>
117 |         <exclusion>
118 |           <artifactId>avro-mapred</artifactId>
119 |           <groupId>org.apache.avro</groupId>
120 |         </exclusion>
121 |       </exclusions>
122 |     </dependency>
123 |     <dependency>
124 |       <groupId>org.apache.hadoop</groupId>
125 |       <artifactId>hadoop-common</artifactId>
126 |       <version>2.0.0-cdh4.2.0</version>
127 |       <scope>provided</scope>
128 |       <exclusions>
129 |         <exclusion>
130 |           <artifactId>hadoop-annotations</artifactId>
131 |           <groupId>org.apache.hadoop</groupId>
132 |         </exclusion>
133 |         <exclusion>
134 |           <artifactId>guava</artifactId>
135 |           <groupId>com.google.guava</groupId>
136 |         </exclusion>
137 |         <exclusion>
138 |           <artifactId>commons-cli</artifactId>
139 |           <groupId>commons-cli</groupId>
140 |         </exclusion>
141 |         <exclusion>
142 |           <artifactId>commons-math</artifactId>
143 |           <groupId>org.apache.commons</groupId>
144 |         </exclusion>
145 |         <exclusion>
146 |           <artifactId>xmlenc</artifactId>
147 |           <groupId>xmlenc</groupId>
148 |         </exclusion>
149 |         <exclusion>
150 |           <artifactId>commons-httpclient</artifactId>
151 |           <groupId>commons-httpclient</groupId>
152 |         </exclusion>
153 |         <exclusion>
154 |           <artifactId>commons-io</artifactId>
155 |           <groupId>commons-io</groupId>
156 |         </exclusion>
157 |         <exclusion>
158 |           <artifactId>commons-net</artifactId>
159 |           <groupId>commons-net</groupId>
160 |         </exclusion>
161 |         <exclusion>
162 |           <artifactId>servlet-api</artifactId>
163 |           <groupId>javax.servlet</groupId>
164 |         </exclusion>
165 |         <exclusion>
166 |           <artifactId>jetty</artifactId>
167 |           <groupId>org.mortbay.jetty</groupId>
168 |         </exclusion>
169 |         <exclusion>
170 |           <artifactId>jetty-util</artifactId>
171 |           <groupId>org.mortbay.jetty</groupId>
172 |         </exclusion>
173 |         <exclusion>
174 |           <artifactId>jersey-core</artifactId>
175 |           <groupId>com.sun.jersey</groupId>
176 |         </exclusion>
177 |         <exclusion>
178 |           <artifactId>jersey-json</artifactId>
179 |           <groupId>com.sun.jersey</groupId>
180 |         </exclusion>
181 |         <exclusion>
182 |           <artifactId>jersey-server</artifactId>
183 |           <groupId>com.sun.jersey</groupId>
184 |         </exclusion>
185 |         <exclusion>
186 |           <artifactId>jasper-compiler</artifactId>
187 |           <groupId>tomcat</groupId>
188 |         </exclusion>
189 |         <exclusion>
190 |           <artifactId>jasper-runtime</artifactId>
191 |           <groupId>tomcat</groupId>
192 |         </exclusion>
193 |         <exclusion>
194 |           <artifactId>jsp-api</artifactId>
195 |           <groupId>javax.servlet.jsp</groupId>
196 |         </exclusion>
197 |         <exclusion>
198 |           <artifactId>commons-el</artifactId>
199 |           <groupId>commons-el</groupId>
200 |         </exclusion>
201 |         <exclusion>
202 |           <artifactId>log4j</artifactId>
203 |           <groupId>log4j</groupId>
204 |         </exclusion>
205 |         <exclusion>
206 |           <artifactId>jets3t</artifactId>
207 |           <groupId>net.java.dev.jets3t</groupId>
208 |         </exclusion>
209 |         <exclusion>
210 |           <artifactId>commons-configuration</artifactId>
211 |           <groupId>commons-configuration</groupId>
212 |         </exclusion>
213 |         <exclusion>
214 |           <artifactId>jackson-mapper-asl</artifactId>
215 |           <groupId>org.codehaus.jackson</groupId>
216 |         </exclusion>
217 |         <exclusion>
218 |           <artifactId>kfs</artifactId>
219 |           <groupId>net.sf.kosmosfs</groupId>
220 |         </exclusion>
221 |         <exclusion>
222 |           <artifactId>protobuf-java</artifactId>
223 |           <groupId>com.google.protobuf</groupId>
224 |         </exclusion>
225 |         <exclusion>
226 |           <artifactId>hadoop-auth</artifactId>
227 |           <groupId>org.apache.hadoop</groupId>
228 |         </exclusion>
229 |         <exclusion>
230 |           <artifactId>jsch</artifactId>
231 |           <groupId>com.jcraft</groupId>
232 |         </exclusion>
233 |         <exclusion>
234 |           <artifactId>zookeeper</artifactId>
235 |           <groupId>org.apache.zookeeper</groupId>
236 |         </exclusion>
237 |         <exclusion>
238 |           <artifactId>commons-codec</artifactId>
239 |           <groupId>commons-codec</groupId>
240 |         </exclusion>
241 |         <exclusion>
242 |           <artifactId>commons-logging</artifactId>
243 |           <groupId>commons-logging</groupId>
244 |         </exclusion>
245 |         <exclusion>
246 |           <artifactId>commons-lang</artifactId>
247 |           <groupId>commons-lang</groupId>
248 |         </exclusion>
249 |         <exclusion>
250 |           <artifactId>slf4j-api</artifactId>
251 |           <groupId>org.slf4j</groupId>
252 |         </exclusion>
253 |         <exclusion>
254 |           <artifactId>slf4j-log4j12</artifactId>
255 |           <groupId>org.slf4j</groupId>
256 |         </exclusion>
257 |         <exclusion>
258 |           <artifactId>mockito-all</artifactId>
259 |           <groupId>org.mockito</groupId>
260 |         </exclusion>
261 |         <exclusion>
262 |           <artifactId>avro</artifactId>
263 |           <groupId>org.apache.avro</groupId>
264 |         </exclusion>
265 |       </exclusions>
266 |     </dependency>
267 |   </dependencies>
268 |   <properties>
269 |     <eclipse.output.directory>eclipse-classes</eclipse.output.directory>
270 |     <hadoop.version>2.0.0-cdh4.2.0</hadoop.version>
271 |     <hive.version>0.10.0-cdh4.2.0</hive.version>
272 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
273 |   </properties>
274 | </project>
275 | 
276 | 


--------------------------------------------------------------------------------
/java/HiveSerdes/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <!--
  4 |   Licensed to the Apache Software Foundation (ASF) under one
  5 |   or more contributor license agreements.  See the NOTICE file
  6 |   distributed with this work for additional information
  7 |   regarding copyright ownership.  The ASF licenses this file
  8 |   to you under the Apache License, Version 2.0 (the
  9 |   "License"); you may not use this file except in compliance
 10 |   with the License.  You may obtain a copy of the License at
 11 | 
 12 |        http://www.apache.org/licenses/LICENSE-2.0
 13 | 
 14 |   Unless required by applicable law or agreed to in writing, software
 15 |   distributed under the License is distributed on an "AS IS" BASIS,
 16 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 |   See the License for the specific language governing permissions and
 18 |   limitations under the License.
 19 |   -->
 20 |   <modelVersion>4.0.0</modelVersion>
 21 | 
 22 |   <groupId>com.cloudera.serde</groupId>
 23 |   <artifactId>hive-serdes</artifactId>
 24 |   <version>1.0-SNAPSHOT</version>
 25 |   <packaging>jar</packaging>
 26 | 
 27 |   <name>hive-serdes</name>
 28 |   <url>http://www.cloudera.com</url>
 29 | 
 30 |   <properties>
 31 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 32 |     <eclipse.output.directory>eclipse-classes</eclipse.output.directory>
 33 |     <hive.version>0.10.0-cdh4.2.0</hive.version>
 34 |     <hadoop.version>2.0.0-cdh4.2.0</hadoop.version>
 35 |   </properties>
 36 | 
 37 |   <build>
 38 |     <plugins>
 39 |       <plugin>
 40 |         <groupId>org.apache.maven.plugins</groupId>
 41 |         <artifactId>maven-eclipse-plugin</artifactId>
 42 |         <version>2.9</version>
 43 |         <configuration>
 44 |           <buildOutputDirectory>eclipse-classes</buildOutputDirectory>
 45 |           <downloadSources>true</downloadSources>
 46 |           <downloadJavadocs>false</downloadJavadocs>
 47 |         </configuration>
 48 |       </plugin>
 49 | 
 50 |       <plugin>
 51 |         <groupId>org.apache.maven.plugins</groupId>
 52 |         <artifactId>maven-shade-plugin</artifactId>
 53 |         <version>1.7.1</version>
 54 |         <executions>
 55 |           <execution>
 56 |             <phase>package</phase>
 57 |             <goals>
 58 |               <goal>shade</goal>
 59 |             </goals>
 60 |           </execution>
 61 |         </executions>
 62 |       </plugin>
 63 |     </plugins>
 64 | 
 65 |     <pluginManagement>
 66 |       <plugins>
 67 |         <plugin>
 68 |           <groupId>org.apache.maven.plugins</groupId>
 69 |           <artifactId>maven-compiler-plugin</artifactId>
 70 |           <version>2.3.2</version>
 71 |           <configuration>
 72 |             <source>1.6</source>
 73 |             <target>1.6</target>
 74 |           </configuration>
 75 |         </plugin>
 76 |       </plugins>
 77 |     </pluginManagement>
 78 |   </build>
 79 | 
 80 |   <dependencies>
 81 |     <dependency>
 82 |       <groupId>junit</groupId>
 83 |       <artifactId>junit</artifactId>
 84 |       <version>4.8.2</version>
 85 |       <scope>test</scope>      
 86 |     </dependency>
 87 | 
 88 |     <!-- JSON Parser -->
 89 |     <dependency>
 90 |       <groupId>org.codehaus.jackson</groupId>
 91 |       <artifactId>jackson-core-asl</artifactId>
 92 |       <version>1.9.8</version>
 93 |     </dependency>
 94 | 
 95 |     <!-- Hadoop Dependencies -->
 96 |     <dependency>
 97 |       <groupId>org.apache.hive</groupId>
 98 |       <artifactId>hive-serde</artifactId>
 99 |       <version>${hive.version}</version>
100 |       <scope>provided</scope>
101 |     </dependency>
102 |     <dependency>
103 |       <groupId>org.apache.hadoop</groupId>
104 |       <artifactId>hadoop-common</artifactId>
105 |       <version>${hadoop.version}</version>
106 |       <scope>provided</scope>
107 |     </dependency>
108 |   </dependencies>
109 | 
110 |   <repositories>
111 |     <repository>
112 |       <id>cloudera</id>
113 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
114 |       <releases>
115 |         <enabled>true</enabled>
116 |       </releases>
117 |       <snapshots>
118 |         <enabled>false</enabled>
119 |       </snapshots>
120 |     </repository>
121 |   </repositories>
122 | </project>
123 | 


--------------------------------------------------------------------------------
/java/HiveSerdes/src/main/java/com/cloudera/hive/serde/JSONSerDe.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to the Apache Software Foundation (ASF) under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  The ASF licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | package com.cloudera.hive.serde;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.Arrays;
 22 | import java.util.HashMap;
 23 | import java.util.List;
 24 | import java.util.Map;
 25 | import java.util.Properties;
 26 | 
 27 | import org.apache.hadoop.conf.Configuration;
 28 | import org.apache.hadoop.hive.serde.serdeConstants;
 29 | import org.apache.hadoop.hive.serde2.SerDe;
 30 | import org.apache.hadoop.hive.serde2.SerDeException;
 31 | import org.apache.hadoop.hive.serde2.SerDeStats;
 32 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 33 | import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 34 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 35 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 36 | import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 37 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 38 | import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
 39 | import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
 40 | import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 41 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 42 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 43 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 44 | import org.apache.hadoop.io.Text;
 45 | import org.apache.hadoop.io.Writable;
 46 | import org.codehaus.jackson.map.ObjectMapper;
 47 | 
 48 | /**
 49 |  * This SerDe can be used for processing JSON data in Hive. It supports
 50 |  * arbitrary JSON data, and can handle all Hive types except for UNION.
 51 |  * However, the JSON data is expected to be a series of discrete records,
 52 |  * rather than a JSON array of objects.
 53 |  * 
 54 |  * The Hive table is expected to contain columns with names corresponding to
 55 |  * fields in the JSON data, but it is not necessary for every JSON field to
 56 |  * have a corresponding Hive column. Those JSON fields will be ignored during
 57 |  * queries.
 58 |  * 
 59 |  * Example:
 60 |  * 
 61 |  * { "a": 1, "b": [ "str1", "str2" ], "c": { "field1": "val1" } }
 62 |  * 
 63 |  * Could correspond to a table:
 64 |  * 
 65 |  * CREATE TABLE foo (a INT, b ARRAY<STRING>, c STRUCT<field1:STRING>);
 66 |  * 
 67 |  * JSON objects can also interpreted as a Hive MAP type, so long as the keys
 68 |  * and values in the JSON object are all of the appropriate types. For example,
 69 |  * in the JSON above, another valid table declaraction would be:
 70 |  * 
 71 |  * CREATE TABLE foo (a INT, b ARRAY<STRING>, c MAP<STRING,STRING>);
 72 |  * 
 73 |  * Only STRING keys are supported for Hive MAPs.
 74 |  */
 75 | public class JSONSerDe implements SerDe {
 76 |   
 77 |   private StructTypeInfo rowTypeInfo;
 78 |   private ObjectInspector rowOI;
 79 |   private List<String> colNames;
 80 |   private List<Object> row = new ArrayList<Object>();
 81 |   
 82 |   /**
 83 |    * An initialization function used to gather information about the table.
 84 |    * Typically, a SerDe implementation will be interested in the list of
 85 |    * column names and their types. That information will be used to help perform
 86 |    * actual serialization and deserialization of data.
 87 |    */
 88 |   @Override
 89 |   public void initialize(Configuration conf, Properties tbl)
 90 |       throws SerDeException {
 91 |     // Get a list of the table's column names.
 92 |     String colNamesStr = tbl.getProperty(serdeConstants.LIST_COLUMNS);
 93 |     colNames = Arrays.asList(colNamesStr.split(","));
 94 |     
 95 |     // Get a list of TypeInfos for the columns. This list lines up with
 96 |     // the list of column names.
 97 |     String colTypesStr = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
 98 |     List<TypeInfo> colTypes =
 99 |         TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr);
100 |     
101 |     rowTypeInfo =
102 |         (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(colNames, colTypes);
103 |     rowOI =
104 |         TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo);
105 |   }
106 | 
107 |   /**
108 |    * This method does the work of deserializing a record into Java objects that
109 |    * Hive can work with via the ObjectInspector interface. For this SerDe, the
110 |    * blob that is passed in is a JSON string, and the Jackson JSON parser is
111 |    * being used to translate the string into Java objects.
112 |    * 
113 |    * The JSON deserialization works by taking the column names in the Hive
114 |    * table, and looking up those fields in the parsed JSON object. If the value
115 |    * of the field is not a primitive, the object is parsed further.
116 |    */
117 |   @Override
118 |   public Object deserialize(Writable blob) throws SerDeException {
119 |     Map<?,?> root = null;
120 |     row.clear();
121 |     try {
122 |       ObjectMapper mapper = new ObjectMapper();
123 |       // This is really a Map<String, Object>. For more information about how
124 |       // Jackson parses JSON in this example, see
125 |       // http://wiki.fasterxml.com/JacksonDataBinding
126 |       root = mapper.readValue(blob.toString(), Map.class);
127 |     } catch (Exception e) {
128 |       throw new SerDeException(e);
129 |     }
130 | 
131 |     // Lowercase the keys as expected by hive
132 |     Map<String, Object> lowerRoot = new HashMap();
133 |     for(Map.Entry entry: root.entrySet()) {
134 |       lowerRoot.put(((String)entry.getKey()).toLowerCase(), entry.getValue());
135 |     }
136 |     root = lowerRoot;
137 |     
138 |     Object value= null;
139 |     for (String fieldName : rowTypeInfo.getAllStructFieldNames()) {
140 |       try {
141 |         TypeInfo fieldTypeInfo = rowTypeInfo.getStructFieldTypeInfo(fieldName);
142 |         value = parseField(root.get(fieldName), fieldTypeInfo);
143 |       } catch (Exception e) {
144 |         value = null;
145 |       }
146 |       row.add(value);
147 |     }
148 |     return row;
149 |   }
150 |   
151 |   /**
152 |    * Parses a JSON object according to the Hive column's type.
153 |    * 
154 |    * @param field - The JSON object to parse
155 |    * @param fieldTypeInfo - Metadata about the Hive column
156 |    * @return - The parsed value of the field
157 |    */
158 |   private Object parseField(Object field, TypeInfo fieldTypeInfo) {
159 |     switch (fieldTypeInfo.getCategory()) {
160 |     case PRIMITIVE:
161 |       // Jackson will return the right thing in this case, so just return
162 |       // the object
163 |       if (field instanceof String) {
164 |         field = field.toString().replaceAll("\n", "\\\\n");
165 |       }
166 |       return field;
167 |     case LIST:
168 |       return parseList(field, (ListTypeInfo) fieldTypeInfo);
169 |     case MAP:
170 |       return parseMap(field, (MapTypeInfo) fieldTypeInfo);
171 |     case STRUCT:
172 |       return parseStruct(field, (StructTypeInfo) fieldTypeInfo);
173 |     case UNION:
174 |       // Unsupported by JSON
175 |     default:
176 |       return null;
177 |     }
178 |   }
179 |   
180 |   /**
181 |    * Parses a JSON object and its fields. The Hive metadata is used to
182 |    * determine how to parse the object fields.
183 |    * 
184 |    * @param field - The JSON object to parse
185 |    * @param fieldTypeInfo - Metadata about the Hive column
186 |    * @return - A map representing the object and its fields
187 |    */
188 |   private Object parseStruct(Object field, StructTypeInfo fieldTypeInfo) {
189 |     Map<Object,Object> map = (Map<Object,Object>)field;
190 |     ArrayList<TypeInfo> structTypes = fieldTypeInfo.getAllStructFieldTypeInfos();
191 |     ArrayList<String> structNames = fieldTypeInfo.getAllStructFieldNames();
192 | 
193 |     List<Object> structRow = new ArrayList<Object>(structTypes.size());
194 |     if (map != null) {
195 |         for (int i = 0; i < structNames.size(); i++) {
196 |             structRow.add(parseField(map.get(structNames.get(i)), structTypes.get(i)));
197 |         }
198 |     }
199 |     return structRow;
200 |   }
201 | 
202 |   /**
203 |    * Parse a JSON list and its elements. This uses the Hive metadata for the
204 |    * list elements to determine how to parse the elements.
205 |    * 
206 |    * @param field - The JSON list to parse
207 |    * @param fieldTypeInfo - Metadata about the Hive column
208 |    * @return - A list of the parsed elements
209 |    */
210 |   private Object parseList(Object field, ListTypeInfo fieldTypeInfo) {
211 |     ArrayList<Object> list = (ArrayList<Object>) field;
212 |     TypeInfo elemTypeInfo = fieldTypeInfo.getListElementTypeInfo();
213 |     if (list != null) {
214 |         for (int i = 0; i < list.size(); i++) {
215 |             list.set(i, parseField(list.get(i), elemTypeInfo));
216 |         }
217 |     }
218 |     return list.toArray();
219 |   }
220 | 
221 |   /**
222 |    * Parse a JSON object as a map. This uses the Hive metadata for the map
223 |    * values to determine how to parse the values. The map is assumed to have
224 |    * a string for a key.
225 |    * 
226 |    * @param field - The JSON list to parse
227 |    * @param fieldTypeInfo - Metadata about the Hive column
228 |    * @return
229 |    */
230 |   private Object parseMap(Object field, MapTypeInfo fieldTypeInfo) {
231 |     Map<Object,Object> map = (Map<Object,Object>) field;
232 |     TypeInfo valueTypeInfo = fieldTypeInfo.getMapValueTypeInfo();
233 |     if (map != null) {
234 |         for (Map.Entry<Object, Object> entry : map.entrySet()) {
235 |             map.put(entry.getKey(), parseField(entry.getValue(), valueTypeInfo));
236 |         }
237 |     }
238 |     return map;
239 |   }
240 | 
241 |   /**
242 |    * Return an ObjectInspector for the row of data
243 |    */
244 |   @Override
245 |   public ObjectInspector getObjectInspector() throws SerDeException {
246 |     return rowOI;
247 |   }
248 | 
249 |   /**
250 |    * Unimplemented
251 |    */
252 |   @Override
253 |   public SerDeStats getSerDeStats() {
254 |     return null;
255 |   }
256 | 
257 |   /**
258 |    * JSON is just a textual representation, so our serialized class
259 |    * is just Text.
260 |    */
261 |   @Override
262 |   public Class<? extends Writable> getSerializedClass() {
263 |     return Text.class;
264 |   }
265 | 
266 |   /**
267 |    * This method takes an object representing a row of data from Hive, and uses
268 |    * the ObjectInspector to get the data for each column and serialize it. This
269 |    * implementation deparses the row into an object that Jackson can easily
270 |    * serialize into a JSON blob.
271 |    */
272 |   @Override
273 |   public Writable serialize(Object obj, ObjectInspector oi)
274 |       throws SerDeException {
275 |     Object deparsedObj = deparseRow(obj, oi);
276 |     ObjectMapper mapper = new ObjectMapper();
277 |     try {
278 |       // Let Jackson do the work of serializing the object
279 |       return new Text(mapper.writeValueAsString(deparsedObj));
280 |     } catch (Exception e) {
281 |       throw new SerDeException(e);
282 |     }
283 |   }
284 | 
285 |   /**
286 |    * Deparse a Hive object into a Jackson-serializable object. This uses
287 |    * the ObjectInspector to extract the column data.
288 |    * 
289 |    * @param obj - Hive object to deparse
290 |    * @param oi - ObjectInspector for the object
291 |    * @return - A deparsed object
292 |    */
293 |   private Object deparseObject(Object obj, ObjectInspector oi) {
294 |     switch (oi.getCategory()) {
295 |     case LIST:
296 |       return deparseList(obj, (ListObjectInspector)oi);
297 |     case MAP:
298 |       return deparseMap(obj, (MapObjectInspector)oi);
299 |     case PRIMITIVE:
300 |       return deparsePrimitive(obj, (PrimitiveObjectInspector)oi);
301 |     case STRUCT:
302 |       return deparseStruct(obj, (StructObjectInspector)oi, false);
303 |     case UNION:
304 |       // Unsupported by JSON
305 |     default:
306 |       return null;
307 |     }
308 |   }
309 |   
310 |   /**
311 |    * Deparses a row of data. We have to treat this one differently from
312 |    * other structs, because the field names for the root object do not match
313 |    * the column names for the Hive table.
314 |    * 
315 |    * @param obj - Object representing the top-level row
316 |    * @param structOI - ObjectInspector for the row
317 |    * @return - A deparsed row of data
318 |    */
319 |   private Object deparseRow(Object obj, ObjectInspector structOI) {
320 |     return deparseStruct(obj, (StructObjectInspector)structOI, true);
321 |   }
322 | 
323 |   /**
324 |    * Deparses struct data into a serializable JSON object.
325 |    * 
326 |    * @param obj - Hive struct data
327 |    * @param structOI - ObjectInspector for the struct
328 |    * @param isRow - Whether or not this struct represents a top-level row
329 |    * @return - A deparsed struct
330 |    */
331 |   private Object deparseStruct(Object obj,
332 |                                StructObjectInspector structOI,
333 |                                boolean isRow) {
334 |     Map<Object,Object> struct = new HashMap<Object,Object>();
335 |     List<? extends StructField> fields = structOI.getAllStructFieldRefs();
336 |     for (int i = 0; i < fields.size(); i++) {
337 |       StructField field = fields.get(i);
338 |       // The top-level row object is treated slightly differently from other
339 |       // structs, because the field names for the row do not correctly reflect
340 |       // the Hive column names. For lower-level structs, we can get the field
341 |       // name from the associated StructField object.
342 |       String fieldName = isRow ? colNames.get(i) : field.getFieldName();
343 |       ObjectInspector fieldOI = field.getFieldObjectInspector();
344 |       Object fieldObj = structOI.getStructFieldData(obj, field);
345 |       struct.put(fieldName, deparseObject(fieldObj, fieldOI));
346 |     }
347 |     return struct;
348 |   }
349 | 
350 |   /**
351 |    * Deparses a primitive type.
352 |    * 
353 |    * @param obj - Hive object to deparse
354 |    * @param oi - ObjectInspector for the object
355 |    * @return - A deparsed object
356 |    */
357 |   private Object deparsePrimitive(Object obj, PrimitiveObjectInspector primOI) {
358 |     return primOI.getPrimitiveJavaObject(obj);
359 |   }
360 | 
361 |   private Object deparseMap(Object obj, MapObjectInspector mapOI) {
362 |     Map<Object,Object> map = new HashMap<Object,Object>();
363 |     ObjectInspector mapValOI = mapOI.getMapValueObjectInspector();
364 |     Map<?,?> fields = mapOI.getMap(obj);
365 |     for (Map.Entry<?,?> field : fields.entrySet()) {
366 |       Object fieldName = field.getKey();
367 |       Object fieldObj = field.getValue();
368 |       map.put(fieldName, deparseObject(fieldObj, mapValOI));
369 |     }
370 |     return map;
371 |   }
372 | 
373 |   /**
374 |    * Deparses a list and its elements.
375 |    * 
376 |    * @param obj - Hive object to deparse
377 |    * @param oi - ObjectInspector for the object
378 |    * @return - A deparsed object
379 |    */
380 |   private Object deparseList(Object obj, ListObjectInspector listOI) {
381 |     List<Object> list = new ArrayList<Object>();
382 |     List<?> field = listOI.getList(obj);
383 |     ObjectInspector elemOI = listOI.getListElementObjectInspector();
384 |     for (Object elem : field) {
385 |       list.add(deparseObject(elem, elemOI));
386 |     }
387 |     return list;
388 |   }
389 | }
390 | 


--------------------------------------------------------------------------------
/java/MapReduce/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <groupId>com.onefold.hadoop</groupId>
  5 |   <artifactId>MapReduce</artifactId>
  6 |   <name>MapReduce</name>
  7 |   <version>0.0.1-SNAPSHOT</version>
  8 |   <url>http://www.onefold.io</url>
  9 |   <build>
 10 |     <pluginManagement>
 11 |       <plugins>
 12 |         <plugin>
 13 |           <artifactId>maven-compiler-plugin</artifactId>
 14 |           <version>2.3.2</version>
 15 |           <configuration>
 16 |             <source>1.6</source>
 17 |             <target>1.6</target>
 18 |           </configuration>
 19 |         </plugin>
 20 |       </plugins>
 21 |     </pluginManagement>
 22 |     <plugins>
 23 |       <plugin>
 24 |         <artifactId>maven-eclipse-plugin</artifactId>
 25 |         <version>2.9</version>
 26 |         <configuration>
 27 |           <buildOutputDirectory>eclipse-classes</buildOutputDirectory>
 28 |           <downloadSources>true</downloadSources>
 29 |           <downloadJavadocs>false</downloadJavadocs>
 30 |         </configuration>
 31 |       </plugin>
 32 |       <plugin>
 33 |         <artifactId>maven-shade-plugin</artifactId>
 34 |         <version>1.7.1</version>
 35 |         <executions>
 36 |           <execution>
 37 |             <phase>package</phase>
 38 |             <goals>
 39 |               <goal>shade</goal>
 40 |             </goals>
 41 |           </execution>
 42 |         </executions>
 43 |       </plugin>
 44 |     </plugins>
 45 |   </build>
 46 |   <repositories>
 47 |     <repository>
 48 |       <releases>
 49 |         <updatePolicy>always</updatePolicy>
 50 |         <checksumPolicy>warn</checksumPolicy>
 51 |       </releases>
 52 |       <snapshots>
 53 |         <enabled>false</enabled>
 54 |         <updatePolicy>never</updatePolicy>
 55 |         <checksumPolicy>fail</checksumPolicy>
 56 |       </snapshots>
 57 |       <id>HDPReleases</id>
 58 |       <name>HDP Releases</name>
 59 |       <url>http://repo.hortonworks.com/content/repositories/releases/</url>
 60 |     </repository>
 61 |   </repositories>
 62 |   <dependencies>
 63 |     <dependency>
 64 |       <groupId>org.mortbay.jetty</groupId>
 65 |       <artifactId>jetty</artifactId>
 66 |       <version>6.1.26</version>
 67 |       <scope>provided</scope>
 68 |       <exclusions>
 69 |         <exclusion>
 70 |           <artifactId>servlet-api</artifactId>
 71 |           <groupId>org.mortbay.jetty</groupId>
 72 |         </exclusion>
 73 |       </exclusions>
 74 |     </dependency>
 75 |     <dependency>
 76 |       <groupId>org.mortbay.jetty</groupId>
 77 |       <artifactId>jetty-util</artifactId>
 78 |       <version>6.1.26</version>
 79 |       <scope>provided</scope>
 80 |     </dependency>
 81 |     <dependency>
 82 |       <groupId>org.apache.hadoop</groupId>
 83 |       <artifactId>hadoop-mapreduce-client-core</artifactId>
 84 |       <version>2.6.0.2.2.0.0-2041</version>
 85 |       <scope>provided</scope>
 86 |       <exclusions>
 87 |         <exclusion>
 88 |           <artifactId>hadoop-yarn-common</artifactId>
 89 |           <groupId>org.apache.hadoop</groupId>
 90 |         </exclusion>
 91 |         <exclusion>
 92 |           <artifactId>protobuf-java</artifactId>
 93 |           <groupId>com.google.protobuf</groupId>
 94 |         </exclusion>
 95 |         <exclusion>
 96 |           <artifactId>avro</artifactId>
 97 |           <groupId>org.apache.avro</groupId>
 98 |         </exclusion>
 99 |         <exclusion>
100 |           <artifactId>slf4j-api</artifactId>
101 |           <groupId>org.slf4j</groupId>
102 |         </exclusion>
103 |         <exclusion>
104 |           <artifactId>slf4j-log4j12</artifactId>
105 |           <groupId>org.slf4j</groupId>
106 |         </exclusion>
107 |         <exclusion>
108 |           <artifactId>hadoop-annotations</artifactId>
109 |           <groupId>org.apache.hadoop</groupId>
110 |         </exclusion>
111 |         <exclusion>
112 |           <artifactId>guice-servlet</artifactId>
113 |           <groupId>com.google.inject.extensions</groupId>
114 |         </exclusion>
115 |         <exclusion>
116 |           <artifactId>netty</artifactId>
117 |           <groupId>io.netty</groupId>
118 |         </exclusion>
119 |       </exclusions>
120 |     </dependency>
121 |     <dependency>
122 |       <groupId>org.apache.hadoop</groupId>
123 |       <artifactId>hadoop-common</artifactId>
124 |       <version>2.6.0.2.2.0.0-2041</version>
125 |       <scope>provided</scope>
126 |       <exclusions>
127 |         <exclusion>
128 |           <artifactId>guava</artifactId>
129 |           <groupId>com.google.guava</groupId>
130 |         </exclusion>
131 |         <exclusion>
132 |           <artifactId>commons-cli</artifactId>
133 |           <groupId>commons-cli</groupId>
134 |         </exclusion>
135 |         <exclusion>
136 |           <artifactId>commons-math3</artifactId>
137 |           <groupId>org.apache.commons</groupId>
138 |         </exclusion>
139 |         <exclusion>
140 |           <artifactId>xmlenc</artifactId>
141 |           <groupId>xmlenc</groupId>
142 |         </exclusion>
143 |         <exclusion>
144 |           <artifactId>commons-httpclient</artifactId>
145 |           <groupId>commons-httpclient</groupId>
146 |         </exclusion>
147 |         <exclusion>
148 |           <artifactId>commons-codec</artifactId>
149 |           <groupId>commons-codec</groupId>
150 |         </exclusion>
151 |         <exclusion>
152 |           <artifactId>commons-io</artifactId>
153 |           <groupId>commons-io</groupId>
154 |         </exclusion>
155 |         <exclusion>
156 |           <artifactId>commons-net</artifactId>
157 |           <groupId>commons-net</groupId>
158 |         </exclusion>
159 |         <exclusion>
160 |           <artifactId>commons-collections</artifactId>
161 |           <groupId>commons-collections</groupId>
162 |         </exclusion>
163 |         <exclusion>
164 |           <artifactId>servlet-api</artifactId>
165 |           <groupId>javax.servlet</groupId>
166 |         </exclusion>
167 |         <exclusion>
168 |           <artifactId>jersey-core</artifactId>
169 |           <groupId>com.sun.jersey</groupId>
170 |         </exclusion>
171 |         <exclusion>
172 |           <artifactId>jersey-json</artifactId>
173 |           <groupId>com.sun.jersey</groupId>
174 |         </exclusion>
175 |         <exclusion>
176 |           <artifactId>jersey-server</artifactId>
177 |           <groupId>com.sun.jersey</groupId>
178 |         </exclusion>
179 |         <exclusion>
180 |           <artifactId>jasper-compiler</artifactId>
181 |           <groupId>tomcat</groupId>
182 |         </exclusion>
183 |         <exclusion>
184 |           <artifactId>jasper-runtime</artifactId>
185 |           <groupId>tomcat</groupId>
186 |         </exclusion>
187 |         <exclusion>
188 |           <artifactId>jsp-api</artifactId>
189 |           <groupId>javax.servlet.jsp</groupId>
190 |         </exclusion>
191 |         <exclusion>
192 |           <artifactId>commons-el</artifactId>
193 |           <groupId>commons-el</groupId>
194 |         </exclusion>
195 |         <exclusion>
196 |           <artifactId>commons-logging</artifactId>
197 |           <groupId>commons-logging</groupId>
198 |         </exclusion>
199 |         <exclusion>
200 |           <artifactId>log4j</artifactId>
201 |           <groupId>log4j</groupId>
202 |         </exclusion>
203 |         <exclusion>
204 |           <artifactId>jets3t</artifactId>
205 |           <groupId>net.java.dev.jets3t</groupId>
206 |         </exclusion>
207 |         <exclusion>
208 |           <artifactId>microsoft-windowsazure-storage-sdk</artifactId>
209 |           <groupId>com.microsoft.windowsazure.storage</groupId>
210 |         </exclusion>
211 |         <exclusion>
212 |           <artifactId>commons-lang</artifactId>
213 |           <groupId>commons-lang</groupId>
214 |         </exclusion>
215 |         <exclusion>
216 |           <artifactId>commons-configuration</artifactId>
217 |           <groupId>commons-configuration</groupId>
218 |         </exclusion>
219 |         <exclusion>
220 |           <artifactId>jackson-mapper-asl</artifactId>
221 |           <groupId>org.codehaus.jackson</groupId>
222 |         </exclusion>
223 |         <exclusion>
224 |           <artifactId>gson</artifactId>
225 |           <groupId>com.google.code.gson</groupId>
226 |         </exclusion>
227 |         <exclusion>
228 |           <artifactId>hadoop-auth</artifactId>
229 |           <groupId>org.apache.hadoop</groupId>
230 |         </exclusion>
231 |         <exclusion>
232 |           <artifactId>jsch</artifactId>
233 |           <groupId>com.jcraft</groupId>
234 |         </exclusion>
235 |         <exclusion>
236 |           <artifactId>curator-client</artifactId>
237 |           <groupId>org.apache.curator</groupId>
238 |         </exclusion>
239 |         <exclusion>
240 |           <artifactId>curator-recipes</artifactId>
241 |           <groupId>org.apache.curator</groupId>
242 |         </exclusion>
243 |         <exclusion>
244 |           <artifactId>jsr305</artifactId>
245 |           <groupId>com.google.code.findbugs</groupId>
246 |         </exclusion>
247 |         <exclusion>
248 |           <artifactId>htrace-core</artifactId>
249 |           <groupId>org.htrace</groupId>
250 |         </exclusion>
251 |         <exclusion>
252 |           <artifactId>zookeeper</artifactId>
253 |           <groupId>org.apache.zookeeper</groupId>
254 |         </exclusion>
255 |         <exclusion>
256 |           <artifactId>commons-compress</artifactId>
257 |           <groupId>org.apache.commons</groupId>
258 |         </exclusion>
259 |         <exclusion>
260 |           <artifactId>hadoop-annotations</artifactId>
261 |           <groupId>org.apache.hadoop</groupId>
262 |         </exclusion>
263 |         <exclusion>
264 |           <artifactId>slf4j-api</artifactId>
265 |           <groupId>org.slf4j</groupId>
266 |         </exclusion>
267 |         <exclusion>
268 |           <artifactId>slf4j-log4j12</artifactId>
269 |           <groupId>org.slf4j</groupId>
270 |         </exclusion>
271 |         <exclusion>
272 |           <artifactId>avro</artifactId>
273 |           <groupId>org.apache.avro</groupId>
274 |         </exclusion>
275 |         <exclusion>
276 |           <artifactId>protobuf-java</artifactId>
277 |           <groupId>com.google.protobuf</groupId>
278 |         </exclusion>
279 |       </exclusions>
280 |     </dependency>
281 |     <dependency>
282 |       <groupId>org.apache.hive</groupId>
283 |       <artifactId>hive-serde</artifactId>
284 |       <version>0.14.0.2.2.5.1-3</version>
285 |       <scope>provided</scope>
286 |       <exclusions>
287 |         <exclusion>
288 |           <artifactId>hive-common</artifactId>
289 |           <groupId>org.apache.hive</groupId>
290 |         </exclusion>
291 |         <exclusion>
292 |           <artifactId>hive-shims</artifactId>
293 |           <groupId>org.apache.hive</groupId>
294 |         </exclusion>
295 |         <exclusion>
296 |           <artifactId>libthrift</artifactId>
297 |           <groupId>org.apache.thrift</groupId>
298 |         </exclusion>
299 |         <exclusion>
300 |           <artifactId>opencsv</artifactId>
301 |           <groupId>net.sf.opencsv</groupId>
302 |         </exclusion>
303 |         <exclusion>
304 |           <artifactId>commons-codec</artifactId>
305 |           <groupId>commons-codec</groupId>
306 |         </exclusion>
307 |         <exclusion>
308 |           <artifactId>commons-lang</artifactId>
309 |           <groupId>commons-lang</groupId>
310 |         </exclusion>
311 |         <exclusion>
312 |           <artifactId>commons-logging</artifactId>
313 |           <groupId>commons-logging</groupId>
314 |         </exclusion>
315 |         <exclusion>
316 |           <artifactId>avro</artifactId>
317 |           <groupId>org.apache.avro</groupId>
318 |         </exclusion>
319 |         <exclusion>
320 |           <artifactId>slf4j-api</artifactId>
321 |           <groupId>org.slf4j</groupId>
322 |         </exclusion>
323 |         <exclusion>
324 |           <artifactId>slf4j-log4j12</artifactId>
325 |           <groupId>org.slf4j</groupId>
326 |         </exclusion>
327 |       </exclusions>
328 |     </dependency>
329 |     <dependency>
330 |       <groupId>org.apache.hive</groupId>
331 |       <artifactId>hive-exec</artifactId>
332 |       <version>0.14.0.2.2.5.1-3</version>
333 |       <scope>provided</scope>
334 |       <exclusions>
335 |         <exclusion>
336 |           <artifactId>hive-ant</artifactId>
337 |           <groupId>org.apache.hive</groupId>
338 |         </exclusion>
339 |         <exclusion>
340 |           <artifactId>hive-metastore</artifactId>
341 |           <groupId>org.apache.hive</groupId>
342 |         </exclusion>
343 |         <exclusion>
344 |           <artifactId>antlr-runtime</artifactId>
345 |           <groupId>org.antlr</groupId>
346 |         </exclusion>
347 |         <exclusion>
348 |           <artifactId>ST4</artifactId>
349 |           <groupId>org.antlr</groupId>
350 |         </exclusion>
351 |         <exclusion>
352 |           <artifactId>ant</artifactId>
353 |           <groupId>org.apache.ant</groupId>
354 |         </exclusion>
355 |         <exclusion>
356 |           <artifactId>libfb303</artifactId>
357 |           <groupId>org.apache.thrift</groupId>
358 |         </exclusion>
359 |         <exclusion>
360 |           <artifactId>groovy-all</artifactId>
361 |           <groupId>org.codehaus.groovy</groupId>
362 |         </exclusion>
363 |         <exclusion>
364 |           <artifactId>datanucleus-core</artifactId>
365 |           <groupId>org.datanucleus</groupId>
366 |         </exclusion>
367 |         <exclusion>
368 |           <artifactId>calcite-core</artifactId>
369 |           <groupId>org.apache.calcite</groupId>
370 |         </exclusion>
371 |         <exclusion>
372 |           <artifactId>calcite-avatica</artifactId>
373 |           <groupId>org.apache.calcite</groupId>
374 |         </exclusion>
375 |         <exclusion>
376 |           <artifactId>stax-api</artifactId>
377 |           <groupId>stax</groupId>
378 |         </exclusion>
379 |         <exclusion>
380 |           <artifactId>jline</artifactId>
381 |           <groupId>jline</groupId>
382 |         </exclusion>
383 |         <exclusion>
384 |           <artifactId>jansi</artifactId>
385 |           <groupId>org.fusesource.jansi</groupId>
386 |         </exclusion>
387 |         <exclusion>
388 |           <artifactId>hive-shims</artifactId>
389 |           <groupId>org.apache.hive</groupId>
390 |         </exclusion>
391 |         <exclusion>
392 |           <artifactId>commons-httpclient</artifactId>
393 |           <groupId>commons-httpclient</groupId>
394 |         </exclusion>
395 |         <exclusion>
396 |           <artifactId>commons-io</artifactId>
397 |           <groupId>commons-io</groupId>
398 |         </exclusion>
399 |         <exclusion>
400 |           <artifactId>log4j</artifactId>
401 |           <groupId>log4j</groupId>
402 |         </exclusion>
403 |         <exclusion>
404 |           <artifactId>commons-compress</artifactId>
405 |           <groupId>org.apache.commons</groupId>
406 |         </exclusion>
407 |         <exclusion>
408 |           <artifactId>zookeeper</artifactId>
409 |           <groupId>org.apache.zookeeper</groupId>
410 |         </exclusion>
411 |         <exclusion>
412 |           <artifactId>commons-codec</artifactId>
413 |           <groupId>commons-codec</groupId>
414 |         </exclusion>
415 |         <exclusion>
416 |           <artifactId>commons-logging</artifactId>
417 |           <groupId>commons-logging</groupId>
418 |         </exclusion>
419 |         <exclusion>
420 |           <artifactId>slf4j-api</artifactId>
421 |           <groupId>org.slf4j</groupId>
422 |         </exclusion>
423 |         <exclusion>
424 |           <artifactId>slf4j-log4j12</artifactId>
425 |           <groupId>org.slf4j</groupId>
426 |         </exclusion>
427 |       </exclusions>
428 |     </dependency>
429 |   </dependencies>
430 |   <properties>
431 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
432 |   </properties>
433 | </project>
434 | 
435 | 


--------------------------------------------------------------------------------
/java/MapReduce/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <groupId>com.onefold.hadoop</groupId>
 5 |   <artifactId>MapReduce</artifactId>
 6 |   <version>0.0.1-SNAPSHOT</version>
 7 |   <packaging>jar</packaging>
 8 |   <name>MapReduce</name>
 9 |   <url>http://maven.apache.org</url>
10 |   <properties>
11 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
12 |   </properties>
13 |   
14 |   <dependencies>
15 |     <dependency>
16 |       <groupId>org.mortbay.jetty</groupId>
17 |       <artifactId>jetty</artifactId>
18 |       <version>6.1.26</version>
19 |     </dependency>
20 |     <dependency>
21 |       <groupId>org.mortbay.jetty</groupId>
22 |       <artifactId>jetty-util</artifactId>
23 |       <version>6.1.26</version>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>org.apache.hadoop</groupId>
27 |       <artifactId>hadoop-mapreduce-client-core</artifactId>
28 |       <version>2.6.0.2.2.0.0-2041</version>
29 |     </dependency>
30 |     <dependency>
31 |       <groupId>org.apache.hadoop</groupId>
32 |       <artifactId>hadoop-common</artifactId>
33 |       <version>2.6.0.2.2.0.0-2041</version>
34 |     </dependency>
35 |   </dependencies>
36 |   
37 |   <repositories>
38 |     <repository>
39 |       <releases>
40 |         <enabled>true</enabled>
41 |         <updatePolicy>always</updatePolicy>
42 |         <checksumPolicy>warn</checksumPolicy>
43 |       </releases>
44 |       <snapshots>
45 |         <enabled>false</enabled>
46 |         <updatePolicy>never</updatePolicy>
47 |         <checksumPolicy>fail</checksumPolicy>
48 |       </snapshots>
49 |       <id>HDPReleases</id>
50 |       <name>HDP Releases</name>
51 |       <url>http://repo.hortonworks.com/content/repositories/releases/</url>
52 |       <layout>default</layout>
53 |     </repository>
54 |   </repositories>
55 |   
56 | </project>
57 | 


--------------------------------------------------------------------------------
/java/MapReduce/src/main/java/com/onefold/hadoop/MapReduce/TransformDataMultiOutputFormat.java:
--------------------------------------------------------------------------------
 1 | package com.onefold.hadoop.MapReduce;
 2 | 
 3 | import org.apache.hadoop.fs.Path;
 4 | import org.apache.hadoop.io.Text;
 5 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
 6 | 
 7 | /*
 8 |  * Copyright 2015, OneFold
 9 |  * All rights reserved.
10 |  * http://www.onefold.io
11 |  *
12 |  * Author: Jorge Chang
13 |  *
14 |  * See license in LICENSE file.
15 |  *
16 |  * Used by transform-data-mapper to write each fragment to its own MapReduce output Folder
17 |  *
18 |  */
19 | public class TransformDataMultiOutputFormat extends MultipleTextOutputFormat<Text, Text> {
20 |   @Override
21 |   protected String generateFileNameForKeyValue(Text key, Text value, String leaf) {
22 |     return new Path(key.toString(), leaf).toString();
23 |   }
24 | 
25 |   @Override
26 |   protected Text generateActualKey(Text key, Text value) {
27 |     return null;
28 |   }
29 | }


--------------------------------------------------------------------------------
/json/generate-schema-mapper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Copyright 2015, OneFold
  5 | # All rights reserved.
  6 | # http://www.onefold.io
  7 | #
  8 | # Author: Jorge Chang
  9 | #
 10 | # See license in LICENSE file.
 11 | #
 12 | # Generate Schema Mapper - takes data from stdin, performs deep inspection and emits
 13 | # field-name -> data-type tuples.
 14 | #
 15 | 
 16 | import re
 17 | import sys
 18 | import json
 19 | import codecs
 20 | 
 21 | # create utf reader and writer for stdin and stdout
 22 | output_stream = codecs.getwriter("utf-8")(sys.stdout)
 23 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore")
 24 | error_stream = codecs.getwriter("utf-8")(sys.stderr)
 25 | 
 26 | def is_integer(value):
 27 |   try:
 28 |     a = int(str(value))
 29 |     if a > sys.maxint or a < -sys.maxint - 1:
 30 |       return False
 31 |     return True
 32 |   except:
 33 |     return False
 34 | 
 35 | def is_float(value):
 36 |   try:
 37 |     float(str(value))
 38 |     return True
 39 |   except:
 40 |     return False
 41 | 
 42 | def process_line(line, line_num, parent=None, seperator="_"):
 43 | 
 44 |   # parse the line
 45 |   try:
 46 |     data = json.loads(line, encoding='utf-8')
 47 |   except ValueError:
 48 |     print >> error_stream, "Line %i: JSON Parse Error. Data: %s" % (line_num, line)
 49 |     return
 50 | 
 51 |   if data:
 52 | 
 53 |     for key, value in data.iteritems():
 54 | 
 55 |       k = re.sub("[^0-9a-zA-Z_]", '_', key).lower()
 56 | 
 57 |       # BigQuery disallows field to start with non alpha
 58 |       if ord(k[0]) >= 48 and ord(k[0]) <= 59:
 59 |         k = "_f" + k
 60 | 
 61 |       # Hive disallows field to start with "_"
 62 |       if k[0] == '_':
 63 |         k = k.lstrip("_")
 64 | 
 65 |       if parent == None:
 66 |         full_key = k
 67 |       else:
 68 |         full_key = parent + seperator + k
 69 | 
 70 |       if value is None:
 71 |         # if data is Null, PASS.
 72 |         pass
 73 | 
 74 |       elif isinstance(value, dict):
 75 | 
 76 |         if len(value) > 0:
 77 |           print >> output_stream, "%s\t%s" % (full_key, "record-nullable")
 78 |           process_line(json.dumps(value, ensure_ascii=False), line_num, full_key)
 79 |         else:
 80 |           print >> error_stream, "Key %s has value of type dict %s which is empty. Ignoring." % (full_key, value)
 81 | 
 82 |       elif isinstance(value, list):
 83 | 
 84 |         if len(value) > 0:
 85 | 
 86 |           for list_value in value:
 87 |             if isinstance(list_value, dict):
 88 |               print >> output_stream, "%s\t%s" % (full_key, "record-repeated")
 89 |               process_line(json.dumps(list_value, ensure_ascii=False), line_num, full_key, ".")
 90 |             elif isinstance(list_value, bool):
 91 |               print >> output_stream, "%s\t%s" % (full_key, "boolean-repeated")
 92 |             elif isinstance(list_value, int):
 93 |               print >> output_stream, "%s\t%s" % (full_key, "integer-repeated")
 94 |             elif isinstance(list_value, float):
 95 |               print >> output_stream, "%s\t%s" % (full_key, "float-repeated")
 96 |             else:
 97 |               print >> output_stream, "%s\t%s" % (full_key, "string-repeated")
 98 | 
 99 |         else:
100 |           print >> error_stream, "Key %s has value of type list %s which is empty. Ignoring." % (full_key, value)
101 | 
102 |       else:
103 | 
104 |         if isinstance(value, bool):
105 |           print >> output_stream, "%s\t%s" % (full_key, "boolean-nullable")
106 |         elif isinstance(value, int):
107 |           print >> output_stream, "%s\t%s" % (full_key, "integer-nullable")
108 |         elif isinstance(value, float):
109 |           print >> output_stream, "%s\t%s" % (full_key, "float-nullable")
110 |         else:
111 |           print >> output_stream, "%s\t%s" % (full_key, "string-nullable")
112 | 
113 | 
114 | def main():
115 | 
116 |   line_num = 1
117 |   for line in input_stream:
118 |     try:
119 |       process_line(line, line_num, None)
120 |       line_num += 1
121 |     except Exception:
122 |       print >> error_stream, "Line %i: Error. Data: %s" % (line_num, line)
123 | 
124 | if __name__ == "__main__":
125 |   main()


--------------------------------------------------------------------------------
/json/generate-schema-reducer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Copyright 2015, OneFold
  5 | # All rights reserved.
  6 | # http://www.onefold.io
  7 | #
  8 | # Author: Jorge Chang
  9 | #
 10 | # See license in LICENSE file.
 11 | #
 12 | # Generate Schema Reducer - reduces multiple / conflicting data type of a particular
 13 | # field into the most general one, e.g.
 14 | # input: "zip_code" => (int, string)
 15 | # output: "zip_code" => (string) because string > int.
 16 | #
 17 | 
 18 | import sys
 19 | import codecs
 20 | from pymongo import MongoClient
 21 | 
 22 | # create utf reader and writer for stdin and stdout
 23 | output_stream = codecs.getwriter("utf-8")(sys.stdout)
 24 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore")
 25 | error_stream = codecs.getwriter("utf-8")(sys.stderr)
 26 | 
 27 | mongo_schema_collection = None
 28 | 
 29 | 
 30 | def parse_datatype_mode (datatype_mode):
 31 |   a = datatype_mode.split("-")
 32 |   if len(a) >= 2:
 33 |     return (a[0], a[1])
 34 |   else:
 35 |     raise ValueError('Invalid datatype / mode tuple %s' % datatype_mode)
 36 | 
 37 | 
 38 | def process_new_field(key, datatype_mode):
 39 | 
 40 |   if key is not None and datatype_mode is not None:
 41 |     # check if key is already in mongodb
 42 |     orig_field_record = mongo_schema_collection.find_one({"key": key, "type": "field"})
 43 | 
 44 |     # compare orig data type and save schema to mongodb
 45 |     if orig_field_record is not None:
 46 |       orig_datatype_mode = orig_field_record['data_type'] + "-" + orig_field_record['mode']
 47 | 
 48 |       forced = False
 49 |       if 'forced' in orig_field_record and orig_field_record['forced'] == True:
 50 |         forced = True
 51 | 
 52 |       # if 'forced' not in orig_datatype:
 53 |       if not forced:
 54 |         new_datatype_mode = max_datatype_mode(orig_datatype_mode, datatype_mode)
 55 | 
 56 |         (new_datatype, new_mode) = parse_datatype_mode(new_datatype_mode)
 57 |         mongo_schema_collection.find_one_and_update({"key": key, "type": "field"},
 58 |                                                     {"$set": {"data_type": new_datatype,
 59 |                                                               "mode": new_mode}})
 60 | 
 61 |     else:
 62 |       (datatype, mode) = parse_datatype_mode(datatype_mode)
 63 |       mongo_schema_collection.insert_one({"key": key,
 64 |                                           "type": "field",
 65 |                                           "data_type": datatype,
 66 |                                           "mode": mode})
 67 | 
 68 | 
 69 | def max_datatype_mode (datatype_mode_1, datatype_mode_2):
 70 | 
 71 |   if datatype_mode_1 == datatype_mode_2:
 72 |     return datatype_mode_1
 73 | 
 74 |   if datatype_mode_1 == 'record-repeated' or datatype_mode_2 == 'record-repeated':
 75 |     return 'record-repeated'
 76 | 
 77 |   if datatype_mode_1 == 'string-repeated' or datatype_mode_2 == 'string-repeated':
 78 |     return 'string-repeated'
 79 | 
 80 |   if datatype_mode_1 == 'repeated-nullable' or datatype_mode_2 == 'repeated-nullable':
 81 |     return 'repeated-nullable'
 82 | 
 83 |   if datatype_mode_1 == 'record-nullable' or datatype_mode_2 == 'record-nullable':
 84 |     return 'record-nullable'
 85 | 
 86 |   if datatype_mode_1 == 'string-nullable' or datatype_mode_2 == 'string-nullable':
 87 |     return 'string-nullable'
 88 | 
 89 |   if datatype_mode_1 == 'float-nullable' and datatype_mode_2 == 'integer-nullable':
 90 |     return 'float-nullable'
 91 | 
 92 |   if datatype_mode_1 == 'integer-nullable' and datatype_mode_2 == 'float-nullable':
 93 |     return 'float-nullable'
 94 | 
 95 |   return 'string-nullable'
 96 | 
 97 | 
 98 | def usage():
 99 |   print "Usage: %s mongodb://[host]:[port]/[db_name]/[schema_collection_name]" % sys.argv[0]
100 |   sys.exit(2)
101 | 
102 | 
103 | def main(argv):
104 | 
105 |   if len(argv) < 0:
106 |     usage()
107 | 
108 |   try:
109 | 
110 |     args = argv[0].split("/")
111 |     schema_collection_name = args[-1]
112 |     schema_db_name = args[-2]
113 |     mongo_uri = '/'.join(args[0:-2])
114 | 
115 |     client = MongoClient(mongo_uri)
116 |     db = client[schema_db_name]
117 | 
118 |     global mongo_schema_collection
119 |     mongo_schema_collection = db[schema_collection_name]
120 | 
121 |   except:
122 |     usage()
123 | 
124 |   current_key = None
125 |   current_datatype_mode = None
126 |   key = None
127 | 
128 |   # input comes from STDIN
129 |   for line in input_stream:
130 | 
131 |     # remove leading and trailing whitespace
132 |     line = line.strip()
133 | 
134 |     # parse the input we got from mapper.py
135 |     (key, datatype_mode) = line.split('\t', 1)
136 | 
137 |     # this IF-switch only works because Hadoop sorts map output
138 |     # by key (here: key) before it is passed to the reducer
139 |     if current_key == key:
140 |       current_datatype_mode = max_datatype_mode(current_datatype_mode, datatype_mode)
141 |     else:
142 |       if current_key:
143 |         process_new_field(current_key, current_datatype_mode)
144 |       current_datatype_mode = datatype_mode
145 |       current_key = key
146 | 
147 |   # do not forget to output the last key if needed!
148 |   if current_key == key:
149 |     process_new_field(current_key, current_datatype_mode)
150 | 
151 | 
152 | if __name__ == "__main__":
153 |   main(sys.argv[1:])
154 | 


--------------------------------------------------------------------------------
/json/transform-data-mapper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Copyright 2015, OneFold
  5 | # All rights reserved.
  6 | # http://www.onefold.io
  7 | #
  8 | # Author: Jorge Chang
  9 | #
 10 | # See license in LICENSE file.
 11 | #
 12 | # Transform Data Mapper - takes data from stdin, cleans the data based on schema
 13 | # generated previously, and split array fields into different files.
 14 | #
 15 | 
 16 | import re
 17 | import sys
 18 | import json
 19 | import os
 20 | import socket
 21 | import subprocess
 22 | import codecs
 23 | import hashlib
 24 | import pprint
 25 | from pymongo import MongoClient
 26 | 
 27 | # create utf reader and writer for stdin and stdout
 28 | output_stream = codecs.getwriter("utf-8")(sys.stdout)
 29 | input_stream = codecs.getreader("utf-8")(sys.stdin, errors="ignore")
 30 | error_stream = codecs.getwriter("utf-8")(sys.stderr)
 31 | 
 32 | process_array = "child_table"
 33 | shard_key = None
 34 | 
 35 | BATCH_SIZE = 10000
 36 | BATCH_NUM_LINES = 50000
 37 | 
 38 | # e.g. schema['event'] = 'string-nullable'
 39 | mongo_schema_collection = None
 40 | schema = {}
 41 | shard_values = []
 42 | 
 43 | # params
 44 | tmp_path = None
 45 | 
 46 | # create file descriptors
 47 | file_descriptors = {}
 48 | 
 49 | 
 50 | def clean_data(line, line_num, parent = None, parent_hash_code = None, is_array = False):
 51 |   new_data = {}
 52 |   new_data_fragments = {}
 53 | 
 54 |   # read each line into a line_hash
 55 |   try:
 56 |     data = json.loads(line, encoding="utf-8")
 57 |   except ValueError:
 58 |     print >> error_stream, "Line %i: JSON Parse Error. Data: %s" % (line_num, line)
 59 |     return None
 60 | 
 61 |   # create hash code
 62 |   hash_code = hashlib.sha1(json.dumps(data, sort_keys=True)).hexdigest()
 63 |   new_data['hash_code'] = hash_code
 64 | 
 65 |   if parent_hash_code != None:
 66 |     new_data['parent_hash_code'] = parent_hash_code
 67 | 
 68 |   # determine shard key (only for root level).
 69 |   if parent == None:
 70 |     if shard_key is not None:
 71 |       shard_value = get_shard_value(data, shard_key)
 72 | 
 73 |       if shard_value is None:
 74 |         print >> error_stream, "Line %i: Invalid shard value. Data: %s" % (line_num, line)
 75 |         return
 76 | 
 77 |       new_data_fragments["root/%s" % shard_value] = new_data
 78 |       shard_values.append(shard_value)
 79 |     else:
 80 |       new_data_fragments['root'] = new_data
 81 | 
 82 |   else:
 83 |     new_data_fragments['root'] = new_data
 84 | 
 85 |   if data:
 86 | 
 87 |     for (key, value) in data.iteritems():
 88 | 
 89 |       k = re.sub("[^0-9a-zA-Z_]", '_', key).lower()
 90 | 
 91 |       # BigQuery disallows field to start with number
 92 |       if ord(k[0]) >= 48 and ord(k[0]) <= 59:
 93 |         k = "_f" + k
 94 | 
 95 |       # Hive disallows field to start with "_"
 96 |       if k[0] == '_':
 97 |         k = k.lstrip("_")
 98 | 
 99 |       if parent == None:
100 |         full_key = k
101 |         dict_key = full_key
102 |       else:
103 |         if is_array:
104 |           full_key = parent + "." + k
105 |           dict_key = key
106 |         else:
107 |           full_key = parent + "_" + k
108 |           dict_key = full_key
109 | 
110 |       # check to see if dict is empty - BigQuery doesn't support RECORD data type with no fields
111 |       if isinstance(value, dict) and len(value) == 0:
112 |         continue
113 | 
114 |       # check to see if list is empty - BigQuery doesn't support REPEATED data type with no data
115 |       if isinstance(value, list) and len(value) == 0:
116 |         continue
117 | 
118 |       # print error if data type is not found for this key!
119 |       if full_key not in schema:
120 |         print >> error_stream, "Line %i: Couldn't find data type for key %s. Skipping this value. Data: %s" % (
121 |           line_num, full_key, line)
122 |         continue
123 | 
124 |       data_type = schema[full_key]["data_type"]
125 |       mode = schema[full_key]["mode"]
126 | 
127 |       data_type_forced = False
128 |       if 'forced' in schema[full_key]:
129 |         data_type_forced = schema[full_key]['forced']
130 | 
131 |       if data_type == 'record':
132 | 
133 |           if mode == 'repeated':
134 |             if not isinstance(value, list):
135 |               print >> error_stream, "Line %i: Expect repeated record but found %s. Data: %s" % (line_num, value, line)
136 |               return None
137 |             else:
138 | 
139 |               if process_array == "child_table":
140 |                 if full_key not in new_data_fragments:
141 |                   new_data_fragments[full_key] = []
142 | 
143 |                 for v in value:
144 |                   t = clean_data(json.dumps(v, ensure_ascii=False), line_num, full_key, hash_code, True)
145 | 
146 |                   for fragment, fragment_content in t.iteritems():
147 |                     if fragment == 'root':
148 |                       new_data_fragments[full_key].append(fragment_content)
149 |                     else:
150 |                       fragment_key = re.sub("[^0-9a-zA-Z_]", '_', fragment).lower()
151 |                       new_data_fragments[fragment_key] = fragment_content
152 | 
153 |               else:
154 |                 new_data[dict_key] = json.dumps(value)
155 | 
156 |           else:
157 |             if not isinstance(value, dict):
158 |               print >> error_stream, "Line %i: Expect record but found %s. Data: %s" % (line_num, value, line)
159 |               return None
160 |             else:
161 |               t = clean_data(json.dumps(value, ensure_ascii=False), line_num, full_key)
162 | 
163 |               for fragment, fragment_content in t.iteritems():
164 |                 if fragment == 'root':
165 |                   fragment_content.pop("hash_code", None)
166 |                   new_data.update(fragment_content)
167 | 
168 |                 if isinstance(fragment_content, list):
169 |                   new_data_fragments[fragment] = fragment_content
170 | 
171 |       else:
172 | 
173 |         if value:
174 | 
175 |           # check if data type mismatch
176 |           if data_type == 'string':
177 | 
178 |             if mode == 'repeated':
179 |               if not isinstance(value, list):
180 |                 print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % (
181 |                   line_num, value, line)
182 |                 return None
183 |               else:
184 | 
185 |                 if process_array == "child_table":
186 |                   if full_key not in new_data_fragments:
187 |                     new_data_fragments[full_key] = []
188 | 
189 |                   for v in value:
190 |                     cleaned_v = unicode(v)
191 |                     t = {"value": cleaned_v, "parent_hash_code": hash_code}
192 |                     new_data_fragments[full_key].append(t)
193 |                 else:
194 |                   new_data[dict_key] = json.dumps(value)
195 | 
196 |             else:
197 |               new_data[dict_key] = unicode(value)
198 | 
199 |           elif data_type == 'float':
200 | 
201 |             if mode == 'repeated':
202 |               if not isinstance(value, list):
203 |                 print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % (
204 |                   line_num, value, line)
205 |                 return None
206 |               else:
207 | 
208 |                 if process_array == "child_table":
209 |                   if full_key not in new_data_fragments:
210 |                     new_data_fragments[full_key] = []
211 | 
212 |                   for v in value:
213 | 
214 |                     cleaned_v = None
215 | 
216 |                     try:
217 |                       cleaned_v = float(v)
218 |                     except ValueError:
219 |                       if not data_type_forced:
220 |                         print >> error_stream, "Line %i: Couldn't convert %s to float. Data: %s" % (
221 |                           line_num, str(value), line)
222 |                         return None
223 | 
224 |                     t = {"value": cleaned_v, "parent_hash_code": hash_code}
225 |                     new_data_fragments[full_key].append(t)
226 |                 else:
227 |                   new_data[dict_key] = json.dumps(value)
228 | 
229 |             else:
230 |               try:
231 |                 new_data[dict_key] = float(value)
232 |               except ValueError:
233 |                 if data_type_forced:
234 |                   new_data[dict_key] = None
235 |                 else:
236 |                   print >> error_stream, "Line %i: Couldn't convert %s to float. Data: %s" % (
237 |                     line_num, str(value), line)
238 |                   return None
239 | 
240 |           elif data_type == 'integer':
241 | 
242 |             if mode == 'repeated':
243 |               if not isinstance(value, list):
244 |                 print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % (
245 |                   line_num, value, line)
246 |                 return None
247 |               else:
248 | 
249 |                 if process_array == "child_table":
250 |                   if full_key not in new_data_fragments:
251 |                     new_data_fragments[full_key] = []
252 | 
253 |                   for v in value:
254 | 
255 |                     cleaned_v = None
256 | 
257 |                     try:
258 |                       cleaned_v = int(v)
259 |                     except ValueError:
260 |                       if not data_type_forced:
261 |                         print >> error_stream, "Line %i: Couldn't convert %s to int. Data: %s" % (
262 |                           line_num, str(value), line)
263 |                         return None
264 | 
265 |                     t = {"value": cleaned_v, "parent_hash_code": hash_code}
266 |                     new_data_fragments[full_key].append(t)
267 |                 else:
268 |                   new_data[dict_key] = json.dumps(value)
269 | 
270 |             else:
271 |               try:
272 |                 new_data[dict_key] = int(value)
273 |               except ValueError:
274 |                 if data_type_forced:
275 |                   new_data[dict_key] = None
276 |                 else:
277 |                   print >> error_stream, "Line %i: Couldn't convert %s to int. Data: %s" % (line_num, str(value), line)
278 |                   return None
279 | 
280 |           elif data_type == 'boolean':
281 | 
282 |             if mode == 'repeated':
283 |               if not isinstance(value, list):
284 |                 print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % (
285 |                   line_num, value, line)
286 |                 return None
287 |               else:
288 | 
289 |                 if process_array == "child_table":
290 |                   if full_key not in new_data_fragments:
291 |                     new_data_fragments[full_key] = []
292 | 
293 |                   for v in value:
294 |                     t = {"value": str(v).lower() == 'true', "parent_hash_code": hash_code}
295 |                     new_data_fragments[full_key].append(t)
296 |                 else:
297 |                   new_data[dict_key] = json.dumps(value)
298 | 
299 |             else:
300 |               new_data[dict_key] = (str(value).lower() == 'true')
301 | 
302 |           else:
303 | 
304 |             if mode == 'repeated':
305 |               if not isinstance(value, list):
306 |                 print >> error_stream, "Line %i: Expect repeated string but found %s. Data: %s" % (
307 |                   line_num, value, line)
308 |                 return None
309 |               else:
310 | 
311 |                 if process_array == "child_table":
312 |                   if full_key not in new_data_fragments:
313 |                     new_data_fragments[full_key] = []
314 | 
315 |                   for v in value:
316 |                     cleaned_v = unicode(v)
317 |                     t = {"value": cleaned_v, "parent_hash_code": hash_code}
318 |                     new_data_fragments[full_key].append(t)
319 |                 else:
320 |                   new_data[dict_key] = json.dumps(value)
321 | 
322 |             else:
323 |               new_data[dict_key] = unicode(value)
324 | 
325 |         else:
326 |           new_data[dict_key] = None
327 | 
328 |   return new_data_fragments
329 | 
330 | 
331 | def get_shard_value(data, shard_key):
332 |   # split shard key by "."
333 |   tmp = data
334 |   shard_key_parts = shard_key.split(".")
335 |   for shard_key_part in shard_key_parts:
336 |     if shard_key_part in tmp:
337 |       tmp = tmp[shard_key_part]
338 |     else:
339 |       return None
340 | 
341 |   if isinstance(tmp, dict):
342 |     return None
343 |   else:
344 | 
345 |     shard_value = str(tmp)
346 | 
347 |     if len(shard_value) > 32 or len(shard_value) <= 0:
348 |       return None
349 | 
350 |     shard_value = re.sub("[^0-9a-zA-Z_]", '_', shard_value).lower()
351 |     return shard_value
352 | 
353 | 
354 | # creating folder and opening file (for local mode)
355 | def create_file_descriptor(fragment_value, shard_value = None):
356 | 
357 |   path = fragment_value
358 |   if shard_value != None:
359 |     path = fragment_value + "/" + shard_value
360 | 
361 |   # creating folder and opening file (for local mode)
362 |   execute('mkdir -p %s/%s' % (tmp_path, path), ignore_error=True)
363 |   file_name = '%s/%s/part-00000' % (tmp_path, path)
364 |   print >> error_stream, "Opening file descriptor %s" % file_name
365 |   file = open(file_name, 'w')
366 |   file_descriptors[path] = {"file": file, "file_name": file_name}
367 |   print >> error_stream, "Opened file descriptor %s" % file_name
368 | 
369 | 
370 | def process_line(line, line_num):
371 |   # clean data
372 |   data_fragments = clean_data(line, line_num, None)
373 | 
374 |   # skip if data is not clean..
375 |   if data_fragments is None or len(data_fragments) == 0:
376 |     return
377 | 
378 |   # handle other fragments
379 |   for fragment_value, fragment_content in data_fragments.iteritems():
380 | 
381 |     # open local file descriptor for this fragment (for local mode only)
382 |     if tmp_path != None:
383 |       if fragment_value not in file_descriptors:
384 |         create_file_descriptor(fragment_value)
385 |       file = file_descriptors[fragment_value]["file"]
386 | 
387 |     if isinstance(fragment_content, list):
388 |       for element in fragment_content:
389 |         if tmp_path != None:
390 |           # write data to local file
391 |           file.write(json.dumps(element))
392 |           file.write('\n')
393 |         else:
394 |           print >> output_stream, "%s\t%s" % (fragment_value, json.dumps(element))
395 |     else:
396 |       if tmp_path != None:
397 |         # write data to local file
398 |         file.write(json.dumps(fragment_content))
399 |         file.write('\n')
400 |       else:
401 |         print >> output_stream, "%s\t%s" % (fragment_value, json.dumps(fragment_content))
402 | 
403 | 
404 | def execute(command, ignore_error=False):
405 |   print >> error_stream, 'Executing command: %s' % command
406 |   if subprocess.call(command, shell=True):
407 |     # Non-zero return code indicates an error.
408 |     if not ignore_error:
409 |       raise Exception("Error executing command: %s" % command)
410 | 
411 | 
412 | def main(argv):
413 | 
414 |   # parse parameters
415 |   global tmp_path, mongo_schema_collection
416 | 
417 |   args = argv[0].split(",")
418 |   schema_arg = args[0]
419 |   if len(args) > 1:
420 |     tmp_path = args[1]
421 | 
422 |   schema_args = schema_arg.split("/")
423 |   schema_collection_name = schema_args[-1]
424 |   schema_db_name = schema_args[-2]
425 |   mongo_uri = '/'.join(schema_args[0:-2])
426 | 
427 |   client = MongoClient(mongo_uri)
428 |   db = client[schema_db_name]
429 | 
430 |   mongo_schema_collection = db[schema_collection_name]
431 | 
432 |   # delete temp folder if already exist (only for local mode)
433 |   if tmp_path != None:
434 |     execute('rm -rf %s' % tmp_path, ignore_error=True)
435 | 
436 |   # read schema from MongoDB
437 |   global schema, process_array, shard_key
438 | 
439 |   # read schema from mongodb server
440 |   schema_fields = mongo_schema_collection.find({"type": "field"})
441 |   schema = dict((schema_field['key'], schema_field) for schema_field in schema_fields)
442 | 
443 |   # read process_array from redis
444 |   # if redis_server.hget('%s/policy' % app_id, "process_array") != None:
445 |   #   process_array = redis_server.hget('%s/policy' % app_id, "process_array")
446 |   #
447 |   # # read shard_key from redis
448 |   # if redis_server.hget('%s/policy' % app_id, "shard_key") != None:
449 |   #   shard_key = redis_server.hget('%s/policy' % app_id, "shard_key")
450 | 
451 |   # process input
452 |   line_num = 1
453 |   for line in input_stream:
454 |     process_line(line, line_num)
455 |     line_num += 1
456 | 
457 |     # print something to stderr and stdout every 1000 lines
458 |     if line_num % 1000 == 0:
459 |       print >> error_stream, "Processed %i lines." % line_num
460 | 
461 |   print >> error_stream, "Finished writing to local files."
462 | 
463 |   # close out the local files
464 |   for fragment_value, file_descriptor in file_descriptors.iteritems():
465 |     print >> error_stream, "Closing file descriptor %s" % fragment_value
466 | 
467 |     # close file
468 |     file_descriptor["file"].close()
469 | 
470 |     # write fragment values to mongodb
471 |     print >> error_stream, "Adding fragment value %s to mongodb." % (fragment_value)
472 |     mongo_schema_collection.update_one({"type": "fragments"}, {"$addToSet": {"fragments": fragment_value}}, upsert = True);
473 | 
474 |   for shard_value in shard_values:
475 |     # write shard values to mongodb
476 |     if shard_key is not None:
477 |       print >> error_stream, "Adding shard value %s to mongodb." % (shard_value)
478 |       mongo_schema_collection.update_one({"type": "shards"}, {"$addToSet": {"shards": shard_value}}, upsert = True);
479 | 
480 | 
481 | if __name__ == "__main__":
482 |   main(sys.argv[1:])
483 | 


--------------------------------------------------------------------------------
/onefold.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Author: Jorge Chang 
  5 | #
  6 | # See license in LICENSE file.
  7 | #
  8 | # This is the main program used to ETL mongodb collections into Hive tables.
  9 | #
 10 | 
 11 | from pymongo import MongoClient
 12 | import argparse
 13 | import os
 14 | import glob
 15 | from bson.json_util import dumps
 16 | import codecs
 17 | import pprint
 18 | import json
 19 | from onefold_util import execute
 20 | from dw_util import Hive, GBigQuery
 21 | from cs_util import HDFSStorage, GCloudStorage
 22 | 
 23 | 
 24 | NUM_RECORDS_PER_PART = 100000
 25 | TMP_PATH = '/tmp/onefold_mongo'
 26 | CLOUD_STORAGE_PATH = 'onefold_mongo'
 27 | HADOOP_MAPREDUCE_STREAMING_LIB = "/usr/hdp/current/hadoop-mapreduce-client/hadoop-streaming.jar"
 28 | ONEFOLD_MAPREDUCE_JAR = os.getcwd() + "/java/MapReduce/target/MapReduce-0.0.1-SNAPSHOT.jar"
 29 | ONEFOLD_HIVESERDES_JAR = os.getcwd() + "/java/HiveSerdes/target/hive-serdes-1.0-SNAPSHOT.jar"
 30 | 
 31 | # default mapreduce params
 32 | mapreduce_params = {}
 33 | mapreduce_params["mapred.reduce.max.attempts"] = "0"
 34 | mapreduce_params["mapred.map.max.attempts"] = "0"
 35 | mapreduce_params["mapred.task.timeout"] = "12000000"
 36 | MAPREDUCE_PARAMS_STR = ' '.join(["-D %s=%s"%(k,v) for k,v in mapreduce_params.iteritems()])
 37 | 
 38 | 
 39 | # helper function to split "[datatype]-[mode]" into datatype and mode
 40 | def parse_datatype_mode (datatype_mode):
 41 |   a = datatype_mode.split("-")
 42 |   if len(a) >= 2:
 43 |     return (a[0], a[1])
 44 |   else:
 45 |     raise ValueError('Invalid datatype / mode tuple %s' % datatype_mode)
 46 | 
 47 | # helper function to check if "address.zip_code" is in data by spliting the jsonpath by "."
 48 | def jsonpath_get(mydict, path):
 49 |   elem = mydict
 50 |   try:
 51 |     for x in path.split("."):
 52 |       elem = elem.get(x)
 53 |   except:
 54 |     pass
 55 | 
 56 |   return elem
 57 | 
 58 | 
 59 | class Loader:
 60 | 
 61 |   # control params
 62 |   infra_type = None
 63 |   mongo_uri = None
 64 |   db_name = None
 65 |   collection_name = None
 66 |   collection_sort_by_field = None
 67 |   extract_query = None
 68 |   tmp_path = None
 69 |   schema_db_name = None
 70 |   schema_collection_name = None
 71 |   use_mr = False
 72 | 
 73 |   hiveserveer_host = None
 74 |   hiveserver_port = None
 75 | 
 76 |   gcloud_project_id = None
 77 |   gcloud_storage_bucket_id = None
 78 | 
 79 |   write_disposition = None
 80 |   process_array = "child_table"
 81 |   dw_database_name = None
 82 |   dw_table_name = None
 83 | 
 84 |   policies = None
 85 | 
 86 |   # mongo client and schema collection
 87 |   mongo_client = None
 88 |   mongo_schema_collection = None
 89 | 
 90 |   # runtime variables
 91 |   extract_file_names = []
 92 |   reject_file_names = []
 93 |   sort_by_field_min = None
 94 |   sort_by_field_max = None
 95 |   dw_table_names = []
 96 |   dw = None
 97 |   cs = None
 98 |   num_records_extracted = 0
 99 |   num_records_rejected = 0
100 | 
101 |   # policy related variables
102 |   required_fields = {}
103 | 
104 | 
105 |   def initialize(self):
106 | 
107 |     # open mongo client
108 |     self.mongo_client = MongoClient(self.mongo_uri)
109 | 
110 |     # open schema collection
111 |     mongo_schema_db = self.mongo_client[self.schema_db_name]
112 |     self.mongo_schema_collection = mongo_schema_db[self.schema_collection_name]
113 | 
114 |     # if overwrite, delete schema collection
115 |     if self.write_disposition == 'overwrite':
116 |       self.mongo_schema_collection.remove({})
117 | 
118 |     # create data warehouse object
119 |     if self.infra_type == 'hadoop':
120 |       self.dw = Hive(self.hiveserveer_host, self.hiveserver_port, ONEFOLD_HIVESERDES_JAR)
121 |       self.cs = HDFSStorage()
122 |     elif self.infra_type == 'gcloud':
123 |       self.dw = GBigQuery(self.gcloud_project_id, self.gcloud_storage_bucket_id)
124 |       self.cs = GCloudStorage(self.gcloud_project_id, self.gcloud_storage_bucket_id)
125 | 
126 |     # turn policies into better data structure for use later (required_fields)
127 |     if self.policies != None:
128 |       for policy in self.policies:
129 |         if 'key' in policy:
130 |           if 'required' in policy:
131 |             if policy['key'] not in self.required_fields == None:
132 |               self.required_fields[policy['key']] = {}
133 |             self.required_fields[policy['key']] = policy
134 | 
135 |           if 'data_type' in policy:
136 |             datatype_overwrite = policy['data_type']
137 | 
138 |             if 'mode' in policy:
139 |               mode_overwrite = policy['mode']
140 |             else:
141 |               mode_overwrite = 'nullable'
142 | 
143 |             self.mongo_schema_collection.update_one(
144 |               {"key": policy['key'].replace(".", "_"), "type": "field"},
145 |               {"$set": {"data_type": datatype_overwrite,
146 |                         "mode": mode_overwrite,
147 |                         "forced": True}},
148 |               upsert = True)
149 | 
150 | 
151 |   def extract_data(self):
152 | 
153 |     # create tmp_path folder if necessary
154 |     if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'data')):
155 |       os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'data'))
156 | 
157 |     if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'rejected')):
158 |       os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'rejected'))
159 | 
160 |     # delete old tmp files if exists
161 |     for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'data', '*')):
162 |       print "Deleting old file %s" % (old_file)
163 |       os.remove(old_file)
164 | 
165 |     for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'rejected', '*')):
166 |       print "Deleting old file %s" % (old_file)
167 |       os.remove(old_file)
168 | 
169 |     # some state variables
170 |     part_num = 0
171 |     extract_file = None
172 | 
173 |     reject_part_num = 0
174 |     reject_file = None
175 | 
176 |     # start mongo client
177 |     db = self.mongo_client[self.db_name]
178 |     collection = db[self.collection_name]
179 | 
180 |     # turn query string into json
181 |     if self.extract_query is not None:
182 |       if 'ObjectId' in self.extract_query:
183 |         # kinda hacky.. and dangerous! This is to evaluate an expression
184 |         # like {"_id": {$gt:ObjectId("55401a60151a4b1a4f000001")}}
185 |         from bson.objectid import ObjectId
186 |         extract_query_json = eval(self.extract_query)
187 |       else:
188 |         extract_query_json = json.loads(self.extract_query)
189 |     else:
190 |       extract_query_json = None
191 | 
192 |     # query collection, sort by collection_sort_by_field
193 |     for data in collection.find(extract_query_json).sort(self.collection_sort_by_field, 1):
194 | 
195 |       # track min and max id for auditing..
196 |       if self.sort_by_field_min == None:
197 |         self.sort_by_field_min = data[self.collection_sort_by_field]
198 |       self.sort_by_field_max = data[self.collection_sort_by_field]
199 | 
200 |       # open a new file if necessary
201 |       if self.num_records_extracted % NUM_RECORDS_PER_PART == 0:
202 | 
203 |         if extract_file != None:
204 |           extract_file.close()
205 | 
206 |         part_num += 1
207 |         extract_file_name = os.path.join(self.tmp_path, self.collection_name, 'data', str(part_num))
208 |         extract_file = open(extract_file_name, "w")
209 |         extract_file_codec = codecs.getwriter("utf-8")(extract_file)
210 |         self.extract_file_names.append(extract_file_name)
211 |         print "Creating file %s" % extract_file_name
212 | 
213 |       # validate policies
214 |       rejected = False
215 |       for required_field_name, policy in self.required_fields.iteritems():
216 |         if policy['required'] and jsonpath_get(data, required_field_name) is None:
217 | 
218 |           # --------------------------------------------------------
219 |           # document found that doesn't contain required fields.
220 |           # --------------------------------------------------------
221 | 
222 |           # open a new file if necessary
223 |           if self.num_records_rejected % NUM_RECORDS_PER_PART == 0:
224 | 
225 |             if reject_file != None:
226 |               reject_file.close()
227 | 
228 |             reject_part_num += 1
229 |             reject_file_name = os.path.join(self.tmp_path, self.collection_name, 'rejected', str(reject_part_num))
230 |             reject_file = open(reject_file_name, "w")
231 |             reject_file_codec = codecs.getwriter("utf-8")(reject_file)
232 |             self.reject_file_names.append(reject_file_name)
233 |             print "Creating reject file %s" % reject_file_name
234 | 
235 |           self.num_records_rejected += 1
236 |           reject_file_codec.write("Rejected. Missing %s. Data: %s" % (required_field_name, dumps(data)))
237 |           reject_file_codec.write('\n')
238 | 
239 |           rejected = True
240 |           break
241 | 
242 |       if not rejected:
243 |         self.num_records_extracted += 1
244 |         extract_file_codec.write(dumps(data))
245 |         extract_file_codec.write('\n')
246 | 
247 |     if extract_file != None:
248 |       extract_file.close()
249 | 
250 |     if reject_file != None:
251 |       reject_file.close()
252 | 
253 |   def simple_schema_gen(self):
254 |     command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
255 |               % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name)
256 |     execute(command)
257 | 
258 | 
259 |   def mr_schema_gen(self):
260 | 
261 |     hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
262 |     hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (CLOUD_STORAGE_PATH, self.collection_name)
263 | 
264 |     # delete folders
265 |     self.cs.rmdir(hdfs_data_folder)
266 |     self.cs.rmdir(hdfs_mr_output_folder)
267 |     
268 | 
269 |     # copy extracted files to hdfs data folder
270 |     self.cs.mkdir(hdfs_data_folder)
271 | 
272 |     for extract_file_name in self.extract_file_names:
273 |       self.cs.copy_from_local(extract_file_name, hdfs_data_folder)
274 | 
275 |     hadoop_command = """hadoop jar %s \
276 |                               -D mapred.job.name="onefold-mongo-generate-schema" \
277 |                               %s \
278 |                               -input %s -output %s \
279 |                               -mapper 'json/generate-schema-mapper.py' \
280 |                               -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
281 |                               -file json/generate-schema-mapper.py \
282 |                               -file json/generate-schema-reducer.py
283 |     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR, hdfs_data_folder,
284 |            hdfs_mr_output_folder, self.mongo_uri,
285 |            self.schema_db_name, self.schema_collection_name)
286 |     execute(hadoop_command)
287 | 
288 | 
289 |   def simple_data_transform(self):
290 | 
291 |     hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)
292 |     transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)
293 | 
294 |     command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
295 |               % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
296 |                  self.schema_collection_name, transform_data_tmp_path)
297 |     execute(command)
298 | 
299 |     # delete folders
300 |     self.cs.rmdir (hdfs_mr_output_folder)
301 | 
302 |     # manually copy files into hdfs
303 |     fragment_values = self.get_fragments()
304 |     for fragment_value in fragment_values:
305 |       self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
306 |       self.cs.copy_from_local("%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
307 |                               "%s/%s/" % (hdfs_mr_output_folder, fragment_value))
308 |       
309 | 
310 |   def mr_data_transform(self):
311 | 
312 |     hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
313 |     hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)
314 | 
315 |     # delete folders
316 |     self.cs.rmdir(hdfs_mr_output_folder)
317 | 
318 |     hadoop_command = """hadoop jar %s \
319 |                               -libjars %s \
320 |                               -D mapred.job.name="onefold-mongo-transform-data" \
321 |                               -D mapred.reduce.tasks=0 \
322 |                               %s \
323 |                               -input %s -output %s \
324 |                               -mapper 'json/transform-data-mapper.py %s/%s/%s' \
325 |                               -file json/transform-data-mapper.py \
326 |                               -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
327 |     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
328 |            self.schema_db_name, self.schema_collection_name)
329 |     execute(hadoop_command)
330 | 
331 | 
332 |   # retrieve schema tree from schema collection
333 |   def retrieve_schema_fields(self):
334 | 
335 |     # read schema from mongodb schema collection
336 |     schema_fields = []
337 | 
338 |     mongo_schema_fields = self.mongo_schema_collection.find({"type": "field"})
339 |     for mongo_schema_field in mongo_schema_fields:
340 |       schema_fields.append(mongo_schema_field)
341 | 
342 |     # add hash code to field
343 |     field = {}
344 |     field['key'] = "hash_code"
345 |     field['mode'] = "nullable"
346 |     field['data_type'] = "string"
347 |     schema_fields.append(field)
348 | 
349 |     return schema_fields
350 | 
351 | 
352 |   def get_fragments(self):
353 |     fragment_record = self.mongo_schema_collection.find_one({"type": "fragments"})
354 |     if fragment_record != None:
355 |       return fragment_record['fragments']
356 |     else:
357 |       return []
358 | 
359 | 
360 |   def load_table_hive (self, shard_value = None, table_name = None, different_table_per_shard = False, data_import_id = None):
361 | 
362 |     # if shard_value is None:
363 |     #   gcs_uri = "%s/data/*" % (self.mr4_output_folder_uri)
364 |     # else:
365 |     #   gcs_uri = "%s/data/%s/*" % (self.mr4_output_folder_uri, shard_value)
366 | 
367 |     if different_table_per_shard:
368 |       full_table_name = "%s_%s" % (table_name, shard_value)
369 |     else:
370 |       full_table_name = "%s" % (table_name)
371 | 
372 |     cloud_storage_path = "%s/%s/data_transform/output/%s/" % (CLOUD_STORAGE_PATH, self.collection_name, shard_value)
373 |     self.dw.load_table(self.dw_database_name, full_table_name, cloud_storage_path)
374 | 
375 |     # extract bq_job_id and save to db
376 |     return "%s/%s" % (data_import_id, shard_value)
377 | 
378 | 
379 |   def load_dw (self):
380 | 
381 |     # retrieve schema fields from mongodb schema collection
382 |     schema_fields = self.retrieve_schema_fields()
383 | 
384 |     # create tables
385 |     if self.write_disposition == 'overwrite':
386 |       if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
387 |         self.dw.delete_table(self.dw_database_name, self.dw_table_name)
388 |       self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array)
389 |     else:
390 |       # if append, update table.
391 |       if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
392 |         self.dw_table_names = self.dw.update_table(self.dw_database_name, self.dw_table_name, schema_fields)
393 |       else:
394 |         self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array)
395 | 
396 |     # load data
397 |     fragment_values = self.get_fragments()
398 | 
399 |     if fragment_values == None or len(fragment_values) == 0:
400 |       table_name = self.dw_table_name
401 |       self.load_table_hive(shard_value = None, table_name = table_name, different_table_per_shard=False, data_import_id=None)
402 | 
403 |     else:
404 |       for fragment_value in fragment_values:
405 |         print "Loading fragment: " + fragment_value
406 |         if fragment_value == 'root':
407 |           table_name = self.dw_table_name
408 |         else:
409 |           table_name = self.dw_table_name + "_" + fragment_value
410 | 
411 |         self.load_table_hive(shard_value = fragment_value, table_name = table_name, different_table_per_shard=False, data_import_id=None)
412 | 
413 | 
414 |   def run(self):
415 |     # init (start mongo client)
416 |     self.initialize()
417 | 
418 |     # extract data from Mongo
419 |     self.extract_data()
420 | 
421 |     if self.num_records_extracted > 0:
422 |       # generate schema and transform data
423 |       if self.use_mr:
424 |         self.mr_schema_gen()
425 |         self.mr_data_transform()
426 |       else:
427 |         self.simple_schema_gen()
428 |         self.simple_data_transform()
429 | 
430 |       # Create data warehouse tables and load data into them
431 |       self.load_dw()
432 | 
433 |     print '-------------------'
434 |     print '    RUN SUMMARY'
435 |     print '-------------------'
436 |     print 'Num records extracted %s' % self.num_records_extracted
437 |     print 'Num records rejected %s' % self.num_records_rejected
438 |     print 'Extracted data with %s from %s to %s' % (self.collection_sort_by_field, self.sort_by_field_min, self.sort_by_field_max)
439 |     print 'Extracted files are located at: %s' % (' '.join(self.extract_file_names))
440 |     print 'Destination Tables: %s' % (' '.join(self.dw_table_names))
441 |     print 'Schema is stored in Mongo %s.%s' % (self.schema_db_name, self.schema_collection_name)
442 | 
443 | def usage():
444 |   # ./onefold.py --mongo mongodb://173.255.115.8:27017 --source_db test --source_collection uber_events --schema_db test --schema_collection uber_events_schema --hiveserver_host 130.211.146.208 --hiveserver_port 10000
445 |   # ./onefold.py --mongo mongodb://173.255.115.8:27017 --source_db test --source_collection uber_events --schema_db test --schema_collection uber_events_schema --hiveserver_host 130.211.146.208 --hiveserver_port 10000 --use_mr
446 |   pass
447 | 
448 | def main():
449 | 
450 |   # parse command line
451 |   parser = argparse.ArgumentParser(description='Generate schema for MongoDB collections.')
452 |   parser.add_argument('--mongo', metavar='mongo', type=str, required=True, help='MongoDB connectivity')
453 |   parser.add_argument('--source_db', metavar='source_db', type=str, required=True, help='Source MongoDB database name')
454 |   parser.add_argument('--source_collection', metavar='source_collection', type=str, required=True,
455 |                       help='Source MongoDB collection name')
456 |   parser.add_argument('--source_sort_by_field', metavar='source_sort_by_field', type=str, default='_id',
457 |                       help='Source MongoDB collection name')
458 |   parser.add_argument('--query', metavar='query', type=str, help='Mongo Query for filtering')
459 |   parser.add_argument('--tmp_path', metavar='tmp_path', type=str, help='Path to store tmp file from extraction.',
460 |                       default=TMP_PATH)
461 |   parser.add_argument('--schema_db', metavar='schema_db', type=str,
462 |                       help='MongoDB database name to store schema. If not provided, default to source db.')
463 |   parser.add_argument('--schema_collection', metavar='schema_collection', type=str,
464 |                       help='MongoDB collection name to store schema. If not provided, default to [source_collection]_schema')
465 |   parser.add_argument('--write_disposition', metavar='write_disposition', type=str,
466 |                       help='overwrite or append. Default is overwrite', default='overwrite', choices=['overwrite', 'append'])
467 |   parser.add_argument('--dest_db_name', metavar='dest_db_name', type=str,
468 |                       help='Hive database name. If not provided, default to \'default\' hive database.')
469 |   parser.add_argument('--dest_table_name', metavar='dest_table_name', type=str,
470 |                       help='Hive table name. If not provided, default to source collection name.')
471 |   parser.add_argument('--use_mr', action='store_true')
472 |   parser.add_argument('--policy_file', metavar='policy_file', type=str,
473 |                       help='Data Policy file name.')
474 |   parser.add_argument('--infra_type', metavar='infra_type', type=str, default='hadoop',
475 |                       help='Infrastructure type. One of hadoop or gcloud')
476 | 
477 |   # hive related parameters
478 |   parser.add_argument('--hiveserver_host', metavar='hiveserver_host', type=str, required=False, help='Hiveserver host')
479 |   parser.add_argument('--hiveserver_port', metavar='hiveserver_port', type=str, required=False, help='Hiveserver port')
480 | 
481 |   # gcloud related parameters
482 |   parser.add_argument('--gcloud_project_id', metavar='gcloud_project_id', type=str, required=False, help='GCloud project id')
483 |   parser.add_argument('--gcloud_storage_bucket_id', metavar='gcloud_storage_bucket_id', type=str, required=False, help='GCloud storage bucket id')
484 | 
485 |   args = parser.parse_args()
486 | 
487 |   # global mongo_uri, db_name, collection_name, extract_query, tmp_path, schema_db_name, schema_collection_name, use_mr
488 |   loader = Loader()
489 |   loader.infra_type = args.infra_type
490 |   loader.mongo_uri = args.mongo
491 |   loader.db_name = args.source_db
492 |   loader.collection_name = args.source_collection
493 |   loader.collection_sort_by_field = args.source_sort_by_field
494 |   loader.extract_query = args.query
495 |   loader.tmp_path = args.tmp_path
496 | 
497 |   if args.schema_db != None:
498 |     loader.schema_db_name = args.schema_db
499 |   else:
500 |     loader.schema_db_name = args.source_db
501 | 
502 |   if args.schema_collection != None:
503 |     loader.schema_collection_name = args.schema_collection
504 |   else:
505 |     loader.schema_collection_name = "%s_schema" % args.source_collection
506 | 
507 |   if args.infra_type == 'hadoop':
508 |     if args.hiveserver_host is None:
509 |       raise ValueError("hiveserver_host must be specified for 'hadoop' infrastructure type.")
510 |     if args.hiveserver_port is None:
511 |       raise ValueError("hiveserver_port must be specified for 'hadoop' infrastructure type.")
512 | 
513 |     loader.hiveserver_host = args.hiveserver_host
514 |     loader.hiveserver_port = args.hiveserver_port
515 |   else:
516 |     if args.gcloud_project_id is None:
517 |       raise ValueError("gcloud_project_id must be specified for 'gcloud' infrastructure type.")
518 |     if args.gcloud_storage_bucket_id is None:
519 |       raise ValueError("gcloud_storage_bucket_id must be specified for 'gcloud' infrastructure type.")
520 | 
521 |     loader.gcloud_project_id = args.gcloud_project_id
522 |     loader.gcloud_storage_bucket_id = args.gcloud_storage_bucket_id
523 | 
524 |   loader.write_disposition = args.write_disposition
525 | 
526 |   if args.dest_table_name != None:
527 |     loader.dw_table_name = args.dest_table_name
528 |   else:
529 |     loader.dw_table_name = args.source_collection
530 | 
531 |   if args.dest_db_name != None:
532 |     loader.dw_database_name = args.dest_db_name
533 | 
534 |   if args.use_mr:
535 |     loader.use_mr = args.use_mr
536 | 
537 |   if args.policy_file != None:
538 |     # open policy file
539 |     policy_file = open(args.policy_file, "r")
540 |     loader.policies = json.loads(policy_file.read())
541 | 
542 |   loader.run()
543 | 
544 | 
545 | if __name__ == '__main__':
546 |   main()
547 | 


--------------------------------------------------------------------------------
/onefold_util.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #
 4 | # Copyright 2015, OneFold
 5 | # All rights reserved.
 6 | # http://www.onefold.io
 7 | #
 8 | # Author: Jorge Chang
 9 | #
10 | # See license in LICENSE file.
11 | #
12 | # OneFold utility functions - mainly for executing shell commands.
13 | #
14 | 
15 | import subprocess
16 | import os
17 | import random
18 | import time
19 | 
20 | # execute shell command
21 | def execute(command, ignore_error=False, retry=False, subpress_output=False):
22 | 
23 |   if retry:
24 |     num_retries = 5
25 |   else:
26 |     num_retries = 1
27 | 
28 |   l = range(0,num_retries)
29 |   for n in l:
30 |     try:
31 |       print 'Executing command: %s' % command
32 | 
33 |       if subpress_output:
34 |         devnull = open(os.devnull, 'w')
35 |         rc = subprocess.call(command, shell=True, stdout=devnull, stderr=devnull)
36 |       else:
37 |         rc = subprocess.call(command, shell=True)
38 | 
39 |       if rc:
40 |         # Non-zero return code indicates an error.
41 |         if not ignore_error:
42 |           raise Exception("Error executing command: %s" % command)
43 | 
44 |       # if command ran successfully, return!
45 |       return
46 |     except:
47 |       if retry:
48 |         # Apply exponential backoff.
49 |         print 'Retry-able. Sleeping...'
50 |         time.sleep((2 ** n) + random.randint(0, 1000) / 1000)
51 |       else:
52 |         raise
53 | 
54 |   # only reach this point if we've re-tried and still failed.
55 |   if retry:
56 |     print "Retries exceeded (%s times). Throwing exception.." % num_retries
57 |     raise Exception ("Retries exceeded (%s times) when executing this command." % num_retries)
58 | 
59 | 
60 | def execute_and_read_with_retry(command):
61 |   for n in range(0,5):
62 |     (return_code, stdout_lines, stderr_lines) = execute_and_read(command)
63 |     if return_code == 0:
64 |       break
65 |     else:
66 |       print "Error executing command: %s with return code %s" % (command, return_code)
67 |       print 'Retry-able. Sleeping...'
68 |       time.sleep((2 ** n) + random.randint(0, 1000) / 1000)
69 | 
70 |   return (return_code, stdout_lines, stderr_lines)
71 | 
72 | 
73 | # execute shell command and return stdout as list of strings
74 | def execute_and_read(command):
75 |   # run command and read stdout
76 |   print 'Executing command: %s' % command
77 |   p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
78 |   p.wait()
79 | 
80 |   return_code = p.returncode
81 | 
82 |   stdout_lines = p.stdout.readlines()
83 |   # print stdout
84 |   for line in stdout_lines:
85 |     print line.strip()
86 | 
87 |   stderr_lines = p.stderr.readlines()
88 |   # print stderr
89 |   for line in stderr_lines:
90 |     print line.strip()
91 | 
92 |   return (return_code, stdout_lines, stderr_lines)
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------