├── LICENSE.txt ├── README.md ├── cloudburst ├── .DS_Store ├── LICENSE.txt └── src │ └── cloudBurst │ ├── AlignInfo.java │ ├── AlignmentRecord.java │ ├── AlignmentStats.java │ ├── CloudBurst.java │ ├── ConvertFastaForCloud.java │ ├── CountKmers.java │ ├── DNAString.java │ ├── DisplaySequenceFile.java │ ├── FastaRecord.java │ ├── FilterAlignments.java │ ├── LandauVishkin.java │ ├── MerRecord.java │ ├── MerReduce.java │ ├── PrintAlignments.java │ ├── SharedSeedRecord.java │ ├── SubstringTester.java │ └── Timer.java ├── cloudfront ├── .DS_Store └── code │ ├── .DS_Store │ ├── CHANGES.txt │ ├── LICENSE.txt │ ├── NOTICE.txt │ ├── README.TXT │ ├── build.xml │ ├── lib │ ├── cascading-1.0.9 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── cascading-1.0.9.jar │ │ ├── cascading-core-1.0.9.jar │ │ ├── cascading-test-1.0.9.jar │ │ └── cascading-xml-1.0.9.jar │ ├── commons-cli-1.2 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── commons-cli-1.2-javadoc.jar │ │ ├── commons-cli-1.2-sources.jar │ │ └── commons-cli-1.2.jar │ ├── hadoop-0.18.3 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── hadoop-0.18.3-ant.jar │ │ ├── hadoop-0.18.3-core.jar │ │ ├── hadoop-0.18.3-examples.jar │ │ ├── hadoop-0.18.3-test.jar │ │ └── hadoop-0.18.3-tools.jar │ ├── janino-2.5.15 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ └── janino-2.5.15.jar │ └── jgrapht-jdk1.6 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ └── jgrapht-jdk1.6.jar │ └── src │ └── logprocessor │ ├── Main.java │ ├── enums │ ├── Columns.java │ └── ReportNames.java │ ├── reporters │ ├── FieldAggregatedReporterAssembly.java │ └── TimeBucketedReporterAssembly.java │ └── s3copy │ ├── CopyFromS3.java │ ├── DatePathFilter.java │ ├── HDFSWriterReducer.java │ └── S3CopyMapper.java ├── emrfs-plugins └── EMRFSRSAEncryptionMaterialsProvider │ ├── README.md │ ├── pom.xml │ └── src │ └── main │ └── java │ └── com │ └── amazon │ └── ws │ └── emr │ └── hadoop │ └── fs │ └── cse │ └── RSAEncryptionMaterialsProvider.java ├── freebase ├── .DS_Store └── code │ ├── aws_sdb.rb │ ├── base64.rb │ ├── freebase_jobflow.json │ ├── mapper.py │ ├── name_mapper.rb │ ├── name_reducer.rb │ ├── top_sdb_mapper.rb │ └── top_sdb_reducer.rb ├── hive-ads ├── .DS_Store ├── README.md └── libs │ ├── .DS_Store │ ├── join-clicks-to-impressions.q │ ├── model-build.q │ ├── response-time-stats.q │ ├── split_user_agent.py │ ├── twitter-impressions.q │ ├── upload-to-simple-db │ └── wait-for.sh ├── node ├── README.txt ├── sample-mapper.js └── sample-reducer.js ├── pig-apache ├── .DS_Store ├── do-reports.pig └── do-reports2.pig ├── powershell ├── Create-InteractiveHiveJob.ps1 └── RunHiveDemoJob.ps1 ├── similarity ├── .DS_Store ├── LICENSE.txt ├── README.md ├── convert_netflix.py ├── lastfm_jobflow.json ├── netflix_jobflow.json ├── similarity.py └── user_count_mapper.py ├── spark ├── .DS_Store ├── Shark sample query.txt └── Spark sample query.txt └── wordcount ├── .DS_Store └── wordSplitter.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | 4 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 5 | 6 | 1. Definitions. 7 | 8 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 9 | 10 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
11 | 12 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 13 | 14 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 15 | 16 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 17 | 18 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 19 | 20 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 21 | 22 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 23 | 24 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 25 | 26 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 27 | 28 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 29 | 30 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 31 | 32 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 33 | 34 | 1. You must give any other recipients of the Work or Derivative Works a copy of this License; and 35 | 2. You must cause any modified files to carry prominent notices stating that You changed the files; and 36 | 3. You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 37 | 4. If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 38 | 39 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 40 | 41 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 42 | 43 | 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 44 | 45 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 46 | 47 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 48 | 49 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 50 | 51 | END OF TERMS AND CONDITIONS 52 | 53 | Note: Other license terms may apply to certain, identified software files contained within or distributed with the accompanying software if such terms are included in the directory containing the accompanying software. Such other license terms will then apply in lieu of the terms of the software license above. 54 | 55 | JSON processing code subject to the JSON License from JSON.org: 56 | 57 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 58 | 59 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 60 | 61 | The Software shall be used for Good, not Evil. 62 | 63 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The code samples in this repository are meant to illustrate how to use popular applications on Amazon EMR. 2 | They are not meant to be run in production and all users should carefully inspect code samples before running them. 3 | 4 | Use at your own risk. 5 | -------------------------------------------------------------------------------- /cloudburst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-archives/emr-sample-apps/49fe298a3dd7a48dec56771d613a778ee89dfbf8/cloudburst/.DS_Store -------------------------------------------------------------------------------- /cloudburst/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2011-2013 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"). You 4 | may not use this file except in compliance with the License. A copy of 5 | the License is located at 6 | 7 | http://aws.amazon.com/apache2.0/ 8 | 9 | or in the "license" file accompanying this file. This file is 10 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | ANY KIND, either express or implied. See the License for the specific 12 | language governing permissions and limitations under the License. 
13 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/AlignInfo.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | public class AlignInfo { 4 | public int alignlen; 5 | public int differences; 6 | public int [] dist; 7 | public int [] what; 8 | public int distlen; 9 | 10 | private static final StringBuilder builder = new StringBuilder(); 11 | 12 | 13 | //------------------------- Constructor -------------------------- 14 | public AlignInfo(int len, int k, int [] pdist, int [] pwhat, int dlen) 15 | { 16 | setVals(len, k, pdist, pwhat, dlen); 17 | } 18 | 19 | 20 | //------------------------- setVals -------------------------- 21 | public void setVals(int len, int k, int [] pdist, int [] pwhat, int dlen) 22 | { 23 | alignlen = len; 24 | differences = k; 25 | dist = pdist; 26 | what = pwhat; 27 | distlen = dlen; 28 | } 29 | 30 | 31 | //------------------------- isBazeaYatesSeed -------------------------- 32 | // Since an alignment may be recomputed k+1 times, once for each of the k+1 seeds, 33 | // see if the current alignment is the leftmost alignment by checking for 34 | // differences in the preceding chunks of the query 35 | 36 | public boolean isBazeaYatesSeed(int qlen, int kmerlen) 37 | { 38 | int numBuckets = qlen / kmerlen; 39 | 40 | int lastbucket = -1; 41 | int distdelta = 0; 42 | int pos = 0; 43 | 44 | for (int i = 0; i < distlen; i++) 45 | { 46 | pos += dist[i] + distdelta; 47 | 48 | distdelta = 0; 49 | if (what [i] == 2) 50 | { 51 | // end of string 52 | continue; 53 | } 54 | else if (what[i] == -1) 55 | { 56 | // gap character occurs between pos and pos+1 57 | if (pos % kmerlen == 0) 58 | { 59 | // occurs right between buckets, skip 60 | continue; 61 | } 62 | } 63 | 64 | int bucket = pos / kmerlen; 65 | if (bucket - lastbucket > 1) { return false; } 66 | lastbucket = bucket; 67 | } 68 | 69 | return (lastbucket == numBuckets-1); 70 | } 71 | 72 | 73 | //------------------------- isBazeaYatesSeedDebug -------------------------- 74 | // Run isBazeaYatesSeed, but output some debugging info 75 | 76 | public boolean isBazeaYatesSeedDebug(int qlen, int kmerlen) 77 | { 78 | int numBuckets = qlen / kmerlen; 79 | 80 | System.out.println("KMER_LEN: " + kmerlen + " numbuckets: " + numBuckets); 81 | 82 | int lastbucket = -1; 83 | int distdelta = 0; 84 | int pos = 0; 85 | for (int i = 0; i < distlen; i++) 86 | { 87 | pos += dist[i] + distdelta; 88 | 89 | distdelta = 0; 90 | if (what [i] == 2) 91 | { 92 | // end of string 93 | continue; 94 | } 95 | else if (what[i] == 0) 96 | { 97 | 98 | } 99 | else if (what[i] == 1) 100 | { 101 | 102 | } 103 | else if (what[i] == -1) 104 | { 105 | // gap character occurs between pos and pos+1 106 | if (pos % kmerlen == 0) 107 | { 108 | // occurs right between buckets, skip 109 | System.out.println(i + ": pos: " + pos 110 | + " dist: " + dist[i] 111 | + " what: " + what[i] 112 | + " between bucket: " + pos/kmerlen); 113 | continue; 114 | } 115 | } 116 | 117 | 118 | int bucket = pos / kmerlen; 119 | 120 | System.out.println(i + ": pos: " + pos 121 | + " dist: " + dist[i] 122 | + " what: " + what[i] 123 | + " bucket: " + bucket); 124 | if (bucket - lastbucket > 1) { return false; } 125 | lastbucket = bucket; 126 | } 127 | 128 | return (lastbucket == numBuckets-1); 129 | } 130 | 131 | 132 | 133 | //------------------------- toString -------------------------- 134 | public String toString() 135 | { 136 | builder.setLength(0); 137 | 138 | 
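/*
 * Background for the seed test above (illustrative note): CloudBurst follows the
 * Baeza-Yates and Perleberg seed-and-extend scheme, which the "BazeaYates" in the
 * method names refers to. A read aligned with at most k differences is split into
 * k+1 non-overlapping chunks of kmerlen bases, so by the pigeonhole principle at
 * least one chunk matches the reference exactly, and the same alignment can be
 * rediscovered from up to k+1 seeds. For example, a 36bp read with k = 3 yields
 * four 9bp seeds. isBazeaYatesSeed() appears to deduplicate by accepting an
 * alignment only when every kmerlen-sized bucket it spans contains a difference,
 * i.e. when no earlier chunk could itself have served as an exact seed, so each
 * alignment is reported once, from its leftmost seed.
 */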
builder.append(alignlen); builder.append(';'); 139 | builder.append(differences); builder.append(';'); 140 | 141 | for (int i = 0; i < distlen; i++) 142 | { 143 | builder.append(dist[i]); builder.append(';'); 144 | } 145 | 146 | for (int i = 0; i < distlen; i++) 147 | { 148 | builder.append(what[i]); builder.append(';'); 149 | } 150 | 151 | return builder.toString(); 152 | } 153 | 154 | 155 | //------------------------- printAlignment -------------------------- 156 | // print out the aligned strings, with gaps as necessary 157 | 158 | public void printAlignment(byte[] t, byte[] p) 159 | { 160 | if (dist == null) 161 | { 162 | System.out.print("t: " ); 163 | for (int i = 0; i < t.length; i++) 164 | { 165 | System.out.print((char) t[i]); 166 | } 167 | } 168 | else 169 | { 170 | System.out.print("a: "); 171 | int pos = 0; 172 | int nextstride = 0; 173 | for (int i = 0; i < distlen; i++) 174 | { 175 | if (what[i] == 2) { break; } 176 | int stride = dist[i] + nextstride; 177 | for (int j = 0; j < stride; j++) { System.out.print(" "); } 178 | System.out.print("*"); 179 | 180 | nextstride = 0; 181 | if ((what[i] == 1) || (what[i] == 0)) { nextstride = -1; } 182 | } 183 | 184 | System.out.println(); 185 | 186 | 187 | System.out.print("t: "); 188 | pos = 0; 189 | nextstride = 0; 190 | for (int i = 0; i < distlen; i++) 191 | { 192 | int stride = dist[i] + nextstride; 193 | 194 | if (what[i] == 1) { nextstride = -1; } 195 | else { nextstride = 0; } 196 | 197 | for (int j = 0; j < stride; j++, pos++) 198 | { 199 | System.out.print((char) t[pos]); 200 | } 201 | 202 | if (what[i] == 1) 203 | { 204 | System.out.print('-'); 205 | } 206 | else if (what[i] == -1) 207 | { 208 | System.out.print((char) t[pos]); 209 | pos++; 210 | } 211 | } 212 | } 213 | 214 | System.out.println(); 215 | 216 | System.out.print("p: "); 217 | 218 | if (dist == null) 219 | { 220 | for (int i = 0; i < p.length; i++) 221 | { 222 | System.out.print((char) p[i]); 223 | } 224 | } 225 | else 226 | { 227 | int pos = 0; 228 | for (int i = 0; i < distlen; i++) 229 | { 230 | for (int j = 0; j < dist[i]; j++, pos++) 231 | { 232 | System.out.print((char) p[pos]); 233 | } 234 | 235 | if (what[i] == -1) { System.out.print('-'); } 236 | } 237 | } 238 | 239 | System.out.println(); 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/AlignmentRecord.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import org.apache.hadoop.io.BytesWritable; 4 | import org.apache.hadoop.io.Text; 5 | import java.io.IOException; 6 | 7 | public class AlignmentRecord 8 | { 9 | public int m_refID; 10 | public int m_refStart = 0; 11 | public int m_refEnd = 0; 12 | public int m_differences = 0; 13 | public boolean m_isRC = false; 14 | 15 | private static final StringBuilder builder = new StringBuilder(); 16 | private static final BytesWritable bytes = new BytesWritable(); 17 | private static final byte [] sbuffer = new byte[17]; 18 | 19 | AlignmentRecord() 20 | { 21 | 22 | } 23 | 24 | AlignmentRecord(int refid, int refstart, int refend, int differences, boolean rc) 25 | { 26 | m_refID = refid; 27 | m_refStart = refstart; 28 | m_refEnd = refend; 29 | m_differences = differences; 30 | m_isRC = rc; 31 | } 32 | 33 | AlignmentRecord(AlignmentRecord other) 34 | { 35 | set(other); 36 | } 37 | 38 | AlignmentRecord(Text t) throws IOException 39 | { 40 | fromText(t); 41 | } 42 | 43 | AlignmentRecord(BytesWritable b) throws IOException 44 
| { 45 | fromBytes(b); 46 | } 47 | 48 | public void set(AlignmentRecord other) 49 | { 50 | m_refID = other.m_refID; 51 | m_refStart = other.m_refStart; 52 | m_refEnd = other.m_refEnd; 53 | m_differences = other.m_differences; 54 | m_isRC = other.m_isRC; 55 | } 56 | 57 | public Text toText() 58 | { 59 | return new Text(toString()); 60 | } 61 | 62 | public String toString() 63 | { 64 | builder.setLength(0); 65 | 66 | builder.append(m_refID); builder.append('\t'); 67 | builder.append(m_isRC ? 1 : 0); builder.append('\t'); 68 | builder.append(m_refStart); builder.append('\t'); 69 | builder.append(m_refEnd); builder.append('\t'); 70 | builder.append(m_differences); 71 | 72 | return builder.toString(); 73 | } 74 | 75 | public String toAlignment(int readid) 76 | { 77 | builder.setLength(0); 78 | 79 | builder.append(m_refID); builder.append('\t'); 80 | builder.append(m_refStart); builder.append('\t'); 81 | builder.append(m_refEnd); builder.append('\t'); 82 | builder.append(readid); builder.append('\t'); 83 | builder.append(m_differences); builder.append('\t'); 84 | builder.append(m_isRC ? "-" : "+"); 85 | 86 | return builder.toString(); 87 | } 88 | 89 | public AlignmentRecord fromText(Text t) 90 | { 91 | String [] vals = t.toString().split("\t", 5); 92 | 93 | m_refID = Integer.parseInt(vals[0]); 94 | m_isRC = Integer.parseInt(vals[1]) == 1; 95 | m_refStart = Integer.parseInt(vals[2]); 96 | m_refEnd = Integer.parseInt(vals[3]); 97 | m_differences = Integer.parseInt(vals[4]); 98 | 99 | return this; 100 | } 101 | 102 | public BytesWritable toBytes() //throws IOException 103 | { 104 | sbuffer[0] = (byte) (m_isRC ? 1 : 0); 105 | 106 | sbuffer[1] = (byte) ((m_refID & 0xFF000000) >> 24); 107 | sbuffer[2] = (byte) ((m_refID & 0x00FF0000) >> 16); 108 | sbuffer[3] = (byte) ((m_refID & 0x0000FF00) >> 8); 109 | sbuffer[4] = (byte) ((m_refID & 0x000000FF)); 110 | 111 | sbuffer[5] = (byte) ((m_refStart & 0xFF000000) >> 24); 112 | sbuffer[6] = (byte) ((m_refStart & 0x00FF0000) >> 16); 113 | sbuffer[7] = (byte) ((m_refStart & 0x0000FF00) >> 8); 114 | sbuffer[8] = (byte) ((m_refStart & 0x000000FF)); 115 | 116 | sbuffer[9] = (byte) ((m_refEnd & 0xFF000000) >> 24); 117 | sbuffer[10] = (byte) ((m_refEnd & 0x00FF0000) >> 16); 118 | sbuffer[11] = (byte) ((m_refEnd & 0x0000FF00) >> 8); 119 | sbuffer[12] = (byte) ((m_refEnd & 0x000000FF)); 120 | 121 | sbuffer[13] = (byte) ((m_differences & 0xFF000000) >> 24); 122 | sbuffer[14] = (byte) ((m_differences & 0x00FF0000) >> 16); 123 | sbuffer[15] = (byte) ((m_differences & 0x0000FF00) >> 8); 124 | sbuffer[16] = (byte) ((m_differences & 0x000000FF)); 125 | 126 | bytes.set(sbuffer, 0, 17); 127 | return bytes; 128 | } 129 | 130 | 131 | public void fromBytes(BytesWritable t) 132 | { 133 | byte [] raw = t.get(); 134 | 135 | m_isRC = raw[0] == 1; 136 | 137 | m_refID = (raw[1] & 0xFF) << 24 138 | | (raw[2] & 0xFF) << 16 139 | | (raw[3] & 0xFF) << 8 140 | | (raw[4] & 0xFF); 141 | 142 | m_refStart = (raw[5] & 0xFF) << 24 143 | | (raw[6] & 0xFF) << 16 144 | | (raw[7] & 0xFF) << 8 145 | | (raw[8] & 0xFF); 146 | 147 | m_refEnd = (raw[9] & 0xFF) << 24 148 | | (raw[10] & 0xFF) << 16 149 | | (raw[11] & 0xFF) << 8 150 | | (raw[12] & 0xFF); 151 | 152 | m_differences = (raw[13] & 0xFF) << 24 153 | | (raw[14] & 0xFF) << 16 154 | | (raw[15] & 0xFF) << 8 155 | | (raw[16] & 0xFF); 156 | } 157 | } -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/AlignmentStats.java: 
-------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.SequenceFile; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.JobConf; 11 | import org.apache.hadoop.fs.FileStatus; 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.fs.PathFilter; 15 | 16 | 17 | public class AlignmentStats { 18 | 19 | private static int numalignments = 0; 20 | private static int rcalignments = 0; 21 | private static int numfiles = 0; 22 | private static Set reads = new HashSet(); 23 | private static AlignmentRecord ar = new AlignmentRecord(); 24 | private static JobConf conf = null; 25 | 26 | public static void fileStats(Path thePath) throws Exception 27 | { 28 | SequenceFile.Reader theReader = new SequenceFile.Reader(FileSystem.get(conf), thePath, conf); 29 | IntWritable key = new IntWritable(); 30 | Text value = new Text(); 31 | numfiles++; 32 | 33 | while(theReader.next(key,value)) 34 | { 35 | numalignments++; 36 | 37 | int thisread = key.get(); 38 | 39 | if (ar.fromText(value).m_isRC) 40 | { 41 | rcalignments++; 42 | } 43 | 44 | reads.add(thisread); 45 | } 46 | } 47 | 48 | public static void stats(Path thePath) throws Exception 49 | { 50 | conf = new JobConf(AlignmentStats.class); 51 | 52 | FileSystem fs = FileSystem.get(conf); 53 | 54 | if (!fs.exists(thePath)) 55 | { 56 | throw new IOException(thePath + " not found"); 57 | } 58 | 59 | FileStatus status = fs.getFileStatus(thePath); 60 | 61 | if (status.isDir()) 62 | { 63 | FileStatus [] files = fs.listStatus(thePath); 64 | for(FileStatus file : files) 65 | { 66 | String str = file.getPath().getName(); 67 | 68 | if (str.startsWith(".")) 69 | { 70 | // skip 71 | } 72 | else if (!file.isDir()) 73 | { 74 | fileStats(file.getPath()); 75 | } 76 | } 77 | } 78 | else 79 | { 80 | fileStats(thePath); 81 | } 82 | 83 | 84 | int numreads = reads.size(); 85 | 86 | System.out.println(numfiles + " files processed"); 87 | System.out.println(numalignments + " Total Alignments"); 88 | System.out.println(rcalignments + " RC Alignments"); 89 | System.out.println(numreads + " Reads Aligned"); 90 | } 91 | 92 | public static void main(String[] args) throws Exception 93 | { 94 | String path = "/user/guest/br-results/"; //args[0]; 95 | stats(new Path(path)); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/CloudBurst.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | 4 | import java.io.IOException; 5 | 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.BytesWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.mapred.FileInputFormat; 11 | import org.apache.hadoop.mapred.FileOutputFormat; 12 | import org.apache.hadoop.mapred.JobClient; 13 | import org.apache.hadoop.mapred.JobConf; 14 | import org.apache.hadoop.mapred.RunningJob; 15 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 16 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 17 | 18 | import cloudBurst.MerReduce.MapClass; 19 | import cloudBurst.MerReduce.ReduceClass; 20 | 21 | import cloudBurst.FilterAlignments.FilterCombinerClass; 22 | import 
cloudBurst.FilterAlignments.FilterMapClass; 23 | import cloudBurst.FilterAlignments.FilterReduceClass; 24 | 25 | public class CloudBurst { 26 | 27 | // Make sure this number is longer than the longest read 28 | public static final int CHUNK_OVERLAP = 1024; 29 | 30 | 31 | //------------------------- alignall -------------------------- 32 | // Setup and run the hadoop job for running the alignment 33 | 34 | public static RunningJob alignall(String refpath, 35 | String qrypath, 36 | String outpath, 37 | int MIN_READ_LEN, 38 | int MAX_READ_LEN, 39 | int K, 40 | int ALLOW_DIFFERENCES, 41 | boolean FILTER_ALIGNMENTS, 42 | int NUM_MAP_TASKS, 43 | int NUM_REDUCE_TASKS, 44 | int BLOCK_SIZE, 45 | int REDUNDANCY) throws IOException, Exception 46 | { 47 | int SEED_LEN = MIN_READ_LEN / (K+1); 48 | int FLANK_LEN = MAX_READ_LEN-SEED_LEN+K; 49 | 50 | System.out.println("refpath: " + refpath); 51 | System.out.println("qrypath: " + qrypath); 52 | System.out.println("outpath: " + outpath); 53 | System.out.println("MIN_READ_LEN: " + MIN_READ_LEN); 54 | System.out.println("MAX_READ_LEN: " + MAX_READ_LEN); 55 | System.out.println("K: " + K); 56 | System.out.println("SEED_LEN: " + SEED_LEN); 57 | System.out.println("FLANK_LEN: " + FLANK_LEN); 58 | System.out.println("ALLOW_DIFFERENCES: " + ALLOW_DIFFERENCES); 59 | System.out.println("FILTER_ALIGNMENTS: " + FILTER_ALIGNMENTS); 60 | System.out.println("NUM_MAP_TASKS: " + NUM_MAP_TASKS); 61 | System.out.println("NUM_REDUCE_TASKS: " + NUM_REDUCE_TASKS); 62 | System.out.println("BLOCK_SIZE: " + BLOCK_SIZE); 63 | System.out.println("REDUNDANCY: " + REDUNDANCY); 64 | 65 | JobConf conf = new JobConf(MerReduce.class); 66 | conf.setJobName("CloudBurst"); 67 | conf.setNumMapTasks(NUM_MAP_TASKS); 68 | conf.setNumReduceTasks(NUM_REDUCE_TASKS); 69 | 70 | FileInputFormat.addInputPath(conf, new Path(refpath)); 71 | FileInputFormat.addInputPath(conf, new Path(qrypath)); 72 | 73 | conf.set("refpath", refpath); 74 | conf.set("qrypath", qrypath); 75 | conf.set("MIN_READ_LEN", Integer.toString(MIN_READ_LEN)); 76 | conf.set("MAX_READ_LEN", Integer.toString(MAX_READ_LEN)); 77 | conf.set("K", Integer.toString(K)); 78 | conf.set("SEED_LEN", Integer.toString(SEED_LEN)); 79 | conf.set("FLANK_LEN", Integer.toString(FLANK_LEN)); 80 | conf.set("ALLOW_DIFFERENCES", Integer.toString(ALLOW_DIFFERENCES)); 81 | conf.set("BLOCK_SIZE", Integer.toString(BLOCK_SIZE)); 82 | conf.set("REDUNDANCY", Integer.toString(REDUNDANCY)); 83 | conf.set("FILTER_ALIGNMENTS", (FILTER_ALIGNMENTS ? 
"1" : "0")); 84 | 85 | conf.setMapperClass(MapClass.class); 86 | 87 | conf.setInputFormat(SequenceFileInputFormat.class); 88 | conf.setMapOutputKeyClass(BytesWritable.class); 89 | conf.setMapOutputValueClass(BytesWritable.class); 90 | 91 | conf.setReducerClass(ReduceClass.class); 92 | conf.setOutputKeyClass(IntWritable.class); 93 | conf.setOutputValueClass(BytesWritable.class); 94 | conf.setOutputFormat(SequenceFileOutputFormat.class); 95 | 96 | Path oPath = new Path(outpath); 97 | FileOutputFormat.setOutputPath(conf, oPath); 98 | System.err.println(" Removing old results"); 99 | FileSystem.get(conf).delete(oPath); 100 | 101 | RunningJob rj = JobClient.runJob(conf); 102 | System.err.println("CloudBurst Finished"); 103 | return rj; 104 | } 105 | 106 | 107 | //------------------------- filter -------------------------- 108 | // Setup and run the hadoop job for filtering the alignments to just report unambiguous bests 109 | 110 | public static void filter(String alignpath, 111 | String outpath, 112 | int nummappers, 113 | int numreducers) throws IOException, Exception 114 | { 115 | System.out.println("NUM_FMAP_TASKS: " + nummappers); 116 | System.out.println("NUM_FREDUCE_TASKS: " + numreducers); 117 | 118 | JobConf conf = new JobConf(FilterAlignments.class); 119 | conf.setJobName("FilterAlignments"); 120 | conf.setNumMapTasks(nummappers); 121 | conf.setNumReduceTasks(numreducers); 122 | 123 | FileInputFormat.addInputPath(conf, new Path(alignpath)); 124 | 125 | conf.setMapperClass(FilterMapClass.class); 126 | 127 | conf.setInputFormat(SequenceFileInputFormat.class); 128 | conf.setMapOutputKeyClass(IntWritable.class); 129 | conf.setMapOutputValueClass(BytesWritable.class); 130 | 131 | conf.setCombinerClass(FilterCombinerClass.class); 132 | 133 | conf.setReducerClass(FilterReduceClass.class); 134 | conf.setOutputKeyClass(IntWritable.class); 135 | conf.setOutputValueClass(BytesWritable.class); 136 | conf.setOutputFormat(SequenceFileOutputFormat.class); 137 | 138 | Path oPath = new Path(outpath); 139 | FileOutputFormat.setOutputPath(conf, oPath); 140 | System.err.println(" Removing old results"); 141 | FileSystem.get(conf).delete(oPath); 142 | 143 | JobClient.runJob(conf); 144 | 145 | System.err.println("FilterAlignments Finished"); 146 | } 147 | 148 | 149 | //------------------------- main -------------------------- 150 | // Parse the command line options, run alignment and filtering 151 | 152 | public static void main(String[] args) throws Exception 153 | { 154 | String refpath = null; 155 | String qrypath = null; 156 | String outpath = null; 157 | 158 | int K = 0; 159 | int readlen = 0; 160 | int allowdifferences = 0; 161 | 162 | int nummappers = 1; 163 | int numreducers = 1; 164 | int numfmappers = 1; 165 | int numfreducers = 1; 166 | int blocksize = 128; 167 | int redundancy = 1; 168 | 169 | boolean filteralignments = false; 170 | 171 | int local = 0; // set to zero to use command line arguments 172 | 173 | if (local == 1) 174 | { 175 | refpath = "/user/guest/cloudburst/s_suis.br"; 176 | qrypath = "/user/guest/cloudburst/100k.br"; 177 | outpath = "/user/guest/br-results"; 178 | readlen = 36; 179 | 180 | K = 3; 181 | allowdifferences = 0; 182 | filteralignments = true; 183 | redundancy = 2; 184 | } 185 | else if (args.length < 13) 186 | { 187 | System.err.println("Usage: CloudBurst refpath qrypath outpath readlen k allowdifferences filteralignments #mappers #reduces #fmappers #freducers blocksize redundancy"); 188 | return; 189 | } 190 | else 191 | { 192 | refpath = args[0]; 193 | qrypath = 
args[1]; 194 | outpath = args[2]; 195 | readlen = Integer.parseInt(args[3]); 196 | K = Integer.parseInt(args[4]); 197 | allowdifferences = Integer.parseInt(args[5]); 198 | filteralignments = Integer.parseInt(args[6]) == 1; 199 | nummappers = Integer.parseInt(args[7]); 200 | numreducers = Integer.parseInt(args[8]); 201 | numfmappers = Integer.parseInt(args[9]); 202 | numfreducers = Integer.parseInt(args[10]); 203 | blocksize = Integer.parseInt(args[11]); 204 | redundancy = Integer.parseInt(args[12]); 205 | } 206 | 207 | if (redundancy < 1) { System.err.println("minimum redundancy is 1"); return; } 208 | 209 | if (readlen > CHUNK_OVERLAP) 210 | { 211 | System.err.println("Increase CHUNK_OVERLAP for " + readlen + " length reads, and reconvert fasta file"); 212 | return; 213 | } 214 | 215 | // start the timer 216 | Timer all = new Timer(); 217 | 218 | String alignpath = outpath; 219 | if (filteralignments) { alignpath += "-alignments"; } 220 | 221 | 222 | // run the alignments 223 | Timer talign = new Timer(); 224 | alignall(refpath, qrypath, alignpath, readlen, readlen, K, allowdifferences, filteralignments, 225 | nummappers, numreducers, blocksize, redundancy); 226 | System.err.println("Alignment time: " + talign.get()); 227 | 228 | 229 | // filter to report best alignments 230 | if (filteralignments) 231 | { 232 | Timer tfilter = new Timer(); 233 | filter(alignpath, outpath, numfmappers, numfreducers); 234 | 235 | System.err.println("Filtering time: " + tfilter.get()); 236 | } 237 | 238 | System.err.println("Total Running time: " + all.get()); 239 | }; 240 | } 241 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/ConvertFastaForCloud.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.BytesWritable; 14 | import org.apache.hadoop.io.IntWritable; 15 | import org.apache.hadoop.io.SequenceFile; 16 | import org.apache.hadoop.io.SequenceFile.Writer; 17 | import org.apache.hadoop.mapred.JobConf; 18 | 19 | 20 | public class ConvertFastaForCloud { 21 | 22 | private static final FastaRecord record = new FastaRecord(); 23 | 24 | public static int min_seq_len = Integer.MAX_VALUE; 25 | public static int max_seq_len = 0; 26 | 27 | public static int min(int a, int b) 28 | { 29 | if (a < b) return a; 30 | return b; 31 | } 32 | 33 | public static int max(int a, int b) 34 | { 35 | if (a > b) return a; 36 | return b; 37 | } 38 | 39 | private static IntWritable iw = new IntWritable(); 40 | 41 | public static void saveSequence(int id, StringBuilder sequence, Writer writer) throws IOException 42 | { 43 | int fulllength = sequence.length(); 44 | int maxchunk = 65535; 45 | 46 | if (fulllength < min_seq_len) { min_seq_len = fulllength; } 47 | if (fulllength > max_seq_len) { max_seq_len = fulllength; } 48 | 49 | if (fulllength > 100) 50 | { 51 | System.out.println("In " + id + "... 
" + fulllength + "bp"); 52 | } 53 | 54 | int offset = 0; 55 | int numchunks = 0; 56 | 57 | while(offset < fulllength) 58 | { 59 | numchunks++; 60 | int end = min(offset + maxchunk, fulllength); 61 | 62 | boolean lastChunk = (end == fulllength); 63 | 64 | record.m_sequence = DNAString.stringToBytes(sequence.substring(offset, end)); 65 | record.m_offset = offset; 66 | record.m_lastChunk = lastChunk; 67 | 68 | iw.set(id); 69 | writer.append(iw, record.toBytes()); 70 | 71 | if (end == fulllength) 72 | { 73 | offset = fulllength; 74 | } 75 | else 76 | { 77 | offset = end - cloudBurst.CloudBurst.CHUNK_OVERLAP; 78 | } 79 | } 80 | 81 | if (numchunks > 1) 82 | { 83 | System.out.println(" " + numchunks + " chunks"); 84 | } 85 | } 86 | 87 | public static void convertFile(String infile, SequenceFile.Writer writer) throws IOException 88 | { 89 | String header = ""; 90 | StringBuilder sequence = null; 91 | 92 | int count = 0; 93 | 94 | try 95 | { 96 | BufferedReader data = new BufferedReader(new InputStreamReader(new FileInputStream(infile))); 97 | 98 | String mapfile = infile; 99 | mapfile += ".map"; 100 | FileWriter fstream = new FileWriter(mapfile); 101 | BufferedWriter out = new BufferedWriter(fstream); 102 | 103 | String line; 104 | while ((line = data.readLine()) != null) 105 | { 106 | line.trim(); 107 | 108 | if (line.charAt(0) == '>') 109 | { 110 | if (count > 0) 111 | { 112 | saveSequence(count, sequence, writer); 113 | } 114 | 115 | sequence = new StringBuilder(); 116 | header = line.substring(1); // skip the > 117 | count++; 118 | 119 | out.write(count + " " + header + "\n"); 120 | } 121 | else 122 | { 123 | sequence.append(line.toUpperCase()); 124 | } 125 | } 126 | 127 | saveSequence(count, sequence, writer); 128 | 129 | out.close(); 130 | } 131 | catch (FileNotFoundException e) 132 | { 133 | System.err.println("Can't open " + infile); 134 | e.printStackTrace(); 135 | System.exit(1); 136 | } 137 | 138 | System.err.println("Processed " + count + " sequences"); 139 | } 140 | 141 | 142 | /** 143 | * @param args 144 | * @throws IOException 145 | */ 146 | public static void main(String[] args) throws IOException { 147 | if (args.length != 2) { 148 | System.err.println("Usage: ConvertFastaForCloud file.fa outfile.br"); 149 | System.exit(-1); 150 | } 151 | 152 | String infile = args[0]; 153 | String outfile = args[1]; 154 | 155 | System.err.println("Converting " + infile + " into " + outfile); 156 | 157 | JobConf config = new JobConf(); 158 | 159 | SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(config), config, 160 | new Path(outfile), IntWritable.class, BytesWritable.class); 161 | 162 | convertFile(infile, writer); 163 | 164 | writer.close(); 165 | 166 | System.err.println("min_seq_len: " + min_seq_len); 167 | System.err.println("max_seq_len: " + max_seq_len); 168 | System.err.println("Using DNAString version: " + DNAString.VERSION); 169 | } 170 | }; 171 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/CountKmers.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.BytesWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.Counters; 12 | import org.apache.hadoop.mapred.JobClient; 13 | import 
org.apache.hadoop.mapred.JobConf; 14 | import org.apache.hadoop.mapred.MapReduceBase; 15 | import org.apache.hadoop.mapred.Mapper; 16 | import org.apache.hadoop.mapred.OutputCollector; 17 | import org.apache.hadoop.mapred.Reducer; 18 | import org.apache.hadoop.mapred.Reporter; 19 | import org.apache.hadoop.mapred.RunningJob; 20 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 21 | import org.apache.hadoop.mapred.TextOutputFormat; 22 | 23 | 24 | public class CountKmers { 25 | 26 | public static class MerMapClass extends MapReduceBase implements 27 | Mapper<IntWritable, BytesWritable, BytesWritable, IntWritable> 28 | { 29 | private FastaRecord record = new FastaRecord(); 30 | private BytesWritable mer = new BytesWritable(); 31 | private IntWritable pos = new IntWritable(1); 32 | private byte [] dnabuffer = null; 33 | private int KMER_LEN; 34 | 35 | public void configure(JobConf conf) 36 | { 37 | KMER_LEN = Integer.parseInt(conf.get("KMER_LEN")); 38 | dnabuffer = new byte[DNAString.arrToDNALen(KMER_LEN)]; 39 | } 40 | 41 | public void map(IntWritable id, BytesWritable rawRecord, 42 | OutputCollector<BytesWritable, IntWritable> output, Reporter reporter) throws IOException 43 | { 44 | record.fromBytes(rawRecord); 45 | 46 | byte [] seq = record.m_sequence; 47 | int realoffsetstart = record.m_offset; 48 | int seqlen = seq.length; 49 | 50 | int startoffset = 0; 51 | 52 | // If I'm not the first chunk, shift over so there is room for the left flank 53 | if (realoffsetstart != 0) 54 | { 55 | int shift = CloudBurst.CHUNK_OVERLAP + 1 - KMER_LEN; 56 | startoffset = shift; 57 | realoffsetstart += shift; 58 | } 59 | 60 | // stop so the last mer will just fit 61 | int end = seqlen - KMER_LEN + 1; 62 | 63 | for (int start = startoffset, realoffset = realoffsetstart; start < end; start++, realoffset++) 64 | { 65 | if (DNAString.arrHasN(seq, start, KMER_LEN)) { continue; } 66 | DNAString.arrToDNAStr(seq, start, KMER_LEN, dnabuffer, 0); 67 | mer.set(dnabuffer, 0, dnabuffer.length); 68 | pos.set(realoffset); 69 | output.collect(mer, pos); 70 | } 71 | } 72 | } 73 | 74 | 75 | public static class MerReduceClass extends MapReduceBase implements 76 | Reducer<BytesWritable, IntWritable, Text, Text> 77 | { 78 | private static Text mertext = new Text(); 79 | private static Text locations = new Text(); 80 | private static StringBuilder builder = new StringBuilder(); 81 | private boolean SHOW_POS; 82 | 83 | public void configure(JobConf conf) 84 | { 85 | SHOW_POS = Integer.parseInt(conf.get("SHOW_POS")) != 0;
 86 | } 87 | 88 | 89 | public synchronized void reduce(BytesWritable mer, Iterator<IntWritable> values, 90 | OutputCollector<Text, Text> output, Reporter reporter) 91 | throws IOException 92 | { 93 | int cnt = 0; 94 | builder.setLength(0); 95 | 96 | while (values.hasNext()) 97 | { 98 | cnt++; 99 | int p = values.next().get(); // always consume the value; otherwise the loop never advances when SHOW_POS is off 100 | if (SHOW_POS) 101 | { 102 | builder.append('\t'); builder.append(p); 103 | } 104 | } 105 | 106 | String val = DNAString.bytesToString(DNAString.bytesWritableDNAToArr(mer)); 107 | mertext.set(val); 108 | 109 | if (SHOW_POS) 110 | { 111 | builder.insert(0, cnt); 112 | String locs = builder.toString(); 113 | locations.set(locs); 114 | } 115 | else 116 | { 117 | locations.set(Integer.toString(cnt)); 118 | } 119 | 120 | output.collect(mertext, locations); 121 | } 122 | } 123 | 124 | 125 | /** 126 | * @param args 127 | * @throws IOException 128 | */ 129 | public static void main(String[] args) throws IOException 130 | { 131 | String inpath = null; 132 | String outpath = null; 133 | int kmerlen = 0; 134 | int numMappers = 1; 135 | int numReducers = 1; 136 | int showpos = 0; 137 | 138 | int data = 1; 139 | 140 | if (data == 0) 141 | { 142 | if (args.length != 6) 143 | { 144 | System.err.println("Usage: CountKmers filename outpath kmerlen showpos numMappers numReducers"); 145 | return; 146 | } 147 | 148 | inpath = args[0]; 149 | outpath = args[1]; 150 | kmerlen = Integer.parseInt(args[2]); 151 | showpos = Integer.parseInt(args[3]); 152 | numMappers = Integer.parseInt(args[4]); 153 | numReducers = Integer.parseInt(args[5]); 154 | } 155 | else if (data == 1) 156 | { 157 | inpath = "/user/guest/cloudburst/s_suis.br"; 158 | outpath = "/user/mschatz/kmers"; 159 | kmerlen = 12; 160 | showpos = 0; 161 | numMappers = 1; 162 | numReducers = 1; 163 | } 164 | 165 | System.out.println("inpath: " + inpath); 166 | System.out.println("outpath: " + outpath); 167 | System.out.println("kmerlen: " + kmerlen); 168 | System.out.println("showpos: " + showpos); 169 | System.out.println("nummappers: " + numMappers); 170 | System.out.println("numreducers: " + numReducers); 171 | 172 | JobConf conf = new JobConf(MerReduce.class); 173 | conf.setNumMapTasks(numMappers); 174 | conf.setNumReduceTasks(numReducers); 175 | 176 | conf.addInputPath(new Path(inpath)); 177 | conf.set("KMER_LEN", Integer.toString(kmerlen)); 178 | conf.set("SHOW_POS", Integer.toString(showpos)); 179 | 180 | conf.setInputFormat(SequenceFileInputFormat.class); 181 | 182 | conf.setMapOutputKeyClass(BytesWritable.class); 183 | conf.setMapOutputValueClass(IntWritable.class); 184 | //conf.setCompressMapOutput(true); 185 | 186 | conf.setOutputKeyClass(Text.class); 187 | conf.setOutputValueClass(Text.class); 188 | conf.setOutputFormat(TextOutputFormat.class); 189 | 190 | conf.setMapperClass(MerMapClass.class); 191 | conf.setReducerClass(MerReduceClass.class); 192 | 193 | Path oPath = new Path(outpath); 194 | conf.setOutputPath(oPath); 195 | System.err.println(" Removing old results"); 196 | FileSystem.get(conf).delete(oPath); 197 | 198 | conf.setJobName("CountMers"); 199 | 200 | Timer t = new Timer(); 201 | RunningJob rj = JobClient.runJob(conf); 202 | System.err.println("CountMers Finished"); 203 | 204 | System.err.println("Total Running time was " + t.get()); 205 | 206 | Counters counters = rj.getCounters( ); 207 | Counters.Group task = counters.getGroup("org.apache.hadoop.mapred.Task$Counter"); 208 | long numDistinctMers = task.getCounter("REDUCE_INPUT_GROUPS"); 209 | System.err.println("Num Distinct Mers: " + numDistinctMers); 210 | } 211 | } 212 | 
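At its core, the job above is a sliding-window counter: the mapper emits one (packed k-mer, position) pair per window that contains no N, and the reducer counts (and, with SHOW_POS, lists) the positions of each distinct k-mer. As a rough single-process sketch of the same computation, without the Hadoop and DNAString packing machinery (the class name and sample sequence below are illustrative, not part of the repository):

import java.util.HashMap;
import java.util.Map;

public class CountKmersLocal {
    public static void main(String[] args) {
        String seq = "ACGTACGTTNACGTACGT"; // stand-in for a reference sequence
        int kmerLen = 4;                   // plays the role of KMER_LEN
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (int start = 0; start + kmerLen <= seq.length(); start++) {
            String mer = seq.substring(start, start + kmerLen);
            if (mer.indexOf('N') >= 0) { continue; }  // mirrors DNAString.arrHasN
            Integer cnt = counts.get(mer);            // mirrors the reducer's cnt++
            counts.put(mer, cnt == null ? 1 : cnt + 1);
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}

The MapReduce version exists to scale this loop to references and k-mer tables that do not fit on one machine; the shuffle performs the grouping that the HashMap does here.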
-------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/DisplaySequenceFile.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | //Written by Alexander Mont 4 | 5 | //Reads a SequenceFile and outputs the key-value pairs. Intended 6 | //primarily for testing and debugging purposes. 7 | 8 | import java.io.FileWriter; 9 | 10 | import org.apache.hadoop.io.BytesWritable; 11 | import org.apache.hadoop.io.SequenceFile; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.io.Writable; 14 | import org.apache.hadoop.fs.FileSystem; 15 | import org.apache.hadoop.fs.Path; 16 | 17 | public class DisplaySequenceFile { 18 | public static void main(String[] args) throws Exception 19 | { 20 | String filename = null; 21 | 22 | int data = 1; 23 | 24 | if (data == 1) 25 | { 26 | filename = "/user/guest/cloudburst/s_suis.br"; 27 | } 28 | else 29 | { 30 | if (args.length != 1) { 31 | System.err.println("Usage: DisplaySequenceFile seqfile"); 32 | System.exit(-1); 33 | } 34 | 35 | filename = args[0]; 36 | } 37 | 38 | System.err.println("Printing " + filename); 39 | 40 | 41 | Path thePath = new Path(filename); 42 | JobConf conf = new JobConf(DisplaySequenceFile.class); 43 | 44 | SequenceFile.Reader theReader = new SequenceFile.Reader(FileSystem.get(conf), thePath, conf); 45 | 46 | int numrecords = 0; 47 | 48 | if (theReader.getValueClass() == BytesWritable.class) 49 | { 50 | Writable key = (Writable)(theReader.getKeyClass().newInstance()); 51 | BytesWritable value = new BytesWritable(); 52 | 53 | FastaRecord record = new FastaRecord(); 54 | FileWriter fw = new FileWriter("/Users/mschatz/ref.br.txt"); 55 | 56 | while(theReader.next(key,value)) 57 | { 58 | record.fromBytes(value); 59 | fw.write(record.toString()); 60 | fw.write("\n"); 61 | 62 | numrecords++; 63 | } 64 | 65 | 66 | fw.close(); 67 | } 68 | else 69 | { 70 | Writable key = (Writable)(theReader.getKeyClass().newInstance()); 71 | Writable value = (Writable)(theReader.getValueClass().newInstance()); 72 | 73 | 74 | while(theReader.next(key,value)) 75 | { 76 | System.out.println(key.toString() + " -> " + value.toString()); 77 | numrecords++; 78 | } 79 | } 80 | 81 | System.out.println("Saw " + numrecords); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/FastaRecord.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import org.apache.hadoop.io.BytesWritable; 4 | import java.io.IOException; 5 | 6 | public class FastaRecord 7 | { 8 | public byte[] m_sequence = null; 9 | public boolean m_lastChunk = false; 10 | public int m_offset = 0; 11 | 12 | private static final StringBuilder builder = new StringBuilder(); 13 | 14 | FastaRecord() 15 | { 16 | 17 | } 18 | 19 | FastaRecord(BytesWritable t) throws IOException 20 | { 21 | fromBytes(t); 22 | } 23 | 24 | public String toString() 25 | { 26 | builder.setLength(0); 27 | 28 | builder.append(m_lastChunk?1:0); builder.append('\t'); 29 | builder.append(m_offset); builder.append('\t'); 30 | builder.append(DNAString.bytesToString(m_sequence)); 31 | 32 | return builder.toString(); 33 | } 34 | 35 | public BytesWritable toBytes() 36 | { 37 | byte [] dna = DNAString.arrToDNA(m_sequence); 38 | 39 | int len = 1 + // lastChunk 40 | 4 + // offset 41 | dna.length; 42 | 43 | byte [] buf = new byte[len]; 44 | 45 | buf[0] = (byte) (m_lastChunk ? 
1 : 0); 46 | 47 | buf[1] = (byte) ((m_offset & 0xFF000000) >> 24); 48 | buf[2] = (byte) ((m_offset & 0x00FF0000) >> 16); 49 | buf[3] = (byte) ((m_offset & 0x0000FF00) >> 8); 50 | buf[4] = (byte) ((m_offset & 0x000000FF)); 51 | 52 | System.arraycopy(dna, 0, buf, 5, dna.length); 53 | 54 | return new BytesWritable(buf); 55 | } 56 | 57 | 58 | public void fromBytes(BytesWritable t) 59 | { 60 | byte [] raw = t.get(); 61 | int rawlen = t.getSize(); 62 | 63 | m_lastChunk = raw[0] == 1; 64 | 65 | m_offset = (raw[1] & 0xFF) << 24 66 | | (raw[2] & 0xFF) << 16 67 | | (raw[3] & 0xFF) << 8 68 | | (raw[4] & 0xFF); 69 | 70 | int sl = rawlen - 5; 71 | m_sequence = DNAString.dnaToArr(raw, 5, sl); 72 | } 73 | 74 | public static void main(String[] args) throws IOException 75 | { 76 | Timer t = new Timer(); 77 | int num = 100000; 78 | 79 | for (int i = 0; i < num; i++) 80 | { 81 | FastaRecord record = new FastaRecord(); 82 | 83 | record.m_lastChunk = false; 84 | record.m_offset = 123456; 85 | record.m_sequence = DNAString.stringToBytes("ACGTACGTA"); 86 | 87 | BytesWritable bw = record.toBytes(); 88 | 89 | FastaRecord record2 = new FastaRecord(bw); 90 | 91 | if (record.m_lastChunk != record2.m_lastChunk || 92 | record.m_offset != record2.m_offset || 93 | DNAString.bytesToString(record.m_sequence).compareTo(DNAString.bytesToString(record2.m_sequence)) != 0) 94 | { 95 | throw new IOException("Mismatch\norg: " + record.toString() + "\nnew: " + record2.toString()); 96 | } 97 | } 98 | 99 | System.out.println(num + " took:" + t.get()); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/FilterAlignments.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.mapred.MapReduceBase; 9 | import org.apache.hadoop.mapred.Mapper; 10 | import org.apache.hadoop.mapred.OutputCollector; 11 | import org.apache.hadoop.mapred.Reducer; 12 | import org.apache.hadoop.mapred.Reporter; 13 | 14 | public class FilterAlignments 15 | { 16 | // Identity mapper 17 | public static class FilterMapClass extends MapReduceBase implements 18 | Mapper<IntWritable, BytesWritable, IntWritable, BytesWritable> 19 | { 20 | public void map(IntWritable readid, BytesWritable rawAlignment, 21 | OutputCollector<IntWritable, BytesWritable> output, Reporter reporter) throws IOException 22 | { 23 | output.collect(readid, rawAlignment); 24 | } 25 | } 26 | 27 | 28 | // The combiner scans the partial list of alignments for each read, and only outputs the top 2 alignments 29 | // Should give a good speedup, since an arbitrarily long list is reduced to just 2 items 30 | // Can't record only the single best, because the fact that the best is tied would be lost 31 | // Must always output the best itself, or the reducer might record a second-best alignment instead 32 | public static class FilterCombinerClass extends MapReduceBase implements 33 | Reducer<IntWritable, BytesWritable, IntWritable, BytesWritable> 34 | { 35 | private static AlignmentRecord bestAlignment = new AlignmentRecord(); 36 | private static AlignmentRecord curAlignment = new AlignmentRecord(); 37 | private static AlignmentRecord secondBest = new AlignmentRecord(); 38 | 39 | public synchronized void reduce(IntWritable readid, Iterator<BytesWritable> values, 40 | OutputCollector<IntWritable, BytesWritable> output, Reporter reporter) throws IOException 41 | { 42 | boolean recordSecond = false; 43 | bestAlignment.fromBytes(values.next()); 44 | 45 | while (values.hasNext()) 46 | { 47 | 
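// Clarifying the combiner contract described above: track the best record seen
// so far and, when the best is tied, one representative second best with an
// equal number of differences. Emitting at most two records per read is enough
// for FilterReduceClass to distinguish a unique best from a tie, while still
// collapsing an arbitrarily long list of candidate alignments.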
curAlignment.fromBytes(values.next()); 48 | 49 | if (curAlignment.m_differences < bestAlignment.m_differences) 50 | { 51 | bestAlignment.set(curAlignment); 52 | recordSecond = false; 53 | } 54 | else if (curAlignment.m_differences == bestAlignment.m_differences) 55 | { 56 | recordSecond = true; 57 | secondBest.set(curAlignment); 58 | } 59 | else 60 | { 61 | // curAlignment is worse than best alignment, nothing to do 62 | } 63 | } 64 | 65 | output.collect(readid, bestAlignment.toBytes()); 66 | 67 | if (recordSecond) 68 | { 69 | output.collect(readid, secondBest.toBytes()); 70 | } 71 | } 72 | } 73 | 74 | 75 | // if there is a unique best alignment, record that alignment 76 | public static class FilterReduceClass extends MapReduceBase implements 77 | Reducer<IntWritable, BytesWritable, IntWritable, BytesWritable> 78 | { 79 | private static AlignmentRecord bestAlignment = new AlignmentRecord(); 80 | private static AlignmentRecord curAlignment = new AlignmentRecord(); 81 | 82 | public synchronized void reduce(IntWritable readid, Iterator<BytesWritable> values, 83 | OutputCollector<IntWritable, BytesWritable> output, Reporter reporter) throws IOException 84 | { 85 | boolean recordBest = true; 86 | bestAlignment.fromBytes(values.next()); 87 | 88 | while (values.hasNext()) 89 | { 90 | curAlignment.fromBytes(values.next()); 91 | 92 | if (curAlignment.m_differences < bestAlignment.m_differences) 93 | { 94 | bestAlignment.set(curAlignment); 95 | recordBest = true; 96 | } 97 | else if (curAlignment.m_differences == bestAlignment.m_differences) 98 | { 99 | recordBest = false; 100 | } 101 | else 102 | { 103 | // curAlignment is worse than best alignment, nothing to do 104 | 105 | } 106 | } 107 | 108 | if (recordBest) 109 | { 110 | output.collect(readid, bestAlignment.toBytes()); 111 | } 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/MerRecord.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import org.apache.hadoop.io.BytesWritable; 4 | import java.io.IOException; 5 | 6 | public class MerRecord 7 | { 8 | public boolean isReference = false; 9 | public boolean isRC = false; 10 | public int offset = 0; 11 | public int id; 12 | 13 | public byte[] leftFlank; // only set in the reduce phase 14 | public byte[] rightFlank; // only set in the reduce phase 15 | 16 | private static StringBuilder builder = new StringBuilder(); 17 | private static BytesWritable bytes = new BytesWritable(); 18 | 19 | private static byte[] sbuffer = new byte [1024]; 20 | 21 | //------------------------- Constructor -------------------------- 22 | MerRecord() 23 | { 24 | 25 | } 26 | 27 | //------------------------- Constructor -------------------------- 28 | MerRecord(BytesWritable t) throws IOException 29 | { 30 | fromBytes(t); 31 | } 32 | 33 | //------------------------- toBytes -------------------------- 34 | // Pack the MerRecord information into a BytesWritable 35 | // extract the flanking sequence on-the-fly to avoid copying as much as possible 36 | 37 | public BytesWritable toBytes(byte [] seq, int leftstart, int leftlen, int rightstart, int rightlen) //throws IOException 38 | { 39 | int len = 1 + // isReference, isRC 40 | 4 + // offset 41 | 4 + // id 42 | 1; // hardstop between left and right flank 43 | 44 | if (leftlen > 0) 45 | { 46 | len += DNAString.arrToDNALen(leftlen); 47 | } 48 | 49 | if (rightlen > 0) 50 | { 51 | len += DNAString.arrToDNALen(rightlen); 52 | } 53 | 54 | if (len > sbuffer.length) 55 | { 56 | sbuffer = new byte[len*2]; 57 | } 58 | 59 | 
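// Layout of the record packed below: byte 0 carries the flag bits
// (0x01 = isReference, 0x10 = isRC); bytes 1-4 hold offset and bytes 5-8 hold
// id, both big-endian; then the left flank packed in reverse, one hardstop
// byte, and the right flank. The manual shift-and-mask packing of each int
// produces the same bytes as java.nio.ByteBuffer.allocate(4).putInt(value),
// which is also big-endian by default.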
sbuffer[0] = (byte) ((isReference ? 0x01 : 0x00) | (isRC ? 0x10 : 0x00)); 60 | 61 | sbuffer[1] = (byte) ((offset & 0xFF000000) >> 24); 62 | sbuffer[2] = (byte) ((offset & 0x00FF0000) >> 16); 63 | sbuffer[3] = (byte) ((offset & 0x0000FF00) >> 8); 64 | sbuffer[4] = (byte) ((offset & 0x000000FF)); 65 | 66 | sbuffer[5] = (byte) ((id & 0xFF000000) >> 24); 67 | sbuffer[6] = (byte) ((id & 0x00FF0000) >> 16); 68 | sbuffer[7] = (byte) ((id & 0x0000FF00) >> 8); 69 | sbuffer[8] = (byte) ((id & 0x000000FF)); 70 | 71 | int pos = 9; 72 | 73 | if (leftlen > 0) 74 | { 75 | pos += DNAString.arrToDNAStrRev(seq, leftstart, leftlen, sbuffer, pos); 76 | } 77 | 78 | sbuffer[pos] = DNAString.hardstop; pos++; 79 | 80 | if (rightlen > 0) 81 | { 82 | pos += DNAString.arrToDNAStr(seq, rightstart, rightlen, sbuffer, pos); 83 | } 84 | 85 | /* 86 | if (pos != len) 87 | { 88 | throw new IOException("pos(" + pos + ") != len(" + len + ")"); 89 | } 90 | */ 91 | 92 | bytes.set(sbuffer, 0, len); 93 | return bytes; 94 | } 95 | 96 | 97 | //------------------------- fromBytes -------------------------- 98 | // Unpack the raw bytes and set the MerRecord fields 99 | 100 | public void fromBytes(BytesWritable t) 101 | { 102 | byte [] raw = t.get(); 103 | int rawlen = t.getSize(); 104 | 105 | //sbuffer[0] = (byte) ((isReference ? 0x01 : 0x00) | (isRC ? 0x10 : 0x00)); 106 | 107 | isReference = (raw[0] & 0x01) == 0x01; 108 | isRC = (raw[0] & 0x10) == 0x10; 109 | 110 | offset = (raw[1] & 0xFF) << 24 111 | | (raw[2] & 0xFF) << 16 112 | | (raw[3] & 0xFF) << 8 113 | | (raw[4] & 0xFF); 114 | 115 | id = (raw[5] & 0xFF) << 24 116 | | (raw[6] & 0xFF) << 16 117 | | (raw[7] & 0xFF) << 8 118 | | (raw[8] & 0xFF); 119 | 120 | int fieldstart = 9; 121 | 122 | for (int i = fieldstart; i < rawlen; i++) 123 | { 124 | if (raw[i] == DNAString.hardstop) 125 | { 126 | //leftFlank = DNAString.dnaToArr(raw, fieldstart, i-fieldstart); 127 | leftFlank = new byte[i-fieldstart]; 128 | System.arraycopy(raw, fieldstart, leftFlank, 0, i-fieldstart); 129 | 130 | fieldstart = i+1; // skip the hardstop 131 | break; 132 | } 133 | } 134 | 135 | rightFlank = new byte[rawlen - fieldstart]; 136 | System.arraycopy(raw, fieldstart, rightFlank, 0, rawlen-fieldstart); 137 | //rightFlank = DNAString.dnaToArr(raw, fieldstart, rawlen-fieldstart); 138 | } 139 | 140 | 141 | //------------------------- toString -------------------------- 142 | // Serialize the fields to a string for debugging 143 | 144 | public String toString() 145 | { 146 | builder.setLength(0); 147 | 148 | builder.append(isReference?'1':'0'); builder.append(';'); 149 | builder.append(isRC?'1':'0'); builder.append(';'); 150 | builder.append(offset); builder.append(';'); 151 | builder.append(id); builder.append(';'); 152 | builder.append(DNAString.bytesToString(DNAString.dnaToArr(leftFlank))); builder.append(';'); 153 | builder.append(DNAString.bytesToString(DNAString.dnaToArr(rightFlank))); 154 | 155 | return builder.toString(); 156 | } 157 | 158 | 159 | //------------------------- main -------------------------- 160 | // Make sure the serialization is correct and fast 161 | 162 | public static void main(String[] args) throws IOException 163 | { 164 | byte[] seq = DNAString.stringToBytes("ACGTACGTACGTACGTACGT"); 165 | 166 | MerRecord mr = new MerRecord(); 167 | mr.id = 12345; 168 | mr.isRC = true; 169 | mr.isReference = false; 170 | 171 | mr.offset = 1234567; 172 | 173 | Timer t = new Timer(); 174 | int num = 10000000; 175 | for (int i = 0; i < num; i++) 176 | { 177 | //System.out.println("Org: " + mr.toString()); 
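// Round-trip check: pack with a 5-base left flank and an empty right
// flank, then rebuild a record from the raw bytes and compare fields.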
178 | 179 | BytesWritable bw = mr.toBytes(seq, 0, 5, 19, 0); 180 | 181 | MerRecord mr2 = new MerRecord(bw); 182 | 183 | //System.out.println("New: " + mr2.toString()); 184 | 185 | //if (mr2.id != mr.id || 186 | // mr2.isRC != mr.isRC || 187 | // mr2.isReference != mr.isReference || 188 | // DNAString.bytesToString(mr2.leftFlank).compareTo(DNAString.bytesToString(mr.leftFlank)) != 0 || 189 | // DNAString.bytesToString(mr2.rightFlank).compareTo(DNAString.bytesToString(mr.rightFlank)) != 0) 190 | //{ 191 | // 192 | //} 193 | 194 | if (mr2.id != mr.id || 195 | mr2.isRC != mr.isRC || 196 | mr2.isReference != mr.isReference) 197 | { 198 | throw new IOException("Mismatch!"); 199 | } 200 | } 201 | 202 | System.out.println(num + " took:" + t.get()); 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/PrintAlignments.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.fs.FileStatus; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.SequenceFile; 10 | import org.apache.hadoop.mapred.JobConf; 11 | 12 | public class PrintAlignments { 13 | 14 | private static JobConf conf = null; 15 | private static AlignmentRecord ar = new AlignmentRecord(); 16 | 17 | public static void printFile(Path thePath) throws IOException 18 | { 19 | SequenceFile.Reader theReader = new SequenceFile.Reader(FileSystem.get(conf), thePath, conf); 20 | 21 | IntWritable key = new IntWritable(); 22 | BytesWritable value = new BytesWritable(); 23 | 24 | while(theReader.next(key,value)) 25 | { 26 | ar.fromBytes(value); 27 | System.out.println(ar.toAlignment(key.get())); 28 | } 29 | } 30 | 31 | /** 32 | * @param args 33 | * @throws IOException 34 | */ 35 | public static void main(String[] args) throws IOException 36 | { 37 | String filename = null; 38 | //filename = "/user/guest/br-results/"; 39 | 40 | if (filename == null) 41 | { 42 | if (args.length != 1) 43 | { 44 | System.err.println("Usage: PrintAlignments seqfile"); 45 | System.exit(-1); 46 | } 47 | 48 | filename = args[0]; 49 | } 50 | 51 | 52 | System.err.println("Printing " + filename); 53 | 54 | Path thePath = new Path(filename); 55 | conf = new JobConf(AlignmentStats.class); 56 | 57 | FileSystem fs = FileSystem.get(conf); 58 | 59 | if (!fs.exists(thePath)) 60 | { 61 | throw new IOException(thePath + " not found"); 62 | } 63 | 64 | FileStatus status = fs.getFileStatus(thePath); 65 | 66 | if (status.isDir()) 67 | { 68 | FileStatus [] files = fs.listStatus(thePath); 69 | for(FileStatus file : files) 70 | { 71 | String str = file.getPath().getName(); 72 | 73 | if (str.startsWith(".")) 74 | { 75 | // skip 76 | } 77 | else if (!file.isDir()) 78 | { 79 | printFile(file.getPath()); 80 | } 81 | } 82 | } 83 | else 84 | { 85 | printFile(thePath); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/SharedSeedRecord.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | public class SharedSeedRecord 4 | { 5 | public int offset = 0; 6 | public int seedLength = 0; 7 | public boolean isRC = false; 8 | public byte[] leftFlank; 9 | public byte[] rightFlank; 10 | 11 | public int refID; 12 | public int 
refOffset = 0; 13 | public byte[] refLeftFlank; 14 | public byte[] refRightFlank; 15 | 16 | SharedSeedRecord() 17 | { 18 | 19 | } 20 | 21 | private static final StringBuilder builder = new StringBuilder(); 22 | 23 | public String toString() 24 | { 25 | builder.setLength(0); 26 | 27 | builder.append(isRC ? 1 : 0); builder.append(';'); 28 | builder.append(offset); builder.append(';'); 29 | builder.append(seedLength); builder.append(';'); 30 | builder.append(DNAString.bytesToString(leftFlank)); builder.append(';'); 31 | builder.append(DNAString.bytesToString(rightFlank)); builder.append(';'); 32 | builder.append(refID); builder.append(';'); 33 | builder.append(refOffset); builder.append(';'); 34 | builder.append(DNAString.bytesToString(refLeftFlank)); builder.append(';'); 35 | builder.append(DNAString.bytesToString(refRightFlank)); 36 | 37 | return builder.toString(); 38 | } 39 | } -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/SubstringTester.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | import org.apache.hadoop.io.Text; 4 | 5 | public class SubstringTester { 6 | 7 | public static int KMER_LEN = 32; 8 | 9 | public static void version1(StringBuilder sb, Text mer) 10 | { 11 | Timer mertime = new Timer(); 12 | int merlen = KMER_LEN; 13 | int end = sb.length() - merlen; 14 | for (int start = 0; start < end; start++) 15 | { 16 | mer.set(sb.substring(start, start+merlen)); 17 | } 18 | System.out.println("version1 took: " + mertime.get()); 19 | } 20 | 21 | public static void version2(StringBuilder sb, Text mer) 22 | { 23 | Timer mertime = new Timer(); 24 | int merlen = KMER_LEN; 25 | int end = sb.length() - merlen; 26 | 27 | char [] merchar = new char[merlen]; 28 | byte [] merbytes = new byte[merlen]; 29 | 30 | System.out.println("version 2"); 31 | 32 | for (int start = 0; start < end; start++) 33 | { 34 | sb.getChars(start, start+merlen, merchar, 0); 35 | for (int i = 0; i < merlen; i++) 36 | { 37 | merbytes[i] = (byte)merchar[i]; 38 | } 39 | 40 | mer.set(merbytes); 41 | 42 | if (start < 10) 43 | { 44 | System.out.println(" mer[" + start + "]:" + mer.toString()); 45 | 46 | } 47 | } 48 | System.out.println("total: " + mertime.get()); 49 | } 50 | 51 | public static void version3(StringBuilder sb, Text mer) 52 | { 53 | System.out.println("version 3"); 54 | 55 | Timer mertime = new Timer(); 56 | int alllen = sb.length(); 57 | int merlen = KMER_LEN; 58 | int end = alllen - merlen; 59 | 60 | byte [] merbytes = new byte[merlen]; 61 | byte [] allbytes = new byte[alllen]; 62 | 63 | for (int i = 0; i < alllen; i++) 64 | { 65 | allbytes[i] = (byte) sb.charAt(i); 66 | } 67 | 68 | 69 | for (int start = 0; start < end; start++) 70 | { 71 | for (int i = 0; i < merlen; i++) 72 | { 73 | merbytes[i] = allbytes[start+i]; 74 | } 75 | 76 | mer.set(merbytes); 77 | 78 | if (start < 10) 79 | { 80 | System.out.println(" mer[" + start + "]:" + mer.toString()); 81 | } 82 | } 83 | System.out.println("total: " + mertime.get()); 84 | } 85 | 86 | 87 | 88 | public static void main(String[] args) 89 | { 90 | String str = "acbdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"; 91 | 92 | Timer buildtime = new Timer(); 93 | StringBuilder sb = new StringBuilder(); 94 | int numcopies = 100000; 95 | for (int i = 0; i < numcopies; i++) 96 | { 97 | sb.append(str); 98 | } 99 | 100 | System.out.println("Constructed " + numcopies + " copies in " + buildtime.get()); 101 | 102 | Text mer = new Text(); 
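// version1 is left commented out: sb.substring() allocates a new String
// for every mer, which the byte-array versions below avoid.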
103 | 104 | //version1(sb, mer); 105 | version2(sb, mer); 106 | version3(sb, mer); 107 | 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /cloudburst/src/cloudBurst/Timer.java: -------------------------------------------------------------------------------- 1 | package cloudBurst; 2 | 3 | public class Timer { 4 | 5 | public long starttime; 6 | public long endtime; 7 | 8 | Timer() 9 | { 10 | starttime = System.currentTimeMillis(); 11 | } 12 | 13 | double get() 14 | { 15 | endtime = System.currentTimeMillis(); 16 | return (endtime - starttime) / 1000.0; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /cloudfront/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-archives/emr-sample-apps/49fe298a3dd7a48dec56771d613a778ee89dfbf8/cloudfront/.DS_Store -------------------------------------------------------------------------------- /cloudfront/code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-archives/emr-sample-apps/49fe298a3dd7a48dec56771d613a778ee89dfbf8/cloudfront/code/.DS_Store -------------------------------------------------------------------------------- /cloudfront/code/CHANGES.txt: -------------------------------------------------------------------------------- 1 | CloudFront LogAnalyzer (1.0.0) -------------------------------------------------------------------------------- /cloudfront/code/NOTICE.txt: -------------------------------------------------------------------------------- 1 | CloudFront LogAnalyzer 2 | Copyright 2009-2009 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the “License”). You may not use this file except in compliance with the License. A copy of the License is 5 | located at 6 | 7 | http://aws.amazon.com/apache2.0/ 8 | 9 | or in the “license” file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 10 | express or implied. See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- /cloudfront/code/README.TXT: -------------------------------------------------------------------------------- 1 | == CloudFront LogAnalyzer 2 | 3 | CloudFront LogAnalyzer is an analysis package for Amazon CloudFront Access 4 | Logs. The application is built to run on Amazon Elastic MapReduce. It uses 5 | Cascading (http://www.cascading.org) to generate the reports. The application 6 | reads in the location of your CloudFront logs and the date range for 7 | consideration. It runs MapReduce jobs that process these logs to produce the 8 | following usage reports: 9 | 10 | - Overall Volume Report 11 | - Client IP Report 12 | - Object Popularity Report 13 | - Edge Location Report 14 | 15 | 16 | == Getting Set Up 17 | 18 | === Building CloudFront LogAnalyzer 19 | 20 | Building the tool is pretty straightforward. All the dependent jars 21 | needed are present in the libs directory. After unpacking the tgz, run 22 | ant jar to create the jar file: 23 | 24 | ant jar 25 | 26 | The jar file logprocessor.jar gets generated in the build 27 | directory.
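For example, a minimal build session looks like the following (the unpack directory name is assumed here; substitute wherever you extracted the tgz):

 cd cloudfront-loganalyzer
 ant jar
 ls build/logprocessor.jar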
28 | 29 | 30 | === Upload jar to your S3 bucket 31 | 32 | Upload the generated jar to an S3 bucket you own and note down the jar 33 | location (e.g. s3n://<your-bucket>/<path>/logprocessor.jar). 34 | You will need this location to run the application on Amazon Elastic 35 | MapReduce. 36 | 37 | 38 | === Running the application using the Amazon Elastic MapReduce Web Console 39 | 40 | Sign up for Amazon Elastic MapReduce (http://aws.amazon.com/elasticmapreduce/) 41 | if you have not already done so. Log in to the web console and follow the steps to 42 | create a new JobFlow. Choose the CloudFront LogAnalyzer sample application in the 43 | JobFlow Wizard. This pre-fills the jar location and parameters with default 44 | values that use the jar and data in the public sample bucket. You can either go 45 | with the defaults or modify these values to use the jar file that you have 46 | uploaded and/or provide your own CloudFront log files. 47 | 48 | === Running the application from the Ruby Client 49 | 50 | Download the Amazon Elastic MapReduce Ruby Client from Resources -> Sample Code 51 | and Libraries at http://aws.amazon.com/elasticmapreduce. Follow the instructions 52 | provided to get it set up. The following command runs the CloudFront LogAnalyzer 53 | application with default parameters: 54 | 55 | ./elastic-mapreduce --create --jar s3n://elasticmapreduce/samples/cloudfront/logprocessor.jar 56 | --args "-input,s3n://elasticmapreduce/samples/cloudfront/input, 57 | -output,s3n://<your-bucket>/<output-path>" 58 | 59 | === Jar Arguments 60 | 61 | The application takes the following arguments. Use these arguments either from the console or from the Ruby client. 62 | 63 | -input 64 | the S3 location of the CloudFront log files, e.g. s3n://mycloudfrontlogbucket/cloudfrontlogs 65 | 66 | -output 67 | the S3 location to which the reports get written (e.g. s3n://myoutputbucket/reports. Note: it 68 | is important that the subdirectory doesn't already exist) 69 | 70 | 71 | -start 72 | (e.g. 2009-02-01-01 => 1st February 2009, 01:00) (defaults to "any") 73 | -end 74 | (e.g. 2009-03-31-23 => 31st March 2009, 23:00) (defaults to "any") 75 | -timeBucket