├── .gitattributes
├── .gitignore
├── LICENSE
├── Performance Evaluation.png
├── README.md
├── Similarity Evaluation.png
├── hadoop-pot-assembly
│   ├── pom.xml
│   └── src
│       └── main
│           └── assembly
│               └── assembly.xml
├── hadoop-pot-core
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── org
│                   └── pooledtimeseries
│                       ├── Deduplicate.java
│                       ├── FormatOutput.java
│                       ├── GradientTimeSeries.java
│                       ├── MeanChiSquareDistanceCalculation.java
│                       ├── OpticalTimeSeries.java
│                       ├── SimilarityCalculation.java
│                       ├── cartesian
│                       │   ├── CartesianInputFormat.java
│                       │   └── CartesianRecordReader.java
│                       ├── healthcheck
│                       │   └── CheckCartesianProductSeqFile.java
│                       ├── seqfile
│                       │   ├── FullFileInputFormat.java
│                       │   ├── FullFileRecordReader.java
│                       │   ├── PoTVideoPathFilter.java
│                       │   └── TextVectorsToSequenceFile.java
│                       └── util
│                           ├── ClassScope.java
│                           ├── HadoopFileUtil.java
│                           ├── PoTConstants.java
│                           ├── PoTSerialiser.java
│                           ├── PoTUtil.java
│                           └── ReadSeqFileUtil.java
├── hadoop-pot-video
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── org
│                   └── pooledtimeseries
│                       ├── FeatureVector.java
│                       ├── PoT.java
│                       ├── PoTException.java
│                       └── healthcheck
│                           └── CheckOpenCV.java
├── pom.xml
├── src
│   └── main
│       ├── bin
│       │   ├── pooled-time-series
│       │   └── pooled-time-series-hadoop
│       └── resources
│           └── tika-config.xml
└── visualization
    ├── circlepacking.html
    ├── cluster-d3.html
    ├── css
    │   ├── dashboard.css
    │   └── style.css
    ├── dashboard.html
    ├── data
    │   ├── formatted_similarity_calc.csv
    │   ├── similarity_cluster.json
    │   ├── similarity_cluster.png
    │   └── similarity_heatmap.png
    ├── favicon.ico
    ├── index.html
    ├── js
    │   ├── dashboard.js
    │   └── matrix.js
    └── py
        ├── evaluate_hmdb.py
        ├── similarity_cluster.py
        ├── similarity_heatmap.py
        └── video_duration.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 
3 | # Windows image file caches
4 | Thumbs.db
5 | ehthumbs.db
6 | 
7 | # Folder config file
8 | Desktop.ini
9 | 
10 | # Recycle Bin used on file shares
11 | $RECYCLE.BIN/
12 | 
13 | # Windows Installer files
14 | *.cab
15 | *.msi
16 | *.msm
17 | *.msp
18 | 
19 | # Windows shortcuts
20 | *.lnk
21 | 
22 | # =========================
23 | # Operating System Files
24 | # =========================
25 | 
26 | # OSX
27 | # =========================
28 | 
29 | .DS_Store
30 | .AppleDouble
31 | .LSOverride
32 | 
33 | # Thumbnails
34 | ._*
35 | 
36 | # Files that might appear on external disk
37 | .Spotlight-V100
38 | .Trashes
39 | 
40 | # Directories potentially created on remote AFP share
41 | .AppleDB
42 | .AppleDesktop
43 | Network Trash Folder
44 | Temporary Items
45 | .apdisk
46 | /.classpath
47 | /.project
48 | /.settings/
49 | /data/
50 | /similarity.txt
51 | 
52 | *.mp4
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {Michael S. Ryoo} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /Performance Evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/hadoop-pot/c3e7d1dda74ce56c25f574795c09feb9a6429c62/Performance Evaluation.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Hadoop implementation of the Pooled Time Series (PoT) algorithm 2 | =============================================================== 3 | PoT java implementation using Apache Hadoop. 4 | 5 | # Dependencies 6 | * Maven (Version shouldn't matter much. Tested with 2.x and 3.x.) 7 | * OpenCV 2.4.x (Tested with 2.4.9 and 2.4.11) 8 | 9 | # Pre-requisites 10 | If you get any errors running brew install opencv related to numpy, please run: 11 | 12 | 1. `pip install numpy` 13 | 14 | Now move on to OpenCV (More detailed instructions in [wiki/Installing-opencv](https://github.com/USCDataScience/hadoop-pot/wiki/Installing-opencv)) 15 | 1. `brew install opencv --with-java` 16 | 17 | The above should leave you with a: 18 | 19 | /usr/local/Cellar/opencv//share/OpenCV/java 20 | 21 | Directory which contains the associated dylib OpenCV dynamic library along with the OpenCV jar file. 22 | 23 | # Getting started 24 | 1. `cd hadoop-pot-assembly` 25 | 2. `mvn install assembly:assembly` 26 | 3. Set OPENCV_JAVA_HOME, e.g., to `export OPENCV_JAVA_HOME=/usr/local/Cellar/opencv/2.4.9/share/OpenCV/java` 27 | 4. Set POOLED_TIME_SERIES_HOME, e.g., to `export POOLED_TIME_SERIES_HOME=$HOME/hadoop-pot/src/main` 28 | 5. Run `pooled-time-series`, e.g., by creating an alias, `alias pooled-time-series="$POOLED_TIME_SERIES_HOME/bin/pooled-time-series"` 29 | 30 | The above should produce: 31 | 32 | ``` 33 | usage: pooled_time_series 34 | -d,--dir A directory with image files in it 35 | -f,--file Path to a single file 36 | -h,--help Print this message. 37 | -j,--json Set similarity output format to JSON. 38 | Defaults to .txt 39 | -o,--outputfile File containing similarity results. 40 | Defaults to ./similarity.txt 41 | -p,--pathfile A file containing full absolute paths to 42 | videos. 
Previous default was 43 | memex-index_temp.txt 44 | ``` 45 | 46 | So, to call the code e.g., on a directory of files called `data`, you would run (e.g., with OpenCV 2.4.9): 47 | 48 | ``` 49 | pooled-times-series -d data 50 | ``` 51 | 52 | Alternatively you can create (independently of this tool) a file with absolute file paths to video files, 1 per line, and then pass it with the `-p` file to the above program. 53 | 54 | ## Running Hadoop Jobs 55 | ### Config and Getting Started 56 | Add the following to your .bashrc 57 | ``` 58 | export HADOOP_OPTS="-Djava.library.path= -Dmapred.map.child.java.opts=-Djava.library.path=" 59 | alias pooled-time-series-hadoop="$POOLED_TIME_SERIES_HOME/bin/pooled-time-series-hadoop" 60 | ``` 61 | 62 | Build and clean up the jar for running 63 | ``` 64 | # Compile everything 65 | mvn install assembly:assembly 66 | 67 | # Drop the LICENSE file from our jar that will give us headaches otherwise 68 | zip -d target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar META-INF/LICENSE 69 | 70 | ``` 71 | 72 | # Documentation moving to the wiki 73 | 74 | We are moving our documentation to the wiki. Please bear with us and report issues as you find them. 75 | 76 | * [Getting up and running with Hadoop - Individual MR commands](https://github.com/USCDataScience/hadoop-pot/wiki/Individual-MR-job-commands) 77 | 78 | # Research Background and Detail 79 | This is a source code used in the following conference paper [1]. 80 | It includes the pooled time series (PoT) representation framework as well as basic per-frame descriptor extractions including histogram of optical flows (HOF) and histogram of oriented gradients (HOG). 81 | For more detailed information on the approach, please check the paper. 82 | 83 | If you take advantage of this code for any academic purpose, please do cite: 84 | 85 | ``` 86 | [1] Mattmann, Chris A., and Madhav Sharan. "Scalable Hadoop-Based Pooled Time Series of Big Video Data from the Deep Web." Proceedings of the 2017 ACM on International Conference on Multimedia Retrieval. ACM, 2017. 87 | [2] M. S. Ryoo, B. Rothrock, and L. Matthies, "Pooled Motion Features for First-Person Videos", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2015. 88 | 89 | https://arxiv.org/abs/1610.06669 90 | http://arxiv.org/pdf/1412.6505v2.pdf 91 | 92 | @inproceedings{mattmann2017scalable, 93 | title={Scalable Hadoop-Based Pooled Time Series of Big Video Data from the Deep Web}, 94 | author={Mattmann, Chris A and Sharan, Madhav}, 95 | booktitle={Proceedings of the 2017 ACM on International Conference on Multimedia Retrieval}, 96 | pages={117--120}, 97 | year={2017}, 98 | organization={ACM} 99 | } 100 | 101 | @inproceedings{ryoo2015pot, 102 | title={Pooled Motion Features for First-Person Videos}, 103 | author={M. S. Ryoo and B. Rothrock and L. 
Matthies}, 104 | booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 105 | year={2015}, 106 | month={June}, 107 | address={Boston, MA}, 108 | } 109 | ``` 110 | 111 | # Evaluation 112 | ![](https://raw.githubusercontent.com/USCDataScience/hadoop-pot/master/Similarity%20Evaluation.png) 113 | 114 | HMDB Dataset - http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/ 115 | 116 | -------------------------------------------------------------------------------- /Similarity Evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/hadoop-pot/c3e7d1dda74ce56c25f574795c09feb9a6429c62/Similarity Evaluation.png -------------------------------------------------------------------------------- /hadoop-pot-assembly/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | gov.nasa.jpl.memex 6 | hadoop-pot 7 | 1.0-SNAPSHOT 8 | 9 | pooled-time-series 10 | hadoop-pot-assembly 11 | 12 | 13 | 14 | org.apache.maven.plugins 15 | maven-compiler-plugin 16 | 3.3 17 | 18 | 1.7 19 | 1.7 20 | 21 | 22 | 23 | 26 | maven-assembly-plugin 27 | 2.5.4 28 | 29 | 30 | jar-with-dependencies 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | ${project.groupId} 43 | hadoop-pot-core 44 | ${project.version} 45 | 46 | 47 | -------------------------------------------------------------------------------- /hadoop-pot-assembly/src/main/assembly/assembly.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | jar-with-dependencies 6 | 7 | jar 8 | 9 | false 10 | 11 | 12 | / 13 | true 14 | true 15 | runtime 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /hadoop-pot-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | gov.nasa.jpl.memex 5 | hadoop-pot 6 | 1.0-SNAPSHOT 7 | 8 | hadoop-pot-core 9 | 10 | 11 | ${project.groupId} 12 | hadoop-pot-video 13 | ${project.version} 14 | 15 | 16 | org.apache.hadoop 17 | hadoop-common 18 | 2.7.2 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-mapreduce-client-core 23 | 2.7.2 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-mapreduce-client-jobclient 28 | 2.7.2 29 | 30 | 31 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/Deduplicate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.pooledtimeseries; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.File; 21 | import java.io.FileReader; 22 | import java.io.FileWriter; 23 | import java.io.IOException; 24 | import java.io.PrintWriter; 25 | import java.util.Arrays; 26 | import java.util.HashSet; 27 | import java.util.Set; 28 | 29 | public class Deduplicate { 30 | private static final double DEFAULT_THRESHOLD = 0.99d; 31 | 32 | public static void main(String[] args) throws IOException { 33 | if (args.length < 3) { 34 | System.err.println("Improper usage. Execute with below 2 arguments- "); 35 | System.err.println("args[0] - path to CSV file to write the deduped similarity calc csv "); 36 | System.err.println("args[1] - path to List of video names "); 37 | System.err.println("args[2] - Video pairs with similarity score- 'vid1,vid2\t0.5' "); 38 | System.err.println("args[3] - similarity threshold. Default 0.99 "); 39 | throw new RuntimeException("Insufficient Input"); 40 | } 41 | File outFile = new File(args[0]);// CSV file to write the output deduped_similarity_calc.csv 42 | File outFileNames = new File(args[1]);// List of video names 43 | 44 | if (outFile.exists() || outFileNames.exists()) { 45 | throw new RuntimeException(String.format("Some output file already esists - %s , %s ", outFile.getAbsolutePath() 46 | , outFileNames.getAbsolutePath()) ); 47 | } 48 | 49 | File simFile = new File(args[2]);// Video pairs with similarity score 50 | double threshold = args.length == 4 ? Double.parseDouble(args[3]) : DEFAULT_THRESHOLD; 51 | // All videos to discard 52 | Set videosToDelete = new HashSet<>(); 53 | // One video from each similar set will be kept 54 | Set videosToKeep = new HashSet<>(); 55 | 56 | BufferedReader br = new BufferedReader(new FileReader(simFile)); 57 | String simLine = null; 58 | while ((simLine = br.readLine()) != null) { 59 | storeVideosToDelete(videosToDelete, videosToKeep, simLine, threshold); 60 | 61 | } 62 | br.close(); 63 | 64 | // Write output in outFile 65 | PrintWriter similarity = new PrintWriter(new FileWriter(outFile, true)); 66 | //reset videosToKeep for outputting video names 67 | videosToKeep = new HashSet<>(); 68 | 69 | br = new BufferedReader(new FileReader(simFile)); 70 | simLine = null; 71 | while ((simLine = br.readLine()) != null) { 72 | String[] pairAndScore = simLine.split("\t"); 73 | String[] pair = pairAndScore[0].split(","); 74 | boolean vid1InDelete = videosToDelete.contains(pair[0]); 75 | boolean vid2InDelete = videosToDelete.contains(pair[1]); 76 | if (vid1InDelete || vid2InDelete) { 77 | continue; 78 | }else{ 79 | videosToKeep.addAll(Arrays.asList(pair)); 80 | similarity.println(simLine); 81 | } 82 | 83 | } 84 | br.close(); 85 | similarity.close(); 86 | 87 | PrintWriter listOfFile = new PrintWriter(new FileWriter(outFileNames, true)); 88 | for (String videos : videosToKeep){ 89 | listOfFile.println(videos); 90 | } 91 | listOfFile.close(); 92 | 93 | System.out.println("Stored results in: " + outFile.getAbsolutePath()); 94 | 95 | } 96 | 97 | private static void storeVideosToDelete(Set videosToDelete, Set videosToKeep, String simLine, double threshold) { 98 | String[] pairAndScore = simLine.split("\t"); 99 | double score = Double.parseDouble(pairAndScore[1]); 100 | if (score >= threshold) { 101 | String[] pair = pairAndScore[0].split(","); 102 | String vid1 = pair[0]; 103 | String vid2 = pair[1]; 104 | if(vid1.equals(vid2)){ 105 | return; 106 | } 107 | boolean vid1InKeep = videosToKeep.contains(pair[0]); 108 | boolean vid2InKeep = 
videosToKeep.contains(pair[1]); 109 | boolean vid1InDelete = videosToDelete.contains(pair[0]); 110 | boolean vid2InDelete = videosToDelete.contains(pair[1]); 111 | 112 | if (vid1InDelete || vid2InDelete) { 113 | return; 114 | } 115 | // None of the video is kept 116 | if (!vid1InKeep && !vid2InKeep) { 117 | videosToKeep.add(vid1); 118 | videosToDelete.add(vid2); 119 | } else if (vid1InKeep && vid2InKeep) {// Both of the video are kept 120 | videosToDelete.add(vid1);// delete any one of them 121 | videosToKeep.remove(vid1); 122 | } else if (vid1InKeep) { // Only vid1 is in keep 123 | videosToDelete.add(vid2); 124 | } else if (vid2InKeep) { // Only vid2 is in keep 125 | videosToDelete.add(vid1); 126 | } 127 | 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/FormatOutput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.BufferedReader; 21 | import java.io.File; 22 | import java.io.FileReader; 23 | import java.io.FileWriter; 24 | import java.io.IOException; 25 | import java.io.PrintWriter; 26 | import java.nio.file.Files; 27 | import java.nio.file.Path; 28 | import java.nio.file.Paths; 29 | import java.text.DecimalFormat; 30 | import java.util.List; 31 | 32 | import com.google.common.base.Charsets; 33 | 34 | public class FormatOutput { 35 | 36 | /** 37 | * Sample output- 38 | * ,1.mp4, 2.mp4, 3.mp4,
39 | * 1.mp4, 1.0, 0.677986882429, 0.514423983869,
40 | * 2.mp4, , 1.0, 0.12525353988,
41 | * 3.mp4, , , 1.0,
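* (As the sample shows, only the upper triangle of the matrix is populated:
* the diagonal is 1.00 and each mirrored pair below the diagonal is left blank.)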
42 | *
43 | * @param args
44 | * args[0] - path of the CSV file to which the formatted similarity matrix is written
45 | * args[1] - path to the file of video pairs with similarity scores, one 'vid1,vid2\t0.5'-style record per line
46 | * args[2] - path to the file listing all videos, one name per line
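*
* Example invocation (the jar comes from the README build step; the file names
* here are illustrative, not defaults):
*   java -cp pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar \
*        org.pooledtimeseries.FormatOutput formatted_similarity_calc.csv similarity.txt video_list.txt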
47 | * 48 | * @throws IOException 49 | */ 50 | public static void main(String[] args) throws IOException { 51 | if (args.length < 3) { 52 | System.err.println("Improper usage. Execute with below 3 arguments- "); 53 | System.err.println("args[0] - path to CSV file to write the formatted similarity calc csv "); 54 | System.err.println("args[1] - Video pairs with similarity score- 'vid1,vid2\t0.5' "); 55 | System.err.println("args[2] - List of all videos "); 56 | throw new RuntimeException("Insufficient Input"); 57 | } 58 | File outFile = new File(args[0]);//CSV file to write the output formatted_similarity_calc.csv 59 | if (outFile.exists()) { 60 | throw new RuntimeException("Output file already exists-" + outFile.getAbsolutePath()); 61 | } 62 | 63 | File simFile = new File(args[1]);//Video pairs with similarity score 64 | 65 | Path inputList = Paths.get(args[2]);// List of all videos 66 | 67 | List videoList = Files.readAllLines(inputList, Charsets.UTF_8); 68 | //adding a blank at first position to match output 69 | videoList.add(0,""); 70 | //Result is a 2D square matrix of size video count + 1 71 | //additional 1 is for storing video file name 72 | String[][] resultMatrix = new String [videoList.size()][videoList.size()]; 73 | System.out.println("Initialised input files and resultMatrix"); 74 | 75 | //init first row with just video name 76 | resultMatrix[0]=videoList.toArray(new String[videoList.size()]); 77 | //init first col with just video name 78 | for (int i=1;i videoList) { 115 | 116 | DecimalFormat df = new DecimalFormat("0.00"); 117 | 118 | String score = ""; 119 | 120 | int indexOfvid1 = 0; 121 | int indexOfvid2 = 0; 122 | 123 | { 124 | // scoped under a brace to limit scope of temp variables 125 | String[] pairAndScore = simLine.split("\t"); 126 | 127 | score = df.format(Double.parseDouble(pairAndScore[1]) ); 128 | String[] pair = pairAndScore[0].split(","); 129 | indexOfvid1 = videoList.indexOf(pair[0]); 130 | indexOfvid2 = videoList.indexOf(pair[1]); 131 | } 132 | 133 | //if this video is not present in input list of video skip it from matrix 134 | //This is used when we create output for a subset of videos 135 | if(indexOfvid2 == -1 || indexOfvid1 == -1) 136 | return; 137 | 138 | //Fill only upper matrix 139 | if (indexOfvid1 < indexOfvid2) { 140 | resultMatrix[indexOfvid1][indexOfvid2]=score; 141 | } else { 142 | //equal score will be one anyway 143 | resultMatrix[indexOfvid2][indexOfvid1]=score; 144 | } 145 | 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/GradientTimeSeries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | import java.io.StringWriter; 23 | import java.util.ArrayList; 24 | import java.util.logging.Level; 25 | import java.util.logging.Logger; 26 | 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.io.LongWritable; 30 | import org.apache.hadoop.io.Text; 31 | import org.apache.hadoop.mapred.FileInputFormat; 32 | import org.apache.hadoop.mapred.FileOutputFormat; 33 | import org.apache.hadoop.mapred.JobClient; 34 | import org.apache.hadoop.mapred.JobConf; 35 | import org.apache.hadoop.mapred.MapReduceBase; 36 | import org.apache.hadoop.mapred.Mapper; 37 | import org.apache.hadoop.mapred.OutputCollector; 38 | import org.apache.hadoop.mapred.Reporter; 39 | import org.apache.hadoop.mapred.TextInputFormat; 40 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 41 | import org.opencv.core.Core; 42 | import org.pooledtimeseries.util.HadoopFileUtil; 43 | import org.pooledtimeseries.util.PoTUtil; 44 | 45 | public class GradientTimeSeries { 46 | private static final Logger LOG = Logger.getLogger(GradientTimeSeries.class.getName()); 47 | 48 | public static class Map extends MapReduceBase implements Mapper { 49 | @Override 50 | public void configure(JobConf job) { 51 | super.configure(job); 52 | PoTUtil.loadOpenCV(); 53 | } 54 | 55 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 56 | 57 | try { 58 | File tempFile = new HadoopFileUtil().copyToTempDir(value.toString()); 59 | double[][] series1 = PoT.getGradientTimeSeries(tempFile.toPath(), 5, 5, 8); 60 | tempFile.delete(); 61 | 62 | String ofVector = saveVectors(series1); 63 | output.collect(value, new Text(ofVector)); 64 | } catch (Exception e) { 65 | LOG.log(Level.SEVERE, "Exception while calling PoT.getGradientTimeSeries", e); 66 | } 67 | } 68 | 69 | private static String saveVectors(double[][] vectors) { 70 | int d = vectors[0].length; 71 | 72 | ArrayList temp_hists = new ArrayList(); 73 | 74 | for (int i = 0; i < vectors.length; i++) { 75 | double[][][] temp_hist = new double[1][1][d]; 76 | temp_hist[0][0] = vectors[i]; 77 | 78 | temp_hists.add(temp_hist); 79 | } 80 | 81 | return getSaveHistogramsOutput(temp_hists); 82 | } 83 | 84 | private static String getSaveHistogramsOutput(ArrayList hists) { 85 | int w_d = hists.get(0).length; 86 | int h_d = hists.get(0)[0].length; 87 | int o_d = hists.get(0)[0][0].length; 88 | 89 | int i, j, k, l; 90 | 91 | StringWriter writer = new StringWriter(); 92 | String head = String.format("%d %d", hists.size(), w_d * h_d * o_d); 93 | writer.write(head); 94 | writer.write("\n"); 95 | 96 | for (l = 0; l < (int) hists.size(); l++) { 97 | double[][][] hist = hists.get(l); 98 | 99 | for (i = 0; i < hist.length; i++) { 100 | for (j = 0; j < hist[0].length; j++) { 101 | for (k = 0; k < hist[0][0].length; k++) { // optical_bins+1 102 | writer.write(String.format("%f ", hist[i][j][k])); 103 | } 104 | } 105 | } 106 | 107 | writer.write("\n"); 108 | } 109 | 110 | return writer.toString(); 111 | } 112 | } 113 | 114 | public static class MultiFileOutput extends MultipleTextOutputFormat { 115 | protected String generateFileNameForKeyValue(Text key, Text value, String name) { 116 | String[] splitPath = key.toString().split("/"); 117 | String fileName = 
splitPath[splitPath.length - 1]; 118 | String fName =fileName + ".hog.txt"; 119 | File file = new File(fName); 120 | if(file.exists()) 121 | file.delete(); 122 | return fName; 123 | } 124 | 125 | protected Text generateActualKey(Text key, Text value) { 126 | return null; 127 | } 128 | } 129 | 130 | public static void main(String[] args) throws Exception { 131 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 132 | 133 | Configuration baseConf = new Configuration(); 134 | baseConf.set("mapred.reduce.tasks", "0"); 135 | JobConf conf = new JobConf(baseConf, GradientTimeSeries.class); 136 | 137 | conf.setJobName("gradient_time_series"); 138 | 139 | conf.setOutputKeyClass(Text.class); 140 | conf.setOutputValueClass(Text.class); 141 | 142 | conf.setMapperClass(Map.class); 143 | 144 | conf.setInputFormat(TextInputFormat.class); 145 | conf.setOutputFormat(MultiFileOutput.class); 146 | 147 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 148 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 149 | 150 | JobClient.runJob(conf); 151 | } 152 | } 153 | 154 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/MeanChiSquareDistanceCalculation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | import java.util.List; 23 | import java.util.logging.Logger; 24 | 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.io.DoubleWritable; 28 | import org.apache.hadoop.io.IntWritable; 29 | import org.apache.hadoop.io.NullWritable; 30 | import org.apache.hadoop.io.BytesWritable; 31 | import org.apache.hadoop.io.Text; 32 | import org.apache.hadoop.mapred.FileOutputFormat; 33 | import org.apache.hadoop.mapred.JobClient; 34 | import org.apache.hadoop.mapred.JobConf; 35 | import org.apache.hadoop.mapred.MapReduceBase; 36 | import org.apache.hadoop.mapred.Mapper; 37 | import org.apache.hadoop.mapred.OutputCollector; 38 | import org.apache.hadoop.mapred.Reducer; 39 | import org.apache.hadoop.mapred.Reporter; 40 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 41 | import org.apache.hadoop.mapred.TextOutputFormat; 42 | import org.pooledtimeseries.cartesian.CartesianInputFormat; 43 | import org.pooledtimeseries.util.PoTSerialiser; 44 | import org.pooledtimeseries.util.ReadSeqFileUtil; 45 | 46 | public class MeanChiSquareDistanceCalculation { 47 | private static final Logger LOG = Logger.getLogger(MeanChiSquareDistanceCalculation.class.getName()); 48 | static int videos=0; 49 | public static class Map extends MapReduceBase implements Mapper { 50 | 51 | @Override 52 | public void map(Text key, BytesWritable value, OutputCollector output, Reporter reporter) throws IOException { 53 | videos++; 54 | System.out.println(videos); 55 | LOG.info("Processing pair - " + key); 56 | long startTime = System.currentTimeMillis(); 57 | 58 | String[] videoFiles = ReadSeqFileUtil.getFileNames(key); 59 | 60 | // If we're looking at a pair of videos where the videos are the same 61 | // we don't include them in the meanChiSquareDistance calculation. 
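// For distinct pairs, dimension i of the two feature vectors is compared with
// PoT.chiSquareDistance and emitted under key i; the reducer below then averages
// these per-dimension distances over all pairs. (Sketch, assuming the
// conventional definition: for histograms u and v the chi-square distance is
// sum_i (u_i - v_i)^2 / (u_i + v_i).)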
62 | if (videoFiles[0].equals(videoFiles[1])) 63 | return; 64 | 65 | List fvList = (List) PoTSerialiser.getObject(value.getBytes()) ; 66 | 67 | LOG.info("Loaded Time Series for pair in - " + (System.currentTimeMillis() - startTime)); 68 | 69 | for (int i = 0; i < fvList.get(0).numDim(); i++) { 70 | 71 | output.collect(new IntWritable(i), new DoubleWritable( 72 | PoT.chiSquareDistance( 73 | fvList.get(0).feature.get(i), 74 | fvList.get(1).feature.get(i) 75 | ) 76 | )); 77 | } 78 | 79 | LOG.info("Completed processing pair - " + key); 80 | LOG.info("Time taken to complete job - " + (System.currentTimeMillis() - startTime)); 81 | } 82 | } 83 | 84 | public static class Reduce extends MapReduceBase implements Reducer{ 85 | 86 | public void reduce(IntWritable key, Iterator values, 87 | OutputCollector output, Reporter reporter) throws IOException { 88 | double sum = 0; 89 | int count = 0; 90 | 91 | while (values.hasNext()){ 92 | sum += values.next().get(); 93 | count++; 94 | } 95 | 96 | output.collect(null, new DoubleWritable(sum / (double) count)); 97 | } 98 | 99 | } 100 | 101 | public static void main(String[] args) throws Exception { 102 | 103 | Configuration baseConf = new Configuration(); 104 | baseConf.set("mapreduce.job.maps", "96"); 105 | baseConf.set("mapred.tasktracker.map.tasks.maximum", "96"); 106 | 107 | JobConf conf = new JobConf(baseConf, MeanChiSquareDistanceCalculation.class); 108 | System.out.println("Before Map:"+ conf.getNumMapTasks()); 109 | conf.setNumMapTasks(96); 110 | System.out.println("After Map:"+ conf.getNumMapTasks()); 111 | 112 | 113 | conf.setJobName("mean_chi_square_calculation"); 114 | 115 | System.out.println("Track:" + baseConf.get("mapred.job.tracker")); 116 | System.out.println("Job Name- "+conf.getJobName()); 117 | System.out.println(baseConf.get("mapreduce.job.maps")); 118 | 119 | conf.setMapOutputKeyClass(IntWritable.class); 120 | conf.setMapOutputValueClass(DoubleWritable.class); 121 | conf.setOutputKeyClass(IntWritable.class); 122 | conf.setOutputValueClass(DoubleWritable.class); 123 | 124 | conf.setOutputFormat(TextOutputFormat.class); 125 | 126 | conf.setInputFormat(CartesianInputFormat.class); 127 | CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, 128 | args[0]); 129 | CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, 130 | args[0]); 131 | 132 | 133 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 134 | 135 | conf.setMapperClass(Map.class); 136 | conf.setReducerClass(Reduce.class); 137 | 138 | JobClient.runJob(conf); 139 | } 140 | } 141 | 142 | 143 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/OpticalTimeSeries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | 21 | 22 | import java.io.File; 23 | import java.io.IOException; 24 | import java.io.StringWriter; 25 | import java.util.ArrayList; 26 | import java.util.logging.Level; 27 | import java.util.logging.Logger; 28 | 29 | import org.apache.hadoop.conf.Configuration; 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.io.LongWritable; 32 | import org.apache.hadoop.io.Text; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.FileOutputFormat; 35 | import org.apache.hadoop.mapred.JobClient; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.mapred.MapReduceBase; 38 | import org.apache.hadoop.mapred.Mapper; 39 | import org.apache.hadoop.mapred.OutputCollector; 40 | import org.apache.hadoop.mapred.Reporter; 41 | import org.apache.hadoop.mapred.TextInputFormat; 42 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 43 | import org.opencv.core.Core; 44 | import org.pooledtimeseries.util.HadoopFileUtil; 45 | import org.pooledtimeseries.util.PoTUtil; 46 | 47 | public class OpticalTimeSeries { 48 | private static final Logger LOG = Logger.getLogger(OpticalTimeSeries.class.getName()); 49 | 50 | public static class Map extends MapReduceBase implements Mapper { 51 | @Override 52 | public void configure(JobConf job) { 53 | super.configure(job); 54 | PoTUtil.loadOpenCV(); 55 | } 56 | 57 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 58 | 59 | try { 60 | File tempFile = new HadoopFileUtil().copyToTempDir(value.toString()); 61 | double[][] series1 = PoT.getOpticalTimeSeries(tempFile.toPath(), 5, 5, 8); 62 | tempFile.delete(); 63 | 64 | String ofVector = saveVectors(series1); 65 | output.collect(value, new Text(ofVector)); 66 | } catch (Exception e) { 67 | e.printStackTrace(); 68 | LOG.log(Level.SEVERE, "Exception while calling PoT.getOpticalTimeSeries", e); 69 | } 70 | } 71 | 72 | private static String saveVectors(double[][] vectors) { 73 | int d = vectors[0].length; 74 | 75 | ArrayList temp_hists = new ArrayList(); 76 | 77 | for (int i = 0; i < vectors.length; i++) { 78 | double[][][] temp_hist = new double[1][1][d]; 79 | temp_hist[0][0] = vectors[i]; 80 | 81 | temp_hists.add(temp_hist); 82 | } 83 | 84 | return getSaveHistogramsOutput(temp_hists); 85 | } 86 | 87 | private static String getSaveHistogramsOutput(ArrayList hists) { 88 | int w_d = hists.get(0).length; 89 | int h_d = hists.get(0)[0].length; 90 | int o_d = hists.get(0)[0][0].length; 91 | 92 | int i, j, k, l; 93 | 94 | StringWriter writer = new StringWriter(); 95 | String head = String.format("%d %d", hists.size(), w_d * h_d * o_d); 96 | writer.write(head); 97 | writer.write("\n"); 98 | 99 | for (l = 0; l < (int) hists.size(); l++) { 100 | double[][][] hist = hists.get(l); 101 | 102 | for (i = 0; i < hist.length; i++) { 103 | for (j = 0; j < hist[0].length; j++) { 104 | for (k = 0; k < hist[0][0].length; k++) { // optical_bins+1 105 | writer.write(String.format("%f ", hist[i][j][k])); 106 | } 107 | } 108 | 
} 109 | 110 | writer.write("\n"); 111 | } 112 | 113 | return writer.toString(); 114 | } 115 | } 116 | 117 | public static class MultiFileOutput extends MultipleTextOutputFormat { 118 | protected String generateFileNameForKeyValue(Text key, Text value, String name) { 119 | String[] splitPath = key.toString().split("/"); 120 | String fileName = splitPath[splitPath.length - 1]; 121 | String fName =fileName + ".of.txt"; 122 | File file = new File(fName); 123 | if(file.exists()) 124 | file.delete(); 125 | return fName; 126 | } 127 | 128 | protected Text generateActualKey(Text key, Text value) { 129 | return null; 130 | } 131 | } 132 | 133 | public static void main(String[] args) throws Exception { 134 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 135 | LOG.info("Loaded- " + Core.NATIVE_LIBRARY_NAME); 136 | 137 | Configuration baseConf = new Configuration(); 138 | baseConf.set("mapred.reduce.tasks", "0"); 139 | JobConf conf = new JobConf(baseConf, OpticalTimeSeries.class); 140 | 141 | conf.setJobName("optical_time_series"); 142 | 143 | conf.setOutputKeyClass(Text.class); 144 | conf.setOutputValueClass(Text.class); 145 | 146 | conf.setMapperClass(Map.class); 147 | 148 | conf.setInputFormat(TextInputFormat.class); 149 | conf.setOutputFormat(MultiFileOutput.class); 150 | 151 | FileInputFormat.setInputPaths(conf, new Path(args[0])); 152 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 153 | 154 | JobClient.runJob(conf); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/SimilarityCalculation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.Scanner; 26 | import java.util.logging.Logger; 27 | 28 | import org.apache.commons.lang.ArrayUtils; 29 | import org.apache.hadoop.fs.Path; 30 | import org.apache.hadoop.io.BytesWritable; 31 | import org.apache.hadoop.io.Text; 32 | import org.apache.hadoop.mapred.FileOutputFormat; 33 | import org.apache.hadoop.mapred.JobClient; 34 | import org.apache.hadoop.mapred.JobConf; 35 | import org.apache.hadoop.mapred.MapReduceBase; 36 | import org.apache.hadoop.mapred.Mapper; 37 | import org.apache.hadoop.mapred.OutputCollector; 38 | import org.apache.hadoop.mapred.Reporter; 39 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 40 | import org.apache.hadoop.mapred.TextOutputFormat; 41 | import org.pooledtimeseries.cartesian.CartesianInputFormat; 42 | import org.pooledtimeseries.util.HadoopFileUtil; 43 | import org.pooledtimeseries.util.PoTSerialiser; 44 | import org.pooledtimeseries.util.ReadSeqFileUtil; 45 | 46 | public class SimilarityCalculation { 47 | 48 | private static final Logger LOG = Logger.getLogger(SimilarityCalculation.class.getName()); 49 | 50 | static int videos = 0; 51 | 52 | public static class Map extends MapReduceBase implements Mapper { 53 | 54 | double[] meanDists = null; 55 | 56 | @Override 57 | public void configure(JobConf conf) { 58 | super.configure(conf); 59 | String meanDistsPath = conf.get("meanDistsFilePath"); 60 | List meanDistsList = new ArrayList(); 61 | InputStream in = null; 62 | try { 63 | in = HadoopFileUtil.getInputStreamFromHDFS(meanDistsPath); 64 | Scanner scin = new Scanner(in) ; 65 | while (scin.hasNextDouble()) { 66 | meanDistsList.add(scin.nextDouble()); 67 | } 68 | scin.close(); 69 | } catch (IOException e) { 70 | e.printStackTrace(); 71 | } finally { 72 | if(in !=null){ 73 | try { 74 | in.close(); 75 | } catch (IOException e) {} 76 | } 77 | } 78 | 79 | this.meanDists = ArrayUtils.toPrimitive(meanDistsList.toArray(new Double[0])); 80 | LOG.info("Loaded meanDist of length - " + meanDists.length); 81 | } 82 | 83 | @Override 84 | public void map(Text key, BytesWritable value, OutputCollector output, Reporter reporter) 85 | throws IOException { 86 | videos++; 87 | LOG.info("Processing pair - " + key); 88 | long startTime = System.currentTimeMillis(); 89 | 90 | String[] videoPaths = ReadSeqFileUtil.getFileNames(key); 91 | 92 | List fvList = (List) PoTSerialiser.getObject(value.getBytes()) ; 93 | LOG.info("Loaded Time Series for pair in - " + (System.currentTimeMillis() - startTime)); 94 | 95 | double similarity = PoT.kernelDistance(fvList.get(0), fvList.get(1), meanDists); 96 | 97 | File p1 = new File(videoPaths[0]); 98 | File p2 = new File(videoPaths[1]); 99 | output.collect(new Text(p1.getName() + ',' + p2.getName()), new Text(String.valueOf(similarity))); 100 | 101 | LOG.info("Completed processing pair - " + key); 102 | LOG.info("Time taken to complete job - " + (System.currentTimeMillis() - startTime)); 103 | } 104 | } 105 | 106 | public static void main(String[] args) throws Exception { 107 | 108 | JobConf conf = new JobConf(); 109 | System.out.println("Before Map:" + conf.getNumMapTasks()); 110 | conf.setNumMapTasks(196); 111 | System.out.println("After Map:" + conf.getNumMapTasks()); 112 | conf.setJobName("similarity_calc"); 113 | 114 | conf.set("meanDistsFilePath", args[2]); 115 | 116 | System.out.println("Job Name: " + 
conf.getJobName()); 117 | conf.setJarByClass(SimilarityCalculation.class); 118 | 119 | conf.setOutputKeyClass(Text.class); 120 | conf.setOutputValueClass(Text.class); 121 | 122 | conf.setInputFormat(CartesianInputFormat.class); 123 | CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, 124 | args[0]); 125 | CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, 126 | args[0]); 127 | 128 | conf.setOutputFormat(TextOutputFormat.class); 129 | 130 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 131 | 132 | conf.setMapperClass(Map.class); 133 | 134 | JobClient.runJob(conf); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/cartesian/CartesianInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries.cartesian; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.InputSplit; 26 | import org.apache.hadoop.mapred.JobConf; 27 | import org.apache.hadoop.mapred.RecordReader; 28 | import org.apache.hadoop.mapred.Reporter; 29 | import org.apache.hadoop.mapred.join.CompositeInputSplit; 30 | import org.apache.hadoop.util.ReflectionUtils; 31 | 32 | public class CartesianInputFormat extends FileInputFormat { 33 | 34 | public static final Log LOG = LogFactory.getLog(CartesianInputFormat.class); 35 | 36 | public static final String LEFT_INPUT_FORMAT = "cart.left.inputformat"; 37 | public static final String LEFT_INPUT_PATH = "cart.left.path"; 38 | public static final String RIGHT_INPUT_FORMAT = "cart.right.inputformat"; 39 | public static final String RIGHT_INPUT_PATH = "cart.right.path"; 40 | 41 | public static void setLeftInputInfo(JobConf conf, Class inputFormat, String inputPath) { 42 | conf.set(LEFT_INPUT_FORMAT, inputFormat.getCanonicalName()); 43 | conf.set(LEFT_INPUT_PATH, inputPath); 44 | } 45 | 46 | public static void setRightInputInfo(JobConf job, Class inputFormat, String inputPath) { 47 | job.set(RIGHT_INPUT_FORMAT, inputFormat.getCanonicalName()); 48 | job.set(RIGHT_INPUT_PATH, inputPath); 49 | } 50 | 51 | @Override 52 | public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException { 53 | 54 | try { 55 | // Get the input splits from both the left and right data sets 56 | InputSplit[] leftSplits = getInputSplits(conf, conf.get(LEFT_INPUT_FORMAT), conf.get(LEFT_INPUT_PATH), 57 | numSplits); 58 | InputSplit[] rightSplits = 
getInputSplits(conf, conf.get(RIGHT_INPUT_FORMAT), conf.get(RIGHT_INPUT_PATH), 59 | numSplits); 60 | 61 | // Create our CartesianInputSplits, size equal to left.length * 62 | // right.length 63 | CompositeInputSplit[] returnSplits = new CompositeInputSplit[((leftSplits.length * (rightSplits.length - 1)) 64 | / 2) + leftSplits.length]; 65 | 66 | int i = 0; 67 | // For each of the left input splits 68 | for (int leftLoop = 0; leftLoop < leftSplits.length; leftLoop++) { 69 | InputSplit left = leftSplits[leftLoop]; 70 | // For each of the right input splits 71 | 72 | for (int rightLoop = leftLoop; rightLoop < rightSplits.length; rightLoop++) { 73 | InputSplit right = rightSplits[rightLoop]; 74 | // Create a new composite input split composing of the two 75 | 76 | returnSplits[i] = new CompositeInputSplit(2); 77 | returnSplits[i].add(left); 78 | returnSplits[i].add(right); 79 | ++i; 80 | } 81 | } 82 | 83 | // Return the composite splits 84 | LOG.info("Total splits to process: " + returnSplits.length); 85 | return returnSplits; 86 | } catch (ClassNotFoundException e) { 87 | e.printStackTrace(); 88 | throw new IOException(e); 89 | } 90 | } 91 | 92 | @Override 93 | public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException { 94 | // create a new instance of the Cartesian record reader 95 | return new CartesianRecordReader((CompositeInputSplit) split, conf, reporter); 96 | } 97 | 98 | private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits) 99 | throws ClassNotFoundException, IOException { 100 | // Create a new instance of the input format 101 | FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), 102 | conf); 103 | 104 | // Set the input path for the left data set 105 | inputFormat.setInputPaths(conf, inputPath); 106 | 107 | // Get the left input splits 108 | return inputFormat.getSplits(conf, numSplits); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/cartesian/CartesianRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.cartesian; 19 | 20 | import java.io.IOException; 21 | import java.util.List; 22 | 23 | import org.apache.hadoop.io.BytesWritable; 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.mapred.FileInputFormat; 26 | import org.apache.hadoop.mapred.InputSplit; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.RecordReader; 29 | import org.apache.hadoop.mapred.Reporter; 30 | import org.apache.hadoop.mapred.join.CompositeInputSplit; 31 | import org.apache.hadoop.util.ReflectionUtils; 32 | import org.pooledtimeseries.FeatureVector; 33 | import org.pooledtimeseries.util.PoTSerialiser; 34 | 35 | public class CartesianRecordReader 36 | implements RecordReader { 37 | 38 | // Record readers to get key value pairs 39 | private RecordReader leftRR = null, rightRR = null; 40 | 41 | // Store configuration to re-create the right record reader 42 | private FileInputFormat rightFIF; 43 | private JobConf rightConf; 44 | private InputSplit rightIS; 45 | private Reporter rightReporter; 46 | // if left and right are same splits this flag is set 47 | // It's used to avoid repeated pairs 48 | // for l=1,2 r =1,2 pair=11,12,22 49 | private boolean pairWithItself; 50 | 51 | // Helper variables 52 | private K1 lkey; 53 | private V1 lvalue; 54 | private K2 rkey; 55 | private V2 rvalue; 56 | private boolean goToNextLeft = true, alldone = false; 57 | private int rightShiftCount = 1; 58 | 59 | /** 60 | * Creates a new instance of the CartesianRecordReader 61 | * 62 | * @param split 63 | * @param conf 64 | * @param reporter 65 | * @throws IOException 66 | */ 67 | public CartesianRecordReader(CompositeInputSplit split, JobConf conf, Reporter reporter) throws IOException { 68 | this.rightConf = conf; 69 | this.rightIS = split.get(1); 70 | this.rightReporter = reporter; 71 | 72 | try { 73 | // Create left record reader 74 | FileInputFormat leftFIF = (FileInputFormat) ReflectionUtils 75 | .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf); 76 | 77 | leftRR = leftFIF.getRecordReader(split.get(0), conf, reporter); 78 | 79 | // Create right record reader 80 | rightFIF = (FileInputFormat) ReflectionUtils 81 | .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf); 82 | 83 | rightRR = rightFIF.getRecordReader(rightIS, rightConf, rightReporter); 84 | } catch (ClassNotFoundException e) { 85 | 86 | e.printStackTrace(); 87 | throw new IOException(e); 88 | } 89 | 90 | // Create key value pairs for parsing 91 | lkey = (K1) this.leftRR.createKey(); 92 | lvalue = (V1) this.leftRR.createValue(); 93 | 94 | rkey = (K2) this.rightRR.createKey(); 95 | rvalue = (V2) this.rightRR.createValue(); 96 | } 97 | 98 | @Override 99 | public Text createKey() { 100 | return new Text(); 101 | } 102 | 103 | @Override 104 | public BytesWritable createValue() { 105 | return new BytesWritable(); 106 | } 107 | 108 | @Override 109 | public long getPos() throws IOException { 110 | return leftRR.getPos(); 111 | } 112 | 113 | @Override 114 | public boolean next(Text key, BytesWritable value) throws IOException { 115 | 116 | do { 117 | // If we are to go to the next left key/value pair 118 | if (goToNextLeft) { 119 | // Read the next key value pair, false means no more pairs 120 | if (!leftRR.next(lkey, lvalue)) { 121 | // If no more, then this task is nearly finished 122 | alldone = true; 123 | break; 124 | } else { 125 | // If we aren't done, set the value to the key and set 126 | // our flags 127 | 
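// goToNextLeft = false: keep pairing the current left record with right records;
// alldone = false: the left reader still has records, so this split is not finished.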
goToNextLeft = alldone = false; 128 | 129 | // Reset the right record reader 130 | this.rightRR = this.rightFIF.getRecordReader(this.rightIS, this.rightConf, this.rightReporter); 131 | } 132 | 133 | if (this.pairWithItself) { 134 | // shifting right data set to avoid repeated pairs 135 | // we consider a,b == b,a 136 | for (int i = 0; i < rightShiftCount; i++) { 137 | rightRR.next(rkey, rvalue); 138 | } 139 | rightShiftCount++; 140 | } 141 | } 142 | 143 | // Read the next key value pair from the right data set 144 | if (rightRR.next(rkey, rvalue)) { 145 | // If success, set key and value for left and right splits 146 | key.set(lkey.toString() + "~" + rkey.toString()); 147 | // Merge FeatureVector of both videos 148 | // Order is important and should be same as order of key 149 | List featureList = (List)PoTSerialiser.getObject(lvalue.getBytes()); 150 | featureList.addAll((List) PoTSerialiser.getObject(rvalue.getBytes())); 151 | byte[] featureListBytes = PoTSerialiser.getBytes(featureList); 152 | value.set(featureListBytes, 0, featureListBytes.length); 153 | 154 | // This assumes that key will always be unique among all splits 155 | if (lkey.toString().equals(rkey.toString())) { 156 | this.pairWithItself = true; 157 | } 158 | } else { 159 | // Otherwise, this right data set is complete 160 | // and we should go to the next left pair 161 | goToNextLeft = true; 162 | } 163 | 164 | // This loop will continue if we finished reading key/value 165 | // pairs from the right data set 166 | } while (goToNextLeft); 167 | 168 | if (alldone) { 169 | // reset shift counter 170 | rightShiftCount = 1; 171 | this.pairWithItself = false; 172 | } 173 | // Return true if a key/value pair was read, false otherwise 174 | return !alldone; 175 | } 176 | 177 | public void close() throws IOException { 178 | leftRR.close(); 179 | rightRR.close(); 180 | } 181 | 182 | public float getProgress() throws IOException { 183 | return leftRR.getProgress(); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/healthcheck/CheckCartesianProductSeqFile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.healthcheck; 19 | 20 | import java.io.IOException; 21 | import java.util.Iterator; 22 | import java.util.List; 23 | 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.BytesWritable; 26 | import org.apache.hadoop.io.IntWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.JobClient; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.apache.hadoop.mapred.MapReduceBase; 31 | import org.apache.hadoop.mapred.Mapper; 32 | import org.apache.hadoop.mapred.OutputCollector; 33 | import org.apache.hadoop.mapred.Reducer; 34 | import org.apache.hadoop.mapred.Reporter; 35 | import org.apache.hadoop.mapred.RunningJob; 36 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 37 | import org.apache.hadoop.mapred.TextOutputFormat; 38 | import org.apache.hadoop.util.GenericOptionsParser; 39 | import org.pooledtimeseries.FeatureVector; 40 | import org.pooledtimeseries.cartesian.CartesianInputFormat; 41 | import org.pooledtimeseries.seqfile.TextVectorsToSequenceFile; 42 | import org.pooledtimeseries.util.PoTSerialiser; 43 | import org.pooledtimeseries.util.ReadSeqFileUtil; 44 | 45 | /** 46 | * Program for verifying Sequence File generated by {@link TextVectorsToSequenceFile}
47 | * If the SeqFile is correct, the logs for this job will have printed the expected keys and sizes.
48 | * The output of this job will have two records:
49 | * - Number of pairs with similar key 50 | * - Number of pairs with different keys 51 | */ 52 | public class CheckCartesianProductSeqFile { 53 | 54 | public static class CartesianMapper extends MapReduceBase implements Mapper { 55 | 56 | private Text simkey = new Text("simkey"); 57 | private Text diskey = new Text("diskey"); 58 | private static final IntWritable one = new IntWritable(1); 59 | 60 | public void map(Text key, BytesWritable value, OutputCollector output, Reporter reporter) 61 | throws IOException { 62 | // System.out.println(value); 63 | System.out.println(key); 64 | System.out.println(""); 65 | 66 | System.out.println("Size- "+ ((List) PoTSerialiser.getObject(value.getBytes()) ).size() ); 67 | 68 | System.out.println(); 69 | // If the two values are equal add one to output 70 | String[] files = ReadSeqFileUtil.getFileNames(key); 71 | if (files[0].equals(files[1])){ 72 | output.collect(simkey, one); 73 | }else{ 74 | output.collect(diskey, one); 75 | } 76 | 77 | 78 | } 79 | } 80 | 81 | public static class CartesianReducer extends MapReduceBase implements Reducer { 82 | private Text outputVal = new Text(); 83 | 84 | public void reduce(Text key, Iterator values, OutputCollector output, 85 | Reporter reporter) throws IOException { 86 | int sum = 0; 87 | while (values.hasNext()) { 88 | sum += values.next().get(); 89 | } 90 | outputVal.set("" + sum); 91 | output.collect(key, outputVal); 92 | } 93 | 94 | } 95 | 96 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 97 | 98 | long start = System.currentTimeMillis(); 99 | JobConf conf = new JobConf("Cartesian Product"); 100 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 101 | if (otherArgs.length != 2) { 102 | System.err.println("Usage: CheckCartesianProductSeqFile "); 103 | System.exit(1); 104 | } 105 | 106 | // Configure the join type 107 | conf.setJarByClass(CheckCartesianProductSeqFile.class); 108 | 109 | conf.setMapperClass(CartesianMapper.class); 110 | conf.setReducerClass(CartesianReducer.class); 111 | 112 | conf.setInputFormat(CartesianInputFormat.class); 113 | CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]); 114 | CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]); 115 | 116 | TextOutputFormat.setOutputPath(conf, new Path(otherArgs[1])); 117 | 118 | conf.setOutputKeyClass(Text.class); 119 | conf.setOutputValueClass(IntWritable.class); 120 | 121 | RunningJob job = JobClient.runJob(conf); 122 | while (!job.isComplete()) { 123 | Thread.sleep(1000); 124 | } 125 | 126 | long finish = System.currentTimeMillis(); 127 | 128 | System.out.println("Time in ms: " + (finish - start)); 129 | 130 | System.exit(job.isSuccessful() ? 0 : 2); 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/seqfile/FullFileInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries.seqfile; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.fs.Path; 23 | import org.apache.hadoop.io.BytesWritable; 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.mapreduce.InputSplit; 26 | import org.apache.hadoop.mapreduce.JobContext; 27 | import org.apache.hadoop.mapreduce.RecordReader; 28 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 29 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 30 | 31 | public class FullFileInputFormat extends 32 | FileInputFormat { 33 | @Override 34 | protected boolean isSplitable(JobContext context, Path file) { 35 | return false; 36 | } 37 | 38 | @Override 39 | public RecordReader createRecordReader( 40 | InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 41 | FullFileRecordReader reader = new FullFileRecordReader(); 42 | reader.initialize(split, context); 43 | return reader; 44 | } 45 | } -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/seqfile/FullFileRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.seqfile; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.io.BytesWritable; 24 | import org.apache.hadoop.io.Text; 25 | import org.apache.hadoop.mapreduce.InputSplit; 26 | import org.apache.hadoop.mapreduce.RecordReader; 27 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 28 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 29 | import org.pooledtimeseries.util.PoTConstants; 30 | import org.pooledtimeseries.util.PoTSerialiser; 31 | import org.pooledtimeseries.util.ReadSeqFileUtil; 32 | 33 | public class FullFileRecordReader extends RecordReader { 34 | public static final byte[] VECTOR_SEPERATOR = PoTConstants.VECTOR_SEPERATOR.getBytes(); 35 | 36 | private FileSplit fileSplit; 37 | private Configuration conf; 38 | private BytesWritable value = new BytesWritable(); 39 | private Text key = new Text(); 40 | 41 | private boolean processed = false; 42 | 43 | @Override 44 | public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 45 | this.fileSplit = (FileSplit) split; 46 | this.conf = context.getConfiguration(); 47 | } 48 | 49 | @Override 50 | public boolean nextKeyValue() throws IOException, InterruptedException { 51 | if (!processed) { 52 | 53 | String files[] = new String[2]; 54 | files[0] = fileSplit.getPath().toString() + ".of.txt"; 55 | files[1] = fileSplit.getPath().toString() + ".hog.txt"; 56 | 57 | byte[] listFeatures = PoTSerialiser.getBytes(ReadSeqFileUtil.computeFeatureFromSeries(files) ); 58 | 59 | value.set(listFeatures, 0, listFeatures.length ); 60 | key.set(fileSplit.getPath().toString()); 61 | processed = true; 62 | return true; 63 | } 64 | return false; 65 | } 66 | 67 | 68 | @Override 69 | public Text getCurrentKey() throws IOException, InterruptedException { 70 | return key; 71 | } 72 | 73 | @Override 74 | public BytesWritable getCurrentValue() throws IOException, InterruptedException { 75 | return value; 76 | } 77 | 78 | @Override 79 | public float getProgress() throws IOException { 80 | return processed ? 1.0f : 0.0f; 81 | } 82 | 83 | @Override 84 | public void close() throws IOException { 85 | // do nothing 86 | } 87 | } -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/seqfile/PoTVideoPathFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.seqfile; 19 | 20 | import java.io.IOException; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.conf.Configured; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.fs.PathFilter; 27 | 28 | public class PoTVideoPathFilter extends Configured implements PathFilter{ 29 | Configuration conf; 30 | FileSystem fs; 31 | 32 | @Override 33 | public boolean accept(Path path) { 34 | try { 35 | if (fs.isDirectory(path)) { 36 | return true; 37 | } else { 38 | //only accept files with mp4 39 | if (path.getName().endsWith(".mp4")) { 40 | return true; 41 | } 42 | } 43 | } catch (IOException e) { 44 | e.printStackTrace(); 45 | } 46 | return false; 47 | } 48 | 49 | @Override 50 | public void setConf(Configuration conf) { 51 | this.conf = conf; 52 | if (conf != null) { 53 | try { 54 | fs = FileSystem.get(conf); 55 | } catch (IOException e) { 56 | e.printStackTrace(); 57 | } 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/seqfile/TextVectorsToSequenceFile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.seqfile; 19 | 20 | import java.io.IOException; 21 | import java.util.logging.Logger; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.conf.Configured; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.io.BytesWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapreduce.InputSplit; 29 | import org.apache.hadoop.mapreduce.Job; 30 | import org.apache.hadoop.mapreduce.Mapper; 31 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 32 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 33 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 34 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 35 | 36 | public class TextVectorsToSequenceFile extends Configured { 37 | static class SequenceFileMapper extends 38 | Mapper { 39 | private static final Logger LOG = Logger.getLogger(TextVectorsToSequenceFile.class.getName()); 40 | 41 | private Text filename; 42 | 43 | @Override 44 | protected void setup(Context context) throws IOException, 45 | InterruptedException { 46 | InputSplit split = context.getInputSplit(); 47 | Path path = ((FileSplit) split).getPath(); 48 | filename = new Text(path.toString()); 49 | } 50 | 51 | @Override 52 | protected void map(Text key, BytesWritable value, 53 | Context context) throws IOException, InterruptedException { 54 | LOG.info("Processing filename- " + filename); 55 | context.write(filename, value); 56 | } 57 | } 58 | 59 | 60 | public static void main(String[] args) throws Exception { 61 | Configuration conf = new Configuration(); 62 | Job job = Job.getInstance(conf); 63 | job.setJarByClass(TextVectorsToSequenceFile.class); 64 | job.setJobName("smallfilestoseqfile"); 65 | job.setInputFormatClass(FullFileInputFormat.class); 66 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 67 | 68 | job.setNumReduceTasks(1); 69 | FullFileInputFormat.setInputPaths(job, new Path(args[0])); 70 | FileInputFormat.setInputPathFilter(job, PoTVideoPathFilter.class); 71 | 72 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 73 | 74 | job.setOutputKeyClass(Text.class); 75 | job.setOutputValueClass(BytesWritable.class); 76 | job.setMapperClass(SequenceFileMapper.class); 77 | job.waitForCompletion(true); 78 | 79 | } 80 | 81 | 82 | } -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/util/ClassScope.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | import java.util.Vector; 21 | import java.util.logging.Logger; 22 | 23 | public class ClassScope { 24 | private static java.lang.reflect.Field LIBRARIES; 25 | private static final Logger LOG = Logger.getLogger(ClassScope.class.getName()); 26 | 27 | static { 28 | try { 29 | LIBRARIES = ClassLoader.class.getDeclaredField("loadedLibraryNames"); 30 | } catch (Exception e) { 31 | LIBRARIES = null; 32 | e.printStackTrace(); 33 | } 34 | LIBRARIES.setAccessible(true); 35 | } 36 | 37 | private static Vector getLoadedLibraries(final ClassLoader loader) throws Exception { 38 | final Vector libraries = (Vector) LIBRARIES.get(loader); 39 | return libraries; 40 | } 41 | 42 | public static boolean isLibraryLoaded(String library) { 43 | try { 44 | final Vector libraries = ClassScope.getLoadedLibraries(ClassLoader.getSystemClassLoader()); 45 | LOG.info("Libraries found - " + libraries); 46 | return libraries.contains(library); 47 | } catch (Exception e) { 48 | e.printStackTrace(); 49 | return false; 50 | } 51 | 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/util/HadoopFileUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | import java.io.File; 21 | import java.io.FileOutputStream; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.io.OutputStream; 25 | import java.net.URI; 26 | import java.util.Arrays; 27 | import java.util.logging.Level; 28 | import java.util.logging.Logger; 29 | 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.fs.FSDataInputStream; 32 | import org.apache.hadoop.fs.FileSystem; 33 | import org.apache.hadoop.fs.Path; 34 | import org.apache.hadoop.io.IOUtils; 35 | 36 | import com.google.common.io.Files; 37 | 38 | public class HadoopFileUtil { 39 | private static final Logger LOG = Logger.getLogger(HadoopFileUtil.class.getName()); 40 | 41 | /** 42 | * Copies file to a temporary directory and return File object to temporary file 43 | */ 44 | public File copyToTempDir(String value) throws IOException { 45 | Path videoPath = new Path(value.toString()); 46 | videoPath.getFileSystem(new Configuration()); 47 | 48 | LOG.info("Reading file from - " + videoPath); 49 | 50 | File tempDir = Files.createTempDir(); 51 | 52 | // Get the filesystem - HDFS 53 | FileSystem fs = FileSystem.get(URI.create(value.toString()), new Configuration()); 54 | 55 | // Open the path mentioned in HDFS 56 | FSDataInputStream in = null; 57 | OutputStream out = null; 58 | LOG.info("Copying file to a TempDir - " + tempDir.getPath()); 59 | try { 60 | in = fs.open(videoPath); 61 | LOG.info("Available byte - " + in.available()); 62 | out = new FileOutputStream(tempDir.getAbsolutePath() + "/" + videoPath.getName()); 63 | IOUtils.copyBytes(in, out, new Configuration()); 64 | 65 | } catch (Exception e) { 66 | LOG.log(Level.SEVERE, "Error while copying to TempDir", e); 67 | return null; 68 | } finally { 69 | try { 70 | in.close(); 71 | out.close(); 72 | } catch (Exception e) {} 73 | } 74 | LOG.info("Available files - " + Arrays.asList(tempDir.listFiles()) ); 75 | 76 | return new File(tempDir.getAbsolutePath() + "/" + videoPath.getName()); 77 | } 78 | 79 | public static InputStream getInputStreamFromHDFS(String pathToHDFS) throws IOException{ 80 | Path videoPath = new Path(pathToHDFS.toString()); 81 | return videoPath.getFileSystem(new Configuration()).open(videoPath); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/util/PoTConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | public class PoTConstants { 21 | public static final String VECTOR_SEPERATOR = "|"; 22 | public static final String FILE_SEPERATOR = "~"; 23 | public static final String VECTOR_SEPERATOR_REGEX = "\\|"; 24 | 25 | } 26 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/util/PoTSerialiser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | import java.io.ByteArrayInputStream; 21 | import java.io.ByteArrayOutputStream; 22 | import java.io.ObjectInput; 23 | import java.io.ObjectInputStream; 24 | import java.io.ObjectOutput; 25 | import java.io.ObjectOutputStream; 26 | import java.util.logging.Level; 27 | import java.util.logging.Logger; 28 | 29 | public class PoTSerialiser { 30 | 31 | private static final Logger LOG = Logger.getLogger(PoTSerialiser.class.getName()); 32 | 33 | public static byte[] getBytes(Object value) { 34 | long start = System.currentTimeMillis(); 35 | ByteArrayOutputStream bos = new ByteArrayOutputStream(); 36 | ObjectOutput out = null; 37 | byte[] byteArr = null; 38 | try { 39 | out = new ObjectOutputStream(bos); 40 | out.writeObject(value); 41 | byteArr = bos.toByteArray(); 42 | LOG.fine("Time taken serializing - " + (System.currentTimeMillis() - start)); 43 | } catch (Exception e) { 44 | LOG.log(Level.SEVERE, "Unable to serialize", e); 45 | 46 | } finally { 47 | try { 48 | if (out != null) { 49 | out.close(); 50 | } 51 | } catch (Exception ex) { 52 | // ignore close exception 53 | } 54 | try { 55 | bos.close(); 56 | } catch (Exception ex) { 57 | // ignore close exception 58 | } 59 | } 60 | 61 | return byteArr; 62 | } 63 | 64 | public static Object getObject(byte[] byteArr) { 65 | 66 | if(byteArr == null || byteArr.length == 0){ 67 | return null; 68 | } 69 | long start = System.currentTimeMillis(); 70 | ByteArrayInputStream bis = new ByteArrayInputStream(byteArr); 71 | ObjectInput in = null; 72 | try { 73 | in = new ObjectInputStream(bis); 74 | LOG.fine("Time taken deserializing - " + (System.currentTimeMillis() - start)); 75 | return in.readObject(); 76 | } catch (Exception e) { 77 | LOG.log(Level.SEVERE, "Unable to deserialize", e); 78 | return null; 79 | } finally { 80 | try { 81 | bis.close(); 82 | } catch (Exception ex) { 83 | // ignore close exception 84 | } 85 | try { 86 | if (in != null) { 87 | in.close(); 88 | } 89 | } catch (Exception ex) { 90 | // ignore close exception 91 | } 92 | } 93 | 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- 
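PoTSerialiser's two static helpers are the glue between the Hadoop layer and the video layer: getBytes(Object) produces the byte[] stored in each BytesWritable, and getObject(byte[]) restores it. A minimal round-trip sketch follows; the class name and main() harness are illustrative, not part of the repo.

import java.util.ArrayList;
import java.util.List;

import org.pooledtimeseries.FeatureVector;
import org.pooledtimeseries.util.PoTSerialiser;

public class PoTSerialiserRoundTrip {
    public static void main(String[] args) {
        // FeatureVector is Serializable, so a list of them can round-trip.
        List<FeatureVector> fvList = new ArrayList<FeatureVector>();
        fvList.add(new FeatureVector());

        // Object -> byte[]: the payload CartesianRecordReader packs into
        // the BytesWritable value of each pair.
        byte[] payload = PoTSerialiser.getBytes(fvList);

        // byte[] -> Object: how SimilarityCalculation.Map recovers the
        // list before computing kernelDistance.
        @SuppressWarnings("unchecked")
        List<FeatureVector> restored =
                (List<FeatureVector>) PoTSerialiser.getObject(payload);

        System.out.println("restored " + restored.size() + " feature vector(s)");
    }
}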
/hadoop-pot-core/src/main/java/org/pooledtimeseries/util/PoTUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | import java.util.logging.Logger; 21 | 22 | import org.opencv.core.Core; 23 | import org.pooledtimeseries.SimilarityCalculation; 24 | 25 | public class PoTUtil { 26 | private static final String DEFAULT_LIB_PATH = "/mnt/apps/opencv-2.4.11/release/lib/libopencv_java2411.so"; 27 | private static final Logger LOG = Logger.getLogger(SimilarityCalculation.class.getName()); 28 | 29 | public static void loadOpenCV(String libraryPath){ 30 | 31 | if (!ClassScope.isLibraryLoaded(Core.NATIVE_LIBRARY_NAME)) { 32 | LOG.info("Trying to load - " + Core.NATIVE_LIBRARY_NAME); 33 | try{ 34 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 35 | }catch (java.lang.UnsatisfiedLinkError e){ 36 | System.load(libraryPath); 37 | } 38 | } 39 | } 40 | 41 | public static void loadOpenCV(){ 42 | loadOpenCV(DEFAULT_LIB_PATH); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /hadoop-pot-core/src/main/java/org/pooledtimeseries/util/ReadSeqFileUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.pooledtimeseries.util; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | import java.util.logging.Level; 24 | import java.util.logging.Logger; 25 | 26 | import org.apache.hadoop.io.Text; 27 | import org.pooledtimeseries.FeatureVector; 28 | import org.pooledtimeseries.MeanChiSquareDistanceCalculation; 29 | import org.pooledtimeseries.PoT; 30 | 31 | public class ReadSeqFileUtil { 32 | private static final Logger LOG = Logger.getLogger(MeanChiSquareDistanceCalculation.class.getName()); 33 | 34 | /** 35 | * Takes HDFS path to time series and convert them to {@link FeatureVector} 36 | * @param files - path to of.txt and hog.txt 37 | * @return List of {@link FeatureVector} 38 | */ 39 | public static List computeFeatureFromSeries(String[] files) { 40 | 41 | ArrayList tws = PoT.getTemporalWindows(4); 42 | ArrayList fvList = new ArrayList(); 43 | 44 | ArrayList multiSeries = new ArrayList(); 45 | 46 | long startIoTime = System.currentTimeMillis(); 47 | 48 | try { 49 | multiSeries.add(PoT.loadTimeSeries(HadoopFileUtil.getInputStreamFromHDFS(files[0])) ); 50 | multiSeries.add(PoT.loadTimeSeries(HadoopFileUtil.getInputStreamFromHDFS(files[1])) ); 51 | } catch (IOException e) { 52 | LOG.log(Level.SEVERE,"Unable to read series from filesysytem ",e); 53 | throw new RuntimeException("Unable to read series from filesysytem",e); 54 | } 55 | 56 | LOG.info("Read both series in - " + (System.currentTimeMillis() - startIoTime)); 57 | 58 | FeatureVector fv = new FeatureVector(); 59 | for (int i = 0; i < multiSeries.size(); i++) { 60 | fv.feature.add(PoT.computeFeaturesFromSeries(multiSeries.get(i), tws, 1)); 61 | fv.feature.add(PoT.computeFeaturesFromSeries(multiSeries.get(i), tws, 2)); 62 | fv.feature.add(PoT.computeFeaturesFromSeries(multiSeries.get(i), tws, 5)); 63 | } 64 | fvList.add(fv); 65 | 66 | return fvList; 67 | 68 | } 69 | 70 | public static String[] getFileNames(Text key) { 71 | 72 | return key.toString().split(PoTConstants.FILE_SEPERATOR); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /hadoop-pot-video/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | gov.nasa.jpl.memex 5 | hadoop-pot 6 | 1.0-SNAPSHOT 7 | 8 | hadoop-pot-video 9 | 10 | 11 | org.openpnp 12 | opencv 13 | 2.4.11-2 14 | 15 | 16 | junit 17 | junit 18 | 19 | 20 | commons-io 21 | commons-io 22 | 23 | 24 | commons-cli 25 | commons-cli 26 | 27 | 28 | com.googlecode.json-simple 29 | json-simple 30 | 31 | 32 | -------------------------------------------------------------------------------- /hadoop-pot-video/src/main/java/org/pooledtimeseries/FeatureVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.Serializable; 21 | import java.util.ArrayList; 22 | 23 | public class FeatureVector implements Serializable{ 24 | 25 | private static final long serialVersionUID = 1L; 26 | 27 | public ArrayList> feature; 28 | 29 | public FeatureVector() { 30 | feature = new ArrayList>(); 31 | } 32 | 33 | public FeatureVector(ArrayList> f) { 34 | feature = f; 35 | } 36 | 37 | public int numDim() { 38 | return feature.size(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hadoop-pot-video/src/main/java/org/pooledtimeseries/PoT.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.pooledtimeseries; 19 | 20 | import java.io.BufferedReader; 21 | import java.io.BufferedWriter; 22 | import java.io.File; 23 | import java.io.FileOutputStream; 24 | import java.io.FileWriter; 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | import java.io.InputStreamReader; 28 | import java.io.OutputStreamWriter; 29 | import java.nio.file.Files; 30 | import java.nio.file.Path; 31 | import java.nio.file.Paths; 32 | import java.util.ArrayList; 33 | import java.util.List; 34 | import java.util.Scanner; 35 | import java.util.logging.Level; 36 | import java.util.logging.Logger; 37 | 38 | import org.apache.commons.cli.CommandLine; 39 | import org.apache.commons.cli.CommandLineParser; 40 | import org.apache.commons.cli.GnuParser; 41 | import org.apache.commons.cli.HelpFormatter; 42 | import org.apache.commons.cli.Option; 43 | import org.apache.commons.cli.OptionBuilder; 44 | import org.apache.commons.cli.Options; 45 | import org.apache.commons.cli.ParseException; 46 | import org.apache.commons.io.FileUtils; 47 | import org.apache.commons.io.filefilter.TrueFileFilter; 48 | import org.json.simple.JSONObject; 49 | import org.opencv.core.Core; 50 | import org.opencv.core.Mat; 51 | import org.opencv.core.MatOfPoint2f; 52 | import org.opencv.core.Point; 53 | import org.opencv.core.Size; 54 | import org.opencv.highgui.VideoCapture; 55 | import org.opencv.imgproc.Imgproc; 56 | import org.opencv.video.Video; 57 | 58 | /** 59 | * 60 | * Pooled Time Series Similarity Metric. 
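 * A typical standalone invocation (illustrative; assumes the project
 * classes and the OpenCV native library are available, since main()
 * first calls System.loadLibrary):
 *
 *   java org.pooledtimeseries.PoT -d /path/to/videos -o similarity.txt
 *
 * where -d names a directory of video files and -o the similarity
 * output file (defaults to ./similarity.txt); add -j for JSON output.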
61 | * 62 | */ 63 | public class PoT { 64 | 65 | public static int frame_width = 320; 66 | public static int frame_height = 240; 67 | 68 | private static String outputFile = "similarity.txt"; 69 | 70 | private static enum OUTPUT_FORMATS {TXT, JSON} 71 | private static OUTPUT_FORMATS outputFormat = OUTPUT_FORMATS.TXT; 72 | 73 | private static final Logger LOG = Logger.getLogger(PoT.class.getName()); 74 | 75 | public static void main(String[] args) { 76 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 77 | Option fileOpt = OptionBuilder.withArgName("file").hasArg() 78 | .withLongOpt("file") 79 | .withDescription("Path to a single file").create('f'); 80 | 81 | Option dirOpt = OptionBuilder.withArgName("directory").hasArg() 82 | .withLongOpt("dir") 83 | .withDescription("A directory with image files in it").create('d'); 84 | 85 | Option helpOpt = OptionBuilder.withLongOpt("help") 86 | .withDescription("Print this message.").create('h'); 87 | 88 | Option pathFileOpt = OptionBuilder 89 | .withArgName("path file") 90 | .hasArg() 91 | .withLongOpt("pathfile") 92 | .withDescription( 93 | "A file containing full absolute paths to videos. Previous default was memex-index_temp.txt") 94 | .create('p'); 95 | 96 | Option outputFileOpt = OptionBuilder 97 | .withArgName("output file") 98 | .withLongOpt("outputfile") 99 | .hasArg() 100 | .withDescription("File containing similarity results. Defaults to ./similarity.txt") 101 | .create('o'); 102 | 103 | Option jsonOutputFlag = OptionBuilder 104 | .withArgName("json output") 105 | .withLongOpt("json") 106 | .withDescription("Set similarity output format to JSON. Defaults to .txt") 107 | .create('j'); 108 | 109 | Option similarityFromFeatureVectorsOpt = OptionBuilder 110 | .withArgName("similarity from FeatureVectors directory") 111 | .withLongOpt("similarityFromFeatureVectorsDirectory") 112 | .hasArg() 113 | .withDescription("calculate similarity matrix from given directory of feature vectors") 114 | .create('s'); 115 | 116 | Options options = new Options(); 117 | options.addOption(dirOpt); 118 | options.addOption(pathFileOpt); 119 | options.addOption(fileOpt); 120 | options.addOption(helpOpt); 121 | options.addOption(outputFileOpt); 122 | options.addOption(jsonOutputFlag); 123 | options.addOption(similarityFromFeatureVectorsOpt); 124 | 125 | // create the parser 126 | CommandLineParser parser = new GnuParser(); 127 | 128 | try { 129 | // parse the command line arguments 130 | CommandLine line = parser.parse(options, args); 131 | String directoryPath = null; 132 | String pathFile = null; 133 | String singleFilePath = null; 134 | String similarityFromFeatureVectorsDirectory = null; 135 | ArrayList videoFiles = null; 136 | 137 | if (line.hasOption("dir")) { 138 | directoryPath = line.getOptionValue("dir"); 139 | } 140 | 141 | if (line.hasOption("pathfile")) { 142 | pathFile = line.getOptionValue("pathfile"); 143 | } 144 | 145 | if (line.hasOption("file")) { 146 | singleFilePath = line.getOptionValue("file"); 147 | } 148 | 149 | if (line.hasOption("outputfile")) { 150 | outputFile = line.getOptionValue("outputfile"); 151 | } 152 | 153 | if (line.hasOption("json")) { 154 | outputFormat = OUTPUT_FORMATS.JSON; 155 | } 156 | 157 | if (line.hasOption("similarityFromFeatureVectorsDirectory")) { 158 | similarityFromFeatureVectorsDirectory = line.getOptionValue("similarityFromFeatureVectorsDirectory"); 159 | } 160 | 161 | if (line.hasOption("help") 162 | || (line.getOptions() == null || (line.getOptions() != null && line 163 | .getOptions().length == 0)) 164 | || 
(directoryPath != null && pathFile != null 165 | && !directoryPath.equals("") && !pathFile.equals(""))) { 166 | HelpFormatter formatter = new HelpFormatter(); 167 | formatter.printHelp("pooled_time_series", options); 168 | System.exit(1); 169 | } 170 | 171 | if (directoryPath != null) { 172 | File dir = new File(directoryPath); 173 | List files = (List) FileUtils.listFiles(dir, 174 | TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); 175 | videoFiles = new ArrayList(files.size()); 176 | 177 | for (File file : files) { 178 | String filePath = file.toString(); 179 | 180 | // When given a directory to load videos from we need to ensure that we 181 | // don't try to load the of.txt and hog.txt intermediate result files 182 | // that results from previous processing runs. 183 | if (!filePath.contains(".txt")) { 184 | videoFiles.add(file.toPath()); 185 | } 186 | } 187 | 188 | LOG.info("Added " + videoFiles.size() + " video files from " 189 | + directoryPath); 190 | 191 | } 192 | 193 | if (pathFile != null) { 194 | Path list_file = Paths.get(pathFile); 195 | videoFiles = loadFiles(list_file); 196 | LOG.info("Loaded " + videoFiles.size() + " video files from " 197 | + pathFile); 198 | } 199 | 200 | if (singleFilePath != null) { 201 | Path singleFile = Paths.get(singleFilePath); 202 | LOG.info("Loaded file: " + singleFile); 203 | videoFiles = new ArrayList(1); 204 | videoFiles.add(singleFile); 205 | } 206 | 207 | if (similarityFromFeatureVectorsDirectory != null) { 208 | File dir = new File(similarityFromFeatureVectorsDirectory); 209 | List files = (List) FileUtils.listFiles(dir, 210 | TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); 211 | videoFiles = new ArrayList(files.size()); 212 | 213 | for (File file : files) { 214 | String filePath = file.toString(); 215 | 216 | // We need to load only the *.of.txt and *.hog.txt values 217 | if (filePath.endsWith(".of.txt")) { 218 | videoFiles.add(file.toPath()); 219 | } 220 | 221 | if (filePath.endsWith(".hog.txt")) { 222 | videoFiles.add(file.toPath()); 223 | } 224 | } 225 | 226 | LOG.info("Added " + videoFiles.size() + " feature vectors from " 227 | + similarityFromFeatureVectorsDirectory); 228 | evaluateSimilarity(videoFiles, 1); 229 | } 230 | else { 231 | evaluateSimilarity(videoFiles, 0); 232 | } 233 | LOG.info("done."); 234 | 235 | } catch (ParseException exp) { 236 | // oops, something went wrong 237 | System.err.println("Parsing failed. 
Reason: " + exp.getMessage()); 238 | } 239 | 240 | } 241 | 242 | public static void evaluateSimilarity(ArrayList files, int save_mode) { 243 | // PoT level set 244 | ArrayList tws = getTemporalWindows(4); 245 | 246 | // computing feature vectors 247 | ArrayList fv_list = new ArrayList(); 248 | 249 | for (int k = 0; k < files.size(); k++) { 250 | try { 251 | LOG.fine(files.get(k).toString()); 252 | 253 | ArrayList multi_series = new ArrayList(); 254 | Path file = files.get(k); 255 | 256 | // optical flow descriptors 257 | String series_name1 = file.toString(); 258 | if ((!series_name1.endsWith(".of.txt")) && (!series_name1.endsWith(".hog.txt"))) { 259 | series_name1 += ".of.txt"; 260 | } 261 | Path series_path1 = Paths.get(series_name1); 262 | double[][] series1; 263 | 264 | if (save_mode == 0) { 265 | series1 = getOpticalTimeSeries(file, 5, 5, 8); 266 | saveVectors(series1, series_path1); 267 | 268 | } else { 269 | series1 = loadTimeSeries(series_path1); 270 | } 271 | 272 | multi_series.add(series1); 273 | 274 | // gradients descriptors 275 | String series_name2 = file.toString(); 276 | if ((!series_name2.endsWith(".hog.txt")) && (!series_name2.endsWith(".of.txt"))) { 277 | series_name2 += ".hog.txt"; 278 | } 279 | Path series_path2 = Paths.get(series_name2); 280 | double[][] series2; 281 | 282 | if (save_mode == 0) { 283 | series2 = getGradientTimeSeries(file, 5, 5, 8); 284 | saveVectors(series2, series_path2); 285 | } else { 286 | series2 = loadTimeSeries(series_path2); 287 | } 288 | 289 | multi_series.add(series2); 290 | 291 | // computing features from series of descriptors 292 | FeatureVector fv = new FeatureVector(); 293 | 294 | for (int i = 0; i < multi_series.size(); i++) { 295 | fv.feature.add(computeFeaturesFromSeries(multi_series.get(i), tws, 1)); 296 | fv.feature.add(computeFeaturesFromSeries(multi_series.get(i), tws, 2)); 297 | fv.feature.add(computeFeaturesFromSeries(multi_series.get(i), tws, 5)); 298 | } 299 | LOG.info( (k+1)+"/"+files.size()+" files done. " + "Finished processing file: " + file.getFileName()); 300 | fv_list.add(fv); 301 | 302 | } catch (PoTException e) { 303 | LOG.severe("PoTException occurred: " + e.message + ": Skipping file " + files.get(k)); 304 | continue; 305 | } 306 | } 307 | double[][] similarities = calculateSimilarities(fv_list); 308 | writeSimilarityOutput(files, similarities); 309 | } 310 | 311 | public static double[][] calculateSimilarities(ArrayList fv_list) { 312 | // feature vector similarity measure 313 | if (fv_list.size() < 1) { 314 | LOG.info("Feature Vector list is empty. Nothing to calculate. 
Exiting..."); 315 | System.exit(1); 316 | } 317 | double[] mean_dists = new double[fv_list.get(0).numDim()]; 318 | for (int i = 0; i < fv_list.get(0).numDim(); i++) 319 | mean_dists[i] = meanChiSquareDistances(fv_list, i); 320 | 321 | System.out.print("mean-chi-square-distances: "); 322 | for (int i = 0; i < fv_list.get(0).numDim(); i++) 323 | System.out.format("%f ", mean_dists[i]); 324 | System.out.println(""); 325 | 326 | double[][] sims = new double[fv_list.size()][fv_list.size()]; 327 | for (int i = 0; i < fv_list.size(); i++) { 328 | for (int j = 0; j < fv_list.size(); j++) { 329 | sims[i][j] = kernelDistance(fv_list.get(i), fv_list.get(j), mean_dists); 330 | } 331 | } 332 | 333 | return sims; 334 | } 335 | 336 | private static void writeSimilarityOutput(ArrayList files, double[][] similarities) { 337 | if (outputFormat == OUTPUT_FORMATS.TXT) { 338 | writeSimilarityToTextFile(similarities); 339 | } else if (outputFormat == OUTPUT_FORMATS.JSON) { 340 | writeSimilarityToJSONFile(files, similarities); 341 | } else { 342 | LOG.severe("Invalid output format. Skipping similarity dump."); 343 | } 344 | } 345 | 346 | private static void writeSimilarityToTextFile(double[][] similarities) { 347 | try { 348 | FileOutputStream fos = new FileOutputStream(outputFile); 349 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos)); 350 | 351 | for (int i = 0; i < similarities.length; i++) { 352 | for (int j = 0; j < similarities[0].length; j++) { 353 | writer.write(String.format("%f,", similarities[i][j])); 354 | } 355 | writer.newLine(); 356 | } 357 | 358 | writer.close(); 359 | fos.close(); 360 | } catch (IOException e) { 361 | e.printStackTrace(); 362 | } 363 | } 364 | 365 | private static void writeSimilarityToJSONFile(ArrayList files, double[][] similarities) { 366 | JSONObject root_json_obj = new JSONObject(); 367 | 368 | for (int i = 0; i < similarities.length; i++) { 369 | JSONObject fileJsonObj = new JSONObject(); 370 | 371 | for (int j = 0; j < similarities[0].length; j++) { 372 | fileJsonObj.put(files.get(j).getFileName(), similarities[i][j]); 373 | } 374 | 375 | root_json_obj.put(files.get(i).getFileName(), fileJsonObj); 376 | } 377 | 378 | try { 379 | outputFile = outputFile.substring(0, outputFile.lastIndexOf('.')) + ".json"; 380 | FileWriter file = new FileWriter(outputFile); 381 | file.write(root_json_obj.toJSONString()); 382 | file.flush(); 383 | file.close(); 384 | } catch (IOException e) { 385 | e.printStackTrace(); 386 | } 387 | } 388 | 389 | public static ArrayList loadFiles(Path list_file) { 390 | ArrayList filenames = new ArrayList(); 391 | 392 | try (InputStream in = Files.newInputStream(list_file); 393 | BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { 394 | String line = null; 395 | while ((line = reader.readLine()) != null) { 396 | filenames.add(Paths.get(line)); 397 | } 398 | } catch (IOException x) { 399 | System.err.println(x); 400 | } 401 | 402 | return filenames; 403 | } 404 | 405 | public static double[][] getOpticalTimeSeries(Path filename, int w_d, 406 | int h_d, int o_d) throws PoTException { 407 | ArrayList hists = getOpticalHistograms(filename, w_d, h_d, 408 | o_d); 409 | double[][] vectors = new double[hists.size()][]; 410 | 411 | for (int i = 0; i < hists.size(); i++) { 412 | vectors[i] = histogramToVector(hists.get(i)); 413 | } 414 | 415 | return vectors; 416 | } 417 | 418 | static double[] histogramToVector(double[][][] hist) { 419 | int d1 = hist.length; 420 | int d2 = hist[0].length; 421 | int d3 = hist[0][0].length; 
422 | double[] vector = new double[d1 * d2 * d3]; 423 | 424 | for (int i = 0; i < d1; i++) { 425 | for (int j = 0; j < d2; j++) { 426 | for (int k = 0; k < d3; k++) { 427 | vector[d3 * d2 * i + d3 * j + k] = hist[i][j][k]; 428 | } 429 | } 430 | } 431 | 432 | return vector; 433 | } 434 | 435 | static ArrayList<double[][][]> getOpticalHistograms(Path filename, int w_d, 436 | int h_d, int o_d) throws PoTException { 437 | ArrayList<double[][][]> histograms = new ArrayList<double[][][]>(); 438 | 439 | try { 440 | LOG.info("opening video file " + filename.toString()); 441 | VideoCapture capture = new VideoCapture(filename.toString()); 442 | 443 | if (!capture.isOpened()) { 444 | LOG.warning("video file " + filename.getFileName() + " could not be opened."); 445 | double[][][] hist = new double[w_d][h_d][o_d]; 446 | histograms.add(hist); 447 | } 448 | else { 449 | // variables for processing images 450 | Mat original_frame = new Mat(); 451 | 452 | Mat frame = new Mat(); 453 | Mat frame_gray = new Mat(); 454 | Mat prev_frame_gray = new Mat(); 455 | MatOfPoint2f flow = new MatOfPoint2f(); 456 | 457 | // computing a list of histograms of optical flows (i.e. a list of 5*5*8 458 | // arrays) 459 | for (int frame_index = 0;; frame_index++) { 460 | // capturing the video images 461 | capture.read(original_frame); 462 | 463 | if (original_frame.empty()) { 464 | if (frame_index == 0) { 465 | throw new PoTException("Could not read the video file"); 466 | } 467 | else 468 | break; 469 | } 470 | else { 471 | // resizing the captured frame and converting it to the gray scale 472 | // image. 473 | Imgproc.resize(original_frame, frame, new Size(frame_width, 474 | frame_height)); 475 | Imgproc.cvtColor(frame, frame_gray, Imgproc.COLOR_BGR2GRAY); 476 | 477 | double[][][] hist = new double[w_d][h_d][o_d]; 478 | histograms.add(hist); 479 | 480 | // from frame #2 481 | if (frame_index > 0) { 482 | // calculate optical flows 483 | Video.calcOpticalFlowFarneback(prev_frame_gray, frame_gray, flow, 484 | 0.5, 1, 10, 2, 7, 1.5, 0); // 0.5, 1, 15, 2, 7, 1.5, 0 485 | 486 | // update histogram of optical flows 487 | updateOpticalHistogram(histograms.get(frame_index), flow); 488 | } 489 | 490 | Mat temp_frame = prev_frame_gray; // swap buffers: the current gray frame becomes the previous one 491 | prev_frame_gray = frame_gray; 492 | frame_gray = temp_frame; 493 | } 494 | } 495 | 496 | capture.release(); 497 | } 498 | } catch (Exception e) { 499 | e.printStackTrace(); 500 | LOG.log(Level.SEVERE, "Exception in getOpticalHistograms ", e); 501 | } 502 | return histograms; 503 | } 504 | 505 | static void updateOpticalHistogram(double[][][] hist, Mat flow) { 506 | int d1 = hist.length; 507 | int d2 = hist[0].length; 508 | int d3 = hist[0][0].length; 509 | 510 | int step = 4; // 5; 511 | 512 | for (int x = 0; x < frame_width; x += step) { 513 | int x_type = (int) (x * d1 / frame_width); 514 | 515 | for (int y = 0; y < frame_height; y += step) { 516 | int y_type = (int) (y * d2 / frame_height); 517 | 518 | Point fxy = new Point(flow.get(y, x)); 519 | 520 | double size = (fxy.x + fxy.y) * (fxy.x + fxy.y); // note: (x + y)^2, not the usual squared magnitude x^2 + y^2 521 | 522 | if (size < 9) { 523 | continue; // 25 524 | } else { 525 | int f_type = opticalFlowType(fxy, d3); 526 | 527 | hist[x_type][y_type][f_type]++; 528 | } 529 | } 530 | } 531 | } 532 | 533 | static int opticalFlowType(Point fxy, int dim) { 534 | double degree = Math.atan2(fxy.y, fxy.x); 535 | int type = 7; // default: last bin (dim is 8 in practice) 536 | 537 | for (int i = 0; i < dim; i++) { 538 | double boundary = (i + 1) * 2 * Math.PI / dim - Math.PI; 539 | 540 | if (degree < boundary) { 541 | type = i; 542 | break; 543 | } 544 | } 545 | 546 | return type; 547 | } 548 |
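For orientation, here is a minimal usage sketch of the optical-flow path above (illustrative only, not part of the repository; the video path is hypothetical, and OpenCV's native library must be on java.library.path):

```java
package org.pooledtimeseries;

import java.nio.file.Paths;

import org.opencv.core.Core;

// Minimal sketch: extract one clip's 5x5x8 optical-flow time series and save
// it in the "numFrames dim" text format written by saveVectors/saveHistograms.
public class OpticalSeriesSketch {
    public static void main(String[] args) throws PoTException {
        System.loadLibrary(Core.NATIVE_LIBRARY_NAME); // load the OpenCV bindings

        // hypothetical input path; any readable .mp4 works
        double[][] series = PoT.getOpticalTimeSeries(Paths.get("data/sample.mp4"), 5, 5, 8);

        // each row is one frame's flattened 5x5x8 histogram (200 values)
        PoT.saveVectors(series, Paths.get("data/sample.mp4.of.txt"));
        System.out.println(series.length + " frames x " + series[0].length + " dims");
    }
}
```

The .of.txt suffix mirrors what the Hadoop driver script further down expects for optical-flow output.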
public static void saveVectors(double[][] vectors, Path outfile) { 550 | int d = vectors[0].length; 551 | 552 | ArrayList<double[][][]> temp_hists = new ArrayList<double[][][]>(); 553 | 554 | for (int i = 0; i < vectors.length; i++) { 555 | double[][][] temp_hist = new double[1][1][d]; 556 | temp_hist[0][0] = vectors[i]; 557 | 558 | temp_hists.add(temp_hist); 559 | } 560 | 561 | saveHistograms(temp_hists, outfile); 562 | } 563 | 564 | static void saveHistograms(ArrayList<double[][][]> hists, Path outfile) { 565 | int w_d = hists.get(0).length; 566 | int h_d = hists.get(0)[0].length; 567 | int o_d = hists.get(0)[0][0].length; 568 | 569 | int i, j, k, l; 570 | 571 | try (FileOutputStream fos = new FileOutputStream(outfile.toFile()); 572 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos))) { 573 | String head = String.format("%d %d", hists.size(), w_d * h_d * o_d); 574 | writer.write(head); 575 | writer.newLine(); 576 | 577 | for (l = 0; l < hists.size(); l++) { 578 | double[][][] hist = hists.get(l); 579 | 580 | for (i = 0; i < hist.length; i++) { 581 | for (j = 0; j < hist[0].length; j++) { 582 | for (k = 0; k < hist[0][0].length; k++) { // optical_bins+1 583 | writer.write(String.format("%f ", hist[i][j][k])); 584 | } 585 | } 586 | } 587 | 588 | writer.newLine(); 589 | } 590 | 591 | } catch (IOException x) { 592 | System.err.println(x); 593 | } 594 | } 595 | 596 | public static double[][] loadTimeSeries(Scanner scin) { 597 | double[][] series; 598 | int num_frames = scin.nextInt(); 599 | int dim = scin.nextInt(); 600 | 601 | series = new double[num_frames][dim]; 602 | 603 | for (int i = 0; i < num_frames; i++) { 604 | for (int j = 0; j < dim; j++) { 605 | series[i][j] = scin.nextDouble(); 606 | } 607 | } 608 | scin.close(); 609 | 610 | return series; 611 | } 612 | 613 | public static double[][] loadTimeSeries(InputStream in) { 614 | 615 | Scanner scin = new Scanner(in); 616 | try { 617 | return loadTimeSeries(scin); 618 | } finally { 619 | try { 620 | in.close(); 621 | } catch (IOException e) { 622 | e.printStackTrace(); 623 | } 624 | } 625 | 626 | } 627 | 628 | public static double[][] loadTimeSeries(Path filename) { 629 | try (InputStream in = Files.newInputStream(filename)) { 630 | return loadTimeSeries(in); 631 | } catch (IOException e) { 632 | e.printStackTrace(); 633 | return null; 634 | } 635 | } 636 | 637 | public static double[][] getGradientTimeSeries(Path filename, int w_d, 638 | int h_d, int o_d) throws PoTException { 639 | ArrayList<double[][][]> hists = getGradientHistograms(filename, w_d, h_d, 640 | o_d); 641 | double[][] vectors = new double[hists.size()][]; 642 | 643 | for (int i = 0; i < hists.size(); i++) { 644 | vectors[i] = histogramToVector(hists.get(i)); 645 | } 646 | 647 | return vectors; 648 | } 649 | 650 | static ArrayList<double[][][]> getGradientHistograms(Path filename, int w_d, 651 | int h_d, int o_d) throws PoTException { 652 | ArrayList<double[][][]> histograms = new ArrayList<double[][][]>(); 653 | 654 | VideoCapture capture = new VideoCapture(filename.toString()); 655 | 656 | if (!capture.isOpened()) { 657 | LOG.warning("video file not opened."); 658 | 659 | double[][][] hist = new double[w_d][h_d][o_d]; 660 | histograms.add(hist); 661 | } 662 | else { 663 | // variables for processing images 664 | Mat original_frame = new Mat(); 665 | Mat resized = new Mat(); 666 | Mat resized_gray = new Mat(); 667 | 668 | // initializing a list of histograms of gradients (i.e.
a list of s*s*9 669 | // arrays) 670 | for (int i = 0;; i++) { 671 | // capturing the video images 672 | capture.read(original_frame); 673 | // end of stream: fail if the very first frame is unreadable, otherwise stop 674 | if (original_frame.empty()) { 675 | if (i == 0) { 676 | throw new PoTException("Could not read the video file"); 677 | } 678 | else 679 | break; 680 | } 681 | 682 | 683 | 684 | 685 | double[][][] hist = new double[w_d][h_d][o_d]; 686 | 687 | Imgproc.resize(original_frame, resized, new Size(frame_width, 688 | frame_height)); 689 | Imgproc.cvtColor(resized, resized_gray, Imgproc.COLOR_BGR2GRAY); 690 | 691 | ArrayList<double[][]> gradients = computeGradients(resized_gray, o_d); 692 | updateGradientHistogram(hist, gradients); 693 | 694 | histograms.add(hist); 695 | } 696 | 697 | capture.release(); 698 | } 699 | 700 | return histograms; 701 | } 702 | 703 | static ArrayList<double[][]> computeGradients(Mat frame, int dim) { 704 | byte[] frame_array = new byte[(int) frame.total()]; 705 | frame.get(0, 0, frame_array); 706 | 707 | ArrayList<double[][]> gradients = new ArrayList<double[][]>(); 708 | 709 | for (int k = 0; k < dim; k++) { 710 | double angle = Math.PI * (double) k / (double) dim; 711 | 712 | double dx = Math.cos(angle) * 0.9999999; 713 | double dy = Math.sin(angle) * 0.9999999; 714 | 715 | double[][] grad = new double[frame.width()][frame.height()]; 716 | 717 | for (int i = 0; i < frame.cols(); i++) { 718 | for (int j = 0; j < frame.rows(); j++) { 719 | if (i <= 1 || j <= 1 || i >= frame.cols() - 2 720 | || j >= frame.rows() - 2) { 721 | grad[i][j] = 0; 722 | } else { 723 | double f1 = interpolatePixel(frame_array, frame.cols(), (double) i 724 | + dx, (double) j + dy); 725 | double f2 = interpolatePixel(frame_array, frame.cols(), (double) i 726 | - dx, (double) j - dy); 727 | 728 | double diff = f1 - f2; 729 | if (diff < 0) 730 | diff = diff * -1; 731 | if (diff >= 256) 732 | diff = 255; 733 | 734 | grad[i][j] = diff; 735 | } 736 | } 737 | } 738 | 739 | gradients.add(grad); 740 | } 741 | 742 | return gradients; 743 | } 744 | 745 | static double interpolatePixel(byte[] image, int w, double x, double y) { 746 | double x1 = (double) ((int) x); 747 | double x2 = (double) ((int) x + 1); 748 | double y1 = (double) ((int) y); 749 | double y2 = (double) ((int) y + 1); 750 | 751 | double f11 = (double) (image[(int) y * w + (int) x] & 0xFF); 752 | double f21 = (double) (image[(int) y * w + (int) x + 1] & 0xFF); 753 | double f12 = (double) (image[(int) (y + 1) * w + (int) x] & 0xFF); 754 | double f22 = (double) (image[(int) (y + 1) * w + (int) x + 1] & 0xFF); 755 | 756 | // bilinear interpolation between the four surrounding pixels 757 | double f = f11 * (x2 - x) * (y2 - y) + f21 * (x - x1) * (y2 - y) + f12 * (x2 - x) * (y - y1) + f22 * (x - x1) * (y - y1); 758 | 759 | return f; 760 | } 761 | 762 | static void updateGradientHistogram(double[][][] hist, 763 | ArrayList<double[][]> gradients) { 764 | int d1 = hist.length; 765 | int d2 = hist[0].length; 766 | int d3 = hist[0][0].length; 767 | 768 | int width = gradients.get(0).length; 769 | int height = gradients.get(0)[0].length; 770 | 771 | for (int i = 0; i < width; i++) { 772 | int s1_index = i * d1 / width; 773 | 774 | for (int j = 0; j < height; j++) { 775 | int s2_index = j * d2 / height; 776 | 777 | for (int k = 0; k < d3; k++) { 778 | double val = gradients.get(k)[i][j] / 100; 779 | hist[s1_index][s2_index][k] += val; 780 | } 781 | } 782 | } 783 | } 784 | 785 | public static ArrayList<double[]> getTemporalWindows(int level) { 786 | ArrayList<double[]> fws = new ArrayList<double[]>(); 787 | 788 | for (int l = 0; l < level; l++) { 789 | int cascade_steps = (int) Math.pow((double) 2, (double) l);// 2; 790 | double
step_size = (double) 1 / (double) cascade_steps; 791 | 792 | for (int k = 0; k < cascade_steps; k++) { 793 | double start = step_size * (double) k + 0.000001; 794 | double end = step_size * (double) (k + 1) + 0.000001; 795 | 796 | double[] wind = new double[2]; 797 | wind[0] = start; 798 | wind[1] = end; 799 | 800 | fws.add(wind); 801 | } 802 | } 803 | 804 | return fws; 805 | } 806 | 807 | public static ArrayList<Double> computeFeaturesFromSeries(double[][] series, 808 | ArrayList<double[]> time_windows_list, int feature_mode) { 809 | int start = 0; 810 | int end = series.length - 1; 811 | 812 | ArrayList<Double> feature = new ArrayList<Double>(); 813 | 814 | for (int j = 0; j < time_windows_list.size(); j++) { 815 | int duration = end - start; 816 | 817 | for (int i = 0; i < series[0].length; i++) { 818 | if (duration < 0) { 819 | if (feature_mode == 2 || feature_mode == 4) { 820 | feature.add(0.0); 821 | feature.add(0.0); 822 | } else 823 | feature.add(0.0); 824 | 825 | continue; 826 | } 827 | 828 | int window_start = start 829 | + (int) (duration * time_windows_list.get(j)[0] + 0.5); 830 | int window_end = start 831 | + (int) (duration * time_windows_list.get(j)[1] + 0.5); 832 | 833 | if (feature_mode == 1) { // Sum pooling 834 | double sum = 0; 835 | 836 | for (int t = window_start; t <= window_end; t++) { 837 | if (t < 0) 838 | continue; 839 | 840 | sum += series[t][i]; 841 | } 842 | 843 | feature.add(sum); 844 | } else if (feature_mode == 2) { // Gradient pooling1 845 | double positive_gradients = 0; 846 | double negative_gradients = 0; 847 | 848 | for (int t = window_start; t <= window_end; t++) { 849 | int look = 2; 850 | 851 | if (t - look < 0) 852 | continue; 853 | else { 854 | double dif = series[t][i] - series[t - look][i]; 855 | 856 | if (dif > 0.01) { // 0.01 for optical 857 | positive_gradients++; 858 | } else if (dif < -0.01) { // if (dif<-10) 859 | negative_gradients++; 860 | } 861 | } 862 | } 863 | 864 | feature.add(positive_gradients); 865 | feature.add(negative_gradients); 866 | } else if (feature_mode == 4) { // Gradient pooling2 867 | double positive_gradients = 0; 868 | double negative_gradients = 0; 869 | 870 | for (int t = window_start; t <= window_end; t++) { 871 | int look = 2; 872 | 873 | if (t - look < 0) 874 | continue; 875 | else { 876 | double dif = series[t][i] - series[t - look][i]; 877 | 878 | if (dif > 0) { 879 | positive_gradients += dif; 880 | } else { 881 | negative_gradients += -dif; 882 | } 883 | } 884 | } 885 | 886 | feature.add(positive_gradients); 887 | feature.add(negative_gradients); 888 | } else if (feature_mode == 5) { // Max pooling 889 | double max = -1000000; 890 | 891 | for (int t = window_start; t <= window_end; t++) { 892 | if (t < 0) 893 | continue; 894 | 895 | if (series[t][i] > max) 896 | max = series[t][i]; 897 | } 898 | 899 | feature.add(max); 900 | } 901 | } 902 | } 903 | 904 | return feature; 905 | } 906 | 907 | public static void normalizeFeatureL1(ArrayList<Double> sample) { 908 | double sum = 0; // double, not int: an int accumulator would truncate fractional feature values 909 | 910 | for (int i = 0; i < sample.size(); i++) { 911 | double val = sample.get(i); 912 | if (val < 0) 913 | val = -1 * val; 914 | 915 | sum += val; 916 | } 917 | 918 | for (int i = 0; i < sample.size(); i++) { 919 | double v; 920 | if (sum == 0) 921 | v = 0; 922 | else 923 | v = sample.get(i) / sum;// *100; 924 | 925 | sample.set(i, v); 926 | } 927 | } 928 | 929 | static double chiSquareDistance(ArrayList<Double> feature1, 930 | ArrayList<Double> feature2) { 931 | if (feature1.size() != feature2.size()) 932 | LOG.warning("feature vector dimension mismatch."); 933 | 934 | double score = 0;
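// (illustrative note, not part of the original source) the loop below implements
// the chi-square distance: score = 0.5 * sum_i (h1_i - h2_i)^2 / (h1_i + h2_i).
// For example, h1 = (1, 2) and h2 = (3, 2) give 0.5 * (2^2 / 4) = 0.5; the
// h1 == h2 branch skips equal bins, so the 0/0 case never arises.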
935 | 936 | for (int i = 0; i < feature1.size(); i++) { 937 | double h1 = feature1.get(i); 938 | double h2 = feature2.get(i); 939 | 940 | if (h1 < 0 || h2 < 0) { 941 | LOG.warning("A negative feature value. The chi square kernel " 942 | + "does not work with negative values. Please try shifting " 943 | + "the vector to make all its elements positive."); 944 | } 945 | 946 | if (h1 == h2) 947 | continue; 948 | else 949 | score += (h1 - h2) * (h1 - h2) / (h1 + h2); 950 | } 951 | 952 | return 0.5 * score; 953 | } 954 | 955 | static double meanChiSquareDistances(ArrayList<FeatureVector> samples, int d) { 956 | double mean_dist = 0; 957 | 958 | double sum = 0; 959 | int count = 0; 960 | 961 | for (int i = 0; i < samples.size(); i++) { 962 | for (int j = i + 1; j < samples.size(); j++) { 963 | count++; 964 | 965 | sum += chiSquareDistance(samples.get(i).feature.get(d), 966 | samples.get(j).feature.get(d)); 967 | } 968 | } 969 | 970 | mean_dist = sum / (double) count; 971 | 972 | return mean_dist; 973 | } 974 | 975 | static double kernelDistance(FeatureVector sample1, FeatureVector sample2, 976 | double[] mean_dists) { 977 | double distance = 0; 978 | 979 | for (int d = 0; d < sample1.numDim(); d++) { 980 | double weight = 1; 981 | 982 | double chi = chiSquareDistance(sample1.feature.get(d), 983 | sample2.feature.get(d)); 984 | // guard against division by zero when the mean distance for this feature type is 0 985 | double val = (mean_dists[d] == 0) ? chi / 1000000.0 986 | : chi / mean_dists[d] * weight; 987 | 988 | distance = distance + val; 989 | } 990 | 991 | double final_score = Math.exp(-1 * distance / 10); // 10000 10 992 | 993 | return final_score; 994 | } 995 | } 996 | -------------------------------------------------------------------------------- /hadoop-pot-video/src/main/java/org/pooledtimeseries/PoTException.java: -------------------------------------------------------------------------------- 1 | package org.pooledtimeseries; 2 | 3 | /** 4 | * Created by Aditya on 10/29/15. 5 | */ 6 | public class PoTException extends Exception { 7 | // Parameterless constructor 8 | private PoTException() {} 9 | 10 | // Constructor that accepts a message 11 | public PoTException(String message) 12 | { 13 | super(message); 14 | this.message = message; 15 | } 16 | 17 | public String message; 18 | } 19 | -------------------------------------------------------------------------------- /hadoop-pot-video/src/main/java/org/pooledtimeseries/healthcheck/CheckOpenCV.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package org.pooledtimeseries.healthcheck; 19 | 20 | import org.opencv.core.Core; 21 | import org.opencv.core.CvType; 22 | import org.opencv.core.Mat; 23 | import org.opencv.highgui.VideoCapture; 24 | 25 | public class CheckOpenCV { 26 | 27 | public static void main(String[] args) { 28 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 29 | Mat mat = Mat.eye(3, 3, CvType.CV_8UC1); 30 | System.out.println("mat = " + mat.dump()); 31 | 32 | String filename = args[0]; 33 | 34 | System.out.println("opening video file " + filename); 35 | VideoCapture capture = new VideoCapture(filename.toString()); 36 | 37 | if (!capture.isOpened()) { 38 | System.out.println("video file " + filename + " could not be opened."); 39 | 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | gov.nasa.jpl.memex 5 | hadoop-pot 6 | pom 7 | 1.0-SNAPSHOT 8 | hadoop-pot 9 | http://maven.apache.org 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 3.3 16 | 17 | 1.7 18 | 1.7 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | junit 27 | junit 28 | 3.8.1 29 | test 30 | 31 | 32 | commons-io 33 | commons-io 34 | 2.4 35 | 36 | 37 | commons-cli 38 | commons-cli 39 | 1.2 40 | 41 | 42 | com.googlecode.json-simple 43 | json-simple 44 | 1.1.1 45 | 46 | 47 | 48 | 49 | hadoop-pot-video 50 | hadoop-pot-core 51 | hadoop-pot-assembly 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/main/bin/pooled-time-series: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | export ORIG_DIR=`pwd` 19 | export DIR=`dirname $0` 20 | cd $DIR 21 | export DIR_PATH=`pwd` 22 | cd $ORIG_DIR 23 | 24 | java -Djava.library.path=$OPENCV_JAVA_HOME -cp $DIR_PATH/../../../hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.PoT "$@" 25 | -------------------------------------------------------------------------------- /src/main/bin/pooled-time-series-hadoop: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | MAIN_PTS_DIR=$POOLED_TIME_SERIES_HOME/../.. 19 | HDFS_PTS_DIR=/user/pts/output 20 | 21 | TIME_SERIES_INPUT=$HDFS_PTS_DIR/OpticalAndGradientTimeSeriesInput 22 | SIMILARITY_INPUT=$HDFS_PTS_DIR/SequenceVideoVectors 23 | 24 | PTS_TIME_SERIES_INPUT=$POOLED_TIME_SERIES_HOME/OpticalAndGradientTimeSeriesInput 25 | PTS_SIMILARITY_INPUT=$POOLED_TIME_SERIES_HOME/MeanChiSquareAndSimilarityInput 26 | PTS_MEAN_CHI_OUTPUT=$POOLED_TIME_SERIES_HOME/MeanChiSquareOutput 27 | 28 | echo "*****************" 29 | echo "initialized, removing previous input directories" 30 | hadoop fs -rm -r $TIME_SERIES_INPUT 31 | hadoop fs -rm -r $SIMILARITY_INPUT 32 | echo "*****************" 33 | 34 | hadoop fs -mkdir $HDFS_PTS_DIR 35 | hadoop fs -mkdir $TIME_SERIES_INPUT 36 | mkdir $PTS_TIME_SERIES_INPUT 37 | mkdir $PTS_SIMILARITY_INPUT 38 | mkdir $PTS_MEAN_CHI_OUTPUT 39 | 40 | echo "*****************" 41 | echo "directories created" 42 | echo "*****************" 43 | 44 | # list full file names, omitting ls metadata columns (permissions, owner, size, date) 45 | hadoop fs -ls $2 | sed '1d;s/ */ /g' | cut -d\ -f8 | grep '\.mp4$' > $PTS_TIME_SERIES_INPUT/original_videos.txt 46 | echo "*****************" 47 | echo "Checking- " $PTS_TIME_SERIES_INPUT/original_videos.txt 48 | ls -lrt $PTS_TIME_SERIES_INPUT/original_videos.txt 49 | echo "*****************" 50 | 51 | 52 | mkdir $PTS_TIME_SERIES_INPUT/split 53 | split -l 1000 $PTS_TIME_SERIES_INPUT/original_videos.txt $PTS_TIME_SERIES_INPUT/split/original_videos.txt_ 54 | 55 | # Create temp space for storing batch results of OTS and GTS 56 | hadoop fs -mkdir $HDFS_PTS_DIR/OTSOutput_batch 57 | hadoop fs -mkdir $HDFS_PTS_DIR/GTSOutput_batch 58 | 59 | # Loop over all the files in $PTS_TIME_SERIES_INPUT/split/ 60 | # process 1000 videos at a time 61 | FILES=$PTS_TIME_SERIES_INPUT/split/* 62 | for f in $FILES 63 | do 64 | # Remove old original_videos.txt 65 | hadoop fs -rm $TIME_SERIES_INPUT/original_videos.txt 66 | # copy new batch of video input to hdfs 67 | ls $f 68 | hadoop fs -put $f $TIME_SERIES_INPUT/original_videos.txt 69 | # checking copied input 70 | hadoop fs -ls $TIME_SERIES_INPUT 71 | hadoop fs -cat $TIME_SERIES_INPUT/original_videos.txt | wc -l 72 | hadoop fs -cat $TIME_SERIES_INPUT/original_videos.txt | head 73 | 74 | # Optical and Gradient Time Series Calcs 75 | hadoop jar hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.OpticalTimeSeries $TIME_SERIES_INPUT $HDFS_PTS_DIR/OTSOutput 76 | 77 | echo "*****************" 78 | echo "Completed OpticalTimeSeries. Output in - " $HDFS_PTS_DIR/OTSOutput 79 | hadoop fs -ls $HDFS_PTS_DIR/OTSOutput 80 | echo "*****************" 81 | hadoop jar hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.GradientTimeSeries $TIME_SERIES_INPUT $HDFS_PTS_DIR/GTSOutput 82 | 83 | echo "*****************" 84 | echo "Completed GradientTimeSeries.
Output in - " $HDFS_PTS_DIR/GTSOutput 85 | hadoop fs -ls $HDFS_PTS_DIR/GTSOutput 86 | echo "*****************" 87 | 88 | hadoop fs -cp $HDFS_PTS_DIR/OTSOutput/*.of.txt $HDFS_PTS_DIR/OTSOutput_batch 89 | hadoop fs -cp $HDFS_PTS_DIR/GTSOutput/*.hog.txt $HDFS_PTS_DIR/GTSOutput_batch 90 | 91 | hadoop fs -rm -r $HDFS_PTS_DIR/OTSOutput 92 | hadoop fs -rm -r $HDFS_PTS_DIR/GTSOutput 93 | 94 | done 95 | 96 | hadoop fs -rm $2/*.of.txt 97 | hadoop fs -rm $2/*.hog.txt 98 | 99 | hadoop fs -cp $HDFS_PTS_DIR/OTSOutput_batch/*.of.txt $2 100 | hadoop fs -cp $HDFS_PTS_DIR/GTSOutput_batch/*.hog.txt $2 101 | 102 | echo "*****************" 103 | echo "Copied OTSOutput, GTSOutput to : " $2 104 | hadoop fs -ls $2 | head 105 | echo "*****************" 106 | 107 | hadoop fs -rm -r $HDFS_PTS_DIR/OTSOutput_batch 108 | hadoop fs -rm -r $HDFS_PTS_DIR/GTSOutput_batch 109 | 110 | echo "*****************" 111 | echo "Removed batch outputs after copying to : " $2 112 | echo "*****************" 113 | 114 | 115 | 116 | # Converting of.txt and hog.txt into a single sequence file 117 | hadoop jar hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.seqfile.TextVectorsToSequenceFile $2 $SIMILARITY_INPUT 118 | echo "*****************" 119 | echo "Completed sequence file generation" 120 | hadoop fs -ls $HDFS_PTS_DIR/SequenceVideoVectors 121 | echo "*****************" 122 | 123 | 124 | # MeanChiSquareDistance Vector Calc 125 | hadoop jar hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.MeanChiSquareDistanceCalculation $SIMILARITY_INPUT $HDFS_PTS_DIR/MeanChiSquaredCalcOutput 126 | hadoop fs -getmerge $HDFS_PTS_DIR/MeanChiSquaredCalcOutput $PTS_MEAN_CHI_OUTPUT/mean_dists.txt 127 | hadoop fs -put $PTS_MEAN_CHI_OUTPUT/mean_dists.txt $HDFS_PTS_DIR/ 128 | 129 | echo "*****************" 130 | echo "Expecting output in " $HDFS_PTS_DIR/MeanChiSquaredCalcOutput 131 | hadoop fs -ls $HDFS_PTS_DIR/MeanChiSquaredCalcOutput 132 | echo "Copied merged MeanChiSquaredCalcOutput output to " $PTS_MEAN_CHI_OUTPUT/mean_dists.txt 133 | ls -lrt $PTS_MEAN_CHI_OUTPUT/mean_dists.txt 134 | echo "Copied merged to hdfs " $HDFS_PTS_DIR/ 135 | hadoop fs -ls $HDFS_PTS_DIR/mean_dists.txt 136 | echo "*****************" 137 | 138 | # Similarity Calc 139 | hadoop fs -rm -r $HDFS_PTS_DIR/SimilarityCalc 140 | hadoop jar hadoop-pot-assembly/target/pooled-time-series-1.0-SNAPSHOT-jar-with-dependencies.jar org.pooledtimeseries.SimilarityCalculation $SIMILARITY_INPUT $HDFS_PTS_DIR/SimilarityCalc/ $HDFS_PTS_DIR/mean_dists.txt 141 | hadoop fs -getmerge $HDFS_PTS_DIR/SimilarityCalc $PTS_SIMILARITY_INPUT/similarity_calc.txt 142 | 143 | echo "Output in - " $PTS_SIMILARITY_INPUT/similarity_calc.txt 144 | 145 | 146 | -------------------------------------------------------------------------------- /src/main/resources/tika-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | video/mp4 5 | application/mp4 6 | video/quicktime 7 | 8 | 9 | 10 | video/mp4 11 | application/mp4 12 | video/quicktime 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /visualization/circlepacking.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 24 | 61 | -------------------------------------------------------------------------------- /visualization/cluster-d3.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | Flare Dendrogram 4 | 64 | 65 | 66 | 67 | 68 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /visualization/css/dashboard.css: -------------------------------------------------------------------------------- 1 | table, th, td { 2 | border: 1px solid black; 3 | padding:3px !important; 4 | } 5 | table { 6 | border-collapse: collapse; 7 | margin:10px; 8 | } 9 | html, body{ 10 | height: 100%; 11 | } 12 | .upper-div{ 13 | overflow-y:scroll; 14 | overflow-x:scroll; 15 | overflow: -moz-scrollbars-vertical; 16 | } 17 | .lower-div{ 18 | 19 | } 20 | ::-webkit-scrollbar { 21 | -webkit-appearance: none; 22 | width: 7px; 23 | } 24 | ::-webkit-scrollbar-thumb { 25 | border-radius: 4px; 26 | background-color: rgba(0, 0, 0, .5); 27 | -webkit-box-shadow: 0 0 1px rgba(255, 255, 255, .5); 28 | } 29 | .break-words { 30 | word-break:break-all 31 | } 32 | 33 | .well{ 34 | padding: 5px; 35 | margin-bottom: 2px; 36 | } -------------------------------------------------------------------------------- /visualization/css/style.css: -------------------------------------------------------------------------------- 1 | body{ 2 | font-size: 14px; 3 | background-color: whitesmoke; 4 | } 5 | h4{ 6 | padding-left: 100px; 7 | } 8 | -------------------------------------------------------------------------------- /visualization/dashboard.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 | Min -   14 | Max -   15 | Click on any cell to view videos!   16 | 18 | 19 | 20 | 23 | 24 |
{{ y | range:filters.min : filters.max }}
25 |      26 |
27 |
28 |
29 | Score - {{score}} 30 |
31 |
32 | Comments on this pair - 33 |
34 |
35 | 36 |
37 |
38 |
39 |
40 |
41 | 42 |
43 |
{{ feedback_response }}
44 |
45 |
46 |
{{videoId1}} - {{ video1 }} 47 |
51 |
{{videoId2}} - {{ video2 }} 52 |
56 | 57 | 58 |
59 |
60 | 61 | 65 | 66 | 67 | 68 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /visualization/data/formatted_similarity_calc.csv: -------------------------------------------------------------------------------- 1 | ,1.mp4,2.mp4,3.mp4,4.mp4,5.mp4,6.mp4,7.mp4, 2 | 1.mp4,1.0,0.677986882429,0.514423983869,0.303814588993,0.05396806823,0.691531696688,0.220586417979, 3 | 2.mp4,,1.0,0.12525353988,0.106469279664,0.497897205246,0.98017123923,0.450498041365, 4 | 3.mp4,,,1.0,0.364850515997,0.440085467283,0.166097710583,0.730772512002, 5 | 4.mp4,,,,1.0,0.0413008435665,0.250465685569,0.782336621617, 6 | 5.mp4,,,,,1.0,0.623467310099,0.182500077256, 7 | 6.mp4,,,,,,1.0,0.922439590582, 8 | 7.mp4,,,,,,,1.0, -------------------------------------------------------------------------------- /visualization/data/similarity_cluster.json: -------------------------------------------------------------------------------- 1 | {"children": [{"color": "#9e0142", "children": [{"name": "01091b9db35e6b57e9b2f9e41e67afd9_1-48988ee434340f9c61ba35ff26bf3b.mp4"}], "name": "cluster0"}, {"color": "#d8434e", "children": [{"name": "01167b79bb5926d6d1d9966ee78fb154_1-34e63d37ed1d2c55940d73dee88470.mp4"}], "name": "cluster1"}, {"color": "#f67a49", "children": [{"name": "011e50d1a95d4cbdc2e12e01060f5ffd_1-f89de38d12f1bcd0098a76140d117e.mp4"}, {"name": "01282698e48306b9f04da78d5388eaad_1-a86d3077108f51dc09c2db79ca8ca9.mp4"}, {"name": "012f564a40cdcc70a113f5143e9c1d28_1-6980c9cef71b9037693dca74d1e7dc.mp4"}, {"name": "013c4d37b8c7ff356df135d0f726263d_1-a9ed23f91c92cb25ac42629ffe3c83.mp4"}, {"name": "014dba5d8b4d950d3e8ca9f392365ea3_1-5bf7500286057453a143ac20e91095.mp4"}, {"name": "015d287a8a951567dc10218bb7141e07_1-2c1bcfd0c0704290df2792c8a0781f.mp4"}, {"name": "0169b72d1dec21850bb9a8df92507c12_1-61239f4f7f48122521996e0369e3b6.mp4"}, {"name": "016cebe78f399424bd923cc6968be490_1-d55689535b1a0f2c847a1332001e17.mp4"}, {"name": "01b720fcb93da59c32a0254f21ade0ae_1-9c9a93a5aed41546d4852ddcafd0b3.mp4"}, {"name": "01b98267a85406778d081213649c09e5_1-273d732e768dab37679b866a50fe58.mp4"}], "name": "cluster2"}, {"color": "#fdbf6f", "children": [{"name": "0143219c1a764fff93b9b487b8bc0132_1-d4d0011a28b479c5921cd29c09d346.mp4"}, {"name": "01b281631cd2b6aa9295cfaae6ee7f4c_1-41ebe9635716b926b329f530391713.mp4"}], "name": "cluster3"}, {"color": "#feeda1", "children": [{"name": "0167b5419cdabe121cb4a65e4a6bc9fa_1-c3193d9003c17d35c9823b71802039.mp4"}, {"name": "018a13837514bf46295802244586525e_1-c42c68c7e75c1161b3d5e77816e14c.mp4"}], "name": "cluster4"}, {"color": "#f1f9a9", "children": [{"name": "01934d5c7d3e7120f33ac34ac24c4d8b_1-eed0305ebc491848b4369e88b4721d.mp4"}], "name": "cluster5"}, {"color": "#bfe5a0", "children": [{"name": "01ab1348d3186a1336dcd1fdf98af3df_1-15f748b50d25bb646e423f790820b3.mp4"}], "name": "cluster6"}, {"color": "#74c7a5", "children": [{"name": "01cc7e5af3c6ed3e81dcf99ff2ec4a56_1-09c23856b5ff2dfd72cb13c502080d.mp4"}], "name": "cluster7"}, {"color": "#378ebb", "children": [{"name": "01d57beec61cdf4b20f37f1af1fbfe23_1-d93bfdce1c419f4204836789bb565f.mp4"}], "name": "cluster8"}, {"color": "#5e4fa2", "children": [{"name": "01e050e25f96845b62e5b7e08e8431cb_1-8d0e0383f1ec94beb40042bb3feb1a.mp4"}], "name": "cluster9"}], "name": "clusters"} -------------------------------------------------------------------------------- /visualization/data/similarity_cluster.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/USCDataScience/hadoop-pot/c3e7d1dda74ce56c25f574795c09feb9a6429c62/visualization/data/similarity_cluster.png -------------------------------------------------------------------------------- /visualization/data/similarity_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/hadoop-pot/c3e7d1dda74ce56c25f574795c09feb9a6429c62/visualization/data/similarity_heatmap.png -------------------------------------------------------------------------------- /visualization/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/hadoop-pot/c3e7d1dda74ce56c25f574795c09feb9a6429c62/visualization/favicon.ico -------------------------------------------------------------------------------- /visualization/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Similarity for Video files 4 | 5 | 6 | 7 | 8 | 9 | 10 |

Range for similarity:

11 |
12 | 0 < - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - > 1
13 |
14 |
15 |
16 |
17 | 18 |
19 |
27 | 28 | -------------------------------------------------------------------------------- /visualization/js/dashboard.js: -------------------------------------------------------------------------------- 1 | angular.module('myApp', []) //main controller 2 | .controller('myCtrl',['$scope','$http', 3 | function ($scope, $http) { 4 | $scope.legends ={} 5 | var VIDEO_PATH = "data/ht_video_pot_test_set/"; 6 | //FILL link here 7 | var GOOGLE_FORMS_URL=""; 8 | $scope.video1 = ""; 9 | $scope.video2 = ""; 10 | $scope.score = 0.0; 11 | $scope.videoId1 = 0; 12 | $scope.videoId2 = 0; 13 | //Add paths to different data set here 14 | $scope.dataSet = ['data/formatted_similarity_calc.csv','data/formatted_similarity_calc_6.csv']; 15 | 16 | $scope.readCSV = function() { 17 | // http get request to read CSV file content 18 | if($scope.selectedDataSet){ 19 | $http.get($scope.selectedDataSet).success($scope.processData); 20 | }else{ 21 | $http.get($scope.dataSet[0]).success($scope.processData); 22 | } 23 | 24 | }; 25 | 26 | $scope.processData = function(allText) { 27 | // split content based on new line 28 | var allTextLines = allText.split(/\r\n|\n/); 29 | var headers = allTextLines[0].split(','); 30 | var lines = []; 31 | 32 | for ( var i = 0; i < allTextLines.length; i++) { 33 | // split content based on comma 34 | var data = allTextLines[i].split(','); 35 | if (data.length == headers.length) { 36 | var tarr = []; 37 | for ( var j = 0; j < headers.length; j++) { 38 | if(i==0){ 39 | tarr.push(j); 40 | $scope.legends[j]=data[j] 41 | continue; 42 | } 43 | if(j==0){ 44 | tarr.push(i); 45 | continue; 46 | } 47 | 48 | // tarr.push((data[j]*100).toFixed(0) + "%"); 49 | tarr.push(data[j]); 50 | } 51 | lines.push(tarr); 52 | } 53 | } 54 | 55 | $scope.data = lines; 56 | }; 57 | 58 | $scope.showVideos = function(vid1, vid2, score){ 59 | $scope.videoId1=vid1; 60 | $scope.videoId2=vid2; 61 | $scope.video1 = VIDEO_PATH + $scope.legends[vid1] 62 | $scope.video2 = VIDEO_PATH + $scope.legends[vid2] 63 | $scope.score = score; 64 | 65 | $scope.playVideo(document.getElementById("video1")); 66 | $scope.playVideo(document.getElementById("video2")); 67 | 68 | } 69 | 70 | $scope.playVideo = function(video) { 71 | video.addEventListener('loadeddata', function() { 72 | video.play() 73 | }, false); 74 | 75 | } 76 | 77 | $scope.recordFeedback = function(){ 78 | $scope.feedback_response="Posting.."; 79 | 80 | $http({ 81 | url:GOOGLE_FORMS_URL, 82 | headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, 83 | params: { 84 | "entry.1986871126" : $scope.video1.substr(VIDEO_PATH.length), 85 | "entry.489660422" : $scope.video2.substr(VIDEO_PATH.length), 86 | "entry.1932134194" : $scope.score, 87 | "entry.19555886": $scope.comments 88 | }, 89 | method:"POST", 90 | }).error(function(data, status) { 91 | //Error is expected as it's cross domain 92 | //But if status is 0 then form was posted fine 93 | if(status == 0){ 94 | $scope.feedback_response="Posted. Thanks!" 95 | }else{ 96 | $scope.feedback_response="Error! 
Contact support" 97 | } 98 | 99 | }); 100 | } 101 | $scope.readCSV(); 102 | }]) //Filter for percentage in css 103 | .filter('percentage', ['$filter', function ($filter) { 104 | return function (input, decimals) { 105 | // works only for fractions 106 | if(input>1){ 107 | return 0; 108 | } 109 | return $filter('number')(input * 100, decimals) + '%'; 110 | }; 111 | }]) //Filter for range 112 | .filter('range', function() { 113 | return function(input, min, max) { 114 | // works only for fractions 115 | if (input > 1) { 116 | return input; 117 | } 118 | if (input >= min && input <= max) { 119 | return input; 120 | } 121 | return ""; 122 | }; 123 | }); 124 | 125 | -------------------------------------------------------------------------------- /visualization/js/matrix.js: -------------------------------------------------------------------------------- 1 | var header; 2 | var head=0; 3 | window.onload = function () { 4 | 5 | d3.csv("data/formatted_similarity_calc.csv", function (error,data) { 6 | console.log("in"); 7 | if (error) 8 | throw error; 9 | var label_col_full = Object.keys(data[0]); 10 | header = d3.keys(data[0]); 11 | var label_row = []; 12 | var rows = []; 13 | var row = []; 14 | var temp; 15 | for (var i = 0; i < data.length; i++) { 16 | temp=data[i][label_col_full[0]]; 17 | label_row.push(temp); 18 | row = []; 19 | 20 | for (var j = 1; j < label_col_full.length; j++) { 21 | 22 | temp=parseFloat(data[i][label_col_full[j]]); 23 | row.push(temp); 24 | 25 | } 26 | rows.push(row); 27 | 28 | } 29 | 30 | d3.select("svg").remove(); 31 | d3.select("rowLabelg").remove(); 32 | main(rows, label_col_full.slice(1), label_row); 33 | 34 | }); 35 | }; 36 | 37 | var mapsize = 2000; 38 | var pixelsize = 20; 39 | var cellsize = pixelsize-1; 40 | 41 | d3.select('.tooltip').style('padding',' 10px') 42 | .style('background',' white') 43 | .style('border-radius',' 10px') 44 | .style('box-shadow',' 4px 4px 10px rgba(0, 0, 0, 0.4)'); 45 | 46 | var main = function (corr, label_col, label_row) { 47 | 48 | var transition_time = 1500; 49 | var body = d3.select('body'); 50 | body.select('g.legend').style('position','absolute') 51 | .style('height','25px') 52 | .style('width','400px').style('margin','auto').style('margin-left','100px') 53 | .style('background','linear-gradient(to right,#c8f2b9,#db3db6)'); 54 | var tooltip = body.select('div.tooltip'); 55 | var svg = body.select('#chart').append('svg') 56 | .attr('width', mapsize*3-500) 57 | .attr('height', mapsize-1400).style('margin','auto').style('margin-top','-50px').style('margin-left','150px'); 58 | 59 | 60 | var row = corr; 61 | var col = d3.transpose(corr); 62 | var total_len; 63 | 64 | var indexify = function (mat) { 65 | var res = [], temp; // declare temp locally to avoid an implicit global 66 | total_len = mat.length; 67 | console.log(total_len); 68 | for (var i = 0; i < mat.length; i++) { 69 | for (var j = 0; j < mat[0].length; j++) { 70 | if(isNaN(mat[i][j])) 71 | temp = 0; 72 | else 73 | temp=mat[i][j]; 74 | res.push({ 75 | i: i, 76 | j: j, 77 | val: temp 78 | 79 | }); 80 | 81 | } 82 | 83 | } 84 | return res; 85 | }; 86 | 87 | var corr_data = indexify(corr); 88 | var order_col = d3.range(label_col.length + 1); 89 | var order_row = d3.range(label_row.length + 1); 90 | 91 | var color = d3.scale.linear() 92 | .domain([ 0, 1]) 93 | .range(['#c8f2b9', '#db3db6']); 94 | 95 | var scale = d3.scale.linear() 96 | .domain([0, d3.min([50, d3.max([label_col.length, label_row.length, 4])])]) 97 | .range([0, parseFloat(1) * 250]); 98 | 99 | 100 | 101 | var label_space = 50; 102 | 103 | var matrix = svg.append('g') 104 |
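// (annotation, not part of the original matrix.js) the chain below renders one
// (pixelsize - 1)px square per matrix cell at x = j*pixelsize + label_space and
// y = i*pixelsize + label_space - 5, filled by the linear color scale running
// from #c8f2b9 (similarity 0) to #db3db6 (similarity 1), with hover tooltips
// showing the two file names and the similarity value.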
.attr('class', 'matrix') 105 | .attr('height',mapsize-1400) 106 | .attr('width',mapsize*3-500) 107 | .attr('transform', 'translate(' + (label_space + 10) + ',' + (label_space + 10) + ')') 108 | .selectAll('rect.pixel').data(corr_data) 109 | .enter().append('rect') 110 | .attr('class', 'pixel') 111 | .attr('width', cellsize) 112 | .attr('height', cellsize) 113 | .attr('position','absolute') 114 | .attr('y',function(d){return d.i*pixelsize+ label_space-5}) 115 | .attr('x',function(d){return d.j*pixelsize + label_space}) 116 | .style('fill', function (d) { 117 | return color(d.val); 118 | }) 119 | .on('mouseover', function (d) { 120 | tooltip.style("opacity", 0.8) 121 | .style('position', 'absolute') 122 | .style("left", (d3.event.pageX + 35) + 'px') 123 | .style("top", (d3.event.pageY + 30) + 'px') 124 | .html('File: ' + header[d.i+1] + "<br>" + "File: " + header[d.j+1] + "<br>" + "Value: " + d.val.toFixed(3)); 125 | 126 | 127 | d3.select(this).style("opacity", 0.5); 128 | }) 129 | .on('mouseout', function (d) { 130 | tooltip.style("opacity", 1e-6); 131 | d3.select(this).style("opacity", 1); 132 | }); 133 | 134 | 135 | rowLabel = [] 136 | colLabel = [] 137 | 138 | for(var head=1; head=3: 42 | limit_smallest_cluster = int(sys.argv[2]) 43 | else: 44 | limit_smallest_cluster = None 45 | 46 | print path_to_sim_mat 47 | print num_videos 48 | print "Filter -", limit_smallest_cluster 49 | # load data from formatted_similarity_calc.csv 50 | # skip header 51 | # skip the first (label) column, so usecols=range(1, num_videos+1) 52 | # the empty lower half is filled with zeros (filling_values=0) 53 | data = np.genfromtxt(path_to_sim_mat, 54 | delimiter=",", skip_header=1, usecols=range(1, num_videos+1), 55 | filling_values=0) 56 | 57 | ## add the matrix to its transpose to fill the lower half 58 | data = np.triu(data).T + np.triu(data) 59 | ## the diagonal is added to itself above, so reset it to 1 60 | np.fill_diagonal(data, 1) 61 | # We have a similarity matrix; to make it a distance matrix we 62 | # subtract the similarity score from 1 63 | data = 1 - data 64 | print "Data loaded" 65 | 66 | db = DBSCAN(eps=0.2).fit(data) 67 | with open(path_to_sim_mat) as f: 68 | videos = f.readline().strip().split(",")[1:] 69 | 70 | ## each index stores its cluster label 71 | clusters = db.labels_ 72 | 73 | ## map of cluster label to the set of videos it contains 74 | video_clusters = {} 75 | for cluster, video in zip(clusters, videos): 76 | if cluster not in video_clusters: 77 | video_clusters[cluster] = [] 78 | 79 | video_clusters[cluster].append(video) 80 | 81 | 82 | print "clusters calculated" 83 | 84 | ############################################################################## 85 | # Plot result 86 | 87 | core_samples_mask = np.zeros_like(clusters, dtype=bool) 88 | core_samples_mask[db.core_sample_indices_] = True 89 | 90 | # Black is removed from the palette and used for noise instead. 91 | unique_labels = set(clusters) 92 | colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))) 93 | 94 | labelCtr = Counter(clusters) 95 | 96 | # The slices for the pie chart will be ordered and plotted counter-clockwise.
97 | fracs = [] 98 | 99 | figure1 = plt.figure() 100 | ax1 = figure1.add_axes([0.07,0.25,0.90,0.70]) 101 | 102 | # initialize d3-hierarchy json 103 | clusterJson = {"children":[],"name": "clusters"} 104 | # initialize FILTERED d3-hierarchy json 105 | clusterJsonFiltered = {"children":[],"name": "clusters"} 106 | 107 | # Single loop for forming the pie chart, d3 JSON, and cluster image 108 | for k, col in zip(unique_labels, colors): 109 | ## setting frac (slice size) for the pie chart 110 | fracs.append(labelCtr[k]) 111 | 112 | ## d3 json 113 | clusterJsonChild = {"children":[],"name": "cluster"+str(k),"color":lib_colors.rgb2hex(col)} 114 | clusterJsonChildFiltered = {"children":[],"name": "cluster"+str(k),"color":lib_colors.rgb2hex(col)} 115 | 116 | for video in video_clusters[k]: 117 | clusterJsonChild["children"].append({"name": video}) 118 | 119 | # check if the filter is enabled and the cluster qualifies 120 | if(limit_smallest_cluster and len(video_clusters[k]) >= limit_smallest_cluster): 121 | clusterJsonChildFiltered["children"].append({"name": video}) 122 | 123 | 124 | clusterJson["children"].append(clusterJsonChild) 125 | 126 | # check if the filter is enabled and there are nodes after filtering 127 | if(limit_smallest_cluster and len(clusterJsonChildFiltered["children"])>0): 128 | clusterJsonFiltered["children"].append(clusterJsonChildFiltered) 129 | 130 | 131 | ## cluster image 132 | if k == -1: 133 | # Black used for noise. 134 | col = 'k' 135 | 136 | class_member_mask = (clusters == k) 137 | 138 | xy = data[class_member_mask & core_samples_mask] 139 | ax1.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 140 | markeredgecolor='k', markersize=14) 141 | 142 | xy = data[class_member_mask & ~core_samples_mask] 143 | ax1.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 144 | markeredgecolor='k', markersize=6) 145 | 146 | plt.title('Estimated number of clusters: %d' % len(unique_labels)) 147 | 148 | 149 | # make a square figure and axes 150 | ax2 = figure1.add_axes([0.4,0.0,0.20,0.20]) 151 | 152 | ax2.pie(fracs, startangle=90, colors=colors) 153 | 154 | plt.savefig('../data/similarity_cluster.png') 155 | 156 | 157 | with open('../data/similarity_cluster.json', 'w') as fp: 158 | json.dump(clusterJson, fp) 159 | 160 | if(limit_smallest_cluster): 161 | with open('../data/similarity_cluster_filtered_'+str(limit_smallest_cluster)+'.json', 'w') as fp: 162 | json.dump(clusterJsonFiltered, fp) 163 | 164 | -------------------------------------------------------------------------------- /visualization/py/similarity_heatmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
17 | 18 | import matplotlib 19 | matplotlib.use('Agg') 20 | 21 | import numpy as np 22 | import pylab as pl 23 | import sys 24 | 25 | pl.ioff() 26 | 27 | if len(sys.argv) != 3: 28 | print "Usage - " 29 | print "python similarity_heatmap.py /path/to/formatted/similarity number_of_videos" 30 | print "number_of_videos can be less than or equal to the number of videos in the similarity matrix" 31 | sys.exit() 32 | 33 | 34 | path_to_sim_mat = sys.argv[1] 35 | num_videos = int(sys.argv[2]) 36 | print path_to_sim_mat 37 | print num_videos 38 | 39 | # load data from formatted_similarity_calc.csv 40 | # skip header 41 | # skip the first (label) column, so usecols=range(1, num_videos+1) 42 | # paint only the upper half; the rest is filled with zeros (filling_values=0) 43 | data = np.genfromtxt(path_to_sim_mat, 44 | delimiter=",", skip_header=1, usecols=range(1, num_videos+1), 45 | filling_values=0) 46 | 47 | print "Data loaded" 48 | 49 | # use a single-hue blue colormap 50 | pl.imshow(data, cmap=pl.cm.Blues, interpolation="nearest") 51 | 52 | # show the color scale 53 | pl.colorbar() 54 | 55 | # toggle to pl.show() to just view the image 56 | pl.savefig('../data/similarity_heatmap.png') 57 | print "saved in ../data/similarity_heatmap.png" 58 | -------------------------------------------------------------------------------- /visualization/py/video_duration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
17 | 18 | ''' 19 | Takes a directory path and outputs stats of video duration 20 | - Total length of all videos in seconds 21 | - Mean video length 22 | - Standard deviation of length 23 | 24 | Install pymediainfo - https://github.com/sbraz/pymediainfo 25 | ''' 26 | 27 | from pymediainfo import MediaInfo 28 | 29 | import os 30 | import sys 31 | import numpy as np 32 | 33 | if len(sys.argv) < 2: 34 | print "Usage -" 35 | print "\t python video_duration.py <dir_path>" 36 | sys.exit() 37 | 38 | file_path = sys.argv[1] 39 | 40 | print "Finding length of all files in ", file_path 41 | 42 | durations = [] 43 | for f in os.listdir(file_path): 44 | if not f.endswith(".mp4"): 45 | continue 46 | media_info = MediaInfo.parse(file_path+"/"+f) 47 | # duration is reported in milliseconds 48 | 49 | # Only if MediaInfo was able to open the video file 50 | if len(media_info.tracks) > 0 and media_info.tracks[0].duration: 51 | duration_in_ms = media_info.tracks[0].duration 52 | 53 | durations.append(1.0*duration_in_ms/1000) 54 | else: 55 | print "Can't open ", file_path+"/"+f 56 | 57 | print "" 58 | print "**********************************" 59 | print "Total Duration - ", sum(durations) 60 | print "Average Duration - ", np.average(durations) 61 | print "Standard deviation of whole set - ", np.std(durations) 62 | print "**********************************" 63 | 64 | --------------------------------------------------------------------------------
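To close the Java side of this listing, here is a small self-contained sketch (all numbers invented) of the similarity math in PoT.kernelDistance above: per-feature chi-square distances are normalized by their dataset-wide means, summed, and squashed into a (0, 1] score via exp(-distance / 10), which is what the similarity CSV/JSON files consumed by the visualization pages contain.

```java
// Self-contained sketch of the kernelDistance math from PoT.java above.
// All numbers below are made up for illustration.
public class KernelMathSketch {
    public static void main(String[] args) {
        double[] chiSquareDists = {0.5, 0.2}; // per-feature-type chi-square distances
        double[] meanDists = {0.4, 0.4};      // dataset-wide mean distance per feature type

        double distance = 0;
        for (int d = 0; d < chiSquareDists.length; d++) {
            // mirrors kernelDistance's guard: divide by 1000000.0 when the mean is 0
            double denom = (meanDists[d] == 0) ? 1000000.0 : meanDists[d];
            distance += chiSquareDists[d] / denom;
        }

        // exp(-(1.25 + 0.5) / 10) = exp(-0.175), roughly 0.839
        System.out.println("similarity = " + Math.exp(-1 * distance / 10));
    }
}
```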