├── LICENSE ├── README.md ├── config.properties ├── eval.properties ├── log4j.properties ├── pom.xml ├── samples ├── building_concepts.txt └── samples.zip └── src ├── main └── java │ └── gr │ └── iti │ └── mklab │ ├── data │ ├── GeoCell.java │ └── ImageMetadata.java │ ├── methods │ ├── LanguageModel.java │ ├── MultipleGrid.java │ ├── SimilaritySearch.java │ └── TermCellProbs.java │ ├── metrics │ ├── Entropy.java │ └── Locality.java │ ├── mmcomms16 │ ├── AmbiguityBasedSampling.java │ ├── BuildingSampling.java │ ├── GeographicalUniformSampling.java │ ├── GeographicallyFocusedSampling.java │ ├── Sampling.java │ ├── TextBasedSampling.java │ ├── TextDiversitySampling.java │ ├── UserUniformSampling.java │ └── VisualSampling.java │ ├── tools │ ├── CenterOfGravity.java │ ├── DataManager.java │ ├── InterfaceTermCellProb.java │ └── SimilarityCalculator.java │ └── util │ ├── EasyBufferedReader.java │ ├── EasyBufferedWriter.java │ ├── Progress.java │ ├── TextUtil.java │ └── Utils.java └── test └── java └── gr └── iti └── mklab └── main ├── Evaluation.java └── MultimediaGeotagging.java /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Multimedia Geotagging 2 | ====== 3 | 4 | This repository contains the implementation of algorithms that estimate the geographic location of multimedia items based on their textual content. The approach is described here and here. It was submitted to the MediaEval Placing Task 2016. 5 | 6 | 7 | 8 |
<h3>Main Method</h3>
9 | 10 | The approach is a refined language model, including feature selection and weighting schemes and heuristic techniques that improve the accuracy at finer granularities. It is a text-based method, in which a complex geographical-tag model is built from the tags, titles and locations of a massive amount of geotagged images included in a training set, in order to estimate the location of each query image included in a test set. 11 | 12 | The main approach comprises two major processing steps, an offline and an online one. 13 | 14 |
<h3>Offline Processing Step</h3>
15 | 16 | * Pre-processing (see the sketch after this list) 17 | * apply URL decoding, lowercase transformation and tokenization 18 | * remove accents, punctuation and symbols (e.g. “.%!&”) 19 | * discard terms consisting only of numerals or of fewer than three characters 20 | 21 | * Language Model 22 | * divide the earth's surface into rectangular cells with a side length of 0.01° 23 | * calculate term-cell probabilities based on the users that used the term inside the cell 24 | 25 | * Feature selection 26 | * calculate the locality score of every term in the dataset 27 | * locality is based on the term frequency and on the neighboring users that have used the term in the cell distribution 28 | * the final set of selected terms is formed from the terms with a locality score greater than zero 29 | 30 | * Feature weighting using spatial entropy 31 | * calculate the spatial entropy of every term by applying the Shannon entropy formula to the term-cell probabilities 32 | * spatial entropy weights derive from a Gaussian weight function over the spatial entropy of the terms 33 | * locality weights derive from the relative position of the terms in the ranking based on their locality score 34 | * combine the locality and spatial entropy weights to generate the final weights 35 | 36 |
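The sketch below condenses the pre-processing and cell-assignment steps above into plain Java, for illustration only. The class and method names (`OfflineStepSketch`, `normalizeTerm`, `cellId`) are hypothetical, while the cell-ID convention (longitude and latitude rounded to `scale` decimals and joined with an underscore) follows the one used in `TermCellProbs`.

```java
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.text.Normalizer;

// Hypothetical helper mirroring the offline pre-processing and cell assignment.
public class OfflineStepSketch {

    // lowercase, strip accents/punctuation and filter numeric or too-short terms
    public static String normalizeTerm(String raw) {
        String term = Normalizer.normalize(raw.toLowerCase(), Normalizer.Form.NFD)
                .replaceAll("\\p{M}", "")       // remove accents
                .replaceAll("[^a-z0-9]", "");   // remove punctuation and symbols
        // discard terms consisting only of numerals or of fewer than three characters
        return (term.length() < 3 || term.matches("[0-9]+")) ? null : term;
    }

    // assign a longitude/latitude pair to a rectangular cell; at scale 2 the
    // cell side is 0.01 degrees, i.e. the grid of the coarser language model
    public static String cellId(double lon, double lat, int scale) {
        BigDecimal cellLon = new BigDecimal(lon).setScale(scale, RoundingMode.HALF_UP);
        BigDecimal cellLat = new BigDecimal(lat).setScale(scale, RoundingMode.HALF_UP);
        return cellLon + "_" + cellLat;         // cell ID convention: lon_lat
    }

    public static void main(String[] args) {
        System.out.println(normalizeTerm("Café!"));       // prints: cafe
        System.out.println(cellId(-73.9857, 40.7484, 2)); // prints: -73.99_40.75
    }
}
```

The term-cell probability of a term is then simply the number of distinct users that used it inside a cell divided by the total number of distinct users of that term, as computed in `TermCellProbs`.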
<h3>Online Processing Step</h3>
37 | 38 | * Language Model based estimation (prior estimation; see the sketch after this list) 39 | * the probability of each cell is calculated 40 | * the Most Likely Cell (MLC) is the cell with the highest probability and is used to produce the estimation 41 | 42 | * Multiple Resolution Grids 43 | * build separate language models for multiple resolution grids (side lengths 0.01° and 0.001°) 44 | * estimate the MLC by combining the results of the individual language models 45 | 46 | * Similarity Search 47 | * determine the most similar training images within the MLC 48 | * their center-of-gravity is the final location estimation 49 | 50 | 51 |
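As an illustration of the prior estimation, the hedged sketch below accumulates, for every query term, its weighted term-cell probabilities and returns the cell with the highest score. The class name and map layouts are assumptions; the 0.8/0.2 combination of the locality and spatial entropy weights is the one used in `LanguageModel`.

```java
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

// Hypothetical, self-contained sketch of the Language Model (prior) estimation.
public class MlcSketch {

    // queryTerms    : terms of the query image
    // termCellProbs : term -> (cell ID -> probability)
    // termWeights   : term -> {spatial entropy weight, locality weight}
    public static String estimateMlc(Set<String> queryTerms,
            Map<String, Map<String, Double>> termCellProbs,
            Map<String, Double[]> termWeights) {

        Map<String, Double> cellScores = new HashMap<String, Double>();
        for (String term : queryTerms) {
            if (!termCellProbs.containsKey(term) || !termWeights.containsKey(term)) {
                continue; // term was not selected by the language model
            }
            // weight combination used in LanguageModel: 0.8*locality + 0.2*entropy
            double weight = 0.8 * termWeights.get(term)[1]
                    + 0.2 * termWeights.get(term)[0];
            for (Entry<String, Double> cell : termCellProbs.get(term).entrySet()) {
                Double sum = cellScores.get(cell.getKey());
                cellScores.put(cell.getKey(),
                        (sum == null ? 0.0 : sum) + cell.getValue() * weight);
            }
        }

        // the Most Likely Cell is the cell with the highest accumulated score
        String mlc = "N/A"; // fallback when no query term is in the model
        double best = Double.NEGATIVE_INFINITY;
        for (Entry<String, Double> cell : cellScores.entrySet()) {
            if (cell.getValue() > best) {
                best = cell.getValue();
                mlc = cell.getKey();
            }
        }
        return mlc;
    }
}
```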
<h3>Instructions</h3>
52 | 53 | In order to run the project, you have to set all the necessary arguments in the configuration file, following the instructions for every argument. The default values may be used. 54 | 55 | 56 | _Input File_<br>
57 | The input files must be in the same format as the YFCC100M dataset. 58 | 59 | 60 | _Output Files_<br>
61 | At the end of the training process, the algorithm creates a folder named `TermCellProbs` and, inside it, a subfolder named `scale_(s)`, named after the scale `s` of the language model's cells. The format of the generated file is the following (a short parsing sketch is given at the end of this section). 62 | 63 | term cell1-lon_cell1-lat>cell1-prob>cell1-users cell2-lon_cell2-lat>cell2-prob>cell2-users... 64 | 65 | `term`: the actual name of the term<br>
66 | `cellx`: the x-th most probable cell.<br>
67 | `cellx-lon_cellx-lat`: the longitude and latitude of the center of `cellx`, which is used as the cell ID<br>
68 | `cellx-prob`: the probability of `cellx` for the specific term<br>
69 | `cellx-users`: the number of users that used the specific term in `cellx` 70 | 71 | The output of the feature weighting scheme is a folder named `Weights` containing two files, one for the locality weights and one for the spatial entropy weights, named `locality_weights` and `spatial_entropy_weights`, respectively. Each row contains a term and its corresponding weight, separated by a tab. 72 | 73 | The files described above are given as input to the Language Model estimation process. During this process, a folder named `resultsLM` is created and, inside it, two files named `resultsLM_scale(s)`, which contain the MLCs of the query images. Every row contains the image ID and the MLC (tab-separated) of the image in the corresponding line of the test set. Also, a file named `resultsLM_scale(s)_conf_evid` is created in the same folder, containing, for every query image, the confidence and the evidence that led to the estimated MLC. 74 | 75 | Having estimated the MLCs for both granularity grids, the files are fed to the Multiple Resolution Grids technique, which produces a file named `resultsLM_mg(cs)-(fs)`, where `(cs)` and `(fs)` stand for the coarser and finer granularity grids, respectively. Every row of this file contains the image ID, the MLC of the coarser language model and the result of the Multiple Resolution Grids technique, separated by a `>`. 76 | 77 | Finally, the file created by the Multiple Resolution Grids technique is used for the last process of the algorithm, Similarity Search. During this process, a folder named `resultSS` is created, containing the similarity values and the locations of the training images that fall inside the MLC of every image in the test set. The final results are saved in the file specified in the arguments; the records in each row are the ID of the query image, the real longitude and latitude, and the estimated longitude and latitude, all tab-separated. 78 | 79 |
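To make the formats above concrete, the following hedged sketch parses one line of the term-cell probability file into a cell-probability map; the class name is hypothetical, and only the tab, space and `>` separators documented above are assumed.

```java
import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical parser for one line of the term-cell probability file:
// term \t cell1-lon_cell1-lat>cell1-prob>cell1-users cell2-lon_cell2-lat>...
public class TermCellLineParser {

    public static Map<String, Double> parseCellProbs(String line) {
        String[] parts = line.split("\t");
        Map<String, Double> cellProbs = new LinkedHashMap<String, Double>();
        if (parts.length < 2) {
            return cellProbs;                  // term with no cell entries
        }
        for (String cell : parts[1].split(" ")) {
            String[] fields = cell.split(">"); // [cell ID, probability, users]
            cellProbs.put(fields[0], Double.parseDouble(fields[1]));
        }
        return cellProbs;
    }

    public static void main(String[] args) {
        String line = "tower\t-0.12_51.5>0.65>13 2.29_48.86>0.35>7";
        System.out.println(parseCellProbs(line)); // {-0.12_51.5=0.65, 2.29_48.86=0.35}
    }
}
```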
<h3>Evaluation Framework</h3>
80 | 81 | This package contains the implementations of the sampling strategies described in the MMCommons 2016 paper. In order to run the evaluation framework, you have to set all the necessary arguments in the configuration file, following the instructions for every argument. To run the code, the Evaluation class has to be executed (a distance-scoring sketch is given below). 82 | 83 | Additionally, this folder contains the zip file with the collections generated by the different sampling strategies, as well as the file of the building concepts. Keep in mind that geographical uniform sampling, user uniform sampling and text diversity sampling generate different files on every execution, because they involve random selections and permutations. 84 | 85 |
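As a hedged illustration of how a single estimate can be scored, the sketch below computes the geodesic distance between the real and the estimated coordinates using the GeographicLib dependency already declared in the `pom.xml`, and checks it against the precision ranges of `10^scale` km configured through `minRangeScale` and `maxRangeScale` in `eval.properties`; the class and method names are hypothetical.

```java
import net.sf.geographiclib.Geodesic;
import net.sf.geographiclib.GeodesicData;

// Hypothetical scorer: geodesic distance between ground truth and estimate,
// bucketed into the precision ranges used by the evaluation framework.
public class PrecisionRangeSketch {

    public static double distanceKm(double lat1, double lon1, double lat2, double lon2) {
        GeodesicData g = Geodesic.WGS84.Inverse(lat1, lon1, lat2, lon2);
        return g.s12 / 1000.0; // s12 is the geodesic distance in meters
    }

    public static void main(String[] args) {
        double dist = distanceKm(40.7484, -73.9857, 40.7527, -73.9772);
        for (int scale = -2; scale <= 3; scale++) { // ranges from 0.01km to 1000km
            double range = Math.pow(10, scale);
            System.out.println("within " + range + "km: " + (dist <= range));
        }
    }
}
```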
<h3>Demo Version</h3>
86 | 87 | A demo version and a Storm module of the approach have been developed. 88 | 89 |
<h3>Contact for further details about the project</h3>
90 | 91 | Giorgos Kordopatis-Zilos (georgekordopatis@iti.gr)<br>
92 | Symeon Papadopoulos (papadop@iti.gr) 93 | -------------------------------------------------------------------------------- /config.properties: -------------------------------------------------------------------------------- 1 | #Project directory 2 | dir=/home/georgekordopatis/Documents/multimedia-geotagging/images/ 3 | 4 | #Processes of the program 5 | #Values: 6 | #create = create the needed sets (training and test) 7 | #train = create Cell-Tag probability file with the entropy value for each tag 8 | #FS = Feature Selection 9 | #LM = Language Model 10 | #IG = Internal Grid 11 | #SS = Similarity Search 12 | #all = all the processes 13 | process=train 14 | 15 | #Folder that contains the training files and Test set file 16 | trainFolder=/yfcc100m/ 17 | testFile=/testset/2016/mediaeval2016_placing_test 18 | 19 | #Scale of Grid 20 | #side cell = 10^(-scale) (i.e. scale 2 = 0.01) 21 | coarserScale=2 22 | finerScale=3 23 | 24 | #Total number of the similar images (k) and the result files of the LM process for multiple grids (input) 25 | #required for IGSS process 26 | k=5 27 | 28 | #Name of the final Result File (output) 29 | resultFile=results_G2-3_k -------------------------------------------------------------------------------- /eval.properties: -------------------------------------------------------------------------------- 1 | #Paths to the input Files 2 | testFile=mediaeval2015_placing_test 3 | placeFile=mediaeval2015_placing_test_places 4 | conceptFile=mediaeval2015_placing_test_autotags 5 | resultFile=results 6 | 7 | #Sampling Strategy 8 | #GUS <-- Geographical Uniform Sampling 9 | #UUS <-- User Uniform Sampling 10 | #TBS <-- Text-based Sampling 11 | #TDS <-- Text Diversity Sampling 12 | #GFS <-- Geographically Focused Sampling 13 | #ABS <-- Ambiguity-based Sampling 14 | #VS <-- Visual Sampling 15 | #BS <-- Building Sampling 16 | #(Empty) <-- No sampling 17 | sampling=GUS 18 | 19 | #Minimum and Maximum precision range 20 | #precisionrange = 10^(scale) (i.e. scale -1 --> range 0.1km) 21 | minRangeScale=-2 22 | maxRangeScale=3 23 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | # Set up logging to include a file record of the output 2 | # Note: the file is always created, even if there is 3 | # no actual output. 
4 | log4j.rootLogger=info, stdout, R 5 | 6 | # Log format to standard out 7 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 8 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.stdout.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n 10 | 11 | # File based log output 12 | log4j.appender.R=org.apache.log4j.RollingFileAppender 13 | log4j.appender.R.File=testout.log 14 | log4j.appender.R.MaxFileSize=100000KB 15 | log4j.appender.R.encoding=UTF-8 16 | # Keep one backup file 17 | log4j.appender.R.MaxBackupIndex=1 18 | log4j.appender.R.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.R.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 4.0.0 5 | gr.iti.mklab 6 | multimedia-geotagging 7 | 0.1-SNAPSHOT 8 | jar 9 | 10 | multimedia-geotagging 11 | https://github.com/socialsensor/multimedia-geotagging 12 | Contains the implementation of algorithms that estimate the geographic location of media content based on their content and metadata. 13 | 14 | 15 | 16 | gkordo 17 | Giorgos Kordopatis-Zilos 18 | georgekordopatis@iti.gr 19 | 20 | 21 | 22 | 23 | 24 | The Apache Software License, Version 2.0 25 | http://www.apache.org/licenses/LICENSE-2.0.txt 26 | repo 27 | 28 | 29 | 30 | 31 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git 32 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git 33 | git@github.com:socialsensor/multimedia-geotagging.git 34 | 35 | 36 | 37 | UTF-8 38 | 39 | 40 | 41 | 42 | 43 | junit 44 | junit 45 | 3.8.1 46 | test 47 | 48 | 49 | 50 | log4j 51 | log4j 52 | 1.2.16 53 | 54 | 55 | 56 | org.apache.hadoop 57 | hadoop-core 58 | 1.2.1 59 | 60 | 61 | 62 | org.apache.commons 63 | commons-math3 64 | 3.4.1 65 | 66 | 67 | 68 | info.debatty 69 | java-lsh 70 | 0.10 71 | 72 | 73 | 74 | net.sf.geographiclib 75 | GeographicLib-Java 76 | 1.42 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-compiler-plugin 86 | 2.5.1 87 | 88 | 1.6 89 | 1.6 90 | 91 | 92 | 93 | 94 | org.apache.maven.plugins 95 | maven-source-plugin 96 | 2.2.1 97 | 98 | 99 | attach-sources 100 | 101 | jar 102 | 103 | 104 | 105 | 106 | 107 | org.apache.maven.plugins 108 | maven-javadoc-plugin 109 | 2.9.1 110 | 111 | 112 | attach-javadocs 113 | 114 | jar 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /samples/building_concepts.txt: -------------------------------------------------------------------------------- 1 | flying buttress 2 | brussels carpet 3 | capitol 4 | rose window 5 | abbey 6 | coliseum 7 | nave 8 | cathedral 9 | pantheon 10 | chateau 11 | belfry 12 | gothic 13 | temple 14 | aisle 15 | pointed arch 16 | rotunda 17 | organ loft 18 | onion dome 19 | palace 20 | bastion 21 | campanile 22 | cloister 23 | dome 24 | clock tower 25 | roman arch 26 | round arch 27 | amphitheater 28 | church 29 | facade 30 | frieze 31 | ceiling 32 | ballpark 33 | gargoyle 34 | colonnade 35 | manor 36 | altar 37 | battlement 38 | corbel 39 | castle 40 | brownstone 41 | mansion 42 | fortification 43 | pediment 44 | row house 45 | pedestal 46 | acropolis 47 | apartment 48 | building complex 49 | skyscraper 50 | stronghold 51 | monument 52 | fortress 53 | great hall 54 | tower 55 | drawbridge 56 | arch 57 | portico 58 | stadium 59 | field house 60 | condominium 61 | fort 62 | steeple 63 | 
steel arch bridge 64 | memorial 65 | column 66 | gable 67 | stained 68 | dome building 69 | watchtower 70 | marina 71 | city 72 | support column 73 | concrete 74 | cantilever bridge 75 | building 76 | roof 77 | door knocker 78 | building structure 79 | department store 80 | cityscape 81 | bazaar 82 | casino 83 | baluster 84 | auditorium 85 | hall 86 | truss 87 | brickwork 88 | assembly hall 89 | harbor 90 | radome 91 | architecture 92 | warehouse 93 | chandelier 94 | house 95 | window box 96 | ruins 97 | greenhouse 98 | stairwell 99 | window 100 | lighthouse 101 | mezzanine 102 | country house 103 | library 104 | stairs 105 | bookshop 106 | waterfront 107 | cemetery 108 | villa 109 | rafter 110 | stoop 111 | resort 112 | brick 113 | bannister 114 | mantel 115 | wall 116 | loft 117 | shelter 118 | cafeteria 119 | farmhouse 120 | cabin 121 | -------------------------------------------------------------------------------- /samples/samples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/socialsensor/multimedia-geotagging/08a434ca3f6f11a15824e391b50a53f011d24159/samples/samples.zip -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/data/GeoCell.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.data; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | 7 | import gr.iti.mklab.util.Utils; 8 | 9 | /** 10 | * Class that implements the earth cells. 11 | * @author gkordo 12 | * 13 | */ 14 | public class GeoCell { 15 | 16 | private Double totalProb; 17 | private String id; 18 | private Float confidence; 19 | private Map evidence; 20 | 21 | /** 22 | * Constructor of the class where the id is specified and the 23 | * evidence and the summation of the probabilities are initialized. 24 | * @param id : cell ID 25 | */ 26 | public GeoCell(String id){ 27 | this.id = id; 28 | this.evidence = new HashMap(); 29 | this.totalProb = 0.0; 30 | } 31 | 32 | /** 33 | * 34 | * @return the cell ID 35 | */ 36 | public String getID(){ 37 | return id; 38 | } 39 | 40 | /** 41 | * Set the value of the confidence of choosing that cell. 42 | * @param confidence : value of confidence 43 | */ 44 | public void setConfidence(Float confidence){ 45 | this.confidence = confidence; 46 | } 47 | 48 | /** 49 | * 50 | * @return the confidence of the cell 51 | */ 52 | public Float getConfidence(){ 53 | return confidence; 54 | } 55 | 56 | /** 57 | * 58 | * @return the summation of all probabilities 59 | */ 60 | public Double getTotalProb() { 61 | return totalProb; 62 | } 63 | 64 | /** 65 | * Add the given probability to the summation and store the word. 
66 | * @param prob : probability of the word 67 | * @param word : actual word 68 | */ 69 | public void addProb(double prob, String word) { 70 | totalProb += prob; 71 | this.evidence.put(word, (float) prob); 72 | } 73 | 74 | /** 75 | * 76 | * @return the sorted map of the word and their probabilities 77 | */ 78 | public Map getEvidence(){ 79 | Map unsortMap = new HashMap(); 80 | for(Entry word:evidence.entrySet()){ 81 | if(word.getValue()/totalProb>0.0001){ 82 | unsortMap.put(word.getKey(), (float) (word.getValue()/totalProb)); 83 | } 84 | } 85 | return Utils.sortByValues(unsortMap); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/data/ImageMetadata.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.data; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * The class that contains the metadata of an image. 7 | * @author gkordo 8 | * 9 | */ 10 | public class ImageMetadata{ 11 | 12 | private String imageID; 13 | private String predictedCell,coarserCell; 14 | private String userID; 15 | private Set tags; 16 | 17 | /** 18 | * Constructor using the metadata provided by the dataset file 19 | * @param id : image ID 20 | * @param userID : user ID 21 | * @param tags : image tags 22 | */ 23 | public ImageMetadata (String id, String userID, Set tags) { 24 | this.imageID = id; 25 | this.userID = userID; 26 | this.tags = tags; 27 | } 28 | 29 | public String getId () { 30 | return imageID; 31 | } 32 | 33 | public String getUserId () { 34 | return userID; 35 | } 36 | 37 | public Set getTags () { 38 | return tags; 39 | } 40 | 41 | public void setPredictedCell (String cell){ 42 | this.predictedCell = cell; 43 | } 44 | 45 | public void setCoarserCell (String cell){ 46 | this.coarserCell = cell; 47 | } 48 | 49 | public String getCell () { 50 | return predictedCell; 51 | } 52 | 53 | public String getCoarserCell () { 54 | return coarserCell; 55 | } 56 | } -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/methods/LanguageModel.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.methods; 2 | 3 | import gr.iti.mklab.data.GeoCell; 4 | import gr.iti.mklab.tools.DataManager; 5 | import gr.iti.mklab.util.EasyBufferedReader; 6 | import gr.iti.mklab.util.Utils; 7 | import gr.iti.mklab.util.Progress; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.Set; 12 | import java.util.Map.Entry; 13 | 14 | import org.apache.log4j.Logger; 15 | 16 | /** 17 | * This class is the core of the algorithm. It is the implementation of the language model. 18 | * The Most Likely Cell of the given image is calculated. 19 | * @author gkordo 20 | * 21 | */ 22 | public class LanguageModel { 23 | 24 | protected Map selectedTermWeights; 25 | 26 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.LanguageModel"); 27 | 28 | // The function that compose the other functions to calculate and 29 | // return the Most Likely Cell (MLC) for a query item. 30 | public GeoCell calculateLanguageModel(Set sentenceWords, 31 | Map> termCellProbsMap, boolean confidenceFlag) { 32 | 33 | Map cellMap = calculateCellsProbForImageTags(sentenceWords, 34 | termCellProbsMap); 35 | 36 | GeoCell mlc = findMLC(cellMap, confidenceFlag); 37 | 38 | return mlc; 39 | } 40 | 41 | // find the Most Likely Cell. 
42 | private GeoCell findMLC( 43 | Map cellMap, boolean confidenceFlag) { 44 | 45 | cellMap = Utils.sortByMLCValues(cellMap); 46 | 47 | GeoCell mlc = null; 48 | 49 | if (!cellMap.isEmpty()){ 50 | String mlcId = cellMap.keySet().toArray()[0].toString(); 51 | 52 | mlc = cellMap.get(mlcId); 53 | 54 | if(confidenceFlag) 55 | mlc.setConfidence((float) calculateConfidence(cellMap, mlcId, 0.3)); 56 | } 57 | 58 | return mlc; 59 | } 60 | 61 | // calculate confidence for the estimated location 62 | private static double calculateConfidence(Map cellMap, 63 | String mlc, double l) { 64 | 65 | Double sum = 0.0, total = 0.0; 66 | 67 | for(Entry entry:cellMap.entrySet()){ 68 | double[] mCell = {Double.parseDouble(mlc.split("_")[0]), 69 | Double.parseDouble(mlc.split("_")[1])}; 70 | double[] cell = {Double.parseDouble(entry.getKey().split("_")[0]), 71 | Double.parseDouble(mlc.split("_")[1])}; 72 | if((cell[0] >= (mCell[0]-l)) && (cell[0] <= (mCell[0]+l)) 73 | && (cell[1] >= (mCell[1]-l)) && (cell[1] <= (mCell[1]+l))){ 74 | sum += entry.getValue().getTotalProb(); 75 | } 76 | total += entry.getValue().getTotalProb(); 77 | } 78 | return sum/total; 79 | } 80 | 81 | /** 82 | * This is the function that calculate the cell probabilities. 83 | * @param sentenceWords : list of words contained in tweet text 84 | * @return a map of cell 85 | */ 86 | private Map calculateCellsProbForImageTags (Set terms, 87 | Map> termCellProbsMap) { 88 | 89 | Map cellMap = new HashMap(); 90 | 91 | for(String term:terms){ 92 | if(termCellProbsMap.containsKey(term)){ 93 | double locality= selectedTermWeights.get(term)[1]; 94 | double entropy= selectedTermWeights.get(term)[0]; 95 | 96 | for(Entry entry: termCellProbsMap.get(term).entrySet()){ 97 | String cell = entry.getKey(); 98 | if(cellMap.containsKey(cell)){ 99 | cellMap.get(cell).addProb(entry.getValue() 100 | *(0.8*locality+0.2*entropy), term); 101 | }else{ 102 | GeoCell tmp = new GeoCell(cell); 103 | tmp.addProb(entry.getValue() 104 | *(0.8*locality+0.2*entropy), term); 105 | cellMap.put(cell,tmp); 106 | } 107 | } 108 | } 109 | } 110 | return cellMap; 111 | } 112 | 113 | /** 114 | * Initialize Language Model 115 | * @param testFile : file that contains test image metadata 116 | * @param probFile : file that contains the term-cell probabilities 117 | * @param weightFolder : the folder that contains the term weights 118 | * @return the term-cell probability map 119 | */ 120 | public Map> loadTermCellProbsAndWeights( 121 | String testFile, String probFile, String weightFolder){ 122 | 123 | // Feature Selection 124 | loadTermWeights(weightFolder); 125 | 126 | logger.info("loading cells' probabilities for all tags from " + probFile); 127 | 128 | long startTime = System.currentTimeMillis(); 129 | Progress prog = new Progress(startTime,10,1,"loading",logger); 130 | 131 | Map> tagCellProbsMap = 132 | new HashMap>(); 133 | Set termsInTestSet = DataManager.getSetOfTerms(testFile); 134 | 135 | EasyBufferedReader reader = new EasyBufferedReader(probFile); 136 | String line; 137 | // load tag-cell probabilities from the given file 138 | while ((line = reader.readLine())!=null){ 139 | prog.showMessege(System.currentTimeMillis()); 140 | String term = line.split("\t")[0]; 141 | 142 | if(line.split("\t").length>1 && termsInTestSet.contains(term) 143 | && selectedTermWeights.containsKey(term)){ 144 | Map tmpCellMap = new HashMap(); 145 | for(String cell:line.split("\t")[2].split(" ")){ 146 | tmpCellMap.put(cell.split(">")[0], 147 | Double.parseDouble(cell.split(">")[1])); 148 | } 149 | 
tagCellProbsMap.put(term, tmpCellMap); 150 | } 151 | } 152 | logger.info(tagCellProbsMap.size() + " tags loaded in " + 153 | (System.currentTimeMillis()-startTime)/1000.0 + "s"); 154 | reader.close(); 155 | 156 | return tagCellProbsMap; 157 | } 158 | 159 | private void loadTermWeights(String folder){ 160 | 161 | // load locality weight of the terms 162 | EasyBufferedReader reader = new 163 | EasyBufferedReader(folder + "/locality_weights"); 164 | String line; 165 | while ((line = reader.readLine())!=null){ 166 | Double[] temp = {0.0, Double.parseDouble(line.split("\t")[1])}; 167 | selectedTermWeights.put(line.split("\t")[0], temp); 168 | } 169 | reader.close(); 170 | 171 | // load spatial entropy weight of the terms 172 | reader = new EasyBufferedReader( 173 | folder + "/spatial_entropy_weights"); 174 | while ((line = reader.readLine())!=null){ 175 | if(selectedTermWeights.containsKey(line.split("\t")[0])) 176 | selectedTermWeights.get(line.split("\t")[0])[0] = 177 | Double.parseDouble(line.split("\t")[1]); 178 | } 179 | reader.close(); 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/methods/MultipleGrid.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.methods; 2 | 3 | import gr.iti.mklab.util.EasyBufferedReader; 4 | import gr.iti.mklab.util.EasyBufferedWriter; 5 | 6 | import org.apache.log4j.Logger; 7 | 8 | /** 9 | * The implementation of the Internal Grid technique 10 | * @author gkordo 11 | * 12 | */ 13 | public class MultipleGrid { 14 | 15 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.InternalGrid"); 16 | 17 | /** 18 | * Method that perform the Multiple Grid technique and generates 19 | * the arguments for the similarity search Class contractor 20 | * @param dir : directory of the project 21 | * @param resultFile : name of the output file 22 | * @param resultCorserGrid : file with the estimated cells of the coarser grid 23 | * @param resultFinerGrid : file with the estimated cells of the finer grid 24 | */ 25 | public static void determinCellIDsForSS(String dir, String resultFile, 26 | String resultCorserGrid, String resultFinerGrid){ 27 | 28 | logger.info("Process: Multiple Grid Technique\t|\t" 29 | + "Status: INITIALIZE"); 30 | // Initialize parameters 31 | EasyBufferedReader resultLMGCReader = new EasyBufferedReader(dir + resultCorserGrid); 32 | EasyBufferedReader resultLMGFReader = new EasyBufferedReader(dir + resultFinerGrid); 33 | EasyBufferedWriter writer = new EasyBufferedWriter(dir + resultFile); 34 | 35 | String corseMLC; 36 | String fineMLC; 37 | 38 | logger.info("Process: Multiple Grid Technique\t|\t" 39 | + "Status: STARTED"); 40 | 41 | while ((corseMLC=resultLMGCReader.readLine())!=null 42 | && (fineMLC=resultLMGFReader.readLine())!=null){ 43 | 44 | if(!corseMLC.split("\t")[1].equals("N/A")){ 45 | String mlc = deterimBoarders(corseMLC.split("\t")[1], fineMLC.split("\t")[1]); 46 | if(!mlc.isEmpty()){ 47 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1] 48 | + ":" + mlc); // selected cell ID and the sell of the coarser granularity 49 | }else{ 50 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1] 51 | + ":" + corseMLC.split("\t")[1]); 52 | } 53 | writer.newLine(); 54 | } else{ 55 | writer.write(corseMLC.split("\t")[0] + "\tN/A"); 56 | } 57 | } 58 | 59 | logger.info("Process: Multiple Grid Technique\t|\t" 60 | + "Status: COMPLETED"); 61 | 62 | writer.close(); 63 | 
resultLMGCReader.close(); 64 | resultLMGFReader.close(); 65 | } 66 | 67 | /** 68 | * Method that determines the borders of the cell that similarity search will take place 69 | * @param corseMLC : estimated cell of the coarser grid 70 | * @param fineMLC : estimated cell of the finer grid 71 | */ 72 | private static String deterimBoarders(String corseMLC, String fineMLC){ 73 | 74 | String mlc = corseMLC; 75 | 76 | if (!corseMLC.equals("N/A")){ 77 | Double[] corseLatLon = {Double.parseDouble(corseMLC.split("_")[0]), 78 | Double.parseDouble(corseMLC.split("_")[1])}; 79 | 80 | if(!fineMLC.equals("N/A")){ 81 | Double[] fineLatLon = {Double.parseDouble(fineMLC.split("_")[0]), 82 | Double.parseDouble(fineMLC.split("_")[1])}; 83 | 84 | // check whether the estimated cell of the finer grid laying 85 | // inside the borders of the estimated cell of the coarser grid 86 | if(fineLatLon[0]>=(corseLatLon[0]-0.005) 87 | && fineLatLon[0]<=(corseLatLon[0]+0.005) 88 | && fineLatLon[1]>=(corseLatLon[1]-0.005) 89 | && fineLatLon[1]<=(corseLatLon[1]+0.005)){ 90 | mlc = fineMLC; 91 | } 92 | } 93 | } 94 | 95 | return mlc; 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/methods/SimilaritySearch.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.methods; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.log4j.Logger; 10 | 11 | import gr.iti.mklab.tools.CenterOfGravity; 12 | import gr.iti.mklab.util.EasyBufferedWriter; 13 | import gr.iti.mklab.util.Progress; 14 | import gr.iti.mklab.util.EasyBufferedReader; 15 | 16 | /** 17 | * Class that estimates the final location for every query image 18 | * @author gkordo 19 | * 20 | */ 21 | public class SimilaritySearch extends CenterOfGravity{ 22 | 23 | private Map estimatedCellMap = new HashMap(); 24 | private Map similarities = new HashMap(); 25 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch"); 26 | 27 | /** 28 | * Contractor of the class. 29 | * @param multipleGridFile : file that contains the results of the multiple grid technique 30 | * @param similarityFile : file that contains the similar images of every query images 31 | * @param testFile : file that contains the test image's metadata 32 | * @param outputFile : name of the output file 33 | * @param k : number of similar images based on the center-of-gravity is calculated 34 | * @param a : variable required for center-of-gravity calculation 35 | */ 36 | public SimilaritySearch(String testFile,String multipleGridFile, 37 | String similarityFile, String outputFile, int k, int a) { 38 | super(a); 39 | 40 | logger.info("Process: Location Estimation\t|\t" 41 | + "Status: INITIALIZE"); 42 | loadEstimatedCells(multipleGridFile); 43 | logger.info("Process: Location Estimation\t|\t" 44 | + "Status: STARTED"); 45 | estimateLocation(similarityFile,k); 46 | writeResultsInFile(testFile, outputFile); 47 | logger.info("Process: Location Estimation\t|\t" 48 | + "Status: COMPLETED"); 49 | } 50 | 51 | /** 52 | * Function that loads the estimated cells from the Multiple Grid Technique. 
53 | * @param multipleGridFile : ile that contains the results of the multiple grid technique 54 | */ 55 | private void loadEstimatedCells(String multipleGridFile) { 56 | 57 | EasyBufferedReader reader = new EasyBufferedReader(multipleGridFile); 58 | 59 | String line; 60 | while ((line = reader.readLine())!=null){ 61 | if((!line.split("\t")[1].equals("N/A"))){ 62 | estimatedCellMap.put(line.split("\t")[0], line.split("\t")[1]); 63 | } 64 | } 65 | 66 | reader.close(); 67 | } 68 | 69 | /** 70 | * Final location estimation of the images contained in the test set 71 | * @param similarityFile : file that contains the similar images of every query images 72 | * @param cellFile : file that contains the results of the multiple grid technique 73 | * @param k : number of similar images based on the center-of-gravity is calculated 74 | */ 75 | private void estimateLocation(String similarityFile, int k) { 76 | 77 | EasyBufferedReader reader = new EasyBufferedReader(similarityFile); 78 | 79 | Progress prog = new Progress(System.currentTimeMillis(), 1000000, 100, 1, "calculate", logger); 80 | int count=0; 81 | String line; 82 | 83 | // Calculate the final results 84 | while ((line = reader.readLine())!=null){ 85 | prog.showProgress(count, System.currentTimeMillis()); 86 | if(estimatedCellMap.containsKey(line.split("\t")[0])){ 87 | similarities.put(line.split("\t")[0], 88 | findSimilarImages(line, estimatedCellMap.get(line.split("\t")[0]), k)); 89 | } 90 | count++; 91 | } 92 | reader.close(); 93 | } 94 | 95 | /** 96 | * Location estimation for a query image. 97 | * @param line : line that contain the similarity of the train images 98 | * @param cells : estimated cells from the multiple grid technique 99 | * @param k : number of similar images based on the center-of-gravity is calculated 100 | * @return estimated location 101 | */ 102 | private static String findSimilarImages(String line, String cells, int k){ 103 | 104 | List images = new ArrayList(); 105 | Collections.addAll(images, line.split("\t")[1].split(" ")); 106 | 107 | Map similarity = new HashMap(k); 108 | Map similarityCoarser = new HashMap(k); 109 | 110 | boolean flag = false; 111 | Double[] result = new Double[2]; 112 | 113 | // final estimation 114 | for(String image:images){ 115 | if(similarity.size()")[0].equals(cells.split(">")[1])){ 117 | if(deterimCell(image.split(">")[0],cells)){ 118 | similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1])); 119 | }else if(similarityCoarser.size()")[0], Double.parseDouble(image.split(">")[1])); 121 | } 122 | }else { 123 | similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1])); 124 | } 125 | }else{ 126 | flag = true; 127 | result = computeCoordination(similarity); 128 | break; 129 | } 130 | } 131 | 132 | if(similarity.size()>0 && !flag){ 133 | flag = true; 134 | result = computeCoordination(similarity); 135 | }else if(similarityCoarser.size()>0 && !flag){ 136 | flag = true; 137 | result = computeCoordination(similarityCoarser); 138 | } 139 | 140 | // final return 141 | if(flag){ 142 | return result[1] + "\t" + result[0]; 143 | }else{ 144 | return cells.split(">")[0].replace("_", "\t"); 145 | } 146 | } 147 | 148 | /** 149 | * Function that determines if the given point lays inside a define cell. 
150 | * @param point : latitude-longitude pair 151 | * @param cell : grid's cell 152 | * @return a boolean that contain the information 153 | */ 154 | private static boolean deterimCell(String point, String cell){ 155 | 156 | boolean cellID = false; 157 | 158 | Double[] pointLoc = {Double.parseDouble(point.split("_")[0]), Double.parseDouble(point.split("_")[1])}; 159 | Double[] cellLoc = {Double.parseDouble(cell.split("_")[0]), Double.parseDouble(cell.split("_")[1])}; 160 | 161 | if((pointLoc[0]>=(cellLoc[0]-0.0005)) && (pointLoc[0]<=(cellLoc[0]+0.0005)) 162 | &&(pointLoc[1]>=(cellLoc[1]-0.0005)) && (pointLoc[1]<=(cellLoc[1]+0.0005))){ 163 | cellID = true; 164 | } 165 | 166 | return cellID; 167 | } 168 | 169 | /** 170 | * Function that write the result in a file 171 | * @param testFile : file that contains the test image's metadata 172 | * @param outputFile : name of the output file 173 | */ 174 | private void writeResultsInFile(String testFile, String outputFile) { 175 | 176 | EasyBufferedReader reader = new EasyBufferedReader(testFile); 177 | EasyBufferedWriter writer = new EasyBufferedWriter(outputFile); 178 | 179 | String line; 180 | // for every query image 181 | while ((line = reader.readLine())!=null){ 182 | 183 | writer.write(line.split("\t")[0]); 184 | 185 | if(similarities.containsKey(line.split("\t")[0])){ // the location have been estimated 186 | writer.write(line.split("\t")[1] + "\t" + 187 | line.split("\t")[12] + "\t" + line.split("\t")[13] + "\t" + 188 | similarities.get(line.split("\t")[0])); 189 | writer.newLine(); 190 | }else{ // no estimation 191 | writer.write(line.split("\t")[1] + "\t" + 192 | line.split("\t")[12] + "\t" + line.split("\t")[13] 193 | + "\t-73.98282136256299\t40.75282028252674"); 194 | writer.newLine(); 195 | } 196 | } 197 | reader.close(); 198 | writer.close(); 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/methods/TermCellProbs.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.methods; 2 | 3 | import gr.iti.mklab.tools.InterfaceTermCellProb; 4 | import gr.iti.mklab.util.Utils; 5 | import gr.iti.mklab.util.TextUtil; 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.math.BigDecimal; 10 | import java.util.*; 11 | import java.util.Map.Entry; 12 | 13 | import org.apache.commons.io.FileUtils; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.*; 16 | import org.apache.hadoop.mapred.*; 17 | import org.apache.log4j.Logger; 18 | 19 | /** 20 | * Class that calculate the term-cell probabilities for all term in all cells and saves the results in file. 21 | * The implementation employ hadoop map-reduce function. 22 | * @author gkordo 23 | * 24 | */ 25 | public class TermCellProbs implements InterfaceTermCellProb{ 26 | 27 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.TermCellProbCalculator"); 28 | private static Set testIDs; 29 | private static Set users; 30 | private static int scale; 31 | 32 | /** 33 | * Contractor of the class get the set of image IDs and the user IDs of the images in the test set. 
34 | * @param testIDs : set of test image IDs 35 | * @param users : set of test user IDs 36 | */ 37 | public TermCellProbs(Set testIDs, Set users){ 38 | TermCellProbs.testIDs = testIDs; 39 | TermCellProbs.users = users; 40 | } 41 | 42 | /** 43 | * Map class that takes the lines of the train file as input and creates key-value pairs, 44 | * using as keys the terms contained in the images and as values strings that contain 45 | * the information regarding the cell and user ID. 46 | * @author gkordo 47 | * 48 | */ 49 | public static class MapTermCellProb extends MapReduceBase implements Mapper { 50 | 51 | /** 52 | * Required map function 53 | * @param key : key value 54 | * @param value : input string 55 | * @param output : output collector 56 | * @param reporter : reporter of the job 57 | */ 58 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { 59 | 60 | String[] metadata = value.toString().split("\t"); 61 | 62 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set 63 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations 64 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information 65 | 66 | // get image cell based on its latitude-longitude pair 67 | BigDecimal cellLonCenter = new BigDecimal(Double.parseDouble( 68 | metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP); 69 | BigDecimal cellLatCenter = new BigDecimal(Double.parseDouble( 70 | metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP); 71 | 72 | String cellID = cellLonCenter+"_"+cellLatCenter; 73 | 74 | //get image user ID 75 | String userID = metadata[3]; 76 | 77 | // get image tags 78 | Set terms = new HashSet(); 79 | TextUtil.parse(metadata[10], terms); 80 | TextUtil.parse(metadata[8], terms); 81 | 82 | for(String term:terms){ 83 | if(!term.isEmpty() && term.length() > 2){ 84 | output.collect(new Text(term), new Text(cellID+">"+userID)); // key-value pair 85 | } 86 | } 87 | } 88 | } 89 | } 90 | 91 | 92 | /** 93 | * Reduce class that get the key-value pairs and calculate the term-cell probabilities of every term. 
94 | * @author gkordo 95 | * 96 | */ 97 | public static class ReduceTermCellProb extends MapReduceBase implements Reducer { 98 | 99 | /** 100 | * Required reduce function 101 | * @param key : key value 102 | * @param values : set of values that share the same key 103 | * @param output : output collector 104 | * @param reporter : reporter of the job 105 | */ 106 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { 107 | 108 | // frequency map that contains the count of the different users for every single cell 109 | Map> termFreq = new HashMap>(); 110 | int Nt = 0; // total user count 111 | 112 | // process every value that corresponds to a specific key 113 | while (values.hasNext()) { 114 | 115 | String entry = values.next().toString(); 116 | 117 | // retrieve cell ID and user ID from the value of the pair 118 | String cellID = entry.split(">")[0]; 119 | String userID = entry.split(">")[1]; 120 | 121 | // update of the frequency map 122 | if (termFreq.containsKey(cellID)){ 123 | if(!termFreq.get(cellID).contains(userID)){ 124 | Nt++; 125 | termFreq.get(cellID).add(userID); 126 | } 127 | }else{ 128 | Nt++; 129 | termFreq.put(cellID,new HashSet()); 130 | termFreq.get(cellID).add(userID); 131 | } 132 | } 133 | 134 | // calculation of the tag-cell probabilities map for every cell 135 | Map cellsProbs = new HashMap(); 136 | for(Entry> entryCell : termFreq.entrySet()){ 137 | String cellID = entryCell.getKey(); 138 | Double cellProb = ((double)(entryCell.getValue().size()))/Nt; 139 | cellsProbs.put(cellID,cellProb); 140 | } 141 | 142 | // sorting of the tag-cell probabilities map 143 | Map cellsProbsSorted = Utils.sortByValues(cellsProbs); 144 | 145 | // convert tag-cell probabilities map in string in order to be saved in the output file 146 | String out = convertMapToString(cellsProbsSorted,termFreq); 147 | 148 | // send output to collector 149 | output.collect(key, new Text(out)); 150 | } 151 | 152 | /** 153 | * Function that convert tag-cell probabilities map in output string. 154 | * @param cellsProbs : tag-cell probabilities map 155 | * @param termFreq : frequency map 156 | * @return a string contains cell IDs accompanied with tag-cell probabilities 157 | */ 158 | public static String convertMapToString(Map cellsProbs, 159 | Map> termFreq){ 160 | String out = ""; 161 | for(Entry entryCell: cellsProbs.entrySet()){ 162 | if(cellsProbs.get(entryCell.getKey()) >= 0.00001){ 163 | String tempCellIDProb = entryCell.getKey() 164 | + ">" + cellsProbs.get(entryCell.getKey()) 165 | + ">" + termFreq.get(entryCell.getKey()).size(); 166 | 167 | out += (tempCellIDProb + " "); 168 | } 169 | } 170 | return out.trim(); 171 | } 172 | } 173 | 174 | /** 175 | * Core function for the job of tag-cell probabilities calculation. 
176 | * @param dir : directory of the project 177 | * @param trainFolder : the file of the train set 178 | * @param outFolder : the folder where the tag-set probabilities file will be stored 179 | * @param scale : the scale of the grid that is used 180 | */ 181 | public void calculatorTermCellProb(String dir, String trainFolder, 182 | String outFolder, int scale) throws IOException{ 183 | 184 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t" 185 | + "Status: INITIALIZE"); 186 | 187 | TermCellProbs.scale = scale; 188 | 189 | // initialize Job 190 | JobConf conf = new JobConf(TermCellProbs.class); 191 | conf.setJobName("termcellprobmapred"); 192 | 193 | conf.setOutputKeyClass(Text.class); 194 | conf.setOutputValueClass(Text.class); 195 | 196 | conf.setMapperClass(MapTermCellProb.class); 197 | conf.setReducerClass(ReduceTermCellProb.class); 198 | 199 | conf.setInputFormat(TextInputFormat.class); 200 | conf.setOutputFormat(TextOutputFormat.class); 201 | 202 | // clean the output file directory 203 | File folder = new File(dir + outFolder); 204 | if (folder.exists()) { 205 | FileUtils.cleanDirectory(folder); 206 | FileUtils.forceDelete(folder); 207 | } 208 | 209 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder)); 210 | FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder)); 211 | 212 | // start Job 213 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t" 214 | + "Status: STARTED"); 215 | long startTime = System.currentTimeMillis(); 216 | JobClient.runJob(conf); 217 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t" 218 | + "Status: COMPLETED\t|\tTotal time: " + 219 | (System.currentTimeMillis()-startTime)/60000.0+"m"); 220 | 221 | new File(dir + outFolder + "/part-00000").renameTo( 222 | new File(dir + outFolder + "/term_cell_probs")); // rename the output file 223 | } 224 | } -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/metrics/Entropy.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.metrics; 2 | 3 | import gr.iti.mklab.util.EasyBufferedReader; 4 | import gr.iti.mklab.util.EasyBufferedWriter; 5 | import gr.iti.mklab.util.Utils; 6 | 7 | import java.io.File; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.commons.math3.distribution.NormalDistribution; 13 | import org.apache.commons.math3.stat.descriptive.moment.Mean; 14 | import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation; 15 | import org.apache.log4j.Logger; 16 | 17 | /** 18 | * Entropy class update the file that contains the tag-cell probabilities with the spatial entropy of every individual tag. 19 | * Calculate the spatial tag entropy for all of the tags. Entropy is used for feature weighting. 
20 | * @author gkordo 21 | * 22 | */ 23 | public class Entropy { 24 | 25 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.Entropy"); 26 | 27 | /** 28 | * Calculate the Spatial Entropy weights of the LM terms 29 | * @param dir : project directory 30 | * @param fileTermCell : Term-Cell probability file 31 | */ 32 | public static void calculateEntropyWeights(String dir, String fileTermCell){ 33 | 34 | logger.info("Process: Spatial Entropy weights calculation\t|\t" 35 | + "Status: INITIALIZE"); 36 | 37 | new File(dir + "Weights").mkdir(); 38 | 39 | // Term Spatial Entropy calculation 40 | EasyBufferedReader reader = new EasyBufferedReader(dir + fileTermCell); 41 | Map termSpatialEntropy = new HashMap(); 42 | long sTime = System.currentTimeMillis(); 43 | String line; 44 | while ((line=reader.readLine())!=null){ 45 | String term = line.split("\t")[0]; 46 | String[] cells = line.split("\t")[1].split(" "); 47 | if(cells.length > 1 48 | && term.length() > 3){ 49 | termSpatialEntropy.put(term, 50 | computeEntropyNaive(cells)); 51 | } 52 | } 53 | reader.close(); 54 | 55 | logger.info("Process: Spatial Entropy weights calculation\t|\t" 56 | + "Status: STARTED"); 57 | 58 | // Spatial Entropy weights calculation of terms 59 | Map weights = calculateSpatialEntropyWeights(termSpatialEntropy); 60 | 61 | // store weights 62 | EasyBufferedWriter writer = new EasyBufferedWriter( 63 | dir + "Weights/spatial_entropy_weights"); 64 | for(Entry term:weights.entrySet()){ 65 | writer.write(term.getKey() + "\t" + term.getValue()); 66 | writer.newLine(); 67 | } 68 | 69 | logger.info("Process: Spatial Entropy weights calculation\t|\t" 70 | + "Status: COMPLETED\t|\tTotal time: " + 71 | (System.currentTimeMillis()-sTime)/1000.0 + "s"); 72 | writer.close(); 73 | } 74 | 75 | /** 76 | * Shannon entropy formula 77 | * @param probabilities : probability distribution 78 | * @return 79 | */ 80 | private static double computeEntropyNaive(String[] probabilities) { 81 | double entropy = 0.0; 82 | for (int i=0;i< probabilities.length;i++) { 83 | double p = Double.parseDouble(probabilities[i].split(">")[1]); 84 | if(p != 0.0){ 85 | entropy -= p * Math.log(p); 86 | } 87 | } 88 | return entropy; 89 | } 90 | 91 | /** 92 | * Calculate the max probability value applying the Gaussian functionon the 93 | * probability distribution 94 | * @param entropies : spatial entropy values of the terms 95 | * @return max weight 96 | */ 97 | private static Map calculateSpatialEntropyWeights( 98 | Map entropies){ 99 | 100 | double[] termSpatialEntropyValues = entropies 101 | .values().stream().mapToDouble(d -> d).toArray(); 102 | 103 | NormalDistribution gd = new NormalDistribution( // Gaussian function for re-weighting 104 | new Mean().evaluate(termSpatialEntropyValues), 105 | new StandardDeviation().evaluate(termSpatialEntropyValues)); 106 | 107 | Double gdMax = 0.0; 108 | Map weights = new HashMap(); 109 | for(Entry p:entropies.entrySet()){ 110 | double weight = gd.density(p.getValue()); 111 | weights.put(p.getKey(), weight); 112 | if(gdMax < weight){ 113 | gdMax = weight; 114 | } 115 | } 116 | 117 | for(Entry term:weights.entrySet()){ 118 | term.setValue(term.getValue()/gdMax); 119 | } 120 | 121 | return Utils.sortByValues(weights); 122 | } 123 | } -------------------------------------------------------------------------------- /src/main/java/gr/iti/mklab/metrics/Locality.java: -------------------------------------------------------------------------------- 1 | package gr.iti.mklab.metrics; 2 | 3 | import 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/metrics/Locality.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.metrics;
2 | 
3 | import gr.iti.mklab.tools.DataManager;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.EasyBufferedWriter;
6 | import gr.iti.mklab.util.TextUtil;
7 | import gr.iti.mklab.util.Utils;
8 | 
9 | import java.io.File;
10 | import java.io.IOException;
11 | import java.math.BigDecimal;
12 | import java.util.HashMap;
13 | import java.util.HashSet;
14 | import java.util.Iterator;
15 | import java.util.Map;
16 | import java.util.Map.Entry;
17 | import java.util.Set;
18 | 
19 | import org.apache.commons.io.FileUtils;
20 | import org.apache.hadoop.fs.Path;
21 | import org.apache.hadoop.io.LongWritable;
22 | import org.apache.hadoop.io.Text;
23 | import org.apache.hadoop.mapred.FileInputFormat;
24 | import org.apache.hadoop.mapred.FileOutputFormat;
25 | import org.apache.hadoop.mapred.JobClient;
26 | import org.apache.hadoop.mapred.JobConf;
27 | import org.apache.hadoop.mapred.MapReduceBase;
28 | import org.apache.hadoop.mapred.Mapper;
29 | import org.apache.hadoop.mapred.OutputCollector;
30 | import org.apache.hadoop.mapred.Reducer;
31 | import org.apache.hadoop.mapred.Reporter;
32 | import org.apache.hadoop.mapred.TextInputFormat;
33 | import org.apache.hadoop.mapred.TextOutputFormat;
34 | import org.apache.log4j.Logger;
35 | 
36 | /**
37 |  * Class that calculates the locality of the terms and saves the results in a file.
38 |  * The implementation employs the Hadoop map-reduce framework.
39 |  * @author gkordo
40 |  *
41 |  */
42 | public class Locality {
43 | 
44 | 	private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.Locality");
45 | 	private static Set<String> testIDs;
46 | 	private static Set<String> users;
47 | 	private static int scale;
48 | 
49 | 	public Locality(String testFile, int scale){
50 | 		testIDs = DataManager.getSetOfImageIDs(testFile);
51 | 		users = DataManager.getSetOfUserID(testFile);
52 | 		Locality.scale = scale;
53 | 	}
54 | 
55 | 	/**
56 | 	 * Map class that takes the lines of the train file as input and creates key-value pairs,
57 | 	 * using as keys the tags contained in the images and as values strings that contain
58 | 	 * the information regarding the cell and user ID.
59 | 	 * @author gkordo
60 | 	 *
61 | 	 */
62 | 	public static class MapLocality extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
63 | 
64 | 		/**
65 | 		 * Required map function
66 | 		 * @param key : key value
67 | 		 * @param value : input string
68 | 		 * @param output : output collector
69 | 		 * @param reporter : reporter of the job
70 | 		 */
71 | 		public void map(LongWritable key, Text value,
72 | 				OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
73 | 
74 | 			String[] metadata = value.toString().split("\t");
75 | 
76 | 			if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
77 | 					&& !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinates
78 | 					&& (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
79 | 
80 | 				BigDecimal tmpLonCenter = new BigDecimal(
81 | 						Double.parseDouble(metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP);
82 | 				BigDecimal tmpLatCenter = new BigDecimal(
83 | 						Double.parseDouble(metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP);
84 | 
85 | 				//get image user ID
86 | 				String userID = metadata[3];
87 | 
88 | 				// get image tags
89 | 				Set<String> terms = new HashSet<String>();
90 | 				TextUtil.parse(metadata[10], terms);
91 | 				TextUtil.parse(metadata[8], terms);
92 | 
93 | 				// send key-value pairs
94 | 				for(String term:terms) {
95 | 					if(!term.isEmpty() && term.length() > 2){
96 | 						for(int j=-2;j<2;j++){
97 | 							for(int k=-2;k<2;k++){
98 | 								output.collect(new Text(term), new Text(userID + ">" +
99 | 										(tmpLonCenter.doubleValue()+((j)*0.01)) + "_" +
100 | 										(tmpLatCenter.doubleValue()+((k)*0.01))));
101 | 							}
102 | 						}
103 | 					}
104 | 				}
105 | 			}
106 | 		}
107 | 	}
108 | 
109 | 	/**
110 | 	 * Reduce class that gets the key-value pairs and calculates the locality of every term.
111 | 	 * @author gkordo
112 | 	 *
113 | 	 */
114 | 	public static class ReduceLocality extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
115 | 
116 | 		/**
117 | 		 * Required reduce function
118 | 		 * @param key : key value
119 | 		 * @param values : set of values that share the same key
120 | 		 * @param output : output collector
121 | 		 * @param reporter : reporter of the job
122 | 		 */
123 | 		public void reduce(Text key, Iterator<Text> values,
124 | 				OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
125 | 
126 | 			// map of cells that contains the count of the different users for every single cell
127 | 			Map<String, Set<String>> cells = new HashMap<String, Set<String>>();
128 | 			int Nt = 0; // total user count
129 | 
130 | 			while (values.hasNext()) {
131 | 
132 | 				String value = values.next().toString();
133 | 
134 | 				// retrieve cell ID and user ID from the value of the pair
135 | 				String user = value.split(">")[0];
136 | 				String cell = value.split(">")[1];
137 | 
138 | 				// update of the frequency map
139 | 				if(cells.containsKey(cell)){
140 | 					if(!cells.get(cell).contains(user)){
141 | 						cells.get(cell).add(user);
142 | 						Nt++;
143 | 					}
144 | 				}else{
145 | 					cells.put(cell,new HashSet<String>());
146 | 					cells.get(cell).add(user);
147 | 					Nt++;
148 | 				}
149 | 			}
150 | 
151 | 			// locality calculation
152 | 			double locality = 0.0;
153 | 			for(Entry<String, Set<String>> entry : cells.entrySet()){
154 | 				int v = entry.getValue().size();
155 | 				locality += (double) v * (v - 1) / Nt;
156 | 			}
157 | 
158 | 			// send output to collector
159 | 			if(locality > 0.0){
160 | 				output.collect(key, new Text(locality + ""));
161 | 			}
162 | 		}
163 | 	}
164 | 
165 | 
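	// Illustrative reading of the reducer above (added for clarity, not used by
	// the job): a term's locality sums v*(v-1)/Nt over cells, where v is the
	// number of distinct users of the term in a cell and Nt the total distinct-user
	// count. E.g. counts {5, 1, 1} with Nt = 7 give 20/7 ≈ 2.86, while the same
	// seven users spread as {3, 2, 2} give only 10/7 ≈ 1.43 — clustered terms score higher.
	private static double exampleLocality(int[] usersPerCell, int Nt) {
		double locality = 0.0;
		for (int v : usersPerCell) {
			locality += (double) v * (v - 1) / Nt;
		}
		return locality;
	}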
166 | 	/**
167 | 	 * Core function for the job of term locality calculation.
168 | 	 * @param dir : project directory
169 | 	 * @param trainFolder : the file of the train set
170 | 	 * @throws IOException : file not found
171 | 	 */
172 | 	public void calculateLocality(String dir, String trainFolder) throws IOException{
173 | 
174 | 		logger.info("Process: Locality weight calculation\t|\t"
175 | 				+ "Status: INITIALIZE");
176 | 		JobConf conf = new JobConf(Locality.class);
177 | 		conf.setJobName("Locality");
178 | 
179 | 		conf.setOutputKeyClass(Text.class);
180 | 		conf.setOutputValueClass(Text.class);
181 | 
182 | 		conf.setMapperClass(MapLocality.class);
183 | 		conf.setReducerClass(ReduceLocality.class);
184 | 
185 | 		conf.setInputFormat(TextInputFormat.class);
186 | 		conf.setOutputFormat(TextOutputFormat.class);
187 | 
188 | 		// clean the output file directory
189 | 		File folder = new File(dir + "temp/locality");
190 | 		if (folder.exists()) {
191 | 			FileUtils.cleanDirectory(folder);
192 | 			FileUtils.forceDelete(folder);
193 | 		}
194 | 
195 | 		FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
196 | 		FileOutputFormat.setOutputPath(conf, new Path(dir + "temp/locality"));
197 | 
198 | 		logger.info("Process: Locality weight calculation\t|\t"
199 | 				+ "Status: STARTED");
200 | 		long startTime = System.currentTimeMillis();
201 | 		JobClient.runJob(conf);
202 | 
203 | 		sortAndStore(dir + "temp/locality/part-00000",
204 | 				dir + "Weights/locality_weights");
205 | 
206 | 		logger.info("Process: Locality weight calculation\t|\t"
207 | 				+ "Status: COMPLETED\t|\tTotal time: " +
208 | 				(System.currentTimeMillis()-startTime)/60000.0+"m");
209 | 	}
210 | 
211 | 	/**
212 | 	 * Sort terms based on their locality values and calculate weights.
213 | 	 * The locality weights of the terms are stored in the given file.
214 | 	 * @param inFile : file of the locality values of the terms
215 | 	 * @param outFile : output file
216 | 	 */
217 | 	private void sortAndStore(String inFile, String outFile){
218 | 
219 | 		// load locality values
220 | 		EasyBufferedReader reader = new EasyBufferedReader(inFile);
221 | 		Map<String, Double> termLocalityValues = new HashMap<String, Double>();
222 | 		String line;
223 | 		while ((line = reader.readLine())!=null){
224 | 			String term = line.split("\t")[0];
225 | 			double locality = Double.parseDouble(line.split("\t")[1]);
226 | 			termLocalityValues.put(term, locality);
227 | 		}
228 | 		reader.close();
229 | 
230 | 		// sort and store weights
231 | 		termLocalityValues = Utils.sortByValues(termLocalityValues);
232 | 		EasyBufferedWriter writer = new EasyBufferedWriter(outFile);
233 | 		int i = 0, totalTerms = termLocalityValues.size();
234 | 		for(Entry<String, Double> entry : termLocalityValues.entrySet()){
235 | 			writer.write(entry.getKey()+"\t"+(double)(totalTerms-i)/totalTerms);
236 | 			writer.newLine();
237 | 			i++;
238 | 		}
239 | 		writer.close();
240 | 	}
241 | }
242 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/AmbiguityBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.HashSet;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Set;
10 | 
11 | import org.apache.log4j.Logger;
12 | 
13 | import gr.iti.mklab.util.EasyBufferedReader;
14 | import gr.iti.mklab.util.EasyBufferedWriter;
15 | import gr.iti.mklab.util.Utils;
16 | 
17 | @SuppressWarnings("unchecked")
18 | public class AmbiguityBasedSampling extends Sampling{
19 | 
20 | 	private static Logger logger = Logger.getLogger(
21 | 			"gr.iti.mklab.eval.AmbiguityBasedSampling");
22 | 
23 | 	public static Object sample(String testFile) throws Exception{
24 | 
25 | 		logger.info("Sampling: Ambiguity-based Strategy");
26 | 
27 | 		AmbiguityBasedSampling sampling =
28 | 				new AmbiguityBasedSampling();
29 | 
30 | 		return sampling.writeInFile(sampling.loadData(testFile));
31 | 	}
32 | 
33 | 	protected Object loadData(String testFile) {
34 | 
35 | 		Map<String, Double> ambiguous =
36 | 				computeCityEntropies(loadOccurrences(testFile));
37 | 		logger.info(ambiguous.size() + " Towns loaded");
38 | 
39 | 		Map<String, Boolean> images = new
40 | 				HashMap<String, Boolean>();
41 | 		double median = Utils.medianItemDouble(ambiguous);
42 | 
43 | 		EasyBufferedReader reader =
44 | 				new EasyBufferedReader(testFile);
45 | 		String line;
46 | 		while((line = reader.readLine())!=null){
47 | 			String imageID = line.split("\t")[0];
48 | 			for(String place:line.split("\t")[1].split(",")){
49 | 				if(place.split(":").length>2
50 | 						&& place.split(":")[2].contains("Town")){
51 | 					if(ambiguous.containsKey(place.split(":")[1]) &&
52 | 							ambiguous.get(place.split(":")[1])>median){
53 | 						images.put(imageID, true);
54 | 					}else{
55 | 						images.put(imageID, false);
56 | 					}
57 | 				}
58 | 			}
59 | 		}
60 | 		reader.close();
61 | 		return images;
62 | 	}
63 | 
64 | 	protected Object writeInFile(Object data) {
65 | 
66 | 		Map<String, Boolean> images =
67 | 				(Map<String, Boolean>) data;
68 | 
69 | 		Map<Boolean, Set<String>> respond = new
70 | 				HashMap<Boolean, Set<String>>();
71 | 
72 | 		respond.put(true, new HashSet<String>());
73 | 		respond.put(false, new HashSet<String>());
74 | 
75 | 		EasyBufferedWriter writerA = new EasyBufferedWriter(
76 | 				"samples/ambiguous_sampling.txt");
77 | 		EasyBufferedWriter writerN = new EasyBufferedWriter(
78 | 				"samples/non_ambiguous_sampling.txt");
79 | 		for(Entry<String, Boolean> image:images.entrySet()){
80 | 			respond.get(image.getValue()).add(image.getKey());
81 | 			if(image.getValue()){
82 | 				writerA.write(image.getKey());
83 | 				writerA.newLine();
84 | 			}else{
85 | 				writerN.write(image.getKey());
86 | 				writerN.newLine();
87 | 			}
88 | 		}
89 | 		writerA.close();
90 | 		writerN.close();
91 | 
92 | 		return respond;
93 | 	}
94 | 
95 | 	private static double computeEntropyNaive(
96 | 			final List<Double> probabilities, int total) {
97 | 		double entropy = 0.0;
98 | 		for (Double p:probabilities) {
99 | 			p /= total;
100 | 			if(p!=0.0){
101 | 				entropy -= p * Math.log(p);
102 | 			}
103 | 		}
104 | 		return entropy;
105 | 	}
106 | 
107 | 	private static Map<String, Double> computeCityEntropies(
108 | 			Map<String, Map<String, Integer>> townNames) {
109 | 		Map<String, Double> ambiguous = new HashMap<String, Double>();
110 | 
111 | 		for(Entry<String, Map<String, Integer>> town:townNames.entrySet()){
112 | 			List<Double> p = new ArrayList<Double>();
113 | 			int total = 0;
114 | 			for(Entry<String, Integer> code:town.getValue().entrySet()){
115 | 				p.add((double) code.getValue());
116 | 				total += code.getValue();
117 | 			}
118 | 			double entropy = computeEntropyNaive(p, total);
119 | 			if(entropy > 0.0)
120 | 				ambiguous.put(town.getKey(), entropy);
121 | 		}
122 | 
123 | 		return ambiguous;
124 | 	}
125 | 
126 | 	private static Map<String, Map<String, Integer>>
127 | 	loadOccurrences(String testFile) {
128 | 
129 | 		Map<String, Map<String, Integer>> townNames =
130 | 				new HashMap<String, Map<String, Integer>>();
131 | 
132 | 		EasyBufferedReader reader =
133 | 				new EasyBufferedReader(testFile);
134 | 		String line;
135 | 		while((line = reader.readLine())!=null){
136 | 			for(String place:line.split("\t")[1].split(",")){
137 | 				if(place.split(":").length>2 && place.split(":")[2].contains("Town")){
138 | 					String townCode = place.split(":")[0];
139 | 					String townName = place.split(":")[1];
140 | 
141 | 					if(townNames.containsKey(townName)){
142 | 						if(townNames.get(townName).containsKey(townCode)){
143 | 							townNames.get(townName).put(townCode,
144 | 									townNames.get(townName).get(townCode) + 1);
145 | 						}else{
146 | 							townNames.get(townName).put(townCode, 1);
147 | 						}
148 | 					}else{
149 | 						townNames.put(townName, new HashMap<String, Integer>());
150 | 						townNames.get(townName).put(townCode, 1);
151 | 					}
152 | 				}
153 | 			}
154 | 		}
155 | 		reader.close();
156 | 
157 | 		return townNames;
158 | 	}
159 | }
160 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/BuildingSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.HashSet;
4 | import java.util.Set;
5 | 
6 | import org.apache.log4j.Logger;
7 | 
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.EasyBufferedWriter;
10 | 
11 | @SuppressWarnings("unchecked")
12 | public class BuildingSampling extends Sampling{
13 | 
14 | 	private static Logger logger = Logger.getLogger(
15 | 			"gr.iti.mklab.eval.BuildingSampling");
16 | 
17 | 	public static Object sample(String testFile) throws Exception{
18 | 
19 | 		logger.info("Sampling: Building Strategy");
20 | 
21 | 		BuildingSampling sampling =
22 | 				new BuildingSampling();
23 | 
24 | 		return sampling.writeInFile(sampling.loadData(testFile));
25 | 	}
26 | 
27 | 	protected Object loadData(String testFile) {
28 | 
29 | 		Set<String> buildingConcepts = new HashSet<String>();
30 | 
31 | 		EasyBufferedReader reader =
32 | 				new EasyBufferedReader("samples/building_concepts.txt");
33 | 		String line;
34 | 		while((line = reader.readLine())!=null){
35 | 			buildingConcepts.add(line);
36 | 		}
37 | 		reader.close();
38 | 
39 | 		Set<String> buildingImages = new HashSet<String>();
40 | 		reader = new EasyBufferedReader(testFile);
41 | 		while((line = reader.readLine())!=null){
42 | 			String imageID = line.split("\t")[0];
43 | 			for(String concept:line.split("\t")[1].split(",")){
44 | 				if(buildingConcepts.contains(concept.split(":")[0])){
45 | 					buildingImages.add(imageID);
46 | 				}
47 | 			}
48 | 		}
49 | 		reader.close();
50 | 		logger.info(buildingImages.size() + " Building Images loaded");
51 | 
52 | 		return buildingImages;
53 | 	}
54 | 
55 | 	protected Object writeInFile(Object data) {
56 | 
57 | 		Set<String> buildingImages = (Set<String>) data;
58 | 
59 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
60 | 				"samples/building_sampling.txt");
61 | 		for(String image:buildingImages){
62 | 			writer.write(image + "\t");
63 | 			writer.newLine();
64 | 		}
65 | 		writer.close();
66 | 
67 | 		return buildingImages;
68 | 	}
69 | }
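Every class in this package exposes the same static sample(file) entry point, which writes the selected image IDs under samples/ and returns the collection for evaluation. A hedged sketch of a typical call (the input path is a placeholder, not part of the project):

    // Illustrative driver; the unchecked cast mirrors how Evaluation consumes the Object result.
    Set<String> building = (Set<String>) BuildingSampling.sample("samples/my_concepts.txt");
    System.out.println(building.size() + " IDs written to samples/building_sampling.txt");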
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/GeographicalUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.math.BigDecimal;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.HashMap;
7 | import java.util.HashSet;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 | import java.util.Set;
12 | 
13 | import org.apache.log4j.Logger;
14 | 
15 | import gr.iti.mklab.util.EasyBufferedReader;
16 | import gr.iti.mklab.util.EasyBufferedWriter;
17 | import gr.iti.mklab.util.Utils;
18 | 
19 | @SuppressWarnings("unchecked")
20 | public class GeographicalUniformSampling extends Sampling {
21 | 
22 | 	private static Logger logger = Logger.getLogger(
23 | 			"gr.iti.mklab.eval.GeographicalUniformSampling");
24 | 
25 | 	public static Object sample(String testFile) throws Exception{
26 | 
27 | 		logger.info("Sampling: Geographical Uniform Strategy");
28 | 
29 | 		GeographicalUniformSampling sampling =
30 | 				new GeographicalUniformSampling();
31 | 
32 | 		return sampling.writeInFile(sampling.loadData(testFile));
33 | 	}
34 | 
35 | 	protected Object loadData(String testFile) {
36 | 
37 | 		Map<String, Set<String>> cells =
38 | 				new HashMap<String, Set<String>>();
39 | 
40 | 		EasyBufferedReader reader =
41 | 				new EasyBufferedReader(testFile);
42 | 		String line;
43 | 		while((line = reader.readLine())!=null){
44 | 
45 | 			BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
46 | 					line.split("\t")[12])).setScale(1, BigDecimal.ROUND_HALF_UP);
47 | 			BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
48 | 					line.split("\t")[13])).setScale(1, BigDecimal.ROUND_HALF_UP);
49 | 
50 | 			String cell = tmpLatCenter + " " + tmpLonCenter;
51 | 			if(cells.containsKey(cell)){
52 | 				cells.get(cell).add(line.split("\t")[1]);
53 | 			}else{
54 | 				cells.put(cell, new HashSet<String>());
55 | 				cells.get(cell).add(line.split("\t")[1]);
56 | 			}
57 | 		}
58 | 		reader.close();
59 | 		logger.info(cells.size() + " Cells loaded");
60 | 
61 | 		return cells;
62 | 	}
63 | 
64 | 	protected Object writeInFile(Object data) {
65 | 
66 | 		Map<String, Set<String>> cells = (Map<String, Set<String>>) data;
67 | 
68 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
69 | 				"samples/geographical_uniform_sampling.txt");
70 | 
71 | 		int median = Utils.medianSet(cells);
72 | 
73 | 		Set<String> respond = new HashSet<String>();
74 | 
75 | 		for(Entry<String, Set<String>> cell:cells.entrySet()){
76 | 			List<String> images =
77 | 					new ArrayList<String>(cell.getValue());
78 | 			Collections.shuffle(images);
79 | 
80 | 			for(int i=0;i<median && i<images.size();i++){
81 | 				respond.add(images.get(i));
82 | 				writer.write(images.get(i));
83 | 				writer.newLine();
84 | 			}
85 | 		}
86 | 		writer.close();
87 | 
88 | 		return respond;
89 | 	}
90 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/GeographicallyFocusedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 | import java.util.Set;
8 | 
9 | import org.apache.log4j.Logger;
10 | 
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | 
14 | @SuppressWarnings("unchecked")
15 | public class GeographicallyFocusedSampling extends Sampling {
16 | 
17 | 	private static Logger logger = Logger.getLogger(
18 | 			"gr.iti.mklab.eval.GeographicallyFocusedSampling");
19 | 
20 | 	public static Object sample(String testFile) throws Exception{
21 | 
22 | 		logger.info("Sampling: Geographically Focused Strategy");
23 | 
24 | 		GeographicallyFocusedSampling sampling =
25 | 				new GeographicallyFocusedSampling();
26 | 
27 | 		return sampling.writeInFile(sampling.loadData(testFile));
28 | 	}
29 | 
30 | 	protected Object loadData(String testFile) {
31 | 
32 | 		Map<String, Map<String, Set<String>>> places =
33 | 				new HashMap<String, Map<String, Set<String>>>();
34 | 
35 | 		places.put("continents", new HashMap<String, Set<String>>());
36 | 		places.put("countries", new HashMap<String, Set<String>>());
37 | 
38 | 		EasyBufferedReader reader =
39 | 				new EasyBufferedReader(testFile);
40 | 		String line;
41 | 		while((line = reader.readLine())!=null){
42 | 			String imageID = line.split("\t")[0];
43 | 			for(String place:line.split("\t")[1].split(",")){
44 | 				if(place.split(":").length>2 && place.contains("Timezone")){
45 | 					String continent = place.split(":")[1].split("%")[0];
46 | 
47 | 					switch(continent) {
48 | 					case "Pacific" :
49 | 						continent = "America";
50 | 						break;
51 | 					case "Atlantic" :
52 | 						continent = "America";
53 | 						break;
54 | 					case "Indian" :
55 | 						continent = "Asia";
56 | 						break;
57 | 					}
58 | 					if(places.get("continents").containsKey(continent)){
59 | 						places.get("continents").get(continent).add(imageID);
60 | 					}else{
61 | 						places.get("continents").put(continent, new HashSet<String>());
62 | 						places.get("continents").get(continent).add(imageID);
63 | 					}
64 | 				}
65 | 
66 | 				if(place.split(":").length>2 && place.contains("Country")){
67 | 					String country = place.split(":")[1].split("%")[0];
68 | 					if(places.get("countries").containsKey(country)){
69 | 						places.get("countries").get(country).add(imageID);
70 | 					}else{
71 | 						places.get("countries").put(country, new HashSet<String>());
72 | 						places.get("countries").get(country).add(imageID);
73 | 					}
74 | 				}
75 | 			}
76 | 		}
77 | 		reader.close();
78 | 		logger.info(places.get("continents").size() + " Continents loaded");
79 | 		logger.info(places.get("countries").size() + " Countries loaded");
80 | 
81 | 		return places;
82 | 	}
83 | 
84 | 	protected Object writeInFile(Object data) {
85 | 
86 | 		Map<String, Map<String, Set<String>>> places =
87 | 				(Map<String, Map<String, Set<String>>>) data;
88 | 
89 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
90 | 				"samples/geographically_focused_sampling_continents.txt");
91 | 		for(Entry<String, Set<String>> continent:places.get("continents").entrySet()){
92 | 			writer.write(continent.getKey() + "\t");
93 | 			for(String images:continent.getValue()){
94 | 				writer.write(images + " ");
95 | 			}
96 | 			writer.newLine();
97 | 		}
98 | 		writer.close();
99 | 
100 | 		writer = new EasyBufferedWriter(
101 | 				"samples/geographically_focused_sampling_countries.txt");
102 | 
103 | 		for(Entry<String, Set<String>> country:places.get("countries").entrySet()){
104 | 			writer.write(country.getKey() + "\t");
105 | 			for(String images:country.getValue()){
106 | 				writer.write(images + " ");
107 | 			}
108 | 			writer.newLine();
109 | 		}
110 | 		writer.close();
111 | 
112 | 		return places;
113 | 	}
114 | }
115 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/Sampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | public abstract class Sampling {
4 | 
5 | 	protected abstract Object loadData(String testFile);
6 | 
7 | 	protected abstract Object writeInFile(Object data);
8 | }
9 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 | import java.util.Set;
8 | 
9 | import org.apache.log4j.Logger;
10 | 
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | import gr.iti.mklab.util.Utils;
14 | 
15 | @SuppressWarnings("unchecked")
16 | public class TextBasedSampling extends Sampling {
17 | 
18 | 	private static Logger logger = Logger.getLogger(
19 | 			"gr.iti.mklab.eval.TextBasedSampling");
20 | 
21 | 	public static Object sample(String testFile) throws Exception{
22 | 
23 | 		logger.info("Sampling: Text-based Strategy");
24 | 
25 | 		TextBasedSampling sampling = new TextBasedSampling();
26 | 
27 | 		return sampling.writeInFile(sampling.loadData(testFile));
28 | 	}
29 | 
30 | 	protected Object loadData(String testFile) {
31 | 
32 | 		Map<String, Integer> images =
33 | 				new HashMap<String, Integer>();
34 | 
35 | 		EasyBufferedReader reader =
36 | 				new EasyBufferedReader(testFile);
37 | 		String line;
38 | 		while((line = reader.readLine())!=null){
39 | 			int tags = (!line.split("\t")[10].isEmpty()
40 | 					?line.split("\t")[10].split(",").length:0);
41 | 			int title = (!line.split("\t")[8].isEmpty()
42 | 					?line.split("\t")[8].split("\\+").length:0);
43 | 
44 | 			images.put(line.split("\t")[1], tags+title);
45 | 		}
46 | 		reader.close();
47 | 		logger.info(images.size() + " Images loaded");
48 | 
49 | 		return images;
50 | 	}
51 | 
52 | 	protected Object writeInFile(Object data) {
53 | 
54 | 		Map<String, Integer> images =
55 | 				(Map<String, Integer>) data;
56 | 
57 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
58 | 				"samples/text_based_sampling.txt");
59 | 
60 | 		Set<String> respond = new HashSet<String>();
61 | 
62 | 		int median = Utils.medianItemInt(images);
63 | 
64 | 		for(Entry<String, Integer> image:images.entrySet()){
65 | 			if(image.getValue() >= median){
66 | 				respond.add(image.getKey());
67 | 				writer.write(image.getKey());
68 | 				writer.newLine();
69 | 			}
70 | 		}
71 | 		writer.close();
72 | 
73 | 		return respond;
74 | 	}
75 | }
76 | 
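The text-based strategy above keeps exactly the images whose combined tag and title term count reaches the median of the collection; a small illustration with invented counts:

    Map<String, Integer> counts = new HashMap<String, Integer>();
    counts.put("im1", 2); counts.put("im2", 7); counts.put("im3", 4);
    int median = Utils.medianItemInt(counts); // 4 for the counts above
    Set<String> kept = new HashSet<String>();
    for (Map.Entry<String, Integer> e : counts.entrySet())
        if (e.getValue() >= median) kept.add(e.getKey()); // im2 and im3 survive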
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextDiversitySampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 | import java.util.stream.Collectors;
12 | import java.util.stream.IntStream;
13 | 
14 | import org.apache.log4j.Logger;
15 | 
16 | import gr.iti.mklab.util.EasyBufferedReader;
17 | import gr.iti.mklab.util.EasyBufferedWriter;
18 | import info.debatty.java.lsh.MinHash;
19 | 
20 | @SuppressWarnings("unchecked")
21 | public class TextDiversitySampling extends Sampling {
22 | 
23 | 	private static Logger logger = Logger.getLogger(
24 | 			"gr.iti.mklab.eval.TextDiversitySampling");
25 | 
26 | 	public static Object sample(String testFile) throws Exception{
27 | 
28 | 		logger.info("Sampling: Text Diversity Strategy");
29 | 
30 | 		TextDiversitySampling sampling = new TextDiversitySampling();
31 | 
32 | 		return sampling.writeInFile(sampling.loadData(testFile));
33 | 	}
34 | 
35 | 	protected Object loadData(String testFile) {
36 | 
37 | 		Map<List<Integer>, List<String>> buckets =
38 | 				new HashMap<List<Integer>, List<String>>();
39 | 		Map<String, Integer> tags =
40 | 				new HashMap<String, Integer>();
41 | 		int n = 510914;
42 | 		MinHash mh = new MinHash(0.1, n);
43 | 
44 | 		EasyBufferedReader reader =
45 | 				new EasyBufferedReader(testFile);
46 | 		String line;
47 | 		while((line = reader.readLine())!=null){
48 | 			String imageID = line.split("\t")[1];
49 | 			String imageTags = line.split("\t")[10];
50 | 			boolean[] vector = new boolean[n];
51 | 
52 | 			for(String tag:imageTags.split(",")){
53 | 				if(!tags.containsKey(tag)){
54 | 					tags.put(tag, tags.size());
55 | 				}
56 | 				vector[tags.get(tag)] = true;
57 | 			}
58 | 
59 | 			List<Integer> hash = IntStream.of(mh.signature(vector))
60 | 					.boxed().collect(Collectors.toList());
61 | 			if(buckets.containsKey(hash)){
62 | 				buckets.get(hash).add(imageID);
63 | 			}else{
64 | 				buckets.put(hash, new ArrayList<String>());
65 | 				buckets.get(hash).add(imageID);
66 | 			}
67 | 		}
68 | 		reader.close();
69 | 		logger.info(buckets.size() + " Buckets created");
70 | 
71 | 		return buckets;
72 | 	}
73 | 
74 | 	protected Object writeInFile(Object data) {
75 | 
76 | 		Map<List<Integer>, List<String>> buckets =
77 | 				(Map<List<Integer>, List<String>>) data;
78 | 
79 | 		Set<String> respond = new HashSet<String>();
80 | 
81 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
82 | 				"samples/text_diversity_sampling.txt");
83 | 
84 | 		for(Entry<List<Integer>, List<String>> bucket
85 | 				:buckets.entrySet()){
86 | 			List<String> images = bucket.getValue();
87 | 			Collections.shuffle(images);
88 | 
89 | 			respond.add(images.get(0));
90 | 			writer.write(images.get(0));
91 | 			writer.newLine();
92 | 		}
93 | 		writer.close();
94 | 
95 | 		return respond;
96 | 	}
97 | 
98 | }
99 | 
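The bucketing above exploits the MinHash property that two tag sets receive identical signatures with probability close to their Jaccard similarity, so drawing one image per bucket favours textually diverse items. A toy sketch with the same library (the 10-tag dictionary and tag indices are made up):

    import info.debatty.java.lsh.MinHash;
    import java.util.Arrays;

    public class MinHashDemo {
        public static void main(String[] args) {
            int dictSize = 10;
            MinHash mh = new MinHash(0.1, dictSize);
            boolean[] a = new boolean[dictSize];
            boolean[] b = new boolean[dictSize];
            a[0] = a[1] = a[2] = true; // image A carries tags 0, 1, 2
            b[1] = b[2] = b[3] = true; // image B carries tags 1, 2, 3
            int inter = 0, union = 0;
            for (int i = 0; i < dictSize; i++) {
                if (a[i] && b[i]) inter++;
                if (a[i] || b[i]) union++;
            }
            System.out.println("Jaccard = " + (double) inter / union); // 2/4 = 0.5
            // near-identical signatures would land A and B in the same bucket
            System.out.println(Arrays.toString(mh.signature(a)));
            System.out.println(Arrays.toString(mh.signature(b)));
        }
    }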
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/UserUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 | 
12 | import org.apache.log4j.Logger;
13 | 
14 | import gr.iti.mklab.util.EasyBufferedReader;
15 | import gr.iti.mklab.util.EasyBufferedWriter;
16 | 
17 | @SuppressWarnings("unchecked")
18 | public class UserUniformSampling extends Sampling {
19 | 
20 | 	private static Logger logger = Logger.getLogger(
21 | 			"gr.iti.mklab.eval.UserUniformSampling");
22 | 
23 | 	public static Object sample(String testFile) throws Exception{
24 | 
25 | 		logger.info("Sampling: User Uniform Strategy");
26 | 
27 | 		UserUniformSampling sampling = new UserUniformSampling();
28 | 
29 | 		return sampling.writeInFile(sampling.loadData(testFile));
30 | 	}
31 | 
32 | 	protected Object loadData(String testFile) {
33 | 
34 | 		Map<String, Set<String>> users =
35 | 				new HashMap<String, Set<String>>();
36 | 
37 | 		EasyBufferedReader reader =
38 | 				new EasyBufferedReader(testFile);
39 | 		String line;
40 | 		while((line = reader.readLine())!=null){
41 | 			String user = line.split("\t")[3];
42 | 			if(users.containsKey(user)){
43 | 				users.get(user).add(line.split("\t")[1]);
44 | 			}else{
45 | 				users.put(user, new HashSet<String>());
46 | 				users.get(user).add(line.split("\t")[1]);
47 | 			}
48 | 		}
49 | 		reader.close();
50 | 		logger.info(users.size() + " Users loaded");
51 | 
52 | 		return users;
53 | 	}
54 | 
55 | 	protected Object writeInFile(Object data) {
56 | 
57 | 		Map<String, Set<String>> users =
58 | 				(Map<String, Set<String>>) data;
59 | 
60 | 		Set<String> respond = new HashSet<String>();
61 | 
62 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
63 | 				"samples/user_uniform_sampling.txt");
64 | 
65 | 		for(Entry<String, Set<String>> user:users.entrySet()){
66 | 			List<String> images =
67 | 					new ArrayList<String>(user.getValue());
68 | 			Collections.shuffle(images);
69 | 
70 | 			respond.add(images.get(0));
71 | 			writer.write(images.get(0));
72 | 			writer.newLine();
73 | 		}
74 | 		writer.close();
75 | 
76 | 		return respond;
77 | 	}
78 | }
79 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/VisualSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 | 
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Set;
7 | import java.util.Map.Entry;
8 | 
9 | import org.apache.log4j.Logger;
10 | 
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | 
14 | @SuppressWarnings("unchecked")
15 | public class VisualSampling extends Sampling{
16 | 
17 | 	private static Logger logger = Logger.getLogger(
18 | 			"gr.iti.mklab.eval.VisualSampling");
19 | 
20 | 	public static Object sample(String testFile) throws Exception{
21 | 
22 | 		logger.info("Sampling: Visual Strategy");
23 | 
24 | 		VisualSampling sampling =
25 | 				new VisualSampling();
26 | 
27 | 		return sampling.writeInFile(sampling.loadData(testFile));
28 | 	}
29 | 
30 | 	protected Object loadData(String testFile) {
31 | 
32 | 		Map<String, Set<String>> concepts =
33 | 				new HashMap<String, Set<String>>();
34 | 
35 | 		EasyBufferedReader reader =
36 | 				new EasyBufferedReader(testFile);
37 | 		String line;
38 | 		while((line = reader.readLine())!=null){
39 | 			String imageID = line.split("\t")[0];
40 | 			for(String concept:line.split("\t")[1].split(",")){
41 | 				if(concepts.containsKey(concept.split(":")[0])){
42 | 					concepts.get(concept.split(":")[0]).add(imageID);
43 | 				}else{
44 | 					concepts.put(concept.split(":")[0], new HashSet<String>());
45 | 					concepts.get(concept.split(":")[0]).add(imageID);
46 | 				}
47 | 			}
48 | 		}
49 | 		reader.close();
50 | 		logger.info(concepts.size() + " Concepts loaded");
51 | 
52 | 		return concepts;
53 | 	}
54 | 
55 | 	protected Object writeInFile(Object data) {
56 | 
57 | 		Map<String, Set<String>> concepts =
58 | 				(Map<String, Set<String>>) data;
59 | 
60 | 		EasyBufferedWriter writer = new EasyBufferedWriter(
61 | 				"samples/visual_sampling.txt");
62 | 		for(Entry<String, Set<String>> concept:concepts.entrySet()){
63 | 			writer.write(concept.getKey() + "\t");
64 | 			for(String images:concept.getValue()){
65 | 				writer.write(images + " ");
66 | 			}
67 | 			writer.newLine();
68 | 		}
69 | 		writer.close();
70 | 
71 | 		return concepts;
72 | 	}
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/CenterOfGravity.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 | 
3 | import java.util.Map;
4 | import java.util.Map.Entry;
5 | 
6 | /**
7 |  * Abstract class that executes the calculation of the center-of-gravity of the most similar images
8 |  * @author gkordo
9 |  *
10 |  */
11 | public abstract class CenterOfGravity {
12 | 
13 | 	protected static int a;
14 | 
15 | 	// Constructor initializes the exponent variable
16 | 	public CenterOfGravity(int a){
17 | 		CenterOfGravity.a = a;
18 | 	}
19 | 
20 | 	/**
21 | 	 * Calculation of the center-of-gravity of the k most similar images
22 | 	 * @param mapSim : the map with the k most similar images and their similarity values
23 | 	 * @return the estimated location of the query image
24 | 	 */
25 | 	protected static Double[] computeCoordination(Map<String, Double> mapSim){
26 | 
27 | 		double [] loc = new double[3];
28 | 		Double[] c = new Double[2];
29 | 		int k = mapSim.size();
30 | 
31 | 		for (Entry<String, Double> entry:mapSim.entrySet()){
32 | 
33 | 			double sim = entry.getValue();
34 | 			double lat = Double.parseDouble(entry.getKey().split("_")[1]);
35 | 			double lon = Double.parseDouble(entry.getKey().split("_")[0]);
36 | 
37 | 			loc[0] += Math.pow(sim,a)
38 | 					* Math.cos(lat * (Math.PI / 180D))
39 | 					* Math.cos(lon * (Math.PI / 180D)) / k;
40 | 
41 | 			loc[1] += Math.pow(sim,a)
42 | 					* Math.cos(lat * (Math.PI / 180D))
43 | 					* Math.sin(lon * (Math.PI / 180D)) / k;
44 | 
45 | 			loc[2] += Math.pow(sim,a)
46 | 					* Math.sin(lat * (Math.PI / 180D)) / k;
47 | 		}
48 | 
49 | 		c[0] = (Double) (Math.atan2(loc[2], Math.sqrt(Math.pow(loc[0],2)
50 | 				+ Math.pow(loc[1],2))) * (180D/Math.PI));
51 | 		c[1] = (Double) (Math.atan2(loc[1], loc[0]) * (180D/Math.PI));
52 | 
53 | 		return c;
54 | 	}
55 | }
56 | 
57 | 
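In words: each neighbour's lat/lon is mapped to a unit vector in 3-D, the vectors are averaged with weights sim^a, and the mean vector is mapped back to latitude/longitude with atan2. A self-contained check with two equally weighted points on the equator:

    public class CenterOfGravityDemo {
        public static void main(String[] args) {
            double[][] pts = {{0.0, 0.0}, {0.0, 90.0}}; // (lat, lon) pairs in degrees
            double x = 0, y = 0, z = 0;
            for (double[] p : pts) {
                double lat = Math.toRadians(p[0]), lon = Math.toRadians(p[1]);
                x += Math.cos(lat) * Math.cos(lon) / pts.length;
                y += Math.cos(lat) * Math.sin(lon) / pts.length;
                z += Math.sin(lat) / pts.length;
            }
            double latC = Math.toDegrees(Math.atan2(z, Math.sqrt(x * x + y * y)));
            double lonC = Math.toDegrees(Math.atan2(y, x));
            System.out.println(latC + " " + lonC); // 0.0 45.0, the spherical midpoint
        }
    }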
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/DataManager.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 | 
3 | import java.util.HashSet;
4 | import java.util.Set;
5 | 
6 | import org.apache.log4j.Logger;
7 | 
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.TextUtil;
10 | 
11 | /**
12 |  * Data manager
13 |  * @author gkordo
14 |  *
15 |  */
16 | public class DataManager {
17 | 
18 | 	static Logger logger = Logger.getLogger("gr.iti.mklab.tools.DataManager");
19 | 
20 | 	// return a set containing the image IDs of the provided dataset
21 | 	public static Set<String> getSetOfImageIDs(String file){
22 | 
23 | 		Set<String> imagesIncludedInFile = new HashSet<String>();
24 | 
25 | 		EasyBufferedReader reader = new EasyBufferedReader(file);
26 | 
27 | 		String input;
28 | 
29 | 		logger.info("images contained in file " + file);
30 | 		while ((input= reader.readLine())!=null){
31 | 			imagesIncludedInFile.add(input.split("\t")[1]);
32 | 		}
33 | 		logger.info(imagesIncludedInFile.size()+" total images included in file");
34 | 		reader.close();
35 | 
36 | 		return imagesIncludedInFile;
37 | 	}
38 | 
39 | 	// return a set containing the individual tags of the provided dataset
40 | 	public static Set<String> getSetOfTerms(String file){
41 | 
42 | 		EasyBufferedReader reader = new EasyBufferedReader(file);
43 | 		Set<String> termsIncludedInFile = new HashSet<String>();
44 | 
45 | 		String line;
46 | 
47 | 		logger.info("determining the different tags contained in file " + file);
48 | 		while ((line= reader.readLine())!=null){
49 | 
50 | 			Set<String> terms = new HashSet<String>();
51 | 			TextUtil.parse(line.split("\t")[10], terms);
52 | 			TextUtil.parse(line.split("\t")[8], terms);
53 | 
54 | 			termsIncludedInFile.addAll(terms);
55 | 
56 | 		}
57 | 		logger.info(termsIncludedInFile.size()+" total tags included in file");
58 | 		reader.close();
59 | 
60 | 		return termsIncludedInFile;
61 | 	}
62 | 
63 | 	// return a set containing the different users in the provided dataset
64 | 	public static Set<String> getSetOfUserID (String file){
65 | 
66 | 		Set<String> usersIncludedInFile = new HashSet<String>();
67 | 
68 | 		EasyBufferedReader reader = new EasyBufferedReader(file);
69 | 
70 | 		String input;
71 | 
72 | 		logger.info("determining the different users contained in file " + file);
73 | 		while ((input= reader.readLine())!=null){
74 | 			usersIncludedInFile.add(input.split("\t")[3]);
75 | 		}
76 | 		logger.info(usersIncludedInFile.size()+" total users included in file");
77 | 		reader.close();
78 | 
79 | 		return usersIncludedInFile;
80 | 	}
81 | }
82 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/InterfaceTermCellProb.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 | 
3 | import java.io.IOException;
4 | 
5 | /**
6 |  * Interface of tag-cell probability calculator
7 |  * @author gkordo
8 |  *
9 |  */
10 | public interface InterfaceTermCellProb {
11 | 
12 | 	/**
13 | 	 * Function where the tag-cell probabilities are calculated and stored in a defined file.
14 | 	 * @param dir : directory of the project
15 | 	 * @param trainFile : file that contains the train set
16 | 	 * @param outFile : output file
17 | 	 * @param scale : grid scale
18 | 	 * @throws IOException : file not found
19 | 	 */
20 | 	public void calculatorTermCellProb(String dir, String trainFile,
21 | 			String outFile, int scale) throws IOException;
22 | }
23 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/SimilarityCalculator.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 | 
3 | import gr.iti.mklab.data.ImageMetadata;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.Utils;
6 | import gr.iti.mklab.util.TextUtil;
7 | 
8 | import java.io.File;
9 | import java.io.IOException;
10 | import java.math.BigDecimal;
11 | import java.util.*;
12 | import java.util.Map.Entry;
13 | 
14 | import org.apache.commons.io.FileUtils;
15 | import org.apache.hadoop.fs.Path;
16 | import org.apache.hadoop.io.*;
17 | import org.apache.hadoop.mapred.*;
18 | import org.apache.log4j.Logger;
19 | 
20 | /**
21 |  * Class that implements the similarity search based on a map-reduce scheme.
22 |  * For every query image, the similarity to the images contained in the train set is calculated based on their corresponding term sets.
23 |  * @author gkordo
24 |  *
25 |  */
26 | public class SimilarityCalculator{
27 | 
28 | 	private static Set<String> testIDs = new HashSet<String>();
29 | 	private static Set<String> users = new HashSet<String>();
30 | 	private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch");
31 | 	static java.util.Map<String, List<ImageMetadata>> predictedCellsOfTestImages = new HashMap<String, List<ImageMetadata>>();
32 | 
33 | 	/**
34 | 	 * Constructor of the class.
35 | 	 * @param testFile : file that contains the test image's metadata
36 | 	 * @param resultFile : file that contains the MLC of every query image
37 | 	 */
38 | 	public SimilarityCalculator(String testFile, String resultFile){
39 | 		loadTestImages(testFile,resultFile);
40 | 	}
41 | 
42 | 
43 | 	/**
44 | 	 * Map class that takes the lines of the train file as input and creates key-value pairs,
45 | 	 * using as keys the image IDs of the test set images and as values strings that contain
46 | 	 * the location of the train images and the calculated similarity.
47 | 	 * @author gkordo
48 | 	 *
49 | 	 */
50 | 	public static class MapSimilaritySearch extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
51 | 
52 | 		/**
53 | 		 * Required map function
54 | 		 * @param key : key value
55 | 		 * @param value : input string
56 | 		 * @param output : output collector
57 | 		 * @param reporter : reporter of the job
58 | 		 */
59 | 		public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
60 | 
61 | 			String[] metadata = value.toString().split("\t");
62 | 
63 | 			if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
64 | 					&& !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinates
65 | 					&& (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
66 | 
67 | 				// get image cell based on its latitude-longitude pair
68 | 				BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
69 | 						metadata[12])).setScale(2, BigDecimal.ROUND_HALF_UP);
70 | 				BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
71 | 						metadata[13])).setScale(2, BigDecimal.ROUND_HALF_UP);
72 | 
73 | 				Set<String> trainImageTerms = new HashSet<String>();
74 | 				TextUtil.parse(metadata[10], trainImageTerms);
75 | 				TextUtil.parse(metadata[8], trainImageTerms);
76 | 
77 | 				// there is at least one estimated location lying inside the borders of the cell
78 | 				if(predictedCellsOfTestImages.containsKey(tmpLonCenter+"_"+tmpLatCenter)
79 | 						&& trainImageTerms.size() > 1){
80 | 
81 | 					// calculate the similarity between the train image and all test images whose estimated location lies inside the borders of the specific cell
82 | 					for(ImageMetadata entry : predictedCellsOfTestImages
83 | 							.get(tmpLonCenter+"_"+tmpLatCenter)){
84 | 
85 | 						// determine the common terms
86 | 						List<String> common = new ArrayList<String>(trainImageTerms);
87 | 						common.retainAll(entry.getTags());
88 | 
89 | 						// calculate similarity
90 | 						double sjacc = (double) common.size() / (entry.getTags().size()
91 | 								+ trainImageTerms.size() - common.size());
92 | 						if(sjacc>0.05){
93 | 							output.collect(new Text(entry.getId()), new Text(String.valueOf(sjacc) +
94 | 									">" + metadata[12] + "_"+metadata[13]));
95 | 						}
96 | 					}
97 | 				}
98 | 			}
99 | 		}
100 | 	}
101 | 
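	// Worked instance of the sjacc formula above (illustrative helper, not used
	// by the job): plain Jaccard similarity |A n B| / |A u B| over term sets.
	// E.g. {eiffel, tower, paris} vs {paris, tower, seine, night} shares two
	// terms, giving 2 / (3 + 4 - 2) = 0.4, well above the 0.05 cut-off.
	private static double exampleJaccard(Set<String> a, Set<String> b) {
		List<String> common = new ArrayList<String>(a);
		common.retainAll(b);
		return (double) common.size() / (a.size() + b.size() - common.size());
	}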
102 | 	/**
103 | 	 * Reduce class that gets the key-value pairs and sorts the similarities for a test image.
104 | 	 * @author gkordo
105 | 	 *
106 | 	 */
107 | 	public static class ReduceSimilaritySearch extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
108 | 
109 | 		/**
110 | 		 * Required reduce function
111 | 		 * @param key : key value
112 | 		 * @param values : set of values that share the same key
113 | 		 * @param output : output collector
114 | 		 * @param reporter : reporter of the job
115 | 		 */
116 | 		public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
117 | 
118 | 			java.util.Map<String, Double> simImages = new HashMap<String, Double>();
119 | 
120 | 			// load values into a similarity map
121 | 			while (values.hasNext()) {
122 | 				String entry = values.next().toString();
123 | 				simImages.put(entry.split(">")[1],Double.parseDouble(entry.split(">")[0]));
124 | 			}
125 | 
126 | 			// sort similarity map
127 | 			simImages = Utils.sortByValues(simImages);
128 | 
129 | 			// write in output file
130 | 			output.collect(key, new Text(convertSimMapToStr(simImages)));
131 | 		}
132 | 
133 | 		/**
134 | 		 * Function that converts similarity map to output string
135 | 		 * @param simImages : similarity map
136 | 		 * @return a string that contains similarity and location of the train images
137 | 		 */
138 | 		public String convertSimMapToStr(java.util.Map<String, Double> simImages){
139 | 			String out = "";
140 | 
141 | 			for(Entry<String, Double> entry : simImages.entrySet()){
142 | 				out += entry.getKey() + ">" + entry.getValue() + " ";
143 | 			}
144 | 
145 | 			return out.trim();
146 | 		}
147 | 	}
148 | 
149 | 	/**
150 | 	 * Core function for the job of similarity search.
151 | 	 * @param dir : directory of the project
152 | 	 * @param trainFolder : the file of the train set
153 | 	 * @param outFolder : the folder where the image similarities file will be stored
154 | 	 * @throws Exception : file not found
155 | 	 */
156 | 	public void performSimilarityCalculation(String dir, String trainFolder, String outFolder) throws Exception {
157 | 
158 | 		logger.info("Process: Similarity Calculation\t|\t"
159 | 				+ "Status: INITIALIZE");
160 | 		JobConf conf = new JobConf(SimilarityCalculator.class);
161 | 		conf.setJobName("similaritysearch");
162 | 
163 | 		conf.setOutputKeyClass(Text.class);
164 | 		conf.setOutputValueClass(Text.class);
165 | 
166 | 		conf.setMapperClass(MapSimilaritySearch.class);
167 | 
168 | 		conf.setReducerClass(ReduceSimilaritySearch.class);
169 | 
170 | 		conf.setInputFormat(TextInputFormat.class);
171 | 		conf.setOutputFormat(TextOutputFormat.class);
172 | 
173 | 		// clean the output file directory
174 | 		File file = new File(dir + outFolder);
175 | 		if (file.exists()) {
176 | 			FileUtils.cleanDirectory(file);
177 | 			FileUtils.forceDelete(file);
178 | 		}
179 | 
180 | 		FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
181 | 		FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder));
182 | 
183 | 		logger.info("Process: Similarity Calculation\t|\t"
184 | 				+ "Status: STARTED");
185 | 		long startTime = System.currentTimeMillis();
186 | 		JobClient.runJob(conf);
187 | 		logger.info("Process: Similarity Calculation\t|\t"
188 | 				+ "Status: COMPLETED\t|\tTotal time: " +
189 | 				(System.currentTimeMillis()-startTime)/60000.0+"m");
190 | 
191 | 		new File(dir + outFolder + "/part-00000").renameTo(
192 | 				new File(dir + outFolder + "/image_similarities")); // rename the output file
193 | 	}
194 | 
195 | 	/**
196 | 	 * Load test images in a map based on their MLCs. Also update the sets of test image IDs and test user IDs.
197 | 	 * @param testFile
198 | 	 * @param resultFile
199 | 	 */
200 | 	private void loadTestImages(String testFile, String resultFile){
201 | 
202 | 		EasyBufferedReader readerTest = new EasyBufferedReader(testFile);
203 | 		EasyBufferedReader readerResult = new EasyBufferedReader(resultFile);
204 | 		String lineT,lineR;
205 | 
206 | 		while ((lineT = readerTest.readLine())!=null && (lineR = readerResult.readLine())!=null){
207 | 
208 | 			if(!lineR.split("\t")[1].equals("N/A")){
209 | 				// create an object based on test image metadata
210 | 				Set<String> terms = new HashSet<String>();
211 | 				TextUtil.parse(lineT.split("\t")[10], terms);
212 | 				TextUtil.parse(lineT.split("\t")[8], terms);
213 | 				ImageMetadata image = new ImageMetadata(lineT.split("\t")[1], lineT.split("\t")[3], terms);
214 | 
215 | 				// update respective sets
216 | 				testIDs.add(lineT.split("\t")[1]);
217 | 				users.add(lineT.split("\t")[3]);
218 | 
219 | 				// load image object to the corresponding cell of the map
220 | 				if(predictedCellsOfTestImages.containsKey(lineR.split("\t")[1].split(":")[0])){
221 | 					predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
222 | 				}else{
223 | 					predictedCellsOfTestImages.put(lineR.split("\t")[1].split(":")[0],
224 | 							new ArrayList<ImageMetadata>());
225 | 					predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
226 | 				}
227 | 			}
228 | 		}
229 | 
230 | 		logger.info(users.size()+" different users appeared in " + testIDs.size() + " images");
231 | 		readerTest.close();
232 | 		readerResult.close();
233 | 	}
234 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedReader.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.io.Reader;
9 | import java.io.UnsupportedEncodingException;
10 | 
11 | import org.apache.log4j.Logger;
12 | 
13 | public class EasyBufferedReader extends BufferedReader {
14 | 
15 | 	protected Logger logger;
16 | 
17 | 
18 | 	static final Reader createReader(String textFile, Logger logger){
19 | 		try {
20 | 			return new InputStreamReader(new FileInputStream(textFile), "UTF-8");
21 | 		} catch (UnsupportedEncodingException e) {
22 | 			logger.error(e.getMessage());
23 | 		} catch (FileNotFoundException e) {
24 | 			logger.error(e.getMessage());
25 | 		}
26 | 		return null;
27 | 	}
28 | 
29 | 	public EasyBufferedReader(String textFile) {
30 | 		super(createReader(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedReader")));
31 | 		this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedReader");
32 | 		logger.debug("opened " + textFile);
33 | 	}
34 | 
35 | 	@Override
36 | 	public void close() {
37 | 		try {
38 | 			super.close();
39 | 		} catch (IOException e) {
40 | 			logger.error(e.getMessage());
41 | 		}
42 | 	}
43 | 
44 | 	@Override
45 | 	public String readLine() {
46 | 		try {
47 | 			return super.readLine();
48 | 		} catch (IOException e) {
49 | 			logger.error(e.getMessage());
50 | 		}
51 | 		return null;
52 | 	}
53 | 
54 | 
55 | 
56 | }
57 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedWriter.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 | 
3 | import java.io.BufferedWriter;
4 | import java.io.FileNotFoundException;
5 | import java.io.FileOutputStream;
6 | import java.io.IOException;
7 | import java.io.OutputStreamWriter;
8 | import java.io.UnsupportedEncodingException;
9 | import java.io.Writer;
10 | 
11 | import org.apache.log4j.Logger;
12 | 
13 | public class EasyBufferedWriter extends BufferedWriter {
14 | 
15 | 	protected Logger logger;
16 | 
17 | 
18 | 	static final Writer createWriter(String textFile, Logger logger, boolean end){
19 | 		try {
20 | 			return new OutputStreamWriter(new FileOutputStream(textFile,end), "UTF-8");
21 | 		} catch (UnsupportedEncodingException e) {
22 | 			logger.error(e.getMessage());
23 | 		} catch (FileNotFoundException e) {
24 | 			logger.error(e.getMessage());
25 | 		}
26 | 		return null;
27 | 	}
28 | 
29 | 	public EasyBufferedWriter(String textFile) {
30 | 		super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),false));
31 | 		this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
32 | 		logger.debug("opened " + textFile);
33 | 	}
34 | 
35 | 	public EasyBufferedWriter(String textFile, boolean end) {
36 | 		super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),end));
37 | 		this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
38 | 		logger.debug("opened " + textFile);
39 | 	}
40 | 
41 | 	@Override
42 | 	public void close() {
43 | 		try {
44 | 			super.close();
45 | 		} catch (IOException e) {
46 | 			logger.error(e.getMessage());
47 | 		}
48 | 	}
49 | 
50 | 	@Override
51 | 	public void write(String s) {
52 | 		try {
53 | 			super.write(s);
54 | 		} catch (IOException e){
55 | 			logger.error(e.getMessage());
56 | 		}
57 | 	}
58 | 
59 | 	@Override
60 | 	public void newLine() {
61 | 		try {
62 | 			super.newLine();
63 | 		} catch (IOException e){
64 | 			logger.error(e.getMessage());
65 | 		}
66 | 	}
67 | 
68 | 
69 | 
70 | }
71 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Progress.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 | 
3 | import org.apache.log4j.Logger;
4 | 
5 | public class Progress {
6 | 
7 | 	private long gStartTime, lastTime;
8 | 	private int div, scaleTime;
9 | 	private String mesPerCent, mesTime, messege;
10 | 	private int sec;
11 | 	private Logger logger;
12 | 
13 | 	public Progress(long gStartTime, int limitCountLines, int scalePerCent, int scaleTime, String messege, Logger logger){
14 | 		this.gStartTime = gStartTime;
15 | 
16 | 		this.mesPerCent = "%";
17 | 		if(scalePerCent==10){this.mesPerCent = "0" + this.mesPerCent;}
18 | 
19 | 		this.scaleTime = scaleTime;
20 | 		this.mesTime = "m";
21 | 		if(scaleTime==1){this.mesTime = "s";}
22 | 
23 | 		this.div = limitCountLines/scalePerCent;
24 | 		this.messege = messege;
25 | 
26 | 		this.logger = logger;
27 | 	}
28 | 
29 | 	public Progress(long gStartTime, int sec, int scaleTime, String messege, Logger logger){
30 | 		this.sec = sec;
31 | 		this.gStartTime = gStartTime;
32 | 
33 | 		this.scaleTime = scaleTime;
34 | 
35 | 		this.mesTime = "min";
36 | 		this.messege = messege;
37 | 		if(scaleTime==1){this.mesTime = "s";}
38 | 
39 | 		this.logger = logger;
40 | 	}
41 | 
42 | 	public void showMessege(long stopTime){
43 | 		if(stopTime-lastTime>sec*1000){
44 | 			logger.info(messege+" > "+ (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
45 | 			lastTime=stopTime;
46 | 		}
47 | 	}
48 | 
49 | 	public void showProgress(int count, long stopTime){
50 | 		if(count%div==0){
51 | 			logger.info(messege+" > "+count/div+ mesPerCent + " > " + (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
52 | 		}
53 | 	}
54 | }
55 | 
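These util classes swallow and log IOExceptions so that call sites stay linear; the typical read-transform-write loop used across the project looks like this (file names are placeholders):

    EasyBufferedReader reader = new EasyBufferedReader("input.txt");
    EasyBufferedWriter writer = new EasyBufferedWriter("output.txt");
    String line;
    while ((line = reader.readLine()) != null) { // null on EOF (or on a logged error)
        writer.write(line.split("\t")[0]);
        writer.newLine();
    }
    reader.close();
    writer.close();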
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/TextUtil.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 | 
3 | import java.io.UnsupportedEncodingException;
4 | import java.net.URLDecoder;
5 | import java.text.Normalizer;
6 | import java.util.Set;
7 | import java.util.regex.Pattern;
8 | 
9 | 
10 | public class TextUtil {
11 | 
12 | 	public static String deAccent(String str) {
13 | 		String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
14 | 		Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
15 | 		return pattern.matcher(nfdNormalizedString).replaceAll("");
16 | 	}
17 | 
18 | 	public static Set<String> parse (String text, Set<String> terms) {
19 | 
20 | 		if (text != null && !text.isEmpty()){
21 | 			try{
22 | 				text = URLDecoder.decode(text, "UTF-8");
23 | 				text = deAccent(text);
24 | 
25 | 				text = text.trim(); // removes redundant white spaces
26 | 				text = text.replaceAll("[\\p{Punct}&&[^\\,]]", "");
27 | 				text = text.replaceAll("[0-9]+", "");
28 | 
29 | 				text = text.toLowerCase();
30 | 				text = text.replaceAll("\\s{2,}", " ");
31 | 				text = text.replaceAll("\\,{2,}", ",");
32 | 				text = text.trim();
33 | 
34 | 				for(String term:text.split(",")){
35 | 					if(!term.replaceAll(" ", "").matches("[0-9]+")&&!term.isEmpty()){
36 | 						terms.add(term.trim());
37 | 						for(String interm:term.split(" ")){
38 | 							if(!interm.matches("[0-9]+")){
39 | 								terms.add(interm);
40 | 							}
41 | 						}
42 | 					}
43 | 				}
44 | 			}catch(UnsupportedEncodingException exception){
45 | 			}catch(IllegalArgumentException exception){}
46 | 		}
47 | 		return terms;
48 | 	}
49 | }
50 | 
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Utils.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 | 
3 | import java.util.Arrays;
4 | import java.util.List;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 | 
8 | import gr.iti.mklab.data.GeoCell;
9 | 
10 | import java.util.Set;
11 | import java.util.Collections;
12 | import java.util.Comparator;
13 | import java.util.HashMap;
14 | import java.util.Iterator;
15 | import java.util.LinkedHashMap;
16 | import java.util.LinkedList;
17 | 
18 | public class Utils {
19 | 
20 | 	public static <K, V extends Comparable<V>> Map<K, V> sortByValues(Map<K, V> map){
21 | 		List<Entry<K, V>> entries = new LinkedList<Entry<K, V>>(map.entrySet());
22 | 
23 | 		Collections.sort(entries, Collections.reverseOrder(new Comparator<Entry<K, V>>() {
24 | 
25 | 			public int compare(Entry<K, V> o1, Entry<K, V> o2) {
26 | 				return o1.getValue().compareTo(o2.getValue());
27 | 			}
28 | 		}));
29 | 		//LinkedHashMap will keep the keys in the order they are inserted
30 | 		//which is currently sorted on natural ordering
31 | 		Map<K, V> sortedMap = new LinkedHashMap<K, V>();
32 | 
33 | 		for(Map.Entry<K, V> entry: entries){
34 | 			sortedMap.put(entry.getKey(), entry.getValue());
35 | 		}
36 | 
37 | 		return sortedMap;
38 | 	}
39 | 
40 | 	public static <K, V extends Comparable<V>> Map<K, V[]> sortByValuesTable(Map<K, V[]> map){
41 | 		List<Entry<K, V[]>> entries = new LinkedList<Entry<K, V[]>>(map.entrySet());
42 | 
43 | 		Collections.sort(entries, Collections.reverseOrder(new Comparator<Entry<K, V[]>>() {
44 | 			public int compare(Entry<K, V[]> o1, Entry<K, V[]> o2) {
45 | 				return o1.getValue()[0].compareTo(o2.getValue()[0]);
46 | 			}
47 | 		}));
48 | 		//LinkedHashMap will keep the keys in the order they are inserted
49 | 		//which is currently sorted on natural ordering
50 | 		Map<K, V[]> sortedMap = new LinkedHashMap<K, V[]>();
51 | 
52 | 		for(Map.Entry<K, V[]> entry: entries){
53 | 			sortedMap.put(entry.getKey(), entry.getValue());
54 | 		}
55 | 
56 | 		return sortedMap;
57 | 	}
58 | 
59 | 	public static Map<String, GeoCell> sortByMLCValues(Map<String, GeoCell> unsortMap) {
60 | 
61 | 		// Convert Map to List
62 | 		List<Map.Entry<String, GeoCell>> list =
63 | 				new LinkedList<Map.Entry<String, GeoCell>>(unsortMap.entrySet());
64 | 
65 | 		// Sort list with comparator, to compare the Map values
66 | 		Collections.sort(list, new Comparator<Map.Entry<String, GeoCell>>() {
67 | 			public int compare(Map.Entry<String, GeoCell> o1,
68 | 					Map.Entry<String, GeoCell> o2) {
69 | 				return -(o1.getValue()).getTotalProb().compareTo(o2.getValue().getTotalProb());
70 | 			}
71 | 		});
72 | 
73 | 		// Convert sorted map back to a Map
74 | 		Map<String, GeoCell> sortedMap = new LinkedHashMap<String, GeoCell>();
75 | 		for (Iterator<Map.Entry<String, GeoCell>> it = list.iterator(); it.hasNext();) {
76 | 			Map.Entry<String, GeoCell> entry = it.next();
77 | 			sortedMap.put(entry.getKey(), entry.getValue());
78 | 		}
79 | 		return sortedMap;
80 | 	}
81 | 
82 | 	public static <K, V> HashMap<K, V> getFirstEntryOfSortedMap(Map<K, V> map){
83 | 		HashMap<K, V> firstEntry = new HashMap<K, V>();
84 | 
85 | 		for ( Entry<K, V> entry : map.entrySet()){
86 | 			firstEntry.put(entry.getKey(), entry.getValue());
87 | 			break;
88 | 		}
89 | 
90 | 		return firstEntry;
91 | 	}
92 | 
93 | 	public static <K, V> HashMap<V, K> invertKeysValues(Map<K, V> map){
94 | 
95 | 		HashMap<V, K> invertedHashMap = new HashMap<V, K>();
96 | 
97 | 		for(Entry<K, V> entry : map.entrySet()){
98 | 			invertedHashMap.put(entry.getValue(), entry.getKey());
99 | 		}
100 | 
101 | 		return invertedHashMap;
102 | 
103 | 	}
104 | 
105 | 	public static double median(List<Double> p)
106 | 	{
107 | 		Double[] b = new Double[p.size()];
108 | 		int i=0;
109 | 		for (Double entry: p){
110 | 			b[i]=entry;
111 | 			i++;
112 | 		}
113 | 		Arrays.sort(b);
114 | 		if (p.size() % 2 == 0)
115 | 		{
116 | 			return (b[(b.length / 2)-1] + b[b.length / 2]) / 2.0;
117 | 		}
118 | 		else
119 | 		{
120 | 			return b[b.length / 2];
121 | 		}
122 | 	}
123 | 
124 | 	public static
125 | 	<K, V> int medianSet(Map<K, Set<V>> map)
126 | 	{
127 | 		int[] b = new int[map.size()];
128 | 		int i = 0;
129 | 		for (Entry<K, Set<V>> entry: map.entrySet()){
130 | 			b[i] = entry.getValue().size();
131 | 			i++;
132 | 		}
133 | 		Arrays.sort(b);
134 | 		if (b.length % 2 == 0)
135 | 		{
136 | 			return (int) Math.floor((b[(b.length / 2)-1]
137 | 					+ b[b.length / 2]) / 2.0);
138 | 		}
139 | 		else
140 | 		{
141 | 			return b[b.length / 2];
142 | 		}
143 | 	}
144 | 
145 | 	public static
146 | 	<K> int medianItemInt(Map<K, Integer> map)
147 | 	{
148 | 		int[] b = new int[map.size()];
149 | 
150 | 		int i = 0;
151 | 		for (Entry<K, Integer> entry: map.entrySet()){
152 | 			b[i] = entry.getValue();
153 | 			i++;
154 | 		}
155 | 		Arrays.sort(b);
156 | 		if (b.length % 2 == 0)
157 | 		{
158 | 			return (int) Math.floor((b[(b.length / 2)-1]
159 | 					+ b[b.length / 2]) / 2.0);
160 | 		}
161 | 		else
162 | 		{
163 | 			return b[b.length / 2];
164 | 		}
165 | 	}
166 | 
167 | 	public static
168 | 	<K> double medianItemDouble(Map<K, Double> map)
169 | 	{
170 | 		double[] b = new double[map.size()];
171 | 
172 | 		int i = 0;
173 | 		for (Entry<K, Double> entry: map.entrySet()){
174 | 			b[i] = entry.getValue();
175 | 			i++;
176 | 		}
177 | 		Arrays.sort(b);
178 | 		if (b.length % 2 == 0)
179 | 		{
180 | 			return (b[(b.length / 2)-1]
181 | 					+ b[b.length / 2]) / 2.0;
182 | 		}
183 | 		else
184 | 		{
185 | 			return b[b.length / 2];
186 | 		}
187 | 	}
188 | }
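For an even number of values, the median helpers average the two middle entries; a quick sketch (the list mirrors how Evaluation feeds distances to Utils.median):

    List<Double> distances = Arrays.asList(1.0, 3.0, 5.0, 9.0);
    System.out.println(Utils.median(distances)); // (3.0 + 5.0) / 2 = 4.0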
--------------------------------------------------------------------------------
/src/test/java/gr/iti/mklab/main/Evaluation.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.main;
2 | 
3 | import java.io.FileInputStream;
4 | import java.text.DecimalFormat;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Properties;
11 | import java.util.Set;
12 | 
13 | import org.apache.log4j.Logger;
14 | 
15 | import gr.iti.mklab.mmcomms16.AmbiguityBasedSampling;
16 | import gr.iti.mklab.mmcomms16.BuildingSampling;
17 | import gr.iti.mklab.mmcomms16.GeographicalUniformSampling;
18 | import gr.iti.mklab.mmcomms16.GeographicallyFocusedSampling;
19 | import gr.iti.mklab.mmcomms16.TextBasedSampling;
20 | import gr.iti.mklab.mmcomms16.TextDiversitySampling;
21 | import gr.iti.mklab.mmcomms16.UserUniformSampling;
22 | import gr.iti.mklab.mmcomms16.VisualSampling;
23 | import gr.iti.mklab.util.EasyBufferedReader;
24 | import gr.iti.mklab.util.Utils;
25 | import net.sf.geographiclib.Geodesic;
26 | 
27 | /**
28 |  * Class that calculates the results of a geolocation method, using Karney's algorithm
29 |  * for the geodesic distance between estimated and ground-truth locations.
30 |  * @author gkordo
31 |  *
32 |  */
33 | public class Evaluation {
34 | 
35 | 	private static Logger logger = Logger.getLogger(
36 | 			"gr.iti.mklab.eval.Evaluation");
37 | 
38 | 	/**
39 | 	 * Initialize the map of the results based on the given values.
40 | 	 * Lower range = 10^minRange
41 | 	 * Higher range = 10^maxRange
42 | 	 *
43 | 	 * @param minRangeScale
44 | 	 * @param maxRangeScale
45 | 	 * @return the initialized result map
46 | 	 */
47 | 	private static Map<Integer, Integer> initializeResultMap(int minRangeScale, int maxRangeScale){
48 | 		Map<Integer, Integer> map = new HashMap<Integer, Integer>();
49 | 		for(int i=minRangeScale;i<maxRangeScale;i++){
50 | 			map.put(i, 0);
51 | 		}
52 | 		return map;
53 | 	}
54 | 
55 | 	/**
56 | 	 * @param estimationResultMap : map with the number of correct estimations per range
57 | 	 * @param totalItems : total number of evaluated items
58 | 	 * @param minRangeScale : minimum precision range
59 | 	 * @param maxRangeScale : maximum precision range
60 | 	 */
61 | 	private static void printResults(Map<Integer, Integer> estimationResultMap,
62 | 			int totalItems, int minRangeScale, int maxRangeScale){
63 | 
64 | 		for(int i=minRangeScale;i<maxRangeScale;i++){
65 | 			logger.info("Precision in range of 10^" + i + "km: "
66 | 					+ new DecimalFormat("#.##").format((double)
67 | 							estimationResultMap.get(i) * 100 / totalItems) + "%");
68 | 		}
69 | 	}
70 | 
71 | 	/**
72 | 	 * Main function that evaluates the results of a geolocation method,
73 | 	 * applying the selected sampling strategy on the test collection.
74 | 	 * The evaluation settings are loaded from the eval.properties file.
75 | 	 * @param args
76 | 	 * @throws Exception : file not found
77 | 	 */
78 | 	@SuppressWarnings("unchecked")
79 | 	public static void main(String[] args) throws Exception{
80 | 
81 | 		Properties properties = new Properties();
82 | 		properties.load(new FileInputStream("eval.properties"));
83 | 
84 | 		String sampling = properties.getProperty("sampling");
85 | 		String testFile = properties.getProperty("testFile");
86 | 		String resultFile = properties.getProperty("resultFile");
87 | 		String placeFile = properties.getProperty("placeFile");
88 | 		String conceptFile = properties.getProperty("conceptFile");
89 | 
90 | 		int minRangeScale = Integer.parseInt(
91 | 				properties.getProperty("minRangeScale"));
92 | 		int maxRangeScale = Integer.parseInt(
93 | 				properties.getProperty("maxRangeScale"));
94 | 
95 | 		logger.info("Evaluation Started");
96 | 
97 | 		// collection of image IDs on which the evaluation is performed;
98 | 		// when it remains null, the full result file is evaluated
99 | 		Set<String> collection = null;
100 | 
101 | 		// Sampling Strategies
102 | 		switch(sampling) {
103 | 
104 | 		case "GUS" : // Geographical Uniform Sampling
105 | 
106 | 			collection = (Set<String>) GeographicalUniformSampling.sample(testFile);
107 | 			evaluateSingleSet(resultFile, collection, minRangeScale, maxRangeScale, false);
108 | 			break;
109 | 
110 | 		case "UUS" : // User Uniform Sampling
111 | 
112 | 			collection = (Set<String>) UserUniformSampling.sample(testFile);
113 | 			evaluateSingleSet(resultFile, collection, minRangeScale, maxRangeScale, false);
114 | 			break;
115 | 
116 | 		case "TBS" : // Text-based Sampling
117 | 
118 | 			collection = (Set<String>) TextBasedSampling.sample(testFile);
119 | 			evaluateSingleSet(resultFile, collection, minRangeScale, maxRangeScale, false);
120 | 			break;
121 | 
122 | 		case "TDS" : // Text Diversity Sampling
123 | 
124 | 			collection = (Set<String>) TextDiversitySampling.sample(testFile);
125 | 			evaluateSingleSet(resultFile, collection, minRangeScale, maxRangeScale, false);
126 | 			break;
127 | 
128 | 		case "GFS" : // Geographically Focused Sampling
129 | 
130 | 			Map<String, Map<String, Set<String>>> places = (Map<String, Map<String,
131 | 					Set<String>>>) GeographicallyFocusedSampling.sample(placeFile);
132 | 			logger.info("----------Continents----------");
133 | 			evaluateMultiSets(resultFile, places.get("continents"), minRangeScale, maxRangeScale);
134 | 			logger.info("----------Countries----------");
135 | 			evaluateMultiSets(resultFile, places.get("countries"), minRangeScale, maxRangeScale);
136 | 			break;
137 | 
138 | 		case "ABS" : // Ambiguity-based Sampling
139 | 
140 | 			Map<Boolean, Set<String>> ambiguous = (Map<Boolean, Set<String>>)
141 | 					AmbiguityBasedSampling.sample(placeFile);
142 | 			logger.info("----------Ambiguous----------");
143 | 			evaluateSingleSet(resultFile, ambiguous.get(true), minRangeScale, maxRangeScale, false);
144 | 			logger.info("----------Non-Ambiguous----------");
145 | 			evaluateSingleSet(resultFile, ambiguous.get(false), minRangeScale, maxRangeScale,false);
146 | 			break;
147 | 
148 | 		case "VS" : // Visual Sampling
149 | 
150 | 			Map<String, Set<String>> concepts = (Map<String, Set<String>>)
151 | 					VisualSampling.sample(conceptFile);
152 | 			logger.info("----------Concepts----------");
153 | 			evaluateMultiSets(resultFile, concepts, minRangeScale, maxRangeScale);
154 | 			break;
155 | 
156 | 		case "BS" : // Building Sampling
157 | 
	/**
	 * Calculate the precision at every range and the median distance error
	 * on a collection of images.
	 * @param resultFile : file of the results
	 * @param collection : collection of image IDs (null evaluates all images)
	 * @param minRangeScale : minimum precision range scale
	 * @param maxRangeScale : maximum precision range scale
	 * @param oneLine : print the results in one line
	 */
	private static void evaluateSingleSet(String resultFile, Set<String> collection,
			int minRangeScale, int maxRangeScale, boolean oneLine){

		// geodesic that calculates the distance between two lat/lon points;
		// the equatorial radius is given in km, so distances are returned in km
		Geodesic geo = new Geodesic(6378.1370D, 298.257223563);

		// Initialize result containers
		Map<Integer, Integer> estimationResultMap = initializeResultMap(minRangeScale, maxRangeScale);
		List<Double> distances = new ArrayList<Double>();

		// Estimated item counter
		int estimations = 0;

		// File reader
		EasyBufferedReader reader = new EasyBufferedReader(resultFile);
		String line;
		while((line = reader.readLine()) != null){
			if(collection == null || collection.contains(line.split("\t")[0])){
				try{
					// Pairs of lat/lon points
					Double[] estimatedLocation =
							{Double.parseDouble(line.split("\t")[4]),
							Double.parseDouble(line.split("\t")[3])};
					Double[] groundTruthLocation =
							{Double.parseDouble(line.split("\t")[2]),
							Double.parseDouble(line.split("\t")[1])};

					// calculate the geodesic distance
					double distance = geo.Inverse(groundTruthLocation[0], groundTruthLocation[1],
							estimatedLocation[0], estimatedLocation[1]).s12;

					// store the results
					for(int i = minRangeScale; i <= maxRangeScale; i++){
						if(distance <= Math.pow(10, i)){
							estimationResultMap.put(i, estimationResultMap.get(i) + 1);
						}
					}
					distances.add(distance);
					estimations++;
				}catch(NumberFormatException e){
					// item without an estimated location
				}
			}
		}
		reader.close();

		// median distance error in km
		distances.sort(null);
		double medianError = distances.isEmpty() ? Double.NaN
				: distances.get(distances.size() / 2);

		// print the precision at every range and the median distance error
		if(oneLine){
			String results = "";
			for(int i = minRangeScale; i <= maxRangeScale; i++){
				results += (double) estimationResultMap.get(i) / estimations + "\t";
			}
			logger.info(results + "median: " + medianError + "km");
		}else{
			printResults(estimationResultMap, estimations, minRangeScale, maxRangeScale);
			logger.info("median distance error: " + medianError + "km");
		}
	}
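	/*
	 * Worked example (illustrative): with minRangeScale=0 and maxRangeScale=3
	 * the counters correspond to the 1km, 10km, 100km and 1000km ranges. An
	 * item with a distance error of 7.4km increments the 10km, 100km and
	 * 1000km counters but not the 1km one, so the reported P@10km is the
	 * fraction of the items localized within 10km of their true location.
	 */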
	/**
	 * Evaluate every named subset of a collection separately.
	 * @param resultFile : file of the results
	 * @param collections : map from a subset name to the image IDs it contains
	 * @param minRangeScale : minimum precision range scale
	 * @param maxRangeScale : maximum precision range scale
	 */
	private static void evaluateMultiSets(String resultFile,
			Map<String, Set<String>> collections, int minRangeScale, int maxRangeScale) {
		for(Entry<String, Set<String>> entry : collections.entrySet()){
			logger.info(entry.getKey());
			// every subset is reported in one line, at the 1km and 10km ranges
			evaluateSingleSet(resultFile, entry.getValue(), 0, 1, true);
		}
	}
}
--------------------------------------------------------------------------------
/src/test/java/gr/iti/mklab/main/MultimediaGeotagging.java:
--------------------------------------------------------------------------------
package gr.iti.mklab.main;

import java.io.File;
import java.io.FileInputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.log4j.Logger;

import gr.iti.mklab.methods.MultipleGrid;
import gr.iti.mklab.data.GeoCell;
import gr.iti.mklab.methods.LanguageModel;
import gr.iti.mklab.methods.SimilaritySearch;
import gr.iti.mklab.methods.TermCellProbs;
import gr.iti.mklab.metrics.Entropy;
import gr.iti.mklab.metrics.Locality;
import gr.iti.mklab.tools.DataManager;
import gr.iti.mklab.tools.SimilarityCalculator;
import gr.iti.mklab.util.EasyBufferedReader;
import gr.iti.mklab.util.EasyBufferedWriter;
import gr.iti.mklab.util.Progress;
import gr.iti.mklab.util.TextUtil;

/**
 * The main class that combines all the other classes in order to implement the method.
 * For memory allocation reasons the main method has been separated into distinct steps:
 * train (building of the language model), FS (Feature Selection), LM (Language Model),
 * MG (Multiple Grid technique) and SS (Similarity Search).
 * @author gkordo
 *
 */
public class MultimediaGeotagging {

	static Logger logger = Logger.getLogger("gr.iti.mklab.MainPlacingTask");

	public static void main(String[] args) throws Exception{

		Properties properties = new Properties();

		logger.info("Program Started");

		properties.load(new FileInputStream("config.properties"));
		String dir = properties.getProperty("dir");

		String trainFolder = properties.getProperty("trainFolder");
		String testFile = properties.getProperty("testFile");

		String process = properties.getProperty("process");

		int coarserScale = Integer.parseInt(properties.getProperty("coarserScale"));
		int finerScale = Integer.parseInt(properties.getProperty("finerScale"));

		int k = Integer.parseInt(properties.getProperty("k"));
		String resultFile = properties.getProperty("resultFile");


		// Building of the Language Model
		if(process.contains("train") || process.equals("all")){
			Set<String> testIDs = DataManager.getSetOfImageIDs(dir + testFile);
			Set<String> usersIDs = DataManager.getSetOfUserID(dir + testFile);

			TermCellProbs trainLM = new TermCellProbs(testIDs, usersIDs);

			trainLM.calculatorTermCellProb(dir, trainFolder,
					"Term-Cell Probs/scale_" + coarserScale, coarserScale);

			trainLM.calculatorTermCellProb(dir, trainFolder,
					"Term-Cell Probs/scale_" + finerScale, finerScale);
		}

		// Feature Selection and Feature Weighting (Locality and Spatial Entropy Calculation)
		if(process.contains("FS") || process.equals("all")){
			Entropy.calculateEntropyWeights(dir, "Term-Cell Probs/scale_" + coarserScale
					+ "/term_cell_probs");

			Locality loc = new Locality(dir + testFile, coarserScale);
			loc.calculateLocality(dir, trainFolder);
		}

		// Language Model
		if(process.contains("LM") || process.equals("all")){
			MultimediaGeotagging.computeMLCs(dir, testFile, "resultLM_scale" + coarserScale,
					"Term-Cell Probs/scale_" + coarserScale + "/term_cell_probs",
					"Weights", true);

			MultimediaGeotagging.computeMLCs(dir, testFile, "resultLM_scale" + finerScale,
					"Term-Cell Probs/scale_" + finerScale + "/term_cell_probs",
					"Weights", false);
		}

		// Multiple Grid Technique
		if(process.contains("MG") || process.equals("all")){
			MultipleGrid.determinCellIDsForSS(dir + "resultLM/",
					"resultLM_mg" + coarserScale + "-" + finerScale,
					"resultLM_scale" + coarserScale, "resultLM_scale" + finerScale);
		}

		// Similarity Search
		if(process.contains("SS") || process.equals("all")){
			new SimilarityCalculator(dir + testFile, dir +
					"resultLM/resultLM_mg" + coarserScale + "-" + finerScale)
			.performSimilarityCalculation(dir, trainFolder, "resultSS");

			new SimilaritySearch(dir + testFile, dir +
					"resultLM/resultLM_mg" + coarserScale + "-" + finerScale,
					dir + "resultSS/image_similarities", dir + resultFile, k, 1);
		}

		logger.info("Program Finished");
	}
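	/*
	 * Illustrative config.properties for the run above. The keys are the ones
	 * read in main(); the values are only an example:
	 *
	 *   dir=/data/placing/
	 *   trainFolder=train
	 *   testFile=test_set.txt
	 *   process=all
	 *   coarserScale=2
	 *   finerScale=3
	 *   k=4
	 *   resultFile=results.txt
	 */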
	/**
	 * Function that performs the language model method on the provided file
	 * at the given scale.
	 * @param dir : directory of the project
	 * @param testFile : the file that contains the test set images
	 * @param resultFile : the name of the file where the results of the language model will be saved
	 * @param termCellProbsFile : the file that contains the term-cell probabilities
	 * @param weightFolder : the folder that contains the files of the term weights
	 * @param confidenceFlag : whether confidence values are also written
	 */
	public static void computeMLCs(String dir,
			String testFile, String resultFile, String termCellProbsFile,
			String weightFolder, boolean confidenceFlag){

		logger.info("Process: Language Model MLC\t|\t"
				+ "Status: INITIALIZE\t|\tFile: " + testFile);

		// the same folder that the MG and SS steps read from
		new File(dir + "resultLM").mkdir();
		EasyBufferedReader reader = new EasyBufferedReader(dir + testFile);
		EasyBufferedWriter writer = new EasyBufferedWriter(dir + "resultLM/" + resultFile);
		EasyBufferedWriter writerCE = new EasyBufferedWriter(dir + "resultLM/" +
				resultFile + "_conf_evid");

		// initialization of the Language Model; term -> (cell -> probability)
		LanguageModel lmItem = new LanguageModel();
		Map<String, Map<String, Double>> termCellProbsMap = lmItem.loadTermCellProbsAndWeights
				(dir + testFile, dir + termCellProbsFile, dir + weightFolder);

		logger.info("Process: Language Model MLC\t|\t"
				+ "Status: STARTED");


		// process at most one million test items
		int count = 0, total = 1000000;
		long startTime = System.currentTimeMillis();
		Progress prog = new Progress(startTime, total, 10, 60, "calculate", logger);
		String line;
		while ((line = reader.readLine()) != null && count <= total){

			prog.showProgress(count, System.currentTimeMillis());
			count++;

			String[] metadata = line.split("\t");

			// Pre-processing of the tags and the title
			Set<String> terms = new HashSet<String>();
			TextUtil.parse(metadata[10], terms);
			TextUtil.parse(metadata[8], terms);

			GeoCell result = lmItem.calculateLanguageModel(terms,
					termCellProbsMap, confidenceFlag);

			if(result == null){ // no result from the tags and the title
				// feed the image's description to the language model (if provided)
				result = lmItem.calculateLanguageModel(TextUtil.parse(metadata[9], terms),
						termCellProbsMap, confidenceFlag);
			}

			// write the results
			if(result != null){
				writer.write(metadata[0] + "\t" + result.getID());
				if(confidenceFlag)
					writerCE.write(metadata[0] + "\t" + result.getConfidence());
			}else{
				writer.write(metadata[0] + "\tN/A");
				if(confidenceFlag)
					writerCE.write(metadata[0] + "\tN/A");
			}
			writer.newLine();
			if(confidenceFlag)
				writerCE.newLine();
		}

		logger.info("Process: Language Model MLC\t|\tStatus: COMPLETED\t|\tTotal Time: " +
				(System.currentTimeMillis() - startTime) / 60000.0 + "m");
		reader.close();
		writer.close();
		writerCE.close();
	}
}
--------------------------------------------------------------------------------
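Putting the two test classes together: a minimal driver sketch, assuming the evaluate signature reconstructed above; the file names are placeholders, and only the property keys and the range conventions come from the sources.

    // Hypothetical driver, not part of the repository: runs the full pipeline
    // (process=all in config.properties) and then reports the precision at the
    // 10^0 .. 10^3 km ranges plus the median error, without sampling.
    public class RunAll {
        public static void main(String[] args) throws Exception {
            MultimediaGeotagging.main(new String[0]);
            Evaluation.evaluate("results.txt", "test_set.txt", null, null, "none", 0, 3);
        }
    }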