├── LICENSE
├── README.md
├── config.properties
├── eval.properties
├── log4j.properties
├── pom.xml
├── samples
├── building_concepts.txt
└── samples.zip
└── src
├── main
└── java
│ └── gr
│ └── iti
│ └── mklab
│ ├── data
│ ├── GeoCell.java
│ └── ImageMetadata.java
│ ├── methods
│ ├── LanguageModel.java
│ ├── MultipleGrid.java
│ ├── SimilaritySearch.java
│ └── TermCellProbs.java
│ ├── metrics
│ ├── Entropy.java
│ └── Locality.java
│ ├── mmcomms16
│ ├── AmbiguityBasedSampling.java
│ ├── BuildingSampling.java
│ ├── GeographicalUniformSampling.java
│ ├── GeographicallyFocusedSampling.java
│ ├── Sampling.java
│ ├── TextBasedSampling.java
│ ├── TextDiversitySampling.java
│ ├── UserUniformSampling.java
│ └── VisualSampling.java
│ ├── tools
│ ├── CenterOfGravity.java
│ ├── DataManager.java
│ ├── InterfaceTermCellProb.java
│ └── SimilarityCalculator.java
│ └── util
│ ├── EasyBufferedReader.java
│ ├── EasyBufferedWriter.java
│ ├── Progress.java
│ ├── TextUtil.java
│ └── Utils.java
└── test
└── java
└── gr
└── iti
└── mklab
└── main
├── Evaluation.java
└── MultimediaGeotagging.java
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Multimedia Geotagging
2 | ======
3 |
4 | This repository contains the implementation of algorithms that estimate the geographic location of multimedia items based on their textual content. The approach is described in here and here. It was submitted in MediaEval Placing Task 2016.
5 |
6 |
7 |
8 |
Main Method
9 |
10 | The approach is a refined language model, including feature selection and weighting schemes and heuristic techniques that improves the accuracy in finer granularities. It is a text-based method, in which a complex geographical-tag model is built from the tags, titles and the locations of a massive amount of geotagged images that are included in a training set, in order to estimate the location of each query image included in a test set.
11 |
12 | The main approach comprises two major processing steps, an offline and an online.
13 |
14 |
Offline Processing Step
15 |
16 | * Pre-processing
17 | * apply URL decoding, lowercase transformation, tokenization
18 | * remove accents, punctuations and symbols (e.g. “.%!&”)
19 | * discard terms consisting of numerics or less than three characters
20 |
21 | * Language Model
22 | * divide earth surface in rectangular cells with a side length of 0.01°
23 | * calculate term-cell probabilities based on the users that used the term inside the cell
24 |
25 | * Feature selection
26 | * calculate locality score of every term in the dataset
27 | * locality is based on the term frequency and the neighbor users that have used it in the cell distribution
28 | * the final set of selected terms is formed from the terms with locality score greater than zero
29 |
30 | * Feature weighting using spatial entropy
31 | * calculate spatial entropy values of every term applying the Shannon entropy formula in the term-cell probabilities
32 | * spatial entropy weights derives from a Gaussian weight function over the spatial entropy of terms
33 | * locality weights derives from the relative position in the rank of terms based on their locality score
34 | * combine locality and spatial entropy weight to generate the final weights
35 |
36 |
Online Processing Step
37 |
38 | * Language Model based estimation (prior-estimation)
39 | * the probability of each cell is calculated
40 | 	* the Most Likely Cell (MLC) is considered to be the cell with the highest probability and is used to produce the estimation
41 |
42 | * Multiple Resolution Grids
43 | * build different language models for multiple resolution grids (side length 0.01° and 0.001°)
44 | * estimate the MLC combining the result of the individual language models
45 |
46 | * Similarity Search
47 | * determine the most similar training images within the MLC
48 | * their center-of-gravity is the final location estimation
49 |
50 |
51 |
Instructions
52 |
53 | In order to make it possible to run the project, you have to set all necessary arguments in the configuration file, following the instructions for every argument. The default values may be used.
54 |
55 |
56 | _Input File_
57 | The input files must be in the same format as the YFCC100M dataset.
58 |
59 |
60 | _Output Files_
61 | At the end of the training process, the algorithm creates a folder named `TermCellProbs` and inside the folder another folder named `scale_(s)`, named appropriately based on the scale `s` of the language model's cells. The format of this file is the following.
62 |
63 | term cell1-lon_cell1-lat>cell1-prob>cell1-users cell2-lon_cell2-lat>cell2-prob>cell2-users...
64 |
65 | `term`: the actual name of the term
66 | `cellx`: the x most probable cell.
67 | `cellx-lon_cellx-lat`: the longitude and latitude of center of the `cellx`, which is used as cell ID
68 | `cellx-prob`: the probability of the `cellx` for the specific tag
69 | `cellx-users`: the number of users that used the specific term in the `cellx`
70 |
71 | The output of the feature weighting scheme is a folder with name `Weights` containing two files one for locality weight and one for spatial entropy weights, namely `locality_weights` and `spatial_entropy_weights`, respectively. Each row contains a term and its corresponding weight, separated with a tab.
72 |
73 | The files that are described above are given as input in the Language Model estimation process. During this process, a folder named `resultsLM` and inside that folder two files named `resultsLM_scale(s)`are created, where are included the MLCs of the query images. Every row contains the imageID and the MLC (tab-separated) of the image that corresponds in the respective line in the test set. Also, a file named `resultsLM_scale(s)_conf_evid` is created in the same folder, containing the confidence and evidences that lead to estimated MLC, for every query image.
74 |
75 | Having estimated the MLCs for both granularity grids, the files are fed to the Multiple Resolution Grids technique, which produce a file named `resultsLM_mg(cs)-(fs)`, where `(cs)` and `(fs)` stands for coarser and finer granularity grid, respectively. Every row of this file contains the image id, the MLC of the coarser language model and the result of the Multiple Resolution Grids technique, separated with a `>`.
76 |
77 | In conclusion, the file that is created by the Multiple Resolution Grids technique is used for the final process of the algorithm, Similarity Search. During this process, a folder named `resultSS` is created, containing the similarity values and the locations of the images that are contained in the MLC of every image in the test set. The final results are saved in the file specified in the arguments, and the records in each row are the ID of the query image, the real longitude and latitude, and the estimated longitude and latitude, and they are tab-separated.
78 |
79 |
Evaluation Framework
80 |
81 | This package contains the implementations of the sampling strategies described in the MMCommons 2016 paper. In order to run the evaluation framework, you have to set all necessary arguments in the configuration file, following the instructions for every argument. To run the code, the Evaluation class has to be executed.
82 |
83 | Additionally, in this folder, the zip file that contains the generated collections from the different sampling strategies and the file of the building concepts can be found. Keep in mind that the geographical uniform sampling, the user uniform sampling and text diversity sampling generates different files in every code execution because they involve random selections and permutations.
84 |
85 |
Demo Version
86 |
87 | A demo version and a storm module of the approach have been developed.
88 |
89 |
Contact for further details about the project
90 |
91 | Giorgos Kordopatis-Zilos (georgekordopatis@iti.gr)
92 | Symeon Papadopoulos (papadop@iti.gr)
93 |
--------------------------------------------------------------------------------
/config.properties:
--------------------------------------------------------------------------------
1 | #Project directory
2 | dir=/home/georgekordopatis/Documents/multimedia-geotagging/images/
3 |
4 | #Processes of the program
5 | #Values:
6 | #create = create the needed sets (training and test)
7 | #train = create Cell-Tag probability file with the entropy value for each tag
8 | #FS = Feature Selection
9 | #LM = Language Model
10 | #IG = Internal Grid
11 | #SS = Similarity Search
12 | #all = all the processes
13 | process=train
14 |
15 | #Folder that contains the training files and Test set file
16 | trainFolder=/yfcc100m/
17 | testFile=/testset/2016/mediaeval2016_placing_test
18 |
19 | #Scale of Grid
20 | #side cell = 10^(-scale) (i.e. scale 2 = 0.01)
21 | coarserScale=2
22 | finerScale=3
23 |
24 | #Total number of the similar images (k) and the result files of the LM process for multiple grids (input)
25 | #required for IGSS process
26 | k=5
27 |
28 | #Name of the final Result File (output)
29 | resultFile=results_G2-3_k
--------------------------------------------------------------------------------
/eval.properties:
--------------------------------------------------------------------------------
1 | #Paths to the input Files
2 | testFile=mediaeval2015_placing_test
3 | placeFile=mediaeval2015_placing_test_places
4 | conceptFile=mediaeval2015_placing_test_autotags
5 | resultFile=results
6 |
7 | #Sampling Strategy
8 | #GUS <-- Geographical Uniform Sampling
9 | #UUS <-- User Uniform Sampling
10 | #TBS <-- Text-based Sampling
11 | #TDS <-- Text Diversity Sampling
12 | #GFS <-- Geographically Focused Sampling
13 | #ABS <-- Ambiguity-based Sampling
14 | #VS <-- Visual Sampling
15 | #BS <-- Building Sampling
16 | #(Empty) <-- No sampling
17 | sampling=GUS
18 |
19 | #Minimum and Maximum precision range
20 | #precisionrange = 10^(scale) (i.e. scale -1 --> range 0.1km)
21 | minRangeScale=-2
22 | maxRangeScale=3
23 |
--------------------------------------------------------------------------------
/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set up logging to include a file record of the output
2 | # Note: the file is always created, even if there is
3 | # no actual output.
4 | log4j.rootLogger=info, stdout, R
5 |
6 | # Log format to standard out
7 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
8 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.stdout.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n
10 |
11 | # File based log output
12 | log4j.appender.R=org.apache.log4j.RollingFileAppender
13 | log4j.appender.R.File=testout.log
14 | log4j.appender.R.MaxFileSize=100000KB
15 | log4j.appender.R.encoding=UTF-8
16 | # Keep one backup file
17 | log4j.appender.R.MaxBackupIndex=1
18 | log4j.appender.R.layout=org.apache.log4j.PatternLayout
19 | log4j.appender.R.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | 4.0.0
5 | gr.iti.mklab
6 | multimedia-geotagging
7 | 0.1-SNAPSHOT
8 | jar
9 |
10 | multimedia-geotagging
11 | https://github.com/socialsensor/multimedia-geotagging
12 | Contains the implementation of algorithms that estimate the geographic location of media content based on their content and metadata.
13 |
14 |
15 |
16 | gkordo
17 | Giorgos Kordopatis-Zilos
18 | georgekordopatis@iti.gr
19 |
20 |
21 |
22 |
23 |
24 | The Apache Software License, Version 2.0
25 | http://www.apache.org/licenses/LICENSE-2.0.txt
26 | repo
27 |
28 |
29 |
30 |
31 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git
32 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git
33 | git@github.com:socialsensor/multimedia-geotagging.git
34 |
35 |
36 |
37 | UTF-8
38 |
39 |
40 |
41 |
42 |
43 | junit
44 | junit
45 | 3.8.1
46 | test
47 |
48 |
49 |
50 | log4j
51 | log4j
52 | 1.2.16
53 |
54 |
55 |
56 | org.apache.hadoop
57 | hadoop-core
58 | 1.2.1
59 |
60 |
61 |
62 | org.apache.commons
63 | commons-math3
64 | 3.4.1
65 |
66 |
67 |
68 | info.debatty
69 | java-lsh
70 | 0.10
71 |
72 |
73 |
74 | net.sf.geographiclib
75 | GeographicLib-Java
76 | 1.42
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 | org.apache.maven.plugins
85 | maven-compiler-plugin
86 | 2.5.1
87 |
88 | 1.6
89 | 1.6
90 |
91 |
92 |
93 |
94 | org.apache.maven.plugins
95 | maven-source-plugin
96 | 2.2.1
97 |
98 |
99 | attach-sources
100 |
101 | jar
102 |
103 |
104 |
105 |
106 |
107 | org.apache.maven.plugins
108 | maven-javadoc-plugin
109 | 2.9.1
110 |
111 |
112 | attach-javadocs
113 |
114 | jar
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/samples/building_concepts.txt:
--------------------------------------------------------------------------------
1 | flying buttress
2 | brussels carpet
3 | capitol
4 | rose window
5 | abbey
6 | coliseum
7 | nave
8 | cathedral
9 | pantheon
10 | chateau
11 | belfry
12 | gothic
13 | temple
14 | aisle
15 | pointed arch
16 | rotunda
17 | organ loft
18 | onion dome
19 | palace
20 | bastion
21 | campanile
22 | cloister
23 | dome
24 | clock tower
25 | roman arch
26 | round arch
27 | amphitheater
28 | church
29 | facade
30 | frieze
31 | ceiling
32 | ballpark
33 | gargoyle
34 | colonnade
35 | manor
36 | altar
37 | battlement
38 | corbel
39 | castle
40 | brownstone
41 | mansion
42 | fortification
43 | pediment
44 | row house
45 | pedestal
46 | acropolis
47 | apartment
48 | building complex
49 | skyscraper
50 | stronghold
51 | monument
52 | fortress
53 | great hall
54 | tower
55 | drawbridge
56 | arch
57 | portico
58 | stadium
59 | field house
60 | condominium
61 | fort
62 | steeple
63 | steel arch bridge
64 | memorial
65 | column
66 | gable
67 | stained
68 | dome building
69 | watchtower
70 | marina
71 | city
72 | support column
73 | concrete
74 | cantilever bridge
75 | building
76 | roof
77 | door knocker
78 | building structure
79 | department store
80 | cityscape
81 | bazaar
82 | casino
83 | baluster
84 | auditorium
85 | hall
86 | truss
87 | brickwork
88 | assembly hall
89 | harbor
90 | radome
91 | architecture
92 | warehouse
93 | chandelier
94 | house
95 | window box
96 | ruins
97 | greenhouse
98 | stairwell
99 | window
100 | lighthouse
101 | mezzanine
102 | country house
103 | library
104 | stairs
105 | bookshop
106 | waterfront
107 | cemetery
108 | villa
109 | rafter
110 | stoop
111 | resort
112 | brick
113 | bannister
114 | mantel
115 | wall
116 | loft
117 | shelter
118 | cafeteria
119 | farmhouse
120 | cabin
121 |
--------------------------------------------------------------------------------
/samples/samples.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/socialsensor/multimedia-geotagging/08a434ca3f6f11a15824e391b50a53f011d24159/samples/samples.zip
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/data/GeoCell.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.data;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 | import java.util.Map.Entry;
6 |
7 | import gr.iti.mklab.util.Utils;
8 |
9 | /**
10 | * Class that implements the earth cells.
11 | * @author gkordo
12 | *
13 | */
14 | public class GeoCell {
15 |
16 | private Double totalProb;
17 | private String id;
18 | private Float confidence;
19 | private Map evidence;
20 |
21 | /**
22 | * Constructor of the class where the id is specified and the
23 | * evidence and the summation of the probabilities are initialized.
24 | * @param id : cell ID
25 | */
26 | public GeoCell(String id){
27 | this.id = id;
28 | this.evidence = new HashMap();
29 | this.totalProb = 0.0;
30 | }
31 |
32 | /**
33 | *
34 | * @return the cell ID
35 | */
36 | public String getID(){
37 | return id;
38 | }
39 |
40 | /**
41 | * Set the value of the confidence of choosing that cell.
42 | * @param confidence : value of confidence
43 | */
44 | public void setConfidence(Float confidence){
45 | this.confidence = confidence;
46 | }
47 |
48 | /**
49 | *
50 | * @return the confidence of the cell
51 | */
52 | public Float getConfidence(){
53 | return confidence;
54 | }
55 |
56 | /**
57 | *
58 | * @return the summation of all probabilities
59 | */
60 | public Double getTotalProb() {
61 | return totalProb;
62 | }
63 |
64 | /**
65 | * Add the given probability to the summation and store the word.
66 | * @param prob : probability of the word
67 | * @param word : actual word
68 | */
69 | public void addProb(double prob, String word) {
70 | totalProb += prob;
71 | this.evidence.put(word, (float) prob);
72 | }
73 |
74 | /**
75 | *
76 | * @return the sorted map of the word and their probabilities
77 | */
78 | public Map getEvidence(){
79 | Map unsortMap = new HashMap();
80 | for(Entry word:evidence.entrySet()){
81 | if(word.getValue()/totalProb>0.0001){
82 | unsortMap.put(word.getKey(), (float) (word.getValue()/totalProb));
83 | }
84 | }
85 | return Utils.sortByValues(unsortMap);
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/data/ImageMetadata.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.data;
2 |
3 | import java.util.Set;
4 |
5 | /**
6 | * The class that contains the metadata of an image.
7 | * @author gkordo
8 | *
9 | */
/**
 * The class that contains the metadata of an image.
 * Holds the identifiers and tags read from the dataset file, plus the cells
 * assigned to the image during the estimation process.
 * @author gkordo
 *
 */
public class ImageMetadata{

	// unique ID of the image
	private String imageID;
	// cell IDs assigned to the image by the estimation process
	private String predictedCell,coarserCell;
	// ID of the user that uploaded the image
	private String userID;
	// textual terms (tags) associated with the image
	private Set<String> tags;

	/**
	 * Constructor using the metadata provided by the dataset file
	 * @param id : image ID
	 * @param userID : user ID
	 * @param tags : image tags
	 */
	public ImageMetadata (String id, String userID, Set<String> tags) {
		this.imageID = id;
		this.userID = userID;
		this.tags = tags;
	}

	public String getId () {
		return imageID;
	}

	public String getUserId () {
		return userID;
	}

	public Set<String> getTags () {
		return tags;
	}

	public void setPredictedCell (String cell){
		this.predictedCell = cell;
	}

	public void setCoarserCell (String cell){
		this.coarserCell = cell;
	}

	public String getCell () {
		return predictedCell;
	}

	public String getCoarserCell () {
		return coarserCell;
	}
}
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/LanguageModel.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.data.GeoCell;
4 | import gr.iti.mklab.tools.DataManager;
5 | import gr.iti.mklab.util.EasyBufferedReader;
6 | import gr.iti.mklab.util.Utils;
7 | import gr.iti.mklab.util.Progress;
8 |
9 | import java.util.HashMap;
10 | import java.util.Map;
11 | import java.util.Set;
12 | import java.util.Map.Entry;
13 |
14 | import org.apache.log4j.Logger;
15 |
16 | /**
17 | * This class is the core of the algorithm. It is the implementation of the language model.
18 | * The Most Likely Cell of the given image is calculated.
19 | * @author gkordo
20 | *
21 | */
22 | public class LanguageModel {
23 |
24 | protected Map selectedTermWeights;
25 |
26 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.LanguageModel");
27 |
28 | // The function that compose the other functions to calculate and
29 | // return the Most Likely Cell (MLC) for a query item.
30 | public GeoCell calculateLanguageModel(Set sentenceWords,
31 | Map> termCellProbsMap, boolean confidenceFlag) {
32 |
33 | Map cellMap = calculateCellsProbForImageTags(sentenceWords,
34 | termCellProbsMap);
35 |
36 | GeoCell mlc = findMLC(cellMap, confidenceFlag);
37 |
38 | return mlc;
39 | }
40 |
41 | // find the Most Likely Cell.
42 | private GeoCell findMLC(
43 | Map cellMap, boolean confidenceFlag) {
44 |
45 | cellMap = Utils.sortByMLCValues(cellMap);
46 |
47 | GeoCell mlc = null;
48 |
49 | if (!cellMap.isEmpty()){
50 | String mlcId = cellMap.keySet().toArray()[0].toString();
51 |
52 | mlc = cellMap.get(mlcId);
53 |
54 | if(confidenceFlag)
55 | mlc.setConfidence((float) calculateConfidence(cellMap, mlcId, 0.3));
56 | }
57 |
58 | return mlc;
59 | }
60 |
61 | // calculate confidence for the estimated location
62 | private static double calculateConfidence(Map cellMap,
63 | String mlc, double l) {
64 |
65 | Double sum = 0.0, total = 0.0;
66 |
67 | for(Entry entry:cellMap.entrySet()){
68 | double[] mCell = {Double.parseDouble(mlc.split("_")[0]),
69 | Double.parseDouble(mlc.split("_")[1])};
70 | double[] cell = {Double.parseDouble(entry.getKey().split("_")[0]),
71 | Double.parseDouble(mlc.split("_")[1])};
72 | if((cell[0] >= (mCell[0]-l)) && (cell[0] <= (mCell[0]+l))
73 | && (cell[1] >= (mCell[1]-l)) && (cell[1] <= (mCell[1]+l))){
74 | sum += entry.getValue().getTotalProb();
75 | }
76 | total += entry.getValue().getTotalProb();
77 | }
78 | return sum/total;
79 | }
80 |
81 | /**
82 | * This is the function that calculate the cell probabilities.
83 | * @param sentenceWords : list of words contained in tweet text
84 | * @return a map of cell
85 | */
86 | private Map calculateCellsProbForImageTags (Set terms,
87 | Map> termCellProbsMap) {
88 |
89 | Map cellMap = new HashMap();
90 |
91 | for(String term:terms){
92 | if(termCellProbsMap.containsKey(term)){
93 | double locality= selectedTermWeights.get(term)[1];
94 | double entropy= selectedTermWeights.get(term)[0];
95 |
96 | for(Entry entry: termCellProbsMap.get(term).entrySet()){
97 | String cell = entry.getKey();
98 | if(cellMap.containsKey(cell)){
99 | cellMap.get(cell).addProb(entry.getValue()
100 | *(0.8*locality+0.2*entropy), term);
101 | }else{
102 | GeoCell tmp = new GeoCell(cell);
103 | tmp.addProb(entry.getValue()
104 | *(0.8*locality+0.2*entropy), term);
105 | cellMap.put(cell,tmp);
106 | }
107 | }
108 | }
109 | }
110 | return cellMap;
111 | }
112 |
113 | /**
114 | * Initialize Language Model
115 | * @param testFile : file that contains test image metadata
116 | * @param probFile : file that contains the term-cell probabilities
117 | * @param weightFolder : the folder that contains the term weights
118 | * @return the term-cell probability map
119 | */
120 | public Map> loadTermCellProbsAndWeights(
121 | String testFile, String probFile, String weightFolder){
122 |
123 | // Feature Selection
124 | loadTermWeights(weightFolder);
125 |
126 | logger.info("loading cells' probabilities for all tags from " + probFile);
127 |
128 | long startTime = System.currentTimeMillis();
129 | Progress prog = new Progress(startTime,10,1,"loading",logger);
130 |
131 | Map> tagCellProbsMap =
132 | new HashMap>();
133 | Set termsInTestSet = DataManager.getSetOfTerms(testFile);
134 |
135 | EasyBufferedReader reader = new EasyBufferedReader(probFile);
136 | String line;
137 | // load tag-cell probabilities from the given file
138 | while ((line = reader.readLine())!=null){
139 | prog.showMessege(System.currentTimeMillis());
140 | String term = line.split("\t")[0];
141 |
142 | if(line.split("\t").length>1 && termsInTestSet.contains(term)
143 | && selectedTermWeights.containsKey(term)){
144 | Map tmpCellMap = new HashMap();
145 | for(String cell:line.split("\t")[2].split(" ")){
146 | tmpCellMap.put(cell.split(">")[0],
147 | Double.parseDouble(cell.split(">")[1]));
148 | }
149 | tagCellProbsMap.put(term, tmpCellMap);
150 | }
151 | }
152 | logger.info(tagCellProbsMap.size() + " tags loaded in " +
153 | (System.currentTimeMillis()-startTime)/1000.0 + "s");
154 | reader.close();
155 |
156 | return tagCellProbsMap;
157 | }
158 |
159 | private void loadTermWeights(String folder){
160 |
161 | // load locality weight of the terms
162 | EasyBufferedReader reader = new
163 | EasyBufferedReader(folder + "/locality_weights");
164 | String line;
165 | while ((line = reader.readLine())!=null){
166 | Double[] temp = {0.0, Double.parseDouble(line.split("\t")[1])};
167 | selectedTermWeights.put(line.split("\t")[0], temp);
168 | }
169 | reader.close();
170 |
171 | // load spatial entropy weight of the terms
172 | reader = new EasyBufferedReader(
173 | folder + "/spatial_entropy_weights");
174 | while ((line = reader.readLine())!=null){
175 | if(selectedTermWeights.containsKey(line.split("\t")[0]))
176 | selectedTermWeights.get(line.split("\t")[0])[0] =
177 | Double.parseDouble(line.split("\t")[1]);
178 | }
179 | reader.close();
180 | }
181 | }
182 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/MultipleGrid.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.util.EasyBufferedReader;
4 | import gr.iti.mklab.util.EasyBufferedWriter;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | /**
9 | * The implementation of the Internal Grid technique
10 | * @author gkordo
11 | *
12 | */
13 | public class MultipleGrid {
14 |
15 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.InternalGrid");
16 |
17 | /**
18 | * Method that perform the Multiple Grid technique and generates
19 | * the arguments for the similarity search Class contractor
20 | * @param dir : directory of the project
21 | * @param resultFile : name of the output file
22 | * @param resultCorserGrid : file with the estimated cells of the coarser grid
23 | * @param resultFinerGrid : file with the estimated cells of the finer grid
24 | */
25 | public static void determinCellIDsForSS(String dir, String resultFile,
26 | String resultCorserGrid, String resultFinerGrid){
27 |
28 | logger.info("Process: Multiple Grid Technique\t|\t"
29 | + "Status: INITIALIZE");
30 | // Initialize parameters
31 | EasyBufferedReader resultLMGCReader = new EasyBufferedReader(dir + resultCorserGrid);
32 | EasyBufferedReader resultLMGFReader = new EasyBufferedReader(dir + resultFinerGrid);
33 | EasyBufferedWriter writer = new EasyBufferedWriter(dir + resultFile);
34 |
35 | String corseMLC;
36 | String fineMLC;
37 |
38 | logger.info("Process: Multiple Grid Technique\t|\t"
39 | + "Status: STARTED");
40 |
41 | while ((corseMLC=resultLMGCReader.readLine())!=null
42 | && (fineMLC=resultLMGFReader.readLine())!=null){
43 |
44 | if(!corseMLC.split("\t")[1].equals("N/A")){
45 | String mlc = deterimBoarders(corseMLC.split("\t")[1], fineMLC.split("\t")[1]);
46 | if(!mlc.isEmpty()){
47 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1]
48 | + ":" + mlc); // selected cell ID and the sell of the coarser granularity
49 | }else{
50 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1]
51 | + ":" + corseMLC.split("\t")[1]);
52 | }
53 | writer.newLine();
54 | } else{
55 | writer.write(corseMLC.split("\t")[0] + "\tN/A");
56 | }
57 | }
58 |
59 | logger.info("Process: Multiple Grid Technique\t|\t"
60 | + "Status: COMPLETED");
61 |
62 | writer.close();
63 | resultLMGCReader.close();
64 | resultLMGFReader.close();
65 | }
66 |
67 | /**
68 | * Method that determines the borders of the cell that similarity search will take place
69 | * @param corseMLC : estimated cell of the coarser grid
70 | * @param fineMLC : estimated cell of the finer grid
71 | */
72 | private static String deterimBoarders(String corseMLC, String fineMLC){
73 |
74 | String mlc = corseMLC;
75 |
76 | if (!corseMLC.equals("N/A")){
77 | Double[] corseLatLon = {Double.parseDouble(corseMLC.split("_")[0]),
78 | Double.parseDouble(corseMLC.split("_")[1])};
79 |
80 | if(!fineMLC.equals("N/A")){
81 | Double[] fineLatLon = {Double.parseDouble(fineMLC.split("_")[0]),
82 | Double.parseDouble(fineMLC.split("_")[1])};
83 |
84 | // check whether the estimated cell of the finer grid laying
85 | // inside the borders of the estimated cell of the coarser grid
86 | if(fineLatLon[0]>=(corseLatLon[0]-0.005)
87 | && fineLatLon[0]<=(corseLatLon[0]+0.005)
88 | && fineLatLon[1]>=(corseLatLon[1]-0.005)
89 | && fineLatLon[1]<=(corseLatLon[1]+0.005)){
90 | mlc = fineMLC;
91 | }
92 | }
93 | }
94 |
95 | return mlc;
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/SimilaritySearch.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.tools.CenterOfGravity;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | import gr.iti.mklab.util.Progress;
14 | import gr.iti.mklab.util.EasyBufferedReader;
15 |
16 | /**
17 | * Class that estimates the final location for every query image
18 | * @author gkordo
19 | *
20 | */
public class SimilaritySearch extends CenterOfGravity{

	// query image ID -> estimated cell string from the multiple-grid file
	private Map estimatedCellMap = new HashMap();
	// query image ID -> estimated location string produced by findSimilarImages
	private Map similarities = new HashMap();
	private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch");

	/**
	 * Constructor of the class; runs the whole estimation pipeline:
	 * load the multiple-grid cells, estimate a location per query image,
	 * and write the results to the output file.
	 * @param testFile : file that contains the test images' metadata
	 * @param multipleGridFile : file that contains the results of the multiple grid technique
	 * @param similarityFile : file that contains the similar images of every query image
	 * @param outputFile : name of the output file
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 * @param a : variable required for center-of-gravity calculation
	 */
	public SimilaritySearch(String testFile,String multipleGridFile,
			String similarityFile, String outputFile, int k, int a) {
		super(a);

		logger.info("Process: Location Estimation\t|\t"
				+ "Status: INITIALIZE");
		loadEstimatedCells(multipleGridFile);
		logger.info("Process: Location Estimation\t|\t"
				+ "Status: STARTED");
		estimateLocation(similarityFile,k);
		writeResultsInFile(testFile, outputFile);
		logger.info("Process: Location Estimation\t|\t"
				+ "Status: COMPLETED");
	}

	/**
	 * Loads the estimated cells produced by the Multiple Grid technique.
	 * Query images whose second column is "N/A" are skipped, so they never
	 * enter estimatedCellMap (and later fall back to the default location).
	 * @param multipleGridFile : file that contains the results of the multiple grid technique
	 */
	private void loadEstimatedCells(String multipleGridFile) {

		EasyBufferedReader reader = new EasyBufferedReader(multipleGridFile);

		String line;
		while ((line = reader.readLine())!=null){
			if((!line.split("\t")[1].equals("N/A"))){
				estimatedCellMap.put(line.split("\t")[0], line.split("\t")[1]);
			}
		}

		reader.close();
	}

	/**
	 * Final location estimation of the images contained in the test set;
	 * one line of the similarity file corresponds to one query image.
	 * @param similarityFile : file that contains the similar images of every query image
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 */
	private void estimateLocation(String similarityFile, int k) {

		EasyBufferedReader reader = new EasyBufferedReader(similarityFile);

		Progress prog = new Progress(System.currentTimeMillis(), 1000000, 100, 1, "calculate", logger);
		int count=0;
		String line;

		// Calculate the final results; only images that got a cell from the
		// multiple-grid stage are estimated here
		while ((line = reader.readLine())!=null){
			prog.showProgress(count, System.currentTimeMillis());
			if(estimatedCellMap.containsKey(line.split("\t")[0])){
				similarities.put(line.split("\t")[0],
						findSimilarImages(line, estimatedCellMap.get(line.split("\t")[0]), k));
			}
			count++;
		}
		reader.close();
	}

	/**
	 * Location estimation for a single query image from its most similar
	 * train images and the cell selected by the multiple-grid technique.
	 *
	 * NOTE(review): this method is garbled in the extracted source -- the
	 * generic type parameters and (apparently) the "size() < k" comparisons
	 * were lost during extraction, fusing several lines around the missing
	 * '<' characters. The code is kept byte-identical below; reconstruct it
	 * from version control before attempting any behavioral change.
	 * @param line : line that contains the similarities of the train images
	 * @param cells : estimated cells from the multiple grid technique
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 * @return estimated location
	 */
	private static String findSimilarImages(String line, String cells, int k){

		List images = new ArrayList();
		Collections.addAll(images, line.split("\t")[1].split(" "));

		Map similarity = new HashMap(k);
		Map similarityCoarser = new HashMap(k);

		boolean flag = false;
		Double[] result = new Double[2];

		// final estimation
		// NOTE(review): garbled region -- see method comment
		for(String image:images){
			if(similarity.size()")[0].equals(cells.split(">")[1])){
				if(deterimCell(image.split(">")[0],cells)){
					similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1]));
				}else if(similarityCoarser.size()")[0], Double.parseDouble(image.split(">")[1]));
				}
			}else {
				similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1]));
			}
		}else{
			flag = true;
			result = computeCoordination(similarity);
			break;
		}
	}

		// fall back to whichever similarity set is non-empty when the loop
		// finished without triggering the center-of-gravity computation
		if(similarity.size()>0 && !flag){
			flag = true;
			result = computeCoordination(similarity);
		}else if(similarityCoarser.size()>0 && !flag){
			flag = true;
			result = computeCoordination(similarityCoarser);
		}

		// final return: computed coordinates, or the center of the selected
		// cell when no similar image qualified
		if(flag){
			return result[1] + "\t" + result[0];
		}else{
			return cells.split(">")[0].replace("_", "\t");
		}
	}

	/**
	 * Function that determines if the given point lies inside a given cell
	 * (a square window of half-side 0.0005 around the cell center).
	 * @param point : coordinate pair encoded as "coord_coord"
	 * @param cell : grid's cell, same encoding
	 * @return true when the point lies inside the cell window
	 */
	private static boolean deterimCell(String point, String cell){

		boolean cellID = false;

		Double[] pointLoc = {Double.parseDouble(point.split("_")[0]), Double.parseDouble(point.split("_")[1])};
		Double[] cellLoc = {Double.parseDouble(cell.split("_")[0]), Double.parseDouble(cell.split("_")[1])};

		if((pointLoc[0]>=(cellLoc[0]-0.0005)) && (pointLoc[0]<=(cellLoc[0]+0.0005))
				&&(pointLoc[1]>=(cellLoc[1]-0.0005)) && (pointLoc[1]<=(cellLoc[1]+0.0005))){
			cellID = true;
		}

		return cellID;
	}

	/**
	 * Function that writes the results in a file, one line per test image.
	 * NOTE(review): no tab is written between column 0 and column 1 of the
	 * output (both branches) -- confirm this matches the expected format.
	 * NOTE(review): the fallback coordinates below are a fixed point
	 * (presumably a dataset prior) used when no estimation exists -- verify.
	 * @param testFile : file that contains the test images' metadata
	 * @param outputFile : name of the output file
	 */
	private void writeResultsInFile(String testFile, String outputFile) {

		EasyBufferedReader reader = new EasyBufferedReader(testFile);
		EasyBufferedWriter writer = new EasyBufferedWriter(outputFile);

		String line;
		// for every query image
		while ((line = reader.readLine())!=null){

			writer.write(line.split("\t")[0]);

			if(similarities.containsKey(line.split("\t")[0])){ // the location has been estimated
				writer.write(line.split("\t")[1] + "\t" +
						line.split("\t")[12] + "\t" + line.split("\t")[13] + "\t" +
						similarities.get(line.split("\t")[0]));
				writer.newLine();
			}else{ // no estimation -> fixed fallback location
				writer.write(line.split("\t")[1] + "\t" +
						line.split("\t")[12] + "\t" + line.split("\t")[13]
								+ "\t-73.98282136256299\t40.75282028252674");
				writer.newLine();
			}
		}
		reader.close();
		writer.close();
	}
}
201 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/TermCellProbs.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.tools.InterfaceTermCellProb;
4 | import gr.iti.mklab.util.Utils;
5 | import gr.iti.mklab.util.TextUtil;
6 |
7 | import java.io.File;
8 | import java.io.IOException;
9 | import java.math.BigDecimal;
10 | import java.util.*;
11 | import java.util.Map.Entry;
12 |
13 | import org.apache.commons.io.FileUtils;
14 | import org.apache.hadoop.fs.Path;
15 | import org.apache.hadoop.io.*;
16 | import org.apache.hadoop.mapred.*;
17 | import org.apache.log4j.Logger;
18 |
19 | /**
20 | * Class that calculate the term-cell probabilities for all term in all cells and saves the results in file.
21 | * The implementation employ hadoop map-reduce function.
22 | * @author gkordo
23 | *
24 | */
25 | public class TermCellProbs implements InterfaceTermCellProb{
26 |
27 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.TermCellProbCalculator");
28 | private static Set testIDs;
29 | private static Set users;
30 | private static int scale;
31 |
32 | /**
33 | * Contractor of the class get the set of image IDs and the user IDs of the images in the test set.
34 | * @param testIDs : set of test image IDs
35 | * @param users : set of test user IDs
36 | */
37 | public TermCellProbs(Set testIDs, Set users){
38 | TermCellProbs.testIDs = testIDs;
39 | TermCellProbs.users = users;
40 | }
41 |
42 | /**
43 | * Map class that takes the lines of the train file as input and creates key-value pairs,
44 | * using as keys the terms contained in the images and as values strings that contain
45 | * the information regarding the cell and user ID.
46 | * @author gkordo
47 | *
48 | */
49 | public static class MapTermCellProb extends MapReduceBase implements Mapper {
50 |
51 | /**
52 | * Required map function
53 | * @param key : key value
54 | * @param value : input string
55 | * @param output : output collector
56 | * @param reporter : reporter of the job
57 | */
58 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
59 |
60 | String[] metadata = value.toString().split("\t");
61 |
62 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
63 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
64 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
65 |
66 | // get image cell based on its latitude-longitude pair
67 | BigDecimal cellLonCenter = new BigDecimal(Double.parseDouble(
68 | metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP);
69 | BigDecimal cellLatCenter = new BigDecimal(Double.parseDouble(
70 | metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP);
71 |
72 | String cellID = cellLonCenter+"_"+cellLatCenter;
73 |
74 | //get image user ID
75 | String userID = metadata[3];
76 |
77 | // get image tags
78 | Set terms = new HashSet();
79 | TextUtil.parse(metadata[10], terms);
80 | TextUtil.parse(metadata[8], terms);
81 |
82 | for(String term:terms){
83 | if(!term.isEmpty() && term.length() > 2){
84 | output.collect(new Text(term), new Text(cellID+">"+userID)); // key-value pair
85 | }
86 | }
87 | }
88 | }
89 | }
90 |
91 |
92 | /**
93 | * Reduce class that get the key-value pairs and calculate the term-cell probabilities of every term.
94 | * @author gkordo
95 | *
96 | */
97 | public static class ReduceTermCellProb extends MapReduceBase implements Reducer {
98 |
99 | /**
100 | * Required reduce function
101 | * @param key : key value
102 | * @param values : set of values that share the same key
103 | * @param output : output collector
104 | * @param reporter : reporter of the job
105 | */
106 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
107 |
108 | // frequency map that contains the count of the different users for every single cell
109 | Map> termFreq = new HashMap>();
110 | int Nt = 0; // total user count
111 |
112 | // process every value that corresponds to a specific key
113 | while (values.hasNext()) {
114 |
115 | String entry = values.next().toString();
116 |
117 | // retrieve cell ID and user ID from the value of the pair
118 | String cellID = entry.split(">")[0];
119 | String userID = entry.split(">")[1];
120 |
121 | // update of the frequency map
122 | if (termFreq.containsKey(cellID)){
123 | if(!termFreq.get(cellID).contains(userID)){
124 | Nt++;
125 | termFreq.get(cellID).add(userID);
126 | }
127 | }else{
128 | Nt++;
129 | termFreq.put(cellID,new HashSet());
130 | termFreq.get(cellID).add(userID);
131 | }
132 | }
133 |
134 | // calculation of the tag-cell probabilities map for every cell
135 | Map cellsProbs = new HashMap();
136 | for(Entry> entryCell : termFreq.entrySet()){
137 | String cellID = entryCell.getKey();
138 | Double cellProb = ((double)(entryCell.getValue().size()))/Nt;
139 | cellsProbs.put(cellID,cellProb);
140 | }
141 |
142 | // sorting of the tag-cell probabilities map
143 | Map cellsProbsSorted = Utils.sortByValues(cellsProbs);
144 |
145 | // convert tag-cell probabilities map in string in order to be saved in the output file
146 | String out = convertMapToString(cellsProbsSorted,termFreq);
147 |
148 | // send output to collector
149 | output.collect(key, new Text(out));
150 | }
151 |
152 | /**
153 | * Function that convert tag-cell probabilities map in output string.
154 | * @param cellsProbs : tag-cell probabilities map
155 | * @param termFreq : frequency map
156 | * @return a string contains cell IDs accompanied with tag-cell probabilities
157 | */
158 | public static String convertMapToString(Map cellsProbs,
159 | Map> termFreq){
160 | String out = "";
161 | for(Entry entryCell: cellsProbs.entrySet()){
162 | if(cellsProbs.get(entryCell.getKey()) >= 0.00001){
163 | String tempCellIDProb = entryCell.getKey()
164 | + ">" + cellsProbs.get(entryCell.getKey())
165 | + ">" + termFreq.get(entryCell.getKey()).size();
166 |
167 | out += (tempCellIDProb + " ");
168 | }
169 | }
170 | return out.trim();
171 | }
172 | }
173 |
174 | /**
175 | * Core function for the job of tag-cell probabilities calculation.
176 | * @param dir : directory of the project
177 | * @param trainFolder : the file of the train set
178 | * @param outFolder : the folder where the tag-set probabilities file will be stored
179 | * @param scale : the scale of the grid that is used
180 | */
181 | public void calculatorTermCellProb(String dir, String trainFolder,
182 | String outFolder, int scale) throws IOException{
183 |
184 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
185 | + "Status: INITIALIZE");
186 |
187 | TermCellProbs.scale = scale;
188 |
189 | // initialize Job
190 | JobConf conf = new JobConf(TermCellProbs.class);
191 | conf.setJobName("termcellprobmapred");
192 |
193 | conf.setOutputKeyClass(Text.class);
194 | conf.setOutputValueClass(Text.class);
195 |
196 | conf.setMapperClass(MapTermCellProb.class);
197 | conf.setReducerClass(ReduceTermCellProb.class);
198 |
199 | conf.setInputFormat(TextInputFormat.class);
200 | conf.setOutputFormat(TextOutputFormat.class);
201 |
202 | // clean the output file directory
203 | File folder = new File(dir + outFolder);
204 | if (folder.exists()) {
205 | FileUtils.cleanDirectory(folder);
206 | FileUtils.forceDelete(folder);
207 | }
208 |
209 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
210 | FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder));
211 |
212 | // start Job
213 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
214 | + "Status: STARTED");
215 | long startTime = System.currentTimeMillis();
216 | JobClient.runJob(conf);
217 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
218 | + "Status: COMPLETED\t|\tTotal time: " +
219 | (System.currentTimeMillis()-startTime)/60000.0+"m");
220 |
221 | new File(dir + outFolder + "/part-00000").renameTo(
222 | new File(dir + outFolder + "/term_cell_probs")); // rename the output file
223 | }
224 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/metrics/Entropy.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.metrics;
2 |
3 | import gr.iti.mklab.util.EasyBufferedReader;
4 | import gr.iti.mklab.util.EasyBufferedWriter;
5 | import gr.iti.mklab.util.Utils;
6 |
7 | import java.io.File;
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import org.apache.commons.math3.distribution.NormalDistribution;
13 | import org.apache.commons.math3.stat.descriptive.moment.Mean;
14 | import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
15 | import org.apache.log4j.Logger;
16 |
17 | /**
18 | * Entropy class update the file that contains the tag-cell probabilities with the spatial entropy of every individual tag.
19 | * Calculate the spatial tag entropy for all of the tags. Entropy is used for feature weighting.
20 | * @author gkordo
21 | *
22 | */
23 | public class Entropy {
24 |
25 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.Entropy");
26 |
27 | /**
28 | * Calculate the Spatial Entropy weights of the LM terms
29 | * @param dir : project directory
30 | * @param fileTermCell : Term-Cell probability file
31 | */
32 | public static void calculateEntropyWeights(String dir, String fileTermCell){
33 |
34 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
35 | + "Status: INITIALIZE");
36 |
37 | new File(dir + "Weights").mkdir();
38 |
39 | // Term Spatial Entropy calculation
40 | EasyBufferedReader reader = new EasyBufferedReader(dir + fileTermCell);
41 | Map termSpatialEntropy = new HashMap();
42 | long sTime = System.currentTimeMillis();
43 | String line;
44 | while ((line=reader.readLine())!=null){
45 | String term = line.split("\t")[0];
46 | String[] cells = line.split("\t")[1].split(" ");
47 | if(cells.length > 1
48 | && term.length() > 3){
49 | termSpatialEntropy.put(term,
50 | computeEntropyNaive(cells));
51 | }
52 | }
53 | reader.close();
54 |
55 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
56 | + "Status: STARTED");
57 |
58 | // Spatial Entropy weights calculation of terms
59 | Map weights = calculateSpatialEntropyWeights(termSpatialEntropy);
60 |
61 | // store weights
62 | EasyBufferedWriter writer = new EasyBufferedWriter(
63 | dir + "Weights/spatial_entropy_weights");
64 | for(Entry term:weights.entrySet()){
65 | writer.write(term.getKey() + "\t" + term.getValue());
66 | writer.newLine();
67 | }
68 |
69 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
70 | + "Status: COMPLETED\t|\tTotal time: " +
71 | (System.currentTimeMillis()-sTime)/1000.0 + "s");
72 | writer.close();
73 | }
74 |
75 | /**
76 | * Shannon entropy formula
77 | * @param probabilities : probability distribution
78 | * @return
79 | */
80 | private static double computeEntropyNaive(String[] probabilities) {
81 | double entropy = 0.0;
82 | for (int i=0;i< probabilities.length;i++) {
83 | double p = Double.parseDouble(probabilities[i].split(">")[1]);
84 | if(p != 0.0){
85 | entropy -= p * Math.log(p);
86 | }
87 | }
88 | return entropy;
89 | }
90 |
91 | /**
92 | * Calculate the max probability value applying the Gaussian functionon the
93 | * probability distribution
94 | * @param entropies : spatial entropy values of the terms
95 | * @return max weight
96 | */
97 | private static Map calculateSpatialEntropyWeights(
98 | Map entropies){
99 |
100 | double[] termSpatialEntropyValues = entropies
101 | .values().stream().mapToDouble(d -> d).toArray();
102 |
103 | NormalDistribution gd = new NormalDistribution( // Gaussian function for re-weighting
104 | new Mean().evaluate(termSpatialEntropyValues),
105 | new StandardDeviation().evaluate(termSpatialEntropyValues));
106 |
107 | Double gdMax = 0.0;
108 | Map weights = new HashMap();
109 | for(Entry p:entropies.entrySet()){
110 | double weight = gd.density(p.getValue());
111 | weights.put(p.getKey(), weight);
112 | if(gdMax < weight){
113 | gdMax = weight;
114 | }
115 | }
116 |
117 | for(Entry term:weights.entrySet()){
118 | term.setValue(term.getValue()/gdMax);
119 | }
120 |
121 | return Utils.sortByValues(weights);
122 | }
123 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/metrics/Locality.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.metrics;
2 |
3 | import gr.iti.mklab.tools.DataManager;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.EasyBufferedWriter;
6 | import gr.iti.mklab.util.TextUtil;
7 | import gr.iti.mklab.util.Utils;
8 |
9 | import java.io.File;
10 | import java.io.IOException;
11 | import java.math.BigDecimal;
12 | import java.util.HashMap;
13 | import java.util.HashSet;
14 | import java.util.Iterator;
15 | import java.util.Map;
16 | import java.util.Map.Entry;
17 | import java.util.Set;
18 |
19 | import org.apache.commons.io.FileUtils;
20 | import org.apache.hadoop.fs.Path;
21 | import org.apache.hadoop.io.LongWritable;
22 | import org.apache.hadoop.io.Text;
23 | import org.apache.hadoop.mapred.FileInputFormat;
24 | import org.apache.hadoop.mapred.FileOutputFormat;
25 | import org.apache.hadoop.mapred.JobClient;
26 | import org.apache.hadoop.mapred.JobConf;
27 | import org.apache.hadoop.mapred.MapReduceBase;
28 | import org.apache.hadoop.mapred.Mapper;
29 | import org.apache.hadoop.mapred.OutputCollector;
30 | import org.apache.hadoop.mapred.Reducer;
31 | import org.apache.hadoop.mapred.Reporter;
32 | import org.apache.hadoop.mapred.TextInputFormat;
33 | import org.apache.hadoop.mapred.TextOutputFormat;
34 | import org.apache.log4j.Logger;
35 |
36 | /**
37 | * Class that calculate the locality of the terms and saves the results in file.
38 | * The implementation employ hadoop map-reduce function.
39 | * @author gkordo
40 | *
41 | */
42 | public class Locality {
43 |
44 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.Locality");
45 | private static Set testIDs;
46 | private static Set users;
47 | private static int scale;
48 |
	// Constructor: extracts the test-set image IDs and user IDs from the given
	// file and stores the grid scale.
	// NOTE(review): all three targets are *static* fields, so constructing a
	// second Locality instance silently overwrites the state of the first;
	// kept because the map/reduce inner classes read this static state.
	public Locality(String testFile, int scale){
		testIDs = DataManager.getSetOfImageIDs(testFile);
		users = DataManager.getSetOfUserID(testFile);
		Locality.scale = scale;
	}
54 |
55 | /**
56 | * Map class that takes the lines of the train file as input and creates key-value pairs,
57 | * using as keys the tags contained in the images and as values strings that contain
58 | * the information regarding the cell and user ID.
59 | * @author gkordo
60 | *
61 | */
62 | public static class MapLocality extends MapReduceBase implements Mapper {
63 |
64 | /**
65 | * Required map function
66 | * @param key : key value
67 | * @param value : input string
68 | * @param output : output collector
69 | * @param reporter : reporter of the job
70 | */
71 | public void map(LongWritable key, Text value,
72 | OutputCollector output, Reporter reporter) throws IOException {
73 |
74 | String[] metadata = value.toString().split("\t");
75 |
76 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
77 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
78 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
79 |
80 | BigDecimal tmpLonCenter = new BigDecimal(
81 | Double.parseDouble(metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP);
82 | BigDecimal tmpLatCenter = new BigDecimal(
83 | Double.parseDouble(metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP);
84 |
85 | //get image user ID
86 | String userID = metadata[3];
87 |
88 | // get image tags
89 | Set terms = new HashSet();
90 | TextUtil.parse(metadata[10], terms);
91 | TextUtil.parse(metadata[8], terms);
92 |
93 | // send key-value pairs
94 | for(String term:terms) {
95 | if(!term.isEmpty() && term.length() > 2){
96 | for(int j=-2;j<2;j++){
97 | for(int k=-2;k<2;k++){
98 | output.collect(new Text(term), new Text(userID + ">" +
99 | (tmpLonCenter.doubleValue()+((j)*0.01)) + "_" +
100 | (tmpLatCenter.doubleValue()+((k)*0.01))));
101 | }
102 | }
103 | }
104 | }
105 | }
106 | }
107 | }
108 |
109 | /**
110 | * Reduce class that get the key-value pairs and calculate the locality of every term.
111 | * @author gkordo
112 | *
113 | */
114 | public static class ReduceLocality extends MapReduceBase implements Reducer {
115 |
116 | /**
117 | * Required reduce function
118 | * @param key : key value
119 | * @param values : set of values that share the same key
120 | * @param output : output collector
121 | * @param reporter : reporter of the job
122 | */
123 | public void reduce(Text key, Iterator values,
124 | OutputCollector output, Reporter reporter) throws IOException {
125 |
126 | // map of cells that contains the count of the different users for every single cell
127 | Map> cells = new HashMap>();
128 | int Nt = 0; // total user count
129 |
130 | while (values.hasNext()) {
131 |
132 | String value = values.next().toString();
133 |
134 | // retrieve cell ID and user ID from the value of the pair
135 | String user = value.split(">")[0];
136 | String cell = value.split(">")[1];
137 |
138 | // update of the frequency map
139 | if(cells.containsKey(cell)){
140 | if(!cells.get(cell).contains(user)){
141 | cells.get(cell).add(user);
142 | Nt++;
143 | }
144 | }else{
145 | cells.put(cell,new HashSet());
146 | cells.get(cell).add(user);
147 | Nt++;
148 | }
149 | }
150 |
151 | // locality calculation
152 | double locality = 0.0;
153 | for(Entry> entry : cells.entrySet()){
154 | int v=entry.getValue().size();
155 | locality+=v*(v-1)/Nt;
156 |
157 | }
158 |
159 | // send output to collector
160 | if(locality > 0.0){
161 | output.collect(key, new Text(locality + ""));
162 | }
163 | }
164 | }
165 |
166 | /**
167 | * Core function for the job of tag-cell probabilities calculation.
168 | * @param dir : project directory
169 | * @param trainFolder : the file of the train set
170 | * @throws IOException : file not found
171 | */
172 | public void calculateLocality(String dir, String trainFolder) throws IOException{
173 |
174 | logger.info("Process: Locality weight calculation\t|\t"
175 | + "Status: INITIALIZE");
176 | JobConf conf = new JobConf(Locality.class);
177 | conf.setJobName("Locality");
178 |
179 | conf.setOutputKeyClass(Text.class);
180 | conf.setOutputValueClass(Text.class);
181 |
182 | conf.setMapperClass(MapLocality.class);
183 | conf.setReducerClass(ReduceLocality.class);
184 |
185 | conf.setInputFormat(TextInputFormat.class);
186 | conf.setOutputFormat(TextOutputFormat.class);
187 |
188 | // clean the output file directory
189 | File folder = new File(dir + "temp/locality");
190 | if (folder.exists()) {
191 | FileUtils.cleanDirectory(folder);
192 | FileUtils.forceDelete(folder);
193 | }
194 |
195 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
196 | FileOutputFormat.setOutputPath(conf, new Path(dir + "temp/locality"));
197 |
198 | logger.info("Process: Locality weight calculation\t|\t"
199 | + "Status: STARTED");
200 | long startTime = System.currentTimeMillis();
201 | JobClient.runJob(conf);
202 |
203 | sortAndStore(dir + "temp/locality/part-00000",
204 | dir + "Weights/locality_weights");
205 |
206 | logger.info("Process: Locality weight calculation\t|\t"
207 | + "Status: COMPLETED\t|\tTotal time: " +
208 | (System.currentTimeMillis()-startTime)/60000.0+"m");
209 | }
210 |
211 | /**
212 | * Sort terms based on their locality values and calculate weights.
213 | * The locality term weight are stored in the given file.
214 | * @param inFile : file of the locality values of the terms
215 | * @param outFile : output file
216 | */
217 | private void sortAndStore(String inFile, String outFile){
218 |
219 | // load locality values
220 | EasyBufferedReader reader = new EasyBufferedReader(inFile);
221 | Map termLocalityValues = new HashMap();
222 | String line;
223 | while ((line = reader.readLine())!=null){
224 | String term = line.split("\t")[0];
225 | double locality = Double.parseDouble(line.split("\t")[1]);
226 | termLocalityValues.put(term, locality);
227 | }
228 | reader.close();
229 |
230 | // sort and store weights
231 | termLocalityValues = Utils.sortByValues(termLocalityValues);
232 | EasyBufferedWriter writer = new EasyBufferedWriter(outFile);
233 | int i = 0, totalTerms = termLocalityValues.size();
234 | for(Entry entry : termLocalityValues.entrySet()){
235 | writer.write(entry.getKey()+"\t"+(double)(totalTerms-i)/totalTerms);
236 | writer.newLine();
237 | i++;
238 | }
239 | writer.close();
240 | }
241 | }
242 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/AmbiguityBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.HashSet;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Set;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | import gr.iti.mklab.util.EasyBufferedReader;
14 | import gr.iti.mklab.util.EasyBufferedWriter;
15 | import gr.iti.mklab.util.Utils;
16 |
17 | @SuppressWarnings("unchecked")
18 | public class AmbiguityBasedSampling extends Sampling{
19 |
20 | private static Logger logger = Logger.getLogger(
21 | "gr.iti.mklab.eval.AmbiguityBasedSampling");
22 |
23 | public static Object sample(String testFile) throws Exception{
24 |
25 | logger.info("Sampling: Ambiguity-based Strategy");
26 |
27 | AmbiguityBasedSampling sampling =
28 | new AmbiguityBasedSampling();
29 |
30 | return sampling.writeInFile(sampling.loadData(testFile));
31 | }
32 |
33 | protected Object loadData(String testFile) {
34 |
35 | Map ambiguous =
36 | computeCityEntropies(loadOccurrences(testFile));
37 | logger.info(ambiguous.size() + " Towns loaded");
38 |
39 | Map images = new
40 | HashMap();
41 | double median = Utils.medianItemDouble(ambiguous);
42 |
43 | EasyBufferedReader reader =
44 | new EasyBufferedReader(testFile);
45 | String line;
46 | while((line = reader.readLine())!=null){
47 | String imageID = line.split("\t")[0];
48 | for(String place:line .split("\t")[1].split(",")){
49 | if(place.split(":").length>2
50 | && place.split(":")[2].contains("Town")){
51 | if(ambiguous.containsKey(place.split(":")[1]) &&
52 | ambiguous.get(place.split(":")[1])>median){
53 | images.put(imageID, true);
54 | }else{
55 | images.put(imageID, false);
56 | }
57 | }
58 | }
59 | }
60 | reader.close();
61 | return images;
62 | }
63 |
64 | protected Object writeInFile(Object data) {
65 |
66 | Map images =
67 | (Map) data;
68 |
69 | Map> respond = new
70 | HashMap>();
71 |
72 | respond.put(true, new HashSet());
73 | respond.put(false, new HashSet());
74 |
75 | EasyBufferedWriter writerA = new EasyBufferedWriter(
76 | "samples/ambiguous_sampling.txt");
77 | EasyBufferedWriter writerN = new EasyBufferedWriter(
78 | "samples/non_ambiguous_sampling.txt");
79 | for(Entry image:images.entrySet()){
80 | respond.get(image.getValue()).add(image.getKey());
81 | if(image.getValue()){
82 | writerA.write(image.getKey());
83 | writerA.newLine();
84 | }else{
85 | writerN.write(image.getKey());
86 | writerN.newLine();
87 | }
88 | }
89 | writerA.close();
90 | writerN.close();
91 |
92 | return respond;
93 | }
94 |
95 | private static double computeEntropyNaive(
96 | final List probabilities, int total) {
97 | double entropy = 0.0;
98 | for (Double p:probabilities) {
99 | p /= total;
100 | if(p!=0.0){
101 | entropy -= p * Math.log(p);
102 | }
103 | }
104 | return entropy;
105 | }
106 |
107 | private static Map computeCityEntropies(
108 | Map> townNames) {
109 | Map ambiguous = new HashMap();
110 |
111 | for(Entry> town:townNames.entrySet()){
112 | List p = new ArrayList();
113 | int total = 0;
114 | for(Entry code:town.getValue().entrySet()){
115 | p.add((double) code.getValue());
116 | total += code.getValue();
117 | }
118 | double entropy = computeEntropyNaive(p, total);
119 | if(entropy > 0.0)
120 | ambiguous.put(town.getKey(), entropy);
121 | }
122 |
123 | return ambiguous;
124 | }
125 |
126 | private static Map>
127 | loadOccurrences(String testFile) {
128 |
129 | Map> townNames =
130 | new HashMap>();
131 |
132 | EasyBufferedReader reader =
133 | new EasyBufferedReader(testFile);
134 | String line;
135 | while((line = reader.readLine())!=null){
136 | for(String place:line .split("\t")[1].split(",")){
137 | if(place.split(":").length>2 && place.split(":")[2].contains("Town")){
138 | String townCode = place.split(":")[0];
139 | String townName = place.split(":")[1];
140 |
141 | if(townNames.containsKey(townName)){
142 | if(townNames.get(townName).containsKey(townCode)){
143 | townNames.get(townName).put(townCode,
144 | townNames.get(townName).get(townCode) + 1);
145 | }else{
146 | townNames.get(townName).put(townCode, 1);
147 | }
148 | }else{
149 | townNames.put(townName, new HashMap());
150 | townNames.get(townName).put(townCode, 1);
151 | }
152 | }
153 | }
154 | }
155 | reader.close();
156 |
157 | return townNames;
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/BuildingSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.EasyBufferedWriter;
10 |
11 | @SuppressWarnings("unchecked")
12 | public class BuildingSampling extends Sampling{
13 |
14 | private static Logger logger = Logger.getLogger(
15 | "gr.iti.mklab.eval.BuildingSampling");
16 |
17 | public static Object sample(String testFile) throws Exception{
18 |
19 | logger.info("Sampling: Building Strategy");
20 |
21 | BuildingSampling sampling =
22 | new BuildingSampling();
23 |
24 | return sampling.writeInFile(sampling.loadData(testFile));
25 | }
26 |
27 | protected Object loadData(String testFile) {
28 |
29 | Set buildingConcepts = new HashSet();
30 |
31 | EasyBufferedReader reader =
32 | new EasyBufferedReader("samples/building_concepts.txt");
33 | String line;
34 | while((line = reader.readLine())!=null){
35 | buildingConcepts.add(line);
36 | }
37 | reader.close();
38 |
39 | Set buildingImages = new HashSet();
40 | reader = new EasyBufferedReader(testFile);
41 | while((line = reader.readLine())!=null){
42 | String imageID = line.split("\t")[0];
43 | for(String concept:line .split("\t")[1].split(",")){
44 | if(buildingConcepts.contains(concept.split(":")[0])){
45 | buildingImages.add(imageID);
46 | }
47 | }
48 | }
49 | reader.close();
50 | logger.info(buildingImages.size() + " Building Images loaded");
51 |
52 | return buildingImages;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Set buildingImages = (Set) data;
58 |
59 | EasyBufferedWriter writer = new EasyBufferedWriter(
60 | "samples/building_sampling.txt");
61 | for(String image:buildingImages){
62 | writer.write(image + "\t");
63 | writer.newLine();
64 | }
65 | writer.close();
66 |
67 | return buildingImages;
68 | }
69 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/GeographicalUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.math.BigDecimal;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.HashMap;
7 | import java.util.HashSet;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 | import java.util.Set;
12 |
13 | import org.apache.log4j.Logger;
14 |
15 | import gr.iti.mklab.util.EasyBufferedReader;
16 | import gr.iti.mklab.util.EasyBufferedWriter;
17 | import gr.iti.mklab.util.Utils;
18 |
19 | @SuppressWarnings("unchecked")
20 | public class GeographicalUniformSampling extends Sampling {
21 |
22 | private static Logger logger = Logger.getLogger(
23 | "gr.iti.mklab.eval.GeographicalUniformSampling");
24 |
25 | public static Object sample(String testFile) throws Exception{
26 |
27 | logger.info("Sampling: Geographical Uniform Strategy");
28 |
29 | GeographicalUniformSampling sampling =
30 | new GeographicalUniformSampling();
31 |
32 | return sampling.writeInFile(sampling.loadData(testFile));
33 | }
34 |
35 | protected Object loadData(String testFile) {
36 |
37 | Map> cells =
38 | new HashMap>();
39 |
40 | EasyBufferedReader reader =
41 | new EasyBufferedReader(testFile);
42 | String line;
43 | while((line = reader.readLine())!=null){
44 |
45 | BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
46 | line.split("\t")[12])).setScale(1, BigDecimal.ROUND_HALF_UP);
47 | BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
48 | line.split("\t")[13])).setScale(1, BigDecimal.ROUND_HALF_UP);
49 |
50 | String cell = tmpLatCenter + " " + tmpLonCenter;
51 | if(cells.containsKey(cell)){
52 | cells.get(cell).add(line.split("\t")[1]);
53 | }else{
54 | cells.put(cell, new HashSet());
55 | cells.get(cell).add(line.split("\t")[1]);
56 | }
57 | }
58 | reader.close();
59 | logger.info(cells.size() + " Cells loaded");
60 |
61 | return cells;
62 | }
63 |
64 | protected Object writeInFile(Object data) {
65 |
66 | Map> cells = (Map>) data;
67 |
68 | EasyBufferedWriter writer = new EasyBufferedWriter(
69 | "samples/geographical_uniform_sampling.txt");
70 |
71 | int median = Utils.medianSet(cells);
72 |
73 | Set respond = new HashSet();
74 |
75 | for(Entry> cell:cells.entrySet()){
76 | List images =
77 | new ArrayList(cell.getValue());
78 | Collections.shuffle(images);
79 |
80 | for(int i=0;i>> places =
33 | new HashMap>>();
34 |
35 | places.put("continents", new HashMap>());
36 | places.put("countries", new HashMap>());
37 |
38 | EasyBufferedReader reader =
39 | new EasyBufferedReader(testFile);
40 | String line;
41 | while((line = reader.readLine())!=null){
42 | String imageID = line.split("\t")[0];
43 | for(String place:line .split("\t")[1].split(",")){
44 | if(place.split(":").length>2 && place.contains("Timezone")){
45 | String continent = place.split(":")[1].split("%")[0];
46 |
47 | switch(continent) {
48 | case "Pacific" :
49 | continent = "America";
50 | break;
51 | case "Atlantic" :
52 | continent = "America";
53 | break;
54 | case "Indian" :
55 | continent = "Asia";
56 | break;
57 | }
58 | if(places.get("continents").containsKey(continent)){
59 | places.get("continents").get(continent).add(imageID);
60 | }else{
61 | places.get("continents").put(continent, new HashSet());
62 | places.get("continents").get(continent).add(imageID);
63 | }
64 | }
65 |
66 | if(place.split(":").length>2 && place.contains("Country")){
67 | String country = place.split(":")[1].split("%")[0];
68 | if(places.get("countries").containsKey(country)){
69 | places.get("countries").get(country).add(imageID);
70 | }else{
71 | places.get("countries").put(country, new HashSet());
72 | places.get("countries").get(country).add(imageID);
73 | }
74 | }
75 | }
76 | }
77 | reader.close();
78 | logger.info(places.get("continents").size() + " Continents loaded");
79 | logger.info(places.get("countries").size() + " Countries loaded");
80 |
81 | return places;
82 | }
83 |
84 | protected Object writeInFile(Object data) {
85 |
86 | Map>> places =
87 | (Map>>) data;
88 |
89 | EasyBufferedWriter writer = new EasyBufferedWriter(
90 | "samples/geographically_focused_sampling_continents.txt");
91 | for(Entry> continent:places.get("continents").entrySet()){
92 | writer.write(continent.getKey() + "\t");
93 | for(String images:continent.getValue()){
94 | writer.write(images + " ");
95 | }
96 | writer.newLine();
97 | }
98 | writer.close();
99 |
100 | writer = new EasyBufferedWriter(
101 | "samples/geographically_focused_sampling_countries.txt");
102 |
103 | for(Entry> country:places.get("countries").entrySet()){
104 | writer.write(country.getKey() + "\t");
105 | for(String images:country.getValue()){
106 | writer.write(images + " ");
107 | }
108 | writer.newLine();
109 | }
110 | writer.close();
111 |
112 | return places;
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/Sampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
/**
 * Base class of the sampling strategies. Implementations parse the test file
 * into a strategy-specific structure and write the selected image IDs to a
 * file under samples/ (see the sibling *Sampling classes in this package).
 */
public abstract class Sampling {

	// Parses the test file and returns the strategy-specific data structure.
	protected abstract Object loadData(String testFile);

	// Persists the sampling result built from data and returns it.
	protected abstract Object writeInFile(Object data);
}
9 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 | import java.util.Set;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | import gr.iti.mklab.util.Utils;
14 |
15 | @SuppressWarnings("unchecked")
16 | public class TextBasedSampling extends Sampling {
17 |
18 | private static Logger logger = Logger.getLogger(
19 | "gr.iti.mklab.eval.TextBasedSampling");
20 |
21 | public static Object sample(String testFile) throws Exception{
22 |
23 | logger.info("Sampling: Text-based Strategy");
24 |
25 | TextBasedSampling sampling = new TextBasedSampling();
26 |
27 | return sampling.writeInFile(sampling.loadData(testFile));
28 | }
29 |
30 | protected Object loadData(String testFile) {
31 |
32 | Map images =
33 | new HashMap();
34 |
35 | EasyBufferedReader reader =
36 | new EasyBufferedReader(testFile);
37 | String line;
38 | while((line = reader.readLine())!=null){
39 | int tags = (!line.split("\t")[10].isEmpty()
40 | ?line.split("\t")[10].split(",").length:0);
41 | int title = (!line.split("\t")[8].isEmpty()
42 | ?line.split("\t")[8].split("\\+").length:0);
43 |
44 | images.put(line.split("\t")[1], tags+title);
45 | }
46 | reader.close();
47 | logger.info(images.size() + " Images loaded");
48 |
49 | return images;
50 | }
51 |
52 | protected Object writeInFile(Object data) {
53 |
54 | Map images =
55 | (Map) data;
56 |
57 | EasyBufferedWriter writer = new EasyBufferedWriter(
58 | "samples/text_based_sampling.txt");
59 |
60 | Set respond = new HashSet();
61 |
62 | int median = Utils.medianItemInt(images);
63 |
64 | for(Entry image:images.entrySet()){
65 | if(image.getValue() >= median){
66 | respond.add(image.getKey());
67 | writer.write(image.getKey());
68 | writer.newLine();
69 | }
70 | }
71 | writer.close();
72 |
73 | return respond;
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextDiversitySampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 | import java.util.stream.Collectors;
12 | import java.util.stream.IntStream;
13 |
14 | import org.apache.log4j.Logger;
15 |
16 | import gr.iti.mklab.util.EasyBufferedReader;
17 | import gr.iti.mklab.util.EasyBufferedWriter;
18 | import info.debatty.java.lsh.MinHash;
19 |
20 | @SuppressWarnings("unchecked")
21 | public class TextDiversitySampling extends Sampling {
22 |
23 | private static Logger logger = Logger.getLogger(
24 | "gr.iti.mklab.eval.TextDiversitySampling");
25 |
26 | public static Object sample(String testFile) throws Exception{
27 |
28 | logger.info("Sampling: Text Diversity Strategy");
29 |
30 | TextDiversitySampling sampling = new TextDiversitySampling();
31 |
32 | return sampling.writeInFile(sampling.loadData(testFile));
33 | }
34 |
35 | protected Object loadData(String testFile) {
36 |
37 | Map, List> buckets =
38 | new HashMap, List>();
39 | Map tags =
40 | new HashMap();
41 | int n = 510914;
42 | MinHash mh = new MinHash(0.1, n);
43 |
44 | EasyBufferedReader reader =
45 | new EasyBufferedReader(testFile);
46 | String line;
47 | while((line = reader.readLine())!=null){
48 | String imageID = line.split("\t")[1];
49 | String imageTags = line.split("\t")[10];
50 | boolean[] vector = new boolean[n];
51 |
52 | for(String tag:imageTags.split(",")){
53 | if(!tags.containsKey(tag)){
54 | tags.put(tag, tags.size());
55 | }
56 | vector[tags.get(tag)] = true;
57 | }
58 |
59 | List hash = IntStream.of((mh.signature(vector)
60 | )).boxed().collect(Collectors.toList());
61 | if(buckets.containsKey(hash)){
62 | buckets.get(hash).add(imageID);
63 | }else{
64 | buckets.put(hash, new ArrayList());
65 | buckets.get(hash).add(imageID);
66 | }
67 | }
68 | reader.close();
69 | logger.info(buckets.size() + " Buckets created");
70 |
71 | return buckets;
72 | }
73 |
74 | protected Object writeInFile(Object data) {
75 |
76 | Map, List> buckets =
77 | (Map, List>) data;
78 |
79 | Set respond = new HashSet();
80 |
81 | EasyBufferedWriter writer = new EasyBufferedWriter(
82 | "samples/text_diversity_sampling.txt");
83 |
84 | for(Entry, List> bucket
85 | :buckets.entrySet()){
86 | List images = bucket.getValue();
87 | Collections.shuffle(images);
88 |
89 | respond.add(images.get(0));
90 | writer.write(images.get(0));
91 | writer.newLine();
92 | }
93 | writer.close();
94 |
95 | return respond;
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/UserUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 |
12 | import org.apache.log4j.Logger;
13 |
14 | import gr.iti.mklab.util.EasyBufferedReader;
15 | import gr.iti.mklab.util.EasyBufferedWriter;
16 |
17 | @SuppressWarnings("unchecked")
18 | public class UserUniformSampling extends Sampling {
19 |
20 | private static Logger logger = Logger.getLogger(
21 | "gr.iti.mklab.eval.UserUniformSampling");
22 |
23 | public static Object sample(String testFile) throws Exception{
24 |
25 | logger.info("Sampling: User Uniform Strategy");
26 |
27 | UserUniformSampling sampling = new UserUniformSampling();
28 |
29 | return sampling.writeInFile(sampling.loadData(testFile));
30 | }
31 |
32 | protected Object loadData(String testFile) {
33 |
34 | Map> users =
35 | new HashMap>();
36 |
37 | EasyBufferedReader reader =
38 | new EasyBufferedReader(testFile);
39 | String line;
40 | while((line = reader.readLine())!=null){
41 | String user = line.split("\t")[3];
42 | if(users.containsKey(user)){
43 | users.get(user).add(line.split("\t")[1]);
44 | }else{
45 | users.put(user, new HashSet());
46 | users.get(user).add(line.split("\t")[1]);
47 | }
48 | }
49 | reader.close();
50 | logger.info(users.size() + " Users loaded");
51 |
52 | return users;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Map> users =
58 | (Map>) data;
59 |
60 | Set respond = new HashSet();
61 |
62 | EasyBufferedWriter writer = new EasyBufferedWriter(
63 | "samples/user_uniform_sampling.txt");
64 |
65 | for(Entry> user:users.entrySet()){
66 | List images =
67 | new ArrayList(user.getValue());
68 | Collections.shuffle(images);
69 |
70 | respond.add(images.get(0));
71 | writer.write(images.get(0));
72 | writer.newLine();
73 | }
74 | writer.close();
75 |
76 | return respond;
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/VisualSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Set;
7 | import java.util.Map.Entry;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 |
14 | @SuppressWarnings("unchecked")
15 | public class VisualSampling extends Sampling{
16 |
17 | private static Logger logger = Logger.getLogger(
18 | "gr.iti.mklab.eval.VisualSampling");
19 |
20 | public static Object sample(String testFile) throws Exception{
21 |
22 | logger.info("Sampling: Visual Strategy");
23 |
24 | VisualSampling sampling =
25 | new VisualSampling();
26 |
27 | return sampling.writeInFile(sampling.loadData(testFile));
28 | }
29 |
30 | protected Object loadData(String testFile) {
31 |
32 | Map> concepts =
33 | new HashMap>();
34 |
35 | EasyBufferedReader reader =
36 | new EasyBufferedReader(testFile);
37 | String line;
38 | while((line = reader.readLine())!=null){
39 | String imageID = line.split("\t")[0];
40 | for(String concept:line .split("\t")[1].split(",")){
41 | if(concepts.containsKey(concept.split(":")[0])){
42 | concepts.get(concept.split(":")[0]).add(imageID);
43 | }else{
44 | concepts.put(concept.split(":")[0], new HashSet());
45 | concepts.get(concept.split(":")[0]).add(imageID);
46 | }
47 | }
48 | }
49 | reader.close();
50 | logger.info(concepts.size() + " Concepts loaded");
51 |
52 | return concepts;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Map> concepts =
58 | (Map>) data;
59 |
60 | EasyBufferedWriter writer = new EasyBufferedWriter(
61 | "samples/visual_sampling.txt");
62 | for(Entry> concept:concepts.entrySet()){
63 | writer.write(concept.getKey() + "\t");
64 | for(String images:concept.getValue()){
65 | writer.write(images + " ");
66 | }
67 | writer.newLine();
68 | }
69 | writer.close();
70 |
71 | return concepts;
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/CenterOfGravity.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.util.Map;
4 | import java.util.Map.Entry;
5 |
6 | /**
7 | * Abstract class that execute the calculation of the center-of-gravity of the most similar images
8 | * @author gkordo
9 | *
10 | */
/**
 * Abstract class that executes the calculation of the center-of-gravity of
 * the most similar images.
 * @author gkordo
 *
 */
public abstract class CenterOfGravity {

	// similarity-weight exponent; NOTE(review): static but assigned from the
	// constructor, so the last constructed instance wins — confirm intended
	protected static int a;

	// Constructor initializes the weight exponent
	public CenterOfGravity(int a){
		CenterOfGravity.a = a;
	}

	/**
	 * Calculation of the center-of-gravity of the k most similar images.
	 * Keys are "lon_lat" strings and values are similarity scores; each
	 * location's unit vector on the sphere is weighted by sim^a and averaged,
	 * then the mean vector is converted back to geographic coordinates.
	 * @param mapSim : the map with the k most similar images and their similarity values
	 * @return {latitude, longitude} of the estimated location, or
	 *         {null, null} when the map is empty (as in the original)
	 */
	protected static Double[] computeCoordination(Map<String, Double> mapSim){

		double[] loc = new double[3];
		Double[] c = new Double[2];
		int k = mapSim.size();

		for (Entry<String, Double> entry : mapSim.entrySet()){

			double sim = entry.getValue();
			double lat = Double.parseDouble(entry.getKey().split("_")[1]);
			double lon = Double.parseDouble(entry.getKey().split("_")[0]);

			// accumulate the similarity-weighted unit vector (x, y, z)
			loc[0] += Math.pow(sim, a)
					* Math.cos(lat * (Math.PI / 180D))
					* Math.cos(lon * (Math.PI / 180D)) / k;

			loc[1] += Math.pow(sim, a)
					* Math.cos(lat * (Math.PI / 180D))
					* Math.sin(lon * (Math.PI / 180D)) / k;

			loc[2] += Math.pow(sim, a)
					* Math.sin(lat * (Math.PI / 180D)) / k;
		}

		// convert the mean vector back to lat/lon once, after the loop
		// (the original recomputed this on every iteration)
		if (k > 0) {
			c[0] = Math.atan2(loc[2], Math.sqrt(Math.pow(loc[0], 2)
					+ Math.pow(loc[1], 2))) * (180D / Math.PI);
			c[1] = Math.atan2(loc[1], loc[0]) * (180D / Math.PI);
		}
		return c;
	}
}
56 |
57 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/DataManager.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.TextUtil;
10 |
11 | /**
12 | * Data manager
13 | * @author gkordo
14 | *
15 | */
16 | public class DataManager {
17 |
18 | static Logger logger = Logger.getLogger("gr.iti.mklab.tools.DataManager");
19 |
20 | // return a set contain the image IDs of the provided dataset
21 | public static Set getSetOfImageIDs(String file){
22 |
23 | Set usersIncludedInFile = new HashSet();
24 |
25 | EasyBufferedReader reader = new EasyBufferedReader(file);
26 |
27 | String input;
28 |
29 | logger.info("images contained in file " + file);
30 | while ((input= reader.readLine())!=null){
31 | usersIncludedInFile.add(input.split("\t")[1]);
32 | }
33 | logger.info(usersIncludedInFile.size()+" total images included in file");
34 | reader.close();
35 |
36 | return usersIncludedInFile;
37 | }
38 |
39 | // return a set contain the individual tags of the provided dataset
40 | public static Set getSetOfTerms(String file){
41 |
42 | EasyBufferedReader reader = new EasyBufferedReader(file);
43 | Set termsIncludedInFile = new HashSet();
44 |
45 | String line;
46 |
47 | logger.info("deterim the diffrent tags contained in file " + file);
48 | while ((line= reader.readLine())!=null){
49 |
50 | Set terms = new HashSet();
51 | TextUtil.parse(line.split("\t")[10], terms);
52 | TextUtil.parse(line.split("\t")[8], terms);
53 |
54 | termsIncludedInFile.addAll(terms);
55 |
56 | }
57 | logger.info(termsIncludedInFile.size()+" total tags included in file");
58 | reader.close();
59 |
60 | return termsIncludedInFile;
61 | }
62 |
63 | // return a set contain the different users in the provided dataset
64 | public static Set getSetOfUserID (String file){
65 |
66 | Set usersIncludedInFile = new HashSet();
67 |
68 | EasyBufferedReader reader = new EasyBufferedReader(file);
69 |
70 | String input;
71 |
72 | logger.info("deterim the diffrent users contained in file " + file);
73 | while ((input= reader.readLine())!=null){
74 | usersIncludedInFile.add(input.split("\t")[3]);
75 | }
76 | logger.info(usersIncludedInFile.size()+" total users included in file");
77 | reader.close();
78 |
79 | return usersIncludedInFile;
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/InterfaceTermCellProb.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.io.IOException;
4 |
5 | /**
6 | * Interface of tag-cell probability calculator
7 | * @author gkordo
8 | *
9 | */
/**
 * Interface of tag-cell probability calculators.
 * @author gkordo
 *
 */
public interface InterfaceTermCellProb {

	/**
	 * Function where the tag-cell probabilities are calculated and stored in a defined file.
	 * @param dir : directory of the project
	 * @param trainFile : file that contains the train set
	 * @param outFile : output file
	 * @param scale : grid scale (used elsewhere in this project as the
	 *                BigDecimal scale when rounding coordinates to cells)
	 * @throws IOException : file not found
	 */
	public void calculatorTermCellProb(String dir, String trainFile,
			String outFile, int scale) throws IOException;
}
23 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/SimilarityCalculator.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import gr.iti.mklab.data.ImageMetadata;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.Utils;
6 | import gr.iti.mklab.util.TextUtil;
7 |
8 | import java.io.File;
9 | import java.io.IOException;
10 | import java.math.BigDecimal;
11 | import java.util.*;
12 | import java.util.Map.Entry;
13 |
14 | import org.apache.commons.io.FileUtils;
15 | import org.apache.hadoop.fs.Path;
16 | import org.apache.hadoop.io.*;
17 | import org.apache.hadoop.mapred.*;
18 | import org.apache.log4j.Logger;
19 |
20 | /**
21 | * For a query image, the similarity between the images contained in the train set is calculated based on their corresponding term sets.
22 | * Class that implements similarity search based on Map-Reduce scheme.
23 | * @author gkordo
24 | *
25 | */
26 | public class SimilarityCalculator{
27 |
28 | private static Set testIDs;
29 | private static Setusers;
30 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch");
31 | static java.util.Map> predictedCellsOfTestImages = new HashMap>();
32 |
33 | /**
34 | * Contractor of the class.
35 | * @param testFile : file that contains the test image's metadata
36 | * @param resultFile : file that contains the MLC of every query image
37 | */
38 | public SimilarityCalculator(String testFile, String resultFile){
39 | loadTestImages(testFile,resultFile);
40 | }
41 |
42 |
43 | /**
44 | * Map class that takes the lines of the train file as input and creates key-value pairs,
45 | * using as keys the image IDs of the test set images and as values strings that contain
46 | * the location of the train images and the calculated similarity.
47 | * @author gkordo
48 | *
49 | */
50 | public static class MapSimilaritySearch extends MapReduceBase implements Mapper {
51 |
52 | /**
53 | * Required map function
54 | * @param key : key value
55 | * @param value : input string
56 | * @param output : output collector
57 | * @param reporter : reporter of the job
58 | */
59 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
60 |
61 | String[] metadata = value.toString().split("\t");
62 |
63 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
64 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
65 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
66 |
67 | // get image cell based on its latitude-longitude pair
68 | BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
69 | metadata[12])).setScale(2, BigDecimal.ROUND_HALF_UP);
70 | BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
71 | metadata[13])).setScale(2, BigDecimal.ROUND_HALF_UP);
72 |
73 | Set trainImageTerms = new HashSet();
74 | TextUtil.parse(metadata[10], trainImageTerms);
75 | TextUtil.parse(metadata[8], trainImageTerms);
76 |
77 | // there is at least estimated location laying inside the borders of cell
78 | if(predictedCellsOfTestImages.containsKey(tmpLonCenter+"_"+tmpLatCenter)
79 | && trainImageTerms.size() > 1){
80 |
81 | // calculate similarity between the train image and all images that lay inside the boarded of the specific cell
82 | for(ImageMetadata entry : predictedCellsOfTestImages
83 | .get(tmpLonCenter+"_"+tmpLatCenter)){
84 |
85 | // determine the common terms
86 | List common = new ArrayList(trainImageTerms);
87 | common.retainAll(entry.getTags());
88 |
89 | // calculate similarity
90 | double sjacc = (double) common.size() / (entry.getTags().size()
91 | + trainImageTerms.size() - common.size());
92 | if(sjacc>0.05){
93 | output.collect(new Text(entry.getId()), new Text(String.valueOf(sjacc) +
94 | ">" + metadata[12] + "_"+metadata[13]));
95 | }
96 | }
97 | }
98 | }
99 | }
100 | }
101 |
102 | /**
103 | * Reduce class that get the key-value pairs and sort the similarities for a test image.
104 | * @author gkordo
105 | *
106 | */
107 | public static class ReduceSimilaritySearch extends MapReduceBase implements Reducer {
108 |
109 | /**
110 | * Required reduce function
111 | * @param key : key value
112 | * @param values : set of values that share the same key
113 | * @param output : output collector
114 | * @param reporter : reporter of the job
115 | */
116 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
117 |
118 | java.util.Map simImages = new HashMap();
119 |
120 | // load values in a topic similarity map
121 | while (values.hasNext()) {
122 | String entry = values.next().toString();
123 | simImages.put(entry.split(">")[1],Double.parseDouble(entry.split(">")[0]));
124 | }
125 |
126 | // sort similarity map
127 | simImages = Utils.sortByValues(simImages);
128 |
129 | // write in output file
130 | output.collect(key, new Text(convertSimMapToStr(simImages)));
131 | }
132 |
133 | /**
134 | * Function that converts similarity map to output string
135 | * @param simImages : similarity map
136 | * @return a string that contains similarity and location of the train images
137 | */
138 | public String convertSimMapToStr(java.util.Map simImages){
139 | String out = "";
140 |
141 | for(Entry entry : simImages.entrySet()){
142 | out += entry.getKey() + ">" + entry.getValue() + " ";
143 | }
144 |
145 | return out.trim();
146 | }
147 | }
148 |
149 | /**
150 | * Core function for the job of similarity search.
151 | * @param dir : directory of the project
152 | * @param trainFolder : the file of the train set
153 | * @param outFolder : the folder where the tag-set probabilities file will be stored
154 | * @throws Exception : file not found
155 | */
156 | public void performSimilarityCalculation(String dir, String trainFolder, String outFolder) throws Exception {
157 |
158 | logger.info("Process: Similarity Calculation\t|\t"
159 | + "Status: INITIALIZE");
160 | JobConf conf = new JobConf(SimilarityCalculator.class);
161 | conf.setJobName("similaritysearch");
162 |
163 | conf.setOutputKeyClass(Text.class);
164 | conf.setOutputValueClass(Text.class);
165 |
166 | conf.setMapperClass(MapSimilaritySearch.class);
167 |
168 | conf.setReducerClass(ReduceSimilaritySearch.class);
169 |
170 | conf.setInputFormat(TextInputFormat.class);
171 | conf.setOutputFormat(TextOutputFormat.class);
172 |
173 | // clean the output file directory
174 | File file = new File(dir + outFolder);
175 | if (file.exists()) {
176 | FileUtils.cleanDirectory(file);
177 | FileUtils.forceDelete(file);
178 | }
179 |
180 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
181 | FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder));
182 |
183 | logger.info("Process: Similarity Calculation\t|\t"
184 | + "Status: STARTED");
185 | long startTime = System.currentTimeMillis();
186 | JobClient.runJob(conf);
187 | logger.info("Process: Similarity Calculation\t|\t"
188 | + "Status: COMPLETED\t|\tTotal time: " +
189 | (System.currentTimeMillis()-startTime)/60000.0+"m");
190 |
191 | new File(dir + outFolder + "/part-00000").renameTo(
192 | new File(dir + outFolder + "/image_similarities")); // rename the output file
193 | }
194 |
195 | /**
196 | * Load test images in a map based on their MLCs. Also update the set of test image IDs and test user IDs.
197 | * @param testFile
198 | * @param resultFile
199 | */
200 | private void loadTestImages(String testFile, String resultFile){
201 |
202 | EasyBufferedReader readerTest = new EasyBufferedReader(testFile);
203 | EasyBufferedReader readerResult = new EasyBufferedReader(resultFile);
204 | String lineT,lineR;
205 |
206 | while ((lineT = readerTest.readLine())!=null && (lineR = readerResult.readLine())!=null){
207 |
208 | if(!lineR.split("\t")[1].equals("N/A")){
209 | // create an object based on test image metadata
210 | Set terms = new HashSet();
211 | TextUtil.parse(lineR.split("\t")[10], terms);
212 | TextUtil.parse(lineR.split("\t")[8], terms);
213 | ImageMetadata image = new ImageMetadata(lineT.split("\t")[1], lineT.split("\t")[3], terms);
214 |
215 | // update respective sets
216 | testIDs.add(lineT.split("\t")[0]);
217 | users.add(lineT.split("\t")[2]);
218 |
219 | // load image object to the corresponding cell of the map
220 | if(predictedCellsOfTestImages.containsKey(lineR.split("\t")[1].split(":")[0])){
221 | predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
222 | }else{
223 | predictedCellsOfTestImages.put(lineR.split("\t")[1].split(":")[0],
224 | new ArrayList());
225 | predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
226 | }
227 | }
228 | }
229 |
230 | logger.info(users.size()+" different users appeared in " + testIDs.size() + " images");
231 | readerTest.close();
232 | readerResult.close();
233 | }
234 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedReader.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.io.Reader;
9 | import java.io.UnsupportedEncodingException;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | public class EasyBufferedReader extends BufferedReader {
14 |
15 | protected Logger logger;
16 |
17 |
18 | static final Reader createReader(String textFile, Logger logger){
19 | try {
20 | return new InputStreamReader(new FileInputStream(textFile), "UTF-8");
21 | } catch (UnsupportedEncodingException e) {
22 | logger.error(e.getMessage());
23 | } catch (FileNotFoundException e) {
24 | logger.error(e.getMessage());
25 | }
26 | return null;
27 | }
28 |
29 | public EasyBufferedReader(String textFile) {
30 | super(createReader(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedReader")));
31 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedReader");
32 | logger.debug("opened " + textFile);
33 | }
34 |
35 | @Override
36 | public void close() {
37 | try {
38 | super.close();
39 | } catch (IOException e) {
40 | logger.error(e.getMessage());
41 | }
42 | }
43 |
44 | @Override
45 | public String readLine() {
46 | try {
47 | return super.readLine();
48 | } catch (IOException e) {
49 | logger.error(e.getMessage());
50 | }
51 | return null;
52 | }
53 |
54 |
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedWriter.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.BufferedWriter;
4 | import java.io.FileNotFoundException;
5 | import java.io.FileOutputStream;
6 | import java.io.IOException;
7 | import java.io.OutputStreamWriter;
8 | import java.io.UnsupportedEncodingException;
9 | import java.io.Writer;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | public class EasyBufferedWriter extends BufferedWriter {
14 |
15 | protected Logger logger;
16 |
17 |
18 | static final Writer createWriter(String textFile, Logger logger, boolean end){
19 | try {
20 | return new OutputStreamWriter(new FileOutputStream(textFile,end), "UTF-8");
21 | } catch (UnsupportedEncodingException e) {
22 | logger.error(e.getMessage());
23 | } catch (FileNotFoundException e) {
24 | logger.error(e.getMessage());
25 | }
26 | return null;
27 | }
28 |
29 | public EasyBufferedWriter(String textFile) {
30 | super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),false));
31 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
32 | logger.debug("opened " + textFile);
33 | }
34 |
35 | public EasyBufferedWriter(String textFile, boolean end) {
36 | super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),end));
37 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
38 | logger.debug("opened " + textFile);
39 | }
40 |
41 | @Override
42 | public void close() {
43 | try {
44 | super.close();
45 | } catch (IOException e) {
46 | logger.error(e.getMessage());
47 | }
48 | }
49 |
50 | @Override
51 | public void write(String s) {
52 | try {
53 | super.write(s);
54 | } catch (IOException e){
55 | logger.error(e.getMessage());
56 | }
57 | }
58 |
59 | @Override
60 | public void newLine() {
61 | try {
62 | super.newLine();
63 | } catch (IOException e){
64 | logger.error(e.getMessage());
65 | }
66 | }
67 |
68 |
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Progress.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import org.apache.log4j.Logger;
4 |
5 | public class Progress {
6 |
7 | private long gStartTime, lastTime;
8 | private int div, scaleTime;
9 | private String mesPerCent, mesTime, messege;
10 | private int sec;
11 | private Logger logger;
12 |
13 | public Progress(long gStartTime, int limitCountLines, int scalePerCent, int scaleTime, String messege, Logger logger){
14 | this.gStartTime = gStartTime;
15 |
16 | this.mesPerCent = "%";
17 | if(scalePerCent==10){this.mesPerCent = "0" + this.mesPerCent;}
18 |
19 | this.scaleTime = scaleTime;
20 | this.mesTime = "m";
21 | if(scaleTime==1){this.mesTime = "s";}
22 |
23 | this.div = limitCountLines/scalePerCent;
24 | this.messege = messege;
25 |
26 | this.logger = logger;
27 | }
28 |
29 | public Progress(long gStartTime, int sec, int scaleTime, String messege, Logger logger){
30 | this.sec = sec;
31 | this.gStartTime = gStartTime;
32 |
33 | this.scaleTime = scaleTime;
34 |
35 | this.mesTime = "min";
36 | this.messege = messege;
37 | if(scaleTime==1){this.mesTime = "s";}
38 |
39 | this.logger = logger;
40 | }
41 |
42 | public void showMessege(long stopTime){
43 | if(stopTime-lastTime>sec*1000){
44 | logger.info(messege+" > "+ (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
45 | lastTime=stopTime;
46 | }
47 | }
48 |
49 | public void showProgress(int count, long stopTime){
50 | if(count%div==0){
51 | logger.info(messege+" > "+count/div+ mesPerCent + " > " + (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/TextUtil.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.UnsupportedEncodingException;
4 | import java.net.URLDecoder;
5 | import java.text.Normalizer;
6 | import java.util.Set;
7 | import java.util.regex.Pattern;
8 |
9 |
/**
 * Text normalization helpers: accent stripping and splitting of raw
 * title/tag strings into a set of clean lowercase terms.
 */
public class TextUtil {

	// compiled once instead of on every deAccent() call
	private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

	/**
	 * Removes diacritical marks from a string (e.g. "café" -> "cafe").
	 * @param str : input string
	 * @return the string with combining diacritical marks stripped
	 */
	public static String deAccent(String str) {
		String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
		return DIACRITICS.matcher(nfdNormalizedString).replaceAll("");
	}

	/**
	 * Parses a raw, URL-encoded text field into lowercase terms and adds them
	 * to the given set: every comma-separated phrase is added whole (trimmed)
	 * plus each of its individual words. Punctuation (except commas) and
	 * digits are removed first. Undecodable input is silently skipped.
	 * @param text : raw text (may be null or empty; both are ignored)
	 * @param terms : set that receives the extracted terms (also returned)
	 * @return the same {@code terms} set, for chaining
	 */
	public static Set<String> parse(String text, Set<String> terms) {

		// was `(text != null) || (text != "")`: `||` plus reference comparison let
		// null through, making URLDecoder.decode throw an uncaught NPE
		if (text != null && !text.isEmpty()){
			try{
				text = URLDecoder.decode(text, "UTF-8");
				text = deAccent(text);

				text = text.trim(); // removes redundant white spaces
				text = text.replaceAll("[\\p{Punct}&&[^\\,]]", "");
				text = text.replaceAll("[0-9]+", "");

				text = text.toLowerCase();
				text = text.replaceAll("\\s{2,}", " ");
				text = text.replaceAll("\\,{2,}", ",");
				text = text.trim();

				for(String term : text.split(",")){
					if(!term.replaceAll(" ", "").matches("[0-9]+") && !term.isEmpty()){
						terms.add(term.trim());
						for(String interm : term.split(" ")){
							// the isEmpty() check keeps "" (from leading spaces) out of the set
							if(!interm.isEmpty() && !interm.matches("[0-9]+")){
								terms.add(interm);
							}
						}
					}
				}
			}catch(UnsupportedEncodingException exception){
			}catch(IllegalArgumentException exception){} // malformed %-escape in the input
		}
		return terms;
	}
}
50 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Utils.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.util.Arrays;
4 | import java.util.List;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 |
8 | import gr.iti.mklab.data.GeoCell;
9 |
10 | import java.util.Set;
11 | import java.util.Collections;
12 | import java.util.Comparator;
13 | import java.util.HashMap;
14 | import java.util.Iterator;
15 | import java.util.LinkedHashMap;
16 | import java.util.LinkedList;
17 |
18 | public class Utils {
19 |
20 | public static Map sortByValues(Map map){
21 | List> entries = new LinkedList>(map.entrySet());
22 |
23 | Collections.sort(entries, Collections.reverseOrder(new Comparator>() {
24 |
25 | public int compare(Entry o1, Entry o2) {
26 | return o1.getValue().compareTo(o2.getValue());
27 | }
28 | }));
29 | //LinkedHashMap will keep the keys in the order they are inserted
30 | //which is currently sorted on natural ordering
31 | Map sortedMap = new LinkedHashMap();
32 |
33 | for(Map.Entry entry: entries){
34 | sortedMap.put(entry.getKey(), entry.getValue());
35 | }
36 |
37 | return sortedMap;
38 | }
39 |
40 | public static Map sortByValuesTable(Map map){
41 | List> entries = new LinkedList>(map.entrySet());
42 |
43 | Collections.sort(entries, Collections.reverseOrder(new Comparator>() {
44 | public int compare(Entry o1, Entry o2) {
45 | return o1.getValue()[0].compareTo(o2.getValue()[0]);
46 | }
47 | }));
48 | //LinkedHashMap will keep the keys in the order they are inserted
49 | //which is currently sorted on natural ordering
50 | Map sortedMap = new LinkedHashMap();
51 |
52 | for(Map.Entry entry: entries){
53 | sortedMap.put(entry.getKey(), entry.getValue());
54 | }
55 |
56 | return sortedMap;
57 | }
58 |
59 | public static Map sortByMLCValues(Map unsortMap) {
60 |
61 | // Convert Map to List
62 | List> list =
63 | new LinkedList>(unsortMap.entrySet());
64 |
65 | // Sort list with comparator, to compare the Map values
66 | Collections.sort(list, new Comparator>() {
67 | public int compare(Map.Entry o1,
68 | Map.Entry o2) {
69 | return -(o1.getValue()).getTotalProb().compareTo(o2.getValue().getTotalProb());
70 | }
71 | });
72 |
73 | // Convert sorted map back to a Map
74 | Map sortedMap = new LinkedHashMap();
75 | for (Iterator> it = list.iterator(); it.hasNext();) {
76 | Map.Entry entry = it.next();
77 | sortedMap.put(entry.getKey(), entry.getValue());
78 | }
79 | return sortedMap;
80 | }
81 |
82 | public static HashMap getFirstEntryOfSortedMap(Map map){
83 | HashMap firstEntry = new HashMap();
84 |
85 | for ( Entry entry : map.entrySet()){
86 | firstEntry.put(entry.getKey(), entry.getValue());
87 | break;
88 | }
89 |
90 | return firstEntry;
91 | }
92 |
93 | public static HashMap invertKeysValues(Map map){
94 |
95 | HashMap invertedHashMap = new HashMap();
96 |
97 | for(Entry