├── LICENSE
├── README.md
├── config.properties
├── eval.properties
├── log4j.properties
├── pom.xml
├── samples
├── building_concepts.txt
└── samples.zip
└── src
├── main
└── java
│ └── gr
│ └── iti
│ └── mklab
│ ├── data
│ ├── GeoCell.java
│ └── ImageMetadata.java
│ ├── methods
│ ├── LanguageModel.java
│ ├── MultipleGrid.java
│ ├── SimilaritySearch.java
│ └── TermCellProbs.java
│ ├── metrics
│ ├── Entropy.java
│ └── Locality.java
│ ├── mmcomms16
│ ├── AmbiguityBasedSampling.java
│ ├── BuildingSampling.java
│ ├── GeographicalUniformSampling.java
│ ├── GeographicallyFocusedSampling.java
│ ├── Sampling.java
│ ├── TextBasedSampling.java
│ ├── TextDiversitySampling.java
│ ├── UserUniformSampling.java
│ └── VisualSampling.java
│ ├── tools
│ ├── CenterOfGravity.java
│ ├── DataManager.java
│ ├── InterfaceTermCellProb.java
│ └── SimilarityCalculator.java
│ └── util
│ ├── EasyBufferedReader.java
│ ├── EasyBufferedWriter.java
│ ├── Progress.java
│ ├── TextUtil.java
│ └── Utils.java
└── test
└── java
└── gr
└── iti
└── mklab
└── main
├── Evaluation.java
└── MultimediaGeotagging.java
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Multimedia Geotagging
2 | ======
3 |
4 | This repository contains the implementation of algorithms that estimate the geographic location of multimedia items based on their textual content. The approach is described in here and here. It was submitted in MediaEval Placing Task 2016.
5 |
6 |
7 |
8 |
Main Method
9 |
10 | The approach is a refined language model, including feature selection and weighting schemes and heuristic techniques that improves the accuracy in finer granularities. It is a text-based method, in which a complex geographical-tag model is built from the tags, titles and the locations of a massive amount of geotagged images that are included in a training set, in order to estimate the location of each query image included in a test set.
11 |
12 | The main approach comprises two major processing steps, an offline and an online.
13 |
14 |
Offline Processing Step
15 |
16 | * Pre-processing
17 | * apply URL decoding, lowercase transformation, tokenization
18 | * remove accents, punctuations and symbols (e.g. “.%!&”)
19 | * discard terms consisting of numerics or less than three characters
20 |
21 | * Language Model
22 | * divide earth surface in rectangular cells with a side length of 0.01°
23 | * calculate term-cell probabilities based on the users that used the term inside the cell
24 |
25 | * Feature selection
26 | * calculate locality score of every term in the dataset
27 | * locality is based on the term frequency and the neighbor users that have used it in the cell distribution
28 | * the final set of selected terms is formed from the terms with locality score greater than zero
29 |
30 | * Feature weighting using spatial entropy
31 | * calculate spatial entropy values of every term applying the Shannon entropy formula in the term-cell probabilities
32 | * spatial entropy weights derives from a Gaussian weight function over the spatial entropy of terms
33 | * locality weights derives from the relative position in the rank of terms based on their locality score
34 | * combine locality and spatial entropy weight to generate the final weights
35 |
36 |
Online Processing Step
37 |
38 | * Language Model based estimation (prior-estimation)
39 | * the probability of each cell is calculated
40 | 	* the Most Likely Cell (MLC) is considered to be the cell with the highest probability and is used to produce the estimation
41 |
42 | * Multiple Resolution Grids
43 | * build different language models for multiple resolution grids (side length 0.01° and 0.001°)
44 | * estimate the MLC combining the result of the individual language models
45 |
46 | * Similarity Search
47 | * determine the most similar training images within the MLC
48 | * their center-of-gravity is the final location estimation
49 |
50 |
51 |
Instructions
52 |
53 | In order to make it possible to run the project, you have to set all necessary arguments in the configuration file, following the instructions for every argument. The default values may be used.
54 |
55 |
56 | _Input File_
57 | The input files must be in the same format as the YFCC100M dataset.
58 |
59 |
60 | _Output Files_
61 | At the end of the training process, the algorithm creates a folder named `TermCellProbs` and inside the folder another folder named `scale_(s)`, named appropriately based on the scale `s` of the language model's cells. The format of this file is the following.
62 |
63 | term cell1-lon_cell1-lat>cell1-prob>cell1-users cell2-lon_cell2-lat>cell2-prob>cell2-users...
64 |
65 | `term`: the actual name of the term
66 | `cellx`: the x most probable cell.
67 | `cellx-lon_cellx-lat`: the longitude and latitude of center of the `cellx`, which is used as cell ID
68 | `cellx-prob`: the probability of the `cellx` for the specific tag
69 | `cellx-users`: the number of users that used the specific term in the `cellx`
70 |
71 | The output of the feature weighting scheme is a folder with name `Weights` containing two files one for locality weight and one for spatial entropy weights, namely `locality_weights` and `spatial_entropy_weights`, respectively. Each row contains a term and its corresponding weight, separated with a tab.
72 |
73 | The files that are described above are given as input in the Language Model estimation process. During this process, a folder named `resultsLM` and inside that folder two files named `resultsLM_scale(s)`are created, where are included the MLCs of the query images. Every row contains the imageID and the MLC (tab-separated) of the image that corresponds in the respective line in the test set. Also, a file named `resultsLM_scale(s)_conf_evid` is created in the same folder, containing the confidence and evidences that lead to estimated MLC, for every query image.
74 |
75 | Having estimated the MLCs for both granularity grids, the files are fed to the Multiple Resolution Grids technique, which produce a file named `resultsLM_mg(cs)-(fs)`, where `(cs)` and `(fs)` stands for coarser and finer granularity grid, respectively. Every row of this file contains the image id, the MLC of the coarser language model and the result of the Multiple Resolution Grids technique, separated with a `>`.
76 |
77 | In conclusion, the file that is created by the Multiple Resolution Grids technique is used for the final process of the algorithm, Similarity Search. During this process, a folder named `resultSS` is created, containing the similarity values and the locations of the images that are contained in the MLC of every image in the test set. The final results are saved in the file specified in the arguments, and the records in each row are the ID of the query image, the real longitude and latitude, and the estimated longitude and latitude, and they are tab-separated.
78 |
79 |
Evaluation Framework
80 |
81 | This package contains the implementations of the sampling strategies described in the MMCommons 2016 paper. In order to run the evaluation framework, you have to set all necessary arguments in the configuration file, following the instructions for every argument. To run the code, the Evaluation class has to be executed.
82 |
83 | Additionally, in this folder, the zip file that contains the generated collections from the different sampling strategies and the file of the building concepts can be found. Keep in mind that the geographical uniform sampling, the user uniform sampling and text diversity sampling generates different files in every code execution because they involve random selections and permutations.
84 |
85 |
Demo Version
86 |
87 | A demo version and a storm module of the approach have been developed.
88 |
89 |
Contact for further details about the project
90 |
91 | Giorgos Kordopatis-Zilos (georgekordopatis@iti.gr)
92 | Symeon Papadopoulos (papadop@iti.gr)
93 |
--------------------------------------------------------------------------------
/config.properties:
--------------------------------------------------------------------------------
1 | #Project directory
2 | dir=/home/georgekordopatis/Documents/multimedia-geotagging/images/
3 |
4 | #Processes of the program
5 | #Values:
6 | #create = create the needed sets (training and test)
7 | #train = create Cell-Tag probability file with the entropy value for each tag
8 | #FS = Feature Selection
9 | #LM = Language Model
10 | #IG = Internal Grid
11 | #SS = Similarity Search
12 | #all = all the processes
13 | process=train
14 |
15 | #Folder that contains the training files and Test set file
16 | trainFolder=/yfcc100m/
17 | testFile=/testset/2016/mediaeval2016_placing_test
18 |
19 | #Scale of Grid
20 | #side cell = 10^(-scale) (i.e. scale 2 = 0.01)
21 | coarserScale=2
22 | finerScale=3
23 |
24 | #Total number of the similar images (k) and the result files of the LM process for multiple grids (input)
25 | #required for IGSS process
26 | k=5
27 |
28 | #Name of the final Result File (output)
29 | resultFile=results_G2-3_k
--------------------------------------------------------------------------------
/eval.properties:
--------------------------------------------------------------------------------
1 | #Paths to the input Files
2 | testFile=mediaeval2015_placing_test
3 | placeFile=mediaeval2015_placing_test_places
4 | conceptFile=mediaeval2015_placing_test_autotags
5 | resultFile=results
6 |
7 | #Sampling Strategy
8 | #GUS <-- Geographical Uniform Sampling
9 | #UUS <-- User Uniform Sampling
10 | #TBS <-- Text-based Sampling
11 | #TDS <-- Text Diversity Sampling
12 | #GFS <-- Geographically Focused Sampling
13 | #ABS <-- Ambiguity-based Sampling
14 | #VS <-- Visual Sampling
15 | #BS <-- Building Sampling
16 | #(Empty) <-- No sampling
17 | sampling=GUS
18 |
19 | #Minimum and Maximum precision range
20 | #precisionrange = 10^(scale) (i.e. scale -1 --> range 0.1km)
21 | minRangeScale=-2
22 | maxRangeScale=3
23 |
--------------------------------------------------------------------------------
/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set up logging to include a file record of the output
2 | # Note: the file is always created, even if there is
3 | # no actual output.
4 | log4j.rootLogger=info, stdout, R
5 |
6 | # Log format to standard out
7 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
8 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.stdout.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n
10 |
11 | # File based log output
12 | log4j.appender.R=org.apache.log4j.RollingFileAppender
13 | log4j.appender.R.File=testout.log
14 | log4j.appender.R.MaxFileSize=100000KB
15 | log4j.appender.R.encoding=UTF-8
16 | # Keep one backup file
17 | log4j.appender.R.MaxBackupIndex=1
18 | log4j.appender.R.layout=org.apache.log4j.PatternLayout
19 | log4j.appender.R.layout.ConversionPattern= %5p [%d][%t](%F:%L) %m%n
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | 4.0.0
5 | gr.iti.mklab
6 | multimedia-geotagging
7 | 0.1-SNAPSHOT
8 | jar
9 |
10 | multimedia-geotagging
11 | https://github.com/socialsensor/multimedia-geotagging
12 | Contains the implementation of algorithms that estimate the geographic location of media content based on their content and metadata.
13 |
14 |
15 |
16 | gkordo
17 | Giorgos Kordopatis-Zilos
18 | georgekordopatis@iti.gr
19 |
20 |
21 |
22 |
23 |
24 | The Apache Software License, Version 2.0
25 | http://www.apache.org/licenses/LICENSE-2.0.txt
26 | repo
27 |
28 |
29 |
30 |
31 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git
32 | scm:git:git@github.com:socialsensor/multimedia-geotagging.git
33 | git@github.com:socialsensor/multimedia-geotagging.git
34 |
35 |
36 |
37 | UTF-8
38 |
39 |
40 |
41 |
42 |
43 | junit
44 | junit
45 | 3.8.1
46 | test
47 |
48 |
49 |
50 | log4j
51 | log4j
52 | 1.2.16
53 |
54 |
55 |
56 | org.apache.hadoop
57 | hadoop-core
58 | 1.2.1
59 |
60 |
61 |
62 | org.apache.commons
63 | commons-math3
64 | 3.4.1
65 |
66 |
67 |
68 | info.debatty
69 | java-lsh
70 | 0.10
71 |
72 |
73 |
74 | net.sf.geographiclib
75 | GeographicLib-Java
76 | 1.42
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 | org.apache.maven.plugins
85 | maven-compiler-plugin
86 | 2.5.1
87 |
88 | 1.6
89 | 1.6
90 |
91 |
92 |
93 |
94 | org.apache.maven.plugins
95 | maven-source-plugin
96 | 2.2.1
97 |
98 |
99 | attach-sources
100 |
101 | jar
102 |
103 |
104 |
105 |
106 |
107 | org.apache.maven.plugins
108 | maven-javadoc-plugin
109 | 2.9.1
110 |
111 |
112 | attach-javadocs
113 |
114 | jar
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/samples/building_concepts.txt:
--------------------------------------------------------------------------------
1 | flying buttress
2 | brussels carpet
3 | capitol
4 | rose window
5 | abbey
6 | coliseum
7 | nave
8 | cathedral
9 | pantheon
10 | chateau
11 | belfry
12 | gothic
13 | temple
14 | aisle
15 | pointed arch
16 | rotunda
17 | organ loft
18 | onion dome
19 | palace
20 | bastion
21 | campanile
22 | cloister
23 | dome
24 | clock tower
25 | roman arch
26 | round arch
27 | amphitheater
28 | church
29 | facade
30 | frieze
31 | ceiling
32 | ballpark
33 | gargoyle
34 | colonnade
35 | manor
36 | altar
37 | battlement
38 | corbel
39 | castle
40 | brownstone
41 | mansion
42 | fortification
43 | pediment
44 | row house
45 | pedestal
46 | acropolis
47 | apartment
48 | building complex
49 | skyscraper
50 | stronghold
51 | monument
52 | fortress
53 | great hall
54 | tower
55 | drawbridge
56 | arch
57 | portico
58 | stadium
59 | field house
60 | condominium
61 | fort
62 | steeple
63 | steel arch bridge
64 | memorial
65 | column
66 | gable
67 | stained
68 | dome building
69 | watchtower
70 | marina
71 | city
72 | support column
73 | concrete
74 | cantilever bridge
75 | building
76 | roof
77 | door knocker
78 | building structure
79 | department store
80 | cityscape
81 | bazaar
82 | casino
83 | baluster
84 | auditorium
85 | hall
86 | truss
87 | brickwork
88 | assembly hall
89 | harbor
90 | radome
91 | architecture
92 | warehouse
93 | chandelier
94 | house
95 | window box
96 | ruins
97 | greenhouse
98 | stairwell
99 | window
100 | lighthouse
101 | mezzanine
102 | country house
103 | library
104 | stairs
105 | bookshop
106 | waterfront
107 | cemetery
108 | villa
109 | rafter
110 | stoop
111 | resort
112 | brick
113 | bannister
114 | mantel
115 | wall
116 | loft
117 | shelter
118 | cafeteria
119 | farmhouse
120 | cabin
121 |
--------------------------------------------------------------------------------
/samples/samples.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/socialsensor/multimedia-geotagging/08a434ca3f6f11a15824e391b50a53f011d24159/samples/samples.zip
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/data/GeoCell.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.data;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 | import java.util.Map.Entry;
6 |
7 | import gr.iti.mklab.util.Utils;
8 |
9 | /**
10 | * Class that implements the earth cells.
11 | * @author gkordo
12 | *
13 | */
14 | public class GeoCell {
15 |
16 | private Double totalProb;
17 | private String id;
18 | private Float confidence;
19 | private Map evidence;
20 |
21 | /**
22 | * Constructor of the class where the id is specified and the
23 | * evidence and the summation of the probabilities are initialized.
24 | * @param id : cell ID
25 | */
26 | public GeoCell(String id){
27 | this.id = id;
28 | this.evidence = new HashMap();
29 | this.totalProb = 0.0;
30 | }
31 |
32 | /**
33 | *
34 | * @return the cell ID
35 | */
36 | public String getID(){
37 | return id;
38 | }
39 |
40 | /**
41 | * Set the value of the confidence of choosing that cell.
42 | * @param confidence : value of confidence
43 | */
44 | public void setConfidence(Float confidence){
45 | this.confidence = confidence;
46 | }
47 |
48 | /**
49 | *
50 | * @return the confidence of the cell
51 | */
52 | public Float getConfidence(){
53 | return confidence;
54 | }
55 |
56 | /**
57 | *
58 | * @return the summation of all probabilities
59 | */
60 | public Double getTotalProb() {
61 | return totalProb;
62 | }
63 |
64 | /**
65 | * Add the given probability to the summation and store the word.
66 | * @param prob : probability of the word
67 | * @param word : actual word
68 | */
69 | public void addProb(double prob, String word) {
70 | totalProb += prob;
71 | this.evidence.put(word, (float) prob);
72 | }
73 |
74 | /**
75 | *
76 | * @return the sorted map of the word and their probabilities
77 | */
78 | public Map getEvidence(){
79 | Map unsortMap = new HashMap();
80 | for(Entry word:evidence.entrySet()){
81 | if(word.getValue()/totalProb>0.0001){
82 | unsortMap.put(word.getKey(), (float) (word.getValue()/totalProb));
83 | }
84 | }
85 | return Utils.sortByValues(unsortMap);
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/data/ImageMetadata.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.data;
2 |
3 | import java.util.Set;
4 |
5 | /**
6 | * The class that contains the metadata of an image.
7 | * @author gkordo
8 | *
9 | */
/**
 * The class that contains the metadata of an image.
 * Holds the identifiers and tags read from the dataset file, plus the cells
 * assigned to the image during the estimation process.
 * @author gkordo
 *
 */
public class ImageMetadata{

	// unique ID of the image
	private String imageID;
	// cell IDs assigned to the image by the estimation process
	private String predictedCell,coarserCell;
	// ID of the user that uploaded the image
	private String userID;
	// textual terms (tags) associated with the image
	private Set<String> tags;

	/**
	 * Constructor using the metadata provided by the dataset file
	 * @param id : image ID
	 * @param userID : user ID
	 * @param tags : image tags
	 */
	public ImageMetadata (String id, String userID, Set<String> tags) {
		this.imageID = id;
		this.userID = userID;
		this.tags = tags;
	}

	public String getId () {
		return imageID;
	}

	public String getUserId () {
		return userID;
	}

	public Set<String> getTags () {
		return tags;
	}

	public void setPredictedCell (String cell){
		this.predictedCell = cell;
	}

	public void setCoarserCell (String cell){
		this.coarserCell = cell;
	}

	public String getCell () {
		return predictedCell;
	}

	public String getCoarserCell () {
		return coarserCell;
	}
}
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/LanguageModel.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.data.GeoCell;
4 | import gr.iti.mklab.tools.DataManager;
5 | import gr.iti.mklab.util.EasyBufferedReader;
6 | import gr.iti.mklab.util.Utils;
7 | import gr.iti.mklab.util.Progress;
8 |
9 | import java.util.HashMap;
10 | import java.util.Map;
11 | import java.util.Set;
12 | import java.util.Map.Entry;
13 |
14 | import org.apache.log4j.Logger;
15 |
16 | /**
17 | * This class is the core of the algorithm. It is the implementation of the language model.
18 | * The Most Likely Cell of the given image is calculated.
19 | * @author gkordo
20 | *
21 | */
22 | public class LanguageModel {
23 |
24 | protected Map selectedTermWeights;
25 |
26 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.LanguageModel");
27 |
28 | // The function that compose the other functions to calculate and
29 | // return the Most Likely Cell (MLC) for a query item.
30 | public GeoCell calculateLanguageModel(Set sentenceWords,
31 | Map> termCellProbsMap, boolean confidenceFlag) {
32 |
33 | Map cellMap = calculateCellsProbForImageTags(sentenceWords,
34 | termCellProbsMap);
35 |
36 | GeoCell mlc = findMLC(cellMap, confidenceFlag);
37 |
38 | return mlc;
39 | }
40 |
41 | // find the Most Likely Cell.
42 | private GeoCell findMLC(
43 | Map cellMap, boolean confidenceFlag) {
44 |
45 | cellMap = Utils.sortByMLCValues(cellMap);
46 |
47 | GeoCell mlc = null;
48 |
49 | if (!cellMap.isEmpty()){
50 | String mlcId = cellMap.keySet().toArray()[0].toString();
51 |
52 | mlc = cellMap.get(mlcId);
53 |
54 | if(confidenceFlag)
55 | mlc.setConfidence((float) calculateConfidence(cellMap, mlcId, 0.3));
56 | }
57 |
58 | return mlc;
59 | }
60 |
61 | // calculate confidence for the estimated location
62 | private static double calculateConfidence(Map cellMap,
63 | String mlc, double l) {
64 |
65 | Double sum = 0.0, total = 0.0;
66 |
67 | for(Entry entry:cellMap.entrySet()){
68 | double[] mCell = {Double.parseDouble(mlc.split("_")[0]),
69 | Double.parseDouble(mlc.split("_")[1])};
70 | double[] cell = {Double.parseDouble(entry.getKey().split("_")[0]),
71 | Double.parseDouble(mlc.split("_")[1])};
72 | if((cell[0] >= (mCell[0]-l)) && (cell[0] <= (mCell[0]+l))
73 | && (cell[1] >= (mCell[1]-l)) && (cell[1] <= (mCell[1]+l))){
74 | sum += entry.getValue().getTotalProb();
75 | }
76 | total += entry.getValue().getTotalProb();
77 | }
78 | return sum/total;
79 | }
80 |
81 | /**
82 | * This is the function that calculate the cell probabilities.
83 | * @param sentenceWords : list of words contained in tweet text
84 | * @return a map of cell
85 | */
86 | private Map calculateCellsProbForImageTags (Set terms,
87 | Map> termCellProbsMap) {
88 |
89 | Map cellMap = new HashMap();
90 |
91 | for(String term:terms){
92 | if(termCellProbsMap.containsKey(term)){
93 | double locality= selectedTermWeights.get(term)[1];
94 | double entropy= selectedTermWeights.get(term)[0];
95 |
96 | for(Entry entry: termCellProbsMap.get(term).entrySet()){
97 | String cell = entry.getKey();
98 | if(cellMap.containsKey(cell)){
99 | cellMap.get(cell).addProb(entry.getValue()
100 | *(0.8*locality+0.2*entropy), term);
101 | }else{
102 | GeoCell tmp = new GeoCell(cell);
103 | tmp.addProb(entry.getValue()
104 | *(0.8*locality+0.2*entropy), term);
105 | cellMap.put(cell,tmp);
106 | }
107 | }
108 | }
109 | }
110 | return cellMap;
111 | }
112 |
113 | /**
114 | * Initialize Language Model
115 | * @param testFile : file that contains test image metadata
116 | * @param probFile : file that contains the term-cell probabilities
117 | * @param weightFolder : the folder that contains the term weights
118 | * @return the term-cell probability map
119 | */
120 | public Map> loadTermCellProbsAndWeights(
121 | String testFile, String probFile, String weightFolder){
122 |
123 | // Feature Selection
124 | loadTermWeights(weightFolder);
125 |
126 | logger.info("loading cells' probabilities for all tags from " + probFile);
127 |
128 | long startTime = System.currentTimeMillis();
129 | Progress prog = new Progress(startTime,10,1,"loading",logger);
130 |
131 | Map> tagCellProbsMap =
132 | new HashMap>();
133 | Set termsInTestSet = DataManager.getSetOfTerms(testFile);
134 |
135 | EasyBufferedReader reader = new EasyBufferedReader(probFile);
136 | String line;
137 | // load tag-cell probabilities from the given file
138 | while ((line = reader.readLine())!=null){
139 | prog.showMessege(System.currentTimeMillis());
140 | String term = line.split("\t")[0];
141 |
142 | if(line.split("\t").length>1 && termsInTestSet.contains(term)
143 | && selectedTermWeights.containsKey(term)){
144 | Map tmpCellMap = new HashMap();
145 | for(String cell:line.split("\t")[2].split(" ")){
146 | tmpCellMap.put(cell.split(">")[0],
147 | Double.parseDouble(cell.split(">")[1]));
148 | }
149 | tagCellProbsMap.put(term, tmpCellMap);
150 | }
151 | }
152 | logger.info(tagCellProbsMap.size() + " tags loaded in " +
153 | (System.currentTimeMillis()-startTime)/1000.0 + "s");
154 | reader.close();
155 |
156 | return tagCellProbsMap;
157 | }
158 |
159 | private void loadTermWeights(String folder){
160 |
161 | // load locality weight of the terms
162 | EasyBufferedReader reader = new
163 | EasyBufferedReader(folder + "/locality_weights");
164 | String line;
165 | while ((line = reader.readLine())!=null){
166 | Double[] temp = {0.0, Double.parseDouble(line.split("\t")[1])};
167 | selectedTermWeights.put(line.split("\t")[0], temp);
168 | }
169 | reader.close();
170 |
171 | // load spatial entropy weight of the terms
172 | reader = new EasyBufferedReader(
173 | folder + "/spatial_entropy_weights");
174 | while ((line = reader.readLine())!=null){
175 | if(selectedTermWeights.containsKey(line.split("\t")[0]))
176 | selectedTermWeights.get(line.split("\t")[0])[0] =
177 | Double.parseDouble(line.split("\t")[1]);
178 | }
179 | reader.close();
180 | }
181 | }
182 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/MultipleGrid.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.util.EasyBufferedReader;
4 | import gr.iti.mklab.util.EasyBufferedWriter;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | /**
9 | * The implementation of the Internal Grid technique
10 | * @author gkordo
11 | *
12 | */
13 | public class MultipleGrid {
14 |
15 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.InternalGrid");
16 |
17 | /**
18 | * Method that perform the Multiple Grid technique and generates
19 | * the arguments for the similarity search Class contractor
20 | * @param dir : directory of the project
21 | * @param resultFile : name of the output file
22 | * @param resultCorserGrid : file with the estimated cells of the coarser grid
23 | * @param resultFinerGrid : file with the estimated cells of the finer grid
24 | */
25 | public static void determinCellIDsForSS(String dir, String resultFile,
26 | String resultCorserGrid, String resultFinerGrid){
27 |
28 | logger.info("Process: Multiple Grid Technique\t|\t"
29 | + "Status: INITIALIZE");
30 | // Initialize parameters
31 | EasyBufferedReader resultLMGCReader = new EasyBufferedReader(dir + resultCorserGrid);
32 | EasyBufferedReader resultLMGFReader = new EasyBufferedReader(dir + resultFinerGrid);
33 | EasyBufferedWriter writer = new EasyBufferedWriter(dir + resultFile);
34 |
35 | String corseMLC;
36 | String fineMLC;
37 |
38 | logger.info("Process: Multiple Grid Technique\t|\t"
39 | + "Status: STARTED");
40 |
41 | while ((corseMLC=resultLMGCReader.readLine())!=null
42 | && (fineMLC=resultLMGFReader.readLine())!=null){
43 |
44 | if(!corseMLC.split("\t")[1].equals("N/A")){
45 | String mlc = deterimBoarders(corseMLC.split("\t")[1], fineMLC.split("\t")[1]);
46 | if(!mlc.isEmpty()){
47 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1]
48 | + ":" + mlc); // selected cell ID and the sell of the coarser granularity
49 | }else{
50 | writer.write(corseMLC.split("\t")[0] + "\t" + corseMLC.split("\t")[1]
51 | + ":" + corseMLC.split("\t")[1]);
52 | }
53 | writer.newLine();
54 | } else{
55 | writer.write(corseMLC.split("\t")[0] + "\tN/A");
56 | }
57 | }
58 |
59 | logger.info("Process: Multiple Grid Technique\t|\t"
60 | + "Status: COMPLETED");
61 |
62 | writer.close();
63 | resultLMGCReader.close();
64 | resultLMGFReader.close();
65 | }
66 |
67 | /**
68 | * Method that determines the borders of the cell that similarity search will take place
69 | * @param corseMLC : estimated cell of the coarser grid
70 | * @param fineMLC : estimated cell of the finer grid
71 | */
72 | private static String deterimBoarders(String corseMLC, String fineMLC){
73 |
74 | String mlc = corseMLC;
75 |
76 | if (!corseMLC.equals("N/A")){
77 | Double[] corseLatLon = {Double.parseDouble(corseMLC.split("_")[0]),
78 | Double.parseDouble(corseMLC.split("_")[1])};
79 |
80 | if(!fineMLC.equals("N/A")){
81 | Double[] fineLatLon = {Double.parseDouble(fineMLC.split("_")[0]),
82 | Double.parseDouble(fineMLC.split("_")[1])};
83 |
84 | // check whether the estimated cell of the finer grid laying
85 | // inside the borders of the estimated cell of the coarser grid
86 | if(fineLatLon[0]>=(corseLatLon[0]-0.005)
87 | && fineLatLon[0]<=(corseLatLon[0]+0.005)
88 | && fineLatLon[1]>=(corseLatLon[1]-0.005)
89 | && fineLatLon[1]<=(corseLatLon[1]+0.005)){
90 | mlc = fineMLC;
91 | }
92 | }
93 | }
94 |
95 | return mlc;
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/SimilaritySearch.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.tools.CenterOfGravity;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | import gr.iti.mklab.util.Progress;
14 | import gr.iti.mklab.util.EasyBufferedReader;
15 |
16 | /**
17 | * Class that estimates the final location for every query image
18 | * @author gkordo
19 | *
20 | */
public class SimilaritySearch extends CenterOfGravity{

	// query image ID -> estimated cell string from the multiple-grid file
	private Map estimatedCellMap = new HashMap();
	// query image ID -> estimated location string produced by findSimilarImages
	private Map similarities = new HashMap();
	private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch");

	/**
	 * Constructor of the class; runs the whole estimation pipeline:
	 * load the multiple-grid cells, estimate a location per query image,
	 * and write the results to the output file.
	 * @param testFile : file that contains the test images' metadata
	 * @param multipleGridFile : file that contains the results of the multiple grid technique
	 * @param similarityFile : file that contains the similar images of every query image
	 * @param outputFile : name of the output file
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 * @param a : variable required for center-of-gravity calculation
	 */
	public SimilaritySearch(String testFile,String multipleGridFile,
			String similarityFile, String outputFile, int k, int a) {
		super(a);

		logger.info("Process: Location Estimation\t|\t"
				+ "Status: INITIALIZE");
		loadEstimatedCells(multipleGridFile);
		logger.info("Process: Location Estimation\t|\t"
				+ "Status: STARTED");
		estimateLocation(similarityFile,k);
		writeResultsInFile(testFile, outputFile);
		logger.info("Process: Location Estimation\t|\t"
				+ "Status: COMPLETED");
	}

	/**
	 * Loads the estimated cells produced by the Multiple Grid technique.
	 * Query images whose second column is "N/A" are skipped, so they never
	 * enter estimatedCellMap (and later fall back to the default location).
	 * @param multipleGridFile : file that contains the results of the multiple grid technique
	 */
	private void loadEstimatedCells(String multipleGridFile) {

		EasyBufferedReader reader = new EasyBufferedReader(multipleGridFile);

		String line;
		while ((line = reader.readLine())!=null){
			if((!line.split("\t")[1].equals("N/A"))){
				estimatedCellMap.put(line.split("\t")[0], line.split("\t")[1]);
			}
		}

		reader.close();
	}

	/**
	 * Final location estimation of the images contained in the test set;
	 * one line of the similarity file corresponds to one query image.
	 * @param similarityFile : file that contains the similar images of every query image
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 */
	private void estimateLocation(String similarityFile, int k) {

		EasyBufferedReader reader = new EasyBufferedReader(similarityFile);

		Progress prog = new Progress(System.currentTimeMillis(), 1000000, 100, 1, "calculate", logger);
		int count=0;
		String line;

		// Calculate the final results; only images that got a cell from the
		// multiple-grid stage are estimated here
		while ((line = reader.readLine())!=null){
			prog.showProgress(count, System.currentTimeMillis());
			if(estimatedCellMap.containsKey(line.split("\t")[0])){
				similarities.put(line.split("\t")[0],
						findSimilarImages(line, estimatedCellMap.get(line.split("\t")[0]), k));
			}
			count++;
		}
		reader.close();
	}

	/**
	 * Location estimation for a single query image from its most similar
	 * train images and the cell selected by the multiple-grid technique.
	 *
	 * NOTE(review): this method is garbled in the extracted source -- the
	 * generic type parameters and (apparently) the "size() < k" comparisons
	 * were lost during extraction, fusing several lines around the missing
	 * '<' characters. The code is kept byte-identical below; reconstruct it
	 * from version control before attempting any behavioral change.
	 * @param line : line that contains the similarities of the train images
	 * @param cells : estimated cells from the multiple grid technique
	 * @param k : number of similar images on which the center-of-gravity is calculated
	 * @return estimated location
	 */
	private static String findSimilarImages(String line, String cells, int k){

		List images = new ArrayList();
		Collections.addAll(images, line.split("\t")[1].split(" "));

		Map similarity = new HashMap(k);
		Map similarityCoarser = new HashMap(k);

		boolean flag = false;
		Double[] result = new Double[2];

		// final estimation
		// NOTE(review): garbled region -- see method comment
		for(String image:images){
			if(similarity.size()")[0].equals(cells.split(">")[1])){
				if(deterimCell(image.split(">")[0],cells)){
					similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1]));
				}else if(similarityCoarser.size()")[0], Double.parseDouble(image.split(">")[1]));
				}
			}else {
				similarity.put(image.split(">")[0], Double.parseDouble(image.split(">")[1]));
			}
		}else{
			flag = true;
			result = computeCoordination(similarity);
			break;
		}
	}

		// fall back to whichever similarity set is non-empty when the loop
		// finished without triggering the center-of-gravity computation
		if(similarity.size()>0 && !flag){
			flag = true;
			result = computeCoordination(similarity);
		}else if(similarityCoarser.size()>0 && !flag){
			flag = true;
			result = computeCoordination(similarityCoarser);
		}

		// final return: computed coordinates, or the center of the selected
		// cell when no similar image qualified
		if(flag){
			return result[1] + "\t" + result[0];
		}else{
			return cells.split(">")[0].replace("_", "\t");
		}
	}

	/**
	 * Function that determines if the given point lies inside a given cell
	 * (a square window of half-side 0.0005 around the cell center).
	 * @param point : coordinate pair encoded as "coord_coord"
	 * @param cell : grid's cell, same encoding
	 * @return true when the point lies inside the cell window
	 */
	private static boolean deterimCell(String point, String cell){

		boolean cellID = false;

		Double[] pointLoc = {Double.parseDouble(point.split("_")[0]), Double.parseDouble(point.split("_")[1])};
		Double[] cellLoc = {Double.parseDouble(cell.split("_")[0]), Double.parseDouble(cell.split("_")[1])};

		if((pointLoc[0]>=(cellLoc[0]-0.0005)) && (pointLoc[0]<=(cellLoc[0]+0.0005))
				&&(pointLoc[1]>=(cellLoc[1]-0.0005)) && (pointLoc[1]<=(cellLoc[1]+0.0005))){
			cellID = true;
		}

		return cellID;
	}

	/**
	 * Function that writes the results in a file, one line per test image.
	 * NOTE(review): no tab is written between column 0 and column 1 of the
	 * output (both branches) -- confirm this matches the expected format.
	 * NOTE(review): the fallback coordinates below are a fixed point
	 * (presumably a dataset prior) used when no estimation exists -- verify.
	 * @param testFile : file that contains the test images' metadata
	 * @param outputFile : name of the output file
	 */
	private void writeResultsInFile(String testFile, String outputFile) {

		EasyBufferedReader reader = new EasyBufferedReader(testFile);
		EasyBufferedWriter writer = new EasyBufferedWriter(outputFile);

		String line;
		// for every query image
		while ((line = reader.readLine())!=null){

			writer.write(line.split("\t")[0]);

			if(similarities.containsKey(line.split("\t")[0])){ // the location has been estimated
				writer.write(line.split("\t")[1] + "\t" +
						line.split("\t")[12] + "\t" + line.split("\t")[13] + "\t" +
						similarities.get(line.split("\t")[0]));
				writer.newLine();
			}else{ // no estimation -> fixed fallback location
				writer.write(line.split("\t")[1] + "\t" +
						line.split("\t")[12] + "\t" + line.split("\t")[13]
								+ "\t-73.98282136256299\t40.75282028252674");
				writer.newLine();
			}
		}
		reader.close();
		writer.close();
	}
}
201 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/methods/TermCellProbs.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.methods;
2 |
3 | import gr.iti.mklab.tools.InterfaceTermCellProb;
4 | import gr.iti.mklab.util.Utils;
5 | import gr.iti.mklab.util.TextUtil;
6 |
7 | import java.io.File;
8 | import java.io.IOException;
9 | import java.math.BigDecimal;
10 | import java.util.*;
11 | import java.util.Map.Entry;
12 |
13 | import org.apache.commons.io.FileUtils;
14 | import org.apache.hadoop.fs.Path;
15 | import org.apache.hadoop.io.*;
16 | import org.apache.hadoop.mapred.*;
17 | import org.apache.log4j.Logger;
18 |
19 | /**
20 | * Class that calculate the term-cell probabilities for all term in all cells and saves the results in file.
21 | * The implementation employ hadoop map-reduce function.
22 | * @author gkordo
23 | *
24 | */
25 | public class TermCellProbs implements InterfaceTermCellProb{
26 |
27 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.TermCellProbCalculator");
28 | private static Set testIDs;
29 | private static Set users;
30 | private static int scale;
31 |
32 | /**
33 | * Contractor of the class get the set of image IDs and the user IDs of the images in the test set.
34 | * @param testIDs : set of test image IDs
35 | * @param users : set of test user IDs
36 | */
37 | public TermCellProbs(Set testIDs, Set users){
38 | TermCellProbs.testIDs = testIDs;
39 | TermCellProbs.users = users;
40 | }
41 |
42 | /**
43 | * Map class that takes the lines of the train file as input and creates key-value pairs,
44 | * using as keys the terms contained in the images and as values strings that contain
45 | * the information regarding the cell and user ID.
46 | * @author gkordo
47 | *
48 | */
49 | public static class MapTermCellProb extends MapReduceBase implements Mapper {
50 |
51 | /**
52 | * Required map function
53 | * @param key : key value
54 | * @param value : input string
55 | * @param output : output collector
56 | * @param reporter : reporter of the job
57 | */
58 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
59 |
60 | String[] metadata = value.toString().split("\t");
61 |
62 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
63 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
64 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
65 |
66 | // get image cell based on its latitude-longitude pair
67 | BigDecimal cellLonCenter = new BigDecimal(Double.parseDouble(
68 | metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP);
69 | BigDecimal cellLatCenter = new BigDecimal(Double.parseDouble(
70 | metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP);
71 |
72 | String cellID = cellLonCenter+"_"+cellLatCenter;
73 |
74 | //get image user ID
75 | String userID = metadata[3];
76 |
77 | // get image tags
78 | Set terms = new HashSet();
79 | TextUtil.parse(metadata[10], terms);
80 | TextUtil.parse(metadata[8], terms);
81 |
82 | for(String term:terms){
83 | if(!term.isEmpty() && term.length() > 2){
84 | output.collect(new Text(term), new Text(cellID+">"+userID)); // key-value pair
85 | }
86 | }
87 | }
88 | }
89 | }
90 |
91 |
92 | /**
93 | * Reduce class that get the key-value pairs and calculate the term-cell probabilities of every term.
94 | * @author gkordo
95 | *
96 | */
97 | public static class ReduceTermCellProb extends MapReduceBase implements Reducer {
98 |
99 | /**
100 | * Required reduce function
101 | * @param key : key value
102 | * @param values : set of values that share the same key
103 | * @param output : output collector
104 | * @param reporter : reporter of the job
105 | */
106 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
107 |
108 | // frequency map that contains the count of the different users for every single cell
109 | Map> termFreq = new HashMap>();
110 | int Nt = 0; // total user count
111 |
112 | // process every value that corresponds to a specific key
113 | while (values.hasNext()) {
114 |
115 | String entry = values.next().toString();
116 |
117 | // retrieve cell ID and user ID from the value of the pair
118 | String cellID = entry.split(">")[0];
119 | String userID = entry.split(">")[1];
120 |
121 | // update of the frequency map
122 | if (termFreq.containsKey(cellID)){
123 | if(!termFreq.get(cellID).contains(userID)){
124 | Nt++;
125 | termFreq.get(cellID).add(userID);
126 | }
127 | }else{
128 | Nt++;
129 | termFreq.put(cellID,new HashSet());
130 | termFreq.get(cellID).add(userID);
131 | }
132 | }
133 |
134 | // calculation of the tag-cell probabilities map for every cell
135 | Map cellsProbs = new HashMap();
136 | for(Entry> entryCell : termFreq.entrySet()){
137 | String cellID = entryCell.getKey();
138 | Double cellProb = ((double)(entryCell.getValue().size()))/Nt;
139 | cellsProbs.put(cellID,cellProb);
140 | }
141 |
142 | // sorting of the tag-cell probabilities map
143 | Map cellsProbsSorted = Utils.sortByValues(cellsProbs);
144 |
145 | // convert tag-cell probabilities map in string in order to be saved in the output file
146 | String out = convertMapToString(cellsProbsSorted,termFreq);
147 |
148 | // send output to collector
149 | output.collect(key, new Text(out));
150 | }
151 |
152 | /**
153 | * Function that convert tag-cell probabilities map in output string.
154 | * @param cellsProbs : tag-cell probabilities map
155 | * @param termFreq : frequency map
156 | * @return a string contains cell IDs accompanied with tag-cell probabilities
157 | */
158 | public static String convertMapToString(Map cellsProbs,
159 | Map> termFreq){
160 | String out = "";
161 | for(Entry entryCell: cellsProbs.entrySet()){
162 | if(cellsProbs.get(entryCell.getKey()) >= 0.00001){
163 | String tempCellIDProb = entryCell.getKey()
164 | + ">" + cellsProbs.get(entryCell.getKey())
165 | + ">" + termFreq.get(entryCell.getKey()).size();
166 |
167 | out += (tempCellIDProb + " ");
168 | }
169 | }
170 | return out.trim();
171 | }
172 | }
173 |
174 | /**
175 | * Core function for the job of tag-cell probabilities calculation.
176 | * @param dir : directory of the project
177 | * @param trainFolder : the file of the train set
178 | * @param outFolder : the folder where the tag-set probabilities file will be stored
179 | * @param scale : the scale of the grid that is used
180 | */
181 | public void calculatorTermCellProb(String dir, String trainFolder,
182 | String outFolder, int scale) throws IOException{
183 |
184 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
185 | + "Status: INITIALIZE");
186 |
187 | TermCellProbs.scale = scale;
188 |
189 | // initialize Job
190 | JobConf conf = new JobConf(TermCellProbs.class);
191 | conf.setJobName("termcellprobmapred");
192 |
193 | conf.setOutputKeyClass(Text.class);
194 | conf.setOutputValueClass(Text.class);
195 |
196 | conf.setMapperClass(MapTermCellProb.class);
197 | conf.setReducerClass(ReduceTermCellProb.class);
198 |
199 | conf.setInputFormat(TextInputFormat.class);
200 | conf.setOutputFormat(TextOutputFormat.class);
201 |
202 | // clean the output file directory
203 | File folder = new File(dir + outFolder);
204 | if (folder.exists()) {
205 | FileUtils.cleanDirectory(folder);
206 | FileUtils.forceDelete(folder);
207 | }
208 |
209 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
210 | FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder));
211 |
212 | // start Job
213 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
214 | + "Status: STARTED");
215 | long startTime = System.currentTimeMillis();
216 | JobClient.runJob(conf);
217 | logger.info("Process: Term-Cell Propabilities Calculation\t|\t"
218 | + "Status: COMPLETED\t|\tTotal time: " +
219 | (System.currentTimeMillis()-startTime)/60000.0+"m");
220 |
221 | new File(dir + outFolder + "/part-00000").renameTo(
222 | new File(dir + outFolder + "/term_cell_probs")); // rename the output file
223 | }
224 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/metrics/Entropy.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.metrics;
2 |
3 | import gr.iti.mklab.util.EasyBufferedReader;
4 | import gr.iti.mklab.util.EasyBufferedWriter;
5 | import gr.iti.mklab.util.Utils;
6 |
7 | import java.io.File;
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import org.apache.commons.math3.distribution.NormalDistribution;
13 | import org.apache.commons.math3.stat.descriptive.moment.Mean;
14 | import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
15 | import org.apache.log4j.Logger;
16 |
17 | /**
18 | * Entropy class update the file that contains the tag-cell probabilities with the spatial entropy of every individual tag.
19 | * Calculate the spatial tag entropy for all of the tags. Entropy is used for feature weighting.
20 | * @author gkordo
21 | *
22 | */
23 | public class Entropy {
24 |
25 | static Logger logger = Logger.getLogger("gr.iti.mklab.method.Entropy");
26 |
27 | /**
28 | * Calculate the Spatial Entropy weights of the LM terms
29 | * @param dir : project directory
30 | * @param fileTermCell : Term-Cell probability file
31 | */
32 | public static void calculateEntropyWeights(String dir, String fileTermCell){
33 |
34 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
35 | + "Status: INITIALIZE");
36 |
37 | new File(dir + "Weights").mkdir();
38 |
39 | // Term Spatial Entropy calculation
40 | EasyBufferedReader reader = new EasyBufferedReader(dir + fileTermCell);
41 | Map termSpatialEntropy = new HashMap();
42 | long sTime = System.currentTimeMillis();
43 | String line;
44 | while ((line=reader.readLine())!=null){
45 | String term = line.split("\t")[0];
46 | String[] cells = line.split("\t")[1].split(" ");
47 | if(cells.length > 1
48 | && term.length() > 3){
49 | termSpatialEntropy.put(term,
50 | computeEntropyNaive(cells));
51 | }
52 | }
53 | reader.close();
54 |
55 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
56 | + "Status: STARTED");
57 |
58 | // Spatial Entropy weights calculation of terms
59 | Map weights = calculateSpatialEntropyWeights(termSpatialEntropy);
60 |
61 | // store weights
62 | EasyBufferedWriter writer = new EasyBufferedWriter(
63 | dir + "Weights/spatial_entropy_weights");
64 | for(Entry term:weights.entrySet()){
65 | writer.write(term.getKey() + "\t" + term.getValue());
66 | writer.newLine();
67 | }
68 |
69 | logger.info("Process: Spatial Entropy weights calculation\t|\t"
70 | + "Status: COMPLETED\t|\tTotal time: " +
71 | (System.currentTimeMillis()-sTime)/1000.0 + "s");
72 | writer.close();
73 | }
74 |
75 | /**
76 | * Shannon entropy formula
77 | * @param probabilities : probability distribution
78 | * @return
79 | */
80 | private static double computeEntropyNaive(String[] probabilities) {
81 | double entropy = 0.0;
82 | for (int i=0;i< probabilities.length;i++) {
83 | double p = Double.parseDouble(probabilities[i].split(">")[1]);
84 | if(p != 0.0){
85 | entropy -= p * Math.log(p);
86 | }
87 | }
88 | return entropy;
89 | }
90 |
91 | /**
92 | * Calculate the max probability value applying the Gaussian functionon the
93 | * probability distribution
94 | * @param entropies : spatial entropy values of the terms
95 | * @return max weight
96 | */
97 | private static Map calculateSpatialEntropyWeights(
98 | Map entropies){
99 |
100 | double[] termSpatialEntropyValues = entropies
101 | .values().stream().mapToDouble(d -> d).toArray();
102 |
103 | NormalDistribution gd = new NormalDistribution( // Gaussian function for re-weighting
104 | new Mean().evaluate(termSpatialEntropyValues),
105 | new StandardDeviation().evaluate(termSpatialEntropyValues));
106 |
107 | Double gdMax = 0.0;
108 | Map weights = new HashMap();
109 | for(Entry p:entropies.entrySet()){
110 | double weight = gd.density(p.getValue());
111 | weights.put(p.getKey(), weight);
112 | if(gdMax < weight){
113 | gdMax = weight;
114 | }
115 | }
116 |
117 | for(Entry term:weights.entrySet()){
118 | term.setValue(term.getValue()/gdMax);
119 | }
120 |
121 | return Utils.sortByValues(weights);
122 | }
123 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/metrics/Locality.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.metrics;
2 |
3 | import gr.iti.mklab.tools.DataManager;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.EasyBufferedWriter;
6 | import gr.iti.mklab.util.TextUtil;
7 | import gr.iti.mklab.util.Utils;
8 |
9 | import java.io.File;
10 | import java.io.IOException;
11 | import java.math.BigDecimal;
12 | import java.util.HashMap;
13 | import java.util.HashSet;
14 | import java.util.Iterator;
15 | import java.util.Map;
16 | import java.util.Map.Entry;
17 | import java.util.Set;
18 |
19 | import org.apache.commons.io.FileUtils;
20 | import org.apache.hadoop.fs.Path;
21 | import org.apache.hadoop.io.LongWritable;
22 | import org.apache.hadoop.io.Text;
23 | import org.apache.hadoop.mapred.FileInputFormat;
24 | import org.apache.hadoop.mapred.FileOutputFormat;
25 | import org.apache.hadoop.mapred.JobClient;
26 | import org.apache.hadoop.mapred.JobConf;
27 | import org.apache.hadoop.mapred.MapReduceBase;
28 | import org.apache.hadoop.mapred.Mapper;
29 | import org.apache.hadoop.mapred.OutputCollector;
30 | import org.apache.hadoop.mapred.Reducer;
31 | import org.apache.hadoop.mapred.Reporter;
32 | import org.apache.hadoop.mapred.TextInputFormat;
33 | import org.apache.hadoop.mapred.TextOutputFormat;
34 | import org.apache.log4j.Logger;
35 |
36 | /**
37 | * Class that calculate the locality of the terms and saves the results in file.
38 | * The implementation employ hadoop map-reduce function.
39 | * @author gkordo
40 | *
41 | */
42 | public class Locality {
43 |
44 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.Locality");
45 | private static Set testIDs;
46 | private static Set users;
47 | private static int scale;
48 |
	// Constructor: extracts the test-set image IDs and user IDs from the given
	// file and stores the grid scale.
	// NOTE(review): all three targets are *static* fields, so constructing a
	// second Locality instance silently overwrites the state of the first;
	// kept because the map/reduce inner classes read this static state.
	public Locality(String testFile, int scale){
		testIDs = DataManager.getSetOfImageIDs(testFile);
		users = DataManager.getSetOfUserID(testFile);
		Locality.scale = scale;
	}
54 |
55 | /**
56 | * Map class that takes the lines of the train file as input and creates key-value pairs,
57 | * using as keys the tags contained in the images and as values strings that contain
58 | * the information regarding the cell and user ID.
59 | * @author gkordo
60 | *
61 | */
62 | public static class MapLocality extends MapReduceBase implements Mapper {
63 |
64 | /**
65 | * Required map function
66 | * @param key : key value
67 | * @param value : input string
68 | * @param output : output collector
69 | * @param reporter : reporter of the job
70 | */
71 | public void map(LongWritable key, Text value,
72 | OutputCollector output, Reporter reporter) throws IOException {
73 |
74 | String[] metadata = value.toString().split("\t");
75 |
76 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
77 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
78 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
79 |
80 | BigDecimal tmpLonCenter = new BigDecimal(
81 | Double.parseDouble(metadata[12])).setScale(scale, BigDecimal.ROUND_HALF_UP);
82 | BigDecimal tmpLatCenter = new BigDecimal(
83 | Double.parseDouble(metadata[13])).setScale(scale, BigDecimal.ROUND_HALF_UP);
84 |
85 | //get image user ID
86 | String userID = metadata[3];
87 |
88 | // get image tags
89 | Set terms = new HashSet();
90 | TextUtil.parse(metadata[10], terms);
91 | TextUtil.parse(metadata[8], terms);
92 |
93 | // send key-value pairs
94 | for(String term:terms) {
95 | if(!term.isEmpty() && term.length() > 2){
96 | for(int j=-2;j<2;j++){
97 | for(int k=-2;k<2;k++){
98 | output.collect(new Text(term), new Text(userID + ">" +
99 | (tmpLonCenter.doubleValue()+((j)*0.01)) + "_" +
100 | (tmpLatCenter.doubleValue()+((k)*0.01))));
101 | }
102 | }
103 | }
104 | }
105 | }
106 | }
107 | }
108 |
109 | /**
110 | * Reduce class that get the key-value pairs and calculate the locality of every term.
111 | * @author gkordo
112 | *
113 | */
114 | public static class ReduceLocality extends MapReduceBase implements Reducer {
115 |
116 | /**
117 | * Required reduce function
118 | * @param key : key value
119 | * @param values : set of values that share the same key
120 | * @param output : output collector
121 | * @param reporter : reporter of the job
122 | */
123 | public void reduce(Text key, Iterator values,
124 | OutputCollector output, Reporter reporter) throws IOException {
125 |
126 | // map of cells that contains the count of the different users for every single cell
127 | Map> cells = new HashMap>();
128 | int Nt = 0; // total user count
129 |
130 | while (values.hasNext()) {
131 |
132 | String value = values.next().toString();
133 |
134 | // retrieve cell ID and user ID from the value of the pair
135 | String user = value.split(">")[0];
136 | String cell = value.split(">")[1];
137 |
138 | // update of the frequency map
139 | if(cells.containsKey(cell)){
140 | if(!cells.get(cell).contains(user)){
141 | cells.get(cell).add(user);
142 | Nt++;
143 | }
144 | }else{
145 | cells.put(cell,new HashSet());
146 | cells.get(cell).add(user);
147 | Nt++;
148 | }
149 | }
150 |
151 | // locality calculation
152 | double locality = 0.0;
153 | for(Entry> entry : cells.entrySet()){
154 | int v=entry.getValue().size();
155 | locality+=v*(v-1)/Nt;
156 |
157 | }
158 |
159 | // send output to collector
160 | if(locality > 0.0){
161 | output.collect(key, new Text(locality + ""));
162 | }
163 | }
164 | }
165 |
166 | /**
167 | * Core function for the job of tag-cell probabilities calculation.
168 | * @param dir : project directory
169 | * @param trainFolder : the file of the train set
170 | * @throws IOException : file not found
171 | */
172 | public void calculateLocality(String dir, String trainFolder) throws IOException{
173 |
174 | logger.info("Process: Locality weight calculation\t|\t"
175 | + "Status: INITIALIZE");
176 | JobConf conf = new JobConf(Locality.class);
177 | conf.setJobName("Locality");
178 |
179 | conf.setOutputKeyClass(Text.class);
180 | conf.setOutputValueClass(Text.class);
181 |
182 | conf.setMapperClass(MapLocality.class);
183 | conf.setReducerClass(ReduceLocality.class);
184 |
185 | conf.setInputFormat(TextInputFormat.class);
186 | conf.setOutputFormat(TextOutputFormat.class);
187 |
188 | // clean the output file directory
189 | File folder = new File(dir + "temp/locality");
190 | if (folder.exists()) {
191 | FileUtils.cleanDirectory(folder);
192 | FileUtils.forceDelete(folder);
193 | }
194 |
195 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
196 | FileOutputFormat.setOutputPath(conf, new Path(dir + "temp/locality"));
197 |
198 | logger.info("Process: Locality weight calculation\t|\t"
199 | + "Status: STARTED");
200 | long startTime = System.currentTimeMillis();
201 | JobClient.runJob(conf);
202 |
203 | sortAndStore(dir + "temp/locality/part-00000",
204 | dir + "Weights/locality_weights");
205 |
206 | logger.info("Process: Locality weight calculation\t|\t"
207 | + "Status: COMPLETED\t|\tTotal time: " +
208 | (System.currentTimeMillis()-startTime)/60000.0+"m");
209 | }
210 |
211 | /**
212 | * Sort terms based on their locality values and calculate weights.
213 | * The locality term weight are stored in the given file.
214 | * @param inFile : file of the locality values of the terms
215 | * @param outFile : output file
216 | */
217 | private void sortAndStore(String inFile, String outFile){
218 |
219 | // load locality values
220 | EasyBufferedReader reader = new EasyBufferedReader(inFile);
221 | Map termLocalityValues = new HashMap();
222 | String line;
223 | while ((line = reader.readLine())!=null){
224 | String term = line.split("\t")[0];
225 | double locality = Double.parseDouble(line.split("\t")[1]);
226 | termLocalityValues.put(term, locality);
227 | }
228 | reader.close();
229 |
230 | // sort and store weights
231 | termLocalityValues = Utils.sortByValues(termLocalityValues);
232 | EasyBufferedWriter writer = new EasyBufferedWriter(outFile);
233 | int i = 0, totalTerms = termLocalityValues.size();
234 | for(Entry entry : termLocalityValues.entrySet()){
235 | writer.write(entry.getKey()+"\t"+(double)(totalTerms-i)/totalTerms);
236 | writer.newLine();
237 | i++;
238 | }
239 | writer.close();
240 | }
241 | }
242 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/AmbiguityBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.HashSet;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Set;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | import gr.iti.mklab.util.EasyBufferedReader;
14 | import gr.iti.mklab.util.EasyBufferedWriter;
15 | import gr.iti.mklab.util.Utils;
16 |
17 | @SuppressWarnings("unchecked")
18 | public class AmbiguityBasedSampling extends Sampling{
19 |
20 | private static Logger logger = Logger.getLogger(
21 | "gr.iti.mklab.eval.AmbiguityBasedSampling");
22 |
23 | public static Object sample(String testFile) throws Exception{
24 |
25 | logger.info("Sampling: Ambiguity-based Strategy");
26 |
27 | AmbiguityBasedSampling sampling =
28 | new AmbiguityBasedSampling();
29 |
30 | return sampling.writeInFile(sampling.loadData(testFile));
31 | }
32 |
33 | protected Object loadData(String testFile) {
34 |
35 | Map ambiguous =
36 | computeCityEntropies(loadOccurrences(testFile));
37 | logger.info(ambiguous.size() + " Towns loaded");
38 |
39 | Map images = new
40 | HashMap();
41 | double median = Utils.medianItemDouble(ambiguous);
42 |
43 | EasyBufferedReader reader =
44 | new EasyBufferedReader(testFile);
45 | String line;
46 | while((line = reader.readLine())!=null){
47 | String imageID = line.split("\t")[0];
48 | for(String place:line .split("\t")[1].split(",")){
49 | if(place.split(":").length>2
50 | && place.split(":")[2].contains("Town")){
51 | if(ambiguous.containsKey(place.split(":")[1]) &&
52 | ambiguous.get(place.split(":")[1])>median){
53 | images.put(imageID, true);
54 | }else{
55 | images.put(imageID, false);
56 | }
57 | }
58 | }
59 | }
60 | reader.close();
61 | return images;
62 | }
63 |
64 | protected Object writeInFile(Object data) {
65 |
66 | Map images =
67 | (Map) data;
68 |
69 | Map> respond = new
70 | HashMap>();
71 |
72 | respond.put(true, new HashSet());
73 | respond.put(false, new HashSet());
74 |
75 | EasyBufferedWriter writerA = new EasyBufferedWriter(
76 | "samples/ambiguous_sampling.txt");
77 | EasyBufferedWriter writerN = new EasyBufferedWriter(
78 | "samples/non_ambiguous_sampling.txt");
79 | for(Entry image:images.entrySet()){
80 | respond.get(image.getValue()).add(image.getKey());
81 | if(image.getValue()){
82 | writerA.write(image.getKey());
83 | writerA.newLine();
84 | }else{
85 | writerN.write(image.getKey());
86 | writerN.newLine();
87 | }
88 | }
89 | writerA.close();
90 | writerN.close();
91 |
92 | return respond;
93 | }
94 |
95 | private static double computeEntropyNaive(
96 | final List probabilities, int total) {
97 | double entropy = 0.0;
98 | for (Double p:probabilities) {
99 | p /= total;
100 | if(p!=0.0){
101 | entropy -= p * Math.log(p);
102 | }
103 | }
104 | return entropy;
105 | }
106 |
107 | private static Map computeCityEntropies(
108 | Map> townNames) {
109 | Map ambiguous = new HashMap();
110 |
111 | for(Entry> town:townNames.entrySet()){
112 | List p = new ArrayList();
113 | int total = 0;
114 | for(Entry code:town.getValue().entrySet()){
115 | p.add((double) code.getValue());
116 | total += code.getValue();
117 | }
118 | double entropy = computeEntropyNaive(p, total);
119 | if(entropy > 0.0)
120 | ambiguous.put(town.getKey(), entropy);
121 | }
122 |
123 | return ambiguous;
124 | }
125 |
126 | private static Map>
127 | loadOccurrences(String testFile) {
128 |
129 | Map> townNames =
130 | new HashMap>();
131 |
132 | EasyBufferedReader reader =
133 | new EasyBufferedReader(testFile);
134 | String line;
135 | while((line = reader.readLine())!=null){
136 | for(String place:line .split("\t")[1].split(",")){
137 | if(place.split(":").length>2 && place.split(":")[2].contains("Town")){
138 | String townCode = place.split(":")[0];
139 | String townName = place.split(":")[1];
140 |
141 | if(townNames.containsKey(townName)){
142 | if(townNames.get(townName).containsKey(townCode)){
143 | townNames.get(townName).put(townCode,
144 | townNames.get(townName).get(townCode) + 1);
145 | }else{
146 | townNames.get(townName).put(townCode, 1);
147 | }
148 | }else{
149 | townNames.put(townName, new HashMap());
150 | townNames.get(townName).put(townCode, 1);
151 | }
152 | }
153 | }
154 | }
155 | reader.close();
156 |
157 | return townNames;
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/BuildingSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.EasyBufferedWriter;
10 |
11 | @SuppressWarnings("unchecked")
12 | public class BuildingSampling extends Sampling{
13 |
14 | private static Logger logger = Logger.getLogger(
15 | "gr.iti.mklab.eval.BuildingSampling");
16 |
17 | public static Object sample(String testFile) throws Exception{
18 |
19 | logger.info("Sampling: Building Strategy");
20 |
21 | BuildingSampling sampling =
22 | new BuildingSampling();
23 |
24 | return sampling.writeInFile(sampling.loadData(testFile));
25 | }
26 |
27 | protected Object loadData(String testFile) {
28 |
29 | Set buildingConcepts = new HashSet();
30 |
31 | EasyBufferedReader reader =
32 | new EasyBufferedReader("samples/building_concepts.txt");
33 | String line;
34 | while((line = reader.readLine())!=null){
35 | buildingConcepts.add(line);
36 | }
37 | reader.close();
38 |
39 | Set buildingImages = new HashSet();
40 | reader = new EasyBufferedReader(testFile);
41 | while((line = reader.readLine())!=null){
42 | String imageID = line.split("\t")[0];
43 | for(String concept:line .split("\t")[1].split(",")){
44 | if(buildingConcepts.contains(concept.split(":")[0])){
45 | buildingImages.add(imageID);
46 | }
47 | }
48 | }
49 | reader.close();
50 | logger.info(buildingImages.size() + " Building Images loaded");
51 |
52 | return buildingImages;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Set buildingImages = (Set) data;
58 |
59 | EasyBufferedWriter writer = new EasyBufferedWriter(
60 | "samples/building_sampling.txt");
61 | for(String image:buildingImages){
62 | writer.write(image + "\t");
63 | writer.newLine();
64 | }
65 | writer.close();
66 |
67 | return buildingImages;
68 | }
69 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/GeographicalUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.math.BigDecimal;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.HashMap;
7 | import java.util.HashSet;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 | import java.util.Set;
12 |
13 | import org.apache.log4j.Logger;
14 |
15 | import gr.iti.mklab.util.EasyBufferedReader;
16 | import gr.iti.mklab.util.EasyBufferedWriter;
17 | import gr.iti.mklab.util.Utils;
18 |
19 | @SuppressWarnings("unchecked")
20 | public class GeographicalUniformSampling extends Sampling {
21 |
22 | private static Logger logger = Logger.getLogger(
23 | "gr.iti.mklab.eval.GeographicalUniformSampling");
24 |
25 | public static Object sample(String testFile) throws Exception{
26 |
27 | logger.info("Sampling: Geographical Uniform Strategy");
28 |
29 | GeographicalUniformSampling sampling =
30 | new GeographicalUniformSampling();
31 |
32 | return sampling.writeInFile(sampling.loadData(testFile));
33 | }
34 |
35 | protected Object loadData(String testFile) {
36 |
37 | Map> cells =
38 | new HashMap>();
39 |
40 | EasyBufferedReader reader =
41 | new EasyBufferedReader(testFile);
42 | String line;
43 | while((line = reader.readLine())!=null){
44 |
45 | BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
46 | line.split("\t")[12])).setScale(1, BigDecimal.ROUND_HALF_UP);
47 | BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
48 | line.split("\t")[13])).setScale(1, BigDecimal.ROUND_HALF_UP);
49 |
50 | String cell = tmpLatCenter + " " + tmpLonCenter;
51 | if(cells.containsKey(cell)){
52 | cells.get(cell).add(line.split("\t")[1]);
53 | }else{
54 | cells.put(cell, new HashSet());
55 | cells.get(cell).add(line.split("\t")[1]);
56 | }
57 | }
58 | reader.close();
59 | logger.info(cells.size() + " Cells loaded");
60 |
61 | return cells;
62 | }
63 |
64 | protected Object writeInFile(Object data) {
65 |
66 | Map> cells = (Map>) data;
67 |
68 | EasyBufferedWriter writer = new EasyBufferedWriter(
69 | "samples/geographical_uniform_sampling.txt");
70 |
71 | int median = Utils.medianSet(cells);
72 |
73 | Set respond = new HashSet();
74 |
75 | for(Entry> cell:cells.entrySet()){
76 | List images =
77 | new ArrayList(cell.getValue());
78 | Collections.shuffle(images);
79 |
80 | for(int i=0;i>> places =
33 | new HashMap>>();
34 |
35 | places.put("continents", new HashMap>());
36 | places.put("countries", new HashMap>());
37 |
38 | EasyBufferedReader reader =
39 | new EasyBufferedReader(testFile);
40 | String line;
41 | while((line = reader.readLine())!=null){
42 | String imageID = line.split("\t")[0];
43 | for(String place:line .split("\t")[1].split(",")){
44 | if(place.split(":").length>2 && place.contains("Timezone")){
45 | String continent = place.split(":")[1].split("%")[0];
46 |
47 | switch(continent) {
48 | case "Pacific" :
49 | continent = "America";
50 | break;
51 | case "Atlantic" :
52 | continent = "America";
53 | break;
54 | case "Indian" :
55 | continent = "Asia";
56 | break;
57 | }
58 | if(places.get("continents").containsKey(continent)){
59 | places.get("continents").get(continent).add(imageID);
60 | }else{
61 | places.get("continents").put(continent, new HashSet());
62 | places.get("continents").get(continent).add(imageID);
63 | }
64 | }
65 |
66 | if(place.split(":").length>2 && place.contains("Country")){
67 | String country = place.split(":")[1].split("%")[0];
68 | if(places.get("countries").containsKey(country)){
69 | places.get("countries").get(country).add(imageID);
70 | }else{
71 | places.get("countries").put(country, new HashSet());
72 | places.get("countries").get(country).add(imageID);
73 | }
74 | }
75 | }
76 | }
77 | reader.close();
78 | logger.info(places.get("continents").size() + " Continents loaded");
79 | logger.info(places.get("countries").size() + " Countries loaded");
80 |
81 | return places;
82 | }
83 |
84 | protected Object writeInFile(Object data) {
85 |
86 | Map>> places =
87 | (Map>>) data;
88 |
89 | EasyBufferedWriter writer = new EasyBufferedWriter(
90 | "samples/geographically_focused_sampling_continents.txt");
91 | for(Entry> continent:places.get("continents").entrySet()){
92 | writer.write(continent.getKey() + "\t");
93 | for(String images:continent.getValue()){
94 | writer.write(images + " ");
95 | }
96 | writer.newLine();
97 | }
98 | writer.close();
99 |
100 | writer = new EasyBufferedWriter(
101 | "samples/geographically_focused_sampling_countries.txt");
102 |
103 | for(Entry> country:places.get("countries").entrySet()){
104 | writer.write(country.getKey() + "\t");
105 | for(String images:country.getValue()){
106 | writer.write(images + " ");
107 | }
108 | writer.newLine();
109 | }
110 | writer.close();
111 |
112 | return places;
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/Sampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
/**
 * Base class of the sampling strategies. Implementations parse the test file
 * into a strategy-specific structure and write the selected image IDs to a
 * file under samples/ (see the sibling *Sampling classes in this package).
 */
public abstract class Sampling {

	// Parses the test file and returns the strategy-specific data structure.
	protected abstract Object loadData(String testFile);

	// Persists the sampling result built from data and returns it.
	protected abstract Object writeInFile(Object data);
}
9 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextBasedSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 | import java.util.Set;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 | import gr.iti.mklab.util.Utils;
14 |
15 | @SuppressWarnings("unchecked")
16 | public class TextBasedSampling extends Sampling {
17 |
18 | private static Logger logger = Logger.getLogger(
19 | "gr.iti.mklab.eval.TextBasedSampling");
20 |
21 | public static Object sample(String testFile) throws Exception{
22 |
23 | logger.info("Sampling: Text-based Strategy");
24 |
25 | TextBasedSampling sampling = new TextBasedSampling();
26 |
27 | return sampling.writeInFile(sampling.loadData(testFile));
28 | }
29 |
30 | protected Object loadData(String testFile) {
31 |
32 | Map images =
33 | new HashMap();
34 |
35 | EasyBufferedReader reader =
36 | new EasyBufferedReader(testFile);
37 | String line;
38 | while((line = reader.readLine())!=null){
39 | int tags = (!line.split("\t")[10].isEmpty()
40 | ?line.split("\t")[10].split(",").length:0);
41 | int title = (!line.split("\t")[8].isEmpty()
42 | ?line.split("\t")[8].split("\\+").length:0);
43 |
44 | images.put(line.split("\t")[1], tags+title);
45 | }
46 | reader.close();
47 | logger.info(images.size() + " Images loaded");
48 |
49 | return images;
50 | }
51 |
52 | protected Object writeInFile(Object data) {
53 |
54 | Map images =
55 | (Map) data;
56 |
57 | EasyBufferedWriter writer = new EasyBufferedWriter(
58 | "samples/text_based_sampling.txt");
59 |
60 | Set respond = new HashSet();
61 |
62 | int median = Utils.medianItemInt(images);
63 |
64 | for(Entry image:images.entrySet()){
65 | if(image.getValue() >= median){
66 | respond.add(image.getKey());
67 | writer.write(image.getKey());
68 | writer.newLine();
69 | }
70 | }
71 | writer.close();
72 |
73 | return respond;
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/TextDiversitySampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 | import java.util.stream.Collectors;
12 | import java.util.stream.IntStream;
13 |
14 | import org.apache.log4j.Logger;
15 |
16 | import gr.iti.mklab.util.EasyBufferedReader;
17 | import gr.iti.mklab.util.EasyBufferedWriter;
18 | import info.debatty.java.lsh.MinHash;
19 |
20 | @SuppressWarnings("unchecked")
21 | public class TextDiversitySampling extends Sampling {
22 |
23 | private static Logger logger = Logger.getLogger(
24 | "gr.iti.mklab.eval.TextDiversitySampling");
25 |
26 | public static Object sample(String testFile) throws Exception{
27 |
28 | logger.info("Sampling: Text Diversity Strategy");
29 |
30 | TextDiversitySampling sampling = new TextDiversitySampling();
31 |
32 | return sampling.writeInFile(sampling.loadData(testFile));
33 | }
34 |
35 | protected Object loadData(String testFile) {
36 |
37 | Map, List> buckets =
38 | new HashMap, List>();
39 | Map tags =
40 | new HashMap();
41 | int n = 510914;
42 | MinHash mh = new MinHash(0.1, n);
43 |
44 | EasyBufferedReader reader =
45 | new EasyBufferedReader(testFile);
46 | String line;
47 | while((line = reader.readLine())!=null){
48 | String imageID = line.split("\t")[1];
49 | String imageTags = line.split("\t")[10];
50 | boolean[] vector = new boolean[n];
51 |
52 | for(String tag:imageTags.split(",")){
53 | if(!tags.containsKey(tag)){
54 | tags.put(tag, tags.size());
55 | }
56 | vector[tags.get(tag)] = true;
57 | }
58 |
59 | List hash = IntStream.of((mh.signature(vector)
60 | )).boxed().collect(Collectors.toList());
61 | if(buckets.containsKey(hash)){
62 | buckets.get(hash).add(imageID);
63 | }else{
64 | buckets.put(hash, new ArrayList());
65 | buckets.get(hash).add(imageID);
66 | }
67 | }
68 | reader.close();
69 | logger.info(buckets.size() + " Buckets created");
70 |
71 | return buckets;
72 | }
73 |
74 | protected Object writeInFile(Object data) {
75 |
76 | Map, List> buckets =
77 | (Map, List>) data;
78 |
79 | Set respond = new HashSet();
80 |
81 | EasyBufferedWriter writer = new EasyBufferedWriter(
82 | "samples/text_diversity_sampling.txt");
83 |
84 | for(Entry, List> bucket
85 | :buckets.entrySet()){
86 | List images = bucket.getValue();
87 | Collections.shuffle(images);
88 |
89 | respond.add(images.get(0));
90 | writer.write(images.get(0));
91 | writer.newLine();
92 | }
93 | writer.close();
94 |
95 | return respond;
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/UserUniformSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 | import java.util.Set;
11 |
12 | import org.apache.log4j.Logger;
13 |
14 | import gr.iti.mklab.util.EasyBufferedReader;
15 | import gr.iti.mklab.util.EasyBufferedWriter;
16 |
17 | @SuppressWarnings("unchecked")
18 | public class UserUniformSampling extends Sampling {
19 |
20 | private static Logger logger = Logger.getLogger(
21 | "gr.iti.mklab.eval.UserUniformSampling");
22 |
23 | public static Object sample(String testFile) throws Exception{
24 |
25 | logger.info("Sampling: User Uniform Strategy");
26 |
27 | UserUniformSampling sampling = new UserUniformSampling();
28 |
29 | return sampling.writeInFile(sampling.loadData(testFile));
30 | }
31 |
32 | protected Object loadData(String testFile) {
33 |
34 | Map> users =
35 | new HashMap>();
36 |
37 | EasyBufferedReader reader =
38 | new EasyBufferedReader(testFile);
39 | String line;
40 | while((line = reader.readLine())!=null){
41 | String user = line.split("\t")[3];
42 | if(users.containsKey(user)){
43 | users.get(user).add(line.split("\t")[1]);
44 | }else{
45 | users.put(user, new HashSet());
46 | users.get(user).add(line.split("\t")[1]);
47 | }
48 | }
49 | reader.close();
50 | logger.info(users.size() + " Users loaded");
51 |
52 | return users;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Map> users =
58 | (Map>) data;
59 |
60 | Set respond = new HashSet();
61 |
62 | EasyBufferedWriter writer = new EasyBufferedWriter(
63 | "samples/user_uniform_sampling.txt");
64 |
65 | for(Entry> user:users.entrySet()){
66 | List images =
67 | new ArrayList(user.getValue());
68 | Collections.shuffle(images);
69 |
70 | respond.add(images.get(0));
71 | writer.write(images.get(0));
72 | writer.newLine();
73 | }
74 | writer.close();
75 |
76 | return respond;
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/mmcomms16/VisualSampling.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.mmcomms16;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.Map;
6 | import java.util.Set;
7 | import java.util.Map.Entry;
8 |
9 | import org.apache.log4j.Logger;
10 |
11 | import gr.iti.mklab.util.EasyBufferedReader;
12 | import gr.iti.mklab.util.EasyBufferedWriter;
13 |
14 | @SuppressWarnings("unchecked")
15 | public class VisualSampling extends Sampling{
16 |
17 | private static Logger logger = Logger.getLogger(
18 | "gr.iti.mklab.eval.VisualSampling");
19 |
20 | public static Object sample(String testFile) throws Exception{
21 |
22 | logger.info("Sampling: Visual Strategy");
23 |
24 | VisualSampling sampling =
25 | new VisualSampling();
26 |
27 | return sampling.writeInFile(sampling.loadData(testFile));
28 | }
29 |
30 | protected Object loadData(String testFile) {
31 |
32 | Map> concepts =
33 | new HashMap>();
34 |
35 | EasyBufferedReader reader =
36 | new EasyBufferedReader(testFile);
37 | String line;
38 | while((line = reader.readLine())!=null){
39 | String imageID = line.split("\t")[0];
40 | for(String concept:line .split("\t")[1].split(",")){
41 | if(concepts.containsKey(concept.split(":")[0])){
42 | concepts.get(concept.split(":")[0]).add(imageID);
43 | }else{
44 | concepts.put(concept.split(":")[0], new HashSet());
45 | concepts.get(concept.split(":")[0]).add(imageID);
46 | }
47 | }
48 | }
49 | reader.close();
50 | logger.info(concepts.size() + " Concepts loaded");
51 |
52 | return concepts;
53 | }
54 |
55 | protected Object writeInFile(Object data) {
56 |
57 | Map> concepts =
58 | (Map>) data;
59 |
60 | EasyBufferedWriter writer = new EasyBufferedWriter(
61 | "samples/visual_sampling.txt");
62 | for(Entry> concept:concepts.entrySet()){
63 | writer.write(concept.getKey() + "\t");
64 | for(String images:concept.getValue()){
65 | writer.write(images + " ");
66 | }
67 | writer.newLine();
68 | }
69 | writer.close();
70 |
71 | return concepts;
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/CenterOfGravity.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.util.Map;
4 | import java.util.Map.Entry;
5 |
6 | /**
7 | * Abstract class that execute the calculation of the center-of-gravity of the most similar images
8 | * @author gkordo
9 | *
10 | */
/**
 * Abstract class that executes the calculation of the center-of-gravity of
 * the most similar images.
 * @author gkordo
 *
 */
public abstract class CenterOfGravity {

	// similarity-weight exponent; NOTE(review): static but assigned from the
	// constructor, so the last constructed instance wins — confirm intended
	protected static int a;

	// Constructor initializes the weight exponent
	public CenterOfGravity(int a){
		CenterOfGravity.a = a;
	}

	/**
	 * Calculation of the center-of-gravity of the k most similar images.
	 * Keys are "lon_lat" strings and values are similarity scores; each
	 * location's unit vector on the sphere is weighted by sim^a and averaged,
	 * then the mean vector is converted back to geographic coordinates.
	 * @param mapSim : the map with the k most similar images and their similarity values
	 * @return {latitude, longitude} of the estimated location, or
	 *         {null, null} when the map is empty (as in the original)
	 */
	protected static Double[] computeCoordination(Map<String, Double> mapSim){

		double[] loc = new double[3];
		Double[] c = new Double[2];
		int k = mapSim.size();

		for (Entry<String, Double> entry : mapSim.entrySet()){

			double sim = entry.getValue();
			double lat = Double.parseDouble(entry.getKey().split("_")[1]);
			double lon = Double.parseDouble(entry.getKey().split("_")[0]);

			// accumulate the similarity-weighted unit vector (x, y, z)
			loc[0] += Math.pow(sim, a)
					* Math.cos(lat * (Math.PI / 180D))
					* Math.cos(lon * (Math.PI / 180D)) / k;

			loc[1] += Math.pow(sim, a)
					* Math.cos(lat * (Math.PI / 180D))
					* Math.sin(lon * (Math.PI / 180D)) / k;

			loc[2] += Math.pow(sim, a)
					* Math.sin(lat * (Math.PI / 180D)) / k;
		}

		// convert the mean vector back to lat/lon once, after the loop
		// (the original recomputed this on every iteration)
		if (k > 0) {
			c[0] = Math.atan2(loc[2], Math.sqrt(Math.pow(loc[0], 2)
					+ Math.pow(loc[1], 2))) * (180D / Math.PI);
			c[1] = Math.atan2(loc[1], loc[0]) * (180D / Math.PI);
		}
		return c;
	}
}
56 |
57 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/DataManager.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.util.HashSet;
4 | import java.util.Set;
5 |
6 | import org.apache.log4j.Logger;
7 |
8 | import gr.iti.mklab.util.EasyBufferedReader;
9 | import gr.iti.mklab.util.TextUtil;
10 |
11 | /**
12 | * Data manager
13 | * @author gkordo
14 | *
15 | */
16 | public class DataManager {
17 |
18 | static Logger logger = Logger.getLogger("gr.iti.mklab.tools.DataManager");
19 |
20 | // return a set contain the image IDs of the provided dataset
21 | public static Set getSetOfImageIDs(String file){
22 |
23 | Set usersIncludedInFile = new HashSet();
24 |
25 | EasyBufferedReader reader = new EasyBufferedReader(file);
26 |
27 | String input;
28 |
29 | logger.info("images contained in file " + file);
30 | while ((input= reader.readLine())!=null){
31 | usersIncludedInFile.add(input.split("\t")[1]);
32 | }
33 | logger.info(usersIncludedInFile.size()+" total images included in file");
34 | reader.close();
35 |
36 | return usersIncludedInFile;
37 | }
38 |
39 | // return a set contain the individual tags of the provided dataset
40 | public static Set getSetOfTerms(String file){
41 |
42 | EasyBufferedReader reader = new EasyBufferedReader(file);
43 | Set termsIncludedInFile = new HashSet();
44 |
45 | String line;
46 |
47 | logger.info("deterim the diffrent tags contained in file " + file);
48 | while ((line= reader.readLine())!=null){
49 |
50 | Set terms = new HashSet();
51 | TextUtil.parse(line.split("\t")[10], terms);
52 | TextUtil.parse(line.split("\t")[8], terms);
53 |
54 | termsIncludedInFile.addAll(terms);
55 |
56 | }
57 | logger.info(termsIncludedInFile.size()+" total tags included in file");
58 | reader.close();
59 |
60 | return termsIncludedInFile;
61 | }
62 |
63 | // return a set contain the different users in the provided dataset
64 | public static Set getSetOfUserID (String file){
65 |
66 | Set usersIncludedInFile = new HashSet();
67 |
68 | EasyBufferedReader reader = new EasyBufferedReader(file);
69 |
70 | String input;
71 |
72 | logger.info("deterim the diffrent users contained in file " + file);
73 | while ((input= reader.readLine())!=null){
74 | usersIncludedInFile.add(input.split("\t")[3]);
75 | }
76 | logger.info(usersIncludedInFile.size()+" total users included in file");
77 | reader.close();
78 |
79 | return usersIncludedInFile;
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/InterfaceTermCellProb.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import java.io.IOException;
4 |
5 | /**
6 | * Interface of tag-cell probability calculator
7 | * @author gkordo
8 | *
9 | */
/**
 * Interface of tag-cell probability calculators.
 * @author gkordo
 *
 */
public interface InterfaceTermCellProb {

	/**
	 * Function where the tag-cell probabilities are calculated and stored in a defined file.
	 * @param dir : directory of the project
	 * @param trainFile : file that contains the train set
	 * @param outFile : output file
	 * @param scale : grid scale (used elsewhere in this project as the
	 *                BigDecimal scale when rounding coordinates to cells)
	 * @throws IOException : file not found
	 */
	public void calculatorTermCellProb(String dir, String trainFile,
			String outFile, int scale) throws IOException;
}
23 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/tools/SimilarityCalculator.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.tools;
2 |
3 | import gr.iti.mklab.data.ImageMetadata;
4 | import gr.iti.mklab.util.EasyBufferedReader;
5 | import gr.iti.mklab.util.Utils;
6 | import gr.iti.mklab.util.TextUtil;
7 |
8 | import java.io.File;
9 | import java.io.IOException;
10 | import java.math.BigDecimal;
11 | import java.util.*;
12 | import java.util.Map.Entry;
13 |
14 | import org.apache.commons.io.FileUtils;
15 | import org.apache.hadoop.fs.Path;
16 | import org.apache.hadoop.io.*;
17 | import org.apache.hadoop.mapred.*;
18 | import org.apache.log4j.Logger;
19 |
20 | /**
21 | * For a query image, the similarity between the images contained in the train set is calculated based on their corresponding term sets.
22 | * Class that implements similarity search based on Map-Reduce scheme.
23 | * @author gkordo
24 | *
25 | */
26 | public class SimilarityCalculator{
27 |
28 | private static Set testIDs;
29 | private static Setusers;
30 | private static Logger logger = Logger.getLogger("gr.iti.mklab.methods.SimilaritySearch");
31 | static java.util.Map> predictedCellsOfTestImages = new HashMap>();
32 |
33 | /**
34 | * Contractor of the class.
35 | * @param testFile : file that contains the test image's metadata
36 | * @param resultFile : file that contains the MLC of every query image
37 | */
38 | public SimilarityCalculator(String testFile, String resultFile){
39 | loadTestImages(testFile,resultFile);
40 | }
41 |
42 |
43 | /**
44 | * Map class that takes the lines of the train file as input and creates key-value pairs,
45 | * using as keys the image IDs of the test set images and as values strings that contain
46 | * the location of the train images and the calculated similarity.
47 | * @author gkordo
48 | *
49 | */
50 | public static class MapSimilaritySearch extends MapReduceBase implements Mapper {
51 |
52 | /**
53 | * Required map function
54 | * @param key : key value
55 | * @param value : input string
56 | * @param output : output collector
57 | * @param reporter : reporter of the job
58 | */
59 | public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
60 |
61 | String[] metadata = value.toString().split("\t");
62 |
63 | if (!testIDs.contains(metadata[1]) && !users.contains(metadata[3]) // train image and its user are not contained in the test set
64 | && !metadata[12].isEmpty() && !metadata[13].isEmpty() // train image contains coordinations
65 | && (!metadata[10].isEmpty() || !metadata[8].isEmpty())){ // train image contains any textual information
66 |
67 | // get image cell based on its latitude-longitude pair
68 | BigDecimal tmpLonCenter = new BigDecimal(Double.parseDouble(
69 | metadata[12])).setScale(2, BigDecimal.ROUND_HALF_UP);
70 | BigDecimal tmpLatCenter = new BigDecimal(Double.parseDouble(
71 | metadata[13])).setScale(2, BigDecimal.ROUND_HALF_UP);
72 |
73 | Set trainImageTerms = new HashSet();
74 | TextUtil.parse(metadata[10], trainImageTerms);
75 | TextUtil.parse(metadata[8], trainImageTerms);
76 |
77 | // there is at least estimated location laying inside the borders of cell
78 | if(predictedCellsOfTestImages.containsKey(tmpLonCenter+"_"+tmpLatCenter)
79 | && trainImageTerms.size() > 1){
80 |
81 | // calculate similarity between the train image and all images that lay inside the boarded of the specific cell
82 | for(ImageMetadata entry : predictedCellsOfTestImages
83 | .get(tmpLonCenter+"_"+tmpLatCenter)){
84 |
85 | // determine the common terms
86 | List common = new ArrayList(trainImageTerms);
87 | common.retainAll(entry.getTags());
88 |
89 | // calculate similarity
90 | double sjacc = (double) common.size() / (entry.getTags().size()
91 | + trainImageTerms.size() - common.size());
92 | if(sjacc>0.05){
93 | output.collect(new Text(entry.getId()), new Text(String.valueOf(sjacc) +
94 | ">" + metadata[12] + "_"+metadata[13]));
95 | }
96 | }
97 | }
98 | }
99 | }
100 | }
101 |
102 | /**
103 | * Reduce class that get the key-value pairs and sort the similarities for a test image.
104 | * @author gkordo
105 | *
106 | */
107 | public static class ReduceSimilaritySearch extends MapReduceBase implements Reducer {
108 |
109 | /**
110 | * Required reduce function
111 | * @param key : key value
112 | * @param values : set of values that share the same key
113 | * @param output : output collector
114 | * @param reporter : reporter of the job
115 | */
116 | public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
117 |
118 | java.util.Map simImages = new HashMap();
119 |
120 | // load values in a topic similarity map
121 | while (values.hasNext()) {
122 | String entry = values.next().toString();
123 | simImages.put(entry.split(">")[1],Double.parseDouble(entry.split(">")[0]));
124 | }
125 |
126 | // sort similarity map
127 | simImages = Utils.sortByValues(simImages);
128 |
129 | // write in output file
130 | output.collect(key, new Text(convertSimMapToStr(simImages)));
131 | }
132 |
133 | /**
134 | * Function that converts similarity map to output string
135 | * @param simImages : similarity map
136 | * @return a string that contains similarity and location of the train images
137 | */
138 | public String convertSimMapToStr(java.util.Map simImages){
139 | String out = "";
140 |
141 | for(Entry entry : simImages.entrySet()){
142 | out += entry.getKey() + ">" + entry.getValue() + " ";
143 | }
144 |
145 | return out.trim();
146 | }
147 | }
148 |
149 | /**
150 | * Core function for the job of similarity search.
151 | * @param dir : directory of the project
152 | * @param trainFolder : the file of the train set
153 | * @param outFolder : the folder where the tag-set probabilities file will be stored
154 | * @throws Exception : file not found
155 | */
156 | public void performSimilarityCalculation(String dir, String trainFolder, String outFolder) throws Exception {
157 |
158 | logger.info("Process: Similarity Calculation\t|\t"
159 | + "Status: INITIALIZE");
160 | JobConf conf = new JobConf(SimilarityCalculator.class);
161 | conf.setJobName("similaritysearch");
162 |
163 | conf.setOutputKeyClass(Text.class);
164 | conf.setOutputValueClass(Text.class);
165 |
166 | conf.setMapperClass(MapSimilaritySearch.class);
167 |
168 | conf.setReducerClass(ReduceSimilaritySearch.class);
169 |
170 | conf.setInputFormat(TextInputFormat.class);
171 | conf.setOutputFormat(TextOutputFormat.class);
172 |
173 | // clean the output file directory
174 | File file = new File(dir + outFolder);
175 | if (file.exists()) {
176 | FileUtils.cleanDirectory(file);
177 | FileUtils.forceDelete(file);
178 | }
179 |
180 | FileInputFormat.setInputPaths(conf, new Path(dir + trainFolder));
181 | FileOutputFormat.setOutputPath(conf, new Path(dir + outFolder));
182 |
183 | logger.info("Process: Similarity Calculation\t|\t"
184 | + "Status: STARTED");
185 | long startTime = System.currentTimeMillis();
186 | JobClient.runJob(conf);
187 | logger.info("Process: Similarity Calculation\t|\t"
188 | + "Status: COMPLETED\t|\tTotal time: " +
189 | (System.currentTimeMillis()-startTime)/60000.0+"m");
190 |
191 | new File(dir + outFolder + "/part-00000").renameTo(
192 | new File(dir + outFolder + "/image_similarities")); // rename the output file
193 | }
194 |
195 | /**
196 | * Load test images in a map based on their MLCs. Also update the set of test image IDs and test user IDs.
197 | * @param testFile
198 | * @param resultFile
199 | */
200 | private void loadTestImages(String testFile, String resultFile){
201 |
202 | EasyBufferedReader readerTest = new EasyBufferedReader(testFile);
203 | EasyBufferedReader readerResult = new EasyBufferedReader(resultFile);
204 | String lineT,lineR;
205 |
206 | while ((lineT = readerTest.readLine())!=null && (lineR = readerResult.readLine())!=null){
207 |
208 | if(!lineR.split("\t")[1].equals("N/A")){
209 | // create an object based on test image metadata
210 | Set terms = new HashSet();
211 | TextUtil.parse(lineR.split("\t")[10], terms);
212 | TextUtil.parse(lineR.split("\t")[8], terms);
213 | ImageMetadata image = new ImageMetadata(lineT.split("\t")[1], lineT.split("\t")[3], terms);
214 |
215 | // update respective sets
216 | testIDs.add(lineT.split("\t")[0]);
217 | users.add(lineT.split("\t")[2]);
218 |
219 | // load image object to the corresponding cell of the map
220 | if(predictedCellsOfTestImages.containsKey(lineR.split("\t")[1].split(":")[0])){
221 | predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
222 | }else{
223 | predictedCellsOfTestImages.put(lineR.split("\t")[1].split(":")[0],
224 | new ArrayList());
225 | predictedCellsOfTestImages.get(lineR.split("\t")[1].split(":")[0]).add(image);
226 | }
227 | }
228 | }
229 |
230 | logger.info(users.size()+" different users appeared in " + testIDs.size() + " images");
231 | readerTest.close();
232 | readerResult.close();
233 | }
234 | }
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedReader.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.io.Reader;
9 | import java.io.UnsupportedEncodingException;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | public class EasyBufferedReader extends BufferedReader {
14 |
15 | protected Logger logger;
16 |
17 |
18 | static final Reader createReader(String textFile, Logger logger){
19 | try {
20 | return new InputStreamReader(new FileInputStream(textFile), "UTF-8");
21 | } catch (UnsupportedEncodingException e) {
22 | logger.error(e.getMessage());
23 | } catch (FileNotFoundException e) {
24 | logger.error(e.getMessage());
25 | }
26 | return null;
27 | }
28 |
29 | public EasyBufferedReader(String textFile) {
30 | super(createReader(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedReader")));
31 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedReader");
32 | logger.debug("opened " + textFile);
33 | }
34 |
35 | @Override
36 | public void close() {
37 | try {
38 | super.close();
39 | } catch (IOException e) {
40 | logger.error(e.getMessage());
41 | }
42 | }
43 |
44 | @Override
45 | public String readLine() {
46 | try {
47 | return super.readLine();
48 | } catch (IOException e) {
49 | logger.error(e.getMessage());
50 | }
51 | return null;
52 | }
53 |
54 |
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/EasyBufferedWriter.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.BufferedWriter;
4 | import java.io.FileNotFoundException;
5 | import java.io.FileOutputStream;
6 | import java.io.IOException;
7 | import java.io.OutputStreamWriter;
8 | import java.io.UnsupportedEncodingException;
9 | import java.io.Writer;
10 |
11 | import org.apache.log4j.Logger;
12 |
13 | public class EasyBufferedWriter extends BufferedWriter {
14 |
15 | protected Logger logger;
16 |
17 |
18 | static final Writer createWriter(String textFile, Logger logger, boolean end){
19 | try {
20 | return new OutputStreamWriter(new FileOutputStream(textFile,end), "UTF-8");
21 | } catch (UnsupportedEncodingException e) {
22 | logger.error(e.getMessage());
23 | } catch (FileNotFoundException e) {
24 | logger.error(e.getMessage());
25 | }
26 | return null;
27 | }
28 |
29 | public EasyBufferedWriter(String textFile) {
30 | super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),false));
31 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
32 | logger.debug("opened " + textFile);
33 | }
34 |
35 | public EasyBufferedWriter(String textFile, boolean end) {
36 | super(createWriter(textFile, Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter"),end));
37 | this.logger = Logger.getLogger("eu.socialsensor.util.EasyBufferedWriter");
38 | logger.debug("opened " + textFile);
39 | }
40 |
41 | @Override
42 | public void close() {
43 | try {
44 | super.close();
45 | } catch (IOException e) {
46 | logger.error(e.getMessage());
47 | }
48 | }
49 |
50 | @Override
51 | public void write(String s) {
52 | try {
53 | super.write(s);
54 | } catch (IOException e){
55 | logger.error(e.getMessage());
56 | }
57 | }
58 |
59 | @Override
60 | public void newLine() {
61 | try {
62 | super.newLine();
63 | } catch (IOException e){
64 | logger.error(e.getMessage());
65 | }
66 | }
67 |
68 |
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Progress.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import org.apache.log4j.Logger;
4 |
5 | public class Progress {
6 |
7 | private long gStartTime, lastTime;
8 | private int div, scaleTime;
9 | private String mesPerCent, mesTime, messege;
10 | private int sec;
11 | private Logger logger;
12 |
13 | public Progress(long gStartTime, int limitCountLines, int scalePerCent, int scaleTime, String messege, Logger logger){
14 | this.gStartTime = gStartTime;
15 |
16 | this.mesPerCent = "%";
17 | if(scalePerCent==10){this.mesPerCent = "0" + this.mesPerCent;}
18 |
19 | this.scaleTime = scaleTime;
20 | this.mesTime = "m";
21 | if(scaleTime==1){this.mesTime = "s";}
22 |
23 | this.div = limitCountLines/scalePerCent;
24 | this.messege = messege;
25 |
26 | this.logger = logger;
27 | }
28 |
29 | public Progress(long gStartTime, int sec, int scaleTime, String messege, Logger logger){
30 | this.sec = sec;
31 | this.gStartTime = gStartTime;
32 |
33 | this.scaleTime = scaleTime;
34 |
35 | this.mesTime = "min";
36 | this.messege = messege;
37 | if(scaleTime==1){this.mesTime = "s";}
38 |
39 | this.logger = logger;
40 | }
41 |
42 | public void showMessege(long stopTime){
43 | if(stopTime-lastTime>sec*1000){
44 | logger.info(messege+" > "+ (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
45 | lastTime=stopTime;
46 | }
47 | }
48 |
49 | public void showProgress(int count, long stopTime){
50 | if(count%div==0){
51 | logger.info(messege+" > "+count/div+ mesPerCent + " > " + (stopTime-gStartTime)/(scaleTime*1000) + mesTime);
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/TextUtil.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.io.UnsupportedEncodingException;
4 | import java.net.URLDecoder;
5 | import java.text.Normalizer;
6 | import java.util.Set;
7 | import java.util.regex.Pattern;
8 |
9 |
/**
 * Text normalization helpers: accent stripping and splitting of raw
 * title/tag strings into a set of clean lowercase terms.
 */
public class TextUtil {

	// compiled once instead of on every deAccent() call
	private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

	/**
	 * Removes diacritical marks from a string (e.g. "café" -> "cafe").
	 * @param str : input string
	 * @return the string with combining diacritical marks stripped
	 */
	public static String deAccent(String str) {
		String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
		return DIACRITICS.matcher(nfdNormalizedString).replaceAll("");
	}

	/**
	 * Parses a raw, URL-encoded text field into lowercase terms and adds them
	 * to the given set: every comma-separated phrase is added whole (trimmed)
	 * plus each of its individual words. Punctuation (except commas) and
	 * digits are removed first. Undecodable input is silently skipped.
	 * @param text : raw text (may be null or empty; both are ignored)
	 * @param terms : set that receives the extracted terms (also returned)
	 * @return the same {@code terms} set, for chaining
	 */
	public static Set<String> parse(String text, Set<String> terms) {

		// was `(text != null) || (text != "")`: `||` plus reference comparison let
		// null through, making URLDecoder.decode throw an uncaught NPE
		if (text != null && !text.isEmpty()){
			try{
				text = URLDecoder.decode(text, "UTF-8");
				text = deAccent(text);

				text = text.trim(); // removes redundant white spaces
				text = text.replaceAll("[\\p{Punct}&&[^\\,]]", "");
				text = text.replaceAll("[0-9]+", "");

				text = text.toLowerCase();
				text = text.replaceAll("\\s{2,}", " ");
				text = text.replaceAll("\\,{2,}", ",");
				text = text.trim();

				for(String term : text.split(",")){
					if(!term.replaceAll(" ", "").matches("[0-9]+") && !term.isEmpty()){
						terms.add(term.trim());
						for(String interm : term.split(" ")){
							// the isEmpty() check keeps "" (from leading spaces) out of the set
							if(!interm.isEmpty() && !interm.matches("[0-9]+")){
								terms.add(interm);
							}
						}
					}
				}
			}catch(UnsupportedEncodingException exception){
			}catch(IllegalArgumentException exception){} // malformed %-escape in the input
		}
		return terms;
	}
}
50 |
--------------------------------------------------------------------------------
/src/main/java/gr/iti/mklab/util/Utils.java:
--------------------------------------------------------------------------------
1 | package gr.iti.mklab.util;
2 |
3 | import java.util.Arrays;
4 | import java.util.List;
5 | import java.util.Map;
6 | import java.util.Map.Entry;
7 |
8 | import gr.iti.mklab.data.GeoCell;
9 |
10 | import java.util.Set;
11 | import java.util.Collections;
12 | import java.util.Comparator;
13 | import java.util.HashMap;
14 | import java.util.Iterator;
15 | import java.util.LinkedHashMap;
16 | import java.util.LinkedList;
17 |
18 | public class Utils {
19 |
20 | public static Map sortByValues(Map map){
21 | List> entries = new LinkedList>(map.entrySet());
22 |
23 | Collections.sort(entries, Collections.reverseOrder(new Comparator>() {
24 |
25 | public int compare(Entry o1, Entry o2) {
26 | return o1.getValue().compareTo(o2.getValue());
27 | }
28 | }));
29 | //LinkedHashMap will keep the keys in the order they are inserted
30 | //which is currently sorted on natural ordering
31 | Map sortedMap = new LinkedHashMap();
32 |
33 | for(Map.Entry entry: entries){
34 | sortedMap.put(entry.getKey(), entry.getValue());
35 | }
36 |
37 | return sortedMap;
38 | }
39 |
40 | public static Map sortByValuesTable(Map map){
41 | List> entries = new LinkedList>(map.entrySet());
42 |
43 | Collections.sort(entries, Collections.reverseOrder(new Comparator>() {
44 | public int compare(Entry o1, Entry o2) {
45 | return o1.getValue()[0].compareTo(o2.getValue()[0]);
46 | }
47 | }));
48 | //LinkedHashMap will keep the keys in the order they are inserted
49 | //which is currently sorted on natural ordering
50 | Map sortedMap = new LinkedHashMap();
51 |
52 | for(Map.Entry entry: entries){
53 | sortedMap.put(entry.getKey(), entry.getValue());
54 | }
55 |
56 | return sortedMap;
57 | }
58 |
59 | public static Map sortByMLCValues(Map unsortMap) {
60 |
61 | // Convert Map to List
62 | List> list =
63 | new LinkedList>(unsortMap.entrySet());
64 |
65 | // Sort list with comparator, to compare the Map values
66 | Collections.sort(list, new Comparator>() {
67 | public int compare(Map.Entry o1,
68 | Map.Entry o2) {
69 | return -(o1.getValue()).getTotalProb().compareTo(o2.getValue().getTotalProb());
70 | }
71 | });
72 |
73 | // Convert sorted map back to a Map
74 | Map sortedMap = new LinkedHashMap();
75 | for (Iterator> it = list.iterator(); it.hasNext();) {
76 | Map.Entry entry = it.next();
77 | sortedMap.put(entry.getKey(), entry.getValue());
78 | }
79 | return sortedMap;
80 | }
81 |
82 | public static HashMap getFirstEntryOfSortedMap(Map map){
83 | HashMap firstEntry = new HashMap();
84 |
85 | for ( Entry entry : map.entrySet()){
86 | firstEntry.put(entry.getKey(), entry.getValue());
87 | break;
88 | }
89 |
90 | return firstEntry;
91 | }
92 |
93 | public static HashMap invertKeysValues(Map map){
94 |
95 | HashMap invertedHashMap = new HashMap();
96 |
97 | for(Entry