├── .gitignore
├── README.md
├── docs
    ├── implementation.md
    ├── known-issues.md
    └── paper
    │   └── paper.pdf
├── pom.xml
├── sampleData
    └── data
├── scripts
    ├── database
    │   ├── __init__.py
    │   └── mysql.py
    ├── schema
    │   └── spatium.sql
    ├── scrapper
    │   ├── __init__.py
    │   └── socrata.py
    └── utils
    │   ├── __init__.py
    │   └── configParser.py
└── src
    └── main
        ├── resources
            ├── log4j.properties
            └── reference.conf.sample
        └── scala
            └── com
                └── github
                    └── locis
                        ├── apps
                            ├── CountInstance.scala
                            ├── DataLoader.scala
                            ├── MapReduceJob.scala
                            ├── NeighborGrouping.scala
                            ├── NeighborSearch.scala
                            ├── PatternSearch.scala
                            └── package-info.java
                        ├── map
                            ├── CountInstanceMapper.scala
                            ├── NeighborGroupingMapper.scala
                            ├── NeighborSearchMapper.scala
                            ├── PatternSearchMapper.scala
                            └── package-info.java
                        ├── reduce
                            ├── CountInstanceReducer.scala
                            ├── NeighborGroupingReducer.scala
                            ├── NeighborSearchReducer.scala
                            ├── PatternSearchReducer.scala
                            └── package-info.java
                        └── utils
                            ├── ConfigUtils.scala
                            ├── DataParser.scala
                            ├── DistanceMeasure.scala
                            ├── HBaseUtil.scala
                            ├── HDFSWriter.scala
                            ├── Mysql.scala
                            └── package-info.java


/.gitignore:
--------------------------------------------------------------------------------
  1 | ##############################
  2 | ####gitignore for eclipse#####
  3 | ##############################
  4 | .metadata
  5 | bin/
  6 | tmp/
  7 | *.tmp
  8 | *.bak
  9 | *.swp
 10 | *~.nib
 11 | local.properties
 12 | .settings/
 13 | .loadpath
 14 | .recommenders
 15 | 
 16 | # Eclipse Core
 17 | .project
 18 | 
 19 | # External tool builders
 20 | .externalToolBuilders/
 21 | 
 22 | # Locally stored "Eclipse launch configurations"
 23 | *.launch
 24 | 
 25 | # PyDev specific (Python IDE for Eclipse)
 26 | *.pydevproject
 27 | 
 28 | # CDT-specific (C/C++ Development Tooling)
 29 | .cproject
 30 | 
 31 | # JDT-specific (Eclipse Java Development Tools)
 32 | .classpath
 33 | 
 34 | # Java annotation processor (APT)
 35 | .factorypath
 36 | 
 37 | # PDT-specific (PHP Development Tools)
 38 | .buildpath
 39 | 
 40 | # sbteclipse plugin
 41 | .target
 42 | 
 43 | # Tern plugin
 44 | .tern-project
 45 | 
 46 | # TeXlipse plugin
 47 | .texlipse
 48 | 
 49 | # STS (Spring Tool Suite)
 50 | .springBeans
 51 | 
 52 | # Code Recommenders
 53 | .recommenders/
 54 | 
 55 | ##############################
 56 | #####gitignore for scala######
 57 | ##############################
 58 | *.class
 59 | *.log
 60 | 
 61 | # sbt specific
 62 | .cache
 63 | .history
 64 | .lib/
 65 | dist/*
 66 | target/
 67 | lib_managed/
 68 | src_managed/
 69 | project/boot/
 70 | project/plugins/project/
 71 | 
 72 | # Scala-IDE specific
 73 | .scala_dependencies
 74 | .worksheet
 75 | .cache-main
 76 | .cache-tests
 77 | 
 78 | ##############################
 79 | ######gitignore for app#######
 80 | ##############################
 81 | 
 82 | src/main/resources/reference.conf
 83 | data/*
 84 | 
 85 | ##############################
 86 | ####gitignore for python######
 87 | ##############################
 88 | 
 89 | # Byte-compiled / optimized / DLL files
 90 | __pycache__/
 91 | *.py[cod]
 92 | *$py.class
 93 | 
 94 | # C extensions
 95 | *.so
 96 | 
 97 | # Distribution / packaging
 98 | .Python
 99 | env/
100 | build/
101 | develop-eggs/
102 | dist/
103 | downloads/
104 | eggs/
105 | .eggs/
106 | lib/
107 | lib64/
108 | parts/
109 | sdist/
110 | var/
111 | *.egg-info/
112 | .installed.cfg
113 | *.egg
114 | 
115 | # PyInstaller
116 | #  Usually these files are written by a python script from a template
117 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
118 | *.manifest
119 | *.spec
120 | 
121 | # Installer logs
122 | pip-log.txt
123 | pip-delete-this-directory.txt
124 | 
125 | # Unit test / coverage reports
126 | htmlcov/
127 | .tox/
128 | .coverage
129 | .coverage.*
130 | .cache
131 | nosetests.xml
132 | coverage.xml
133 | *,cover
134 | .hypothesis/
135 | 
136 | # Translations
137 | *.mo
138 | *.pot
139 | 
140 | # Django stuff:
141 | *.log
142 | local_settings.py
143 | 
144 | # Flask instance folder
145 | instance/
146 | 
147 | # Scrapy stuff:
148 | .scrapy
149 | 
150 | # Sphinx documentation
151 | docs/_build/
152 | 
153 | # PyBuilder
154 | target/
155 | 
156 | # IPython Notebook
157 | .ipynb_checkpoints
158 | 
159 | # pyenv
160 | .python-version
161 | 
162 | # celery beat schedule file
163 | celerybeat-schedule
164 | 
165 | # dotenv
166 | .env
167 | 
168 | # virtualenv
169 | venv/
170 | ENV/
171 | 
172 | # Spyder project settings
173 | .spyderproject
174 | 
175 | # Rope project settings
176 | .ropeproject


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # locis
 2 | Implementation of  [A Parallel Spatial Co-location Mining Algorithm Based on MapReduce](docs/paper/paper.pdf) paper
 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.971748.svg)](https://doi.org/10.5281/zenodo.971748)
 4 | 
 5 | ## Colocation Pattern
 6 | 
 7 | A spatial colocation pattern is a set of features that co-occur in space. For example, two crimes, say Robbery and Assault, would form a colocation pattern if they are reported together at many places. Think of spatial colocation pattern mining as [association rule mining](https://en.wikipedia.org/wiki/Association_rule_learning) in the spatial domain.
 8 | 
 9 | ## Setup
10 | 
11 | * Download and setup Scala, Hadoop (with HDFS) and HBase for versions given [here](docs/implementation.md).
12 | * Refer [this](https://github.com/shagunsodhani/book-keeper) for sample values for Hadoop and HBase configurations in pseudo distributed mode and [this](docs/known-issues.md) for some known issues when setting up HBase. 
13 | * Start Hadoop using `$HADOOP_HOME/sbin/start-dfs.sh` and HBase using `$HBASE_HOME/bin/start-hbase.sh`.
14 | * Verify that Hadoop and HBase are working propery by opening [http://localhost:50070/](http://localhost:50070/) and [http://localhost:16010/](http://localhost:16010/) respectively.
15 | * Copy `src/main/resources/reference.conf.sample` to `src/main/resources/reference.conf` and populate values.
16 | * Run `mvn clean install` in project folder.
17 | 
18 | ### To download dataset
19 | 
20 | * Obtain an application token from [Socrata portal](https://dev.socrata.com/register) and copy it to `socrata.key` field in `reference.conf`.
21 | * Copy schema from `scripts/schema`.
22 | * Run `python scripts/scrapper/socrata.py`.
23 | 
24 | ### To load data in HDFS
25 | 
26 | * Run `scala -cp target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.DataLoader <input_path_to_write_raw_data>`
27 | * If no path is provided, it writes to `/user/locis/input/data`
28 | 
29 | ### Dummy Dataset
30 | 
31 | * A very small dataset (6 rows) can be found in `sampleData/data` file. The file can be used for testing the different MapReduce tasks without having to download the socrata dataset. 
32 | * Add the file to hdfs using the put command `$HADOOP_HOME/bin/hdfs dfs -put <path_to_locis>/sampleData/data <input_path_to_write_raw_data>` and proceed to run MapReduce tasks.
33 | 
34 | ### To run Neighbour Search MapReduce task
35 | 
36 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.NeighborSearch <input_path_to_read_raw_data> <output_path_to_write_neighbors>`
37 | 
38 | ### To run Neighbour Grouping MapReduce task
39 | 
40 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.NeighborGrouping <input_path_to_read_neighbors> <output_path_to_write_neighbor_groups>`
41 | 
42 | ### To run Count Instance MapReduce task
43 | 
44 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.CountInstance <input_path_to_read_neighbor_groups> <output_path_to_write_instance_count>`
45 | 
46 | ### To run Colocation Pattern Search MapReduce task
47 | 
48 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.PatternSearch <input_path_to_read_neighbor_groups> <output_path_to_write_prevalence_scores> <size_of_colocation>`
49 | 
50 | Note that for running colocation pattern search task for size k, the results for size 1 to *k-1* should already be in the db. So to find colocation patterns of size *k*, run the script for 1 to *k* and not just *k*. This task can be easily automated using a bash script. 
51 | 
52 | ### License
53 | 
54 | [MIT](https://shagun.mit-license.org/)
55 | 


--------------------------------------------------------------------------------
/docs/implementation.md:
--------------------------------------------------------------------------------
 1 | ## Dataset
 2 | 
 3 | [Crime data for Chicago city](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2)
 4 | 
 5 | ## Stack
 6 | 
 7 | | Tech   |  Version | 
 8 | |--------|----------| 
 9 | | Scala  |  2.11.7  | 
10 | | java   |    1.8   | 
11 | | Hadoop |  2.7.2   | 
12 | | HBase  |  1.2.1   | 
13 | 
14 | ## Algorithm
15 | 
16 | * Input: Data in the form (id, type, (latitude, longitude))
17 | * Map data points to different grids.
18 | * Use plane-sweep algorithm to find neighbors for each data point.
19 | * Perform neighbor grouping.
20 | * Count instances for different types.
21 | * Generate size k co-locations.
22 | 
23 | ## Using HBase Data Model
24 | 
25 | HBase is used at following places:
26 | 
27 | * Save *(event, count)* pairs in reducer for [counting instances of different event types](https://github.com/shagunsodhani/locis/issues/8). Here, we can use *event* as the *row key* and *count* as the *value*.
28 | 
29 | * Save prevalent colocation patterns in reducer for [co-location pattern search](https://github.com/shagunsodhani/locis/issues/7). Here we can use the *eventset* as the *row key*, *size* as the *column key* and *[instance]* as the *value*.
30 | 
31 | * Read size *k-1* colocations in *scanNTransactions* method in mapper for [co-location pattern search](https://github.com/shagunsodhani/locis/issues/7). Here, the lookup can be performed easily using the *row key* for a given size (*column key*).
32 | 
33 | ## Notes
34 | 
35 | * The algorithm does not perform candidate set generation.
36 | 


--------------------------------------------------------------------------------
/docs/known-issues.md:
--------------------------------------------------------------------------------
1 | ### HBase
2 | 
3 | * In pseduo distributed mode, if the HMaster is not up with a single run of `$HBASE_HOME/bin/start-hbase.sh`, run the command one more time.


--------------------------------------------------------------------------------
/docs/paper/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/docs/paper/paper.pdf


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |    <modelVersion>4.0.0</modelVersion>
  4 |    <groupId>locis</groupId>
  5 |    <artifactId>locis</artifactId>
  6 |    <version>0.0.1-SNAPSHOT</version>
  7 |    <properties>
  8 |       <maven.compiler.source>1.6</maven.compiler.source>
  9 |       <maven.compiler.target>1.6</maven.compiler.target>
 10 |       <encoding>UTF-8</encoding>
 11 |       <scala.version>2.11.7</scala.version>
 12 |    </properties>
 13 |    <dependencies>
 14 |       <dependency>
 15 |          <!-- Scala dependency -->
 16 |          <groupId>org.scala-lang</groupId>
 17 |          <artifactId>scala-library</artifactId>
 18 |          <version>2.11.7</version>
 19 |       </dependency>
 20 |       <dependency>
 21 |          <groupId>org.apache.hadoop</groupId>
 22 |          <artifactId>hadoop-common</artifactId>
 23 |          <version>2.7.2</version>
 24 |       </dependency>
 25 |       <dependency>
 26 |          <groupId>org.apache.hadoop</groupId>
 27 |          <artifactId>hadoop-client</artifactId>
 28 |          <version>2.7.2</version>
 29 |       </dependency>
 30 |       <dependency>
 31 |          <groupId>org.apache.hadoop</groupId>
 32 |          <artifactId>hadoop-hdfs</artifactId>
 33 |          <version>2.7.2</version>
 34 |       </dependency>
 35 |       <dependency>
 36 |          <groupId>org.apache.hadoop</groupId>
 37 |          <artifactId>hadoop-mapreduce-client-core</artifactId>
 38 |          <version>2.7.2</version>
 39 |       </dependency>
 40 |       <dependency>
 41 |          <groupId>com.github.mauricio</groupId>
 42 |          <artifactId>mysql-async_2.11</artifactId>
 43 |          <version>0.2.19</version>
 44 |       </dependency>
 45 |       <dependency>
 46 |          <groupId>org.slf4j</groupId>
 47 |          <artifactId>slf4j-api</artifactId>
 48 |          <version>1.7.21</version>
 49 |       </dependency>
 50 |       <dependency>
 51 |          <groupId>org.slf4j</groupId>
 52 |          <artifactId>slf4j-simple</artifactId>
 53 |          <version>1.7.21</version>
 54 |       </dependency>
 55 |       <dependency>
 56 |          <groupId>com.typesafe</groupId>
 57 |          <artifactId>config</artifactId>
 58 |          <version>1.3.0</version>
 59 |       </dependency>
 60 |       <dependency>
 61 |          <groupId>org.apache.hbase</groupId>
 62 |          <artifactId>hbase-client</artifactId>
 63 |          <version>1.2.1</version>
 64 |       </dependency>
 65 |    </dependencies>
 66 |    <build>
 67 |       <finalName>locis</finalName>
 68 |       <sourceDirectory>src/main/scala</sourceDirectory>
 69 |       <testSourceDirectory>src/test/scala</testSourceDirectory>
 70 |       <plugins>
 71 |          <plugin>
 72 |             <groupId>org.scala-tools</groupId>
 73 |             <artifactId>maven-scala-plugin</artifactId>
 74 |             <version>2.15.0</version>
 75 |             <executions>
 76 |                <execution>
 77 |                   <goals>
 78 |                      <goal>compile</goal>
 79 |                      <goal>testCompile</goal>
 80 |                   </goals>
 81 |                </execution>
 82 |             </executions>
 83 |          </plugin>
 84 |          <plugin>
 85 |             <groupId>org.apache.maven.plugins</groupId>
 86 |             <artifactId>maven-jar-plugin</artifactId>
 87 |             <version>2.3</version>
 88 |             <configuration>
 89 |                <outputDirectory>${basedir}/target</outputDirectory>
 90 |             </configuration>
 91 |          </plugin>
 92 |          <plugin>
 93 |             <groupId>org.apache.maven.plugins</groupId>
 94 |             <artifactId>maven-shade-plugin</artifactId>
 95 |             <version>2.3</version>
 96 |             <executions>
 97 |                <execution>
 98 |                   <phase>package</phase>
 99 |                   <goals>
100 |                      <goal>shade</goal>
101 |                   </goals>
102 |                </execution>
103 |             </executions>
104 |             <configuration>
105 |                <filters>
106 |                   <filter>
107 |                      <artifact>*:*</artifact>
108 |                      <excludes>
109 |                         <exclude>META-INF/*.SF</exclude>
110 |                         <exclude>META-INF/*.DSA</exclude>
111 |                         <exclude>META-INF/*.RSA</exclude>
112 |                      </excludes>
113 |                   </filter>
114 |                </filters>
115 |                <finalName>uber-${project.artifactId}-${project.version}</finalName>
116 |             </configuration>
117 |          </plugin>
118 |       </plugins>
119 |    </build>
120 | </project>


--------------------------------------------------------------------------------
/sampleData/data:
--------------------------------------------------------------------------------
1 | 1,A,1,1,0,0,1,0,0,0
2 | 2,A,0,0,0,0,1,0,0,0
3 | 3,B,1,2,0,0,1,0,0,0
4 | 4,C,1,3,0,0,1,0,0,0
5 | 5,D,2,3,0,0,1,0,0,0
6 | 6,E,1171610,11903564,0,0,2,0,0,0


--------------------------------------------------------------------------------
/scripts/database/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/database/__init__.py


--------------------------------------------------------------------------------
/scripts/database/mysql.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from utils.configParser import parse
 3 | import MySQLdb
 4 | 
 5 | def connect(config_path):
 6 | 
 7 |     '''Open database connection and return conn object to perform database queries'''
 8 |     config = parse(config_path)
 9 |     host = config['mysql.host']
10 |     user = config['mysql.username']
11 |     passwd = config['mysql.password']
12 |     db = config['mysql.database']
13 | 
14 |     try:
15 |         conn=MySQLdb.connect(host, user, passwd, db)
16 |         return conn
17 |     except MySQLdb.Error, e:
18 |         print "ERROR %d IN CONNECTION: %s" % (e.args[0], e.args[1])
19 | 
20 | 
21 | def write(sql,cursor,conn):
22 |     '''Perform insert and update operations on the databse.
23 |        Need to pass the cursor object as a parameter'''
24 |     try:
25 |         cursor.execute(sql)
26 |         conn.commit()
27 |     except MySQLdb.ProgrammingError, e:
28 |         print "ERROR %d IN WRITE OPERATION: %s" % (e.args[0], e.args[1])
29 |         print "LAST QUERY WAS: %s" %sql
30 | 
31 | 
32 | def read(sql,cursor):
33 |     '''Perform read operations on the databse.
34 |        Need to pass the cursor object as a parameter'''
35 |     try:
36 |         cursor.execute(sql)
37 |         result = cursor.fetchall()
38 |         return result
39 |     except MySQLdb.ProgrammingError, e:
40 |         print "ERROR %d IN READ OPERATION: %s" % (e.args[0], e.args[1])
41 |         print "LAST QUERY WAS: %s" %sql
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/scripts/schema/spatium.sql:
--------------------------------------------------------------------------------
 1 | -- phpMyAdmin SQL Dump
 2 | -- version 4.0.10deb1
 3 | -- http://www.phpmyadmin.net
 4 | --
 5 | -- Host: localhost
 6 | -- Generation Time: Apr 24, 2016 at 09:15 AM
 7 | -- Server version: 5.5.49-0ubuntu0.14.04.1
 8 | -- PHP Version: 5.5.9-1ubuntu4.16
 9 | 
10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
11 | SET time_zone = "+00:00";
12 | 
13 | 
14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
17 | /*!40101 SET NAMES utf8 */;
18 | 
19 | --
20 | -- Database: `spatium`
21 | --
22 | 
23 | -- --------------------------------------------------------
24 | 
25 | --
26 | -- Table structure for table `dataset`
27 | --
28 | 
29 | CREATE TABLE IF NOT EXISTS `dataset` (
30 |   `id` int(11) NOT NULL,
31 |   `longitude` double(25,15) NOT NULL,
32 |   `latitude` double(25,15) NOT NULL,
33 |   `primary_type` varchar(100) NOT NULL,
34 |   `date` int(11) NOT NULL,
35 |   `x_coordinate` int(11) NOT NULL,
36 |   `y_coordinate` int(11) NOT NULL,
37 |   `district` int(11) NOT NULL,
38 |   `ward` int(11) NOT NULL,
39 |   `community_area` int(11) NOT NULL,
40 |   `fbi_code` varchar(20) NOT NULL,
41 |   PRIMARY KEY (`id`),
42 |   KEY `primary_type` (`primary_type`),
43 |   KEY `date` (`date`)
44 | ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
45 | 
46 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
47 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
48 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
49 | 


--------------------------------------------------------------------------------
/scripts/scrapper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/scrapper/__init__.py


--------------------------------------------------------------------------------
/scripts/scrapper/socrata.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | import datetime
 4 | import requests
 5 | import os
 6 | 
 7 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 8 | 
 9 | if not path in sys.path:
10 |     sys.path.insert(1, path)
11 | del path
12 | 
13 | from utils.configParser import parse
14 | import database.mysql as db
15 | 
16 | def date_to_timestamp(stime):
17 |     stime = stime.split('T')
18 |     date = stime[0]
19 |     temp = date.split("-")
20 |     a = []
21 |     a.append(int(temp[0]))
22 |     a.append(int(temp[1]))
23 |     a.append(int(temp[2]))
24 |     date = stime[1].split(':')
25 |     for i in date:
26 |         a.append(int(i))
27 |     a = datetime.datetime(a[0], a[1], a[2], a[3], a[4], a[5]).timetuple()
28 |     # year, month, day, hour, minute, second, microsecond, and tzinfo.
29 |     return int(time.mktime(a))
30 | 
31 | class Socrata():
32 |     """Class to fetch data using socrata API"""
33 | 
34 |     def __init__(self, limit, config_path):
35 |         
36 |         self.conn = db.connect(config_path)
37 |         self.cursor = self.conn.cursor()
38 |         self.url = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"
39 |         self.limit = limit
40 |         config = parse(config_path)
41 |         self.socrata_key = config["socrata.key"]
42 |     
43 |     def fetch_json(self, offset=0):
44 |         payload = {'$limit': self.limit, '$offset': offset, '$$app_token':self.socrata_key}
45 |         
46 |         try :
47 |             r = requests.get(self.url, params=payload)
48 |         except requests.exceptions.ChunkedEncodingError:
49 |             print payload
50 |             return self.fetch_json(offset = offset)
51 | 
52 |         to_save = ['latitude', 'longitude', 'id', 'primary_type','date', 'x_coordinate', 'y_coordinate', 
53 |         'district', 'ward', 'community_area', 'fbi_code']
54 |         print r.url
55 |         if r.json():
56 |             sql = "INSERT INTO dataset ("
57 |             for i in to_save:
58 |                 sql+=i+" , "
59 |             sql=sql[:-2]
60 |             sql+= ") VALUES "
61 |             for i in r.json():
62 |                 to_insert = "( "
63 |                 for j in to_save:
64 |                     if j not in i.keys():
65 |                         i[j] = "\'\'"
66 |                     else:
67 |                         if j == 'date':
68 |                             i[j] = str(date_to_timestamp(i[j]))
69 |                         i[j] = "\'"+i[j]+"\'"   
70 |                     to_insert+=i[j]+", "
71 |                 to_insert = to_insert[:-2]
72 |                 to_insert+='), '
73 |                 sql+=to_insert
74 |             sql = sql[:-2]
75 |             db.write(sql, self.cursor, self.conn)
76 |             return 1
77 |         else:
78 |             return 0
79 | 
80 |     def fetch_all(self, offset = 0):
81 |         while(self.fetch_json(offset = offset)):
82 |             offset+=self.limit
83 |             print offset, " elements inserted in db."
84 | 
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     config_path = "src/main/resources/reference.conf"
89 |     a = Socrata(limit = 1000, config_path = config_path)
90 |     offset = 4870000
91 |     # a.fetch_json(offset)
92 |     a.fetch_all(offset = 0) 


--------------------------------------------------------------------------------
/scripts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/utils/__init__.py


--------------------------------------------------------------------------------
/scripts/utils/configParser.py:
--------------------------------------------------------------------------------
 1 | def parse(file_path):
 2 |     # Method to read the config file.
 3 |     # Using a custom function for parsing so that we have only one config for 
 4 |     # both the scripts and the mapreduce tasks.
 5 |     config = {}
 6 |     with open(file_path) as f:
 7 |         for line in f:
 8 |             data = line.strip()
 9 |             if(data and not data.startswith("#")):
10 |                 (key, value) = data.split("=")
11 |                 config[key] = value
12 |     return config


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set everything to be logged to the console and file
 2 | LOG_LEVEL=INFO
 3 | COMPONENT=locis
 4 | 
 5 | log4j.rootCategory=${LOG_LEVEL}, console, file
 6 | 
 7 | #Console appender
 8 | log4j.appender.console=org.apache.log4j.ConsoleAppender
 9 | log4j.appender.console.target=System.out
10 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: [${COMPONENT}] %m%n
12 | 
13 | # Settings to quiet third party logs that are too verbose
14 | log4j.logger.parquet.hadoop.ParquetInputFormat=WARN
15 | log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=WARN
16 | log4j.logger.org.eclipse.jetty=WARN
17 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
18 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
19 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
20 | 
21 | # Direct log messages to a log file
22 | log4j.appender.file=org.apache.log4j.RollingFileAppender
23 | log4j.appender.file.File=logging.log
24 | log4j.appender.file.MaxFileSize=10MB
25 | log4j.appender.file.MaxBackupIndex=10
26 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
27 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - [${COMPONENT}] %m%n
28 | 


--------------------------------------------------------------------------------
/src/main/resources/reference.conf.sample:
--------------------------------------------------------------------------------
 1 | #mysql config
 2 | 
 3 | mysql.username=
 4 | mysql.host=
 5 | mysql.port=
 6 | mysql.password=
 7 | mysql.database=
 8 | 
 9 | #Hadoop FileSystem config
10 | 
11 | fs.defaultFS=
12 | 
13 | distance.threshold=50000
14 | # Note that this value is with respect to x-y coordinate system.
15 | participationIndex.threshold=0.1
16 | 
17 | socrata.key=


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/CountInstance.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.io.LongWritable
 5 | import org.apache.hadoop.io.Text
 6 | import org.apache.hadoop.mapreduce.Job
 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 9 | 
10 | import com.github.locis.map.CountInstanceMapper
11 | import com.github.locis.reduce.CountInstanceReducer
12 | import com.github.locis.utils.HBaseUtil
13 | 
14 | object CountInstance extends MapReduceJob {
15 | 
16 |   private val hBaseUtil = new HBaseUtil()
17 | 
18 |   def jobName: String = {
19 |     "CountInstance"
20 |   }
21 |   
22 |     override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " +
23 |     "com.github.locis.apps." + jobName + " <input_path_to_read_neighbor_groups> <output_path_to_write_instance_count>"
24 | 
25 |   def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = {
26 |     hBaseUtil.createInstanceCountTable()
27 |     val job = new Job(hadoopConfiguration, jobName)
28 |     job.setMapperClass(classOf[CountInstanceMapper])
29 |     job.setReducerClass(classOf[CountInstanceReducer])
30 |     job.setMapOutputKeyClass(classOf[Text])
31 |     job.setMapOutputValueClass((classOf[LongWritable]))
32 |     job.setOutputKeyClass(classOf[Text])
33 |     job.setOutputValueClass(classOf[LongWritable])
34 |     FileInputFormat.addInputPath(job, inputPath)
35 |     FileOutputFormat.setOutputPath(job, outputPath)
36 |     val status = if (job.waitForCompletion(true)) 0 else 1
37 |     System.exit(status)
38 |   }
39 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/DataLoader.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.slf4j.Logger
 4 | import org.slf4j.LoggerFactory
 5 | 
 6 | import com.github.locis.utils.DataParser
 7 | import com.github.locis.utils.HDFSWriter
 8 | import com.github.locis.utils.Mysql
 9 | 
10 | object DataLoader {
11 | 
12 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
13 | 
14 |   def getCountOfRows() = {
15 |     val mysql = new Mysql()
16 |     val sqlQuery = "SELECT count(*) FROM dataset"
17 |     mysql.runQuery(sqlQuery)
18 |   }
19 | 
20 |   def loadData(limit: Long = -1, start: Long = -1) = {
21 |     //    Be careful with limit=-1. It would bring in a lot of data. (~500 mb)
22 |     val mysql = new Mysql()
23 |     val baseQuery = "SELECT " + DataParser.getAttributeList.mkString(",") +
24 |       " FROM dataset ORDER BY date ASC"
25 |     val sqlQuery = {
26 |       if (limit < 0) {
27 |         baseQuery
28 |       } else if (start < 0) {
29 |         baseQuery + " LIMIT " + limit.toString()
30 |       } else {
31 |         baseQuery + " LIMIT " + start.toString() + ", " + limit.toString()
32 |       }
33 |     }
34 |     mysql.runQuery(sqlQuery)
35 |   }
36 | 
37 |   def writeData(pathToWrite: String = "/user/locis/input/data", limit: Long = -1, start: Long = -1) = {
38 |     val writer = new HDFSWriter().getWriter(pathToWrite)
39 |     val result = loadData(limit, start)
40 |     result.foreach { x => writer.write((x.mkString(",") + "\n").getBytes) }
41 |     writer.close()
42 |   }
43 | 
44 |   def writeAllData(pathToWrite: String = "/user/locis/input/data") = {
45 |     val writer = new HDFSWriter().getWriter(pathToWrite)
46 |     val start: Long = 0
47 |     val limit: Long = 100000
48 |     val countOfRows: Long = getCountOfRows()(0)(0).asInstanceOf[Long]
49 |     val numberOfIterations = countOfRows / limit + 1
50 |     (1 to numberOfIterations.asInstanceOf[Int]).foreach {
51 |       iterationCounter =>
52 |         {
53 |           loadData(limit, start + limit * iterationCounter)
54 |             .foreach { x => writer.write((x.mkString(",") + "\n").getBytes) }
55 |         }
56 |     }
57 |     writer.close()
58 |   }
59 | 
60 |   def main(args: Array[String]): Unit = {
61 |     val limit: Long = 10
62 |     val pathToWrite = {
63 |       if (args.isEmpty) {
64 |         "/user/locis/input/data"
65 |       } else {
66 |         args(0)
67 |       }
68 |     }
69 |     writeAllData()
70 |   }
71 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/MapReduceJob.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.util.GenericOptionsParser
 5 | import org.slf4j.Logger
 6 | import org.slf4j.LoggerFactory
 7 | import com.github.locis.utils.ConfigUtils
 8 | import com.typesafe.config.ConfigFactory
 9 | 
10 | abstract class MapReduceJob {
11 | 
12 |   protected val logger: Logger = LoggerFactory.getLogger(getClass)
13 | 
14 |   protected val hadoopConfiguration = ConfigUtils.getHadoopConfiguration()
15 | 
16 |   protected val userConfiguration = ConfigFactory.load
17 | 
18 |   def jobName(): String
19 | 
20 |   protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " +
21 |     "com.github.locis.apps." + jobName + " <inputFileName> <outputFileName>"
22 | 
23 |   protected val numberOfArgsExpected = 2
24 | 
25 |   def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit
26 | 
27 |   def main(args: Array[String]): Unit = {
28 |     val otherArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs
29 |     if (otherArgs.length != numberOfArgsExpected) {
30 |       println(errorMsg)
31 |     } else {
32 |       val inputPath = new Path(args(0))
33 |       val outputPath = new Path(args(1))
34 |       run(inputPath, outputPath, args)
35 |     }
36 | 
37 |   }
38 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/NeighborGrouping.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.io.Text
 5 | import org.apache.hadoop.mapreduce.Job
 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 8 | 
 9 | import com.github.locis.map.NeighborGroupingMapper
10 | import com.github.locis.reduce.NeighborGroupingReducer
11 | 
12 | object NeighborGrouping extends MapReduceJob {
13 | 
14 |   def jobName: String = {
15 |     "NeighborGrouping"
16 |   }
17 |   
18 |   override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " +
19 |     "com.github.locis.apps." + jobName + " <input_path_to_read_neighbors> <output_path_to_write_neighbor_groups>"
20 | 
21 |   def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = {
22 | 
23 |     val job = new Job(hadoopConfiguration, jobName)
24 |     job.setMapperClass(classOf[NeighborGroupingMapper])
25 |     job.setReducerClass(classOf[NeighborGroupingReducer])
26 |     job.setMapOutputKeyClass(classOf[Text])
27 |     job.setMapOutputValueClass((classOf[Text]))
28 |     job.setOutputKeyClass(classOf[Text])
29 |     job.setOutputValueClass(classOf[Text])
30 |     FileInputFormat.addInputPath(job, inputPath)
31 |     FileOutputFormat.setOutputPath(job, outputPath)
32 |     val status = if (job.waitForCompletion(true)) 0 else 1
33 |     System.exit(status)
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/NeighborSearch.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.io.Text
 5 | import org.apache.hadoop.mapreduce.Job
 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 8 | 
 9 | import com.github.locis.map.NeighborSearchMapper
10 | import com.github.locis.reduce.NeighborSearchReducer
11 | 
12 | object NeighborSearch extends MapReduceJob {
13 | 
14 |   def jobName: String = {
15 |     "NeighborSearch"
16 |   }
17 | 
18 |   override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " +
19 |     "com.github.locis.apps." + jobName + " <input_path_to_read_raw_data> <output_path_to_write_neighbors>"
20 | 
21 |   def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = {
22 | 
23 |     val job = new Job(hadoopConfiguration, jobName)
24 |     job.setMapperClass(classOf[NeighborSearchMapper])
25 |     job.setReducerClass(classOf[NeighborSearchReducer])
26 |     job.setMapOutputKeyClass(classOf[Text])
27 |     job.setMapOutputValueClass((classOf[Text]))
28 |     job.setOutputKeyClass(classOf[Text])
29 |     job.setOutputValueClass(classOf[Text])
30 |     FileInputFormat.addInputPath(job, inputPath)
31 |     FileOutputFormat.setOutputPath(job, outputPath)
32 |     val status = if (job.waitForCompletion(true)) 0 else 1
33 |     System.exit(status)
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/PatternSearch.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.apps
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.io.DoubleWritable
 5 | import org.apache.hadoop.io.Text
 6 | import org.apache.hadoop.mapreduce.Job
 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 9 | 
10 | import com.github.locis.map.PatternSearchMapper
11 | import com.github.locis.reduce.PatternSearchReducer
12 | import com.github.locis.utils.HBaseUtil
13 | 
14 | object PatternSearch extends MapReduceJob {
15 | 
16 |   private val hBaseUtil = new HBaseUtil()
17 | 
18 |   def jobName: String = {
19 |     "PatternSearch"
20 |   }
21 | 
22 |   override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " +
23 |     "com.github.locis.apps." + jobName + " <input_path_to_read_neighbor_groups> " + 
24 |     "<output_path_to_write_prevalence_scores> <size_of_colocation>"
25 | 
26 |   override protected val numberOfArgsExpected = 3
27 | 
28 |   def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = {
29 |     val size = args(2)
30 |     val thresholdParticipationIndex = userConfiguration.getString("participationIndex.threshold")
31 |     hBaseUtil.createColocationStoreTable()
32 |     hadoopConfiguration.set("k", size)
33 |     hadoopConfiguration.set("thresholdParticipationIndex", thresholdParticipationIndex)
34 |     val job = new Job(hadoopConfiguration, jobName)
35 |     job.setMapperClass(classOf[PatternSearchMapper])
36 |     job.setReducerClass(classOf[PatternSearchReducer])
37 |     job.setMapOutputKeyClass(classOf[Text])
38 |     job.setMapOutputValueClass((classOf[Text]))
39 |     job.setOutputKeyClass(classOf[Text])
40 |     job.setOutputValueClass(classOf[DoubleWritable])
41 |     FileInputFormat.addInputPath(job, inputPath)
42 |     FileOutputFormat.setOutputPath(job, outputPath)
43 |     val status = if (job.waitForCompletion(true)) 0 else 1
44 |     System.exit(status)
45 |   }
46 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/apps/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author shagun
6 |  *
7 |  */
8 | package com.github.locis.apps;


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/map/CountInstanceMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.map
 2 | 
 3 | import org.apache.hadoop.io.LongWritable
 4 | import org.apache.hadoop.io.Text
 5 | import org.apache.hadoop.mapreduce.Mapper
 6 | import org.slf4j.Logger
 7 | import org.slf4j.LoggerFactory
 8 | 
 9 | import com.github.locis.utils.DataParser
10 | 
11 | /*
12 |  * This class maps the input data points to value 1.
13 |  * See : https://github.com/shagunsodhani/locis/issues/8
14 |  */
15 | 
16 | class CountInstanceMapper extends Mapper[LongWritable, Text, Text, LongWritable] {
17 | 
18 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
19 | 
20 |   override def map(key: LongWritable, value: Text,
21 |                    context: Mapper[LongWritable, Text, Text, LongWritable]#Context) = {
22 |     val line = value.toString().split("\t")
23 |     val keyString = line(0)
24 |     context.write(new Text(DataParser.getType(keyString)), new LongWritable(1L))
25 |   }
26 | 
27 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/map/NeighborGroupingMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.map
 2 | 
 3 | import org.apache.hadoop.io.LongWritable
 4 | import org.apache.hadoop.io.Text
 5 | import org.apache.hadoop.mapreduce.Mapper
 6 | import org.slf4j.Logger
 7 | import org.slf4j.LoggerFactory
 8 | 
 9 | import com.github.locis.utils.DataParser
10 | 
11 | /*
12 |  * This class maps the input data points to one of their neighbors.
13 |  * See : https://github.com/shagunsodhani/locis/issues/6
14 |  */
15 | 
16 | class NeighborGroupingMapper extends Mapper[LongWritable, Text, Text, Text] {
17 | 
18 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
19 | 
20 |   override def map(key: LongWritable, value: Text,
21 |                    context: Mapper[LongWritable, Text, Text, Text]#Context) = {
22 |     val line = value.toString().split("\t")
23 |     val keyString = line(0)
24 |     val valueString = line(1)
25 |     if (keyString == valueString || DataParser.getType(keyString) < DataParser.getType(valueString)) {
26 |       context.write(new Text(keyString), new Text(valueString))
27 |     }
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/map/NeighborSearchMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.map
 2 | 
 3 | import org.apache.hadoop.io.LongWritable
 4 | import org.apache.hadoop.io.Text
 5 | import org.apache.hadoop.mapreduce.Mapper
 6 | import org.slf4j.Logger
 7 | import org.slf4j.LoggerFactory
 8 | import com.github.locis.utils.DataParser
 9 | 
10 | /*
11 |  * This class maps the input data points to the grid number. For now, we are 
12 |  * using a dummy implementation. 
13 |  * See : https://github.com/shagunsodhani/locis/issues/2
14 |  */
15 | 
16 | class NeighborSearchMapper extends Mapper[LongWritable, Text, Text, Text] {
17 | 
18 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
19 |   
20 |   private def getGridId(dataPoint: String): String = {
21 |     DataParser.getKeyForGridMapping(dataPoint)
22 |   }
23 |   override def map(key: LongWritable, value: Text,
24 |                    context: Mapper[LongWritable, Text, Text, Text]#Context) = {
25 |     val gridNumber = new Text(getGridId(value.toString()))
26 |     context.write(gridNumber, value)
27 |   }
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/map/PatternSearchMapper.scala:
--------------------------------------------------------------------------------
  1 | package com.github.locis.map
  2 | 
  3 | import org.apache.hadoop.io.LongWritable
  4 | import org.apache.hadoop.io.Text
  5 | import org.apache.hadoop.mapreduce.Mapper
  6 | import org.slf4j.Logger
  7 | import org.slf4j.LoggerFactory
  8 | 
  9 | import com.github.locis.utils.DataParser
 10 | import com.github.locis.utils.HBaseUtil
 11 | 
 12 | /*
 13 |  * This class maps the input data points to (eventset, instance).
 14 |  * See : https://github.com/shagunsodhani/locis/issues/7
 15 |  */
 16 | 
 17 | class PatternSearchMapper extends Mapper[LongWritable, Text, Text, Text] {
 18 | 
 19 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
 20 | 
 21 |   private val hBaseUtil = new HBaseUtil()
 22 | 
 23 |   private val internalSeprator = "#"
 24 |   //  used to separate records within a colocation instance
 25 | 
 26 |   private val externalSeperator = "\t"
 27 |   //  used to separate records in a list of colocations
 28 | 
 29 |   private def scanNTransactions(neighborhood: Array[String], k: Int): Iterator[Array[String]] = {
 30 |     val key = neighborhood(0)
 31 |     neighborhood
 32 |       .filterNot { _.equals(key) }
 33 |       .combinations(k - 1)
 34 |   }
 35 | 
 36 |   private def checkCliqueness(instance: Array[String], k: Int): Boolean = {
 37 |     if (instance.isEmpty) {
 38 |       true
 39 |     } else {
 40 |       val sortedInstance = instance
 41 |         .sortBy { dataPoint => DataParser.getType(dataPoint) }
 42 |       val rowName = sortedInstance
 43 |         .map { dataPoint => DataParser.getType(dataPoint) }
 44 |         .mkString(internalSeprator)
 45 | 
 46 |       hBaseUtil
 47 |         .readColocationStoreRow(rowName = rowName, size = k - 1)
 48 |         .split(externalSeperator)
 49 |         .contains(sortedInstance.mkString(internalSeprator))
 50 |     }
 51 |   }
 52 | 
 53 |   private def eventTypeOf(instance: Array[String]): Array[String] = {
 54 |     instance.map { DataParser.getType }
 55 |   }
 56 | 
 57 |   override def map(key: LongWritable, value: Text,
 58 |                    context: Mapper[LongWritable, Text, Text, Text]#Context) = {
 59 |     val k = context.getConfiguration.get("k").toInt
 60 |     val line = value.toString().split(externalSeperator)
 61 |     val keyString = line(0)
 62 | 
 63 |     /*
 64 |      *     I have a feeling that this could explode for very large datasets. It should not be brought in-memory.
 65 |      *     But querying HBase everytime would be very slow. Given the number of steps this set would take to be created, I am not convinced if
 66 |      *     filtering on basis of prevalent colocation types would indeed benefit the implementation.
 67 |      */
 68 |     val prevalentColocationTypes = {
 69 |       if (k > 1) {
 70 |         hBaseUtil
 71 |           .scanColocationStoreColumn("size", k - 1)
 72 |           .flatMap(colocationType => colocationType.split(internalSeprator).toIterable)
 73 |           .toSet
 74 |       } else {
 75 |         hBaseUtil
 76 |           .scanInstanceCountColumn("size")
 77 |           .flatMap(colocationType => colocationType.split(internalSeprator).toIterable)
 78 |           .toSet
 79 |       }
 80 |     }
 81 | 
 82 |     val neighborhood = line
 83 |       .tail
 84 |       .filter { neighbor => prevalentColocationTypes.contains(DataParser.getType(neighbor)) }
 85 | 
 86 |     scanNTransactions(neighborhood, k)
 87 |       .filter { checkCliqueness(_, k) }
 88 |       .foreach { instance =>
 89 |         {
 90 |           val eventSet = (eventTypeOf(instance) ++ Array(DataParser.getType(keyString)))
 91 |             .sortBy { identity }
 92 |             .mkString(internalSeprator)
 93 |           val sortedInstance = (instance ++ Array(keyString))
 94 |             .sortBy { dataPoint => DataParser.getType(dataPoint) }
 95 |             .mkString(internalSeprator)
 96 |           context.write(new Text(eventSet), new Text(sortedInstance))
 97 |         }
 98 |       }
 99 |   }
100 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/map/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author shagun
6 |  *
7 |  */
8 | package com.github.locis.map;


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/reduce/CountInstanceReducer.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.reduce
 2 | 
 3 | import scala.collection.JavaConversions.asScalaIterator
 4 | 
 5 | import org.apache.hadoop.io.LongWritable
 6 | import org.apache.hadoop.io.Text
 7 | import org.apache.hadoop.mapreduce.Reducer
 8 | import org.slf4j.Logger
 9 | import org.slf4j.LoggerFactory
10 | 
11 | import com.github.locis.utils.HBaseUtil
12 | 
13 | /*
14 |  * This class counts the number of instances of each type and saves it to HBase
15 |  * See : https://github.com/shagunsodhani/locis/issues/8
16 |  */
17 | 
18 | class CountInstanceReducer extends Reducer[Text, LongWritable, Text, LongWritable] {
19 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
20 | 
21 |   private val hBaseUtil = new HBaseUtil()
22 | 
23 |   override def reduce(key: Text, values: java.lang.Iterable[LongWritable],
24 |                       context: Reducer[Text, LongWritable, Text, LongWritable]#Context): Unit = {
25 |     val sum = values.iterator().map { x => x.get() }.sum
26 |     hBaseUtil.writeToInstanceCountTable(
27 |       List(
28 |         (key.toString(), sum.toString())))
29 |     context.write(key, new LongWritable(sum))
30 |   }
31 | 
32 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/reduce/NeighborGroupingReducer.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.reduce
 2 | 
 3 | import org.apache.hadoop.io.Text
 4 | import org.apache.hadoop.mapreduce.Reducer
 5 | import org.slf4j.Logger
 6 | import org.slf4j.LoggerFactory
 7 | 
 8 | import com.github.locis.utils.DataParser
 9 | 
10 | /*
11 |  * This class groups all the neighbors for a given key (dataPoint) in sorted order. 
12 |  * algorithm. 
13 |  * See : https://github.com/shagunsodhani/locis/issues/6
14 |  */
15 | 
16 | class NeighborGroupingReducer extends Reducer[Text, Text, Text, Text] {
17 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
18 | 
19 |   override def reduce(key: Text, values: java.lang.Iterable[Text],
20 |                       context: Reducer[Text, Text, Text, Text]#Context): Unit = {
21 |     val dataPointIterator = values.iterator()
22 |     val sortedObjectSet = {
23 |       var tempObjectSet = Set[String]()
24 |       while (dataPointIterator.hasNext()) {
25 |         tempObjectSet += dataPointIterator.next().toString()
26 |       }
27 |       tempObjectSet.toSeq.sortBy { dataPoint => DataParser.getType(dataPoint) }
28 |     }
29 |     val nRecord = (Seq(key.toString()) ++ sortedObjectSet).mkString("\t")
30 | //    This step adds the key to the nRecord set for the second time which seems not only unnecessary but also erroneous.
31 | //    val nRecord = sortedObjectSet.mkString("\t")
32 |     context.write(key, new Text(nRecord))
33 |   }
34 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/reduce/NeighborSearchReducer.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.reduce
 2 | 
 3 | import org.apache.hadoop.io.Text
 4 | import org.apache.hadoop.mapreduce.Reducer
 5 | import org.slf4j.Logger
 6 | import org.slf4j.LoggerFactory
 7 | 
 8 | import com.github.locis.utils.DataParser
 9 | import com.github.locis.utils.DistanceMeasure
10 | import com.typesafe.config.ConfigFactory
11 | 
12 | /*
13 |  * This class finds all the neighbors in a given grid using the plane-sweep 
14 |  * algorithm. 
15 |  * See : https://github.com/shagunsodhani/locis/issues/3
16 |  */
17 | 
18 | class NeighborSearchReducer extends Reducer[Text, Text, Text, Text] {
19 | 
20 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
21 | 
22 |   private val distanceThreshold: Double = ConfigFactory.load.getDouble("distance.threshold")
23 | 
24 |   private def getEuclideanDistance(dataPoint1: String, dataPoint2: String): Double = {
25 |     val y1 = DataParser.getY(dataPoint1)
26 |     val y2 = DataParser.getY(dataPoint2)
27 | 
28 |     val x1 = DataParser.getX(dataPoint1)
29 |     val x2 = DataParser.getX(dataPoint2)
30 | 
31 |     DistanceMeasure.euclideanDistance(x1, y1, x2, y2)
32 |   }
33 | 
34 |   private def getHaversineDistance(dataPoint1: String, dataPoint2: String): Double = {
35 |     val lat1 = DataParser.getY(dataPoint1)
36 |     val lat2 = DataParser.getY(dataPoint2)
37 | 
38 |     val lng1 = DataParser.getX(dataPoint1)
39 |     val lng2 = DataParser.getX(dataPoint2)
40 | 
41 |     DistanceMeasure.haversineDistance(lat1, lng1, lat2, lng2)
42 |   }
43 | 
44 |   private def planeSweep(dataPoints: java.lang.Iterable[Text]): Set[(String, String)] = {
45 |     val dataPointIterator = dataPoints.iterator()
46 |     val objectSet = {
47 |       var tempObjectSet = Set[String]()
48 |       while (dataPointIterator.hasNext()) {
49 |         tempObjectSet += dataPointIterator.next().toString()
50 |       }
51 |       tempObjectSet.toSeq.sortBy { dataPoint => DataParser.getX(dataPoint) }
52 |     }
53 |     //    sorted Object Set
54 |     var activeSet = Set[String]()
55 |     var resultSet = Set[(String, String)]()
56 |     var j = 0
57 |     val n = objectSet.length
58 |     (0 to n-1).foreach {
59 |       i =>
60 |         {
61 |           while (DataParser.getX(objectSet(i)) - DataParser.getX(objectSet(j)) > distanceThreshold) {
62 |             activeSet -= objectSet(i)
63 |             j += 1
64 |           }
65 |           val range = activeSet
66 |             .filter { dataPoint =>
67 |               {
68 |                 val yDataPoint = DataParser.getY(dataPoint)
69 |                 val yObjectSetPoint = DataParser.getY(objectSet(i))
70 |                 ((yDataPoint <= yObjectSetPoint + distanceThreshold)
71 |                   && (yDataPoint >= yObjectSetPoint - distanceThreshold))
72 |               }
73 |             }
74 |           range.filter {
75 |             dataPointInrange => (getEuclideanDistance(objectSet(i), dataPointInrange) <= distanceThreshold)
76 |           }.foreach { dataPoint => (resultSet += ((objectSet(i), dataPoint))) }
77 |           activeSet += objectSet(i)
78 | //    This step adds a data point as its own neighbor but does not seem to be required. Infact, using this step can make
79 | //    the scanNTransactions step slower.
80 |           resultSet += ((objectSet(i), objectSet(i)))
81 |         }
82 |     }
83 |     resultSet
84 |   }
85 | 
86 |   override def reduce(key: Text, values: java.lang.Iterable[Text],
87 |                       context: Reducer[Text, Text, Text, Text]#Context): Unit = {
88 |     planeSweep(values).foreach { dataPoint =>
89 |       {
90 |         context.write(new Text(dataPoint._1), new Text(dataPoint._2))
91 |       }
92 |     }
93 | 
94 |   }
95 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/reduce/PatternSearchReducer.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.reduce
 2 | 
 3 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter
 4 | 
 5 | import org.apache.hadoop.io.DoubleWritable
 6 | import org.apache.hadoop.io.Text
 7 | import org.apache.hadoop.mapreduce.Reducer
 8 | import org.slf4j.Logger
 9 | import org.slf4j.LoggerFactory
10 | 
11 | import com.github.locis.utils.DataParser
12 | import com.github.locis.utils.HBaseUtil
13 | /*
14 |  * This class groups all the instances for a given key (eventset). 
15 |  * See : https://github.com/shagunsodhani/locis/issues/7
16 |  */
17 | 
18 | class PatternSearchReducer extends Reducer[Text, Text, Text, DoubleWritable] {
19 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
20 | 
21 |   private val hBaseUtil = new HBaseUtil()
22 |   private val internalSeprator = "#"
23 |   //  used to separate records within a colocation instance
24 | 
25 |   private val externalSeperator = "\t"
26 |   //  used to separate records in a list of colocations
27 | 
28 |   private def getEventTypeCount(eventType: String): Long = {
29 |     hBaseUtil.readInstanceCountRow(eventType).toLong
30 |   }
31 |   private def computeParticipationIndexAndInstanceString(eventType: String,
32 |                                                          instanceListIterator: Iterable[String]) = {
33 |     /*
34 |      * We have to compute the instance string in this function because we can not consume the iterable twice.
35 |      */
36 |     val eventTypeList = eventType.split(internalSeprator)
37 |     val eventTypeToIntMapping = eventTypeList.zipWithIndex
38 |     //    Mapping each event type to an integer
39 |     val eventTypeToCountMapping = eventTypeToIntMapping.map {
40 |       eventTypeIntTuple =>
41 |         {
42 |           (eventTypeIntTuple._2, getEventTypeCount(eventTypeIntTuple._1))
43 |         }
44 |     }.toMap
45 |     //    Mapping each event type (via an integer) to a its total number of instances
46 |     val eventTypeToSetMapping = eventTypeToIntMapping.map {
47 |       eventTypeIntTuple =>
48 |         {
49 |           (eventTypeIntTuple._2, scala.collection.mutable.Set[String]())
50 |         }
51 |     }.toMap
52 |     //      new scala.collection.mutable.HashMap[Int, scala.collection.mutable.Set[String]]
53 |     //    Mapping each event type (via an integer) to a set of unique instances
54 | 
55 |     val instanceStringBuffer = new StringBuilder
56 | 
57 |     instanceListIterator.foreach { instanceList =>
58 |       {
59 |         val instancesToIntMapping = instanceList.split(internalSeprator).zipWithIndex
60 |         instancesToIntMapping.foreach {
61 |           instanceIntTuple =>
62 |             {
63 |               eventTypeToSetMapping(instanceIntTuple._2) += DataParser.getId(instanceIntTuple._1)
64 |               instanceStringBuffer ++= instanceIntTuple._1 + internalSeprator
65 |             }
66 |         }
67 |         instanceStringBuffer.deleteCharAt(instanceStringBuffer.length-1)
68 |         instanceStringBuffer ++= externalSeperator
69 |       }
70 |     }
71 |     instanceStringBuffer.deleteCharAt(instanceStringBuffer.length-1)
72 |     val participationIndex = eventTypeToSetMapping.map {
73 |       intSetTuple =>
74 |         {
75 |           intSetTuple._2.count { x => true }.toDouble / eventTypeToCountMapping(intSetTuple._1)
76 |         }
77 |     }.min
78 |     (participationIndex, instanceStringBuffer.toString())
79 |   }
80 | 
81 |   override def reduce(key: Text, values: java.lang.Iterable[Text],
82 |                       context: Reducer[Text, Text, Text, DoubleWritable]#Context): Unit = {
83 |     val k = context.getConfiguration.get("k").toInt
84 |     val eventType = key.toString()
85 |     val instanceListIterator = values.asScala.map { _.toString() }
86 |     val thresholdParticipationIndex = context.getConfiguration.get("thresholdParticipationIndex").toDouble
87 |     val (participationIndex, instanceString) = computeParticipationIndexAndInstanceString(eventType,
88 |       instanceListIterator)
89 |     if (participationIndex > thresholdParticipationIndex) {
90 |       hBaseUtil.writeToColocationStoreTable(List((eventType, instanceString)), k)
91 |       context.write(key, new DoubleWritable(participationIndex))
92 |     }
93 |   }
94 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/reduce/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author shagun
6 |  *
7 |  */
8 | package com.github.locis.reduce;


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/ConfigUtils.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.utils
 2 | 
 3 | import org.apache.hadoop.conf.Configuration
 4 | import org.apache.hadoop.fs.LocalFileSystem
 5 | import org.apache.hadoop.hdfs.DistributedFileSystem
 6 | import org.slf4j.Logger
 7 | import org.slf4j.LoggerFactory
 8 | 
 9 | import com.typesafe.config.ConfigFactory
10 | 
11 | object ConfigUtils {
12 | 
13 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
14 | 
15 |   def getHadoopConfiguration() = {
16 |     val configuration = new Configuration()
17 |     configuration.set("fs.defaultFS", ConfigFactory.load.getString("fs.defaultFS"))
18 |     configuration.set("fs.hdfs.impl", classOf[org.apache.hadoop.hdfs.DistributedFileSystem].getName())
19 |     configuration.set("fs.file.impl", classOf[org.apache.hadoop.fs.LocalFileSystem].getName())
20 |     configuration
21 |   }
22 | 
23 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/DataParser.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.utils
 2 | 
 3 | import org.slf4j.Logger
 4 | import org.slf4j.LoggerFactory
 5 | 
 6 | object DataParser {
 7 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
 8 | 
 9 |   private val sep = ","
10 | 
11 |   private val attributeList: List[String] = List(
12 |     "id",
13 |     "primary_type",
14 |     "x_coordinate",
15 |     "y_coordinate",
16 |     "longitude",
17 |     "latitude",
18 |     "district",
19 |     "ward",
20 |     "community_area",
21 |     "fbi_code")
22 | 
23 |   private val attributeIndexMap = attributeList.zipWithIndex.toMap
24 | 
25 |   private def getIndex(attribute: String) = {
26 |     attributeIndexMap.get(attribute)
27 |   }
28 | 
29 |   def getId(dataPoint: String): String = {
30 |     val typeIndex = attributeIndexMap("id")
31 |     dataPoint.split(sep)(typeIndex)
32 |   }
33 | 
34 |   def getType(dataPoint: String): String = {
35 |     val typeIndex = attributeIndexMap("primary_type")
36 |     dataPoint.split(sep)(typeIndex)
37 |   }
38 | 
39 |   def getX(dataPoint: String): Double = {
40 |     val xIndex = attributeIndexMap("x_coordinate")
41 |     dataPoint.split(sep)(xIndex).toDouble
42 |   }
43 | 
44 |   def getY(dataPoint: String): Double = {
45 |     val yIndex = attributeIndexMap("y_coordinate")
46 |     dataPoint.split(sep)(yIndex).toDouble
47 |   }
48 | 
49 |   def getAttributeList: List[String] = {
50 |     attributeList
51 |   }
52 | 
53 |   def getKeyForGridMapping(dataPoint: String): String = {
54 |     val keyIndex = attributeIndexMap("district")
55 |     dataPoint.split(sep)(keyIndex)
56 |   }
57 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/DistanceMeasure.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.utils
 2 | 
 3 | import org.slf4j.Logger
 4 | import org.slf4j.LoggerFactory
 5 | 
 6 | object DistanceMeasure {
 7 | 
 8 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
 9 | 
10 |   private val earthRadius: Double = 6371 //in km
11 | 
12 |   def haversineDistance(lat1: Double, lng1: Double, lat2: Double, lng2: Double) = {
13 |     /*
14 |  * Function to return distance between two points (represented by latitude and longitude).
15 |  * The distance is returned in km
16 |  */
17 |     val dLat = Math.toRadians(lat2 - lat1)
18 |     val dLng = Math.toRadians(lng2 - lng1)
19 |     val a = Math.sin(dLat / 2) * Math.sin(dLat / 2) +
20 |       Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2)) *
21 |       Math.sin(dLng / 2) * Math.sin(dLng / 2)
22 |     val c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
23 |     earthRadius * c
24 |   }
25 | 
26 |   def euclideanDistance(x1: Double, y1: Double, x2: Double, y2: Double) = {
27 |     /*
28 |  * Function to return distance between two points in the euclidean space.
29 |  * The unit is not known but can be worked out.
30 |  */
31 |     math.sqrt(math.pow(x1 - x2, 2) + math.pow(y1 - y2, 2))
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/HBaseUtil.scala:
--------------------------------------------------------------------------------
  1 | package com.github.locis.utils
  2 | 
  3 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter
  4 | 
  5 | import org.apache.hadoop.hbase.HBaseConfiguration
  6 | import org.apache.hadoop.hbase.HColumnDescriptor
  7 | import org.apache.hadoop.hbase.HTableDescriptor
  8 | import org.apache.hadoop.hbase.TableName
  9 | import org.apache.hadoop.hbase.client.ConnectionFactory
 10 | import org.apache.hadoop.hbase.client.Get
 11 | import org.apache.hadoop.hbase.client.HTable
 12 | import org.apache.hadoop.hbase.client.Put
 13 | import org.apache.hadoop.hbase.client.Result
 14 | import org.apache.hadoop.hbase.client.Scan
 15 | import org.apache.hadoop.hbase.util.Bytes
 16 | import org.slf4j.Logger
 17 | import org.slf4j.LoggerFactory
 18 | 
 19 | class HBaseUtil {
 20 | 
 21 |   /*
 22 |  * This class contains all the HBase related logic. 
 23 |  * See : https://github.com/shagunsodhani/locis/issues/10
 24 |  * 		 : https://github.com/shagunsodhani/locis/issues/12	
 25 |  */
 26 | 
 27 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
 28 | 
 29 |   private val conf = HBaseConfiguration.create()
 30 |   private val connection = ConnectionFactory.createConnection(conf)
 31 |   private val admin = connection.getAdmin()
 32 |   private val instanceCountTableName = "InstanceCount"
 33 |   private val colocationStoreTableName = "ColocationStore"
 34 | 
 35 |   private def isTableExist(tableName: TableName) = {
 36 |     admin.tableExists(tableName)
 37 |   }
 38 | 
 39 |   private def createTable(tableName: String, columnList: List[String]) = {
 40 |     /*
 41 |  * This method creates a table with a given name.
 42 |  * If a table with same name exists, nothing is done.	
 43 |  */
 44 |     val _tableName = TableName.valueOf(tableName)
 45 |     if (!isTableExist(_tableName)) {
 46 |       val tableDescriptor = new HTableDescriptor(_tableName)
 47 |       columnList.foreach { x =>
 48 |         tableDescriptor.addFamily(new HColumnDescriptor(x))
 49 |       }
 50 |       admin.createTable(tableDescriptor)
 51 |     }
 52 |   }
 53 | 
 54 |   def createInstanceCountTable() = {
 55 |     /*
 56 |  * This method creates a table to track the instance counts for each event type.
 57 |  * For Data Model, see : https://github.com/shagunsodhani/locis/blob/master/docs/implementation.md#using-hbase-data-model
 58 |  */
 59 |     val columnList = List("size")
 60 |     createTable(instanceCountTableName, columnList)
 61 |   }
 62 | 
 63 |   def createColocationStoreTable() = {
 64 |     /*
 65 |  * This method creates a table to store colocations of different sizes.
 66 |  * For Data Model, see : https://github.com/shagunsodhani/locis/blob/master/docs/implementation.md#using-hbase-data-model
 67 |  */
 68 |     val columnList = List("size")
 69 |     createTable(colocationStoreTableName, columnList)
 70 |   }
 71 | 
 72 |   def writeToInstanceCountTable(rowkeyValueList: List[(String, String)]) = {
 73 |     val table = new HTable(conf, instanceCountTableName)
 74 |     val sizeByteArray = Bytes.toBytes("1")
 75 |     rowkeyValueList.foreach {
 76 |       rowkeyValue =>
 77 |         {
 78 |           val rowKey = rowkeyValue._1
 79 |           val value = rowkeyValue._2
 80 |           val row = new Put(Bytes.toBytes(rowKey))
 81 |           row.addImmutable(Bytes.toBytes("size"), sizeByteArray,
 82 |             Bytes.toBytes(value))
 83 |           table.put(row)
 84 |         }
 85 |         table.flushCommits()
 86 |         table.close()
 87 |     }
 88 |   }
 89 | 
 90 |   def writeToColocationStoreTable(rowkeyValueList: List[(String, String)],
 91 |                                   size: Int) = {
 92 |     val table = new HTable(conf, colocationStoreTableName)
 93 |     val sizeByteArray = Bytes.toBytes(size.toString())
 94 |     rowkeyValueList.foreach {
 95 |       rowkeyValue =>
 96 |         {
 97 |           val rowKey = rowkeyValue._1
 98 |           val value = rowkeyValue._2
 99 |           val row = new Put(Bytes.toBytes(rowKey))
100 |           row.addImmutable(Bytes.toBytes("size"), sizeByteArray,
101 |             Bytes.toBytes(value))
102 |           table.put(row)
103 |         }
104 |         table.flushCommits()
105 |         table.close()
106 |     }
107 |   }
108 | 
109 |   def deleteTable(tableName: String) = {
110 |     val _tableName = TableName.valueOf(tableName)
111 |     if (admin.tableExists(_tableName)) {
112 |       admin.disableTable(_tableName)
113 |       admin.deleteTable(_tableName)
114 |     }
115 |   }
116 | 
117 |   def scanColocationStoreColumn(columnName: String, size: Int): Iterable[String] = {
118 |     /*
119 |      * Method to scan a column in HBase.
120 |      * See : https://github.com/shagunsodhani/locis/issues/15
121 |      */
122 |     val colocationStoreTable = new HTable(conf, colocationStoreTableName)
123 |     val scan = new Scan()
124 |     scan.addColumn(Bytes.toBytes(columnName), Bytes.toBytes(size.toString()))
125 |     colocationStoreTable
126 |       .getScanner(scan)
127 |       .asScala
128 |       .map { result => Bytes.toString(result.getRow) }
129 |   }
130 | 
131 |   def readColocationStoreRow(rowName: String, size: Int): String = {
132 |     /*
133 |      * Method to read a row in HBase.
134 |      * See : https://github.com/shagunsodhani/locis/issues/15
135 |      */
136 |     val colocationStoreTable = new HTable(conf, colocationStoreTableName)
137 |     val get = new Get(Bytes.toBytes(rowName))
138 |     val result: Result = colocationStoreTable.get(get)
139 |     Bytes.toString(result.getValue(Bytes.toBytes("size"), Bytes.toBytes((size).toString())))
140 |   }
141 | 
142 |   def readInstanceCountRow(rowName: String, size: Int = 1): String = {
143 |     /*
144 |      * Method to read a row in HBase.
145 |      * See : https://github.com/shagunsodhani/locis/issues/15
146 |      */
147 |     val instanceCountTable = new HTable(conf, instanceCountTableName)
148 |     val get = new Get(Bytes.toBytes(rowName))
149 |     val result: Result = instanceCountTable.get(get)
150 |     Bytes.toString(result.getValue(Bytes.toBytes("size"), Bytes.toBytes((size).toString())))
151 |   }
152 | 
153 |   def scanInstanceCountColumn(columnName: String, size: Int = 1): Iterable[String] = {
154 |     /*
155 |      * Method to scan a column in HBase.
156 |      * See : https://github.com/shagunsodhani/locis/issues/15
157 |      */
158 |     val instanceCountTable = new HTable(conf, instanceCountTableName)
159 |     val scan = new Scan()
160 |     scan.addColumn(Bytes.toBytes(columnName), Bytes.toBytes(size.toString()))
161 |     instanceCountTable
162 |       .getScanner(scan)
163 |       .asScala
164 |       .map { result => Bytes.toString(result.getRow) }
165 |   }
166 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/HDFSWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.utils
 2 | 
 3 | import org.apache.hadoop.conf.Configuration
 4 | import org.apache.hadoop.fs.FileSystem
 5 | import org.apache.hadoop.fs.LocalFileSystem
 6 | import org.apache.hadoop.fs.Path
 7 | import org.apache.hadoop.hdfs.DistributedFileSystem
 8 | import org.apache.hadoop.util.Progressable
 9 | import org.slf4j.Logger
10 | import org.slf4j.LoggerFactory
11 | 
12 | import com.typesafe.config.ConfigFactory
13 | 
14 | class HDFSWriter {
15 | 
16 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
17 | 
18 |   private val configuration = new Configuration()
19 |   configuration.set("fs.defaultFS", ConfigFactory.load.getString("fs.defaultFS"))
20 |   configuration.set("fs.hdfs.impl", classOf[org.apache.hadoop.hdfs.DistributedFileSystem].getName())
21 |   configuration.set("fs.file.impl", classOf[org.apache.hadoop.fs.LocalFileSystem].getName())
22 | 
23 |   private val fileSystem = FileSystem.get(configuration)
24 | 
25 |   def getWriter(outputFilePath: String) = {
26 |     val progressable = new Progressable() {
27 |       @Override
28 |       def progress() {
29 |         print(".");
30 |       }
31 |     }
32 |     fileSystem.create(new Path(outputFilePath), progressable)
33 |   }
34 | }


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/Mysql.scala:
--------------------------------------------------------------------------------
 1 | package com.github.locis.utils
 2 | 
 3 | import scala.concurrent.Await
 4 | import scala.concurrent.Future
 5 | import scala.concurrent.duration.Duration
 6 | 
 7 | import org.slf4j.Logger
 8 | import org.slf4j.LoggerFactory
 9 | 
10 | import com.github.mauricio.async.db.Configuration
11 | import com.github.mauricio.async.db.QueryResult
12 | import com.github.mauricio.async.db.ResultSet
13 | import com.github.mauricio.async.db.mysql.MySQLConnection
14 | import com.typesafe.config.ConfigFactory
15 | 
16 | class Mysql {
17 | 
18 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
19 | 
20 |   private val mysqlConnection: com.github.mauricio.async.db.mysql.MySQLConnection = {
21 |     val configuration: Configuration = {
22 |       val username: String = ConfigFactory.load.getString("mysql.username")
23 |       val host: String = ConfigFactory.load.getString("mysql.host")
24 |       val port: Int = ConfigFactory.load.getInt("mysql.port")
25 |       val password: String = ConfigFactory.load.getString("mysql.password")
26 |       val database: String = ConfigFactory.load.getString("mysql.database")
27 |       new Configuration(username, host, port, Option(password), Option(database))
28 |     }
29 |     val connection = new MySQLConnection(configuration)
30 |     Await.result(connection.connect, Duration.Inf)
31 |     connection
32 |   }
33 | 
34 |   def runQuery(sqlQuery: String): ResultSet = {
35 |     val future: Future[QueryResult] = mysqlConnection
36 |       .sendPreparedStatement(sqlQuery)
37 |     Await.result(future, Duration.Inf).rows.get
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author shagun
6 |  *
7 |  */
8 | package com.github.locis.utils;


--------------------------------------------------------------------------------