├── .gitignore ├── README.md ├── docs ├── implementation.md ├── known-issues.md └── paper │ └── paper.pdf ├── pom.xml ├── sampleData └── data ├── scripts ├── database │ ├── __init__.py │ └── mysql.py ├── schema │ └── spatium.sql ├── scrapper │ ├── __init__.py │ └── socrata.py └── utils │ ├── __init__.py │ └── configParser.py └── src └── main ├── resources ├── log4j.properties └── reference.conf.sample └── scala └── com └── github └── locis ├── apps ├── CountInstance.scala ├── DataLoader.scala ├── MapReduceJob.scala ├── NeighborGrouping.scala ├── NeighborSearch.scala ├── PatternSearch.scala └── package-info.java ├── map ├── CountInstanceMapper.scala ├── NeighborGroupingMapper.scala ├── NeighborSearchMapper.scala ├── PatternSearchMapper.scala └── package-info.java ├── reduce ├── CountInstanceReducer.scala ├── NeighborGroupingReducer.scala ├── NeighborSearchReducer.scala ├── PatternSearchReducer.scala └── package-info.java └── utils ├── ConfigUtils.scala ├── DataParser.scala ├── DistanceMeasure.scala ├── HBaseUtil.scala ├── HDFSWriter.scala ├── Mysql.scala └── package-info.java /.gitignore: -------------------------------------------------------------------------------- 1 | ############################## 2 | ####gitignore for eclipse##### 3 | ############################## 4 | .metadata 5 | bin/ 6 | tmp/ 7 | *.tmp 8 | *.bak 9 | *.swp 10 | *~.nib 11 | local.properties 12 | .settings/ 13 | .loadpath 14 | .recommenders 15 | 16 | # Eclipse Core 17 | .project 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # PyDev specific (Python IDE for Eclipse) 26 | *.pydevproject 27 | 28 | # CDT-specific (C/C++ Development Tooling) 29 | .cproject 30 | 31 | # JDT-specific (Eclipse Java Development Tools) 32 | .classpath 33 | 34 | # Java annotation processor (APT) 35 | .factorypath 36 | 37 | # PDT-specific (PHP Development Tools) 38 | .buildpath 39 | 40 | # sbteclipse plugin 41 | .target 42 | 43 | # Tern plugin 44 | .tern-project 45 | 46 | # TeXlipse plugin 47 | .texlipse 48 | 49 | # STS (Spring Tool Suite) 50 | .springBeans 51 | 52 | # Code Recommenders 53 | .recommenders/ 54 | 55 | ############################## 56 | #####gitignore for scala###### 57 | ############################## 58 | *.class 59 | *.log 60 | 61 | # sbt specific 62 | .cache 63 | .history 64 | .lib/ 65 | dist/* 66 | target/ 67 | lib_managed/ 68 | src_managed/ 69 | project/boot/ 70 | project/plugins/project/ 71 | 72 | # Scala-IDE specific 73 | .scala_dependencies 74 | .worksheet 75 | .cache-main 76 | .cache-tests 77 | 78 | ############################## 79 | ######gitignore for app####### 80 | ############################## 81 | 82 | src/main/resources/reference.conf 83 | data/* 84 | 85 | ############################## 86 | ####gitignore for python###### 87 | ############################## 88 | 89 | # Byte-compiled / optimized / DLL files 90 | __pycache__/ 91 | *.py[cod] 92 | *$py.class 93 | 94 | # C extensions 95 | *.so 96 | 97 | # Distribution / packaging 98 | .Python 99 | env/ 100 | build/ 101 | develop-eggs/ 102 | dist/ 103 | downloads/ 104 | eggs/ 105 | .eggs/ 106 | lib/ 107 | lib64/ 108 | parts/ 109 | sdist/ 110 | var/ 111 | *.egg-info/ 112 | .installed.cfg 113 | *.egg 114 | 115 | # PyInstaller 116 | # Usually these files are written by a python script from a template 117 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
118 | *.manifest
119 | *.spec
120 | 
121 | # Installer logs
122 | pip-log.txt
123 | pip-delete-this-directory.txt
124 | 
125 | # Unit test / coverage reports
126 | htmlcov/
127 | .tox/
128 | .coverage
129 | .coverage.*
130 | .cache
131 | nosetests.xml
132 | coverage.xml
133 | *,cover
134 | .hypothesis/
135 | 
136 | # Translations
137 | *.mo
138 | *.pot
139 | 
140 | # Django stuff:
141 | *.log
142 | local_settings.py
143 | 
144 | # Flask instance folder
145 | instance/
146 | 
147 | # Scrapy stuff:
148 | .scrapy
149 | 
150 | # Sphinx documentation
151 | docs/_build/
152 | 
153 | # PyBuilder
154 | target/
155 | 
156 | # IPython Notebook
157 | .ipynb_checkpoints
158 | 
159 | # pyenv
160 | .python-version
161 | 
162 | # celery beat schedule file
163 | celerybeat-schedule
164 | 
165 | # dotenv
166 | .env
167 | 
168 | # virtualenv
169 | venv/
170 | ENV/
171 | 
172 | # Spyder project settings
173 | .spyderproject
174 | 
175 | # Rope project settings
176 | .ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # locis
2 | Implementation of the paper [A Parallel Spatial Co-location Mining Algorithm Based on MapReduce](docs/paper/paper.pdf)
3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.971748.svg)](https://doi.org/10.5281/zenodo.971748)
4 | 
5 | ## Colocation Pattern
6 | 
7 | A spatial colocation pattern is a set of features that co-occur in space. For example, two crimes, say Robbery and Assault, form a colocation pattern if they are reported together at many places. Think of spatial colocation pattern mining as [association rule mining](https://en.wikipedia.org/wiki/Association_rule_learning) in the spatial domain.
8 | 
9 | ## Setup
10 | 
11 | * Download and set up Scala, Hadoop (with HDFS) and HBase using the versions given [here](docs/implementation.md).
12 | * Refer to [this](https://github.com/shagunsodhani/book-keeper) for sample Hadoop and HBase configuration values in pseudo-distributed mode, and to [this](docs/known-issues.md) for some known issues when setting up HBase.
13 | * Start Hadoop using `$HADOOP_HOME/sbin/start-dfs.sh` and HBase using `$HBASE_HOME/bin/start-hbase.sh`.
14 | * Verify that Hadoop and HBase are working properly by opening [http://localhost:50070/](http://localhost:50070/) and [http://localhost:16010/](http://localhost:16010/) respectively.
15 | * Copy `src/main/resources/reference.conf.sample` to `src/main/resources/reference.conf` and populate the values.
16 | * Run `mvn clean install` in the project folder.
17 | 
18 | ### To download the dataset
19 | 
20 | * Obtain an application token from the [Socrata portal](https://dev.socrata.com/register) and copy it to the `socrata.key` field in `reference.conf`.
21 | * Load the schema from `scripts/schema/spatium.sql` into MySQL.
22 | * Run `python scripts/scrapper/socrata.py`.
23 | 
24 | ### To load data into HDFS
25 | 
26 | * Run `scala -cp target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.DataLoader <hdfs path>`
27 | * If no path is provided, it writes to `/user/locis/input/data`.
28 | 
29 | ### Dummy Dataset
30 | 
31 | * A very small dataset (6 rows) is available in the `sampleData/data` file. It can be used for testing the different MapReduce tasks without having to download the Socrata dataset.
32 | * Add the file to HDFS using the put command `$HADOOP_HOME/bin/hdfs dfs -put <path to repo>/sampleData/data <hdfs path>`, then proceed to run the MapReduce tasks as shown in the sketch below.
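
Each row of `sampleData/data` follows the column order expected by `DataParser` (`id, primary_type, x_coordinate, y_coordinate, longitude, latitude, district, ward, community_area, fbi_code`). Below is a minimal sketch of how the MapReduce tasks read such a row through the repository's own `DataParser`; the `SampleRowDemo` object is only illustrative and not part of the project.

```scala
import com.github.locis.utils.DataParser

object SampleRowDemo {
  def main(args: Array[String]): Unit = {
    // First row of sampleData/data: id=1, type=A, x=1, y=1, district=1
    val row = "1,A,1,1,0,0,1,0,0,0"
    println(DataParser.getId(row))                // "1"
    println(DataParser.getType(row))              // "A"  -> event type
    println(DataParser.getX(row))                 // 1.0  -> x_coordinate
    println(DataParser.getY(row))                 // 1.0  -> y_coordinate
    println(DataParser.getKeyForGridMapping(row)) // "1"  -> district, used as the grid id
  }
}
```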
33 | 
34 | ### To run the Neighbour Search MapReduce task
35 | 
36 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.NeighborSearch <input path> <output path>`
37 | 
38 | ### To run the Neighbour Grouping MapReduce task
39 | 
40 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.NeighborGrouping <input path> <output path>`
41 | 
42 | ### To run the Count Instance MapReduce task
43 | 
44 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.CountInstance <input path> <output path>`
45 | 
46 | ### To run the Colocation Pattern Search MapReduce task
47 | 
48 | * Run `$HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar com.github.locis.apps.PatternSearch <input path> <output path> <size k>`
49 | 
50 | Note that to run the colocation pattern search task for size *k*, the results for sizes 1 to *k-1* must already be in the database. So, to find colocation patterns of size *k*, run the task for sizes 1 through *k*, not just for *k*. This can easily be automated with a bash script.
51 | 
52 | ### License
53 | 
54 | [MIT](https://shagun.mit-license.org/)
55 | 
--------------------------------------------------------------------------------
/docs/implementation.md:
--------------------------------------------------------------------------------
1 | ## Dataset
2 | 
3 | [Crime data for the city of Chicago](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2)
4 | 
5 | ## Stack
6 | 
7 | | Tech   | Version |
8 | |--------|---------|
9 | | Scala  | 2.11.7  |
10 | | Java   | 1.8     |
11 | | Hadoop | 2.7.2   |
12 | | HBase  | 1.2.1   |
13 | 
14 | ## Algorithm
15 | 
16 | * Input: data in the form *(id, type, (latitude, longitude))*.
17 | * Map data points to different grids.
18 | * Use the plane-sweep algorithm to find the neighbors of each data point.
19 | * Perform neighbor grouping.
20 | * Count the instances of the different types.
21 | * Generate size-*k* co-locations.
22 | 
23 | ## Using HBase Data Model
24 | 
25 | HBase is used in the following places:
26 | 
27 | * Save *(event, count)* pairs in the reducer for [counting instances of different event types](https://github.com/shagunsodhani/locis/issues/8). Here, we can use *event* as the *row key* and *count* as the *value*.
28 | 
29 | * Save prevalent colocation patterns in the reducer for [co-location pattern search](https://github.com/shagunsodhani/locis/issues/7). Here, we can use the *eventset* as the *row key*, *size* as the *column key* and *[instance]* as the *value*.
30 | 
31 | * Read the size *k-1* colocations in the *scanNTransactions* method of the mapper for [co-location pattern search](https://github.com/shagunsodhani/locis/issues/7). Here, the lookup can be performed easily using the *row key* for a given size (*column key*).
32 | 
33 | ## Notes
34 | 
35 | * The algorithm does not perform candidate set generation.
--------------------------------------------------------------------------------
/docs/known-issues.md:
--------------------------------------------------------------------------------
1 | ### HBase
2 | 
3 | * In pseudo-distributed mode, if HMaster does not come up after a single run of `$HBASE_HOME/bin/start-hbase.sh`, run the command one more time.
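* If in doubt, a minimal sketch like the one below (using the same HBase 1.2 client API as `HBaseUtil.scala`, with `hbase-site.xml` on the classpath) can confirm from code that HMaster is reachable: listing the table names fails fast when it is not up. The `HBaseHealthCheck` object is only illustrative and not part of the project.

```scala
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.ConnectionFactory

object HBaseHealthCheck {
  def main(args: Array[String]): Unit = {
    // Picks up hbase-site.xml from the classpath, as HBaseUtil does
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    try {
      // Listing the table names round-trips through HMaster
      connection.getAdmin.listTableNames().foreach(t => println(t.getNameAsString))
    } finally {
      connection.close()
    }
  }
}
```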
-------------------------------------------------------------------------------- /docs/paper/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/docs/paper/paper.pdf -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | locis 5 | locis 6 | 0.0.1-SNAPSHOT 7 | 8 | 1.6 9 | 1.6 10 | UTF-8 11 | 2.11.7 12 | 13 | 14 | 15 | 16 | org.scala-lang 17 | scala-library 18 | 2.11.7 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-common 23 | 2.7.2 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-client 28 | 2.7.2 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-hdfs 33 | 2.7.2 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-mapreduce-client-core 38 | 2.7.2 39 | 40 | 41 | com.github.mauricio 42 | mysql-async_2.11 43 | 0.2.19 44 | 45 | 46 | org.slf4j 47 | slf4j-api 48 | 1.7.21 49 | 50 | 51 | org.slf4j 52 | slf4j-simple 53 | 1.7.21 54 | 55 | 56 | com.typesafe 57 | config 58 | 1.3.0 59 | 60 | 61 | org.apache.hbase 62 | hbase-client 63 | 1.2.1 64 | 65 | 66 | 67 | locis 68 | src/main/scala 69 | src/test/scala 70 | 71 | 72 | org.scala-tools 73 | maven-scala-plugin 74 | 2.15.0 75 | 76 | 77 | 78 | compile 79 | testCompile 80 | 81 | 82 | 83 | 84 | 85 | org.apache.maven.plugins 86 | maven-jar-plugin 87 | 2.3 88 | 89 | ${basedir}/target 90 | 91 | 92 | 93 | org.apache.maven.plugins 94 | maven-shade-plugin 95 | 2.3 96 | 97 | 98 | package 99 | 100 | shade 101 | 102 | 103 | 104 | 105 | 106 | 107 | *:* 108 | 109 | META-INF/*.SF 110 | META-INF/*.DSA 111 | META-INF/*.RSA 112 | 113 | 114 | 115 | uber-${project.artifactId}-${project.version} 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /sampleData/data: -------------------------------------------------------------------------------- 1 | 1,A,1,1,0,0,1,0,0,0 2 | 2,A,0,0,0,0,1,0,0,0 3 | 3,B,1,2,0,0,1,0,0,0 4 | 4,C,1,3,0,0,1,0,0,0 5 | 5,D,2,3,0,0,1,0,0,0 6 | 6,E,1171610,11903564,0,0,2,0,0,0 -------------------------------------------------------------------------------- /scripts/database/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/database/__init__.py -------------------------------------------------------------------------------- /scripts/database/mysql.py: -------------------------------------------------------------------------------- 1 | import os 2 | from utils.configParser import parse 3 | import MySQLdb 4 | 5 | def connect(config_path): 6 | 7 | '''Open database connection and return conn object to perform database queries''' 8 | config = parse(config_path) 9 | host = config['mysql.host'] 10 | user = config['mysql.username'] 11 | passwd = config['mysql.password'] 12 | db = config['mysql.database'] 13 | 14 | try: 15 | conn=MySQLdb.connect(host, user, passwd, db) 16 | return conn 17 | except MySQLdb.Error, e: 18 | print "ERROR %d IN CONNECTION: %s" % (e.args[0], e.args[1]) 19 | 20 | 21 | def write(sql,cursor,conn): 22 | '''Perform insert and update operations on the databse. 
23 | Need to pass the cursor object as a parameter''' 24 | try: 25 | cursor.execute(sql) 26 | conn.commit() 27 | except MySQLdb.ProgrammingError, e: 28 | print "ERROR %d IN WRITE OPERATION: %s" % (e.args[0], e.args[1]) 29 | print "LAST QUERY WAS: %s" %sql 30 | 31 | 32 | def read(sql,cursor): 33 | '''Perform read operations on the databse. 34 | Need to pass the cursor object as a parameter''' 35 | try: 36 | cursor.execute(sql) 37 | result = cursor.fetchall() 38 | return result 39 | except MySQLdb.ProgrammingError, e: 40 | print "ERROR %d IN READ OPERATION: %s" % (e.args[0], e.args[1]) 41 | print "LAST QUERY WAS: %s" %sql 42 | 43 | 44 | -------------------------------------------------------------------------------- /scripts/schema/spatium.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 4.0.10deb1 3 | -- http://www.phpmyadmin.net 4 | -- 5 | -- Host: localhost 6 | -- Generation Time: Apr 24, 2016 at 09:15 AM 7 | -- Server version: 5.5.49-0ubuntu0.14.04.1 8 | -- PHP Version: 5.5.9-1ubuntu4.16 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | SET time_zone = "+00:00"; 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | 19 | -- 20 | -- Database: `spatium` 21 | -- 22 | 23 | -- -------------------------------------------------------- 24 | 25 | -- 26 | -- Table structure for table `dataset` 27 | -- 28 | 29 | CREATE TABLE IF NOT EXISTS `dataset` ( 30 | `id` int(11) NOT NULL, 31 | `longitude` double(25,15) NOT NULL, 32 | `latitude` double(25,15) NOT NULL, 33 | `primary_type` varchar(100) NOT NULL, 34 | `date` int(11) NOT NULL, 35 | `x_coordinate` int(11) NOT NULL, 36 | `y_coordinate` int(11) NOT NULL, 37 | `district` int(11) NOT NULL, 38 | `ward` int(11) NOT NULL, 39 | `community_area` int(11) NOT NULL, 40 | `fbi_code` varchar(20) NOT NULL, 41 | PRIMARY KEY (`id`), 42 | KEY `primary_type` (`primary_type`), 43 | KEY `date` (`date`) 44 | ) ENGINE=InnoDB DEFAULT CHARSET=latin1; 45 | 46 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 47 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 48 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 49 | -------------------------------------------------------------------------------- /scripts/scrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/scrapper/__init__.py -------------------------------------------------------------------------------- /scripts/scrapper/socrata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import datetime 4 | import requests 5 | import os 6 | 7 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | 9 | if not path in sys.path: 10 | sys.path.insert(1, path) 11 | del path 12 | 13 | from utils.configParser import parse 14 | import database.mysql as db 15 | 16 | def date_to_timestamp(stime): 17 | stime = stime.split('T') 18 | date = stime[0] 19 | temp = date.split("-") 20 | a = [] 21 | a.append(int(temp[0])) 22 | a.append(int(temp[1])) 23 | a.append(int(temp[2])) 24 | date = stime[1].split(':') 25 | for i in date: 26 | a.append(int(i)) 27 | a = 
datetime.datetime(a[0], a[1], a[2], a[3], a[4], a[5]).timetuple() 28 | # year, month, day, hour, minute, second, microsecond, and tzinfo. 29 | return int(time.mktime(a)) 30 | 31 | class Socrata(): 32 | """Class to fetch data using socrata API""" 33 | 34 | def __init__(self, limit, config_path): 35 | 36 | self.conn = db.connect(config_path) 37 | self.cursor = self.conn.cursor() 38 | self.url = "https://data.cityofchicago.org/resource/ijzp-q8t2.json" 39 | self.limit = limit 40 | config = parse(config_path) 41 | self.socrata_key = config["socrata.key"] 42 | 43 | def fetch_json(self, offset=0): 44 | payload = {'$limit': self.limit, '$offset': offset, '$$app_token':self.socrata_key} 45 | 46 | try : 47 | r = requests.get(self.url, params=payload) 48 | except requests.exceptions.ChunkedEncodingError: 49 | print payload 50 | return self.fetch_json(offset = offset) 51 | 52 | to_save = ['latitude', 'longitude', 'id', 'primary_type','date', 'x_coordinate', 'y_coordinate', 53 | 'district', 'ward', 'community_area', 'fbi_code'] 54 | print r.url 55 | if r.json(): 56 | sql = "INSERT INTO dataset (" 57 | for i in to_save: 58 | sql+=i+" , " 59 | sql=sql[:-2] 60 | sql+= ") VALUES " 61 | for i in r.json(): 62 | to_insert = "( " 63 | for j in to_save: 64 | if j not in i.keys(): 65 | i[j] = "\'\'" 66 | else: 67 | if j == 'date': 68 | i[j] = str(date_to_timestamp(i[j])) 69 | i[j] = "\'"+i[j]+"\'" 70 | to_insert+=i[j]+", " 71 | to_insert = to_insert[:-2] 72 | to_insert+='), ' 73 | sql+=to_insert 74 | sql = sql[:-2] 75 | db.write(sql, self.cursor, self.conn) 76 | return 1 77 | else: 78 | return 0 79 | 80 | def fetch_all(self, offset = 0): 81 | while(self.fetch_json(offset = offset)): 82 | offset+=self.limit 83 | print offset, " elements inserted in db." 84 | 85 | 86 | 87 | if __name__ == '__main__': 88 | config_path = "src/main/resources/reference.conf" 89 | a = Socrata(limit = 1000, config_path = config_path) 90 | offset = 4870000 91 | # a.fetch_json(offset) 92 | a.fetch_all(offset = 0) -------------------------------------------------------------------------------- /scripts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/locis/1906620280165ba2c3553ba5b75990759bdce20d/scripts/utils/__init__.py -------------------------------------------------------------------------------- /scripts/utils/configParser.py: -------------------------------------------------------------------------------- 1 | def parse(file_path): 2 | # Method to read the config file. 3 | # Using a custom function for parsing so that we have only one config for 4 | # both the scripts and the mapreduce tasks. 
5 | config = {} 6 | with open(file_path) as f: 7 | for line in f: 8 | data = line.strip() 9 | if(data and not data.startswith("#")): 10 | (key, value) = data.split("=") 11 | config[key] = value 12 | return config -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console and file 2 | LOG_LEVEL=INFO 3 | COMPONENT=locis 4 | 5 | log4j.rootCategory=${LOG_LEVEL}, console, file 6 | 7 | #Console appender 8 | log4j.appender.console=org.apache.log4j.ConsoleAppender 9 | log4j.appender.console.target=System.out 10 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: [${COMPONENT}] %m%n 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.parquet.hadoop.ParquetInputFormat=WARN 15 | log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=WARN 16 | log4j.logger.org.eclipse.jetty=WARN 17 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 18 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 19 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 20 | 21 | # Direct log messages to a log file 22 | log4j.appender.file=org.apache.log4j.RollingFileAppender 23 | log4j.appender.file.File=logging.log 24 | log4j.appender.file.MaxFileSize=10MB 25 | log4j.appender.file.MaxBackupIndex=10 26 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 27 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - [${COMPONENT}] %m%n 28 | -------------------------------------------------------------------------------- /src/main/resources/reference.conf.sample: -------------------------------------------------------------------------------- 1 | #mysql config 2 | 3 | mysql.username= 4 | mysql.host= 5 | mysql.port= 6 | mysql.password= 7 | mysql.database= 8 | 9 | #Hadoop FileSystem config 10 | 11 | fs.defaultFS= 12 | 13 | distance.threshold=50000 14 | # Note that this value is with respect to x-y coordinate system. 15 | participationIndex.threshold=0.1 16 | 17 | socrata.key= -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/CountInstance.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.Job 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 9 | 10 | import com.github.locis.map.CountInstanceMapper 11 | import com.github.locis.reduce.CountInstanceReducer 12 | import com.github.locis.utils.HBaseUtil 13 | 14 | object CountInstance extends MapReduceJob { 15 | 16 | private val hBaseUtil = new HBaseUtil() 17 | 18 | def jobName: String = { 19 | "CountInstance" 20 | } 21 | 22 | override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " + 23 | "com.github.locis.apps." 
+ jobName + " " 24 | 25 | def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = { 26 | hBaseUtil.createInstanceCountTable() 27 | val job = new Job(hadoopConfiguration, jobName) 28 | job.setMapperClass(classOf[CountInstanceMapper]) 29 | job.setReducerClass(classOf[CountInstanceReducer]) 30 | job.setMapOutputKeyClass(classOf[Text]) 31 | job.setMapOutputValueClass((classOf[LongWritable])) 32 | job.setOutputKeyClass(classOf[Text]) 33 | job.setOutputValueClass(classOf[LongWritable]) 34 | FileInputFormat.addInputPath(job, inputPath) 35 | FileOutputFormat.setOutputPath(job, outputPath) 36 | val status = if (job.waitForCompletion(true)) 0 else 1 37 | System.exit(status) 38 | } 39 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/DataLoader.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.slf4j.Logger 4 | import org.slf4j.LoggerFactory 5 | 6 | import com.github.locis.utils.DataParser 7 | import com.github.locis.utils.HDFSWriter 8 | import com.github.locis.utils.Mysql 9 | 10 | object DataLoader { 11 | 12 | private val logger: Logger = LoggerFactory.getLogger(getClass) 13 | 14 | def getCountOfRows() = { 15 | val mysql = new Mysql() 16 | val sqlQuery = "SELECT count(*) FROM dataset" 17 | mysql.runQuery(sqlQuery) 18 | } 19 | 20 | def loadData(limit: Long = -1, start: Long = -1) = { 21 | // Be careful with limit=-1. It would bring in a lot of data. (~500 mb) 22 | val mysql = new Mysql() 23 | val baseQuery = "SELECT " + DataParser.getAttributeList.mkString(",") + 24 | " FROM dataset ORDER BY date ASC" 25 | val sqlQuery = { 26 | if (limit < 0) { 27 | baseQuery 28 | } else if (start < 0) { 29 | baseQuery + " LIMIT " + limit.toString() 30 | } else { 31 | baseQuery + " LIMIT " + start.toString() + ", " + limit.toString() 32 | } 33 | } 34 | mysql.runQuery(sqlQuery) 35 | } 36 | 37 | def writeData(pathToWrite: String = "/user/locis/input/data", limit: Long = -1, start: Long = -1) = { 38 | val writer = new HDFSWriter().getWriter(pathToWrite) 39 | val result = loadData(limit, start) 40 | result.foreach { x => writer.write((x.mkString(",") + "\n").getBytes) } 41 | writer.close() 42 | } 43 | 44 | def writeAllData(pathToWrite: String = "/user/locis/input/data") = { 45 | val writer = new HDFSWriter().getWriter(pathToWrite) 46 | val start: Long = 0 47 | val limit: Long = 100000 48 | val countOfRows: Long = getCountOfRows()(0)(0).asInstanceOf[Long] 49 | val numberOfIterations = countOfRows / limit + 1 50 | (1 to numberOfIterations.asInstanceOf[Int]).foreach { 51 | iterationCounter => 52 | { 53 | loadData(limit, start + limit * iterationCounter) 54 | .foreach { x => writer.write((x.mkString(",") + "\n").getBytes) } 55 | } 56 | } 57 | writer.close() 58 | } 59 | 60 | def main(args: Array[String]): Unit = { 61 | val limit: Long = 10 62 | val pathToWrite = { 63 | if (args.isEmpty) { 64 | "/user/locis/input/data" 65 | } else { 66 | args(0) 67 | } 68 | } 69 | writeAllData() 70 | } 71 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/MapReduceJob.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.util.GenericOptionsParser 5 | import org.slf4j.Logger 6 | import org.slf4j.LoggerFactory 7 | import com.github.locis.utils.ConfigUtils 8 | 
import com.typesafe.config.ConfigFactory 9 | 10 | abstract class MapReduceJob { 11 | 12 | protected val logger: Logger = LoggerFactory.getLogger(getClass) 13 | 14 | protected val hadoopConfiguration = ConfigUtils.getHadoopConfiguration() 15 | 16 | protected val userConfiguration = ConfigFactory.load 17 | 18 | def jobName(): String 19 | 20 | protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " + 21 | "com.github.locis.apps." + jobName + " " 22 | 23 | protected val numberOfArgsExpected = 2 24 | 25 | def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit 26 | 27 | def main(args: Array[String]): Unit = { 28 | val otherArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs 29 | if (otherArgs.length != numberOfArgsExpected) { 30 | println(errorMsg) 31 | } else { 32 | val inputPath = new Path(args(0)) 33 | val outputPath = new Path(args(1)) 34 | run(inputPath, outputPath, args) 35 | } 36 | 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/NeighborGrouping.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Job 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 8 | 9 | import com.github.locis.map.NeighborGroupingMapper 10 | import com.github.locis.reduce.NeighborGroupingReducer 11 | 12 | object NeighborGrouping extends MapReduceJob { 13 | 14 | def jobName: String = { 15 | "NeighborGrouping" 16 | } 17 | 18 | override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " + 19 | "com.github.locis.apps." + jobName + " " 20 | 21 | def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = { 22 | 23 | val job = new Job(hadoopConfiguration, jobName) 24 | job.setMapperClass(classOf[NeighborGroupingMapper]) 25 | job.setReducerClass(classOf[NeighborGroupingReducer]) 26 | job.setMapOutputKeyClass(classOf[Text]) 27 | job.setMapOutputValueClass((classOf[Text])) 28 | job.setOutputKeyClass(classOf[Text]) 29 | job.setOutputValueClass(classOf[Text]) 30 | FileInputFormat.addInputPath(job, inputPath) 31 | FileOutputFormat.setOutputPath(job, outputPath) 32 | val status = if (job.waitForCompletion(true)) 0 else 1 33 | System.exit(status) 34 | } 35 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/NeighborSearch.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Job 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 8 | 9 | import com.github.locis.map.NeighborSearchMapper 10 | import com.github.locis.reduce.NeighborSearchReducer 11 | 12 | object NeighborSearch extends MapReduceJob { 13 | 14 | def jobName: String = { 15 | "NeighborSearch" 16 | } 17 | 18 | override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " + 19 | "com.github.locis.apps." 
+ jobName + " " 20 | 21 | def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = { 22 | 23 | val job = new Job(hadoopConfiguration, jobName) 24 | job.setMapperClass(classOf[NeighborSearchMapper]) 25 | job.setReducerClass(classOf[NeighborSearchReducer]) 26 | job.setMapOutputKeyClass(classOf[Text]) 27 | job.setMapOutputValueClass((classOf[Text])) 28 | job.setOutputKeyClass(classOf[Text]) 29 | job.setOutputValueClass(classOf[Text]) 30 | FileInputFormat.addInputPath(job, inputPath) 31 | FileOutputFormat.setOutputPath(job, outputPath) 32 | val status = if (job.waitForCompletion(true)) 0 else 1 33 | System.exit(status) 34 | } 35 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/PatternSearch.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.apps 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.DoubleWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.Job 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 9 | 10 | import com.github.locis.map.PatternSearchMapper 11 | import com.github.locis.reduce.PatternSearchReducer 12 | import com.github.locis.utils.HBaseUtil 13 | 14 | object PatternSearch extends MapReduceJob { 15 | 16 | private val hBaseUtil = new HBaseUtil() 17 | 18 | def jobName: String = { 19 | "PatternSearch" 20 | } 21 | 22 | override protected val errorMsg = "Usage: $HADOOP_HOME/bin/hadoop jar target/uber-locis-0.0.1-SNAPSHOT.jar " + 23 | "com.github.locis.apps." + jobName + " " + 24 | " " 25 | 26 | override protected val numberOfArgsExpected = 3 27 | 28 | def run(inputPath: Path, outputPath: Path, args: Array[String]): Unit = { 29 | val size = args(2) 30 | val thresholdParticipationIndex = userConfiguration.getString("participationIndex.threshold") 31 | hBaseUtil.createColocationStoreTable() 32 | hadoopConfiguration.set("k", size) 33 | hadoopConfiguration.set("thresholdParticipationIndex", thresholdParticipationIndex) 34 | val job = new Job(hadoopConfiguration, jobName) 35 | job.setMapperClass(classOf[PatternSearchMapper]) 36 | job.setReducerClass(classOf[PatternSearchReducer]) 37 | job.setMapOutputKeyClass(classOf[Text]) 38 | job.setMapOutputValueClass((classOf[Text])) 39 | job.setOutputKeyClass(classOf[Text]) 40 | job.setOutputValueClass(classOf[DoubleWritable]) 41 | FileInputFormat.addInputPath(job, inputPath) 42 | FileOutputFormat.setOutputPath(job, outputPath) 43 | val status = if (job.waitForCompletion(true)) 0 else 1 44 | System.exit(status) 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/apps/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author shagun 6 | * 7 | */ 8 | package com.github.locis.apps; -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/map/CountInstanceMapper.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.map 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Mapper 6 | import org.slf4j.Logger 7 | import org.slf4j.LoggerFactory 8 | 9 | import com.github.locis.utils.DataParser 10 | 11 | /* 12 | * This class 
maps the input data points to value 1. 13 | * See : https://github.com/shagunsodhani/locis/issues/8 14 | */ 15 | 16 | class CountInstanceMapper extends Mapper[LongWritable, Text, Text, LongWritable] { 17 | 18 | private val logger: Logger = LoggerFactory.getLogger(getClass) 19 | 20 | override def map(key: LongWritable, value: Text, 21 | context: Mapper[LongWritable, Text, Text, LongWritable]#Context) = { 22 | val line = value.toString().split("\t") 23 | val keyString = line(0) 24 | context.write(new Text(DataParser.getType(keyString)), new LongWritable(1L)) 25 | } 26 | 27 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/map/NeighborGroupingMapper.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.map 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Mapper 6 | import org.slf4j.Logger 7 | import org.slf4j.LoggerFactory 8 | 9 | import com.github.locis.utils.DataParser 10 | 11 | /* 12 | * This class maps the input data points to one of their neighbors. 13 | * See : https://github.com/shagunsodhani/locis/issues/6 14 | */ 15 | 16 | class NeighborGroupingMapper extends Mapper[LongWritable, Text, Text, Text] { 17 | 18 | private val logger: Logger = LoggerFactory.getLogger(getClass) 19 | 20 | override def map(key: LongWritable, value: Text, 21 | context: Mapper[LongWritable, Text, Text, Text]#Context) = { 22 | val line = value.toString().split("\t") 23 | val keyString = line(0) 24 | val valueString = line(1) 25 | if (keyString == valueString || DataParser.getType(keyString) < DataParser.getType(valueString)) { 26 | context.write(new Text(keyString), new Text(valueString)) 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/map/NeighborSearchMapper.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.map 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Mapper 6 | import org.slf4j.Logger 7 | import org.slf4j.LoggerFactory 8 | import com.github.locis.utils.DataParser 9 | 10 | /* 11 | * This class maps the input data points to the grid number. For now, we are 12 | * using a dummy implementation. 
13 | * See : https://github.com/shagunsodhani/locis/issues/2 14 | */ 15 | 16 | class NeighborSearchMapper extends Mapper[LongWritable, Text, Text, Text] { 17 | 18 | private val logger: Logger = LoggerFactory.getLogger(getClass) 19 | 20 | private def getGridId(dataPoint: String): String = { 21 | DataParser.getKeyForGridMapping(dataPoint) 22 | } 23 | override def map(key: LongWritable, value: Text, 24 | context: Mapper[LongWritable, Text, Text, Text]#Context) = { 25 | val gridNumber = new Text(getGridId(value.toString())) 26 | context.write(gridNumber, value) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/map/PatternSearchMapper.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.map 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapreduce.Mapper 6 | import org.slf4j.Logger 7 | import org.slf4j.LoggerFactory 8 | 9 | import com.github.locis.utils.DataParser 10 | import com.github.locis.utils.HBaseUtil 11 | 12 | /* 13 | * This class maps the input data points to (eventset, instance). 14 | * See : https://github.com/shagunsodhani/locis/issues/7 15 | */ 16 | 17 | class PatternSearchMapper extends Mapper[LongWritable, Text, Text, Text] { 18 | 19 | private val logger: Logger = LoggerFactory.getLogger(getClass) 20 | 21 | private val hBaseUtil = new HBaseUtil() 22 | 23 | private val internalSeprator = "#" 24 | // used to separate records within a colocation instance 25 | 26 | private val externalSeperator = "\t" 27 | // used to separate records in a list of colocations 28 | 29 | private def scanNTransactions(neighborhood: Array[String], k: Int): Iterator[Array[String]] = { 30 | val key = neighborhood(0) 31 | neighborhood 32 | .filterNot { _.equals(key) } 33 | .combinations(k - 1) 34 | } 35 | 36 | private def checkCliqueness(instance: Array[String], k: Int): Boolean = { 37 | if (instance.isEmpty) { 38 | true 39 | } else { 40 | val sortedInstance = instance 41 | .sortBy { dataPoint => DataParser.getType(dataPoint) } 42 | val rowName = sortedInstance 43 | .map { dataPoint => DataParser.getType(dataPoint) } 44 | .mkString(internalSeprator) 45 | 46 | hBaseUtil 47 | .readColocationStoreRow(rowName = rowName, size = k - 1) 48 | .split(externalSeperator) 49 | .contains(sortedInstance.mkString(internalSeprator)) 50 | } 51 | } 52 | 53 | private def eventTypeOf(instance: Array[String]): Array[String] = { 54 | instance.map { DataParser.getType } 55 | } 56 | 57 | override def map(key: LongWritable, value: Text, 58 | context: Mapper[LongWritable, Text, Text, Text]#Context) = { 59 | val k = context.getConfiguration.get("k").toInt 60 | val line = value.toString().split(externalSeperator) 61 | val keyString = line(0) 62 | 63 | /* 64 | * I have a feeling that this could explode for very large datasets. It should not be brought in-memory. 65 | * But querying HBase everytime would be very slow. Given the number of steps this set would take to be created, I am not convinced if 66 | * filtering on basis of prevalent colocation types would indeed benefit the implementation. 
67 | */ 68 | val prevalentColocationTypes = { 69 | if (k > 1) { 70 | hBaseUtil 71 | .scanColocationStoreColumn("size", k - 1) 72 | .flatMap(colocationType => colocationType.split(internalSeprator).toIterable) 73 | .toSet 74 | } else { 75 | hBaseUtil 76 | .scanInstanceCountColumn("size") 77 | .flatMap(colocationType => colocationType.split(internalSeprator).toIterable) 78 | .toSet 79 | } 80 | } 81 | 82 | val neighborhood = line 83 | .tail 84 | .filter { neighbor => prevalentColocationTypes.contains(DataParser.getType(neighbor)) } 85 | 86 | scanNTransactions(neighborhood, k) 87 | .filter { checkCliqueness(_, k) } 88 | .foreach { instance => 89 | { 90 | val eventSet = (eventTypeOf(instance) ++ Array(DataParser.getType(keyString))) 91 | .sortBy { identity } 92 | .mkString(internalSeprator) 93 | val sortedInstance = (instance ++ Array(keyString)) 94 | .sortBy { dataPoint => DataParser.getType(dataPoint) } 95 | .mkString(internalSeprator) 96 | context.write(new Text(eventSet), new Text(sortedInstance)) 97 | } 98 | } 99 | } 100 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/map/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author shagun 6 | * 7 | */ 8 | package com.github.locis.map; -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/reduce/CountInstanceReducer.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.reduce 2 | 3 | import scala.collection.JavaConversions.asScalaIterator 4 | 5 | import org.apache.hadoop.io.LongWritable 6 | import org.apache.hadoop.io.Text 7 | import org.apache.hadoop.mapreduce.Reducer 8 | import org.slf4j.Logger 9 | import org.slf4j.LoggerFactory 10 | 11 | import com.github.locis.utils.HBaseUtil 12 | 13 | /* 14 | * This class counts the number of instances of each type and saves it to HBase 15 | * See : https://github.com/shagunsodhani/locis/issues/8 16 | */ 17 | 18 | class CountInstanceReducer extends Reducer[Text, LongWritable, Text, LongWritable] { 19 | private val logger: Logger = LoggerFactory.getLogger(getClass) 20 | 21 | private val hBaseUtil = new HBaseUtil() 22 | 23 | override def reduce(key: Text, values: java.lang.Iterable[LongWritable], 24 | context: Reducer[Text, LongWritable, Text, LongWritable]#Context): Unit = { 25 | val sum = values.iterator().map { x => x.get() }.sum 26 | hBaseUtil.writeToInstanceCountTable( 27 | List( 28 | (key.toString(), sum.toString()))) 29 | context.write(key, new LongWritable(sum)) 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/reduce/NeighborGroupingReducer.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.reduce 2 | 3 | import org.apache.hadoop.io.Text 4 | import org.apache.hadoop.mapreduce.Reducer 5 | import org.slf4j.Logger 6 | import org.slf4j.LoggerFactory 7 | 8 | import com.github.locis.utils.DataParser 9 | 10 | /* 11 | * This class groups all the neighbors for a given key (dataPoint) in sorted order. 12 | * algorithm. 
13 | * See : https://github.com/shagunsodhani/locis/issues/6 14 | */ 15 | 16 | class NeighborGroupingReducer extends Reducer[Text, Text, Text, Text] { 17 | private val logger: Logger = LoggerFactory.getLogger(getClass) 18 | 19 | override def reduce(key: Text, values: java.lang.Iterable[Text], 20 | context: Reducer[Text, Text, Text, Text]#Context): Unit = { 21 | val dataPointIterator = values.iterator() 22 | val sortedObjectSet = { 23 | var tempObjectSet = Set[String]() 24 | while (dataPointIterator.hasNext()) { 25 | tempObjectSet += dataPointIterator.next().toString() 26 | } 27 | tempObjectSet.toSeq.sortBy { dataPoint => DataParser.getType(dataPoint) } 28 | } 29 | val nRecord = (Seq(key.toString()) ++ sortedObjectSet).mkString("\t") 30 | // This step adds the key to the nRecord set for the second time which seems not only unnecessary but also erroneous. 31 | // val nRecord = sortedObjectSet.mkString("\t") 32 | context.write(key, new Text(nRecord)) 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/reduce/NeighborSearchReducer.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.reduce 2 | 3 | import org.apache.hadoop.io.Text 4 | import org.apache.hadoop.mapreduce.Reducer 5 | import org.slf4j.Logger 6 | import org.slf4j.LoggerFactory 7 | 8 | import com.github.locis.utils.DataParser 9 | import com.github.locis.utils.DistanceMeasure 10 | import com.typesafe.config.ConfigFactory 11 | 12 | /* 13 | * This class finds all the neighbors in a given grid using the plane-sweep 14 | * algorithm. 15 | * See : https://github.com/shagunsodhani/locis/issues/3 16 | */ 17 | 18 | class NeighborSearchReducer extends Reducer[Text, Text, Text, Text] { 19 | 20 | private val logger: Logger = LoggerFactory.getLogger(getClass) 21 | 22 | private val distanceThreshold: Double = ConfigFactory.load.getDouble("distance.threshold") 23 | 24 | private def getEuclideanDistance(dataPoint1: String, dataPoint2: String): Double = { 25 | val y1 = DataParser.getY(dataPoint1) 26 | val y2 = DataParser.getY(dataPoint2) 27 | 28 | val x1 = DataParser.getX(dataPoint1) 29 | val x2 = DataParser.getX(dataPoint2) 30 | 31 | DistanceMeasure.euclideanDistance(x1, y1, x2, y2) 32 | } 33 | 34 | private def getHaversineDistance(dataPoint1: String, dataPoint2: String): Double = { 35 | val lat1 = DataParser.getY(dataPoint1) 36 | val lat2 = DataParser.getY(dataPoint2) 37 | 38 | val lng1 = DataParser.getX(dataPoint1) 39 | val lng2 = DataParser.getX(dataPoint2) 40 | 41 | DistanceMeasure.haversineDistance(lat1, lng1, lat2, lng2) 42 | } 43 | 44 | private def planeSweep(dataPoints: java.lang.Iterable[Text]): Set[(String, String)] = { 45 | val dataPointIterator = dataPoints.iterator() 46 | val objectSet = { 47 | var tempObjectSet = Set[String]() 48 | while (dataPointIterator.hasNext()) { 49 | tempObjectSet += dataPointIterator.next().toString() 50 | } 51 | tempObjectSet.toSeq.sortBy { dataPoint => DataParser.getX(dataPoint) } 52 | } 53 | // sorted Object Set 54 | var activeSet = Set[String]() 55 | var resultSet = Set[(String, String)]() 56 | var j = 0 57 | val n = objectSet.length 58 | (0 to n-1).foreach { 59 | i => 60 | { 61 | while (DataParser.getX(objectSet(i)) - DataParser.getX(objectSet(j)) > distanceThreshold) { 62 | activeSet -= objectSet(i) 63 | j += 1 64 | } 65 | val range = activeSet 66 | .filter { dataPoint => 67 | { 68 | val yDataPoint = DataParser.getY(dataPoint) 69 | val yObjectSetPoint = 
DataParser.getY(objectSet(i)) 70 | ((yDataPoint <= yObjectSetPoint + distanceThreshold) 71 | && (yDataPoint >= yObjectSetPoint - distanceThreshold)) 72 | } 73 | } 74 | range.filter { 75 | dataPointInrange => (getEuclideanDistance(objectSet(i), dataPointInrange) <= distanceThreshold) 76 | }.foreach { dataPoint => (resultSet += ((objectSet(i), dataPoint))) } 77 | activeSet += objectSet(i) 78 | // This step adds a data point as its own neighbor but does not seem to be required. Infact, using this step can make 79 | // the scanNTransactions step slower. 80 | resultSet += ((objectSet(i), objectSet(i))) 81 | } 82 | } 83 | resultSet 84 | } 85 | 86 | override def reduce(key: Text, values: java.lang.Iterable[Text], 87 | context: Reducer[Text, Text, Text, Text]#Context): Unit = { 88 | planeSweep(values).foreach { dataPoint => 89 | { 90 | context.write(new Text(dataPoint._1), new Text(dataPoint._2)) 91 | } 92 | } 93 | 94 | } 95 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/reduce/PatternSearchReducer.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.reduce 2 | 3 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter 4 | 5 | import org.apache.hadoop.io.DoubleWritable 6 | import org.apache.hadoop.io.Text 7 | import org.apache.hadoop.mapreduce.Reducer 8 | import org.slf4j.Logger 9 | import org.slf4j.LoggerFactory 10 | 11 | import com.github.locis.utils.DataParser 12 | import com.github.locis.utils.HBaseUtil 13 | /* 14 | * This class groups all the instances for a given key (eventset). 15 | * See : https://github.com/shagunsodhani/locis/issues/7 16 | */ 17 | 18 | class PatternSearchReducer extends Reducer[Text, Text, Text, DoubleWritable] { 19 | private val logger: Logger = LoggerFactory.getLogger(getClass) 20 | 21 | private val hBaseUtil = new HBaseUtil() 22 | private val internalSeprator = "#" 23 | // used to separate records within a colocation instance 24 | 25 | private val externalSeperator = "\t" 26 | // used to separate records in a list of colocations 27 | 28 | private def getEventTypeCount(eventType: String): Long = { 29 | hBaseUtil.readInstanceCountRow(eventType).toLong 30 | } 31 | private def computeParticipationIndexAndInstanceString(eventType: String, 32 | instanceListIterator: Iterable[String]) = { 33 | /* 34 | * We have to compute the instance string in this function because we can not consume the iterable twice. 
35 | */ 36 | val eventTypeList = eventType.split(internalSeprator) 37 | val eventTypeToIntMapping = eventTypeList.zipWithIndex 38 | // Mapping each event type to an integer 39 | val eventTypeToCountMapping = eventTypeToIntMapping.map { 40 | eventTypeIntTuple => 41 | { 42 | (eventTypeIntTuple._2, getEventTypeCount(eventTypeIntTuple._1)) 43 | } 44 | }.toMap 45 | // Mapping each event type (via an integer) to a its total number of instances 46 | val eventTypeToSetMapping = eventTypeToIntMapping.map { 47 | eventTypeIntTuple => 48 | { 49 | (eventTypeIntTuple._2, scala.collection.mutable.Set[String]()) 50 | } 51 | }.toMap 52 | // new scala.collection.mutable.HashMap[Int, scala.collection.mutable.Set[String]] 53 | // Mapping each event type (via an integer) to a set of unique instances 54 | 55 | val instanceStringBuffer = new StringBuilder 56 | 57 | instanceListIterator.foreach { instanceList => 58 | { 59 | val instancesToIntMapping = instanceList.split(internalSeprator).zipWithIndex 60 | instancesToIntMapping.foreach { 61 | instanceIntTuple => 62 | { 63 | eventTypeToSetMapping(instanceIntTuple._2) += DataParser.getId(instanceIntTuple._1) 64 | instanceStringBuffer ++= instanceIntTuple._1 + internalSeprator 65 | } 66 | } 67 | instanceStringBuffer.deleteCharAt(instanceStringBuffer.length-1) 68 | instanceStringBuffer ++= externalSeperator 69 | } 70 | } 71 | instanceStringBuffer.deleteCharAt(instanceStringBuffer.length-1) 72 | val participationIndex = eventTypeToSetMapping.map { 73 | intSetTuple => 74 | { 75 | intSetTuple._2.count { x => true }.toDouble / eventTypeToCountMapping(intSetTuple._1) 76 | } 77 | }.min 78 | (participationIndex, instanceStringBuffer.toString()) 79 | } 80 | 81 | override def reduce(key: Text, values: java.lang.Iterable[Text], 82 | context: Reducer[Text, Text, Text, DoubleWritable]#Context): Unit = { 83 | val k = context.getConfiguration.get("k").toInt 84 | val eventType = key.toString() 85 | val instanceListIterator = values.asScala.map { _.toString() } 86 | val thresholdParticipationIndex = context.getConfiguration.get("thresholdParticipationIndex").toDouble 87 | val (participationIndex, instanceString) = computeParticipationIndexAndInstanceString(eventType, 88 | instanceListIterator) 89 | if (participationIndex > thresholdParticipationIndex) { 90 | hBaseUtil.writeToColocationStoreTable(List((eventType, instanceString)), k) 91 | context.write(key, new DoubleWritable(participationIndex)) 92 | } 93 | } 94 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/reduce/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author shagun 6 | * 7 | */ 8 | package com.github.locis.reduce; -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/utils/ConfigUtils.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.utils 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.LocalFileSystem 5 | import org.apache.hadoop.hdfs.DistributedFileSystem 6 | import org.slf4j.Logger 7 | import org.slf4j.LoggerFactory 8 | 9 | import com.typesafe.config.ConfigFactory 10 | 11 | object ConfigUtils { 12 | 13 | private val logger: Logger = LoggerFactory.getLogger(getClass) 14 | 15 | def getHadoopConfiguration() = { 16 | val configuration = new Configuration() 17 | configuration.set("fs.defaultFS", 
ConfigFactory.load.getString("fs.defaultFS")) 18 | configuration.set("fs.hdfs.impl", classOf[org.apache.hadoop.hdfs.DistributedFileSystem].getName()) 19 | configuration.set("fs.file.impl", classOf[org.apache.hadoop.fs.LocalFileSystem].getName()) 20 | configuration 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/utils/DataParser.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.utils 2 | 3 | import org.slf4j.Logger 4 | import org.slf4j.LoggerFactory 5 | 6 | object DataParser { 7 | private val logger: Logger = LoggerFactory.getLogger(getClass) 8 | 9 | private val sep = "," 10 | 11 | private val attributeList: List[String] = List( 12 | "id", 13 | "primary_type", 14 | "x_coordinate", 15 | "y_coordinate", 16 | "longitude", 17 | "latitude", 18 | "district", 19 | "ward", 20 | "community_area", 21 | "fbi_code") 22 | 23 | private val attributeIndexMap = attributeList.zipWithIndex.toMap 24 | 25 | private def getIndex(attribute: String) = { 26 | attributeIndexMap.get(attribute) 27 | } 28 | 29 | def getId(dataPoint: String): String = { 30 | val typeIndex = attributeIndexMap("id") 31 | dataPoint.split(sep)(typeIndex) 32 | } 33 | 34 | def getType(dataPoint: String): String = { 35 | val typeIndex = attributeIndexMap("primary_type") 36 | dataPoint.split(sep)(typeIndex) 37 | } 38 | 39 | def getX(dataPoint: String): Double = { 40 | val xIndex = attributeIndexMap("x_coordinate") 41 | dataPoint.split(sep)(xIndex).toDouble 42 | } 43 | 44 | def getY(dataPoint: String): Double = { 45 | val yIndex = attributeIndexMap("y_coordinate") 46 | dataPoint.split(sep)(yIndex).toDouble 47 | } 48 | 49 | def getAttributeList: List[String] = { 50 | attributeList 51 | } 52 | 53 | def getKeyForGridMapping(dataPoint: String): String = { 54 | val keyIndex = attributeIndexMap("district") 55 | dataPoint.split(sep)(keyIndex) 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/utils/DistanceMeasure.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.utils 2 | 3 | import org.slf4j.Logger 4 | import org.slf4j.LoggerFactory 5 | 6 | object DistanceMeasure { 7 | 8 | private val logger: Logger = LoggerFactory.getLogger(getClass) 9 | 10 | private val earthRadius: Double = 6371 //in km 11 | 12 | def haversineDistance(lat1: Double, lng1: Double, lat2: Double, lng2: Double) = { 13 | /* 14 | * Function to return distance between two points (represented by latitude and longitude). 15 | * The distance is returned in km 16 | */ 17 | val dLat = Math.toRadians(lat2 - lat1) 18 | val dLng = Math.toRadians(lng2 - lng1) 19 | val a = Math.sin(dLat / 2) * Math.sin(dLat / 2) + 20 | Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2)) * 21 | Math.sin(dLng / 2) * Math.sin(dLng / 2) 22 | val c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a)) 23 | earthRadius * c 24 | } 25 | 26 | def euclideanDistance(x1: Double, y1: Double, x2: Double, y2: Double) = { 27 | /* 28 | * Function to return distance between two points in the euclidean space. 29 | * The unit is not known but can be worked out. 
30 | */ 31 | math.sqrt(math.pow(x1 - x2, 2) + math.pow(y1 - y2, 2)) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/github/locis/utils/HBaseUtil.scala: -------------------------------------------------------------------------------- 1 | package com.github.locis.utils 2 | 3 | import scala.collection.JavaConverters.iterableAsScalaIterableConverter 4 | 5 | import org.apache.hadoop.hbase.HBaseConfiguration 6 | import org.apache.hadoop.hbase.HColumnDescriptor 7 | import org.apache.hadoop.hbase.HTableDescriptor 8 | import org.apache.hadoop.hbase.TableName 9 | import org.apache.hadoop.hbase.client.ConnectionFactory 10 | import org.apache.hadoop.hbase.client.Get 11 | import org.apache.hadoop.hbase.client.HTable 12 | import org.apache.hadoop.hbase.client.Put 13 | import org.apache.hadoop.hbase.client.Result 14 | import org.apache.hadoop.hbase.client.Scan 15 | import org.apache.hadoop.hbase.util.Bytes 16 | import org.slf4j.Logger 17 | import org.slf4j.LoggerFactory 18 | 19 | class HBaseUtil { 20 | 21 | /* 22 | * This class contains all the HBase related logic. 23 | * See : https://github.com/shagunsodhani/locis/issues/10 24 | * : https://github.com/shagunsodhani/locis/issues/12 25 | */ 26 | 27 | private val logger: Logger = LoggerFactory.getLogger(getClass) 28 | 29 | private val conf = HBaseConfiguration.create() 30 | private val connection = ConnectionFactory.createConnection(conf) 31 | private val admin = connection.getAdmin() 32 | private val instanceCountTableName = "InstanceCount" 33 | private val colocationStoreTableName = "ColocationStore" 34 | 35 | private def isTableExist(tableName: TableName) = { 36 | admin.tableExists(tableName) 37 | } 38 | 39 | private def createTable(tableName: String, columnList: List[String]) = { 40 | /* 41 | * This method creates a table with a given name. 42 | * If a table with same name exists, nothing is done. 43 | */ 44 | val _tableName = TableName.valueOf(tableName) 45 | if (!isTableExist(_tableName)) { 46 | val tableDescriptor = new HTableDescriptor(_tableName) 47 | columnList.foreach { x => 48 | tableDescriptor.addFamily(new HColumnDescriptor(x)) 49 | } 50 | admin.createTable(tableDescriptor) 51 | } 52 | } 53 | 54 | def createInstanceCountTable() = { 55 | /* 56 | * This method creates a table to track the instance counts for each event type. 57 | * For Data Model, see : https://github.com/shagunsodhani/locis/blob/master/docs/implementation.md#using-hbase-data-model 58 | */ 59 | val columnList = List("size") 60 | createTable(instanceCountTableName, columnList) 61 | } 62 | 63 | def createColocationStoreTable() = { 64 | /* 65 | * This method creates a table to store colocations of different sizes. 
--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/HDFSWriter.scala:
--------------------------------------------------------------------------------
1 | package com.github.locis.utils
2 | 
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.FileSystem
5 | import org.apache.hadoop.fs.LocalFileSystem
6 | import org.apache.hadoop.fs.Path
7 | import org.apache.hadoop.hdfs.DistributedFileSystem
8 | import org.apache.hadoop.util.Progressable
9 | import org.slf4j.Logger
10 | import org.slf4j.LoggerFactory
11 | 
12 | import com.typesafe.config.ConfigFactory
13 | 
14 | class HDFSWriter {
15 | 
16 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
17 | 
18 |   private val configuration = new Configuration()
19 |   configuration.set("fs.defaultFS", ConfigFactory.load.getString("fs.defaultFS"))
20 |   configuration.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName())
21 |   configuration.set("fs.file.impl", classOf[LocalFileSystem].getName())
22 | 
23 |   private val fileSystem = FileSystem.get(configuration)
24 | 
25 |   def getWriter(outputFilePath: String) = {
26 |     // Prints a dot whenever Hadoop reports write progress.
27 |     val progressable = new Progressable() {
28 |       override def progress(): Unit = {
29 |         print(".")
30 |       }
31 |     }
32 |     fileSystem.create(new Path(outputFilePath), progressable)
33 |   }
34 | }
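A short usage sketch for `HDFSWriter` (not part of the repository), assuming HDFS is running and `fs.defaultFS` is set in `reference.conf`. The record below is made up, and the output path simply reuses the default input path mentioned in the README; `getWriter` returns a Hadoop `FSDataOutputStream`, so the caller is responsible for closing it.

```scala
import com.github.locis.utils.HDFSWriter

object HDFSWriterDemo {
  def main(args: Array[String]): Unit = {
    // getWriter returns an FSDataOutputStream that must be closed by the caller.
    val writer = new HDFSWriter().getWriter("/user/locis/input/data")
    try {
      // Write one CSV record per line, as the MapReduce jobs expect.
      writer.write("1,ROBBERY,1180000.0,1860000.0,-87.63,41.88,12,27,28,03\n".getBytes("UTF-8"))
    } finally {
      writer.close()
    }
  }
}
```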
--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/Mysql.scala:
--------------------------------------------------------------------------------
1 | package com.github.locis.utils
2 | 
3 | import scala.concurrent.Await
4 | import scala.concurrent.Future
5 | import scala.concurrent.duration.Duration
6 | 
7 | import org.slf4j.Logger
8 | import org.slf4j.LoggerFactory
9 | 
10 | import com.github.mauricio.async.db.Configuration
11 | import com.github.mauricio.async.db.QueryResult
12 | import com.github.mauricio.async.db.ResultSet
13 | import com.github.mauricio.async.db.mysql.MySQLConnection
14 | import com.typesafe.config.ConfigFactory
15 | 
16 | class Mysql {
17 | 
18 |   private val logger: Logger = LoggerFactory.getLogger(getClass)
19 | 
20 |   private val mysqlConnection: MySQLConnection = {
21 |     val configuration: Configuration = {
22 |       val username: String = ConfigFactory.load.getString("mysql.username")
23 |       val host: String = ConfigFactory.load.getString("mysql.host")
24 |       val port: Int = ConfigFactory.load.getInt("mysql.port")
25 |       val password: String = ConfigFactory.load.getString("mysql.password")
26 |       val database: String = ConfigFactory.load.getString("mysql.database")
27 |       new Configuration(username, host, port, Option(password), Option(database))
28 |     }
29 |     val connection = new MySQLConnection(configuration)
30 |     Await.result(connection.connect, Duration.Inf)
31 |     connection
32 |   }
33 | 
34 |   def runQuery(sqlQuery: String): ResultSet = {
35 |     /*
36 |      * Runs the given query synchronously and returns the result rows.
37 |      * Throws if the statement does not produce a result set.
38 |      */
39 |     val future: Future[QueryResult] = mysqlConnection
40 |       .sendPreparedStatement(sqlQuery)
41 |     Await.result(future, Duration.Inf).rows.get
42 |   }
43 | }
44 | 
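Finally, a usage sketch for `Mysql`, assuming the `mysql.*` keys are populated in `reference.conf`. The table and column names below are hypothetical (the actual schema lives in `scripts/schema/spatium.sql`); `runQuery` blocks until the driver responds, and each returned row can be indexed by column name.

```scala
import com.github.locis.utils.Mysql

object MysqlDemo {
  def main(args: Array[String]): Unit = {
    // Connects using the mysql.* settings from reference.conf.
    val mysql = new Mysql()
    // Hypothetical table/column names; adjust to the schema in scripts/schema/spatium.sql.
    val rows = mysql.runQuery("SELECT id, primary_type FROM crimes LIMIT 5")
    rows.foreach { row =>
      println(s"${row("id")} -> ${row("primary_type")}")
    }
  }
}
```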
--------------------------------------------------------------------------------
/src/main/scala/com/github/locis/utils/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Utility classes (configuration, data parsing, distance measures, HBase,
3 |  * HDFS and MySQL helpers) shared by the locis MapReduce jobs.
4 |  *
5 |  * @author shagun
6 |  */
7 | package com.github.locis.utils;
--------------------------------------------------------------------------------