├── .gitignore
├── LICENSE
├── README.md
├── bq_query.sql
└── location_parser.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
API_KEY.secret.txt
inputs
outputs
.pydevproject
.project

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 javier ramírez

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# google_timeline_parser
Parse your Google Location History (Timeline) after exporting it from Google Takeout.

* Export your Timeline data with Google Takeout.
* Shrink the JSON down to only the fields we need with jq: `cat LocationHistory.json | jq "[.locations[] | {latitudeE7, longitudeE7, timestampMs}]" > filtered_locations.json`
* Convert the JSON summary into CSV with jsonv: `cat filtered_locations.json | jsonv latitudeE7,longitudeE7,timestampMs > filtered_locations.csv`
* Run location_parser.py to reverse-geocode each row into a locality; it reads inputs/filtered_locations.csv and writes outputs/output.csv (see the conversion sketch below).
* Import the CSV into your database. If you use BigQuery, you can copy the file to Google Cloud Storage and create an external table pointing to it. The fields are timestamp (TIMESTAMP), date (DATE), lat (FLOAT), lng (FLOAT) and locality (STRING); all of them are required.
* Execute your query. bq_query.sql is an example that calculates whole days of absence from the UK based on the collected data.
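For reference, `latitudeE7`/`longitudeE7` are degrees multiplied by 10^7 and `timestampMs` is Unix epoch milliseconds. Below is a minimal sketch of the conversion location_parser.py applies to each CSV row (the sample values are made up):

```python
from datetime import datetime

# One row of filtered_locations.csv: latitudeE7, longitudeE7, timestampMs
# (illustrative values, not real data)
row = ['515074000', '-1278000', '1470000000000']

lat = int(row[0]) / 10000000.0  # 51.5074 degrees
lng = int(row[1]) / 10000000.0  # -0.1278 degrees
# The script uses local time via fromtimestamp(); shown here in UTC.
date = datetime.utcfromtimestamp(int(row[2]) / 1000.0).strftime('%Y-%m-%d')
print('{} {} {}'.format(lat, lng, date))  # 51.5074 -0.1278 2016-07-31
```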
--------------------------------------------------------------------------------
/bq_query.sql:
--------------------------------------------------------------------------------
-- Legacy BigQuery SQL. Produces one row per trip abroad, with the number of
-- whole days spent outside the UK and the itinerary of countries visited.
SELECT
  MIN(previous_date) AS date_from,
  MAX(date) AS date_to,
  GREATEST(0, DATEDIFF(CAST(MAX(date) AS timestamp), CAST(MIN(previous_date) AS timestamp)) - 1) AS days,
  CONCAT('UK>', GROUP_CONCAT(country, '>')) AS itinerary,
  trip_no
FROM (
  SELECT
    *,
    -- Every row whose previous country is the UK starts a new trip.
    SUM(IF(previous_country='UK', 1, 0)) OVER (ORDER BY timestamp ASC) AS trip_no
  FROM (
    SELECT
      *,
      -- locality looks like 'London. UK'; the last word is the country.
      REGEXP_EXTRACT(previous_locality, r'(\w+$)') AS previous_country,
      REGEXP_EXTRACT(locality, r'(\w+$)') AS country
    FROM (
      SELECT
        timestamp,
        LAG(date, 1) OVER (ORDER BY timestamp ASC) AS previous_date,
        LAG(locality, 1) OVER (ORDER BY timestamp ASC) AS previous_locality,
        date,
        locality
      FROM
        [javier-cp300:locations.history]
      WHERE
        locality <> 'Error' ) )
  WHERE
    previous_country <> country )
GROUP BY
  trip_no
ORDER BY
  trip_no
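-- A sketch of the intermediate rows, on made-up data, after the inner queries
-- have kept only the rows where the country changes:
--
--   previous_locality   locality           previous_country  country  trip_no
--   'London. UK'        'Paris. France'    UK                France   1
--   'Paris. France'     'Berlin. Germany'  France            Germany  1
--   'Berlin. Germany'   'London. UK'       Germany           UK       1
--   'London. UK'        'Madrid. Spain'    UK                Spain    2
--
-- Grouping by trip_no then yields one row per trip; for trip 1 the itinerary
-- is 'UK>France>Germany>UK' and days is the number of whole days between the
-- last date seen in the UK (MIN(previous_date)) and the return date
-- (MAX(date)), minus one.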
--------------------------------------------------------------------------------
/location_parser.py:
--------------------------------------------------------------------------------
# Python 2 script: merges Google Location History rows and Foursquare
# check-ins (KML) into one CSV, reverse-geocoding each point to a locality.
import csv
from datetime import datetime, timedelta
import sys

import requests

from lxml import etree


# Python 2 hack so implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

cached_locs = {}
api_key = open('API_KEY.secret.txt').read().strip()


# https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude
# For telling countries apart, 1 decimal digit is more than we need, and it
# will usually differentiate cities too (see the link above). The more digits,
# the more precision, but also the more HTTP requests and the slower it gets.
def key_for(lat, lng):
    # e.g. key_for(51.5074, -0.1278) -> '51.5#-0.1'
    return str(round(lat, 1)) + '#' + str(round(lng, 1))


def city_for(lat, lng, cached_locs):
    city = cached_locs.get(key_for(lat, lng))
    if city:
        return city

    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}&result_type=locality'
    try:
        response = requests.get(base_url.format(lat, lng, api_key)).json()
        # Commas would break the CSV output, so replace them with dots.
        city = response['results'][0]['formatted_address'].replace(',', '.')
    except (requests.RequestException, LookupError, ValueError):
        city = 'Error'
    cached_locs[key_for(lat, lng)] = city
    return city


def parse_foursquare_date(date_str):
    # e.g. 'Fri, 12 Aug 16 14:01:00 +0000' -> naive UTC datetime
    dt, tz = date_str.rsplit(' ', 1)
    dt_obj = datetime.strptime(dt, '%a, %d %b %y %H:%M:%S')
    tz_delta = timedelta(hours=int(tz[1:3]), minutes=int(tz[3:]))
    if tz[0] == '+':
        dt_obj -= tz_delta
    else:
        dt_obj += tz_delta
    return dt_obj


with open('inputs/filtered_locations.csv', 'rb') as csv_in, \
     open('inputs/foursquare.kml', 'rb') as kml_in, \
     open('outputs/output.csv', 'wb') as csv_out:
    c_out = csv.writer(csv_out)
    c_in = csv.reader(csv_in)

    for row in c_in:
        # latitudeE7/longitudeE7 are degrees * 1e7; timestampMs is epoch ms.
        lat = int(row[0]) / 10000000.0
        lng = int(row[1]) / 10000000.0
        timestamp = int(row[2])
        date = datetime.fromtimestamp(timestamp / 1000.0).strftime('%Y-%m-%d')
        city = city_for(lat, lng, cached_locs).encode('utf-8')
        if city == 'Error':
            continue

        c_out.writerow([timestamp, date, lat, lng, city, 'location_history', None])

    tree = etree.parse(kml_in)
    for placemark in tree.findall('.//Placemark'):
        name = placemark.find('name').text
        # KML coordinates come as 'lon,lat[,alt]'.
        coords = placemark.find('Point/coordinates').text.split(',')
        lng = float(coords[0])
        lat = float(coords[1])
        foursquare_date = parse_foursquare_date(placemark.find('published').text)
        timestamp = int((foursquare_date - datetime(1970, 1, 1)).total_seconds() * 1000)
        date = foursquare_date.strftime('%Y-%m-%d')
        city = city_for(lat, lng, cached_locs).encode('utf-8')
        if city == 'Error':
            continue
        c_out.writerow([timestamp, date, lat, lng, city, 'foursquare', name])
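# Usage notes (inferred from the hard-coded paths above):
# - needs a Google Maps Geocoding API key in API_KEY.secret.txt
# - reads inputs/filtered_locations.csv (produced by the README pipeline) and
#   inputs/foursquare.kml (a Foursquare check-in export)
# - writes outputs/output.csv with columns:
#   timestamp, date, lat, lng, locality, source, name


--------------------------------------------------------------------------------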