├── .github ├── FUNDING.yml └── ISSUE_TEMPLATE │ └── data_source.md ├── .gitignore ├── .sqliterc ├── LICENSE.txt ├── README.md ├── account.go ├── cmd └── timeliner │ └── main.go ├── datasource.go ├── datasources ├── facebook │ ├── facebook.go │ ├── media.go │ └── post.go ├── googlelocation │ └── googlelocation.go ├── googlephotos │ ├── googlephotos.go │ ├── media.go │ └── takeoutarchive.go ├── instagram │ ├── instagram.go │ └── models.go ├── smsbackuprestore │ ├── mms.go │ ├── sms.go │ └── smsbackuprestore.go └── twitter │ ├── api.go │ ├── api_test.go │ ├── archives.go │ ├── models.go │ └── twitter.go ├── db.go ├── go.mod ├── go.sum ├── itemfiles.go ├── itemgraph.go ├── mapmutex.go ├── oauth2.go ├── oauth2client ├── browser.go ├── localapp.go ├── oauth2.go ├── oauth2proxy │ ├── cmd │ │ └── oauth2proxy │ │ │ └── main.go │ └── proxy.go └── remoteapp.go ├── persons.go ├── processing.go ├── ratelimit.go ├── timeliner.go └── wrappedclient.go /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [mholt] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/data_source.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: New data source request 3 | about: Request a new data source 4 | title: '' 5 | labels: 'data source' 6 | assignees: '' 7 | 8 | --- 9 | 10 | <!-- 11 | This template is specifically for requesting the addition of a new data source (a way to add items to the timeline). Please answer all the questions as completely as possible. Put some effort into it since any implementation is going to require even more effort. If questions are not answered sufficiently, the issue may be closed. 12 | 13 | PLEASE NOTE: This project is a community effort. We hope that after posting this issue, you will take the time to implement it and submit a pull request for everyone to use! 14 | --> 15 | 16 | ## 1. What is the data source you want to add? 17 | <!-- Please give the data source's name and website and explain why it would be useful to have its content on a personal timeline. --> 18 | 19 | 20 | 21 | 22 | ## 2. What constitutes an "item" from this data source? 23 | <!-- An item is an entry on the timeline. Some data sources have multiple things that are "items" - for example: photos, blog posts, or text messages can all be items. An item must make sense to put on a timeline, and items must have unique IDs. --> 24 | 25 | 26 | 27 | 28 | ## 3. How are items obtained from the data source? 29 | <!-- Is there a free API that allows us to get content from this source? If so, what authentication is required? Or do we have to manually import data from a file? Please describe the process in detail and link to documentation! --> 30 | 31 | 32 | 33 | 34 | ### 3a. If authentication is required, how does a user create or obtain credentials for Timeliner to access the data? 35 | <!-- For example, APIs that use OAuth usually require creating an app or client with the service provider before their APIs can be accessed. What is that process? 
We will need to add this to the wiki for others to know how to get set up, so be clear and list the steps here. Check our project wiki first, because it might already be implemented (for example, Google OAuth is already in place.) --> 36 | 37 | 38 | 39 | 40 | ### 3b. If an API is available, what are its rate limits? 41 | <!-- Please link to rate limit documentation as well. --> 42 | 43 | 44 | 45 | 46 | ### 3c. If a file is imported, how is the file obtained? 47 | <!-- What is the process a user must go through to obtain the specific file that the data source is designed to import from? --> 48 | 49 | 50 | 51 | 52 | ### 3d. If a file is imported, how do we read the file? 53 | <!-- Is the file a compressed archive? How do we get the items out? Is the content and metadata separate? Please link to any documentation or provide a sample file. --> 54 | 55 | 56 | 57 | 58 | ## 4. How can items from this data source be related? 59 | <!-- Often, items form relationships with other items; for example, an item might be a reply to another item, or an item might contain another item. Think of relationships as uni-or-bi-directional arrows between items, with a label on the arrow. Relationships enrich the data obtained from this source. What kinds of useful relationships can be expressed from this data source? Do the relationships work both ways or just one way? Talk about this. --> 60 | 61 | 62 | 63 | 64 | ## 5. What constitutes a "collection" from this data source? 65 | <!-- A collection is a group of items (like a photo album). Note that collections are different from item relationships. Some data sources don't have collections; please explain. --> 66 | 67 | 68 | 69 | 70 | ## 6. What might not be trivial, obvious, or straightforward when implementing this data source? 71 | <!-- Most data sources have nuances or caveats, some of which might not be obvious. Please think hard about this and use your experience with this data source to think of things that need special consideration. 
For example, a data source might only allow the most recent items to be obtained; how could we overcome that, maybe via a data export? See our wiki for "Writing a Data Source" to get ideas about what might be tricky. Ask unanswered questions here, start a discussion. Data sources can't be implemented successfully until these details are figured out. --> 72 | 73 | 74 | 75 | 76 | ## Bonus: How do you like Timeliner? How much data are you preserving with it? Which existing data sources do you use? 77 | <!-- I want to know! --> 78 | 79 | 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _gitignore/ 2 | _storage/ 3 | oauth2client/oauth2proxy/cmd/oauth2proxy/credentials.toml 4 | **/timeliner.toml 5 | cmd/timeliner/timeliner -------------------------------------------------------------------------------- /.sqliterc: -------------------------------------------------------------------------------- 1 | PRAGMA foreign_keys = ON; 2 | -------------------------------------------------------------------------------- /account.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "fmt" 7 | "net/http" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | // Account represents an account with a service. 13 | type Account struct { 14 | ID int64 15 | DataSourceID string 16 | UserID string 17 | person Person 18 | authorization []byte 19 | checkpoint []byte 20 | lastItemID *int64 21 | 22 | t *Timeline 23 | ds DataSource 24 | cp *checkpointWrapper 25 | } 26 | 27 | // NewHTTPClient returns an HTTP client that is suitable for use 28 | // with an API associated with the account's data source. If 29 | // OAuth2 is configured for the data source, the client has OAuth2 30 | // credentials. If a rate limit is configured, this client is 31 | // rate limited. 
A sane default timeout is set, and any fields 32 | // on the returned Client valule can be modified as needed. 33 | func (acc Account) NewHTTPClient() (*http.Client, error) { 34 | httpClient := new(http.Client) 35 | if acc.ds.OAuth2.ProviderID != "" { 36 | var err error 37 | httpClient, err = acc.NewOAuth2HTTPClient() 38 | if err != nil { 39 | return nil, err 40 | } 41 | } 42 | if acc.ds.RateLimit.RequestsPerHour > 0 { 43 | httpClient.Transport = acc.NewRateLimitedRoundTripper(httpClient.Transport) 44 | } 45 | httpClient.Timeout = 60 * time.Second 46 | return httpClient, nil 47 | } 48 | 49 | func (acc Account) String() string { 50 | return acc.DataSourceID + "/" + acc.UserID 51 | } 52 | 53 | // AddAccount authenticates userID with the service identified 54 | // within the application by dataSourceID, and then stores it in the 55 | // database. The account must not yet exist. 56 | func (t *Timeline) AddAccount(dataSourceID, userID string) error { 57 | // ensure account is not already stored in our system 58 | var count int 59 | err := t.db.QueryRow(`SELECT COUNT(*) FROM accounts WHERE data_source_id=? AND user_id=? LIMIT 1`, 60 | dataSourceID, userID).Scan(&count) 61 | if err != nil { 62 | return fmt.Errorf("checking if account is already stored: %v", err) 63 | } 64 | if count > 0 { 65 | return fmt.Errorf("account already stored in database: %s/%s", dataSourceID, userID) 66 | } 67 | 68 | return t.Authenticate(dataSourceID, userID) 69 | } 70 | 71 | // Authenticate gets authentication for userID with dataSourceID. If the 72 | // account already exists in the database, it will be updated with the 73 | // latest authorization. 
74 | func (t *Timeline) Authenticate(dataSourceID, userID string) error { 75 | ds, ok := dataSources[dataSourceID] 76 | if !ok { 77 | return fmt.Errorf("data source not registered: %s", dataSourceID) 78 | } 79 | 80 | // authenticate with the data source (if necessary) 81 | var credsBytes []byte 82 | var err error 83 | if authFn := ds.authFunc(); authFn != nil { 84 | credsBytes, err = authFn(userID) 85 | if err != nil { 86 | return fmt.Errorf("authenticating %s for %s: %v", userID, dataSourceID, err) 87 | } 88 | } 89 | 90 | // make sure the data source is registered in the DB 91 | _, err = t.db.Exec(`INSERT OR IGNORE INTO data_sources (id, name) VALUES (?, ?)`, 92 | dataSourceID, ds.Name) 93 | if err != nil { 94 | return fmt.Errorf("saving data source record: %v", err) 95 | } 96 | 97 | // store the account along with our authorization to access it 98 | _, err = t.db.Exec(`INSERT INTO accounts 99 | (data_source_id, user_id, authorization) 100 | VALUES (?, ?, ?) 101 | ON CONFLICT (data_source_id, user_id) 102 | DO UPDATE SET authorization=?`, 103 | dataSourceID, userID, credsBytes, 104 | credsBytes) 105 | if err != nil { 106 | return fmt.Errorf("inserting into or updating DB: %v", err) 107 | } 108 | 109 | return nil 110 | } 111 | 112 | // NewClient returns a new Client that is ready to interact with 113 | // the data source for the account uniquely specified by the data 114 | // source ID and the user ID for that data source. The Client is 115 | // actually wrapped by a type with unexported fields that are 116 | // necessary for internal use. 
117 | func (t *Timeline) NewClient(dataSourceID, userID string) (WrappedClient, error) { 118 | ds, ok := dataSources[dataSourceID] 119 | if !ok { 120 | return WrappedClient{}, fmt.Errorf("data source not registered: %s", dataSourceID) 121 | } 122 | if ds.NewClient == nil { 123 | return WrappedClient{}, fmt.Errorf("impossible to make client for data source: %s", dataSourceID) 124 | } 125 | 126 | acc, err := t.getAccount(dataSourceID, userID) 127 | if err != nil { 128 | return WrappedClient{}, fmt.Errorf("getting account: %v", err) 129 | } 130 | 131 | cl, err := ds.NewClient(acc) 132 | if err != nil { 133 | return WrappedClient{}, fmt.Errorf("making client from data source: %v", err) 134 | } 135 | 136 | return WrappedClient{ 137 | Client: cl, 138 | tl: t, 139 | acc: acc, 140 | ds: ds, 141 | lastItemMu: new(sync.Mutex), 142 | }, nil 143 | } 144 | 145 | func (t *Timeline) getAccount(dsID, userID string) (Account, error) { 146 | ds, ok := dataSources[dsID] 147 | if !ok { 148 | return Account{}, fmt.Errorf("data source not registered: %s", dsID) 149 | } 150 | acc := Account{ 151 | ds: ds, 152 | t: t, 153 | } 154 | err := t.db.QueryRow(`SELECT 155 | id, data_source_id, user_id, authorization, checkpoint, last_item_id 156 | FROM accounts WHERE data_source_id=? AND user_id=? LIMIT 1`, 157 | dsID, userID).Scan(&acc.ID, &acc.DataSourceID, &acc.UserID, &acc.authorization, &acc.checkpoint, &acc.lastItemID) 158 | if err != nil { 159 | return acc, fmt.Errorf("querying account %s/%s from DB: %v", dsID, userID, err) 160 | } 161 | if acc.checkpoint != nil { 162 | err = UnmarshalGob(acc.checkpoint, &acc.cp) 163 | if err != nil { 164 | return acc, fmt.Errorf("decoding checkpoint wrapper: %v", err) 165 | } 166 | } 167 | return acc, nil 168 | } 169 | 170 | // MarshalGob is a convenient way to gob-encode v. 
171 | func MarshalGob(v interface{}) ([]byte, error) { 172 | b := new(bytes.Buffer) 173 | err := gob.NewEncoder(b).Encode(v) 174 | return b.Bytes(), err 175 | } 176 | 177 | // UnmarshalGob is a convenient way to gob-decode data into v. 178 | func UnmarshalGob(data []byte, v interface{}) error { 179 | return gob.NewDecoder(bytes.NewReader(data)).Decode(v) 180 | } 181 | -------------------------------------------------------------------------------- /cmd/timeliner/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/BurntSushi/toml" 14 | "github.com/mholt/timeliner" 15 | "github.com/mholt/timeliner/oauth2client" 16 | "golang.org/x/oauth2" 17 | 18 | // plug in data sources 19 | _ "github.com/mholt/timeliner/datasources/facebook" 20 | _ "github.com/mholt/timeliner/datasources/googlelocation" 21 | _ "github.com/mholt/timeliner/datasources/googlephotos" 22 | _ "github.com/mholt/timeliner/datasources/instagram" 23 | "github.com/mholt/timeliner/datasources/smsbackuprestore" 24 | "github.com/mholt/timeliner/datasources/twitter" 25 | ) 26 | 27 | func init() { 28 | flag.StringVar(&configFile, "config", configFile, "The path to the config file to load") 29 | flag.StringVar(&repoDir, "repo", repoDir, "The path to the folder of the repository") 30 | flag.IntVar(&maxRetries, "max-retries", maxRetries, "If > 0, will retry on failure at most this many times") 31 | flag.DurationVar(&retryAfter, "retry-after", retryAfter, "If > 0, will wait this long between retries") 32 | flag.BoolVar(&verbose, "v", verbose, "Verbose output (can be very slow if data source isn't bottlenecked by network)") 33 | 34 | flag.BoolVar(&prune, "prune", prune, "When finishing, delete items not found on remote (download-all or import only)") 35 | flag.BoolVar(&integrity, "integrity", integrity, "Perform integrity check on 
existing items and reprocess if needed (download-all or import only)")
	flag.BoolVar(&reprocess, "reprocess", reprocess, "Reprocess every item that has not been modified locally (download-all or import only)")
	flag.StringVar(&merge, "merge", merge, "Comma-separated list of merge options: soft (required, enables 'soft' merging on: account+timestamp+text or filename), and values to overwrite: id,text,file,meta")

	flag.StringVar(&tfStartInput, "start", "", "Timeframe start (relative=duration, absolute=YYYY/MM/DD)")
	flag.StringVar(&tfEndInput, "end", "", "Timeframe end (relative=duration, absolute=YYYY/MM/DD)")

	flag.BoolVar(&twitterRetweets, "twitter-retweets", twitterRetweets, "Twitter: include retweets")
	flag.BoolVar(&twitterReplies, "twitter-replies", twitterReplies, "Twitter: include replies that are not just replies to self")

	flag.StringVar(&phoneDefaultRegion, "phone-default-region", phoneDefaultRegion, "SMS Backup & Restore: default region")
}

// main parses the CLI, builds processing options and a client per
// account, then dispatches to the requested subcommand.
func main() {
	flag.Parse()

	if maxRetries < 0 {
		maxRetries = 0
	}

	// split the CLI arguments into subcommand and arguments
	args := flag.Args()
	if len(args) == 0 {
		log.Fatal("[FATAL] Missing subcommand and account arguments (specify one or more of 'data_source_id/user_id')")
	}
	subcmd := args[0]
	accountList := args[1:]
	if subcmd == "import" {
		// special case; import takes an extra argument before account list
		if len(args) != 3 {
			log.Fatal("[FATAL] Expecting: import <filename> <data_source_id/user_id>")
		}
		accountList = args[2:]
		// BUGFIX: previously checked len(args) == 0, which is always false
		// here (args has exactly 3 elements); the intent is to verify an
		// account argument was actually captured.
		if len(accountList) == 0 {
			log.Fatal("[FATAL] No accounts to use (specify one or more 'data_source_id/user_id' arguments)")
		}
	}

	// load the command config
	err := loadConfig()
	if err != nil {
		log.Fatalf("[FATAL] Loading configuration: %v", err)
	}

	// parse the accounts out of the CLI
	accounts, err := getAccounts(accountList)
	if err != nil {
		log.Fatalf("[FATAL] %v", err)
	}
	if len(accounts) == 0 {
		log.Fatalf("[FATAL] No accounts specified")
	}

	// open the timeline
	tl, err := timeliner.Open(repoDir)
	if err != nil {
		log.Fatalf("[FATAL] Opening timeline: %v", err)
	}
	defer tl.Close()

	// as a special case, handle authentication subcommands separately
	switch subcmd {
	case "add-account":
		for _, a := range accounts {
			err := tl.AddAccount(a.dataSourceID, a.userID)
			if err != nil {
				log.Fatalf("[FATAL] Adding account: %v", err)
			}
		}
		return
	case "reauth":
		for _, a := range accounts {
			err := tl.Authenticate(a.dataSourceID, a.userID)
			if err != nil {
				log.Fatalf("[FATAL] Authenticating: %v", err)
			}
		}
		return
	}

	// get the timeframe within which to constrain item processing (multiple commands use this)
	tf, err := parseTimeframe()
	if err != nil {
		log.Fatalf("[FATAL] %v", err)
	}

	// make the processing options
	var mergeOpt timeliner.MergeOptions
	mergeOptVals := strings.Split(merge, ",")
	for _, val := range mergeOptVals {
		switch val {
		case "":
		case "soft":
			mergeOpt.SoftMerge = true
		case "id":
			mergeOpt.PreferNewID = true
		case "text":
			mergeOpt.PreferNewDataText = true
		case "file":
			mergeOpt.PreferNewDataFile = true
		case "meta":
			mergeOpt.PreferNewMetadata = true
		default:
			log.Fatalf("[FATAL] Unrecognized merge option: '%s'", val)
		}
	}
	if !mergeOpt.SoftMerge && (mergeOpt.PreferNewID || mergeOpt.PreferNewDataText || mergeOpt.PreferNewDataFile || mergeOpt.PreferNewMetadata) {
		// for now, the only kind of merging is "soft" merging, so if it is not enabled but other merge options are set, that's probably a user error
		log.Fatal("[FATAL] Merge options are specified but merging is not enabled (-merge=soft); only soft merging is implemented")
	}
	procOpt := timeliner.ProcessingOptions{
		Reprocess: reprocess,
		Prune:     prune,
		Integrity: integrity,
		Timeframe: tf,
		Merge:     mergeOpt,
		Verbose:   verbose,
	}

	// make a client for each account
	var clients []timeliner.WrappedClient
	for _, a := range accounts {
		wc, err := tl.NewClient(a.dataSourceID, a.userID)
		if err != nil {
			log.Fatalf("[FATAL][%s/%s] Creating data source client: %v", a.dataSourceID, a.userID, err)
		}

		// configure the client (TODO: this is not good design; should happen in their own packages)
		switch v := wc.Client.(type) {
		case *twitter.Client:
			v.Retweets = twitterRetweets
			v.Replies = twitterReplies
		case *smsbackuprestore.Client:
			v.DefaultRegion = phoneDefaultRegion
		}

		clients = append(clients, wc)
	}

	switch subcmd {
	case "get-latest":
		if procOpt.Reprocess || procOpt.Prune || procOpt.Integrity || procOpt.Timeframe.Since != nil {
			log.Fatalf("[FATAL] The get-latest subcommand does not support -reprocess, -prune, -integrity, or -start")
		}

		var wg sync.WaitGroup
		for _, wc := range clients {
			wg.Add(1)
			go func(wc timeliner.WrappedClient) {
				defer wg.Done()
				ctx, cancel := context.WithCancel(context.Background())
				for retryNum := 0; retryNum < 1+maxRetries; retryNum++ {
					if retryNum > 0 {
						log.Println("[INFO] Retrying command")
					}
					err := wc.GetLatest(ctx, procOpt)
					if err != nil {
						log.Printf("[ERROR][%s/%s] Getting latest: %v",
							wc.DataSourceID(), wc.UserID(), err)
						if retryAfter > 0 {
							time.Sleep(retryAfter)
						}
						continue
					}
					break
				}
				defer cancel() // TODO: Make this useful, maybe?
			}(wc)
		}
		wg.Wait()

	case "get-all":
		var wg sync.WaitGroup
		for _, wc := range clients {
			wg.Add(1)
			go func(wc timeliner.WrappedClient) {
				defer wg.Done()
				ctx, cancel := context.WithCancel(context.Background())
				for retryNum := 0; retryNum < 1+maxRetries; retryNum++ {
					if retryNum > 0 {
						log.Println("[INFO] Retrying command")
					}
					err := wc.GetAll(ctx, procOpt)
					if err != nil {
						log.Printf("[ERROR][%s/%s] Downloading all: %v",
							wc.DataSourceID(), wc.UserID(), err)
						if retryAfter > 0 {
							time.Sleep(retryAfter)
						}
						continue
					}
					break
				}
				defer cancel() // TODO: Make this useful, maybe?
			}(wc)
		}
		wg.Wait()

	case "import":
		file := args[1]
		wc := clients[0]

		ctx, cancel := context.WithCancel(context.Background())
		err = wc.Import(ctx, file, procOpt)
		if err != nil {
			log.Printf("[ERROR][%s/%s] Importing: %v",
				wc.DataSourceID(), wc.UserID(), err)
		}
		defer cancel() // TODO: Make this useful, maybe?

	default:
		log.Fatalf("[FATAL] Unrecognized subcommand: %s", subcmd)
	}
}

// parseTimeframe parses tfStartInput and/or tfEndInput and returns
// the resulting timeframe or an error.
252 | func parseTimeframe() (timeliner.Timeframe, error) { 253 | var tf timeliner.Timeframe 254 | var timeStart, timeEnd time.Time 255 | 256 | if tfStartInput != "" { 257 | tfStartRel, err := time.ParseDuration(tfStartInput) 258 | if err == nil { 259 | timeStart = time.Now().Add(tfStartRel) 260 | } else { 261 | timeStart, err = time.Parse(dateFormat, tfStartInput) 262 | if err != nil { 263 | return tf, fmt.Errorf("bad timeframe start value '%s': %v", tfStartInput, err) 264 | } 265 | } 266 | tf.Since = &timeStart 267 | } 268 | 269 | if tfEndInput != "" { 270 | tfEndRel, err := time.ParseDuration(tfEndInput) 271 | if err == nil { 272 | timeEnd = time.Now().Add(tfEndRel) 273 | } else { 274 | timeEnd, err = time.Parse(dateFormat, tfEndInput) 275 | if err != nil { 276 | return tf, fmt.Errorf("bad timeframe end value '%s': %v", tfEndInput, err) 277 | } 278 | } 279 | tf.Until = &timeEnd 280 | } 281 | 282 | if tf.Since != nil && tf.Until != nil && tf.Until.Before(*tf.Since) { 283 | return tf, fmt.Errorf("end time must be after start time (start=%s end=%s)", tf.Since, tf.Until) 284 | } 285 | 286 | return tf, nil 287 | } 288 | 289 | func loadConfig() error { 290 | // no config file is allowed, but that might be useless 291 | _, err := os.Stat(configFile) 292 | if os.IsNotExist(err) { 293 | return nil 294 | } 295 | 296 | var cmdConfig commandConfig 297 | md, err := toml.DecodeFile(configFile, &cmdConfig) 298 | if err != nil { 299 | return fmt.Errorf("decoding config file: %v", err) 300 | } 301 | if len(md.Undecoded()) > 0 { 302 | return fmt.Errorf("unrecognized key(s) in config file: %+v", md.Undecoded()) 303 | } 304 | 305 | // convert them into oauth2.Configs (the structure of 306 | // oauth2.Config as TOML is too verbose for my taste) 307 | // (important to not be pointer values, since the 308 | // oauth2.Configs need to be copied and changed for 309 | // each token source that is created) 310 | oauth2Configs := make(map[string]oauth2.Config) 311 | for id, prov := range 
cmdConfig.OAuth2.Providers { 312 | if prov.RedirectURL == "" { 313 | prov.RedirectURL = oauth2client.DefaultRedirectURL 314 | } 315 | oauth2Configs[id] = oauth2.Config{ 316 | ClientID: prov.ClientID, 317 | ClientSecret: prov.ClientSecret, 318 | RedirectURL: prov.RedirectURL, 319 | Endpoint: oauth2.Endpoint{ 320 | AuthURL: prov.AuthURL, 321 | TokenURL: prov.TokenURL, 322 | }, 323 | } 324 | } 325 | 326 | // TODO: Should this be passed into timeliner.Open() instead? 327 | timeliner.OAuth2AppSource = func(providerID string, scopes []string) (oauth2client.App, error) { 328 | cfg, ok := oauth2Configs[providerID] 329 | if !ok { 330 | return nil, fmt.Errorf("unsupported provider: %s", providerID) 331 | } 332 | cfg.Scopes = scopes 333 | return oauth2client.LocalAppSource{OAuth2Config: &cfg}, nil 334 | } 335 | 336 | return nil 337 | } 338 | 339 | func getAccounts(args []string) ([]accountInfo, error) { 340 | var accts []accountInfo 341 | for _, a := range args { 342 | parts := strings.SplitN(a, "/", 2) 343 | if len(parts) < 2 { 344 | return nil, fmt.Errorf("malformed account identifier '%s': expecting '<data_source>/<account>' format", a) 345 | } 346 | accts = append(accts, accountInfo{ 347 | dataSourceID: parts[0], 348 | userID: parts[1], 349 | }) 350 | } 351 | return accts, nil 352 | } 353 | 354 | type accountInfo struct { 355 | dataSourceID string 356 | userID string 357 | } 358 | 359 | type commandConfig struct { 360 | OAuth2 oauth2Config `toml:"oauth2"` 361 | } 362 | 363 | type oauth2Config struct { 364 | Providers map[string]oauth2ProviderConfig `toml:"providers"` 365 | } 366 | 367 | type oauth2ProviderConfig struct { 368 | ClientID string `toml:"client_id"` 369 | ClientSecret string `toml:"client_secret"` 370 | RedirectURL string `toml:"redirect_url"` 371 | AuthURL string `toml:"auth_url"` 372 | TokenURL string `toml:"token_url"` 373 | } 374 | 375 | var ( 376 | repoDir = "./timeliner_repo" 377 | configFile = "timeliner.toml" 378 | maxRetries int 379 | retryAfter 
time.Duration 380 | verbose bool 381 | 382 | integrity bool 383 | prune bool 384 | reprocess bool 385 | merge string 386 | 387 | tfStartInput, tfEndInput string 388 | 389 | twitterRetweets bool 390 | twitterReplies bool 391 | 392 | phoneDefaultRegion string = "US" 393 | ) 394 | 395 | const dateFormat = "2006/01/02" // YYYY/MM/DD 396 | -------------------------------------------------------------------------------- /datasource.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "database/sql" 7 | "encoding/gob" 8 | "fmt" 9 | "log" 10 | "time" 11 | ) 12 | 13 | func init() { 14 | tdBuf := new(bytes.Buffer) 15 | err := gob.NewEncoder(tdBuf).Encode(Metadata{}) 16 | if err != nil { 17 | log.Fatalf("[FATAL] Unable to gob-encode metadata struct: %v", err) 18 | } 19 | metadataGobPrefix = tdBuf.Bytes() 20 | } 21 | 22 | // RegisterDataSource registers ds as a data source. 23 | func RegisterDataSource(ds DataSource) error { 24 | if ds.ID == "" { 25 | return fmt.Errorf("missing ID") 26 | } 27 | if ds.Name == "" { 28 | return fmt.Errorf("missing Name") 29 | } 30 | if ds.OAuth2.ProviderID != "" && ds.Authenticate != nil { 31 | return fmt.Errorf("conflicting ways of obtaining authorization") 32 | } 33 | 34 | // register the data source 35 | if _, ok := dataSources[ds.ID]; ok { 36 | return fmt.Errorf("data source already registered: %s", ds.ID) 37 | } 38 | dataSources[ds.ID] = ds 39 | 40 | return nil 41 | } 42 | 43 | func saveAllDataSources(db *sql.DB) error { 44 | if len(dataSources) == 0 { 45 | return nil 46 | } 47 | 48 | query := `INSERT INTO "data_sources" ("id", "name") VALUES` 49 | var vals []interface{} 50 | var count int 51 | 52 | for _, ds := range dataSources { 53 | if count > 0 { 54 | query += "," 55 | } 56 | query += " (?, ?)" 57 | vals = append(vals, ds.ID, ds.Name) 58 | count++ 59 | } 60 | 61 | query += " ON CONFLICT DO NOTHING" 62 | 63 | _, err := db.Exec(query, 
vals...) 64 | if err != nil { 65 | return fmt.Errorf("writing data sources to DB: %v", err) 66 | } 67 | 68 | return nil 69 | } 70 | 71 | // DataSource has information about a 72 | // data source that can be registered. 73 | type DataSource struct { 74 | // A snake_cased name of the service 75 | // that uniquely identifies it from 76 | // all others. 77 | ID string 78 | 79 | // The human-readable or brand name of 80 | // the service. 81 | Name string 82 | 83 | // If the service authenticates with 84 | // OAuth2, fill out this field. 85 | OAuth2 OAuth2 86 | 87 | // Otherwise, if the service uses some 88 | // other form of authentication, 89 | // Authenticate is a function which 90 | // returns the credentials needed to 91 | // access an account on the service. 92 | Authenticate AuthenticateFn 93 | 94 | // If the service enforces a rate limit, 95 | // specify it here. You can abide it by 96 | // getting an http.Client from the 97 | // Account passed into NewClient. 98 | RateLimit RateLimit 99 | 100 | // NewClient is a function which takes 101 | // information about the account and 102 | // returns a type which can facilitate 103 | // transactions with the service. 104 | NewClient NewClientFn 105 | } 106 | 107 | // authFunc gets the authentication function for this 108 | // service. If s.Authenticate is set, it returns that; 109 | // if s.OAuth2 is set, it uses a standard OAuth2 func. 110 | func (ds DataSource) authFunc() AuthenticateFn { 111 | if ds.Authenticate != nil { 112 | return ds.Authenticate 113 | } else if ds.OAuth2.ProviderID != "" { 114 | return func(userID string) ([]byte, error) { 115 | return authorizeWithOAuth2(ds.OAuth2) 116 | } 117 | } 118 | return nil 119 | } 120 | 121 | // OAuth2 defines which OAuth2 provider a service 122 | // uses and which scopes it requires. 123 | type OAuth2 struct { 124 | // The ID of the service must be recognized 125 | // by the OAuth2 app configuration. 
126 | ProviderID string 127 | 128 | // The list of scopes to ask for during auth. 129 | Scopes []string 130 | } 131 | 132 | // AuthenticateFn is a function that authenticates userID with a service. 133 | // It returns the authorization or credentials needed to operate. The return 134 | // value should be byte-encoded so it can be stored in the DB to be reused. 135 | // To store arbitrary types, encode the value as a gob, for example. 136 | type AuthenticateFn func(userID string) ([]byte, error) 137 | 138 | // NewClientFn is a function that returns a client which, given 139 | // the account passed in, can interact with a service provider. 140 | type NewClientFn func(acc Account) (Client, error) 141 | 142 | // Client is a type that can interact with a data source. 143 | type Client interface { 144 | // ListItems lists the items on the account. Items should be 145 | // sent on itemChan as they are discovered, but related items 146 | // should be combined onto a single ItemGraph so that their 147 | // relationships can be stored. If the relationships are not 148 | // discovered until later, that's OK: item processing is 149 | // idempotent, so repeating an item from earlier will have no 150 | // adverse effects (this is possible because a unique ID is 151 | // required for each item). 152 | // 153 | // Implementations must honor the context's cancellation. If 154 | // ctx.Done() is closed, the function should return. Typically, 155 | // this is done by having an outer loop select over ctx.Done() 156 | // and default, where the next page or set of items is handled 157 | // in the default case. 158 | // 159 | // ListItems MUST close itemChan when returning. A 160 | // `defer close(itemChan)` will usually suffice. Closing 161 | // this channel signals to the processing goroutine that 162 | // no more items are coming. 163 | // 164 | // Further options for listing items may be passed in opt. 
165 | // 166 | // If opt.Filename is specified, the implementation is expected 167 | // to open and list items from that file. If this is not 168 | // supported, an error should be returned. Conversely, if a 169 | // filename is not specified but required, an error should be 170 | // returned. 171 | // 172 | // opt.Timeframe consists of two optional timestamp and/or item 173 | // ID values. If set, item listings should be bounded in the 174 | // respective direction by that timestamp / item ID. (Items 175 | // are assumed to be part of a chronology; both timestamp and 176 | // item ID *may be* provided, when possible, to accommodate 177 | // data sources which do not constrain by timestamp but which 178 | // do by item ID instead.) The respective time and item ID 179 | // fields, if set, will not be in conflict, so either may be 180 | // used if both are present. While it should be documented if 181 | // timeframes are not supported, an error need not be returned 182 | // if they cannot be honored. 183 | // 184 | // opt.Checkpoint consists of the last checkpoint for this 185 | // account if the last call to ListItems did not finish and 186 | // if a checkpoint was saved. If not nil, the checkpoint 187 | // should be used to resume the listing instead of starting 188 | // over from the beginning. Checkpoint values usually consist 189 | // of page tokens or whatever state is required to resume. Call 190 | // timeliner.Checkpoint to set a checkpoint. Checkpoints are not 191 | // required, but if the implementation sets checkpoints, it 192 | // should be able to resume from one, too. 193 | ListItems(ctx context.Context, itemChan chan<- *ItemGraph, opt ListingOptions) error 194 | } 195 | 196 | // Timeframe represents a start and end time and/or 197 | // a start and end item, where either value could be 198 | // nil which means unbounded in that direction. 199 | // When items are used as the timeframe boundaries, 200 | // the ItemID fields will be populated. 
It is not 201 | // guaranteed that any particular field will be set 202 | // or unset just because other fields are set or unset. 203 | // However, if both Since or both Until fields are 204 | // set, that means the timestamp and items are 205 | // correlated; i.e. the Since timestamp is (approx.) 206 | // that of the item ID. Or, put another way: there 207 | // will never be conflicts among the fields which 208 | // are non-nil. 209 | type Timeframe struct { 210 | Since, Until *time.Time 211 | SinceItemID, UntilItemID *string 212 | } 213 | 214 | func (tf Timeframe) String() string { 215 | var sinceItemID, untilItemID string 216 | if tf.SinceItemID != nil { 217 | sinceItemID = *tf.SinceItemID 218 | } 219 | if tf.UntilItemID != nil { 220 | untilItemID = *tf.UntilItemID 221 | } 222 | return fmt.Sprintf("{Since:%s Until:%s SinceItemID:%s UntilItemID:%s}", 223 | tf.Since, tf.Until, sinceItemID, untilItemID) 224 | } 225 | 226 | var dataSources = make(map[string]DataSource) 227 | -------------------------------------------------------------------------------- /datasources/facebook/media.go: -------------------------------------------------------------------------------- 1 | package facebook 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "mime" 8 | "net/http" 9 | "net/url" 10 | "path" 11 | "time" 12 | 13 | "github.com/mholt/timeliner" 14 | ) 15 | 16 | type fbMediaPage struct { 17 | Data []fbMedia `json:"data"` 18 | Paging fbPaging `json:"paging"` 19 | } 20 | 21 | // fbMedia is used for videos, photos, and albums. 
22 | type fbMedia struct { 23 | Album fbAlbum `json:"album,omitempty"` 24 | BackdatedTime string `json:"backdated_time,omitempty"` 25 | CreatedTime string `json:"created_time,omitempty"` 26 | From fbFrom `json:"from,omitempty"` 27 | Images []fbImage `json:"images,omitempty"` 28 | UpdatedTime string `json:"updated_time,omitempty"` 29 | Description string `json:"description,omitempty"` 30 | Length float64 `json:"length,omitempty"` // in seconds 31 | Message string `json:"message,omitempty"` 32 | Name string `json:"name,omitempty"` 33 | Place *fbPlace `json:"place,omitempty"` 34 | Photos *fbMediaPage `json:"photos,omitempty"` 35 | Source string `json:"source,omitempty"` 36 | Status fbVideoStatus `json:"status,omitempty"` 37 | MediaID string `json:"id,omitempty"` 38 | 39 | // these fields added by us and used internally 40 | mediaType string 41 | bestSourceURL string 42 | bestSourceFilename string 43 | exifData map[string]interface{} 44 | } 45 | 46 | func (m *fbMedia) fillFields(mediaType string) { 47 | m.mediaType = mediaType 48 | 49 | // get URL to actual media content; we'll need 50 | // it later, and by doing this now, we only have 51 | // to do it once 52 | switch mediaType { 53 | case "photo": 54 | _, _, m.bestSourceURL = m.getLargestImage() 55 | case "video": 56 | m.bestSourceURL = m.Source 57 | } 58 | if m.bestSourceURL != "" { 59 | sourceURL, err := url.Parse(m.bestSourceURL) 60 | if err != nil { 61 | // TODO: What to return in this case? return the error? 
62 | log.Printf("[ERROR] Parsing media source URL to get filename: %v", err) return // can't derive a filename from an unparseable URL; avoid nil-pointer deref below (bestSourceFilename stays empty, so DataFileName returns nil) 63 | } 64 | m.bestSourceFilename = path.Base(sourceURL.Path) 65 | } 66 | } 67 | 68 | func (m *fbMedia) ID() string { 69 | return m.MediaID 70 | } 71 | 72 | func (m *fbMedia) Timestamp() time.Time { 73 | if m.BackdatedTime != "" { 74 | return fbTimeToGoTime(m.BackdatedTime) 75 | } 76 | return fbTimeToGoTime(m.CreatedTime) 77 | } 78 | 79 | func (m *fbMedia) DataText() (*string, error) { 80 | if m.Description != "" { 81 | return &m.Description, nil 82 | } 83 | if m.Name != "" { 84 | return &m.Name, nil 85 | } 86 | return nil, nil 87 | } 88 | 89 | func (m *fbMedia) DataFileName() *string { 90 | if m.bestSourceFilename != "" { 91 | return &m.bestSourceFilename 92 | } 93 | return nil 94 | } 95 | 96 | func (m *fbMedia) DataFileHash() []byte { 97 | return nil 98 | } 99 | 100 | func (m *fbMedia) DataFileReader() (io.ReadCloser, error) { 101 | if m.bestSourceURL == "" { 102 | return nil, fmt.Errorf("no way to get data file: no best source URL") 103 | } 104 | 105 | resp, err := http.Get(m.bestSourceURL) 106 | if err != nil { 107 | return nil, fmt.Errorf("getting media contents: %v", err) 108 | } 109 | if resp.StatusCode != http.StatusOK { 110 | resp.Body.Close() 111 | return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) 112 | } 113 | 114 | return resp.Body, nil 115 | } 116 | 117 | func (m *fbMedia) DataFileMIMEType() *string { 118 | mt := mime.TypeByExtension(path.Ext(m.bestSourceFilename)) 119 | if mt != "" { 120 | return &mt 121 | } 122 | return nil 123 | } 124 | 125 | func (m *fbMedia) Owner() (*string, *string) { 126 | return &m.From.ID, &m.From.Name 127 | } 128 | 129 | func (m *fbMedia) Class() timeliner.ItemClass { 130 | switch m.mediaType { 131 | case "photo": 132 | return timeliner.ClassImage 133 | case "video": 134 | return timeliner.ClassVideo 135 | } 136 | return timeliner.ClassUnknown 137 | } 138 | 139 | func (m *fbMedia) Metadata() (*timeliner.Metadata, error) { 140 | //
TODO 141 | return nil, nil 142 | } 143 | 144 | func (m *fbMedia) getLargestImage() (height, width int, source string) { 145 | var largest int 146 | for _, im := range m.Images { 147 | size := im.Height * im.Width 148 | if size > largest { 149 | source = im.Source 150 | height = im.Height 151 | width = im.Width 152 | largest = size 153 | } 154 | } 155 | return 156 | } 157 | 158 | func (m *fbMedia) Location() (*timeliner.Location, error) { 159 | if m.Place != nil { 160 | return &timeliner.Location{ 161 | Latitude: &m.Place.Location.Latitude, 162 | Longitude: &m.Place.Location.Longitude, 163 | }, nil 164 | } 165 | return nil, nil 166 | } 167 | 168 | type fbVideoStatus struct { 169 | VideoStatus string `json:"video_status,omitempty"` 170 | } 171 | 172 | type fbAlbum struct { 173 | CreatedTime string `json:"created_time,omitempty"` 174 | Name string `json:"name,omitempty"` 175 | ID string `json:"id,omitempty"` 176 | Photos []fbMediaPage `json:"photos,omitempty"` 177 | } 178 | 179 | type fbImage struct { 180 | Height int `json:"height,omitempty"` 181 | Source string `json:"source,omitempty"` 182 | Width int `json:"width,omitempty"` 183 | } 184 | -------------------------------------------------------------------------------- /datasources/facebook/post.go: -------------------------------------------------------------------------------- 1 | package facebook 2 | 3 | import ( 4 | "io" 5 | "log" 6 | "time" 7 | 8 | "github.com/mholt/timeliner" 9 | ) 10 | 11 | type fbPost struct { 12 | Attachments fbPostAttachments `json:"attachments,omitempty"` 13 | BackdatedTime string `json:"backdated_time,omitempty"` 14 | CreatedTime string `json:"created_time,omitempty"` // example format: "2018-12-22T19:10:30+0000" 15 | From fbFrom `json:"from,omitempty"` 16 | Link string `json:"link,omitempty"` 17 | Description string `json:"description,omitempty"` 18 | Message string `json:"message,omitempty"` 19 | Name string `json:"name,omitempty"` 20 | ParentID string `json:"parent_id,omitempty"` 21 
| Place *fbPlace `json:"place,omitempty"` 22 | StatusType string `json:"status_type,omitempty"` 23 | Type string `json:"type,omitempty"` 24 | PostID string `json:"id,omitempty"` 25 | } 26 | 27 | func (p fbPost) ID() string { 28 | return p.PostID 29 | } 30 | 31 | func (p fbPost) Timestamp() time.Time { 32 | if p.BackdatedTime != "" { 33 | return fbTimeToGoTime(p.BackdatedTime) 34 | } 35 | return fbTimeToGoTime(p.CreatedTime) 36 | } 37 | 38 | func (p fbPost) DataText() (*string, error) { 39 | return &p.Message, nil 40 | } 41 | 42 | func (p fbPost) DataFileName() *string { 43 | return nil 44 | } 45 | 46 | func (p fbPost) DataFileReader() (io.ReadCloser, error) { 47 | return nil, nil 48 | } 49 | 50 | func (p fbPost) DataFileHash() []byte { 51 | return nil 52 | } 53 | 54 | func (p fbPost) DataFileMIMEType() *string { 55 | return nil 56 | } 57 | 58 | func (p fbPost) Owner() (*string, *string) { 59 | return &p.From.ID, &p.From.Name 60 | } 61 | 62 | func (p fbPost) Class() timeliner.ItemClass { 63 | return timeliner.ClassPost 64 | } 65 | 66 | func (p fbPost) Metadata() (*timeliner.Metadata, error) { 67 | return &timeliner.Metadata{ 68 | Link: p.Link, 69 | Description: p.Description, 70 | Name: p.Name, 71 | ParentID: p.ParentID, 72 | StatusType: p.StatusType, 73 | Type: p.Type, 74 | }, nil 75 | } 76 | 77 | func (p fbPost) Location() (*timeliner.Location, error) { 78 | if p.Place != nil { 79 | return &timeliner.Location{ 80 | Latitude: &p.Place.Location.Latitude, 81 | Longitude: &p.Place.Location.Longitude, 82 | }, nil 83 | } 84 | return nil, nil 85 | } 86 | 87 | type fbPostAttachments struct { 88 | Data []fbPostAttachmentData `json:"data"` 89 | } 90 | 91 | type fbPostAttachmentData struct { 92 | Media fbPostAttachmentMedia `json:"media,omitempty"` 93 | Target fbPostAttachmentTarget `json:"target,omitempty"` 94 | Subattachments fbPostAttachments `json:"subattachments,omitempty"` 95 | Title string `json:"title,omitempty"` 96 | Type string `json:"type,omitempty"` 97 | URL 
string `json:"url,omitempty"` 98 | } 99 | 100 | type fbPostAttachmentMedia struct { 101 | Image fbPostAttachmentImage `json:"image,omitempty"` 102 | } 103 | 104 | type fbPostAttachmentImage struct { 105 | Height int `json:"height,omitempty"` 106 | Src string `json:"src,omitempty"` 107 | Width int `json:"width,omitempty"` 108 | } 109 | 110 | type fbPostAttachmentTarget struct { 111 | ID string `json:"id,omitempty"` 112 | URL string `json:"url,omitempty"` 113 | } 114 | 115 | func fbTimeToGoTime(fbTime string) time.Time { 116 | if fbTime == "" { 117 | return time.Time{} 118 | } 119 | ts, err := time.Parse(fbTimeFormat, fbTime) 120 | if err != nil { 121 | log.Printf("[ERROR] Parsing timestamp from Facebook: '%s' is not in '%s' format", 122 | fbTime, fbTimeFormat) 123 | } 124 | return ts 125 | } 126 | 127 | const fbTimeFormat = "2006-01-02T15:04:05+0000" 128 | -------------------------------------------------------------------------------- /datasources/googlelocation/googlelocation.go: -------------------------------------------------------------------------------- 1 | // Package googlelocation implements a Timeliner data source for 2 | // importing data from the Google Location History (aka Google 3 | // Maps Timeline). 
4 | package googlelocation 5 | 6 | import ( 7 | "context" 8 | "encoding/json" 9 | "fmt" 10 | "io" 11 | "log" 12 | "os" 13 | "sort" 14 | "strconv" 15 | "strings" 16 | "time" 17 | 18 | "github.com/mholt/timeliner" 19 | ) 20 | 21 | // Data source name and ID 22 | const ( 23 | DataSourceName = "Google Location History" 24 | DataSourceID = "google_location" 25 | ) 26 | 27 | var dataSource = timeliner.DataSource{ 28 | ID: DataSourceID, 29 | Name: DataSourceName, 30 | NewClient: func(acc timeliner.Account) (timeliner.Client, error) { 31 | return new(Client), nil 32 | }, 33 | } 34 | 35 | func init() { 36 | err := timeliner.RegisterDataSource(dataSource) 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | } 41 | 42 | // Client implements the timeliner.Client interface. 43 | type Client struct{} 44 | 45 | // ListItems lists items from the data source. opt.Filename must be non-empty. 46 | func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 47 | defer close(itemChan) 48 | 49 | if opt.Filename == "" { 50 | return fmt.Errorf("filename is required") 51 | } 52 | 53 | file, err := os.Open(opt.Filename) 54 | if err != nil { 55 | return fmt.Errorf("opening data file: %v", err) 56 | } 57 | defer file.Close() 58 | 59 | dec := json.NewDecoder(file) 60 | 61 | // read the following opening tokens: 62 | // 1. open brace '{' 63 | // 2. "locations" field name, 64 | // 3. 
the array value's opening bracket '[' 65 | for i := 0; i < 3; i++ { 66 | _, err := dec.Token() 67 | if err != nil { 68 | return fmt.Errorf("decoding opening token: %v", err) 69 | } 70 | } 71 | 72 | var prev *location 73 | for dec.More() { 74 | select { 75 | case <-ctx.Done(): 76 | return nil 77 | default: 78 | var err error 79 | prev, err = c.processLocation(dec, prev, itemChan) 80 | if err != nil { 81 | return fmt.Errorf("processing location item: %v", err) 82 | } 83 | } 84 | } 85 | 86 | return nil 87 | } 88 | 89 | func (c *Client) processLocation(dec *json.Decoder, prev *location, 90 | itemChan chan<- *timeliner.ItemGraph) (*location, error) { 91 | 92 | var l *location 93 | err := dec.Decode(&l) 94 | if err != nil { 95 | return nil, fmt.Errorf("decoding location element: %v", err) 96 | } 97 | 98 | // redundancy checks (lots of data points are very similar) 99 | if prev != nil { 100 | // if the timestamp of this location is the same 101 | // as the previous one, it seems useless to keep 102 | // both, so skip this one (also, we produce IDs 103 | // based on timestamp, which must be unique -- 104 | // hence why we compare the unix timestamp values) 105 | if l.Timestamp().Unix() == prev.Timestamp().Unix() { 106 | return l, nil 107 | } 108 | 109 | // if this location is basically the same spot as the 110 | // previously-seen one, and if we're sure that the 111 | // timestamps are in order, skip it; mostly redundant 112 | if locationsSimilar(l, prev) && l.Timestamp().Before(prev.Timestamp()) { 113 | return l, nil 114 | } 115 | } 116 | 117 | // store this item, and possibly connect it to the 118 | // previous one if there's a movement activity 119 | ig := timeliner.NewItemGraph(l) 120 | if movement := l.primaryMovement(); movement != "" && prev != nil { 121 | // bidirectional edge, because you may want to know how you got somewhere, 122 | // and the timestamps should make it obvious which location is the "from" 123 | // or the "to", since you can't go backwards in time 
(that we know of...) 124 | ig.Add(prev, timeliner.Relation{ 125 | Label: strings.ToLower(movement), 126 | Bidirectional: true, 127 | }) 128 | } 129 | itemChan <- ig 130 | 131 | return l, nil 132 | } 133 | 134 | func locationsSimilar(a, b *location) bool { 135 | if a == nil && b == nil { 136 | return true 137 | } 138 | if a == nil || b == nil { 139 | return false 140 | } 141 | return similar(a.LatitudeE7, b.LatitudeE7) && 142 | similar(a.LongitudeE7, b.LongitudeE7) 143 | } 144 | 145 | func similar(a, b int) bool { 146 | const tolerance = 1000 147 | if a > b { 148 | return a-b < tolerance 149 | } 150 | return b-a < tolerance 151 | } 152 | 153 | type location struct { 154 | TimestampMs string `json:"timestampMs"` 155 | LatitudeE7 int `json:"latitudeE7"` 156 | LongitudeE7 int `json:"longitudeE7"` 157 | Accuracy int `json:"accuracy"` 158 | Altitude int `json:"altitude,omitempty"` 159 | VerticalAccuracy int `json:"verticalAccuracy,omitempty"` 160 | Activity []activities `json:"activity,omitempty"` 161 | Velocity int `json:"velocity,omitempty"` 162 | Heading int `json:"heading,omitempty"` 163 | } 164 | 165 | func (l location) primaryMovement() string { 166 | if len(l.Activity) == 0 { 167 | return "" 168 | } 169 | 170 | counts := make(map[string]int) 171 | confidences := make(map[string]int) 172 | for _, a := range l.Activity { 173 | for _, aa := range a.Activity { 174 | counts[aa.Type]++ 175 | confidences[aa.Type] += aa.Confidence 176 | } 177 | } 178 | 179 | // turn confidence into average confidence, 180 | // (ensure all activities are represented), 181 | // and keep activities with high enough score 182 | var top []activity 183 | var hasOnFoot, hasWalking, hasRunning bool 184 | for _, a := range movementActivities { 185 | count := counts[a] 186 | if count == 0 { 187 | count = 1 // for the purposes of division 188 | } 189 | avg := confidences[a] / len(l.Activity) 190 | avgSeen := confidences[a] / count 191 | if avgSeen > 50 { 192 | switch a { 193 | case "ON_FOOT": 194 | 
hasOnFoot = true 195 | case "WALKING": 196 | hasWalking = true 197 | case "RUNNING": 198 | hasRunning = true 199 | } 200 | top = append(top, activity{Type: a, Confidence: avg}) 201 | } 202 | } 203 | sort.Slice(top, func(i, j int) bool { 204 | return top[i].Confidence > top[j].Confidence 205 | }) 206 | 207 | // consolidate ON_FOOT, WALKING, and RUNNING if more than one is present 208 | if hasOnFoot && (hasWalking || hasRunning) { 209 | for i := 0; i < len(top); i++ { 210 | if hasWalking && hasRunning && 211 | (top[i].Type == "WALKING" || top[i].Type == "RUNNING") { 212 | // if both WALKING and RUNNING, prefer more general ON_FOOT 213 | top = append(top[:i], top[i+1:]...) i-- // re-examine index i: the next element just shifted into it 214 | } else if top[i].Type == "ON_FOOT" { 215 | // if only one of WALKING or RUNNING, prefer that over ON_FOOT 216 | top = append(top[:i], top[i+1:]...) i-- // re-examine index i after removal 217 | } 218 | } 219 | } 220 | 221 | if len(top) > 0 { 222 | return top[0].Type 223 | } 224 | return "" 225 | } 226 | 227 | func (l location) hasActivity(act string) bool { 228 | for _, a := range l.Activity { 229 | for _, aa := range a.Activity { 230 | if aa.Type == act && aa.Confidence > 50 { 231 | return true 232 | } 233 | } 234 | } 235 | return false 236 | } 237 | 238 | type activities struct { 239 | TimestampMs string `json:"timestampMs"` 240 | Activity []activity `json:"activity"` 241 | } 242 | 243 | type activity struct { 244 | Type string `json:"type"` 245 | Confidence int `json:"confidence"` 246 | } 247 | 248 | // ID returns a string representation of the timestamp, 249 | // since there is no actual ID provided by the service. 250 | // It is assumed that one cannot be in two places at once.
251 | func (l location) ID() string { 252 | ts := fmt.Sprintf("loc_%d", l.Timestamp().Unix()) 253 | return ts 254 | } 255 | 256 | func (l location) Timestamp() time.Time { 257 | ts, err := strconv.Atoi(l.TimestampMs) 258 | if err != nil { 259 | return time.Time{} 260 | } 261 | return time.Unix(int64(ts)/1000, 0) 262 | } 263 | 264 | func (l location) Owner() (*string, *string) { 265 | return nil, nil 266 | } 267 | 268 | func (l location) Class() timeliner.ItemClass { 269 | return timeliner.ClassLocation 270 | } 271 | 272 | func (l location) DataText() (*string, error) { 273 | return nil, nil 274 | } 275 | 276 | func (l location) DataFileName() *string { 277 | return nil 278 | } 279 | 280 | func (l location) DataFileReader() (io.ReadCloser, error) { 281 | return nil, nil 282 | } 283 | 284 | func (l location) DataFileHash() []byte { 285 | return nil 286 | } 287 | 288 | func (l location) DataFileMIMEType() *string { 289 | return nil 290 | } 291 | 292 | func (l location) Metadata() (*timeliner.Metadata, error) { 293 | var m timeliner.Metadata 294 | var hasMetadata bool 295 | 296 | if l.Velocity > 0 { 297 | m.Velocity = l.Velocity 298 | hasMetadata = true 299 | } 300 | if l.Heading > 0 { 301 | m.Heading = l.Heading 302 | hasMetadata = true 303 | } 304 | if l.Altitude > 0 { 305 | m.Altitude = l.Altitude 306 | m.AltitudeAccuracy = l.VerticalAccuracy 307 | hasMetadata = true 308 | } 309 | 310 | if hasMetadata { 311 | return &m, nil 312 | } 313 | return nil, nil 314 | } 315 | 316 | func (l location) Location() (*timeliner.Location, error) { 317 | lat := float64(l.LatitudeE7) / 1e7 318 | lon := float64(l.LongitudeE7) / 1e7 319 | return &timeliner.Location{ 320 | Latitude: &lat, 321 | Longitude: &lon, 322 | }, nil 323 | } 324 | 325 | // movementActivities is the list of activities we care about 326 | // for drawing relationships between two locations. 
For example, 327 | // we don't care about TILTING (sudden accelerometer adjustment, 328 | // like phone set down or person standing up), UNKNOWN, or STILL 329 | // (where there is no apparent movement detected). 330 | // 331 | // https://developers.google.com/android/reference/com/google/android/gms/location/DetectedActivity 332 | var movementActivities = []string{ 333 | "WALKING", 334 | "RUNNING", 335 | "IN_VEHICLE", 336 | "ON_FOOT", 337 | "ON_BICYCLE", 338 | } 339 | -------------------------------------------------------------------------------- /datasources/googlephotos/googlephotos.go: -------------------------------------------------------------------------------- 1 | // Package googlephotos implements the Google Photos service 2 | // using its API, documented at https://developers.google.com/photos/. 3 | package googlephotos 4 | 5 | import ( 6 | "bytes" 7 | "context" 8 | "encoding/json" 9 | "fmt" 10 | "io" 11 | "io/ioutil" 12 | "log" 13 | "net/http" 14 | "net/url" 15 | "strings" 16 | "sync" 17 | "time" 18 | 19 | "github.com/mholt/timeliner" 20 | ) 21 | 22 | // Data source name and ID. 
23 | const ( 24 | DataSourceName = "Google Photos" 25 | DataSourceID = "google_photos" 26 | 27 | apiBase = "https://photoslibrary.googleapis.com/v1" 28 | ) 29 | 30 | var dataSource = timeliner.DataSource{ 31 | ID: DataSourceID, 32 | Name: DataSourceName, 33 | OAuth2: timeliner.OAuth2{ 34 | ProviderID: "google", 35 | Scopes: []string{"https://www.googleapis.com/auth/photoslibrary.readonly"}, 36 | }, 37 | RateLimit: timeliner.RateLimit{ 38 | RequestsPerHour: 10000 / 24, // https://developers.google.com/photos/library/guides/api-limits-quotas 39 | BurstSize: 3, 40 | }, 41 | NewClient: func(acc timeliner.Account) (timeliner.Client, error) { 42 | httpClient, err := acc.NewHTTPClient() 43 | if err != nil { 44 | return nil, err 45 | } 46 | return &Client{ 47 | HTTPClient: httpClient, 48 | userID: acc.UserID, 49 | checkpoint: checkpointInfo{mu: new(sync.Mutex)}, 50 | }, nil 51 | }, 52 | } 53 | 54 | func init() { 55 | err := timeliner.RegisterDataSource(dataSource) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | } 60 | 61 | // Client interacts with the Google Photos 62 | // API. It requires an OAuth2-authorized 63 | // HTTP client in order to work properly. 64 | type Client struct { 65 | HTTPClient *http.Client 66 | IncludeArchivedMedia bool 67 | 68 | userID string 69 | checkpoint checkpointInfo 70 | } 71 | 72 | // ListItems lists items from the data source. 73 | // opt.Timeframe precision is day-level at best. 
74 | func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 75 | defer close(itemChan) 76 | 77 | if opt.Filename != "" { 78 | return c.listFromTakeoutArchive(ctx, itemChan, opt) 79 | } 80 | 81 | // load any previous checkpoint 82 | c.checkpoint.load(opt.Checkpoint) 83 | 84 | // get items and collections 85 | errChan := make(chan error) 86 | go func() { 87 | err := c.listItems(ctx, itemChan, opt) 88 | errChan <- err 89 | }() 90 | go func() { 91 | err := c.listCollections(ctx, itemChan, opt) 92 | errChan <- err 93 | }() 94 | 95 | // read exactly 2 error (or nil) values to ensure we 96 | // block precisely until the two listers are done 97 | var errs []string 98 | for i := 0; i < 2; i++ { 99 | err := <-errChan 100 | if err != nil { 101 | log.Printf("[ERROR] %s/%s: a listing goroutine errored: %v", DataSourceID, c.userID, err) 102 | errs = append(errs, err.Error()) 103 | } 104 | } 105 | if len(errs) > 0 { 106 | return fmt.Errorf("one or more errors: %s", strings.Join(errs, ", ")) 107 | } 108 | 109 | return nil 110 | } 111 | 112 | func (c *Client) listItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 113 | c.checkpoint.mu.Lock() 114 | pageToken := c.checkpoint.ItemsNextPage 115 | c.checkpoint.mu.Unlock() 116 | 117 | for { 118 | select { 119 | case <-ctx.Done(): 120 | return nil 121 | default: 122 | var err error 123 | pageToken, err = c.getItemsNextPage(itemChan, pageToken, opt.Timeframe) 124 | if err != nil { 125 | return fmt.Errorf("getting items on next page: %v", err) 126 | } 127 | if pageToken == "" { 128 | return nil 129 | } 130 | 131 | c.checkpoint.mu.Lock() 132 | c.checkpoint.ItemsNextPage = pageToken 133 | c.checkpoint.save(ctx) 134 | c.checkpoint.mu.Unlock() 135 | } 136 | } 137 | } 138 | 139 | func (c *Client) getItemsNextPage(itemChan chan<- *timeliner.ItemGraph, 140 | pageToken string, timeframe timeliner.Timeframe) (string, error) { 141 
| reqBody := listMediaItemsRequest{ 142 | PageSize: 100, 143 | PageToken: pageToken, 144 | } 145 | if timeframe.Since != nil || timeframe.Until != nil { 146 | reqBody.Filters = &listMediaItemsFilter{ 147 | DateFilter: listMediaItemsDateFilter{ 148 | Ranges: []listMediaItemsFilterRange{dateRange(timeframe)}, 149 | }, 150 | IncludeArchivedMedia: c.IncludeArchivedMedia, 151 | } 152 | } 153 | 154 | page, err := c.pageOfMediaItems(reqBody) 155 | if err != nil { 156 | return "", fmt.Errorf("requesting next page: %v", err) 157 | } 158 | 159 | for _, item := range page.MediaItems { 160 | itemChan <- &timeliner.ItemGraph{ 161 | Node: item, 162 | } 163 | } 164 | 165 | return page.NextPageToken, nil 166 | } 167 | 168 | // listCollections lists media items by iterating each album. As 169 | // of Jan. 2019, the Google Photos API does not allow searching 170 | // media items with both an album ID and filters. Because this 171 | // search is predicated on album ID, we cannot be constrained by 172 | // a timeframe in this search. 173 | // 174 | // See https://developers.google.com/photos/library/reference/rest/v1/mediaItems/search. 
175 | func (c *Client) listCollections(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 176 | c.checkpoint.mu.Lock() 177 | albumPageToken := c.checkpoint.AlbumsNextPage 178 | c.checkpoint.mu.Unlock() 179 | 180 | for { 181 | select { 182 | case <-ctx.Done(): 183 | return nil 184 | default: 185 | if opt.Verbose { 186 | log.Printf("[DEBUG] %s/%s: listing albums: next page (page_token=%s)", 187 | DataSourceID, c.userID, albumPageToken) 188 | } 189 | 190 | var err error 191 | albumPageToken, err = c.getAlbumsAndTheirItemsNextPage(itemChan, albumPageToken, opt) 192 | if err != nil { 193 | return err 194 | } 195 | if albumPageToken == "" { 196 | return nil 197 | } 198 | 199 | c.checkpoint.mu.Lock() 200 | c.checkpoint.AlbumsNextPage = albumPageToken 201 | c.checkpoint.save(ctx) 202 | c.checkpoint.mu.Unlock() 203 | } 204 | } 205 | } 206 | 207 | func (c *Client) getAlbumsAndTheirItemsNextPage(itemChan chan<- *timeliner.ItemGraph, 208 | pageToken string, opt timeliner.ListingOptions) (string, error) { 209 | vals := url.Values{ 210 | "pageToken": {pageToken}, 211 | "pageSize": {"50"}, 212 | } 213 | 214 | var respBody listAlbums 215 | err := c.apiRequestWithRetry("GET", "/albums?"+vals.Encode(), nil, &respBody) 216 | if err != nil { 217 | return pageToken, err 218 | } 219 | 220 | for _, album := range respBody.Albums { 221 | if opt.Verbose { 222 | log.Printf("[DEBUG] %s/%s: listing items in album: '%s' (album_id=%s item_count=%s)", 223 | DataSourceID, c.userID, album.Title, album.ID, album.MediaItemsCount) 224 | } 225 | 226 | err = c.getAlbumItems(itemChan, album, opt) 227 | if err != nil { 228 | return "", err 229 | } 230 | } 231 | 232 | return respBody.NextPageToken, nil 233 | } 234 | 235 | func (c *Client) getAlbumItems(itemChan chan<- *timeliner.ItemGraph, album gpAlbum, opt timeliner.ListingOptions) error { 236 | var albumItemsNextPage string 237 | var counter int 238 | 239 | const pageSize = 100 240 | 241 | for { 242 | 
reqBody := listMediaItemsRequest{ 243 | AlbumID: album.ID, 244 | PageToken: albumItemsNextPage, 245 | PageSize: pageSize, 246 | } 247 | 248 | if opt.Verbose { 249 | log.Printf("[DEBUG] %s/%s: getting next page of media items in album (album_id=%s page_size=%d page_token=%s)", 250 | DataSourceID, c.userID, album.ID, pageSize, albumItemsNextPage) 251 | } 252 | 253 | page, err := c.pageOfMediaItems(reqBody) 254 | if err != nil { 255 | return fmt.Errorf("listing album contents: %v", err) 256 | } 257 | 258 | // iterate each media item on this page of the album listing 259 | var items []timeliner.CollectionItem 260 | for _, it := range page.MediaItems { 261 | // since we cannot request items in an album and also filter 262 | // by timestamp, be sure to filter here; it means we still 263 | // have to iterate all items in all albums, but at least we 264 | // can just skip items that fall outside the timeframe... 265 | ts := it.Timestamp() 266 | if opt.Timeframe.Since != nil && ts.Before(*opt.Timeframe.Since) { 267 | continue 268 | } 269 | if opt.Timeframe.Until != nil && ts.After(*opt.Timeframe.Until) { 270 | continue 271 | } 272 | 273 | // otherwise, add this item to the album 274 | items = append(items, timeliner.CollectionItem{ 275 | Item: it, 276 | Position: counter, 277 | }) 278 | counter++ 279 | } 280 | 281 | // if any items remained after filtering, 282 | // process this album now 283 | if len(items) > 0 { 284 | ig := timeliner.NewItemGraph(nil) 285 | ig.Collections = append(ig.Collections, timeliner.Collection{ 286 | OriginalID: album.ID, 287 | Name: &album.Title, 288 | Items: items, 289 | }) 290 | itemChan <- ig 291 | } 292 | 293 | if page.NextPageToken == "" { 294 | return nil 295 | } 296 | 297 | albumItemsNextPage = page.NextPageToken 298 | } 299 | } 300 | 301 | func (c *Client) pageOfMediaItems(reqBody listMediaItemsRequest) (listMediaItems, error) { 302 | var respBody listMediaItems 303 | err := c.apiRequestWithRetry("POST", "/mediaItems:search", reqBody, 
&respBody) 304 | return respBody, err 305 | } 306 | 307 | func (c *Client) apiRequestWithRetry(method, endpoint string, reqBodyData, respInto interface{}) error { 308 | // do the request in a loop for controlled retries on error 309 | var err error 310 | const maxTries = 10 311 | for i := 0; i < maxTries; i++ { 312 | var resp *http.Response 313 | resp, err = c.apiRequest(method, endpoint, reqBodyData) 314 | if err != nil { 315 | log.Printf("[ERROR] %s/%s: doing API request: >>> %v <<< - retrying... (attempt %d/%d)", 316 | DataSourceID, c.userID, err, i+1, maxTries) 317 | time.Sleep(10 * time.Second) 318 | continue 319 | } 320 | 321 | if resp.StatusCode != http.StatusOK { 322 | bodyText, err2 := ioutil.ReadAll(io.LimitReader(resp.Body, 1024*256)) 323 | resp.Body.Close() 324 | 325 | if err2 == nil { 326 | err = fmt.Errorf("HTTP %d: %s: >>> %s <<<", resp.StatusCode, resp.Status, bodyText) 327 | } else { 328 | err = fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) 329 | } 330 | 331 | // extra-long pause for rate limiting errors 332 | if resp.StatusCode == http.StatusTooManyRequests { 333 | log.Printf("[ERROR] %s/%s: rate limited: HTTP %d: %s: %s - retrying in 35 seconds... (attempt %d/%d)", 334 | DataSourceID, c.userID, resp.StatusCode, resp.Status, bodyText, i+1, maxTries) 335 | time.Sleep(35 * time.Second) 336 | continue 337 | } 338 | 339 | // for any other error, wait a couple seconds and retry 340 | log.Printf("[ERROR] %s/%s: bad API response: %v - retrying... (attempt %d/%d)", 341 | DataSourceID, c.userID, err, i+1, maxTries) 342 | time.Sleep(10 * time.Second) 343 | continue 344 | } 345 | 346 | // successful request; read the response body 347 | err = json.NewDecoder(resp.Body).Decode(&respInto) 348 | if err != nil { 349 | resp.Body.Close() 350 | err = fmt.Errorf("decoding JSON: %v", err) 351 | log.Printf("[ERROR] %s/%s: reading API response: %v - retrying... 
(attempt %d/%d)", 352 | DataSourceID, c.userID, err, i+1, maxTries) 353 | time.Sleep(10 * time.Second) 354 | continue 355 | } 356 | 357 | // successful read; we're done here 358 | resp.Body.Close() 359 | break 360 | } 361 | 362 | return err 363 | } 364 | 365 | // apiRequest makes a single request to the Google Photos API, JSON-encoding reqBodyData as the request body if it is non-nil. The caller must close the response body. 365 | func (c *Client) apiRequest(method, endpoint string, reqBodyData interface{}) (*http.Response, error) { 366 | var reqBody io.Reader 367 | if reqBodyData != nil { 368 | reqBodyBytes, err := json.Marshal(reqBodyData) 369 | if err != nil { 370 | return nil, err 371 | } 372 | reqBody = bytes.NewReader(reqBodyBytes) 373 | } 374 | 375 | req, err := http.NewRequest(method, apiBase+endpoint, reqBody) 376 | if err != nil { 377 | return nil, err 378 | } 379 | if reqBody != nil { 380 | req.Header.Set("Content-Type", "application/json") 381 | } 382 | 383 | return c.HTTPClient.Do(req) 384 | } 385 | 386 | // dateRange converts a timeliner.Timeframe into the API's day-precision date range filter; a nil bound expands to the earliest/latest representable filter date. 386 | func dateRange(timeframe timeliner.Timeframe) listMediaItemsFilterRange { 387 | var start, end filterDate 388 | if timeframe.Since == nil { 389 | start = filterDate{ 390 | Day: 1, 391 | Month: 1, 392 | Year: 1, 393 | } 394 | } else { 395 | since := timeframe.Since.Add(24 * time.Hour) // to account for day precision 396 | start = filterDate{ 397 | Day: since.Day(), 398 | Month: int(since.Month()), 399 | Year: since.Year(), 400 | } 401 | } 402 | if timeframe.Until == nil { 403 | end = filterDate{ 404 | Day: 31, 405 | Month: 12, 406 | Year: 9999, 407 | } 408 | } else { 409 | until := timeframe.Until.Add(-24 * time.Hour) // to account for day precision; time.Time is immutable, so Add's return value must be captured -- it was previously discarded, leaving the end date unadjusted 410 | end = filterDate{ 411 | Day: until.Day(), 412 | Month: int(until.Month()), 413 | Year: until.Year(), 414 | } 415 | } 416 | return listMediaItemsFilterRange{ 417 | StartDate: start, 418 | EndDate: end, 419 | } 420 | } 421 | 422 | // Assuming checkpoints are short-lived (i.e. are resumed 423 | // somewhat quickly, before the page tokens/cursors expire), 424 | // we can just store the page tokens.
425 | type checkpointInfo struct { 426 | ItemsNextPage string 427 | AlbumsNextPage string 428 | mu *sync.Mutex 429 | } 430 | 431 | // save records the checkpoint. It is NOT thread-safe, 432 | // so calls to this must be protected by a mutex. 433 | func (ch *checkpointInfo) save(ctx context.Context) { 434 | gobBytes, err := timeliner.MarshalGob(ch) 435 | if err != nil { 436 | log.Printf("[ERROR] %s: encoding checkpoint: %v", DataSourceID, err) 437 | } 438 | timeliner.Checkpoint(ctx, gobBytes) 439 | } 440 | 441 | // load decodes the checkpoint. It is NOT thread-safe, 442 | // so calls to this must be protected by a mutex. 443 | func (ch *checkpointInfo) load(checkpointGob []byte) { 444 | if len(checkpointGob) == 0 { 445 | return 446 | } 447 | err := timeliner.UnmarshalGob(checkpointGob, ch) 448 | if err != nil { 449 | log.Printf("[ERROR] %s: decoding checkpoint: %v", DataSourceID, err) 450 | } 451 | } 452 | -------------------------------------------------------------------------------- /datasources/googlephotos/media.go: -------------------------------------------------------------------------------- 1 | package googlephotos 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "io/ioutil" 7 | "log" 8 | "net/http" 9 | "strconv" 10 | "time" 11 | 12 | "github.com/mholt/timeliner" 13 | ) 14 | 15 | // listMediaItems is the structure of the results 16 | // of calling mediaItems in the Google Photos API. 
17 | type listMediaItems struct { 18 | MediaItems []mediaItem `json:"mediaItems"` 19 | NextPageToken string `json:"nextPageToken"` 20 | } 21 | 22 | type mediaItem struct { 23 | MediaID string `json:"id"` 24 | ProductURL string `json:"productUrl"` 25 | BaseURL string `json:"baseUrl"` 26 | Description string `json:"description"` 27 | MIMEType string `json:"mimeType"` 28 | MediaMetadata mediaMetadata `json:"mediaMetadata"` 29 | ContributorInfo mediaContributor `json:"mediaContributor"` 30 | Filename string `json:"filename"` 31 | } 32 | 33 | func (m mediaItem) ID() string { 34 | return m.MediaID 35 | } 36 | 37 | func (m mediaItem) Timestamp() time.Time { 38 | return m.MediaMetadata.CreationTime 39 | } 40 | 41 | func (m mediaItem) DataText() (*string, error) { 42 | return &m.Description, nil 43 | } 44 | 45 | func (m mediaItem) DataFileName() *string { 46 | return &m.Filename 47 | } 48 | 49 | func (m mediaItem) DataFileReader() (io.ReadCloser, error) { 50 | if m.MediaMetadata.Video != nil && m.MediaMetadata.Video.Status != "READY" { 51 | log.Printf("[INFO] Skipping video file because it is not ready (status=%s filename=%s)", 52 | m.MediaMetadata.Video.Status, m.Filename) 53 | return nil, nil 54 | } 55 | 56 | u := m.BaseURL 57 | 58 | // configure for the download of full file with almost-full exif data; see 59 | // https://developers.google.com/photos/library/guides/access-media-items#base-urls 60 | if m.MediaMetadata.Photo != nil { 61 | u += "=d" 62 | } else if m.MediaMetadata.Video != nil { 63 | u += "=dv" 64 | } 65 | 66 | const maxTries = 5 67 | var err error 68 | var resp *http.Response 69 | for i := 0; i < maxTries; i++ { 70 | resp, err = http.Get(u) 71 | if err != nil { 72 | err = fmt.Errorf("getting media contents: %v", err) 73 | log.Printf("[ERROR] %s: %s: %v - retrying... 
(attempt %d/%d)", DataSourceID, u, err, i+1, maxTries) 74 | time.Sleep(30 * time.Second) 75 | continue 76 | } 77 | if resp.StatusCode != http.StatusOK { 78 | bodyText, err2 := ioutil.ReadAll(io.LimitReader(resp.Body, 1024*256)) 79 | resp.Body.Close() 80 | 81 | if err2 == nil { 82 | err = fmt.Errorf("HTTP %d: %s: >>> %s <<<", resp.StatusCode, resp.Status, bodyText) 83 | } else { 84 | err = fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) 85 | } 86 | 87 | log.Printf("[ERROR] %s: %s: Bad response: %v - waiting and retrying... (attempt %d/%d)", 88 | DataSourceID, u, err, i+1, maxTries) 89 | time.Sleep(15 * time.Second) 90 | continue 91 | } 92 | break 93 | } 94 | 95 | if resp == nil { 96 | return nil, err 97 | } 98 | return resp.Body, err 99 | } 100 | 101 | // DataFileHash returns nil; the hash of the media file is not known in advance. 101 | func (m mediaItem) DataFileHash() []byte { 102 | return nil 103 | } 104 | 105 | // DataFileMIMEType returns the MIME type reported by the API. 105 | func (m mediaItem) DataFileMIMEType() *string { 106 | return &m.MIMEType 107 | } 108 | 109 | func (m mediaItem) Owner() (*string, *string) { 110 | // since we only download media owned by the account, 111 | // we can leave ID nil and assume the display name 112 | // is the account owner's name 113 | if m.ContributorInfo.DisplayName != "" { 114 | return nil, &m.ContributorInfo.DisplayName 115 | } 116 | return nil, nil 117 | } 118 | 119 | // Class reports whether the item is a video or an image, based on which metadata block the API populated. 119 | func (m mediaItem) Class() timeliner.ItemClass { 120 | if m.MediaMetadata.Video != nil { 121 | return timeliner.ClassVideo 122 | } 123 | if m.MediaMetadata.Photo != nil { 124 | return timeliner.ClassImage 125 | } 126 | return timeliner.ClassUnknown 127 | } 128 | 129 | func (m mediaItem) Metadata() (*timeliner.Metadata, error) { 130 | // TODO: Parse exif metadata... maybe add most important/useful 131 | // EXIF fields to the metadata struct directly?
132 | 133 | widthInt, err := strconv.Atoi(m.MediaMetadata.Width) 134 | if err != nil { 135 | return nil, fmt.Errorf("parsing width as int: %v (width=%s)", 136 | err, m.MediaMetadata.Width) 137 | } 138 | heightInt, err := strconv.Atoi(m.MediaMetadata.Height) 139 | if err != nil { 140 | return nil, fmt.Errorf("parsing height as int: %v (height=%s)", 141 | err, m.MediaMetadata.Height) 142 | } 143 | 144 | meta := &timeliner.Metadata{ 145 | Width: widthInt, 146 | Height: heightInt, 147 | } 148 | 149 | if m.MediaMetadata.Photo != nil { 150 | meta.CameraMake = m.MediaMetadata.Photo.CameraMake 151 | meta.CameraModel = m.MediaMetadata.Photo.CameraModel 152 | meta.FocalLength = m.MediaMetadata.Photo.FocalLength 153 | meta.ApertureFNumber = m.MediaMetadata.Photo.ApertureFNumber 154 | meta.ISOEquivalent = m.MediaMetadata.Photo.ISOEquivalent 155 | if m.MediaMetadata.Photo.ExposureTime != "" { 156 | expDur, err := time.ParseDuration(m.MediaMetadata.Photo.ExposureTime) 157 | if err != nil { 158 | return nil, fmt.Errorf("parsing exposure time as duration: %v (exposure_time=%s)", 159 | err, m.MediaMetadata.Photo.ExposureTime) 160 | } 161 | meta.ExposureTime = expDur 162 | } 163 | } else if m.MediaMetadata.Video != nil { 164 | meta.CameraMake = m.MediaMetadata.Video.CameraMake 165 | meta.CameraModel = m.MediaMetadata.Video.CameraModel 166 | meta.FPS = m.MediaMetadata.Video.FPS 167 | } 168 | 169 | return meta, nil 170 | } 171 | 172 | func (m mediaItem) Location() (*timeliner.Location, error) { 173 | // See https://issuetracker.google.com/issues/80379228 😭 174 | return nil, nil 175 | } 176 | 177 | type mediaMetadata struct { 178 | CreationTime time.Time `json:"creationTime"` 179 | Width string `json:"width"` 180 | Height string `json:"height"` 181 | Photo *photoMetadata `json:"photo,omitempty"` 182 | Video *videoMetadata `json:"video,omitempty"` 183 | } 184 | 185 | type photoMetadata struct { 186 | CameraMake string `json:"cameraMake"` 187 | CameraModel string `json:"cameraModel"` 188 
| FocalLength float64 `json:"focalLength"` 189 | ApertureFNumber float64 `json:"apertureFNumber"` 190 | ISOEquivalent int `json:"isoEquivalent"` 191 | ExposureTime string `json:"exposureTime"` // TODO: Parse duration out of this...? 192 | } 193 | 194 | type videoMetadata struct { 195 | CameraMake string `json:"cameraMake"` 196 | CameraModel string `json:"cameraModel"` 197 | FPS float64 `json:"fps"` 198 | Status string `json:"status"` 199 | } 200 | 201 | type mediaContributor struct { 202 | ProfilePictureBaseURL string `json:"profilePictureBaseUrl"` 203 | DisplayName string `json:"displayName"` 204 | } 205 | 206 | type listMediaItemsRequest struct { 207 | Filters *listMediaItemsFilter `json:"filters,omitempty"` 208 | AlbumID string `json:"albumId,omitempty"` 209 | PageSize int `json:"pageSize,omitempty"` 210 | PageToken string `json:"pageToken,omitempty"` 211 | } 212 | 213 | type listMediaItemsFilter struct { 214 | DateFilter listMediaItemsDateFilter `json:"dateFilter"` 215 | IncludeArchivedMedia bool `json:"includeArchivedMedia"` 216 | ExcludeNonAppCreatedData bool `json:"excludeNonAppCreatedData"` 217 | ContentFilter listMediaItemsContentFilter `json:"contentFilter"` 218 | MediaTypeFilter listMediaItemsMediaTypeFilter `json:"mediaTypeFilter"` 219 | } 220 | 221 | type listMediaItemsDateFilter struct { 222 | Ranges []listMediaItemsFilterRange `json:"ranges,omitempty"` 223 | Dates []filterDate `json:"dates,omitempty"` 224 | } 225 | 226 | type listMediaItemsFilterRange struct { 227 | StartDate filterDate `json:"startDate"` 228 | EndDate filterDate `json:"endDate"` 229 | } 230 | 231 | type filterDate struct { 232 | Month int `json:"month"` 233 | Day int `json:"day"` 234 | Year int `json:"year"` 235 | } 236 | 237 | type listMediaItemsContentFilter struct { 238 | ExcludedContentCategories []string `json:"excludedContentCategories,omitempty"` 239 | IncludedContentCategories []string `json:"includedContentCategories,omitempty"` 240 | } 241 | 242 | type 
listMediaItemsMediaTypeFilter struct { 243 | MediaTypes []string `json:"mediaTypes,omitempty"` 244 | } 245 | 246 | type listAlbums struct { 247 | Albums []gpAlbum `json:"albums"` 248 | NextPageToken string `json:"nextPageToken"` 249 | } 250 | 251 | type gpAlbum struct { 252 | ID string `json:"id"` 253 | Title string `json:"title,omitempty"` 254 | ProductURL string `json:"productUrl"` 255 | MediaItemsCount string `json:"mediaItemsCount"` 256 | CoverPhotoBaseURL string `json:"coverPhotoBaseUrl"` 257 | CoverPhotoMediaItemID string `json:"coverPhotoMediaItemId"` 258 | } 259 | -------------------------------------------------------------------------------- /datasources/googlephotos/takeoutarchive.go: -------------------------------------------------------------------------------- 1 | package googlephotos 2 | 3 | import ( 4 | "archive/tar" 5 | "archive/zip" 6 | "bytes" 7 | "context" 8 | "encoding/json" 9 | "fmt" 10 | "io" 11 | "path/filepath" 12 | "strconv" 13 | "strings" 14 | "time" 15 | 16 | "github.com/mholt/archiver/v3" 17 | "github.com/mholt/timeliner" 18 | ) 19 | 20 | func (c *Client) listFromTakeoutArchive(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 21 | err := archiver.Walk(opt.Filename, func(f archiver.File) error { 22 | pathInArchive := getPathInArchive(f) // TODO: maybe this should be a function in the archiver lib 23 | 24 | // only walk in album folders, and look for metadata files 25 | if !strings.HasPrefix(pathInArchive, "Takeout/Google Photos/") { 26 | return nil 27 | } 28 | if f.Name() != albumMetadataFilename { 29 | return nil 30 | } 31 | 32 | // album metadata file; begin processing next album 33 | var albumMeta albumArchiveMetadata 34 | err := json.NewDecoder(f).Decode(&albumMeta) 35 | if err != nil { 36 | return fmt.Errorf("decoding album metadata file %s: %v", pathInArchive, err) 37 | } 38 | collection := timeliner.Collection{ 39 | OriginalID: albumMeta.AlbumData.Date.Timestamp, // TODO: we don't 
have one... this will not merge nicely with API imports!! 40 | Name: &albumMeta.AlbumData.Title, 41 | Description: &albumMeta.AlbumData.Description, 42 | } 43 | 44 | albumPathInArchive := strings.TrimSuffix(pathInArchive, albumMetadataFilename) 45 | 46 | // get all the album's items using a separate walk that is constrained to this album's folder 47 | err = archiver.Walk(opt.Filename, func(f archiver.File) error { 48 | pathInArchive := getPathInArchive(f) 49 | if !strings.HasPrefix(pathInArchive, albumPathInArchive) { 50 | return nil 51 | } 52 | if f.Name() == albumMetadataFilename { 53 | return nil 54 | } 55 | if filepath.Ext(f.Name()) != ".json" { 56 | return nil 57 | } 58 | 59 | var itemMeta mediaArchiveMetadata 60 | err := json.NewDecoder(f).Decode(&itemMeta) 61 | if err != nil { 62 | return fmt.Errorf("decoding item metadata file %s: %v", pathInArchive, err) 63 | } 64 | 65 | itemMeta.parsedPhotoTakenTime, err = itemMeta.timestamp() 66 | if err != nil { 67 | return fmt.Errorf("parsing timestamp from item %s: %v", pathInArchive, err) 68 | } 69 | itemMeta.pathInArchive = strings.TrimSuffix(pathInArchive, ".json") 70 | itemMeta.archiveFilename = opt.Filename 71 | 72 | withinTimeframe := (opt.Timeframe.Since == nil || itemMeta.parsedPhotoTakenTime.After(*opt.Timeframe.Since)) && 73 | (opt.Timeframe.Until == nil || itemMeta.parsedPhotoTakenTime.Before(*opt.Timeframe.Until)) 74 | 75 | if withinTimeframe { 76 | collection.Items = append(collection.Items, timeliner.CollectionItem{ 77 | Item: itemMeta, 78 | Position: len(collection.Items), 79 | }) 80 | } 81 | 82 | return nil 83 | }) 84 | if err != nil { 85 | return err 86 | } 87 | 88 | if len(collection.Items) > 0 { 89 | ig := timeliner.NewItemGraph(nil) 90 | ig.Collections = append(ig.Collections, collection) 91 | itemChan <- ig 92 | } 93 | 94 | return nil 95 | }) 96 | if err != nil { 97 | return err 98 | } 99 | 100 | return nil 101 | } 102 | 103 | const albumMetadataFilename = "metadata.json" 104 | 105 | func 
getPathInArchive(f archiver.File) string { 106 | switch hdr := f.Header.(type) { 107 | case zip.FileHeader: 108 | return hdr.Name 109 | case *tar.Header: 110 | return hdr.Name 111 | } 112 | return "" 113 | } 114 | 115 | type albumArchiveMetadata struct { 116 | AlbumData struct { 117 | Title string `json:"title"` 118 | Description string `json:"description"` 119 | Access string `json:"access"` 120 | Location string `json:"location"` 121 | Date struct { 122 | Timestamp string `json:"timestamp"` 123 | Formatted string `json:"formatted"` 124 | } `json:"date"` 125 | GeoData struct { 126 | Latitude float64 `json:"latitude"` 127 | Longitude float64 `json:"longitude"` 128 | Altitude float64 `json:"altitude"` 129 | LatitudeSpan float64 `json:"latitudeSpan"` 130 | LongitudeSpan float64 `json:"longitudeSpan"` 131 | } `json:"geoData"` 132 | } `json:"albumData"` 133 | } 134 | 135 | type mediaArchiveMetadata struct { 136 | Title string `json:"title"` 137 | Description string `json:"description"` 138 | ImageViews string `json:"imageViews"` 139 | CreationTime struct { 140 | Timestamp string `json:"timestamp"` 141 | Formatted string `json:"formatted"` 142 | } `json:"creationTime"` 143 | ModificationTime struct { 144 | Timestamp string `json:"timestamp"` 145 | Formatted string `json:"formatted"` 146 | } `json:"modificationTime"` 147 | GeoData struct { 148 | Latitude float64 `json:"latitude"` 149 | Longitude float64 `json:"longitude"` 150 | Altitude float64 `json:"altitude"` 151 | LatitudeSpan float64 `json:"latitudeSpan"` 152 | LongitudeSpan float64 `json:"longitudeSpan"` 153 | } `json:"geoData"` 154 | GeoDataExif struct { 155 | Latitude float64 `json:"latitude"` 156 | Longitude float64 `json:"longitude"` 157 | Altitude float64 `json:"altitude"` 158 | LatitudeSpan float64 `json:"latitudeSpan"` 159 | LongitudeSpan float64 `json:"longitudeSpan"` 160 | } `json:"geoDataExif"` 161 | PhotoTakenTime struct { 162 | Timestamp string `json:"timestamp"` 163 | Formatted string 
`json:"formatted"` 164 | } `json:"photoTakenTime"` 165 | GooglePhotosOrigin struct { 166 | MobileUpload struct { 167 | DeviceFolder struct { 168 | LocalFolderName string `json:"localFolderName"` 169 | } `json:"deviceFolder"` 170 | DeviceType string `json:"deviceType"` 171 | } `json:"mobileUpload"` 172 | } `json:"googlePhotosOrigin"` 173 | 174 | parsedPhotoTakenTime time.Time 175 | archiveFilename string 176 | pathInArchive string 177 | } 178 | 179 | func (m mediaArchiveMetadata) timestamp() (time.Time, error) { 180 | ts := m.PhotoTakenTime.Timestamp 181 | if ts == "" { 182 | ts = m.CreationTime.Timestamp 183 | } 184 | if ts == "" { 185 | ts = m.ModificationTime.Timestamp 186 | } 187 | if ts == "" { 188 | return time.Time{}, fmt.Errorf("no timestamp available") 189 | } 190 | parsed, err := strconv.ParseInt(ts, 10, 64) 191 | if err != nil { 192 | return time.Time{}, err 193 | } 194 | return time.Unix(parsed, 0), nil 195 | } 196 | 197 | // ID does NOT return the same ID as from the API. Takeout archives do NOT 198 | // have an ID associated with each item, so we do our best by making up 199 | // an ID using the timestamp and the filename. 
200 | func (m mediaArchiveMetadata) ID() string { 201 | return m.PhotoTakenTime.Timestamp + "_" + m.Title 202 | } 203 | 204 | // Timestamp returns the pre-parsed photo-taken time. 204 | func (m mediaArchiveMetadata) Timestamp() time.Time { 205 | return m.parsedPhotoTakenTime 206 | } 207 | 208 | // Class infers the item class from the file extension of the title; any extension not in the video list is assumed to be an image. 208 | func (m mediaArchiveMetadata) Class() timeliner.ItemClass { 209 | ext := filepath.Ext(strings.ToLower(m.Title)) 210 | switch ext { 211 | case ".mp4", ".m4v", ".mov", ".wmv", ".mkv", ".mpeg4", ".mpeg", ".ogg", ".m4p", ".avi": // NOTE: was "mpeg4" (no leading dot), which filepath.Ext can never return, so that case was dead 212 | return timeliner.ClassVideo 213 | default: 214 | return timeliner.ClassImage 215 | } 216 | } 217 | 218 | func (m mediaArchiveMetadata) Owner() (id *string, name *string) { 219 | return nil, nil 220 | } 221 | 222 | func (m mediaArchiveMetadata) DataText() (*string, error) { 223 | if m.Description != "" { 224 | return &m.Description, nil 225 | } 226 | return nil, nil 227 | } 228 | 229 | func (m mediaArchiveMetadata) DataFileName() *string { 230 | return &m.Title 231 | } 232 | 233 | // DataFileReader walks the takeout archive to locate this item's file and returns its contents buffered in memory. 233 | func (m mediaArchiveMetadata) DataFileReader() (io.ReadCloser, error) { 234 | var rc io.ReadCloser 235 | err := archiver.Walk(m.archiveFilename, func(f archiver.File) error { 236 | pathInArchive := getPathInArchive(f) 237 | if pathInArchive != m.pathInArchive { 238 | return nil 239 | } 240 | 241 | buf := new(bytes.Buffer) 242 | _, err := io.Copy(buf, f) 243 | if err != nil { 244 | return fmt.Errorf("copying item into memory: %v", err) 245 | } 246 | rc = timeliner.FakeCloser(buf) 247 | 248 | return archiver.ErrStopWalk 249 | }) 250 | if err != nil { 251 | return nil, fmt.Errorf("walking takeout file %s in search of media: %v", 252 | m.archiveFilename, err) 253 | } 254 | return rc, nil 255 | } 256 | 257 | func (m mediaArchiveMetadata) DataFileHash() []byte { 258 | return nil 259 | } 260 | 261 | func (m mediaArchiveMetadata) DataFileMIMEType() *string { 262 | return nil 263 | } 264 | 265 | func (m mediaArchiveMetadata) Metadata() (*timeliner.Metadata, error) { 266 | return nil, nil 267 | } 268 | 269 | func (m mediaArchiveMetadata)
Location() (*timeliner.Location, error) { 270 | lat, lon := m.GeoData.Latitude, m.GeoData.Longitude 271 | if lat == 0 { 272 | lat = m.GeoDataExif.Latitude 273 | } 274 | if lon == 0 { 275 | lon = m.GeoDataExif.Longitude 276 | } 277 | return &timeliner.Location{ 278 | Latitude: &lat, 279 | Longitude: &lon, 280 | }, nil 281 | } 282 | -------------------------------------------------------------------------------- /datasources/instagram/instagram.go: -------------------------------------------------------------------------------- 1 | // Package instagram implements a Timeliner data source for 2 | // importing data from Instagram archive files. 3 | package instagram 4 | 5 | import ( 6 | "context" 7 | "encoding/json" 8 | "fmt" 9 | "log" 10 | "time" 11 | 12 | "github.com/mholt/archiver/v3" 13 | "github.com/mholt/timeliner" 14 | ) 15 | 16 | // Data source name and ID 17 | const ( 18 | DataSourceName = "Instagram" 19 | DataSourceID = "instagram" 20 | ) 21 | 22 | var dataSource = timeliner.DataSource{ 23 | ID: DataSourceID, 24 | Name: DataSourceName, 25 | NewClient: func(acc timeliner.Account) (timeliner.Client, error) { 26 | return new(Client), nil 27 | }, 28 | } 29 | 30 | func init() { 31 | err := timeliner.RegisterDataSource(dataSource) 32 | if err != nil { 33 | log.Fatal(err) 34 | } 35 | } 36 | 37 | // Client implements the timeliner.Client interface. 38 | type Client struct{} 39 | 40 | // ListItems lists items from the data source. opt.Filename must be non-empty. 
41 | func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 42 | defer close(itemChan) 43 | 44 | if opt.Filename == "" { 45 | return fmt.Errorf("filename is required") 46 | } 47 | 48 | // first, load the profile information 49 | prof, err := c.getProfileInfo(opt.Filename) 50 | if err != nil { 51 | return fmt.Errorf("loading profile: %v", err) 52 | } 53 | 54 | // then, load the media index 55 | idx, err := c.getMediaIndex(opt.Filename) 56 | if err != nil { 57 | return fmt.Errorf("loading index: %v", err) 58 | } 59 | 60 | // prepare each media item with the information they 61 | // need to be processed into the timeline 62 | for i, ph := range idx.Photos { 63 | idx.Photos[i].profile = prof 64 | idx.Photos[i].archiveFilename = opt.Filename 65 | idx.Photos[i].takenAtParsed, err = time.Parse(takenAtFormat, ph.TakenAt) 66 | if err != nil { 67 | return fmt.Errorf("parsing photo time %s into format %s: %v", ph.TakenAt, takenAtFormat, err) 68 | } 69 | } 70 | for i, p := range idx.Profile { 71 | idx.Profile[i].profile = prof 72 | idx.Profile[i].archiveFilename = opt.Filename 73 | idx.Profile[i].takenAtParsed, err = time.Parse(takenAtFormat, p.TakenAt) // FIX: was idx.Photos[i], which clobbered photo timestamps, left profile pics unparsed, and could panic when len(Profile) > len(Photos) 74 | if err != nil { 75 | return fmt.Errorf("parsing profile pic time %s into format %s: %v", p.TakenAt, takenAtFormat, err) 76 | } 77 | } 78 | for i, vid := range idx.Videos { 79 | idx.Videos[i].profile = prof 80 | idx.Videos[i].archiveFilename = opt.Filename 81 | idx.Videos[i].takenAtParsed, err = time.Parse(takenAtFormat, vid.TakenAt) 82 | if err != nil { 83 | return fmt.Errorf("parsing video time %s into format %s: %v", vid.TakenAt, takenAtFormat, err) 84 | } 85 | } 86 | 87 | // add all of the media items to the timeline 88 | for _, photo := range idx.Photos { 89 | itemChan <- timeliner.NewItemGraph(photo) 90 | } 91 | for _, video := range idx.Videos { 92 | itemChan <- timeliner.NewItemGraph(video) 93 | } 94 | 95 | return nil 96 | } 97 | 98 | func (c
*Client) getProfileInfo(filename string) (instaAccountProfile, error) { 99 | var prof instaAccountProfile 100 | err := archiver.Walk(filename, func(f archiver.File) error { 101 | defer f.Close() 102 | if f.Name() != "profile.json" { 103 | return nil 104 | } 105 | 106 | err := json.NewDecoder(f).Decode(&prof) 107 | if err != nil { 108 | return fmt.Errorf("decoding account file: %v", err) 109 | } 110 | 111 | return archiver.ErrStopWalk 112 | }) 113 | return prof, err 114 | } 115 | 116 | func (c *Client) getMediaIndex(filename string) (instaMediaIndex, error) { 117 | var idx instaMediaIndex 118 | err := archiver.Walk(filename, func(f archiver.File) error { 119 | defer f.Close() 120 | if f.Name() != "media.json" { 121 | return nil 122 | } 123 | 124 | err := json.NewDecoder(f).Decode(&idx) 125 | if err != nil { 126 | return fmt.Errorf("decoding media index JSON: %v", err) 127 | } 128 | 129 | return archiver.ErrStopWalk 130 | }) 131 | if err != nil { 132 | return idx, fmt.Errorf("walking archive file %s: %v", filename, err) 133 | } 134 | return idx, nil 135 | } 136 | 137 | const takenAtFormat = "2006-01-02T15:04:05+07:00" 138 | -------------------------------------------------------------------------------- /datasources/instagram/models.go: -------------------------------------------------------------------------------- 1 | package instagram 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "mime" 9 | "path" 10 | "strings" 11 | "time" 12 | 13 | "github.com/mholt/archiver/v3" 14 | "github.com/mholt/timeliner" 15 | ) 16 | 17 | type instaMediaIndex struct { 18 | Photos []instaPhoto `json:"photos"` 19 | Profile []instaProfilePic `json:"profile"` 20 | Videos []instaVideo `json:"videos"` 21 | } 22 | 23 | type instaPhoto struct { 24 | Caption string `json:"caption"` 25 | TakenAt string `json:"taken_at"` 26 | Path string `json:"path"` 27 | LocationStr string `json:"location,omitempty"` 28 | 29 | takenAtParsed time.Time 30 | archiveFilename string 31 | profile 
instaAccountProfile 32 | } 33 | 34 | func (ph instaPhoto) ID() string { 35 | fname := path.Base(ph.Path) 36 | ext := path.Ext(fname) 37 | return strings.TrimSuffix(fname, ext) 38 | } 39 | 40 | func (ph instaPhoto) Timestamp() time.Time { 41 | return ph.takenAtParsed 42 | } 43 | 44 | func (ph instaPhoto) Class() timeliner.ItemClass { 45 | return timeliner.ClassImage 46 | } 47 | 48 | func (ph instaPhoto) Owner() (id *string, name *string) { 49 | return &ph.profile.Username, &ph.profile.Name 50 | } 51 | 52 | func (ph instaPhoto) DataText() (*string, error) { 53 | return &ph.Caption, nil 54 | } 55 | 56 | func (ph instaPhoto) DataFileName() *string { 57 | fname := path.Base(ph.Path) 58 | return &fname 59 | } 60 | 61 | func (ph instaPhoto) DataFileReader() (io.ReadCloser, error) { 62 | var rc io.ReadCloser 63 | err := archiver.Walk(ph.archiveFilename, func(f archiver.File) error { 64 | if f.Header.(zip.FileHeader).Name != ph.Path { 65 | return nil 66 | } 67 | 68 | buf := new(bytes.Buffer) 69 | _, err := io.Copy(buf, f) 70 | if err != nil { 71 | return fmt.Errorf("copying item into memory: %v", err) 72 | } 73 | rc = timeliner.FakeCloser(buf) 74 | 75 | return archiver.ErrStopWalk 76 | }) 77 | if err != nil { 78 | return nil, fmt.Errorf("walking archive file %s in search of media: %v", 79 | ph.archiveFilename, err) 80 | } 81 | return rc, nil 82 | } 83 | 84 | func (ph instaPhoto) DataFileHash() []byte { 85 | return nil 86 | } 87 | 88 | func (ph instaPhoto) DataFileMIMEType() *string { 89 | mt := mime.TypeByExtension(path.Ext(ph.Path)) 90 | return &mt 91 | } 92 | 93 | func (ph instaPhoto) Metadata() (*timeliner.Metadata, error) { 94 | if ph.LocationStr != "" { 95 | return &timeliner.Metadata{GeneralArea: ph.LocationStr}, nil 96 | } 97 | return nil, nil 98 | } 99 | 100 | func (ph instaPhoto) Location() (*timeliner.Location, error) { 101 | return nil, nil 102 | } 103 | 104 | type instaProfilePic struct { 105 | Caption string `json:"caption"` 106 | TakenAt string 
`json:"taken_at"` 107 | IsActiveProfile bool `json:"is_active_profile"` 108 | Path string `json:"path"` 109 | 110 | takenAtParsed time.Time 111 | archiveFilename string 112 | profile instaAccountProfile 113 | } 114 | 115 | type instaVideo struct { 116 | Caption string `json:"caption"` 117 | TakenAt string `json:"taken_at"` 118 | Path string `json:"path"` 119 | LocationStr string `json:"location,omitempty"` 120 | 121 | takenAtParsed time.Time 122 | archiveFilename string 123 | profile instaAccountProfile 124 | } 125 | 126 | func (vid instaVideo) ID() string { 127 | fname := path.Base(vid.Path) 128 | ext := path.Ext(fname) 129 | return strings.TrimSuffix(fname, ext) 130 | } 131 | 132 | func (vid instaVideo) Timestamp() time.Time { 133 | return vid.takenAtParsed 134 | } 135 | 136 | func (vid instaVideo) Class() timeliner.ItemClass { 137 | return timeliner.ClassVideo 138 | } 139 | 140 | func (vid instaVideo) Owner() (id *string, name *string) { 141 | return &vid.profile.Username, &vid.profile.Name 142 | } 143 | 144 | func (vid instaVideo) DataText() (*string, error) { 145 | return &vid.Caption, nil 146 | } 147 | 148 | func (vid instaVideo) DataFileName() *string { 149 | fname := path.Base(vid.Path) 150 | return &fname 151 | } 152 | 153 | func (vid instaVideo) DataFileReader() (io.ReadCloser, error) { 154 | var rc io.ReadCloser 155 | err := archiver.Walk(vid.archiveFilename, func(f archiver.File) error { 156 | if f.Header.(zip.FileHeader).Name != vid.Path { 157 | return nil 158 | } 159 | 160 | buf := new(bytes.Buffer) 161 | _, err := io.Copy(buf, f) 162 | if err != nil { 163 | return fmt.Errorf("copying item into memory: %v", err) 164 | } 165 | rc = timeliner.FakeCloser(buf) 166 | 167 | return archiver.ErrStopWalk 168 | }) 169 | if err != nil { 170 | return nil, fmt.Errorf("walking archive file %s in search of media: %v", 171 | vid.archiveFilename, err) 172 | } 173 | return rc, nil 174 | } 175 | 176 | func (vid instaVideo) DataFileHash() []byte { 177 | return nil 178 | 
} 179 | 180 | func (vid instaVideo) DataFileMIMEType() *string { 181 | mt := mime.TypeByExtension(path.Ext(vid.Path)) 182 | return &mt 183 | } 184 | 185 | func (vid instaVideo) Metadata() (*timeliner.Metadata, error) { 186 | if vid.LocationStr != "" { 187 | return &timeliner.Metadata{GeneralArea: vid.LocationStr}, nil 188 | } 189 | return nil, nil 190 | } 191 | 192 | func (vid instaVideo) Location() (*timeliner.Location, error) { 193 | return nil, nil 194 | } 195 | 196 | type instaAccountProfile struct { 197 | Biography string `json:"biography"` 198 | DateJoined string `json:"date_joined"` 199 | Email string `json:"email"` 200 | Website string `json:"website"` 201 | Gender string `json:"gender"` 202 | PrivateAccount bool `json:"private_account"` 203 | Name string `json:"name"` 204 | PhoneNumber string `json:"phone_number"` 205 | ProfilePicURL string `json:"profile_pic_url"` 206 | Username string `json:"username"` 207 | } 208 | -------------------------------------------------------------------------------- /datasources/smsbackuprestore/mms.go: -------------------------------------------------------------------------------- 1 | package smsbackuprestore 2 | 3 | import ( 4 | "encoding/base64" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "time" 9 | 10 | "github.com/mholt/timeliner" 11 | ) 12 | 13 | // MMS represents a multimedia message. 
14 | type MMS struct { 15 | CommonSMSandMMSFields 16 | Rr string `xml:"rr,attr"` 17 | Sub string `xml:"sub,attr"` 18 | CtT string `xml:"ct_t,attr"` 19 | ReadStatus string `xml:"read_status,attr"` 20 | Seen string `xml:"seen,attr"` 21 | MsgBox string `xml:"msg_box,attr"` 22 | SubCs string `xml:"sub_cs,attr"` 23 | RespSt string `xml:"resp_st,attr"` 24 | RetrSt string `xml:"retr_st,attr"` 25 | DTm string `xml:"d_tm,attr"` 26 | TextOnly string `xml:"text_only,attr"` 27 | Exp string `xml:"exp,attr"` 28 | MID string `xml:"m_id,attr"` 29 | St string `xml:"st,attr"` 30 | RetrTxtCs string `xml:"retr_txt_cs,attr"` 31 | RetrTxt string `xml:"retr_txt,attr"` 32 | Creator string `xml:"creator,attr"` 33 | MSize string `xml:"m_size,attr"` 34 | RptA string `xml:"rpt_a,attr"` 35 | CtCls string `xml:"ct_cls,attr"` 36 | Pri string `xml:"pri,attr"` 37 | TrID string `xml:"tr_id,attr"` 38 | RespTxt string `xml:"resp_txt,attr"` 39 | CtL string `xml:"ct_l,attr"` 40 | MCls string `xml:"m_cls,attr"` 41 | DRpt string `xml:"d_rpt,attr"` 42 | V string `xml:"v,attr"` 43 | MType string `xml:"m_type,attr"` 44 | Parts Parts `xml:"parts"` 45 | Addrs Addresses `xml:"addrs"` 46 | 47 | client *Client 48 | } 49 | 50 | // ID returns a unique ID by concatenating the 51 | // date of the message with its TRID. 52 | func (m MMS) ID() string { 53 | return fmt.Sprintf("%d_%s", m.Date, m.TrID) 54 | } 55 | 56 | // Timestamp returns the message's date. 57 | func (m MMS) Timestamp() time.Time { 58 | return time.Unix(0, m.Date*int64(time.Millisecond)) 59 | } 60 | 61 | // Class returns the class Message. 62 | func (m MMS) Class() timeliner.ItemClass { 63 | return timeliner.ClassMessage 64 | } 65 | 66 | // Owner returns the name and number of the sender, 67 | // if available. The export format does not give us 68 | // the contacts' names, however. 
69 | func (m MMS) Owner() (number *string, name *string) { 70 | for _, addr := range m.Addrs.Addr { 71 | if addr.Type == mmsAddrTypeSender { 72 | // TODO: Get sender name... for group texts this is tricky/impossible, since order varies 73 | // TODO: If there is only one other contact on the message (other than the account owner's number), we can probably assume the contact name is theirs. 74 | standardized, err := m.client.standardizePhoneNumber(addr.Address) 75 | if err != nil { 76 | // oh well; just go with what we have, I guess 77 | return &addr.Address, nil 78 | } 79 | return &standardized, nil 80 | } 81 | } 82 | return nil, nil 83 | } 84 | 85 | // DataText returns the text of the multimedia message, if any. 86 | func (m MMS) DataText() (*string, error) { 87 | var text string 88 | for _, part := range m.Parts.Part { 89 | if part.Seq < 0 { 90 | continue 91 | } 92 | if part.ContentType == "text/plain" && 93 | part.AttrText != "" && 94 | part.AttrText != "null" { 95 | text += part.AttrText 96 | } 97 | } 98 | if text != "" { 99 | return &text, nil 100 | } 101 | return nil, nil 102 | } 103 | 104 | // DataFileName returns the name of the file, if any. 105 | func (m MMS) DataFileName() *string { 106 | for _, part := range m.Parts.Part { 107 | if part.Seq < 0 { 108 | continue 109 | } 110 | if isMediaContentType(part.ContentType) { 111 | return &part.Filename 112 | } 113 | } 114 | return nil 115 | } 116 | 117 | // DataFileReader returns the data file reader, if any. 118 | func (m MMS) DataFileReader() (io.ReadCloser, error) { 119 | for _, part := range m.Parts.Part { 120 | if part.Seq < 0 { 121 | continue 122 | } 123 | if isMediaContentType(part.ContentType) { 124 | sr := strings.NewReader(part.Data) 125 | bd := base64.NewDecoder(base64.StdEncoding, sr) 126 | return timeliner.FakeCloser(bd), nil 127 | } 128 | } 129 | return nil, nil 130 | } 131 | 132 | // DataFileHash returns nil. 
133 | func (m MMS) DataFileHash() []byte { 134 | return nil 135 | } 136 | 137 | // DataFileMIMEType returns the MIME type, if any. 138 | func (m MMS) DataFileMIMEType() *string { 139 | for _, part := range m.Parts.Part { 140 | if isMediaContentType(part.ContentType) { 141 | return &part.ContentType 142 | } 143 | } 144 | return nil 145 | } 146 | 147 | // Metadata returns nil. 148 | func (m MMS) Metadata() (*timeliner.Metadata, error) { 149 | return nil, nil 150 | } 151 | 152 | // Location returns nil. 153 | func (m MMS) Location() (*timeliner.Location, error) { 154 | return nil, nil 155 | } 156 | 157 | // Parts is the parts of an MMS. 158 | type Parts struct { 159 | Text string `xml:",chardata"` 160 | Part []Part `xml:"part"` 161 | } 162 | 163 | // Part is a part of an MMS. 164 | type Part struct { 165 | Text string `xml:",chardata"` 166 | Seq int `xml:"seq,attr"` 167 | ContentType string `xml:"ct,attr"` 168 | Name string `xml:"name,attr"` 169 | Charset string `xml:"chset,attr"` 170 | Cd string `xml:"cd,attr"` 171 | Fn string `xml:"fn,attr"` 172 | Cid string `xml:"cid,attr"` 173 | Filename string `xml:"cl,attr"` 174 | CttS string `xml:"ctt_s,attr"` 175 | CttT string `xml:"ctt_t,attr"` 176 | AttrText string `xml:"text,attr"` 177 | Data string `xml:"data,attr"` 178 | } 179 | 180 | // Addresses is the addresses the MMS was sent to. 181 | type Addresses struct { 182 | Text string `xml:",chardata"` 183 | Addr []Address `xml:"addr"` 184 | } 185 | 186 | // Address is a sender or recipient of the MMS. 
type Address struct {
	Text    string `xml:",chardata"`
	Address string `xml:"address,attr"`
	Type    int    `xml:"type,attr"` // 151 = recipient, 137 = sender
	Charset string `xml:"charset,attr"`
}

// isMediaContentType reports whether ct is a MIME type that this
// package treats as a storable media attachment (images and videos).
func isMediaContentType(ct string) bool {
	return strings.HasPrefix(ct, "image/") ||
		strings.HasPrefix(ct, "video/")
}
--------------------------------------------------------------------------------
/datasources/smsbackuprestore/sms.go:
--------------------------------------------------------------------------------
package smsbackuprestore

import (
	"encoding/xml"
	"fmt"
	"io"
	"strings"
	"time"

	"github.com/mholt/timeliner"
)

// Smses was generated 2019-07-10 using an export from
// SMS Backup & Restore v10.05.602 (previous versions
// have a bug with emoji encodings).
type Smses struct {
	XMLName    xml.Name `xml:"smses"`
	Text       string   `xml:",chardata"`
	Count      int      `xml:"count,attr"`
	BackupSet  string   `xml:"backup_set,attr"`  // UUID
	BackupDate int64    `xml:"backup_date,attr"` // unix timestamp in milliseconds
	SMS        []SMS    `xml:"sms"`
	MMS        []MMS    `xml:"mms"`
}

// CommonSMSandMMSFields are the fields that both
// SMS and MMS share in common.
type CommonSMSandMMSFields struct {
	Text         string `xml:",chardata"`
	Address      string `xml:"address,attr"`
	Date         int64  `xml:"date,attr"` // unix timestamp in milliseconds
	Read         int    `xml:"read,attr"`
	Locked       int    `xml:"locked,attr"`
	DateSent     int64  `xml:"date_sent,attr"` // unix timestamp in (SMS: milliseconds, MMS: seconds)
	SubID        int    `xml:"sub_id,attr"`
	ReadableDate string `xml:"readable_date,attr"` // format: "Oct 20, 2017 12:35:30 PM"
	ContactName  string `xml:"contact_name,attr"`  // might be "(Unknown)"
}

// SMS represents a simple text message.
type SMS struct {
	CommonSMSandMMSFields
	Protocol      int    `xml:"protocol,attr"`
	Type          int    `xml:"type,attr"` // 1 = received, 2 = sent
	Subject       string `xml:"subject,attr"`
	Body          string `xml:"body,attr"`
	Toa           string `xml:"toa,attr"`
	ScToa         string `xml:"sc_toa,attr"`
	ServiceCenter string `xml:"service_center,attr"`
	Status        int    `xml:"status,attr"`

	// client is a back-reference used for phone number standardization.
	client *Client
}

// ID returns a unique ID for this text message.
// Because text messages do not have IDs, an ID
// is constructed by concatenating the millisecond
// timestamp of the message with a fast hash of
// the message body.
func (s SMS) ID() string {
	return fmt.Sprintf("%d_%s", s.Date, fastHash(s.Body))
}

// Timestamp returns the message's date.
func (s SMS) Timestamp() time.Time {
	return time.Unix(0, s.Date*int64(time.Millisecond))
}

// Class returns class Message.
func (s SMS) Class() timeliner.ItemClass {
	return timeliner.ClassMessage
}

// Owner returns the sender's phone number and name, if available.
// Sent messages are owned by the account holder; received messages
// by the (standardized) sender address. Any other Type value
// yields (nil, nil).
func (s SMS) Owner() (number *string, name *string) {
	switch s.Type {
	case smsTypeSent:
		return &s.client.account.UserID, nil
	case smsTypeReceived:
		if s.ContactName != "" && s.ContactName != "(Unknown)" {
			name = &s.ContactName
		}
		standardized, err := s.client.standardizePhoneNumber(s.Address)
		if err == nil {
			number = &standardized
		} else {
			number = &s.Address // oh well
		}
	}
	return
}

// DataText returns the text of the message.
func (s SMS) DataText() (*string, error) {
	body := strings.TrimSpace(s.Body)
	if body != "" {
		return &body, nil
	}
	return nil, nil
}

// DataFileName returns nil.
func (s SMS) DataFileName() *string {
	return nil
}

// DataFileReader returns nil.
func (s SMS) DataFileReader() (io.ReadCloser, error) {
	return nil, nil
}

// DataFileHash returns nil.
func (s SMS) DataFileHash() []byte {
	return nil
}

// DataFileMIMEType returns nil.
func (s SMS) DataFileMIMEType() *string {
	return nil
}

// Metadata returns nil.
func (s SMS) Metadata() (*timeliner.Metadata, error) {
	return nil, nil
}

// Location returns nil.
func (s SMS) Location() (*timeliner.Location, error) {
	return nil, nil
}
--------------------------------------------------------------------------------
/datasources/smsbackuprestore/smsbackuprestore.go:
--------------------------------------------------------------------------------
// Package smsbackuprestore implements a Timeliner data source for
// the Android SMS Backup & Restore app by SyncTech:
// https://synctech.com.au/sms-backup-restore/
package smsbackuprestore

import (
	"context"
	"encoding/xml"
	"fmt"
	"hash/fnv"
	"log"
	"os"

	"github.com/mholt/timeliner"
	"github.com/ttacon/libphonenumber"
)

// Data source name and ID.
const (
	DataSourceName = "SMS Backup & Restore"
	DataSourceID   = "smsbackuprestore"
)

var dataSource = timeliner.DataSource{
	ID:   DataSourceID,
	Name: DataSourceName,
	NewClient: func(acc timeliner.Account) (timeliner.Client, error) {
		return &Client{account: acc}, nil
	},
}

func init() {
	err := timeliner.RegisterDataSource(dataSource)
	if err != nil {
		log.Fatal(err)
	}
}

// Client implements the timeliner.Client interface.
type Client struct {
	// DefaultRegion is the region to assume for phone
	// numbers that do not have an explicit country
	// calling code. This value should be the ISO
	// 3166-1 alpha-2 standard region code.
	DefaultRegion string

	account timeliner.Account
}

// ListItems lists items from the data source. It decodes the backup
// XML file named by opt.Filename and sends every SMS and MMS down
// itemChan; for group MMS (>2 addresses) it also emits CC relations
// to the other participants.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
	defer close(itemChan)

	if opt.Filename == "" {
		return fmt.Errorf("filename is required")
	}

	// ensure the client's phone number is standardized
	// TODO: It would be better to have a hook in the account creation process to be able to do this
	ownerPhoneNum, err := c.standardizePhoneNumber(c.account.UserID)
	if err != nil {
		return fmt.Errorf("standardizing client phone number '%s': %v", c.account.UserID, err)
	}
	c.account.UserID = ownerPhoneNum

	xmlFile, err := os.Open(opt.Filename)
	if err != nil {
		return err
	}
	defer xmlFile.Close()

	// NOTE(review): the whole backup is decoded into memory at once;
	// presumably fine for typical backup sizes — confirm for very large files
	var data Smses
	dec := xml.NewDecoder(xmlFile)
	err = dec.Decode(&data)
	if err != nil {
		return fmt.Errorf("decoding XML file: %v", err)
	}

	for _, sms := range data.SMS {
		sms.client = c
		itemChan <- timeliner.NewItemGraph(sms)
	}

	for _, mms := range data.MMS {
		mms.client = c

		ig := timeliner.NewItemGraph(mms)

		// add relations to make sure other participants in a group text
		// are recorded; necessary if more than two participants
		if len(mms.Addrs.Addr) > 2 {
			ownerNum, _ := mms.Owner()
			if ownerNum != nil {
				for _, addr := range mms.Addrs.Addr {
					participantNum, err := c.standardizePhoneNumber(addr.Address)
					if err != nil {
						participantNum = addr.Address // oh well
					}
					// if this participant is not the owner of the message or
					// the account owner, then it must be another group member
					if participantNum != *ownerNum && participantNum != c.account.UserID {
						ig.Relations = append(ig.Relations, timeliner.RawRelation{
							FromItemID:     mms.ID(),
							ToPersonUserID: participantNum,
							Relation:       timeliner.RelCCed,
						})
					}
				}
			}
		}

		itemChan <- ig
	}

	return nil
}

// fastHash hashes input using a fast 32-bit hashing algorithm
// and returns the hash as a hex-encoded string. Do not use this
// for cryptographic purposes. If the hashing fails for some
// reason, an empty string is returned.
func fastHash(input string) string {
	h := fnv.New32a()
	h.Write([]byte(input)) // fnv's Write never returns an error
	return fmt.Sprintf("%x", h.Sum32())
}

// standardizePhoneNumber attempts to parse number and returns
// a standardized version in E164 format. If the number does
// not have an explicit region/country code, the country code
// for c.DefaultRegion is used instead.
//
// We chose E164 because that's what Twilio uses.
func (c *Client) standardizePhoneNumber(number string) (string, error) {
	ph, err := libphonenumber.Parse(number, c.DefaultRegion)
	if err != nil {
		return "", err
	}
	return libphonenumber.Format(ph, libphonenumber.E164), nil
}

const (
	smsTypeReceived = 1
	smsTypeSent     = 2

	mmsAddrTypeRecipient = 151
	mmsAddrTypeSender    = 137
)
--------------------------------------------------------------------------------
/datasources/twitter/api.go:
--------------------------------------------------------------------------------
package twitter

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strconv"
	"strings"

	"github.com/mholt/timeliner"
)

func (c *Client) getFromAPI(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
	// load any previous checkpoint
	c.checkpoint.load(opt.Checkpoint)

	// get account owner information
	cleanedScreenName :=
strings.TrimPrefix(c.acc.UserID, "@")
	ownerAccount, err := c.getAccountFromAPI(cleanedScreenName, "")
	if err != nil {
		return fmt.Errorf("getting user account information for @%s: %v", cleanedScreenName, err)
	}
	c.ownerAccount = ownerAccount

	// get the starting bounds of this operation
	var maxTweet, minTweet string
	if opt.Timeframe.SinceItemID != nil {
		minTweet = *opt.Timeframe.SinceItemID
	}
	if c.checkpoint.LastTweetID != "" {
		// by default, start off at the last checkpoint
		maxTweet = c.checkpoint.LastTweetID
		if opt.Timeframe.UntilItemID != nil {
			// if both a timeframe UntilItemID and a checkpoint are set,
			// we will choose the one with a tweet ID that is higher,
			// meaning more recent, to avoid potentially skipping
			// a chunk of the timeline
			maxTweet = maxTweetID(c.checkpoint.LastTweetID, *opt.Timeframe.UntilItemID)
		}
	}

	// page backwards through the timeline until no tweets remain
	// or the context is canceled
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
			tweets, err := c.nextPageOfTweetsFromAPI(maxTweet, minTweet)
			if err != nil {
				return fmt.Errorf("getting next page of tweets: %v", err)
			}

			// we are done when there are no more tweets
			if len(tweets) == 0 {
				return nil
			}

			for _, t := range tweets {
				err = c.processTweetFromAPI(t, itemChan)
				if err != nil {
					return fmt.Errorf("processing tweet from API: %v", err)
				}
			}

			// since max_id is inclusive, subtract 1 from the tweet ID
			// https://developer.twitter.com/en/docs/tweets/timelines/guides/working-with-timelines
			nextTweetID := tweets[len(tweets)-1].TweetID - 1
			c.checkpoint.LastTweetID = strconv.FormatInt(int64(nextTweetID), 10)
			c.checkpoint.save(ctx)

			// decrease maxTweet to get the next page on next iteration
			maxTweet = c.checkpoint.LastTweetID
		}
	}
}

// processTweetFromAPI prepares a tweet obtained from the API and,
// unless the preparation says to skip it, converts it into an
// ItemGraph and sends it down itemChan.
func (c *Client) processTweetFromAPI(t tweet, itemChan chan<- *timeliner.ItemGraph) error {
	skip, err := c.prepareTweet(&t, "api")
	if err != nil {
		return fmt.Errorf("preparing tweet: %v", err)
	}
	if skip {
		return nil
	}

	ig, err := c.makeItemGraphFromTweet(t, "")
	if err != nil {
		return fmt.Errorf("processing tweet %s: %v", t.ID(), err)
	}

	// send the tweet for processing
	if ig != nil {
		itemChan <- ig
	}

	return nil
}

// nextPageOfTweetsFromAPI returns the next page of tweets starting at maxTweet
// and going for a full page or until minTweet, whichever comes first. Generally,
// iterating over this function will involve decreasing maxTweet and leaving
// minTweet the same, if set at all (maxTweet = "until", minTweet = "since").
// Either or both can be empty strings, for no boundaries. This function returns
// at least 0 tweets (signaling done, I think) or up to a full page of tweets.
func (c *Client) nextPageOfTweetsFromAPI(maxTweet, minTweet string) ([]tweet, error) {
	q := url.Values{
		"user_id":         {c.ownerAccount.id()},
		"count":           {"200"},
		"tweet_mode":      {"extended"}, // https://developer.twitter.com/en/docs/tweets/tweet-updates
		"exclude_replies": {"false"},    // always include replies in case it's a self-reply; we can filter all others
		"include_rts":     {"false"},
	}
	if c.Retweets {
		q.Set("include_rts", "true")
	}
	if maxTweet != "" {
		q.Set("max_id", maxTweet)
	}
	if minTweet != "" {
		q.Set("since_id", minTweet)
	}
	u := "https://api.twitter.com/1.1/statuses/user_timeline.json?" + q.Encode()

	resp, err := c.HTTPClient.Get(u)
	if err != nil {
		return nil, fmt.Errorf("performing API request: %v", err)
	}
	defer resp.Body.Close()

	// TODO: handle HTTP errors, esp. rate limiting, a lot better
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %s: %s", u, resp.Status)
	}

	var tweets []tweet
	err = json.NewDecoder(resp.Body).Decode(&tweets)
	if err != nil {
		return nil, fmt.Errorf("reading response body: %v", err)
	}

	return tweets, nil
}

// getAccountFromAPI gets the account information for either
// screenName, if set, or accountID, if set. Set only one;
// leave the other argument empty string.
func (c *Client) getAccountFromAPI(screenName, accountID string) (twitterAccount, error) {
	var ta twitterAccount

	q := make(url.Values)
	if screenName != "" {
		q.Set("screen_name", screenName)
	} else if accountID != "" {
		q.Set("user_id", accountID)
	}

	u := "https://api.twitter.com/1.1/users/show.json?" + q.Encode()

	resp, err := c.HTTPClient.Get(u)
	if err != nil {
		return ta, fmt.Errorf("performing API request: %v", err)
	}
	defer resp.Body.Close()

	// TODO: handle HTTP errors, esp. rate limiting, a lot better
	if resp.StatusCode != http.StatusOK {
		return ta, fmt.Errorf("HTTP error: %s: %s", u, resp.Status)
	}

	err = json.NewDecoder(resp.Body).Decode(&ta)
	if err != nil {
		return ta, fmt.Errorf("reading response body: %v", err)
	}

	return ta, nil
}

// getTweetFromAPI fetches a single tweet by ID. A 404 (deleted tweet)
// or 403 (suspended author) is not an error: the zero tweet is
// returned instead, which callers skip as empty.
func (c *Client) getTweetFromAPI(id string) (tweet, error) {
	var t tweet

	q := url.Values{
		"id":         {id},
		"tweet_mode": {"extended"}, // https://developer.twitter.com/en/docs/tweets/tweet-updates
	}
	u := "https://api.twitter.com/1.1/statuses/show.json?"
+ q.Encode() 187 | 188 | resp, err := c.HTTPClient.Get(u) 189 | if err != nil { 190 | return t, fmt.Errorf("performing API request: %v", err) 191 | } 192 | defer resp.Body.Close() 193 | 194 | switch resp.StatusCode { 195 | case http.StatusNotFound: 196 | // this is okay, because the tweet may simply have been deleted, 197 | // and we skip empty tweets anyway 198 | fallthrough 199 | case http.StatusForbidden: 200 | // this happens when the author's account is suspended 201 | return t, nil 202 | case http.StatusOK: 203 | break 204 | default: 205 | // TODO: handle HTTP errors, esp. rate limiting, a lot better 206 | return t, fmt.Errorf("HTTP error: %s: %s", u, resp.Status) 207 | } 208 | 209 | err = json.NewDecoder(resp.Body).Decode(&t) 210 | if err != nil { 211 | return t, fmt.Errorf("reading response body: %v", err) 212 | } 213 | 214 | return t, nil 215 | } 216 | -------------------------------------------------------------------------------- /datasources/twitter/api_test.go: -------------------------------------------------------------------------------- 1 | package twitter 2 | 3 | import ( 4 | "encoding/json" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | func TestDecodeTwitterAccount(t *testing.T) { 10 | // try decode a "kitchen sink", so that we can test that most features get decoded correctly 11 | twitterAccountAPIResponseJSON := strings.NewReader(` 12 | { 13 | "id": 9876543, 14 | "id_str": "9876543", 15 | "name": "barry", 16 | "screen_name": "testingperson", 17 | "location": "In your hearts and minds", 18 | "profile_location": null, 19 | "description": "i am the what i was of what i will be.", 20 | "url": "https:\/\/t.co\/abcdefghij", 21 | "entities": { 22 | "url": { 23 | "urls": [ 24 | { 25 | "url": "https:\/\/t.co\/abcdefghij", 26 | "expanded_url": "http:\/\/Instagram.com\/demotestingIGperson", 27 | "display_url": "Instagram.com\/demotestingIGperson", 28 | "indices": [ 29 | 0, 30 | 23 31 | ] 32 | } 33 | ] 34 | }, 35 | "description": { 36 | "urls": [ 37 | 38 | ] 
39 | } 40 | }, 41 | "protected": false, 42 | "followers_count": 161, 43 | "friends_count": 280, 44 | "listed_count": 8, 45 | "created_at": "Wed Mar 21 18:13:14 +0000 2007", 46 | "favourites_count": 2279, 47 | "utc_offset": null, 48 | "time_zone": null, 49 | "geo_enabled": true, 50 | "verified": false, 51 | "statuses_count": 1729, 52 | "lang": null, 53 | "status": { 54 | "created_at": "Wed Nov 27 18:54:49 +0000 2019", 55 | "id": 1234567890123456789, 56 | "id_str": "1234567890123456789", 57 | "text": "Demo tweet #testing https:\/\/t.co\/abcdefgijk", 58 | "truncated": false, 59 | "entities": { 60 | "hashtags": [ 61 | { 62 | "text": "testing", 63 | "indices": [ 64 | 0, 65 | 8 66 | ] 67 | } 68 | ], 69 | "symbols": [ 70 | 71 | ], 72 | "user_mentions": [ 73 | 74 | ], 75 | "urls": [ 76 | { 77 | "url": "https:\/\/t.co\/abcdefgijk", 78 | "expanded_url": "https:\/\/www.instagram.com\/p\/BAABAABAABA\/?igshid=xyxyxyxyxyxyx", 79 | "display_url": "instagram.com\/p\/BAABAABAABA\/\u2026", 80 | "indices": [ 81 | 52, 82 | 75 83 | ] 84 | } 85 | ] 86 | }, 87 | "source": "\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e", 88 | "in_reply_to_status_id": null, 89 | "in_reply_to_status_id_str": null, 90 | "in_reply_to_user_id": null, 91 | "in_reply_to_user_id_str": null, 92 | "in_reply_to_screen_name": null, 93 | "geo": { 94 | "type": "Point", 95 | "coordinates": [ 96 | 34.0522, 97 | -118.243 98 | ] 99 | }, 100 | "coordinates": { 101 | "type": "Point", 102 | "coordinates": [ 103 | -118.243, 104 | 34.0522 105 | ] 106 | }, 107 | "place": { 108 | "id": "3b77caf94bfc81fe", 109 | "url": "https:\/\/api.twitter.com\/1.1\/geo\/id\/3b77caf94bfc81fe.json", 110 | "place_type": "city", 111 | "name": "Los Angeles", 112 | "full_name": "Los Angeles, CA", 113 | "country_code": "US", 114 | "country": "USA", 115 | "contained_within": [ 116 | 117 | ], 118 | "bounding_box": { 119 | "type": "Polygon", 120 | "coordinates": [ 121 | [ 122 | [ 123 | -118.668404, 124 | 33.704538 
125 | ], 126 | [ 127 | -118.155409, 128 | 33.704538 129 | ], 130 | [ 131 | -118.155409, 132 | 34.337041 133 | ], 134 | [ 135 | -118.668404, 136 | 34.337041 137 | ] 138 | ] 139 | ] 140 | }, 141 | "attributes": { 142 | 143 | } 144 | }, 145 | "contributors": null, 146 | "is_quote_status": false, 147 | "retweet_count": 0, 148 | "favorite_count": 0, 149 | "favorited": false, 150 | "retweeted": false, 151 | "possibly_sensitive": false, 152 | "lang": "en" 153 | }, 154 | "contributors_enabled": false, 155 | "is_translator": false, 156 | "is_translation_enabled": false, 157 | "profile_background_color": "FFFFFF", 158 | "profile_background_image_url": "http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png", 159 | "profile_background_image_url_https": "https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png", 160 | "profile_background_tile": true, 161 | "profile_image_url": "http:\/\/pbs.twimg.com\/profile_images\/923335960007340032\/pIbUjNkC_normal.jpg", 162 | "profile_image_url_https": "https:\/\/pbs.twimg.com\/profile_images\/923335960007340032\/pIbUjNkC_normal.jpg", 163 | "profile_banner_url": "https:\/\/pbs.twimg.com\/profile_banners\/9876543\/1508975481", 164 | "profile_link_color": "0012BB", 165 | "profile_sidebar_border_color": "AAAAAA", 166 | "profile_sidebar_fill_color": "FFFFFF", 167 | "profile_text_color": "000000", 168 | "profile_use_background_image": false, 169 | "has_extended_profile": false, 170 | "default_profile": false, 171 | "default_profile_image": false, 172 | "can_media_tag": null, 173 | "followed_by": null, 174 | "following": null, 175 | "follow_request_sent": null, 176 | "notifications": null, 177 | "translator_type": "none" 178 | } 179 | `) 180 | 181 | var acc twitterAccount 182 | assertTrue(t, json.NewDecoder(twitterAccountAPIResponseJSON).Decode(&acc) == nil) 183 | 184 | // NOTE: assertions skipped for fields typed interface{} 185 | 186 | assertTrue(t, acc.ID == 9876543) 187 | assertEqualString(t, acc.IDStr, "9876543") 188 | assertEqualString(t, 
acc.ScreenName, "testingperson")
	assertEqualString(t, acc.Name, "barry")
	assertEqualString(t, acc.Location, "In your hearts and minds")
	assertEqualString(t, acc.Description, "i am the what i was of what i will be.")
	assertEqualString(t, acc.URL, "https://t.co/abcdefghij")

	// boolean profile flags
	assertTrue(t, !acc.Protected)
	assertTrue(t, acc.GeoEnabled)
	assertTrue(t, !acc.Verified)
	assertTrue(t, !acc.ContributorsEnabled)
	assertTrue(t, !acc.HasExtendedProfile)

	// numeric counters
	assertTrue(t, acc.FollowersCount == 161)
	assertTrue(t, acc.ListedCount == 8)
	assertTrue(t, acc.FavouritesCount == 2279)
	assertTrue(t, acc.StatusesCount == 1729)

	assertEqualString(t, acc.Lang, "")
	assertTrue(t, !acc.IsTranslator)
	assertTrue(t, !acc.IsTranslationEnabled)
	assertEqualString(t, acc.TranslatorType, "none")

	// profile appearance fields
	assertTrue(t, !acc.ProfileUseBackgroundImage)
	assertTrue(t, !acc.DefaultProfile)
	assertTrue(t, !acc.DefaultProfileImage)
	assertTrue(t, acc.ProfileBackgroundTile)
	assertEqualString(t, acc.ProfileBackgroundColor, "FFFFFF")
	assertEqualString(t, acc.ProfileBackgroundImageURL, "http://abs.twimg.com/images/themes/theme1/bg.png")
	assertEqualString(t, acc.ProfileBackgroundImageURLHTTPS, "https://abs.twimg.com/images/themes/theme1/bg.png")
	assertEqualString(t, acc.ProfileImageURL, "http://pbs.twimg.com/profile_images/923335960007340032/pIbUjNkC_normal.jpg")
	assertEqualString(t, acc.ProfileImageURLHTTPS, "https://pbs.twimg.com/profile_images/923335960007340032/pIbUjNkC_normal.jpg")
	assertEqualString(t, acc.ProfileBannerURL, "https://pbs.twimg.com/profile_banners/9876543/1508975481")
	assertEqualString(t, acc.ProfileLinkColor, "0012BB")
	assertEqualString(t, acc.ProfileSidebarBorderColor, "AAAAAA")
	assertEqualString(t, acc.ProfileSidebarFillColor, "FFFFFF")
	assertEqualString(t, acc.ProfileTextColor, "000000")

	// the fixture embeds the account's latest tweet; verify it too
	latestTweet := acc.Status // shorthand

	assertEqualString(t, latestTweet.TweetIDStr, "1234567890123456789")
	assertTrue(t, latestTweet.TweetID == 1234567890123456789)
	assertTrue(t, latestTweet.User == nil)
	assertEqualString(t, latestTweet.CreatedAt, "Wed Nov 27 18:54:49 +0000 2019")
	assertEqualString(t, latestTweet.Text, "Demo tweet #testing https://t.co/abcdefgijk")
	assertEqualString(t, latestTweet.FullText, "")
	assertEqualString(t, latestTweet.Lang, "en")
	assertEqualString(t, latestTweet.Source, `<a href="http://instagram.com" rel="nofollow">Instagram</a>`)
	assertTrue(t, !latestTweet.Truncated)
	assertTrue(t, !latestTweet.PossiblySensitive)
	assertTrue(t, !latestTweet.IsQuoteStatus)

	assertEqualString(t, latestTweet.InReplyToScreenName, "")
	assertTrue(t, latestTweet.InReplyToStatusID == 0)
	assertEqualString(t, latestTweet.InReplyToStatusIDStr, "")
	assertTrue(t, latestTweet.InReplyToUserID == 0)
	assertEqualString(t, latestTweet.InReplyToUserIDStr, "")

	assertTrue(t, !latestTweet.WithheldCopyright)
	assertTrue(t, len(latestTweet.WithheldInCountries) == 0)
	assertEqualString(t, latestTweet.WithheldScope, "")

	assertTrue(t, !latestTweet.Favorited)
	assertTrue(t, latestTweet.FavoriteCount == 0)

	assertTrue(t, !latestTweet.Retweeted)
	assertTrue(t, latestTweet.RetweetedStatus == nil)
	assertTrue(t, latestTweet.RetweetCount == 0)

	assertTrue(t, len(latestTweet.DisplayTextRange) == 0)

	assertTrue(t, latestTweet.Coordinates.Latitude() == 34.0522)
	assertTrue(t, latestTweet.Coordinates.Longitude() == -118.243)

	assertTrue(t, latestTweet.ExtendedEntities == nil)
	// I was too lazy to type assertions for the "entities" hierarchy, so we're just comparing
	// re-serialized versions.
this would catch if we would have had typos in JSON field 264 | // names (they would not get decoded, and hence would not get re-serialized) 265 | entitiesJSON, err := json.MarshalIndent(latestTweet.Entities, "", " ") 266 | assertTrue(t, err == nil) 267 | assertEqualString(t, string(entitiesJSON), `{ 268 | "hashtags": [ 269 | { 270 | "indices": [ 271 | 0, 272 | 8 273 | ], 274 | "text": "testing" 275 | } 276 | ], 277 | "symbols": [], 278 | "user_mentions": [], 279 | "urls": [ 280 | { 281 | "url": "https://t.co/abcdefgijk", 282 | "expanded_url": "https://www.instagram.com/p/BAABAABAABA/?igshid=xyxyxyxyxyxyx", 283 | "display_url": "instagram.com/p/BAABAABAABA/…", 284 | "indices": [ 285 | 52, 286 | 75 287 | ] 288 | } 289 | ], 290 | "polls": null 291 | }`) 292 | } 293 | 294 | func assertEqualString(t *testing.T, actual string, expected string) { 295 | t.Helper() 296 | 297 | if actual != expected { 298 | t.Fatalf("exp=%v; got=%v", expected, actual) 299 | } 300 | } 301 | 302 | func assertTrue(t *testing.T, val bool) { 303 | t.Helper() 304 | 305 | if !val { 306 | t.Fatal("expected true; got false") 307 | } 308 | } 309 | -------------------------------------------------------------------------------- /datasources/twitter/archives.go: -------------------------------------------------------------------------------- 1 | package twitter 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/mholt/archiver/v3" 9 | "github.com/mholt/timeliner" 10 | ) 11 | 12 | func (c *Client) getFromArchiveFile(itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 13 | // load the user's account ID 14 | var err error 15 | c.ownerAccount, err = c.getOwnerAccountFromArchive(opt.Filename) 16 | if err != nil { 17 | return fmt.Errorf("unable to get user account ID: %v", err) 18 | } 19 | 20 | // first pass - add tweets to timeline 21 | err = c.processArchive(opt.Filename, itemChan, c.makeItemGraphFromTweet) 22 | if err != nil { 23 | return 
fmt.Errorf("processing tweets: %v", err) 24 | } 25 | 26 | // second pass - add tweet relationships to timeline 27 | err = c.processArchive(opt.Filename, itemChan, c.processReplyRelationFromArchive) 28 | if err != nil { 29 | return fmt.Errorf("processing tweets: %v", err) 30 | } 31 | 32 | return nil 33 | } 34 | 35 | func (c *Client) processArchive(archiveFilename string, itemChan chan<- *timeliner.ItemGraph, processFunc archiveProcessFn) error { 36 | err := archiver.Walk(archiveFilename, func(f archiver.File) error { 37 | defer f.Close() 38 | if f.Name() != "tweet.js" { 39 | return nil 40 | } 41 | 42 | // consume non-JSON preface (JavaScript variable definition) 43 | err := stripPreface(f, tweetFilePreface) 44 | if err != nil { 45 | return fmt.Errorf("reading tweet file preface: %v", err) 46 | } 47 | 48 | err = c.processTweetsFromArchive(itemChan, f, archiveFilename, processFunc) 49 | if err != nil { 50 | return fmt.Errorf("processing tweet file: %v", err) 51 | } 52 | 53 | return archiver.ErrStopWalk 54 | }) 55 | if err != nil { 56 | return fmt.Errorf("walking archive file %s: %v", archiveFilename, err) 57 | } 58 | 59 | return nil 60 | } 61 | 62 | func (c *Client) processTweetsFromArchive(itemChan chan<- *timeliner.ItemGraph, f io.Reader, 63 | archiveFilename string, processFunc archiveProcessFn) error { 64 | 65 | dec := json.NewDecoder(f) 66 | 67 | // read array opening bracket '[' 68 | _, err := dec.Token() 69 | if err != nil { 70 | return fmt.Errorf("decoding opening token: %v", err) 71 | } 72 | 73 | for dec.More() { 74 | var t tweet 75 | err := dec.Decode(&t) 76 | if err != nil { 77 | return fmt.Errorf("decoding tweet element: %v", err) 78 | } 79 | 80 | skip, err := c.prepareTweet(&t, "archive") 81 | if err != nil { 82 | return fmt.Errorf("preparing tweet: %v", err) 83 | } 84 | if skip { 85 | continue 86 | } 87 | 88 | ig, err := processFunc(t, archiveFilename) 89 | if err != nil { 90 | return fmt.Errorf("processing tweet: %v", err) 91 | } 92 | 93 | // send the 
tweet(s) for processing 94 | if ig != nil { 95 | itemChan <- ig 96 | } 97 | } 98 | 99 | return nil 100 | } 101 | 102 | func (c *Client) processReplyRelationFromArchive(t tweet, archiveFilename string) (*timeliner.ItemGraph, error) { 103 | if t.InReplyToStatusIDStr == "" { 104 | // current tweet is not a reply, so no relationship to add 105 | return nil, nil 106 | } 107 | if t.InReplyToUserIDStr != "" && t.InReplyToUserIDStr != c.ownerAccount.id() { 108 | // from archives, we only support storing replies to self... (TODO) 109 | return nil, nil 110 | } 111 | 112 | ig := &timeliner.ItemGraph{ 113 | Relations: []timeliner.RawRelation{ 114 | { 115 | FromItemID: t.TweetIDStr, 116 | ToItemID: t.InReplyToStatusIDStr, 117 | Relation: timeliner.RelReplyTo, 118 | }, 119 | }, 120 | } 121 | 122 | return ig, nil 123 | } 124 | 125 | func (c *Client) getOwnerAccountFromArchive(filename string) (twitterAccount, error) { 126 | var ta twitterAccount 127 | err := archiver.Walk(filename, func(f archiver.File) error { 128 | defer f.Close() 129 | if f.Name() != "account.js" { 130 | return nil 131 | } 132 | 133 | // consume non-JSON preface (JavaScript variable definition) 134 | err := stripPreface(f, accountFilePreface) 135 | if err != nil { 136 | return fmt.Errorf("reading account file preface: %v", err) 137 | } 138 | 139 | var accFile twitterAccountFile 140 | err = json.NewDecoder(f).Decode(&accFile) 141 | if err != nil { 142 | return fmt.Errorf("decoding account file: %v", err) 143 | } 144 | if len(accFile) == 0 { 145 | return fmt.Errorf("account file was empty") 146 | } 147 | 148 | ta = accFile[0].Account 149 | 150 | return archiver.ErrStopWalk 151 | }) 152 | return ta, err 153 | } 154 | 155 | func stripPreface(f io.Reader, preface string) error { 156 | buf := make([]byte, len(preface)) 157 | _, err := io.ReadFull(f, buf) 158 | return err 159 | } 160 | 161 | // archiveProcessFn is a function that processes a 162 | // tweet from a Twitter export archive and returns 163 | // an 
ItemGraph created from t. 164 | type archiveProcessFn func(t tweet, archiveFilename string) (*timeliner.ItemGraph, error) 165 | 166 | // Variable definitions that are intended for 167 | // use with JavaScript but which are of no use 168 | // to us and would break the JSON parser. 169 | const ( 170 | tweetFilePreface = "window.YTD.tweet.part0 =" 171 | accountFilePreface = "window.YTD.account.part0 =" 172 | ) 173 | -------------------------------------------------------------------------------- /datasources/twitter/twitter.go: -------------------------------------------------------------------------------- 1 | // Package twitter implements a Timeliner service for importing 2 | // and downloading data from Twitter. 3 | package twitter 4 | 5 | import ( 6 | "archive/zip" 7 | "bytes" 8 | "context" 9 | "fmt" 10 | "io" 11 | "log" 12 | "net/http" 13 | "net/url" 14 | "path" 15 | "regexp" 16 | "strconv" 17 | "time" 18 | 19 | "github.com/mholt/archiver/v3" 20 | "github.com/mholt/timeliner" 21 | ) 22 | 23 | // Service name and ID. 24 | const ( 25 | DataSourceName = "Twitter" 26 | DataSourceID = "twitter" 27 | ) 28 | 29 | var dataSource = timeliner.DataSource{ 30 | ID: DataSourceID, 31 | Name: DataSourceName, 32 | OAuth2: timeliner.OAuth2{ 33 | ProviderID: "twitter", 34 | }, 35 | RateLimit: timeliner.RateLimit{ 36 | // from https://developer.twitter.com/en/docs/basics/rate-limits 37 | // with some leeway since it's actually a pretty generous limit 38 | RequestsPerHour: 5900, 39 | }, 40 | NewClient: func(acc timeliner.Account) (timeliner.Client, error) { 41 | httpClient, err := acc.NewHTTPClient() 42 | if err != nil { 43 | return nil, err 44 | } 45 | return &Client{ 46 | HTTPClient: httpClient, 47 | acc: acc, 48 | otherAccounts: make(map[string]twitterAccount), 49 | }, nil 50 | }, 51 | } 52 | 53 | func init() { 54 | err := timeliner.RegisterDataSource(dataSource) 55 | if err != nil { 56 | log.Fatal(err) 57 | } 58 | } 59 | 60 | // Client implements the timeliner.Client interface. 
61 | type Client struct { 62 | Retweets bool // whether to include retweets 63 | Replies bool // whether to include replies to tweets that are not our own; i.e. are not a continuation of thought 64 | 65 | HTTPClient *http.Client 66 | 67 | checkpoint checkpointInfo 68 | 69 | acc timeliner.Account 70 | ownerAccount twitterAccount 71 | otherAccounts map[string]twitterAccount // keyed by user/account ID 72 | } 73 | 74 | // ListItems lists items from opt.Filename if specified, or from the API otherwise. 75 | func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error { 76 | defer close(itemChan) 77 | 78 | if opt.Filename != "" { 79 | return c.getFromArchiveFile(itemChan, opt) 80 | } 81 | 82 | return c.getFromAPI(ctx, itemChan, opt) 83 | } 84 | 85 | func (c *Client) prepareTweet(t *tweet, source string) (skip bool, err error) { 86 | // mark whether this tweet came from the API or an export file 87 | t.source = source 88 | 89 | // set the owner account information; this has to be done differently 90 | // depending on the source (it's not embedded in the archive's tweets...) 
91 | switch t.source { 92 | case "archive": 93 | t.ownerAccount = c.ownerAccount 94 | case "api": 95 | if t.User != nil { 96 | if t.User.UserIDStr == c.ownerAccount.id() { 97 | // tweet author is the owner of the account - awesome 98 | t.ownerAccount = c.ownerAccount 99 | } else { 100 | // look up author's account info 101 | acc, ok := c.otherAccounts[t.User.UserIDStr] 102 | if !ok { 103 | acc, err = c.getAccountFromAPI("", t.User.UserIDStr) 104 | if err != nil { 105 | return false, fmt.Errorf("looking up tweet author's account information: %v", err) 106 | } 107 | // cache this for later 108 | if len(c.otherAccounts) > 2000 { 109 | for id := range c.otherAccounts { 110 | delete(c.otherAccounts, id) 111 | break 112 | } 113 | } 114 | c.otherAccounts[acc.IDStr] = acc 115 | } 116 | t.ownerAccount = acc 117 | } 118 | } 119 | default: 120 | return false, fmt.Errorf("unrecognized source: %s", t.source) 121 | } 122 | 123 | // skip empty tweets 124 | if t.isEmpty() { 125 | return true, nil 126 | } 127 | 128 | // skip tweets we aren't interested in 129 | if !c.Retweets && t.isRetweet() { 130 | return true, nil 131 | } 132 | if !c.Replies && t.InReplyToUserIDStr != "" && t.InReplyToUserIDStr != t.ownerAccount.id() { 133 | // TODO: Replies should have more context, like what are we replying to, etc... the whole thread, even? 
134 | // this option is about replies to tweets other than our own (which are like a continuation of one thought) 135 | return true, nil 136 | } 137 | 138 | // parse Twitter's time string into an actual time value 139 | t.createdAtParsed, err = time.Parse("Mon Jan 2 15:04:05 -0700 2006", t.CreatedAt) 140 | if err != nil { 141 | return false, fmt.Errorf("parsing created_at time: %v", err) 142 | } 143 | 144 | return false, nil 145 | } 146 | 147 | func (c *Client) makeItemGraphFromTweet(t tweet, archiveFilename string) (*timeliner.ItemGraph, error) { 148 | oneMediaItem := t.hasExactlyOneMediaItem() 149 | 150 | // only create a tweet item if it has text OR exactly one media item 151 | // (because we don't want an empty item; we process each media item 152 | // as a separate item, unless there's exactly 1, in which case we 153 | // in-line it into the tweet itself) 154 | var ig *timeliner.ItemGraph 155 | if t.text() != "" || !oneMediaItem { 156 | ig = timeliner.NewItemGraph(&t) 157 | } 158 | 159 | // process the media items attached to the tweet 160 | if t.ExtendedEntities != nil { 161 | var collItems []timeliner.CollectionItem 162 | 163 | for i, m := range t.ExtendedEntities.Media { 164 | m.parent = &t 165 | 166 | var dataFileName string 167 | if dfn := m.DataFileName(); dfn == nil || *dfn == "" { 168 | log.Printf("[ERROR][%s/%s] Tweet media has no data file name: %+v", 169 | DataSourceID, c.acc.UserID, m) 170 | continue 171 | } else { 172 | dataFileName = *dfn 173 | } 174 | 175 | switch t.source { 176 | case "archive": 177 | targetFileInArchive := path.Join("tweet_media", dataFileName) 178 | 179 | err := archiver.Walk(archiveFilename, func(f archiver.File) error { 180 | if f.Header.(zip.FileHeader).Name != targetFileInArchive { 181 | return nil 182 | } 183 | 184 | buf := new(bytes.Buffer) 185 | _, err := io.Copy(buf, f) 186 | if err != nil { 187 | return fmt.Errorf("copying item into memory: %v", err) 188 | } 189 | m.readCloser = timeliner.FakeCloser(buf) 190 | 191 | 
return archiver.ErrStopWalk 192 | }) 193 | if err != nil { 194 | return nil, fmt.Errorf("walking archive file %s in search of tweet media: %v", 195 | archiveFilename, err) 196 | } 197 | 198 | case "api": 199 | mediaURL := m.getURL() 200 | if m.Type == "photo" { 201 | mediaURL += ":orig" // get original file, with metadata 202 | } 203 | resp, err := http.Get(mediaURL) 204 | if err != nil { 205 | return nil, fmt.Errorf("getting media resource %s: %v", m.MediaURLHTTPS, err) 206 | } 207 | if resp.StatusCode != http.StatusOK { 208 | return nil, fmt.Errorf("media resource returned HTTP status %s: %s", resp.Status, m.MediaURLHTTPS) 209 | } 210 | m.readCloser = resp.Body 211 | 212 | default: 213 | return nil, fmt.Errorf("unrecognized source value: must be api or archive: %s", t.source) 214 | } 215 | 216 | if !oneMediaItem { 217 | if ig != nil { 218 | ig.Add(m, timeliner.RelAttached) 219 | } 220 | collItems = append(collItems, timeliner.CollectionItem{ 221 | Item: m, 222 | Position: i, 223 | }) 224 | } 225 | } 226 | 227 | if len(collItems) > 0 { 228 | ig.Collections = append(ig.Collections, timeliner.Collection{ 229 | OriginalID: "tweet_" + t.ID(), 230 | Items: collItems, 231 | }) 232 | } 233 | } 234 | 235 | // if we're using the API, go ahead and get the 236 | // 'parent' tweet to which this tweet is a reply 237 | if t.source == "api" && t.InReplyToStatusIDStr != "" { 238 | inReplyToTweet, err := c.getTweetFromAPI(t.InReplyToStatusIDStr) 239 | if err != nil { 240 | return nil, fmt.Errorf("getting tweet that this tweet (%s) is in reply to (%s): %v", 241 | t.ID(), t.InReplyToStatusIDStr, err) 242 | } 243 | skip, err := c.prepareTweet(&inReplyToTweet, "api") 244 | if err != nil { 245 | return nil, fmt.Errorf("preparing reply-parent tweet: %v", err) 246 | } 247 | if !skip { 248 | repIG, err := c.makeItemGraphFromTweet(inReplyToTweet, "") 249 | if err != nil { 250 | return nil, fmt.Errorf("making item from tweet that this tweet (%s) is in reply to (%s): %v", 251 | t.ID(), 
inReplyToTweet.ID(), err) 252 | } 253 | ig.Edges[repIG] = []timeliner.Relation{timeliner.RelReplyTo} 254 | } 255 | } 256 | 257 | // if this tweet embeds/quotes/links to other tweets, 258 | // we should establish those relationships as well 259 | if t.source == "api" && t.Entities != nil { 260 | for _, urlEnt := range t.Entities.URLs { 261 | embeddedTweetID := getLinkedTweetID(urlEnt.ExpandedURL) 262 | if embeddedTweetID == "" { 263 | continue 264 | } 265 | embeddedTweet, err := c.getTweetFromAPI(embeddedTweetID) 266 | if err != nil { 267 | return nil, fmt.Errorf("getting tweet that this tweet (%s) embeds (%s): %v", 268 | t.ID(), t.InReplyToStatusIDStr, err) 269 | } 270 | skip, err := c.prepareTweet(&embeddedTweet, "api") 271 | if err != nil { 272 | return nil, fmt.Errorf("preparing embedded tweet: %v", err) 273 | } 274 | if !skip { 275 | embIG, err := c.makeItemGraphFromTweet(embeddedTweet, "") 276 | if err != nil { 277 | return nil, fmt.Errorf("making item from tweet that this tweet (%s) embeds (%s): %v", 278 | t.ID(), embeddedTweet.ID(), err) 279 | } 280 | ig.Edges[embIG] = []timeliner.Relation{timeliner.RelQuotes} 281 | } 282 | } 283 | } 284 | 285 | return ig, nil 286 | } 287 | 288 | // Assuming checkpoints are short-lived (i.e. are resumed 289 | // somewhat quickly, before the page tokens/cursors expire), 290 | // we can just store the page tokens. 291 | type checkpointInfo struct { 292 | LastTweetID string 293 | } 294 | 295 | // save records the checkpoint. 296 | func (ch *checkpointInfo) save(ctx context.Context) { 297 | gobBytes, err := timeliner.MarshalGob(ch) 298 | if err != nil { 299 | log.Printf("[ERROR][%s] Encoding checkpoint: %v", DataSourceID, err) 300 | } 301 | timeliner.Checkpoint(ctx, gobBytes) 302 | } 303 | 304 | // load decodes the checkpoint. 
305 | func (ch *checkpointInfo) load(checkpointGob []byte) { 306 | if len(checkpointGob) == 0 { 307 | return 308 | } 309 | err := timeliner.UnmarshalGob(checkpointGob, ch) 310 | if err != nil { 311 | log.Printf("[ERROR][%s] Decoding checkpoint: %v", DataSourceID, err) 312 | } 313 | } 314 | 315 | // maxTweetID returns the higher of the two tweet IDs. 316 | // Errors parsing the strings as integers are ignored. 317 | // Empty string inputs are ignored so the other value 318 | // will win automatically. If both are empty, an empty 319 | // string is returned. 320 | func maxTweetID(id1, id2 string) string { 321 | if id1 == "" { 322 | return id2 323 | } 324 | if id2 == "" { 325 | return id1 326 | } 327 | id1int, _ := strconv.ParseInt(id1, 10, 64) 328 | id2int, _ := strconv.ParseInt(id2, 10, 64) 329 | if id1int > id2int { 330 | return id1 331 | } 332 | return id2 333 | } 334 | 335 | // getLinkedTweetID returns the ID of the tweet in 336 | // a link to a tweet, for example: 337 | // "https://twitter.com/foo/status/12345" 338 | // returns "12345". If the tweet ID cannot be found 339 | // or the URL does not match the right format, 340 | // an empty string is returned. 
341 | func getLinkedTweetID(urlToTweet string) string { 342 | if !linkToTweetRE.MatchString(urlToTweet) { 343 | return "" 344 | } 345 | u, err := url.Parse(urlToTweet) 346 | if err != nil { 347 | return "" 348 | } 349 | return path.Base(u.Path) 350 | } 351 | 352 | var linkToTweetRE = regexp.MustCompile(`https?://twitter\.com/.*/status/[0-9]+`) 353 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 9 | // register the sqlite3 driver 10 | _ "github.com/mattn/go-sqlite3" 11 | ) 12 | 13 | func openDB(dataDir string) (*sql.DB, error) { 14 | var db *sql.DB 15 | var err error 16 | defer func() { 17 | if err != nil && db != nil { 18 | db.Close() 19 | } 20 | }() 21 | 22 | err = os.MkdirAll(dataDir, 0755) 23 | if err != nil { 24 | return nil, fmt.Errorf("making data directory: %v", err) 25 | } 26 | 27 | dbPath := filepath.Join(dataDir, "index.db") 28 | 29 | db, err = sql.Open("sqlite3", dbPath+"?_foreign_keys=true") 30 | if err != nil { 31 | return nil, fmt.Errorf("opening database: %v", err) 32 | } 33 | 34 | // ensure DB is provisioned 35 | _, err = db.Exec(createDB) 36 | if err != nil { 37 | return nil, fmt.Errorf("setting up database: %v", err) 38 | } 39 | 40 | // add all registered data sources 41 | err = saveAllDataSources(db) 42 | if err != nil { 43 | return nil, fmt.Errorf("saving registered data sources to database: %v", err) 44 | } 45 | 46 | return db, nil 47 | } 48 | 49 | const createDB = ` 50 | -- A data source is a content provider, like a cloud photo service, social media site, or exported archive format. 51 | CREATE TABLE IF NOT EXISTS "data_sources" ( 52 | "id" TEXT PRIMARY KEY, 53 | "name" TEXT NOT NULL 54 | ); 55 | 56 | -- An account contains credentials necessary for accessing a data source. 
57 | CREATE TABLE IF NOT EXISTS "accounts" ( 58 | "id" INTEGER PRIMARY KEY, 59 | "data_source_id" TEXT NOT NULL, 60 | "user_id" TEXT NOT NULL, 61 | "authorization" BLOB, 62 | "checkpoint" BLOB, 63 | "last_item_id" INTEGER, -- row ID of item having highest timestamp processed during the last run 64 | FOREIGN KEY ("data_source_id") REFERENCES "data_sources"("id") ON DELETE CASCADE, 65 | FOREIGN KEY ("last_item_id") REFERENCES "items"("id") ON DELETE SET NULL, 66 | UNIQUE ("data_source_id", "user_id") 67 | ); 68 | 69 | CREATE TABLE IF NOT EXISTS "persons" ( 70 | "id" INTEGER PRIMARY KEY, 71 | "name" TEXT 72 | ); 73 | 74 | -- This table specifies identities (user IDs, etc.) of a person across data_sources. 75 | CREATE TABLE IF NOT EXISTS "person_identities" ( 76 | "id" INTEGER PRIMARY KEY, 77 | "person_id" INTEGER NOT NULL, 78 | "data_source_id" TEXT NOT NULL, 79 | "user_id" TEXT NOT NULL, -- whatever identifier a person takes on at the data source 80 | FOREIGN KEY ("person_id") REFERENCES "persons"("id") ON DELETE CASCADE, 81 | FOREIGN KEY ("data_source_id") REFERENCES "data_sources"("id") ON DELETE CASCADE, 82 | UNIQUE ("person_id", "data_source_id", "user_id") 83 | ); 84 | 85 | -- An item is something downloaded from a specific account on a specific data source. 
86 | CREATE TABLE IF NOT EXISTS "items" ( 87 | "id" INTEGER PRIMARY KEY, 88 | "account_id" INTEGER NOT NULL, 89 | "original_id" TEXT NOT NULL, -- ID provided by the data source 90 | "person_id" INTEGER NOT NULL, 91 | "timestamp" INTEGER, -- timestamp when item content was originally created (NOT when the database row was created) 92 | "stored" INTEGER NOT NULL DEFAULT (strftime('%s', CURRENT_TIME)), -- timestamp row was created or last updated from source 93 | "modified" INTEGER, -- timestamp when item was locally modified; if not null, then item is "not clean" 94 | "class" INTEGER, 95 | "mime_type" TEXT, 96 | "data_text" TEXT COLLATE NOCASE, -- item content, if text-encoded 97 | "data_file" TEXT, -- item filename, if non-text or not suitable for storage in DB (usually media items) 98 | "data_hash" TEXT, -- base64 encoding of SHA-256 checksum of contents of data file, if any 99 | "metadata" BLOB, -- optional extra information 100 | "latitude" REAL, 101 | "longitude" REAL, 102 | FOREIGN KEY ("account_id") REFERENCES "accounts"("id") ON DELETE CASCADE, 103 | FOREIGN KEY ("person_id") REFERENCES "persons"("id") ON DELETE CASCADE, 104 | UNIQUE ("original_id", "account_id") 105 | ); 106 | 107 | CREATE INDEX IF NOT EXISTS "idx_items_timestamp" ON "items"("timestamp"); 108 | CREATE INDEX IF NOT EXISTS "idx_items_data_text" ON "items"("data_text"); 109 | CREATE INDEX IF NOT EXISTS "idx_items_data_file" ON "items"("data_file"); 110 | CREATE INDEX IF NOT EXISTS "idx_items_data_hash" ON "items"("data_hash"); 111 | 112 | -- Relationships draws relationships between and across items and persons. 
113 | CREATE TABLE IF NOT EXISTS "relationships" ( 114 | "id" INTEGER PRIMARY KEY, 115 | "from_person_id" INTEGER, 116 | "from_item_id" INTEGER, 117 | "to_person_id" INTEGER, 118 | "to_item_id" INTEGER, 119 | "directed" BOOLEAN, -- if false, the edge goes both ways 120 | "label" TEXT NOT NULL, 121 | FOREIGN KEY ("from_item_id") REFERENCES "items"("id") ON DELETE CASCADE, 122 | FOREIGN KEY ("to_item_id") REFERENCES "items"("id") ON DELETE CASCADE, 123 | FOREIGN KEY ("from_person_id") REFERENCES "persons"("id") ON DELETE CASCADE, 124 | FOREIGN KEY ("to_person_id") REFERENCES "persons"("id") ON DELETE CASCADE, 125 | UNIQUE ("from_item_id", "to_item_id", "label"), 126 | UNIQUE ("from_person_id", "to_person_id", "label"), 127 | UNIQUE ("from_item_id", "to_person_id", "label"), 128 | UNIQUE ("from_person_id", "to_item_id", "label") 129 | ); 130 | 131 | CREATE TABLE IF NOT EXISTS "collections" ( 132 | "id" INTEGER PRIMARY KEY, 133 | "account_id" INTEGER NOT NULL, 134 | "original_id" TEXT, 135 | "name" TEXT, 136 | "description" TEXT, 137 | "modified" INTEGER, -- timestamp when collection or any of its items/ordering were modified locally; if not null, then collection is "not clean" 138 | FOREIGN KEY ("account_id") REFERENCES "accounts"("id") ON DELETE CASCADE, 139 | UNIQUE("account_id", "original_id") 140 | ); 141 | 142 | CREATE TABLE IF NOT EXISTS "collection_items" ( 143 | "id" INTEGER PRIMARY KEY, 144 | "item_id" INTEGER NOT NULL, 145 | "collection_id" INTEGER NOT NULL, 146 | "position" INTEGER NOT NULL DEFAULT 0, 147 | FOREIGN KEY ("item_id") REFERENCES "items"("id") ON DELETE CASCADE, 148 | FOREIGN KEY ("collection_id") REFERENCES "collections"("id") ON DELETE CASCADE, 149 | UNIQUE("item_id", "collection_id", "position") 150 | ); 151 | ` 152 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/mholt/timeliner 2 | 3 | go 1.13 4 | 
5 | require ( 6 | github.com/BurntSushi/toml v0.3.1 7 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc // indirect 8 | github.com/mattn/go-sqlite3 v1.10.0 9 | github.com/mholt/archiver/v3 v3.3.0 10 | github.com/seiflotfy/cuckoofilter v0.0.0-20200323075608-c8f23b6b6cef 11 | github.com/ttacon/builder v0.0.0-20170518171403-c099f663e1c2 // indirect 12 | github.com/ttacon/libphonenumber v1.1.0 13 | golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d 14 | ) 15 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= 3 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 4 | github.com/andybalholm/brotli v0.0.0-20190621154722-5f990b63d2d6 h1:bZ28Hqta7TFAK3Q08CMvv8y3/8ATaEqv2nGoc6yff6c= 5 | github.com/andybalholm/brotli v0.0.0-20190621154722-5f990b63d2d6/go.mod h1:+lx6/Aqd1kLJ1GQfkvOnaZ1WGmLpMpbprPuIOOZX30U= 6 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc h1:8WFBn63wegobsYAX0YjD+8suexZDga5CctH4CCTx2+8= 7 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= 8 | github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= 9 | github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= 10 | github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= 11 | github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721 h1:KRMr9A3qfbVM7iV/WcLY/rL5LICqwMHLhwRXKu99fXw= 12 | github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721/go.mod h1:xEhNfoBDX1hzLm2Nf80qUvZ2sVwoMZ8d6IE2SrsQfh4= 13 | github.com/golang/protobuf v1.2.0 
h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= 14 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 15 | github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= 16 | github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 17 | github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= 18 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 19 | github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 20 | github.com/klauspost/compress v1.9.2 h1:LfVyl+ZlLlLDeQ/d2AqfGIIH4qEDu0Ed2S5GyhCWIWY= 21 | github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 22 | github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= 23 | github.com/klauspost/pgzip v1.2.1 h1:oIPZROsWuPHpOdMVWLuJZXwgjhrW8r1yEX8UqMyeNHM= 24 | github.com/klauspost/pgzip v1.2.1/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= 25 | github.com/mattn/go-sqlite3 v1.10.0 h1:jbhqpg7tQe4SupckyijYiy0mJJ/pRyHvXf7JdWK860o= 26 | github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= 27 | github.com/mholt/archiver/v3 v3.3.0 h1:vWjhY8SQp5yzM9P6OJ/eZEkmi3UAbRrxCq48MxjAzig= 28 | github.com/mholt/archiver/v3 v3.3.0/go.mod h1:YnQtqsp+94Rwd0D/rk5cnLrxusUBUXg+08Ebtr1Mqao= 29 | github.com/nwaples/rardecode v1.0.0 h1:r7vGuS5akxOnR4JQSkko62RJ1ReCMXxQRPtxsiFMBOs= 30 | github.com/nwaples/rardecode v1.0.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= 31 | github.com/pierrec/lz4 v2.0.5+incompatible h1:2xWsjqPFWcplujydGg4WmhC/6fZqK42wMM8aXeqhl0I= 32 | github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= 33 | github.com/seiflotfy/cuckoofilter v0.0.0-20200323075608-c8f23b6b6cef h1:PokWhuPtXrgwLeUZzanj6iMZpUnFBCc6g2tDeheBLrE= 34 | github.com/seiflotfy/cuckoofilter 
v0.0.0-20200323075608-c8f23b6b6cef/go.mod h1:ET5mVvNjwaGXRgZxO9UZr7X+8eAf87AfIYNwRSp9s4Y= 35 | github.com/ttacon/builder v0.0.0-20170518171403-c099f663e1c2 h1:5u+EJUQiosu3JFX0XS0qTf5FznsMOzTjGqavBGuCbo0= 36 | github.com/ttacon/builder v0.0.0-20170518171403-c099f663e1c2/go.mod h1:4kyMkleCiLkgY6z8gK5BkI01ChBtxR0ro3I1ZDcGM3w= 37 | github.com/ttacon/libphonenumber v1.1.0 h1:tC6kE4t8UI4OqQVQjW5q8gSWhG2wnY5moEpSEORdYm4= 38 | github.com/ttacon/libphonenumber v1.1.0/go.mod h1:E0TpmdVMq5dyVlQ7oenAkhsLu86OkUl+yR4OAxyEg/M= 39 | github.com/ulikunitz/xz v0.5.6 h1:jGHAfXawEGZQ3blwU5wnWKQJvAraT7Ftq9EXjnXYgt8= 40 | github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= 41 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo= 42 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos= 43 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 44 | golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e h1:bRhVy7zSSasaqNksaRZiA5EEI+Ei4I1nO5Jh72wfHlg= 45 | golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 46 | golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d h1:TzXSXBo42m9gQenoE3b9BGiEpg5IG2JkU5FkPIawgtw= 47 | golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= 48 | golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw= 49 | golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 50 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 51 | google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508= 52 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 53 | 
-------------------------------------------------------------------------------- /itemfiles.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "crypto/sha256" 5 | "database/sql" 6 | "encoding/base64" 7 | "fmt" 8 | "hash" 9 | "io" 10 | "log" 11 | mathrand "math/rand" 12 | "os" 13 | "path" 14 | "path/filepath" 15 | "regexp" 16 | "strings" 17 | "time" 18 | ) 19 | 20 | // downloadItemFile ... TODO: finish godoc. 21 | func (t *Timeline) downloadItemFile(src io.ReadCloser, dest *os.File, h hash.Hash) (int64, error) { 22 | if src == nil { 23 | return 0, fmt.Errorf("missing reader with which to download file") 24 | } 25 | if dest == nil { 26 | return 0, fmt.Errorf("missing file to download into") 27 | } 28 | 29 | // TODO: What if file already exists on disk (byte-for-byte)? - i.e. data_hash in DB has a duplicate 30 | 31 | // give the hasher a copy of the file bytes 32 | tr := io.TeeReader(src, h) 33 | 34 | n, err := io.Copy(dest, tr) 35 | if err != nil { 36 | os.Remove(dest.Name()) 37 | return n, fmt.Errorf("copying contents: %v", err) 38 | } 39 | if err := dest.Sync(); err != nil { 40 | os.Remove(dest.Name()) 41 | return n, fmt.Errorf("syncing file: %v", err) 42 | } 43 | 44 | // TODO: If mime type is photo or video, extract most important EXIF data and return it for storage in DB? 45 | 46 | return n, nil 47 | } 48 | 49 | // makeUniqueCanonicalItemDataFileName returns an available 50 | // (non-overwriting) filename for the item's data file, starting 51 | // with its plain, canonical data file name, then improvising 52 | // and making unique if necessary. If there is no error, the 53 | // return value is always a usable data file name. 
54 | func (t *Timeline) openUniqueCanonicalItemDataFile(it Item, dataSourceID string) (*os.File, *string, error) { 55 | if dataSourceID == "" { 56 | return nil, nil, fmt.Errorf("missing service ID") 57 | } 58 | 59 | dir := t.canonicalItemDataFileDir(it, dataSourceID) 60 | 61 | err := os.MkdirAll(t.fullpath(dir), 0700) 62 | if err != nil { 63 | return nil, nil, fmt.Errorf("making directory for data file: %v", err) 64 | } 65 | 66 | tryPath := path.Join(dir, t.canonicalItemDataFileName(it, dataSourceID)) 67 | lastAppend := path.Ext(tryPath) 68 | 69 | for i := 0; i < 100; i++ { 70 | fullFilePath := t.fullpath(filepath.FromSlash(tryPath)) 71 | 72 | f, err := os.OpenFile(fullFilePath, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0600) 73 | if os.IsExist(err) { 74 | ext := path.Ext(tryPath) 75 | tryPath = strings.TrimSuffix(tryPath, lastAppend) 76 | lastAppend = fmt.Sprintf("_%d%s", i+1, ext) // start at 1, but actually 2 because existing file is "1" 77 | tryPath += lastAppend 78 | continue 79 | } 80 | if err != nil { 81 | return nil, nil, fmt.Errorf("creating data file: %v", err) 82 | } 83 | 84 | return f, &tryPath, nil 85 | } 86 | 87 | return nil, nil, fmt.Errorf("unable to find available filename for item: %s", tryPath) 88 | } 89 | 90 | // canonicalItemDataFileName returns the plain, canonical name of the 91 | // data file for the item. Canonical data file names are relative to 92 | // the base storage (repo) path (i.e. the folder of the DB file). This 93 | // function does no improvising in case of a name missing from the item, 94 | // nor does it do uniqueness checks. If the item does not have enough 95 | // information to generate a deterministic file name, the returned path 96 | // will end with a trailing slash (i.e. the path's last component empty). 97 | // Things considered deterministic for filename construction include the 98 | // item's filename, the item's original ID, and its timestamp. 
99 | // TODO: fix godoc (this returns only the name now, not the whole dir) 100 | func (t *Timeline) canonicalItemDataFileName(it Item, dataSourceID string) string { 101 | // ideally, the filename is simply the one provided with the item 102 | var filename string 103 | if fname := it.DataFileName(); fname != nil { 104 | filename = t.safePathComponent(*fname) 105 | } 106 | 107 | // otherwise, try a filename based on the item's ID 108 | if filename == "" { 109 | if itemOriginalID := it.ID(); itemOriginalID != "" { 110 | filename = fmt.Sprintf("item_%s", itemOriginalID) 111 | } 112 | } 113 | 114 | // otherwise, try a filename based on the item's timestamp 115 | ts := it.Timestamp() 116 | if filename == "" && !ts.IsZero() { 117 | filename = ts.Format("2006_01_02_150405") 118 | } 119 | 120 | // otherwise, out of options; revert to a random string 121 | // since no deterministic filename is available 122 | if filename == "" { 123 | filename = randomString(24, false) 124 | } 125 | 126 | // shorten the name if needed (thanks for everything, Windows) 127 | return t.ensureDataFileNameShortEnough(filename) 128 | } 129 | 130 | func (t *Timeline) canonicalItemDataFileDir(it Item, dataSourceID string) string { 131 | ts := it.Timestamp() 132 | if ts.IsZero() { 133 | ts = time.Now() 134 | } 135 | 136 | if dataSourceID == "" { 137 | dataSourceID = "unknown_service" 138 | } 139 | 140 | // use "/" separators and adjust for the OS 141 | // path separator when accessing disk 142 | return path.Join("data", 143 | fmt.Sprintf("%04d", ts.Year()), 144 | fmt.Sprintf("%02d", ts.Month()), 145 | t.safePathComponent(dataSourceID)) 146 | } 147 | 148 | func (t *Timeline) ensureDataFileNameShortEnough(filename string) string { 149 | // thanks for nothing, Windows 150 | if len(filename) > 250 { 151 | ext := path.Ext(filename) 152 | if len(ext) > 20 { // arbitrary and unlikely, but just in case 153 | ext = ext[:20] 154 | } 155 | filename = filename[:250-len(ext)] 156 | filename += ext 157 | } 158 | 
return filename 159 | } 160 | 161 | // TODO:/NOTE: If changing a file name, all items with same data_hash must also be updated to use same file name 162 | func (t *Timeline) replaceWithExisting(canonical *string, checksumBase64 string, itemRowID int64) error { 163 | if canonical == nil || *canonical == "" || checksumBase64 == "" { 164 | return fmt.Errorf("missing data filename and/or hash of contents") 165 | } 166 | 167 | var existingDatafile *string 168 | err := t.db.QueryRow(`SELECT data_file FROM items 169 | WHERE data_hash = ? AND id != ? LIMIT 1`, 170 | checksumBase64, itemRowID).Scan(&existingDatafile) 171 | if err == sql.ErrNoRows { 172 | return nil // file is unique; carry on 173 | } 174 | if err != nil { 175 | return fmt.Errorf("querying DB: %v", err) 176 | } 177 | 178 | // file is a duplicate! 179 | 180 | if existingDatafile == nil { 181 | // ... that's weird, how's this possible? it has a hash but no file name recorded 182 | return fmt.Errorf("item with matching hash is missing data file name; hash: %s", checksumBase64) 183 | } 184 | 185 | // ensure the existing file is still the same 186 | h := sha256.New() 187 | f, err := os.Open(t.fullpath(*existingDatafile)) 188 | if err != nil { 189 | return fmt.Errorf("opening existing file: %v", err) 190 | } 191 | defer f.Close() 192 | 193 | _, err = io.Copy(h, f) 194 | if err != nil { 195 | return fmt.Errorf("checking file integrity: %v", err) 196 | } 197 | 198 | existingFileHash := h.Sum(nil) 199 | b64ExistingFileHash := base64.StdEncoding.EncodeToString(existingFileHash) 200 | 201 | // if the existing file was modified; restore it with 202 | // what we just downloaded, which presumably succeeded 203 | if checksumBase64 != b64ExistingFileHash { 204 | log.Printf("[INFO] Restoring modified data file: %s was '%s' but is now '%s'", 205 | *existingDatafile, checksumBase64, existingFileHash) 206 | err := os.Rename(t.fullpath(*canonical), t.fullpath(*existingDatafile)) 207 | if err != nil { 208 | return 
fmt.Errorf("replacing modified data file: %v", err) 209 | } 210 | } 211 | 212 | // everything checks out; delete the newly-downloaded file 213 | // and use the existing file instead of duplicating it 214 | err = os.Remove(t.fullpath(*canonical)) 215 | if err != nil { 216 | return fmt.Errorf("removing duplicate data file: %v", err) 217 | } 218 | 219 | canonical = existingDatafile 220 | 221 | return nil 222 | } 223 | 224 | // randomString returns a string of n random characters. 225 | // It is not even remotely secure or a proper distribution. 226 | // But it's good enough for some things. It excludes certain 227 | // confusing characters like I, l, 1, 0, O, etc. If sameCase 228 | // is true, then uppercase letters are excluded. 229 | func randomString(n int, sameCase bool) string { 230 | if n <= 0 { 231 | return "" 232 | } 233 | dict := []byte("abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRTUVWXY23456789") 234 | if sameCase { 235 | dict = []byte("abcdefghijkmnpqrstuvwxyz0123456789") 236 | } 237 | b := make([]byte, n) 238 | for i := range b { 239 | b[i] = dict[mathrand.Int63()%int64(len(dict))] 240 | } 241 | return string(b) 242 | } 243 | 244 | func (t *Timeline) fullpath(canonicalDatafileName string) string { 245 | return filepath.Join(t.repoDir, filepath.FromSlash(canonicalDatafileName)) 246 | } 247 | 248 | func (t *Timeline) datafileExists(canonicalDatafileName string) bool { 249 | _, err := os.Stat(t.fullpath(canonicalDatafileName)) 250 | return !os.IsNotExist(err) 251 | } 252 | 253 | func (t *Timeline) safePathComponent(s string) string { 254 | s = safePathRE.ReplaceAllLiteralString(s, "") 255 | s = strings.Replace(s, "..", "", -1) 256 | if s == "." { 257 | s = "" 258 | } 259 | return s 260 | } 261 | 262 | // safePathRER matches any undesirable characters in a filepath. 263 | // Note that this allows dots, so you'll have to strip ".." manually. 
var safePathRE = regexp.MustCompile(`[^\w.-]`)
-------------------------------------------------------------------------------- /itemgraph.go: --------------------------------------------------------------------------------
package timeliner

import (
	"bytes"
	"encoding/gob"
	"io"
	"time"
)

// Item is the central concept of a piece of content
// from a service or data source. Take note of which
// methods are required to return non-empty values.
//
// The actual content of an item is stored either in
// the database or on disk as a file. Generally,
// content that is text-encoded can and should be
// stored in the database where it will be indexed.
// However, if the item's content (for example, the
// bytes of a photo or video) are not text or if the
// text is too large to store well in a database (for
// example, an entire novel), it should be stored
// on disk, and this interface has methods to
// accommodate both. Note that an item may have both
// text and non-text content, too: for example, photos
// and videos may have descriptions that are as much
// "content" as the media itself. One part of an item
// is not mutually exclusive with any other.
type Item interface {
	// The unique ID of the item assigned by the service.
	// If the service does not assign one, then invent
	// one such that the ID is unique to the content or
	// substance of the item (for example, an ID derived
	// from timestamp or from the actual content of the
	// item -- whatever makes it unique). The ID need
	// only be unique for the account it is associated
	// with, although more unique is, of course, acceptable.
	//
	// REQUIRED.
	ID() string

	// The originating timestamp of the item, which
	// may be different from when the item was posted
	// or created. For example, a photo may be taken
	// one day but uploaded a week later. Prefer the
	// time when the original item content was captured.
	//
	// REQUIRED.
	Timestamp() time.Time

	// A classification of the item's kind.
	//
	// REQUIRED.
	Class() ItemClass

	// The user/account ID of the owner or
	// originator of the content, along with their
	// username or real name. The ID is used to
	// relate the item with the person behind it;
	// the name is used to make the person
	// recognizable to the human reader. If the
	// ID is nil, the current account owner will
	// be assumed. (Use the ID as given by the
	// data source.) If the data source only
	// provides a name but no ID, you may return
	// the name as the ID with the understanding
	// that a different name will be counted as a
	// different person. You may also return the
	// name as the name and leave the ID nil and
	// have correct results if it is safe to assume
	// the name belongs to the current account owner.
	Owner() (id *string, name *string)

	// Returns the text of the item, if any.
	// This field is indexed in the DB, so don't
	// use for unimportant metadata or huge
	// swaths of text; if there is a large
	// amount of text, use an item file instead.
	DataText() (*string, error)

	// For primary content which is not text or
	// which is too large to be stored well in a
	// database, the content can be downloaded
	// into a file. If so, the following methods
	// should return the necessary information,
	// if available from the service, so that a
	// data file can be obtained, stored, and
	// later read successfully.
	//
	// DataFileName returns the filename (NOT full
	// path or URL) of the file; prefer the original
	// filename if it originated as a file. If the
	// filename is not unique on disk when downloaded,
	// it will be made unique by modifying it. If
	// this value is nil/empty, a filename will be
	// generated from the item's other data.
	//
	// DataFileReader returns a way to read the data.
	// It will be closed when the read is completed.
	//
	// DataFileHash returns the checksum of the
	// content as provided by the service. If the
	// service (or data source) does not provide a
	// hash, leave this field empty, but note that
	// later it will be impossible to efficiently
	// know whether the content has changed on the
	// service from what is stored locally.
	//
	// DataFileMIMEType returns the MIME type of
	// the data file, if known.
	DataFileName() *string
	DataFileReader() (io.ReadCloser, error)
	DataFileHash() []byte
	DataFileMIMEType() *string

	// Metadata returns any optional metadata.
	// Feel free to leave as many fields empty
	// as you'd like: the less fields that are
	// filled out, the smaller the storage size.
	// Metadata is not indexed by the DB but is
	// rendered in projections and queries
	// according to the item's classification.
	Metadata() (*Metadata, error)

	// Location returns an item's location,
	// if known. For now, only Earth
	// coordinates are accepted, but we can
	// improve this later.
	Location() (*Location, error)
}

// ItemClass classifies an item.
type ItemClass int

// Various classes of items.
const (
	ClassUnknown ItemClass = iota
	ClassImage
	ClassVideo
	ClassAudio
	ClassPost
	ClassLocation
	ClassEmail
	ClassPrivateMessage
	ClassMessage
)

// These are the standard relationships that Timeliner
// recognizes.
// Using these known relationships is not
// required, but it makes it easier to translate them to
// human-friendly phrases when visualizing the timeline.
var (
	RelReplyTo  = Relation{Label: "reply_to", Bidirectional: false}      // "<from> is in reply to <to>"
	RelAttached = Relation{Label: "attached", Bidirectional: true}       // "<to|from> is attached to <from|to>"
	RelQuotes   = Relation{Label: "quotes", Bidirectional: false}        // "<from> quotes <to>"
	RelCCed     = Relation{Label: "carbon_copied", Bidirectional: false} // "<from_item> is carbon-copied to <to_person>"
)

// ItemRow has the structure of an item's row in our DB.
type ItemRow struct {
	ID         int64
	AccountID  int64
	OriginalID string
	PersonID   int64
	Timestamp  time.Time
	Stored     time.Time
	Modified   *time.Time
	Class      ItemClass
	MIMEType   *string
	DataText   *string
	DataFile   *string
	DataHash   *string // base64-encoded SHA-256
	Metadata   *Metadata
	Location

	metaGob []byte // use Metadata.(encode/decode)
	item    Item
}

// Location contains location information.
type Location struct {
	Latitude  *float64
	Longitude *float64
}

// ItemGraph is an item with optional connections to other items.
// All ItemGraph values should be pointers to ensure consistency.
// The usual weird/fun thing about representing graph data structures
// in memory is that a graph is a node, and a node is a graph. 🤓
type ItemGraph struct {
	// The node item. This can be nil, but note that
	// Edges will not be traversed if Node is nil,
	// because there must be a node on both ends of
	// an edge.
	//
	// Optional.
	Node Item

	// Edges are represented as 1:many relations
	// to other "graphs" (nodes in the graph).
	// Fill this out to add multiple items to the
	// timeline at once, while drawing the
	// designated relationships between them.
	// Useful when processing related items in
	// batches.
	//
	// Directional relationships go from Node to
	// the map key.
	//
	// If the items involved in a relationship are
	// not efficiently available at the same time
	// (i.e. if loading both items involved in the
	// relationship would take a non-trivial amount
	// of time or API calls), you can use the
	// Relations field instead, but only after the
	// items have been added to the timeline.
	//
	// Optional.
	Edges map[*ItemGraph][]Relation

	// If items in the graph belong to a collection,
	// specify them here. If the collection does not
	// exist (by row ID or AccountID+OriginalID), it
	// will be created. If it already exists, the
	// collection in the DB will be unioned with the
	// collection specified here. Collections are
	// processed regardless of Node and Edges.
	//
	// Optional.
	Collections []Collection

	// Relationships between existing items in the
	// timeline can be represented here in a list
	// of item IDs that are connected by a label.
	// This field is useful when relationships and
	// the items involved in them are not discovered
	// at the same time. Relations in this list will
	// be added to the timeline, joined by the item
	// IDs described in the RawRelations, only if
	// the items having those IDs (as provided by
	// the data source; we're not talking about DB
	// row IDs here) already exist in the timeline.
	// In other words, this is a best-effort field;
	// useful for forming relationships of existing
	// items, but without access to the actual items
	// themselves. If you have the items involved in
	// the relationships, use Edges instead.
	//
	// Optional.
	Relations []RawRelation
}

// NewItemGraph returns a new node/graph.
func NewItemGraph(node Item) *ItemGraph {
	return &ItemGraph{
		Node:  node,
		Edges: make(map[*ItemGraph][]Relation),
	}
}

// Add adds item to the graph ig by making an edge described
// by rel from the node ig to a new node for item.
//
// This method is for simple inserts, where the only thing to add
// to the graph at this moment is a single item, since the graph
// it inserts contains only a single node populated by item. To
// add a full graph with multiple items (i.e. a graph with edges),
// call ig.Connect directly.
func (ig *ItemGraph) Add(item Item, rel Relation) {
	ig.Connect(NewItemGraph(item), rel)
}

// Connect is a simple convenience function that adds a graph (node)
// to ig by an edge described by rel.
func (ig *ItemGraph) Connect(node *ItemGraph, rel Relation) {
	// lazily initialize the map so Connect works on a zero ItemGraph
	if ig.Edges == nil {
		ig.Edges = make(map[*ItemGraph][]Relation)
	}
	ig.Edges[node] = append(ig.Edges[node], rel)
}

// RawRelation represents a relationship between
// two items or people (or both) from the same
// data source (but not necessarily the same
// accounts; we assume that a data source's item
// IDs are globally unique across accounts).
// The item IDs should be those which are
// assigned/provided by the data source, NOT a
// database row ID. Likewise, the persons' user
// IDs should be the IDs of the user as associated
// with the data source, NOT their row IDs.
type RawRelation struct {
	FromItemID       string
	ToItemID         string
	FromPersonUserID string
	ToPersonUserID   string
	Relation
}

// Relation describes how two nodes in a graph are related.
// It's essentially an edge on a graph.
type Relation struct {
	Label         string
	Bidirectional bool
}

// Collection represents a group of items, like an album.
type Collection struct {
	// The ID of the collection as given
	// by the service; for example, the
	// album ID. If the service does not
	// provide an ID for the collection,
	// invent one such that the next time
	// the collection is encountered and
	// processed, its ID will be the same.
	// An ID is necessary here to ensure
	// uniqueness.
	//
	// REQUIRED.
	OriginalID string

	// The name of the collection as
	// given by the service; for example,
	// the album title.
	//
	// Optional.
	Name *string

	// The description, caption, or any
	// other relevant text describing
	// the collection.
	//
	// Optional.
	Description *string

	// The items for the collection;
	// if ordering is significant,
	// specify each item's Position
	// field; the order of elements
	// of this slice will not be
	// considered important.
	Items []CollectionItem
}

// CollectionItem represents an item
// stored in a collection.
type CollectionItem struct {
	// The item to add to the collection.
	Item Item

	// Specify if ordering is important.
	Position int

	// Used when processing; this will
	// store the row ID of the item
	// after the item has been inserted
	// into the DB.
	itemRowID int64
}

// Metadata is a unified structure for storing
// item metadata in the DB.
362 | type Metadata struct { 363 | // A hash or etag provided by the service to 364 | // make it easy to know if it has changed 365 | ServiceHash []byte 366 | 367 | // Locations 368 | LocationAccuracy int 369 | Altitude int // meters 370 | AltitudeAccuracy int 371 | Heading int // degrees 372 | Velocity int 373 | 374 | GeneralArea string // natural language description of a location 375 | 376 | // Photos and videos 377 | EXIF map[string]interface{} 378 | // TODO: Should we have some of the "most important" EXIF fields explicitly here? 379 | 380 | Width int 381 | Height int 382 | 383 | // TODO: Google Photos (how many of these belong in EXIF?) 384 | CameraMake string 385 | CameraModel string 386 | FocalLength float64 387 | ApertureFNumber float64 388 | ISOEquivalent int 389 | ExposureTime time.Duration 390 | 391 | FPS float64 // Frames Per Second 392 | 393 | // Posts (Facebook so far) 394 | Link string 395 | Description string 396 | Name string 397 | ParentID string 398 | StatusType string 399 | Type string 400 | 401 | Shares int // aka "Retweets" or "Reshares" 402 | Likes int 403 | } 404 | 405 | func (m *Metadata) encode() ([]byte, error) { 406 | // then encode the actual data, and trim off 407 | // schema from the beginning 408 | buf := new(bytes.Buffer) 409 | err := gob.NewEncoder(buf).Encode(m) 410 | if err != nil { 411 | return nil, err 412 | } 413 | return buf.Bytes()[len(metadataGobPrefix):], nil 414 | } 415 | 416 | func (m *Metadata) decode(b []byte) error { 417 | if b == nil { 418 | return nil 419 | } 420 | fullGob := append(metadataGobPrefix, b...) 
421 | return gob.NewDecoder(bytes.NewReader(fullGob)).Decode(m) 422 | } 423 | 424 | var metadataGobPrefix []byte 425 | -------------------------------------------------------------------------------- /mapmutex.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import "sync" 4 | 5 | // Modified from https://medium.com/@petrlozhkin/kmutex-lock-mutex-by-unique-id-408467659c24 6 | 7 | type mapMutex struct { 8 | cond *sync.Cond 9 | set map[interface{}]struct{} 10 | } 11 | 12 | func newMapMutex() *mapMutex { 13 | return &mapMutex{ 14 | cond: sync.NewCond(new(sync.Mutex)), 15 | set: make(map[interface{}]struct{}), 16 | } 17 | } 18 | 19 | func (mmu *mapMutex) Lock(key interface{}) { 20 | mmu.cond.L.Lock() 21 | defer mmu.cond.L.Unlock() 22 | for mmu.locked(key) { 23 | mmu.cond.Wait() 24 | } 25 | mmu.set[key] = struct{}{} 26 | return 27 | } 28 | 29 | func (mmu *mapMutex) Unlock(key interface{}) { 30 | mmu.cond.L.Lock() 31 | defer mmu.cond.L.Unlock() 32 | delete(mmu.set, key) 33 | mmu.cond.Broadcast() 34 | } 35 | 36 | func (mmu *mapMutex) locked(key interface{}) (ok bool) { 37 | _, ok = mmu.set[key] 38 | return 39 | } 40 | -------------------------------------------------------------------------------- /oauth2.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | 8 | "github.com/mholt/timeliner/oauth2client" 9 | "golang.org/x/oauth2" 10 | ) 11 | 12 | // OAuth2AppSource returns an oauth2client.App for the OAuth2 provider 13 | // with the given ID. Programs using data sources that authenticate 14 | // with OAuth2 MUST set this variable, or the program will panic. 
15 | var OAuth2AppSource func(providerID string, scopes []string) (oauth2client.App, error) 16 | 17 | // NewOAuth2HTTPClient returns a new HTTP client which performs 18 | // HTTP requests that are authenticated with an oauth2.Token 19 | // stored with the account acc. 20 | func (acc Account) NewOAuth2HTTPClient() (*http.Client, error) { 21 | // load the existing token for this account from the database 22 | var tkn *oauth2.Token 23 | err := UnmarshalGob(acc.authorization, &tkn) 24 | if err != nil { 25 | return nil, fmt.Errorf("gob-decoding OAuth2 token: %v", err) 26 | } 27 | if tkn == nil || tkn.AccessToken == "" { 28 | return nil, fmt.Errorf("OAuth2 token is empty: %+v", tkn) 29 | } 30 | 31 | // load the service's "oauth app", which can provide both tokens and 32 | // oauth configs -- in this case, we need the oauth config; we should 33 | // already have a token 34 | oapp, err := OAuth2AppSource(acc.ds.OAuth2.ProviderID, acc.ds.OAuth2.Scopes) 35 | if err != nil { 36 | return nil, fmt.Errorf("getting token source for %s: %v", acc.DataSourceID, err) 37 | } 38 | 39 | // obtain a token source from the oauth's config so that it can keep 40 | // the token refreshed if it expires 41 | src := oapp.TokenSource(context.Background(), tkn) 42 | 43 | // finally, create an HTTP client that authenticates using the token, 44 | // but wrapping the underlying token source so we can persist any 45 | // changes to the database 46 | return oauth2.NewClient(context.Background(), &persistedTokenSource{ 47 | tl: acc.t, 48 | ts: src, 49 | accountID: acc.ID, 50 | token: tkn, 51 | }), nil 52 | } 53 | 54 | // authorizeWithOAuth2 gets an initial OAuth2 token from the user. 55 | // It requires OAuth2AppSource to be set or it will panic. 
func authorizeWithOAuth2(oc OAuth2) ([]byte, error) {
	src, err := OAuth2AppSource(oc.ProviderID, oc.Scopes)
	if err != nil {
		return nil, fmt.Errorf("getting token source: %v", err)
	}
	tkn, err := src.InitialToken()
	if err != nil {
		return nil, fmt.Errorf("getting token from source: %v", err)
	}
	// the gob-encoded token is what gets stored with the account
	return MarshalGob(tkn)
}

// persistedTokenSource wraps a TokenSource for
// a particular account and persists any changes
// to the account's token to the database.
type persistedTokenSource struct {
	tl        *Timeline
	ts        oauth2.TokenSource
	accountID int64
	token     *oauth2.Token
}

// Token returns the current token from the wrapped source and, if the
// access token changed (e.g. after a refresh), stores the new token
// in the accounts table so future runs use it.
func (ps *persistedTokenSource) Token() (*oauth2.Token, error) {
	tkn, err := ps.ts.Token()
	if err != nil {
		return tkn, err
	}

	// store an updated token in the DB
	if tkn.AccessToken != ps.token.AccessToken {
		ps.token = tkn

		authBytes, err := MarshalGob(tkn)
		if err != nil {
			return nil, fmt.Errorf("gob-encoding new OAuth2 token: %v", err)
		}

		_, err = ps.tl.db.Exec(`UPDATE accounts SET authorization=? WHERE id=?`, authBytes, ps.accountID)
		if err != nil {
			return nil, fmt.Errorf("storing refreshed OAuth2 token: %v", err)
		}
	}

	return tkn, nil
}
-------------------------------------------------------------------------------- /oauth2client/browser.go: --------------------------------------------------------------------------------
package oauth2client

import (
	"bytes"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"os/exec"
	"runtime"
	"strings"
)

// Browser gets an OAuth2 code via the web browser.
type Browser struct {
	// RedirectURL is the URL to redirect the browser
	// to after the code is obtained; it is usually a
	// loopback address. If empty, DefaultRedirectURL
	// will be used instead.
	RedirectURL string
}

// Get opens a browser window to authCodeURL for the user to
// authorize the application, and it returns the resulting
// OAuth2 code. It rejects requests where the "state" param
// does not match expectedStateVal.
func (b Browser) Get(expectedStateVal, authCodeURL string) (string, error) {
	redirURLStr := b.RedirectURL
	if redirURLStr == "" {
		redirURLStr = DefaultRedirectURL
	}
	redirURL, err := url.Parse(redirURLStr)
	if err != nil {
		return "", err
	}

	// listen on the redirect URL's host so we receive the callback;
	// closing the listener when Get returns also stops the server
	ln, err := net.Listen("tcp", redirURL.Host)
	if err != nil {
		return "", err
	}
	defer ln.Close()

	// the handler goroutine reports the code or an error back to us
	ch := make(chan string)
	errCh := make(chan error)

	go func() {
		handler := func(w http.ResponseWriter, r *http.Request) {
			state := r.FormValue("state")
			code := r.FormValue("code")

			if r.Method != "GET" || r.URL.Path != redirURL.Path || state == "" || code == "" {
				http.Error(w, "This endpoint is for OAuth2 callbacks only", http.StatusNotFound)
				return
			}

			if state != expectedStateVal {
				http.Error(w, "invalid state", http.StatusUnauthorized)
				errCh <- fmt.Errorf("invalid OAuth2 state; expected '%s' but got '%s'",
					expectedStateVal, state)
				return
			}

			fmt.Fprint(w, successBody)
			ch <- code
		}

		// must disable keep-alives, otherwise repeated calls to
		// this method can block indefinitely in some weird bug
		srv := http.Server{Handler: http.HandlerFunc(handler)}
		srv.SetKeepAlivesEnabled(false)
		// NOTE: Serve's error is deliberately discarded; it returns
		// when the deferred ln.Close() above shuts the listener down
		srv.Serve(ln)
	}()

	err = openBrowser(authCodeURL)
	if err != nil {
		fmt.Printf("Can't open browser: %s.\nPlease follow this link: %s", err, authCodeURL)
	}

	// wait for either the code or an error from the callback handler
	select {
	case code := <-ch:
		return code, nil
	case err := <-errCh:
		return "", err
	}
}

// openBrowser opens the browser to url.
88 | func openBrowser(url string) error { 89 | osCommand := map[string][]string{ 90 | "darwin": []string{"open"}, 91 | "freebsd": []string{"xdg-open"}, 92 | "linux": []string{"xdg-open"}, 93 | "netbsd": []string{"xdg-open"}, 94 | "openbsd": []string{"xdg-open"}, 95 | "windows": []string{"cmd", "/c", "start"}, 96 | } 97 | 98 | if runtime.GOOS == "windows" { 99 | // escape characters not allowed by cmd 100 | url = strings.Replace(url, "&", `^&`, -1) 101 | } 102 | 103 | all := osCommand[runtime.GOOS] 104 | exe := all[0] 105 | args := all[1:] 106 | 107 | buf := new(bytes.Buffer) 108 | 109 | cmd := exec.Command(exe, append(args, url)...) 110 | cmd.Stdout = buf 111 | cmd.Stderr = buf 112 | err := cmd.Run() 113 | 114 | if err != nil { 115 | return fmt.Errorf("%v: %s", err, buf.String()) 116 | } 117 | 118 | return nil 119 | } 120 | 121 | const successBody = `<!DOCTYPE html> 122 | <html> 123 | <head> 124 | <title>OAuth2 Success</title> 125 | <meta charset="utf-8"> 126 | <style> 127 | body { text-align: center; padding: 5%; font-family: sans-serif; } 128 | h1 { font-size: 20px; } 129 | p { font-size: 16px; color: #444; } 130 | </style> 131 | </head> 132 | <body> 133 | <h1>Code obtained, thank you!</h1> 134 | <p> 135 | You may now close this page and return to the application. 136 | </p> 137 | </body> 138 | </html> 139 | ` 140 | 141 | var _ Getter = Browser{} 142 | -------------------------------------------------------------------------------- /oauth2client/localapp.go: -------------------------------------------------------------------------------- 1 | package oauth2client 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "golang.org/x/oauth2" 8 | "golang.org/x/oauth2/clientcredentials" 9 | ) 10 | 11 | // LocalAppSource implements oauth2.TokenSource for 12 | // OAuth2 client apps that have the client app 13 | // credentials (Client ID and Secret) available 14 | // locally. The OAuth2 provider is accessed directly 15 | // using the OAuth2Config field value. 
16 | // 17 | // If the OAuth2Config.Endpoint's TokenURL is set 18 | // but the AuthURL is empty, then it is assumed 19 | // that this is a two-legged ("client credentials") 20 | // OAuth2 configuration; i.e. bearer token. 21 | // 22 | // LocalAppSource instances can be ephemeral. 23 | type LocalAppSource struct { 24 | // OAuth2Config is the OAuth2 configuration. 25 | OAuth2Config *oauth2.Config 26 | 27 | // AuthCodeGetter is how the auth code 28 | // is obtained. If not set, a default 29 | // oauth2client.Browser is used. 30 | AuthCodeGetter Getter 31 | } 32 | 33 | // InitialToken obtains a token using s.OAuth2Config 34 | // and s.AuthCodeGetter (unless the configuration 35 | // is for a client credentials / "two-legged" flow). 36 | func (s LocalAppSource) InitialToken() (*oauth2.Token, error) { 37 | if s.OAuth2Config == nil { 38 | return nil, fmt.Errorf("missing OAuth2Config") 39 | } 40 | 41 | // if this is a two-legged config ("client credentials" flow, 42 | // where the client bears the actual token, like a password, 43 | // without an intermediate app) configuration, then we can 44 | // just return that bearer token immediately 45 | if tlc := s.twoLeggedConfig(); tlc != nil { 46 | return tlc.Token(context.Background()) 47 | } 48 | 49 | if s.AuthCodeGetter == nil { 50 | s.AuthCodeGetter = Browser{} 51 | } 52 | 53 | stateVal := State() 54 | authURL := s.OAuth2Config.AuthCodeURL(stateVal, oauth2.AccessTypeOffline) 55 | 56 | code, err := s.AuthCodeGetter.Get(stateVal, authURL) 57 | if err != nil { 58 | return nil, fmt.Errorf("getting code via browser: %v", err) 59 | } 60 | 61 | ctx := context.WithValue(context.Background(), 62 | oauth2.HTTPClient, httpClient) 63 | 64 | return s.OAuth2Config.Exchange(ctx, code) 65 | } 66 | 67 | // TokenSource returns a token source for s. 
func (s LocalAppSource) TokenSource(ctx context.Context, tkn *oauth2.Token) oauth2.TokenSource {
	// two-legged configs refresh via client credentials directly
	if tlc := s.twoLeggedConfig(); tlc != nil {
		return tlc.TokenSource(ctx)
	}
	return s.OAuth2Config.TokenSource(ctx, tkn)
}

// twoLeggedConfig returns a clientcredentials configuration if
// this app source appears to be configured as one (i.e. with
// bearer credentials, with a token URL but without an auth URL,
// because the client credentials is the actual authentication).
//
// NOTE(review): this dereferences s.OAuth2Config without a nil
// check; InitialToken checks for nil first, but TokenSource does
// not -- confirm callers always set OAuth2Config.
func (s LocalAppSource) twoLeggedConfig() *clientcredentials.Config {
	if s.OAuth2Config.Endpoint.TokenURL != "" &&
		s.OAuth2Config.Endpoint.AuthURL == "" {
		return &clientcredentials.Config{
			ClientID:     s.OAuth2Config.ClientID,
			ClientSecret: s.OAuth2Config.ClientSecret,
			TokenURL:     s.OAuth2Config.Endpoint.TokenURL,
			Scopes:       s.OAuth2Config.Scopes,
		}
	}
	return nil
}

// Interface guard
var _ App = LocalAppSource{}
-------------------------------------------------------------------------------- /oauth2client/oauth2.go: --------------------------------------------------------------------------------
package oauth2client

import (
	"context"
	mathrand "math/rand"
	"net/http"
	"time"

	"golang.org/x/oauth2"
)

func init() {
	// seed the non-cryptographic RNG used for state values
	mathrand.Seed(time.Now().UnixNano())
}

// Getter is a type that can get an OAuth2 auth code.
// It must enforce that the state parameter of the
// redirected request matches expectedStateVal.
type Getter interface {
	Get(expectedStateVal, authCodeURL string) (code string, err error)
}

// State returns a random string suitable as a state value.
func State() string {
	return randString(14)
}

// randString is not safe for cryptographic use.
29 | func randString(n int) string { 30 | const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 31 | b := make([]byte, n) 32 | for i := range b { 33 | b[i] = letterBytes[mathrand.Intn(len(letterBytes))] 34 | } 35 | return string(b) 36 | } 37 | 38 | type ( 39 | // OAuth2Info contains information for obtaining an auth code. 40 | OAuth2Info struct { 41 | StateValue string 42 | AuthCodeURL string 43 | } 44 | 45 | // App provides a way to get an initial OAuth2 token 46 | // as well as a continuing token source. 47 | App interface { 48 | InitialToken() (*oauth2.Token, error) 49 | TokenSource(context.Context, *oauth2.Token) oauth2.TokenSource 50 | } 51 | ) 52 | 53 | // httpClient is the HTTP client to use for OAuth2 requests. 54 | var httpClient = &http.Client{ 55 | Timeout: 10 * time.Second, 56 | } 57 | 58 | // DefaultRedirectURL is the default URL to 59 | // which to redirect clients after a code 60 | // has been obtained. Redirect URLs may 61 | // have to be registered with your OAuth2 62 | // provider. 
63 | const DefaultRedirectURL = "http://localhost:8008/oauth2-redirect" 64 | -------------------------------------------------------------------------------- /oauth2client/oauth2proxy/cmd/oauth2proxy/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "net/http" 7 | 8 | "github.com/BurntSushi/toml" 9 | "github.com/mholt/timeliner/oauth2client/oauth2proxy" 10 | "golang.org/x/oauth2" 11 | ) 12 | 13 | func init() { 14 | flag.StringVar(&credentialsFile, "credentials", credentialsFile, "The path to the file containing the OAuth2 app credentials for each provider") 15 | flag.StringVar(&addr, "addr", addr, "The address to listen on") 16 | flag.StringVar(&basePath, "path", basePath, "The base path on which to serve the proxy endpoints") 17 | } 18 | 19 | var ( 20 | credentialsFile = "credentials.toml" 21 | addr = ":7233" 22 | basePath = "/oauth2" 23 | ) 24 | 25 | func main() { 26 | flag.Parse() 27 | 28 | if credentialsFile == "" { 29 | log.Fatal("[FATAL] No credentials file specified (use -credentials)") 30 | } 31 | if addr == "" { 32 | log.Fatal("[FATAL] No address specified (use -addr)") 33 | } 34 | 35 | // decode app credentials 36 | var creds oauth2Credentials 37 | md, err := toml.DecodeFile(credentialsFile, &creds) 38 | if err != nil { 39 | log.Fatalf("[FATAL] Decoding credentials file: %v", err) 40 | } 41 | if len(md.Undecoded()) > 0 { 42 | log.Fatalf("[FATAL] Unrecognized key(s) in credentials file: %+v", md.Undecoded()) 43 | } 44 | 45 | // convert them into oauth2.Configs (the structure of 46 | // oauth2.Config as TOML is too verbose for my taste) 47 | oauth2Configs := make(map[string]oauth2.Config) 48 | for id, prov := range creds.Providers { 49 | oauth2Configs[id] = oauth2.Config{ 50 | ClientID: prov.ClientID, 51 | ClientSecret: prov.ClientSecret, 52 | Endpoint: oauth2.Endpoint{ 53 | AuthURL: prov.AuthURL, 54 | TokenURL: prov.TokenURL, 55 | }, 56 | } 57 | 
log.Println("Provider:", id) 58 | } 59 | 60 | log.Println("Serving OAuth2 proxy on", addr) 61 | 62 | p := oauth2proxy.New(basePath, oauth2Configs) 63 | http.ListenAndServe(addr, p) 64 | } 65 | 66 | type oauth2Credentials struct { 67 | Providers map[string]oauth2ProviderConfig `toml:"providers"` 68 | } 69 | 70 | type oauth2ProviderConfig struct { 71 | ClientID string `toml:"client_id"` 72 | ClientSecret string `toml:"client_secret"` 73 | AuthURL string `toml:"auth_url"` 74 | TokenURL string `toml:"token_url"` 75 | } 76 | -------------------------------------------------------------------------------- /oauth2client/oauth2proxy/proxy.go: -------------------------------------------------------------------------------- 1 | package oauth2proxy 2 | 3 | import ( 4 | "encoding/json" 5 | "io" 6 | "io/ioutil" 7 | "net/http" 8 | "net/url" 9 | "path" 10 | "strings" 11 | 12 | "github.com/mholt/timeliner/oauth2client" 13 | "golang.org/x/oauth2" 14 | ) 15 | 16 | // New returns a new OAuth2 proxy that serves its endpoints 17 | // under the given basePath and which replaces credentials 18 | // and endpoints with those found in the configs given in 19 | // the providers map. 20 | // 21 | // The map value does not use pointers, so that temporary 22 | // manipulations of the value can occur without modifying 23 | // the original template value. 
24 | func New(basePath string, providers map[string]oauth2.Config) http.Handler { 25 | basePath = path.Join("/", basePath) 26 | 27 | proxy := oauth2Proxy{providers: providers} 28 | 29 | mux := http.NewServeMux() 30 | mux.HandleFunc(path.Join(basePath, "auth-code-url"), proxy.handleAuthCodeURL) 31 | mux.HandleFunc(path.Join(basePath, "proxy")+"/", proxy.handleOAuth2) 32 | 33 | return mux 34 | } 35 | 36 | type oauth2Proxy struct { 37 | providers map[string]oauth2.Config 38 | } 39 | 40 | func (proxy oauth2Proxy) handleAuthCodeURL(w http.ResponseWriter, r *http.Request) { 41 | providerID := r.FormValue("provider") 42 | redir := r.FormValue("redirect") 43 | scopes := r.URL.Query()["scope"] 44 | 45 | oauth2CfgCopy, ok := proxy.providers[providerID] 46 | if !ok { 47 | http.Error(w, "unknown service ID", http.StatusBadRequest) 48 | return 49 | } 50 | 51 | // augment the template config with parameters specific to this 52 | // request (this is why it's important that the configs aren't 53 | // pointers; we should be mutating only copies here) 54 | oauth2CfgCopy.Scopes = scopes 55 | oauth2CfgCopy.RedirectURL = redir 56 | 57 | stateVal := oauth2client.State() 58 | url := oauth2CfgCopy.AuthCodeURL(stateVal, oauth2.AccessTypeOffline) 59 | 60 | info := oauth2client.OAuth2Info{ 61 | StateValue: stateVal, 62 | AuthCodeURL: url, 63 | } 64 | 65 | json.NewEncoder(w).Encode(info) 66 | } 67 | 68 | func (proxy oauth2Proxy) handleOAuth2(w http.ResponseWriter, r *http.Request) { 69 | // knead the URL into its two parts: the service 70 | // ID and which endpoint to proxy to 71 | // reqURL := strings.TrimPrefix(r.URL.Path, basePath+"/proxy") 72 | // reqURL = path.Clean(strings.TrimPrefix(reqURL, "/")) 73 | 74 | // we want the last two components of the path 75 | urlParts := strings.Split(r.URL.Path, "/") 76 | if len(urlParts) < 2 { 77 | http.Error(w, "bad path length", http.StatusBadRequest) 78 | return 79 | } 80 | 81 | providerID := urlParts[len(urlParts)-2] 82 | whichEndpoint := 
urlParts[len(urlParts)-1] 83 | 84 | // get the OAuth2 config matching the service ID 85 | oauth2Config, ok := proxy.providers[providerID] 86 | if !ok { 87 | http.Error(w, "unknown service: "+providerID, http.StatusBadRequest) 88 | return 89 | } 90 | 91 | // figure out which endpoint we'll use for upstream 92 | var upstreamEndpoint string 93 | switch whichEndpoint { 94 | case "auth": 95 | upstreamEndpoint = oauth2Config.Endpoint.AuthURL 96 | case "token": 97 | upstreamEndpoint = oauth2Config.Endpoint.TokenURL 98 | } 99 | 100 | // read the body so we can replace values if necessary 101 | // (don't use r.ParseForm because we need to keep body 102 | // and query string distinct) 103 | reqBodyBytes, err := ioutil.ReadAll(r.Body) //http.MaxBytesReader(w, r.Body, 64*1024)) 104 | if err != nil { 105 | http.Error(w, err.Error(), http.StatusBadRequest) 106 | return 107 | } 108 | 109 | // if the request body is form-encoded, replace any 110 | // credential placeholders with the real credentials 111 | var upstreamBody io.Reader 112 | if strings.Contains(r.Header.Get("Content-Type"), "x-www-form-urlencoded") { 113 | bodyForm, err := url.ParseQuery(string(reqBodyBytes)) 114 | if err != nil { 115 | http.Error(w, "error parsing request body", http.StatusBadRequest) 116 | return 117 | } 118 | replaceCredentials(bodyForm, oauth2Config) 119 | upstreamBody = strings.NewReader(bodyForm.Encode()) 120 | } 121 | 122 | // now do the same thing for the query string 123 | qs := r.URL.Query() 124 | replaceCredentials(qs, oauth2Config) 125 | 126 | // make outgoing URL 127 | upstreamURL, err := url.Parse(upstreamEndpoint) 128 | if err != nil { 129 | http.Error(w, "bad upstream URL", http.StatusInternalServerError) 130 | return 131 | } 132 | upstreamURL.RawQuery = qs.Encode() 133 | 134 | // set the real credentials -- this has to be done 135 | // carefully because apparently a lot of OAuth2 136 | // providers are broken (against RFC 6749), so 137 | // the downstream OAuth2 client lib must be 
sure 138 | // to set the credentials in the right place, and 139 | // we should be sure to mirror that behavior; 140 | // this means that even though the downstream may 141 | // not have the real client ID and secret, they 142 | // need to provide SOMETHING as bogus placeholder 143 | // values to signal to us where to put the real 144 | // credentials 145 | if r.Header.Get("Authorization") != "" { 146 | r.SetBasicAuth(oauth2Config.ClientID, oauth2Config.ClientSecret) 147 | } 148 | 149 | // prepare the request to upstream 150 | upstream, err := http.NewRequest(r.Method, upstreamURL.String(), upstreamBody) 151 | if err != nil { 152 | http.Error(w, err.Error(), http.StatusBadRequest) 153 | return 154 | } 155 | upstream.Header = r.Header 156 | delete(upstream.Header, "Content-Length") 157 | 158 | // perform the upstream request 159 | resp, err := http.DefaultClient.Do(upstream) 160 | if err != nil { 161 | http.Error(w, err.Error(), http.StatusBadGateway) 162 | return 163 | } 164 | defer resp.Body.Close() 165 | 166 | // copy the upstream headers to the response downstream 167 | for key, vals := range resp.Header { 168 | for _, val := range vals { 169 | w.Header().Add(key, val) 170 | } 171 | } 172 | 173 | // carry over the status code 174 | w.WriteHeader(resp.StatusCode) 175 | 176 | // copy the response body downstream 177 | _, err = io.Copy(w, resp.Body) 178 | if err != nil { 179 | http.Error(w, "writing body: "+err.Error(), http.StatusBadGateway) 180 | return 181 | } 182 | } 183 | 184 | func replaceCredentials(form url.Values, oauth2Config oauth2.Config) { 185 | if form.Get("client_id") != "" { 186 | form.Set("client_id", oauth2Config.ClientID) 187 | } 188 | if form.Get("client_secret") != "" { 189 | form.Set("client_secret", oauth2Config.ClientSecret) 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /oauth2client/remoteapp.go: -------------------------------------------------------------------------------- 1 | 
package oauth2client 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net/http" 8 | "net/url" 9 | "strings" 10 | 11 | "golang.org/x/oauth2" 12 | ) 13 | 14 | // RemoteAppSource implements oauth2.TokenSource for 15 | // OAuth2 client apps that have their credentials 16 | // (Client ID and Secret, as well as endpoint info) 17 | // stored remotely. Thus, this type obtains tokens 18 | // through a remote proxy that presumably has the 19 | // client app credentials, which it will replace 20 | // before proxying to the provider. 21 | // 22 | // RemoteAppSource values can be ephemeral. 23 | type RemoteAppSource struct { 24 | // How to obtain the auth URL. 25 | // Default: DirectAuthURLMode 26 | AuthURLMode AuthURLMode 27 | 28 | // The URL to the proxy server (its 29 | // address + base path). 30 | ProxyURL string 31 | 32 | // The ID of the OAuth2 provider. 33 | ProviderID string 34 | 35 | // The scopes for which to obtain 36 | // authorization. 37 | Scopes []string 38 | 39 | // The URL to redirect to to finish 40 | // the ceremony. 41 | RedirectURL string 42 | 43 | // How the auth code is obtained. 44 | // If not set, a default 45 | // oauth2code.Browser is used. 46 | AuthCodeGetter Getter 47 | } 48 | 49 | // InitialToken obtains an initial token using s.AuthCodeGetter. 
50 | func (s RemoteAppSource) InitialToken() (*oauth2.Token, error) { 51 | if s.AuthCodeGetter == nil { 52 | s.AuthCodeGetter = Browser{} 53 | } 54 | if s.AuthURLMode == "" { 55 | s.AuthURLMode = DirectAuthURLMode 56 | } 57 | 58 | cfg := s.config() 59 | 60 | // obtain a state value and auth URL 61 | var stateVal, authURL string 62 | var err error 63 | switch s.AuthURLMode { 64 | case DirectAuthURLMode: 65 | stateVal, authURL, err = s.getDirectAuthURLFromProxy() 66 | case ProxiedAuthURLMode: 67 | stateVal, authURL, err = s.getProxiedAuthURL(cfg) 68 | default: 69 | return nil, fmt.Errorf("unknown AuthURLMode: %s", s.AuthURLMode) 70 | } 71 | if err != nil { 72 | return nil, err 73 | } 74 | 75 | // now obtain the code 76 | code, err := s.AuthCodeGetter.Get(stateVal, authURL) 77 | if err != nil { 78 | return nil, fmt.Errorf("getting code via browser: %v", err) 79 | } 80 | 81 | // and complete the ceremony 82 | ctx := context.WithValue(context.Background(), 83 | oauth2.HTTPClient, httpClient) 84 | 85 | return cfg.Exchange(ctx, code) 86 | } 87 | 88 | // getDirectAuthURLFromProxy returns an auth URL that goes directly to the 89 | // OAuth2 provider server, but it gets that URL by querying the proxy server 90 | // for what it should be ("DirectAuthURLMode"). 91 | func (s RemoteAppSource) getDirectAuthURLFromProxy() (state string, authURL string, err error) { 92 | redirURL := s.RedirectURL 93 | if redirURL == "" { 94 | redirURL = DefaultRedirectURL 95 | } 96 | 97 | v := url.Values{ 98 | "provider": {s.ProviderID}, 99 | "scope": s.Scopes, 100 | "redirect": {redirURL}, 101 | } 102 | 103 | proxyURL := strings.TrimSuffix(s.ProxyURL, "/") 104 | resp, err := http.Get(proxyURL + "/auth-code-url?" 
+ v.Encode()) 105 | if err != nil { 106 | return "", "", err 107 | } 108 | defer resp.Body.Close() 109 | 110 | if resp.StatusCode != http.StatusOK { 111 | return "", "", fmt.Errorf("requesting auth code URL from proxy: HTTP %d: %s", 112 | resp.StatusCode, resp.Status) 113 | } 114 | 115 | var info OAuth2Info 116 | err = json.NewDecoder(resp.Body).Decode(&info) 117 | if err != nil { 118 | return "", "", err 119 | } 120 | 121 | return info.StateValue, info.AuthCodeURL, nil 122 | } 123 | 124 | // getProxiedAuthURL returns an auth URL that goes to the remote proxy ("ProxiedAuthURLMode"). 125 | func (s RemoteAppSource) getProxiedAuthURL(cfg *oauth2.Config) (state string, authURL string, err error) { 126 | state = State() 127 | authURL = cfg.AuthCodeURL(state, oauth2.AccessTypeOffline) 128 | return 129 | } 130 | 131 | // config builds an OAuth2 config from s. 132 | func (s RemoteAppSource) config() *oauth2.Config { 133 | redirURL := s.RedirectURL 134 | if redirURL == "" { 135 | redirURL = DefaultRedirectURL 136 | } 137 | 138 | return &oauth2.Config{ 139 | ClientID: "placeholder", 140 | ClientSecret: "placeholder", 141 | RedirectURL: redirURL, 142 | Scopes: s.Scopes, 143 | Endpoint: oauth2.Endpoint{ 144 | AuthURL: s.ProxyURL + "/proxy/" + s.ProviderID + "/auth", 145 | TokenURL: s.ProxyURL + "/proxy/" + s.ProviderID + "/token", 146 | }, 147 | } 148 | } 149 | 150 | // TokenSource returns a token source for s. 151 | func (s RemoteAppSource) TokenSource(ctx context.Context, tkn *oauth2.Token) oauth2.TokenSource { 152 | return s.config().TokenSource(ctx, tkn) 153 | } 154 | 155 | // AuthURLMode describes what kind of auth URL a 156 | // RemoteAppSource should obtain. 157 | type AuthURLMode string 158 | 159 | const ( 160 | // DirectAuthURLMode queries the remote proxy to get 161 | // an auth URL that goes directly to the OAuth2 provider 162 | // web page the user must go to in order to obtain 163 | // authorization. 
Although this mode incurs one extra 164 | // HTTP request (that is not part of the OAuth2 spec, 165 | // it is purely our own), it is perhaps more robust in 166 | // more environments, since the browser will access the 167 | // auth provider's site directly, meaning that any HTML 168 | // or JavaScript on the page that expects HTTPS or a 169 | // certain hostname will be able to function correctly. 170 | DirectAuthURLMode AuthURLMode = "direct" 171 | 172 | // ProxiedAuthURLMode makes an auth URL that goes to 173 | // the remote proxy, not directly to the provider. 174 | // This is perhaps a "purer" approach than 175 | // DirectAuthURLMode, but it may not work if HTML or 176 | // JavaScript on the provider's auth page expects 177 | // a certain scheme or hostname in the page's URL. 178 | // This mode usually works when the proxy is running 179 | // over HTTPS, but this mode may break depending on 180 | // the provider, when the proxy uses HTTP (which 181 | // should only be in dev environments of course). 182 | // 183 | // For example, Google's OAuth2 page will try to set a 184 | // secure-context cookie using JavaScript, which fails 185 | // if the auth page is proxied through a plaintext HTTP 186 | // localhost endpoint, which is what we do during 187 | // development for convenience; the lack of HTTPS caused 188 | // the page to reload infinitely because, even though 189 | // the request was reverse-proxied, the JS on the page 190 | // expected HTTPS. (See my self-congratulatory tweet: 191 | // https://twitter.com/mholt6/status/1078518306045231104) 192 | // Using DirectAuthURLMode is the easiest way around 193 | // this problem. 
194 | ProxiedAuthURLMode AuthURLMode = "proxied" 195 | ) 196 | 197 | var _ App = RemoteAppSource{} 198 | -------------------------------------------------------------------------------- /persons.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | ) 7 | 8 | // getPerson returns the person mapped to userID on service. 9 | // If the person does not exist, it is created. 10 | func (t *Timeline) getPerson(dataSourceID, userID, name string) (Person, error) { 11 | // first, load the person 12 | var p Person 13 | err := t.db.QueryRow(`SELECT persons.id, persons.name 14 | FROM persons, person_identities 15 | WHERE person_identities.data_source_id=? 16 | AND person_identities.user_id=? 17 | AND persons.id = person_identities.person_id 18 | LIMIT 1`, dataSourceID, userID).Scan(&p.ID, &p.Name) 19 | if err == sql.ErrNoRows { 20 | // person does not exist; create this mapping - TODO: do in a transaction 21 | p = Person{Name: name} 22 | res, err := t.db.Exec(`INSERT INTO persons (name) VALUES (?)`, p.Name) 23 | if err != nil { 24 | return Person{}, fmt.Errorf("adding new person: %v", err) 25 | } 26 | p.ID, err = res.LastInsertId() 27 | if err != nil { 28 | return Person{}, fmt.Errorf("getting person ID: %v", err) 29 | } 30 | _, err = t.db.Exec(`INSERT OR IGNORE INTO person_identities 31 | (person_id, data_source_id, user_id) VALUES (?, ?, ?)`, 32 | p.ID, dataSourceID, userID) 33 | if err != nil { 34 | return Person{}, fmt.Errorf("adding new person identity mapping: %v", err) 35 | } 36 | } else if err != nil { 37 | return Person{}, fmt.Errorf("selecting person identity: %v", err) 38 | } 39 | 40 | // now get all the person's identities 41 | rows, err := t.db.Query(`SELECT id, person_id, data_source_id, user_id 42 | FROM person_identities WHERE person_id=?`, p.ID) 43 | if err != nil { 44 | return Person{}, fmt.Errorf("selecting person's known identities: %v", err) 45 | } 46 | 
defer rows.Close() 47 | for rows.Next() { 48 | var ident PersonIdentity 49 | err := rows.Scan(&ident.ID, &ident.PersonID, &ident.DataSourceID, &ident.UserID) 50 | if err != nil { 51 | return Person{}, fmt.Errorf("loading person's identity: %v", err) 52 | } 53 | p.Identities = append(p.Identities, ident) 54 | } 55 | if err = rows.Err(); err != nil { 56 | return Person{}, fmt.Errorf("scanning identity rows: %v", err) 57 | } 58 | 59 | return p, nil 60 | } 61 | 62 | // Person represents a person. 63 | type Person struct { 64 | ID int64 65 | Name string 66 | Identities []PersonIdentity 67 | } 68 | 69 | // PersonIdentity is a way to map a user ID on a service to a person. 70 | type PersonIdentity struct { 71 | ID int64 72 | PersonID string 73 | DataSourceID string 74 | UserID string 75 | } 76 | -------------------------------------------------------------------------------- /ratelimit.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "net/http" 5 | "time" 6 | ) 7 | 8 | // RateLimit describes a rate limit. 9 | type RateLimit struct { 10 | RequestsPerHour int 11 | BurstSize int 12 | 13 | ticker *time.Ticker 14 | token chan struct{} 15 | } 16 | 17 | // NewRateLimitedRoundTripper adds rate limiting to rt based on the rate 18 | // limiting policy registered by the data source associated with acc. 
19 | func (acc Account) NewRateLimitedRoundTripper(rt http.RoundTripper) http.RoundTripper { 20 | rl, ok := acc.t.rateLimiters[acc.String()] 21 | 22 | if !ok && acc.ds.RateLimit.RequestsPerHour > 0 { 23 | secondsBetweenReqs := 60.0 / (float64(acc.ds.RateLimit.RequestsPerHour) / 60.0) 24 | millisBetweenReqs := secondsBetweenReqs * 1000.0 25 | reqInterval := time.Duration(millisBetweenReqs) * time.Millisecond 26 | if reqInterval < minInterval { 27 | reqInterval = minInterval 28 | } 29 | 30 | rl.ticker = time.NewTicker(reqInterval) 31 | rl.token = make(chan struct{}, rl.BurstSize) 32 | 33 | for i := 0; i < cap(rl.token); i++ { 34 | rl.token <- struct{}{} 35 | } 36 | go func() { 37 | for range rl.ticker.C { 38 | rl.token <- struct{}{} 39 | } 40 | }() 41 | 42 | acc.t.rateLimiters[acc.String()] = rl 43 | } 44 | 45 | return rateLimitedRoundTripper{ 46 | RoundTripper: rt, 47 | token: rl.token, 48 | } 49 | } 50 | 51 | type rateLimitedRoundTripper struct { 52 | http.RoundTripper 53 | token <-chan struct{} 54 | } 55 | 56 | func (rt rateLimitedRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { 57 | <-rt.token 58 | return rt.RoundTripper.RoundTrip(req) 59 | } 60 | 61 | var rateLimiters = make(map[string]RateLimit) 62 | 63 | const minInterval = 100 * time.Millisecond 64 | -------------------------------------------------------------------------------- /timeliner.go: -------------------------------------------------------------------------------- 1 | // Timeliner - A personal data aggregation utility 2 | // Copyright (C) 2019 Matthew Holt 3 | // 4 | // This program is free software: you can redistribute it and/or modify 5 | // it under the terms of the GNU Affero General Public License as published 6 | // by the Free Software Foundation, either version 3 of the License, or 7 | // (at your option) any later version. 
8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU Affero General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU Affero General Public License 15 | // along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | 17 | // TODO: Apply license to all files 18 | 19 | package timeliner 20 | 21 | import ( 22 | "context" 23 | "database/sql" 24 | "fmt" 25 | "io" 26 | "log" 27 | mathrand "math/rand" 28 | "sync" 29 | "time" 30 | 31 | cuckoo "github.com/seiflotfy/cuckoofilter" 32 | ) 33 | 34 | func init() { 35 | mathrand.Seed(time.Now().UnixNano()) 36 | } 37 | 38 | // Timeline represents an opened timeline repository. 39 | // The zero value is NOT valid; use Open() to obtain 40 | // a valid value. 41 | type Timeline struct { 42 | db *sql.DB 43 | repoDir string 44 | rateLimiters map[string]RateLimit 45 | } 46 | 47 | // Open creates/opens a timeline at the given 48 | // repository directory. Timelines should always 49 | // be Close()'d for a clean shutdown when done. 50 | func Open(repo string) (*Timeline, error) { 51 | db, err := openDB(repo) 52 | if err != nil { 53 | return nil, fmt.Errorf("opening database: %v", err) 54 | } 55 | return &Timeline{ 56 | db: db, 57 | repoDir: repo, 58 | rateLimiters: make(map[string]RateLimit), 59 | }, nil 60 | } 61 | 62 | // Close frees up resources allocated from Open. 
63 | func (t *Timeline) Close() error { 64 | for key, rl := range t.rateLimiters { 65 | if rl.ticker != nil { 66 | rl.ticker.Stop() 67 | rl.ticker = nil 68 | } 69 | delete(t.rateLimiters, key) 70 | } 71 | if t.db != nil { 72 | return t.db.Close() 73 | } 74 | return nil 75 | } 76 | 77 | type concurrentCuckoo struct { 78 | *cuckoo.Filter 79 | *sync.Mutex 80 | } 81 | 82 | // FakeCloser turns an io.Reader into an io.ReadCloser 83 | // where the Close() method does nothing. 84 | func FakeCloser(r io.Reader) io.ReadCloser { 85 | return fakeCloser{r} 86 | } 87 | 88 | type fakeCloser struct { 89 | io.Reader 90 | } 91 | 92 | // Close does nothing except satisfy io.Closer. 93 | func (fc fakeCloser) Close() error { return nil } 94 | 95 | // ctxKey is used for contexts, as recommended by 96 | // https://golang.org/pkg/context/#WithValue. It 97 | // is unexported so values stored by this package 98 | // can only be accessed by this package. 99 | type ctxKey string 100 | 101 | // wrappedClientCtxKey is how the context value is accessed. 102 | var wrappedClientCtxKey ctxKey = "wrapped_client" 103 | 104 | // CheckpointFn is a function that saves a checkpoint. 105 | type CheckpointFn func(checkpoint []byte) error 106 | 107 | // Checkpoint saves a checkpoint for the processing associated 108 | // with the provided context. It overwrites any previous 109 | // checkpoint. Any errors are logged. 110 | func Checkpoint(ctx context.Context, checkpoint []byte) { 111 | wc, ok := ctx.Value(wrappedClientCtxKey).(*WrappedClient) 112 | 113 | if !ok { 114 | log.Printf("[ERROR][%s/%s] Checkpoint function not available; got type %T (%#v)", 115 | wc.ds.ID, wc.acc.UserID, wc, wc) 116 | return 117 | } 118 | 119 | chkpt, err := MarshalGob(checkpointWrapper{wc.commandParams, checkpoint}) 120 | if err != nil { 121 | log.Printf("[ERROR][%s/%s] Encoding checkpoint wrapper: %v", wc.ds.ID, wc.acc.UserID, err) 122 | return 123 | } 124 | 125 | _, err = wc.tl.db.Exec(`UPDATE accounts SET checkpoint=? 
WHERE id=?`, // TODO: LIMIT 1 (see https://github.com/mattn/go-sqlite3/pull/564) 126 | chkpt, wc.acc.ID) 127 | if err != nil { 128 | log.Printf("[ERROR][%s/%s] Checkpoint: %v", wc.ds.ID, wc.acc.UserID, err) 129 | return 130 | } 131 | } 132 | 133 | // checkpointWrapper stores a provider's checkpoint along with the 134 | // parameters of the command that initiated the process; the checkpoint 135 | // will only be loaded and restored to the provider on next run if 136 | // the parameters match, because it doesn't make sense to restore a 137 | // process that has different, potentially conflicting, parameters, 138 | // such as timeframe. 139 | type checkpointWrapper struct { 140 | Params string 141 | Data []byte 142 | } 143 | 144 | // ProcessingOptions configures how item processing is carried out. 145 | type ProcessingOptions struct { 146 | Reprocess bool 147 | Prune bool 148 | Integrity bool 149 | Timeframe Timeframe 150 | Merge MergeOptions 151 | Verbose bool 152 | } 153 | 154 | // MergeOptions configures how items are merged. By 155 | // default, items are not merged; if an item with a 156 | // duplicate ID is encountered, it will be replaced 157 | // with the new item (see the "reprocess" flag). 158 | // Merging has to be explicitly enabled. 159 | // 160 | // Currently, the only way to perform a merge is to 161 | // enable "soft" merging: finding an item with the 162 | // same timestamp and either text data or filename. 163 | // Then, one of the item's IDs is updated to match 164 | // the other. These merge options configure how 165 | // the items are then combined. 166 | // 167 | // As it is possible and likely for both items to 168 | // have non-empty values for the same fields, these 169 | // "conflicts" must be resolved non-interactively. 170 | // By default, a merge conflict prefers existing 171 | // values (old item's field) over the new one, and 172 | // the new one only fills in missing values. (This 173 | // seems safest.) 
However, these merge options allow 174 | // you to customize that behavior and overwrite 175 | // existing values with the new item's fields (only 176 | // happens if new item's field is non-empty, i.e. 177 | // a merge will never delete existing data). 178 | type MergeOptions struct { 179 | // Enables "soft" merging. 180 | // 181 | // If true, an item may be merged if it is likely 182 | // to be the same as an existing item, even if the 183 | // item IDs are different. For example, if a 184 | // service has multiple ways of listing items, but 185 | // does not provide a consistent ID for the same 186 | // item across listings, a soft merge will allow the 187 | // processing to treat them as the same as long as 188 | // other fields match: timestamp, and either data text 189 | // or data filename. 190 | SoftMerge bool 191 | 192 | // Overwrite existing (old) item's ID with the ID 193 | // provided by the current (new) item. 194 | PreferNewID bool 195 | 196 | // Overwrite existing item's text data. 197 | PreferNewDataText bool 198 | 199 | // Overwrite existing item's data file. 200 | PreferNewDataFile bool 201 | 202 | // Overwrite existing item's metadata. 203 | PreferNewMetadata bool 204 | } 205 | 206 | // ListingOptions specifies parameters for listing items 207 | // from a data source. Some data sources might not be 208 | // able to honor all fields. 209 | type ListingOptions struct { 210 | // A file from which to read the data. 211 | Filename string 212 | 213 | // Time bounds on which data to retrieve. 214 | // The respective time and item ID fields 215 | // which are set must never conflict. 216 | Timeframe Timeframe 217 | 218 | // A checkpoint from which to resume 219 | // item retrieval. 220 | Checkpoint []byte 221 | 222 | // Enable verbose output (logs). 
223 | Verbose bool 224 | } 225 | -------------------------------------------------------------------------------- /wrappedclient.go: -------------------------------------------------------------------------------- 1 | package timeliner 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "os" 9 | "sync" 10 | "time" 11 | 12 | cuckoo "github.com/seiflotfy/cuckoofilter" 13 | ) 14 | 15 | // WrappedClient wraps a Client instance with unexported 16 | // fields that contain necessary state for performing 17 | // data collection operations. Do not craft this type 18 | // manually; use Timeline.NewClient() to obtain one. 19 | type WrappedClient struct { 20 | Client 21 | tl *Timeline 22 | acc Account 23 | ds DataSource 24 | 25 | lastItemRowID int64 26 | lastItemTimestamp time.Time 27 | lastItemMu *sync.Mutex 28 | 29 | // used with checkpoints; it only makes sense to resume a checkpoint 30 | // if the process has the same operational parameters as before; 31 | // some providers (like Google Photos) even return errors if you 32 | // query a "next page" with different parameters 33 | commandParams string 34 | } 35 | 36 | // GetLatest gets the most recent items from wc. It does not prune or 37 | // reprocess; only meant for a quick pull (error will be returned if 38 | // procOpt is not compatible). If there are no items pulled yet, all 39 | // items will be pulled. If procOpt.Timeframe.Until is not nil, the 40 | // latest only up to that timestamp will be pulled, and if until is 41 | // after the latest item, no items will be pulled. 
42 | func (wc *WrappedClient) GetLatest(ctx context.Context, procOpt ProcessingOptions) error { 43 | if ctx == nil { 44 | ctx = context.Background() 45 | } 46 | ctx = context.WithValue(ctx, wrappedClientCtxKey, wc) 47 | 48 | if procOpt.Reprocess || procOpt.Prune || procOpt.Integrity || procOpt.Timeframe.Since != nil { 49 | return fmt.Errorf("get-latest does not support -reprocess, -prune, -integrity, or -start") 50 | } 51 | 52 | // get date and original ID of the most recent item for this 53 | // account from the last successful run 54 | var mostRecentTimestamp int64 55 | var mostRecentOriginalID string 56 | if wc.acc.lastItemID != nil { 57 | err := wc.tl.db.QueryRow(`SELECT timestamp, original_id 58 | FROM items WHERE id=? LIMIT 1`, *wc.acc.lastItemID).Scan(&mostRecentTimestamp, &mostRecentOriginalID) 59 | if err != nil && err != sql.ErrNoRows { 60 | return fmt.Errorf("getting most recent item: %v", err) 61 | } 62 | } 63 | 64 | // constrain the pull to the recent timeframe 65 | timeframe := Timeframe{Until: procOpt.Timeframe.Until} 66 | if mostRecentTimestamp > 0 { 67 | ts := time.Unix(mostRecentTimestamp, 0) 68 | timeframe.Since = &ts 69 | if timeframe.Until != nil && timeframe.Until.Before(ts) { 70 | // most recent item is already after "until"/end date; nothing to do 71 | return nil 72 | } 73 | } 74 | if mostRecentOriginalID != "" { 75 | timeframe.SinceItemID = &mostRecentOriginalID 76 | } 77 | 78 | checkpoint := wc.prepareCheckpoint(timeframe) 79 | 80 | wg, ch := wc.beginProcessing(concurrentCuckoo{}, procOpt) 81 | 82 | err := wc.Client.ListItems(ctx, ch, ListingOptions{ 83 | Timeframe: timeframe, 84 | Checkpoint: checkpoint, 85 | Verbose: procOpt.Verbose, 86 | }) 87 | if err != nil { 88 | return fmt.Errorf("getting items from service: %v", err) 89 | } 90 | 91 | // wait for processing to complete 92 | wg.Wait() 93 | 94 | err = wc.successCleanup() 95 | if err != nil { 96 | return fmt.Errorf("processing completed, but error cleaning up: %v", err) 97 | } 98 | 99 
| return nil 100 | } 101 | 102 | // GetAll gets all the items using wc. If procOpt.Reprocess is true, items that 103 | // are already in the timeline will be re-processed. If procOpt.Prune is true, 104 | // items that are not listed on the data source by wc will be removed 105 | // from the timeline at the end of the listing. If procOpt.Integrity is true, 106 | // all items that are listed by wc that exist in the timeline and which 107 | // consist of a data file will be opened and checked for integrity; if 108 | // the file has changed, it will be reprocessed. 109 | func (wc *WrappedClient) GetAll(ctx context.Context, procOpt ProcessingOptions) error { 110 | if wc.Client == nil { 111 | return fmt.Errorf("no client") 112 | } 113 | if ctx == nil { 114 | ctx = context.Background() 115 | } 116 | ctx = context.WithValue(ctx, wrappedClientCtxKey, wc) 117 | 118 | var cc concurrentCuckoo 119 | if procOpt.Prune { 120 | cc.Filter = cuckoo.NewFilter(10000000) // 10mil = ~16 MB on 64-bit 121 | cc.Mutex = new(sync.Mutex) 122 | } 123 | 124 | checkpoint := wc.prepareCheckpoint(procOpt.Timeframe) 125 | 126 | wg, ch := wc.beginProcessing(cc, procOpt) 127 | 128 | err := wc.Client.ListItems(ctx, ch, ListingOptions{ 129 | Checkpoint: checkpoint, 130 | Timeframe: procOpt.Timeframe, 131 | Verbose: procOpt.Verbose, 132 | }) 133 | if err != nil { 134 | return fmt.Errorf("getting items from service: %v", err) 135 | } 136 | 137 | // wait for processing to complete 138 | wg.Wait() 139 | 140 | err = wc.successCleanup() 141 | if err != nil { 142 | return fmt.Errorf("processing completed, but error cleaning up: %v", err) 143 | } 144 | 145 | // commence prune, if requested 146 | if procOpt.Prune { 147 | err := wc.doPrune(cc) 148 | if err != nil { 149 | return fmt.Errorf("processing completed, but error pruning: %v", err) 150 | } 151 | } 152 | 153 | return nil 154 | } 155 | 156 | // prepareCheckpoint sets the current command parameters on wc for 157 | // checkpoints to be saved later on, and 
then returns the last 158 | // checkpoint data only if its parameters match the new/current ones. 159 | // This prevents trying to resume a process with different parameters 160 | // which can cause errors. 161 | func (wc *WrappedClient) prepareCheckpoint(tf Timeframe) []byte { 162 | wc.commandParams = tf.String() 163 | if wc.acc.cp == nil || wc.acc.cp.Params != wc.commandParams { 164 | return nil 165 | } 166 | return wc.acc.cp.Data 167 | } 168 | 169 | func (wc *WrappedClient) successCleanup() error { 170 | // clear checkpoint 171 | _, err := wc.tl.db.Exec(`UPDATE accounts SET checkpoint=NULL WHERE id=?`, wc.acc.ID) // TODO: limit 1 (see https://github.com/mattn/go-sqlite3/pull/802) 172 | if err != nil { 173 | return fmt.Errorf("clearing checkpoint: %v", err) 174 | } 175 | wc.acc.checkpoint = nil 176 | 177 | // update the last item ID, to advance the window for future get-latest operations 178 | wc.lastItemMu.Lock() 179 | lastItemID := wc.lastItemRowID 180 | wc.lastItemMu.Unlock() 181 | if lastItemID > 0 { 182 | _, err = wc.tl.db.Exec(`UPDATE accounts SET last_item_id=? WHERE id=?`, lastItemID, wc.acc.ID) // TODO: limit 1 183 | if err != nil { 184 | return fmt.Errorf("advancing most recent item ID: %v", err) 185 | } 186 | } 187 | 188 | return nil 189 | } 190 | 191 | // Import is like GetAll but for a locally-stored archive or export file that can 192 | // simply be opened and processed, rather than needing to run over a network. See 193 | // the godoc for GetAll. This is only for data sources that support Import. 
194 | func (wc *WrappedClient) Import(ctx context.Context, filename string, procOpt ProcessingOptions) error { 195 | if wc.Client == nil { 196 | return fmt.Errorf("no client") 197 | } 198 | 199 | var cc concurrentCuckoo 200 | if procOpt.Prune { 201 | cc.Filter = cuckoo.NewFilter(10000000) // 10mil = ~16 MB on 64-bit 202 | cc.Mutex = new(sync.Mutex) 203 | } 204 | 205 | wg, ch := wc.beginProcessing(cc, procOpt) 206 | 207 | err := wc.Client.ListItems(ctx, ch, ListingOptions{ 208 | Filename: filename, 209 | Checkpoint: wc.acc.checkpoint, 210 | Timeframe: procOpt.Timeframe, 211 | Verbose: procOpt.Verbose, 212 | }) 213 | if err != nil { 214 | return fmt.Errorf("importing: %v", err) 215 | } 216 | 217 | // wait for processing to complete 218 | wg.Wait() 219 | 220 | err = wc.successCleanup() 221 | if err != nil { 222 | return fmt.Errorf("processing completed, but error cleaning up: %v", err) 223 | } 224 | 225 | // commence prune, if requested 226 | if procOpt.Prune { 227 | err := wc.doPrune(cc) 228 | if err != nil { 229 | return fmt.Errorf("processing completed, but error pruning: %v", err) 230 | } 231 | } 232 | 233 | return nil 234 | } 235 | 236 | func (wc *WrappedClient) doPrune(cuckoo concurrentCuckoo) error { 237 | // absolutely do not allow a prune to happen if the account 238 | // has a checkpoint; this is because we don't store the cuckoo 239 | // filter with checkpoints, meaning that the list of items 240 | // that have been seen is INCOMPLETE, and pruning on that 241 | // would lead to data loss. TODO: Find a way to store the 242 | // cuckoo filter with a checkpoint... 243 | var ckpt []byte 244 | err := wc.tl.db.QueryRow(`SELECT checkpoint FROM accounts WHERE id=? 
LIMIT 1`, 245 | wc.acc.ID).Scan(&ckpt) 246 | if err != nil { 247 | return fmt.Errorf("querying checkpoint: %v", err) 248 | } 249 | if len(ckpt) > 0 { 250 | return fmt.Errorf("checkpoint exists; refusing to prune for fear of incomplete item listing") 251 | } 252 | 253 | // deleting items can't happen while iterating the rows 254 | // since the database table locks; i.e. those two operations 255 | // are in conflict, so we can't do the delete until we 256 | // close the result rows; hence, we have to load each 257 | // item to delete into memory (sigh) and then delete after 258 | // the listing is complete 259 | itemsToDelete, err := wc.listItemsToDelete(cuckoo) 260 | if err != nil { 261 | return fmt.Errorf("listing items to delete: %v", err) 262 | } 263 | 264 | for _, rowID := range itemsToDelete { 265 | err := wc.deleteItem(rowID) 266 | if err != nil { 267 | log.Printf("[ERROR][%s/%s] Deleting item: %v (item_id=%d)", 268 | wc.ds.ID, wc.acc.UserID, err, rowID) 269 | } 270 | } 271 | 272 | return nil 273 | } 274 | 275 | func (wc *WrappedClient) listItemsToDelete(cuckoo concurrentCuckoo) ([]int64, error) { 276 | rows, err := wc.tl.db.Query(`SELECT id, original_id FROM items WHERE account_id=?`, wc.acc.ID) 277 | if err != nil { 278 | return nil, fmt.Errorf("selecting all items from account: %v (account_id=%d)", err, wc.acc.ID) 279 | } 280 | defer rows.Close() 281 | 282 | var itemsToDelete []int64 283 | for rows.Next() { 284 | var rowID int64 285 | var originalID string 286 | err := rows.Scan(&rowID, &originalID) 287 | if err != nil { 288 | return nil, fmt.Errorf("scanning item: %v", err) 289 | } 290 | if originalID == "" { 291 | continue 292 | } 293 | cuckoo.Lock() 294 | existsOnService := cuckoo.Lookup([]byte(originalID)) 295 | cuckoo.Unlock() 296 | if !existsOnService { 297 | itemsToDelete = append(itemsToDelete, rowID) 298 | } 299 | } 300 | if err = rows.Err(); err != nil { 301 | return nil, fmt.Errorf("iterating item rows: %v", err) 302 | } 303 | 304 | return 
itemsToDelete, nil 305 | } 306 | 307 | func (wc *WrappedClient) deleteItem(rowID int64) error { 308 | // before deleting the row, find out whether this item 309 | // has a data file and is the only one referencing it 310 | var count int 311 | var dataFile string 312 | err := wc.tl.db.QueryRow(`SELECT COUNT(*), data_file FROM items 313 | WHERE data_file = (SELECT data_file FROM items 314 | WHERE id=? AND data_file IS NOT NULL 315 | AND data_file != "" LIMIT 1)`, 316 | rowID).Scan(&count, &dataFile) 317 | if err != nil { 318 | return fmt.Errorf("querying count of rows sharing data file: %v", err) 319 | } 320 | 321 | _, err = wc.tl.db.Exec(`DELETE FROM items WHERE id=?`, rowID) // TODO: limit 1 (see https://github.com/mattn/go-sqlite3/pull/802) 322 | if err != nil { 323 | return fmt.Errorf("deleting item from DB: %v", err) 324 | } 325 | 326 | if count == 1 { 327 | err := os.Remove(wc.tl.fullpath(dataFile)) 328 | if err != nil { 329 | return fmt.Errorf("deleting associated data file from disk: %v", err) 330 | } 331 | } 332 | 333 | return nil 334 | } 335 | 336 | // DataSourceName returns the name of the data source wc was created from. 337 | func (wc *WrappedClient) DataSourceName() string { return wc.ds.Name } 338 | 339 | // DataSourceID returns the ID of the data source wc was created from. 340 | func (wc *WrappedClient) DataSourceID() string { return wc.ds.ID } 341 | 342 | // UserID returns the ID of the user associated with this client. 343 | func (wc *WrappedClient) UserID() string { return wc.acc.UserID } 344 | --------------------------------------------------------------------------------