├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── doc
│   ├── MAIL_SETUP.md
│   └── TROUBLESHOOTING_INSTALLS.md
├── install
├── my
│   ├── activitywatch
│   │   └── active_window.py
│   ├── apple
│   │   └── privacy_export.py
│   ├── bash.py
│   ├── blizzard
│   │   └── gdpr.py
│   ├── chess
│   │   └── export.py
│   ├── discord
│   │   └── data_export.py
│   ├── facebook
│   │   └── gdpr.py
│   ├── grouvee
│   │   └── export.py
│   ├── ip
│   │   ├── all.py
│   │   ├── blizzard.py
│   │   ├── discord.py
│   │   └── facebook.py
│   ├── ipython.py
│   ├── league
│   │   └── export.py
│   ├── linkedin
│   │   └── privacy_export.py
│   ├── listenbrainz
│   │   └── export.py
│   ├── location
│   │   └── apple.py
│   ├── mail
│   │   ├── all.py
│   │   ├── common.py
│   │   ├── imap.py
│   │   ├── mbox.py
│   │   └── parse_parts.py
│   ├── mal
│   │   └── export.py
│   ├── minecraft
│   │   └── advancements.py
│   ├── mpv
│   │   └── history_daemon.py
│   ├── offline
│   │   └── listens.py
│   ├── piazza
│   │   └── scraper.py
│   ├── project_euler.py
│   ├── py.typed
│   ├── rss
│   │   └── newsboat
│   │       └── git_history.py
│   ├── runelite
│   │   └── screenshots.py
│   ├── scramble
│   │   └── history.py
│   ├── skype
│   │   └── gdpr.py
│   ├── spotify
│   │   └── gdpr.py
│   ├── steam
│   │   └── scraper.py
│   ├── todotxt
│   │   ├── active.py
│   │   ├── common.py
│   │   └── git_history.py
│   ├── trakt
│   │   └── export.py
│   ├── ttt.py
│   ├── twitch
│   │   ├── all.py
│   │   ├── common.py
│   │   ├── gdpr.py
│   │   └── overrustle_logs.py
│   ├── utils
│   │   ├── backup_to
│   │   │   └── __main__.py
│   │   ├── parse_csv.py
│   │   └── time.py
│   └── zsh.py
├── scripts
│   ├── functions.sh
│   └── lint
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── common.py
    ├── conftest.py
    ├── my
    │   └── my
    │       └── config
    │           └── __init__.py
    ├── test_apple.py
    ├── test_bash.py
    ├── test_commits.py
    ├── test_games.py
    ├── test_ipython.py
    ├── test_my.py
    ├── test_zsh.py
    └── testdata
        ├── bash
        │   └── history
        ├── ipython.sqlite
        └── zsh
            ├── overlap_history
            └── zsh_history
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 | push:
4 | branches: ["*"]
5 | pull_request: # needed to trigger on others' PRs
6 | workflow_dispatch: # needed to trigger workflows manually
7 |
8 | jobs:
9 | build:
10 | strategy:
11 | matrix:
12 | platform: [ubuntu-latest, macos-latest]
13 | python-version: [ "3.10", "3.11", "3.12", "3.13" ]
14 | exclude:
15 | [
16 | { platform: macos-latest, python-version: "3.11" },
17 | { platform: macos-latest, python-version: "3.12" },
18 | ]
19 |
20 | runs-on: ${{ matrix.platform }}
21 |
22 | steps:
23 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
24 |
25 | - uses: actions/setup-python@v4
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 |
29 | - uses: actions/checkout@v4
30 | with:
31 | fetch-depth: 0 # nicer to have all git history when debugging/for tests
32 |
33 | - run: ./install
34 |
35 | - uses: actions/upload-artifact@v4
36 | with:
37 | name: .coverage.mypy-${{ matrix.platform }}_${{ matrix.python-version }}
38 | path: .coverage.mypy/
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tags*
2 | *.priv.job
3 | /*.pdf
4 | Pipfile*
5 |
6 |
7 | # Created by https://www.gitignore.io/api/python,emacs
8 | # Edit at https://www.gitignore.io/?templates=python,emacs
9 |
10 | ### Emacs ###
11 | # -*- mode: gitignore; -*-
12 | *~
13 | \#*\#
14 | /.emacs.desktop
15 | /.emacs.desktop.lock
16 | *.elc
17 | auto-save-list
18 | tramp
19 | .\#*
20 |
21 | # Org-mode
22 | .org-id-locations
23 | *_archive
24 |
25 | # flymake-mode
26 | *_flymake.*
27 |
28 | # eshell files
29 | /eshell/history
30 | /eshell/lastdir
31 |
32 | # elpa packages
33 | /elpa/
34 |
35 | # reftex files
36 | *.rel
37 |
38 | # AUCTeX auto folder
39 | /auto/
40 |
41 | # cask packages
42 | .cask/
43 | dist/
44 |
45 | # Flycheck
46 | flycheck_*.el
47 |
48 | # server auth directory
49 | /server/
50 |
51 | # projectiles files
52 | .projectile
53 |
54 | # directory configuration
55 | .dir-locals.el
56 |
57 | # network security
58 | /network-security.data
59 |
60 |
61 | ### Python ###
62 | # Byte-compiled / optimized / DLL files
63 | __pycache__/
64 | *.py[cod]
65 | *$py.class
66 |
67 | # C extensions
68 | *.so
69 |
70 | # Distribution / packaging
71 | .Python
72 | build/
73 | develop-eggs/
74 | downloads/
75 | eggs/
76 | .eggs/
77 | lib/
78 | lib64/
79 | parts/
80 | sdist/
81 | var/
82 | wheels/
83 | pip-wheel-metadata/
84 | share/python-wheels/
85 | *.egg-info/
86 | .installed.cfg
87 | *.egg
88 | MANIFEST
89 |
90 | # PyInstaller
91 | # Usually these files are written by a python script from a template
92 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
93 | *.manifest
94 | *.spec
95 |
96 | # Installer logs
97 | pip-log.txt
98 | pip-delete-this-directory.txt
99 |
100 | # Unit test / coverage reports
101 | htmlcov/
102 | .tox/
103 | .nox/
104 | .coverage
105 | .coverage.*
106 | .cache
107 | nosetests.xml
108 | coverage.xml
109 | *.cover
110 | .hypothesis/
111 | .pytest_cache/
112 |
113 | # Translations
114 | *.mo
115 | *.pot
116 |
117 | # Scrapy stuff:
118 | .scrapy
119 |
120 | # Sphinx documentation
121 | docs/_build/
122 |
123 | # PyBuilder
124 | target/
125 |
126 | # pyenv
127 | .python-version
128 |
129 | # pipenv
130 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
131 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
132 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
133 | # install all needed dependencies.
134 | #Pipfile.lock
135 |
136 | # celery beat schedule file
137 | celerybeat-schedule
138 |
139 | # SageMath parsed files
140 | *.sage.py
141 |
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 |
146 | # Rope project settings
147 | .ropeproject
148 |
149 | # Mr Developer
150 | .mr.developer.cfg
151 | .project
152 | .pydevproject
153 |
154 | # mkdocs documentation
155 | /site
156 |
157 | # mypy
158 | .mypy_cache/
159 | .dmypy.json
160 | dmypy.json
161 |
162 | # Pyre type checker
163 | .pyre/
164 |
165 | # End of https://www.gitignore.io/api/python,emacs
166 |
167 | cov/
168 | *.png
169 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ### 2024-10-13
2 |
3 | Removed the `my.utils.input_source` code; it doesn't work great with cachew, or in general (passing a custom function didn't always invalidate caching).
4 |
5 | ### 2022-03-20
6 |
7 | See [#33](https://github.com/purarue/HPI/pull/33)
8 |
9 | Replaced `file_backups` modules with `git_history`, using [`git_doc_history`](https://github.com/purarue/git_doc_history)
10 |
11 | I don't expect anyone else was using these modules, but there's a script [here](https://github.com/purarue/git_doc_history/blob/master/bin/file_backups_to_doc_history) to convert from the old format to new. Feel free to open an issue if you were using these -- could maintain them in a separate HPI repo as old repositories
12 |
13 | ### 2022-01-30
14 |
15 | [Relevant PR](https://github.com/purarue/HPI/pull/18); If you're having issues with the `my.config` blocks, compare yours to [mine](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py)
16 |
17 | Renamed some modules to allow for future extension, and less possibilities for conflicts with other related HPI modules under a particular company/service/source
18 |
19 | For reference, [here is the current directory structure](https://github.com/purarue/HPI/tree/eb425e653918d68eb9d41da29e791fe1ba554dc7/my) as of this commit
20 |
21 | In particular, anything with a `gdpr`/`data_export`/`privacy_export` is named to be that, instead of globally squashing the namespace module to the single `modulename.py` file
22 |
23 | Converting a single-file module to a namespace module [is always a breaking change](https://github.com/karlicoss/promnesia/pull/225#issuecomment-819773697). One [can do hacky traceback introspection](https://github.com/karlicoss/HPI/blob/master/my/reddit/__init__.py) to possibly delay the switch from a non-namespace package to a namespace package (and if anyone else is using this code, it's likely in the background through promnesia, so the most likely situation is that they wouldn't notice until I deprecate it anyways), but that's only a temporary solution until the `__init__.py`/`module.py` file is eventually removed -- so better to do them all now instead of waiting till it becomes 'too late'
24 |
25 | A user (or me) may want to write their own module with the same name, meaning they can't use both at the same time if mine is just `my.module_name.py`, since my module existing means any other namespace packages can't have the same base name (see [reorder_editable](https://github.com/purarue/reorder_editable) for an explanation)
26 |
27 | The one motivating this change is `apple.py`, since the old `apple.py` was parsing the privacy export, but I wanted to add something to parse [`imessage`](https://github.com/purarue/HPI/commit/e361ce8182d8be8b331875078ad17605d3f80a50) files. Someone else may want to add other `apple/file.py` files to parse other parts of apple/mac behaviour, but me having the single `apple.py` either means they have to have their repo before mine on their path (but doing so means they overwrite the current `apple.py` file, so they can't use that to parse their privacy export, even if they were trying to do something else entirely), or they have to rename their code to something like `my_apple/file.py` to create a new namespace module
28 |
29 | Possible 'Exceptions' to this:
30 |
31 | - For some files, if the possibility for conflict is low (I can't imagine anyone exporting data from the source in any other way, e.g., `ipython`, `project_euler`) or the name is so specific to the source that it's not needed (e.g. `ttt`, `window_watcher`)
32 | - For files where I can't imagine you'd want both mine and your own/custom implementation at the same time, e.g. if you override `bash` or `zsh`, you're probably creating your own solution to parse that source and don't need mine (if that's not the case, feel free to open an issue)
33 | - For some of my modules, I've renamed them from what they do to their service/project names instead (`albums` to `nextalbums`; `money` to `mint`), so I'm not holding the generic name of some function when I don't really need to (have since moved those to [HPI-personal](https://github.com/purarue/HPI-personal))
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020-2024 purarue
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **TLDR**: I'm using the `HPI` (Human Programming Interface) package as a means of unifying, accessing and interacting with all of my personal data.
2 |
3 | It's a Python library (named `my`), a collection of modules for:
4 |
5 | - social networks: posts, comments, favorites, searches
6 | - shell/program histories (zsh, bash, python, mpv, firefox)
7 | - programming (github/commits)
8 | - instant messaging
9 | - media histories (movies, TV shows, music, video game achievements/history); see
10 |
11 | [_Why?_](https://github.com/karlicoss/HPI#why)
12 |
13 | This is built on top of [`karlicoss/HPI`](https://github.com/karlicoss/HPI). These are all additional modules which aren't present in that repository - this is installed alongside the upstream repository (meaning _you can use both modules from upstream and here simultaneously_), see [#install](#install)
14 |
15 | ### My Modules
16 |
17 | - `my.zsh` and `my.bash`, access to my shell history w/ timestamps
18 | - `my.mail.imap` and `my.mail.mbox` to parse local IMAP syncs of my mail/mbox files -- see [doc/MAIL_SETUP.md](doc/MAIL_SETUP.md)
19 | - `my.mpv.history_daemon`, accesses movies/music w/ activity/metadata that have played on my machine, facilitated by a [mpv history daemon](https://github.com/purarue/mpv-history-daemon)
20 | - `my.discord.data_export`, parses ~1,000,000 messages/events from the discord data export, parser [here](https://github.com/purarue/discord_data)
21 | - `my.todotxt.active` to parse my current [todo.txt](https://github.com/todotxt/todo.txt-cli) file; `my.todotxt.git_history` tracks my history using backups of those files in [`git_doc_history`](https://github.com/purarue/git_doc_history)
22 | - `my.rss.newsboat`, keeps track of when I added/removed RSS feeds (for [`newsboat`](https://newsboat.org/))
23 | - `my.ipython`, for timestamped python REPL history
24 | - `my.ttt`, to parse shell/system history tracked by [`ttt`](https://github.com/purarue/ttt)
25 | - `my.activitywatch.active_window`, to parse active window events (what application I'm using/what the window title is) using [`window_watcher`](https://github.com/purarue/aw-watcher-window) and [activitywatch](https://activitywatch.net/) on android
26 | - `my.chess.export`, to track my [chess.com](https://www.chess.com)/[lichess.org](https://lichess.org/) games, using [`chess_export`](https://github.com/purarue/chess_export)
27 | - `my.trakt.export`, providing my history/ratings for movies/TV shows (episodes) using [`traktexport`](https://github.com/purarue/traktexport)
28 | - `my.listenbrainz.export`, exporting my music listening history from [ListenBrainz](https://listenbrainz.org/) (open-source Last.fm) using [`listenbrainz_export`](https://github.com/purarue/listenbrainz_export)
29 | - `my.offline.listens`, for offline music listen history, using [offline_listens](https://github.com/purarue/offline_listens)
30 | - `my.mal.export`, for anime/manga history using [`malexport`](https://github.com/purarue/malexport)
31 | - `my.grouvee.export`, for my video game history/backlog using [`grouvee_export`](https://github.com/purarue/grouvee_export)
32 | - `my.runelite.screenshots`, parses data from the [automatic runelite screenshots](https://github.com/runelite/runelite/wiki/Screenshot)
33 | - `my.minecraft.advancements`, parses advancement (local achievement data) from the `~/.minecraft` directory
34 | - `my.project_euler`, when I solved [Project Euler](https://projecteuler.net/) problems
35 | - `my.linkedin.privacy_export`, to parse the [privacy export](https://www.linkedin.com/help/linkedin/answer/50191/downloading-your-account-data?lang=en) from linkedin
36 | - `my.scramble.history` for merged (timed) Rubik's cube solves from multiple sources, using [scramble_history](https://github.com/purarue/scramble-history)
37 |
38 | #### 'Historical' Modules
39 |
40 | These are modules to parse GDPR exports/data from services I used to use, but don't anymore. They're here to provide more context into the past.
41 |
42 | - `my.apple.privacy_export`, parses Game Center and location data from the [apple privacy export](https://privacy.apple.com/)
43 | - `my.facebook.gdpr`, to parse the GDPR export from Facebook
44 | - `my.league.export`, gives League of Legends game history using [`lolexport`](https://github.com/purarue/lolexport)
45 | - `my.steam.scraper`, for steam achievement data and game playtime using [`steamscraper`](https://github.com/purarue/steamscraper)
46 | - `my.piazza.scraper`, parsing [piazza](https://piazza.com/) (university forum) posts using [`piazza-scraper`](https://github.com/purarue/piazza-scraper)
47 | - `my.blizzard.gdpr`, for general battle.net event data [parsed from a GDPR export](https://github.com/purarue/blizzard_gdpr_parser)
48 | - `my.skype.gdpr` to parse a couple datetimes from the Skype GDPR export (seems all my data from years ago is long gone)
49 | - `my.spotify.gdpr`, to parse the GDPR export from Spotify, mostly to access songs from my playlists from years ago
50 | - `my.twitch`, merging the [data export](https://www.twitch.tv/p/en/legal/privacy-choices/#user-privacy-requests) and my messages parsed from the [overrustle logs dump](https://github.com/purarue/overrustle_parser)
51 |
52 | See [here](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py) for my `HPI` config
53 |
54 | [Promnesia `Source`s for these `HPI` modules](https://github.com/purarue/promnesia)
55 |
56 | I also have some more personal scripts/modules in a separate repo; [`HPI-personal`](https://github.com/purarue/HPI-personal)
57 |
58 | ### In-use from [karlicoss/HPI](https://github.com/karlicoss/HPI)
59 |
60 | - `my.browser`, to parse browser history using [`browserexport`](https://github.com/purarue/browserexport)
61 | - `my.google.takeout.parser`, parses lots of (~500,000) events (youtube, searches, phone usage, comments, location history) from [google takeouts](https://takeout.google.com/), using [`google_takeout_parser`](https://github.com/purarue/google_takeout_parser)
62 | - `my.coding.commits` to track git commits across the system
63 | - `my.github` to track github events/commits and parse the GDPR export, using [`ghexport`](https://github.com/karlicoss/ghexport)
64 | - `my.reddit`, get saved posts, comments. Uses [`rexport`](https://github.com/karlicoss/rexport) to create backups of recent activity periodically, and [`pushshift`](https://github.com/purarue/pushshift_comment_export) to get old comments.
65 | - `my.smscalls`, exports call/sms history using [SMS Backup & Restore](https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore&hl=en_US)
66 | - `my.stackexchange.stexport`, for stackexchange data using [`stexport`](https://github.com/karlicoss/stexport)
67 |
68 | #### Partially in-use/with overrides:
69 |
70 | - `my.location`, though since I also have some locations from `apple.privacy_export`, I have a [`my.location.apple`](./my/location/apple.py) which I then merge into `my.location.all` in my overridden [`all.py`](https://github.com/purarue/HPI-personal/blob/master/my/location/all.py) file on my personal repo
71 | - similarly, I do use `my.ip` and `my.location.via_ip` from upstream, but I have [overridden `all.py` and module files here](https://github.com/purarue/HPI/tree/master/my/ip)
72 |
73 | 'Overriding' an `all.py` file means replacing the `all.py` from the upstream repo (this means it can use my sources here to grab more locations/ips, since those don't exist in the upstream). For more info see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages), and the [module design](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#adding-new-modules) docs for HPI, but you might be able to get the gist by comparing:
74 |
75 | - [my.location.all](https://github.com/karlicoss/HPI/blob/master/my/location/all.py) in `karlicoss/HPI`
76 | - [my.location.all](https://github.com/purarue/HPI-personal/blob/master/my/location/all.py) in `purarue/HPI-personal`
77 |
78 | Since I've mangled my `PYTHONPATH` (see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages)), it imports from my repo instead of `karlicoss/HPI`. `all.py` files tend to be pretty small -- so overriding/changing a line to add a source is the whole point.
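To give a rough idea of the shape of such a file, here is an illustrative sketch (not the contents of either `all.py` linked above -- the module and function names are made up):

```python
# hypothetical my/<source>/all.py override -- names are illustrative
from typing import Any, Iterator


def history() -> Iterator[Any]:
    # a source that already exists upstream in karlicoss/HPI
    from my.some_source import upstream_module

    yield from upstream_module.history()

    # an extra source that only exists in this repository
    from my.some_source import extra_module

    yield from extra_module.history()
```

Whichever repo appears first on your path supplies the `all.py` that actually gets imported, which is why the ordering described in [#install](#install) matters.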
79 |
80 | ### Companion Tools/Libraries
81 |
82 | Disregarding tools which actively collect data (like [`ttt`](https://github.com/purarue/ttt)/[`window_watcher`](https://github.com/purarue/aw-watcher-window)) or repositories which have their own exporter/parsers which are used here, there are a couple other tools/libraries I've created for this project:
83 |
84 | - [`ipgeocache`](https://github.com/purarue/ipgeocache) - for any IPs gathered from data exports, provides geolocation info, so I have partial location info going back to 2013
85 | - [`sqlite_backup`](https://github.com/purarue/sqlite_backup) - to safely copy/backup application sqlite databases that may currently be in use
86 | - [`git_doc_history`](https://github.com/purarue/git_doc_history) - a bash script to copy/backup files into git history, with a python library to help traverse and create a history/parse diffs between commits
87 | - [`HPI_API`](https://github.com/purarue/HPI_API) - automatically creates a JSON API/server for HPI modules
88 | - [`url_metadata`](https://github.com/purarue/url_metadata) - caches youtube subtitles, url metadata (title, description, image links), and a html/plaintext summary for any URL
89 |
90 | I also use this in [`my_feed`](https://github.com/purarue/my_feed), which creates a feed of media/data using `HPI`, live at
91 |
92 | ### Ad-hoc and interactive
93 |
94 | Some basic examples.
95 |
96 | When was I most using reddit?
97 |
98 | ```python
99 | >>> import collections, my.reddit.all, pprint
100 | >>> pprint.pprint(collections.Counter([c.created.year for c in my.reddit.all.comments()]))
101 | Counter({2016: 3288,
102 | 2017: 801,
103 | 2015: 523,
104 | 2018: 209,
105 | 2019: 65,
106 | 2014: 4,
107 | 2020: 3})
108 | ```
109 |
110 | Most common shell commands?
111 |
112 | ```python
113 | >>> import collections, pprint, my.zsh
114 | # lots of these are git-related aliases
115 | >>> pprint.pprint(collections.Counter([c.command for c in my.zsh.history()]).most_common(10))
116 | [('ls', 51059),
117 | ('gst', 11361),
118 | ('ranger', 6530),
119 | ('yst', 4630),
120 | ('gds', 3919),
121 | ('ec', 3808),
122 | ('clear', 3651),
123 | ('cd', 2111),
124 | ('yds', 1647),
125 | ('ga -A', 1333)]
126 | ```
127 |
128 | What websites do I visit most?
129 |
130 | ```python
131 | >>> import collections, pprint, my.browser.export, urllib.parse
132 | >>> pprint.pprint(collections.Counter([urllib.parse.urlparse(h.url).netloc for h in my.browser.export.history()]).most_common(5))
133 | [('github.com', 20953),
134 | ('duckduckgo.com', 10146),
135 | ('www.youtube.com', 10126),
136 | ('discord.com', 8425),
137 | ('stackoverflow.com', 2906)]
138 | ```
139 |
140 | Song I've listened to most?
141 |
142 | ```python
143 | >>> import collections, my.mpv.history_daemon
144 | >>> collections.Counter([m.path for m in my.mpv.history_daemon.history()]).most_common(1)[0][0]
145 | '/home/username/Music/JPEFMAFIA/JPEGMAFIA - LP! - 2021 - V0/JPEGMAFIA - LP! - 05 HAZARD DUTY PAY!.mp3'
146 | ```
147 |
148 | Movie I've watched most?
149 |
150 | ```python
151 | >>> from collections import Counter; import my.trakt.export
152 | >>> Counter(e.media_data.title for e in my.trakt.export.history()).most_common(1)
153 | [('Up', 92)] # (the pixar movie)
154 | ```
155 |
156 | `hpi` also has a JSON query interface, so I can do quick computations using shell tools like:
157 |
158 | ```bash
159 | # how many calories have I eaten today (from https://github.com/purarue/ttally)
160 | $ hpi query ttally.__main__.food --recent 1d -s | jq -r '(.quantity)*(.calories)' | datamash sum 1
161 | 2258.5
162 | ```
163 |
164 | ### Install
165 |
166 | For the basic setup, I recommend you clone and install both directories as editable installs:
167 |
168 | ```bash
169 | # clone and install upstream as an editable package
170 | git clone https://github.com/karlicoss/HPI ./HPI-karlicoss
171 | python3 -m pip install --user -e ./HPI-karlicoss
172 |
173 | # clone and install my repository as an editable package
174 | git clone https://github.com/purarue/HPI ./HPI-pura
175 | python3 -m pip install --user -e ./HPI-pura
176 | ```
177 |
178 | Editable install means any changes to python files reflect immediately, which is very convenient for debugging and developing new modules. To update, you can just `git pull` in those directories.
179 |
180 | If you care about [overriding modules](https://github.com/purarue/HPI#partially-in-usewith-overrides), make sure your `easy-install.pth` is ordered correctly:
181 |
182 | ```bash
183 | python3 -m pip install --user reorder_editable
184 | python3 -m reorder_editable reorder ./HPI-pura ./HPI-karlicoss
185 | ```
186 |
187 | Then, you likely need to run `hpi module install` for any modules you plan on using -- this can be done incrementally as you set up new modules. E.g.:
188 |
189 | - `hpi module install my.trakt.export` to install dependencies
190 | - Check the [stub config](./tests/my/my/config/__init__.py) or [my config](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py) and setup the config block in your HPI configuration file
191 | - Run `hpi doctor my.trakt.export` to check for any possible config issues/if your data is being loaded properly
192 |
193 | (The [install](./install) script does that for all my modules, but you likely don't want to do that)
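For the second step, the config block generally mirrors the module path; a hypothetical example (the attribute names vary per module -- check the linked stub config for the real ones):

```python
# hypothetical block in ~/.config/my/my/config/__init__.py
class trakt:
    class export:
        # wherever the traktexport JSON dumps are saved
        export_path = "~/data/trakt/*.json"
```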
194 |
195 | It's possible to install both `my` packages because `HPI` is a namespace package. For more information on that, and some of the complications one can run into, see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages), and the [module design](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#adding-new-modules) docs for HPI.
196 |
197 | If you're having issues installing/re-installing, check the [TROUBLESHOOTING_INSTALLS.md](doc/TROUBLESHOOTING_INSTALLS.md) file.
198 |
199 | If you recently updated and it seems like something has broken, check the [CHANGELOG](CHANGELOG.md) for any possible breaking changes
200 |
--------------------------------------------------------------------------------
/doc/MAIL_SETUP.md:
--------------------------------------------------------------------------------
1 | This is a distillation of the steps described in [this issue](https://github.com/purarue/HPI/issues/15)
2 |
3 | There are two mail parsing modules here -- `my.mail.imap` and `my.mail.mbox`. An [`mbox` file](https://docs.python.org/3/library/mailbox.html) is just a collection of email messages in a single text file
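For a quick look at what one contains, the stdlib `mailbox` module (linked above) can read it directly -- a minimal sketch, with a hypothetical path:

```python
import mailbox
import os

# hypothetical path -- point this at any exported mbox file
box = mailbox.mbox(os.path.expanduser("~/Documents/mboxExport/backup.mbox"))
for message in box:
    print(message["subject"])
```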
4 |
5 | You can also use both modules at the same time -- see `my.mail.all` below
6 |
7 | Remember to first run: `hpi module install my.mail.imap` to install the necessary dependencies
8 |
9 | Note: There are _lots of_ different ways email clients/websites will export messages, so any mention of thunderbird add-ons or syncing tools used to back up mail is just an example. Anything that gives you access to either the raw email files or an mbox should do.
10 |
11 | ## `my.mail.imap`
12 |
13 | Personally, I use `my.mail.imap`. To sync my mail, I use [`mutt-wizard`](https://github.com/LukeSmithxyz/mutt-wizard/), which uses `mbsync` under the hood to save a bunch of individual mail files in `~/.local/share/mail` -- updating every 5 minutes.
14 |
15 | There are - of course - hundreds of ways to save your mail locally. Let's take [the ImportExportTools NG thunderbird add-on](https://addons.thunderbird.net/en-US/thunderbird/addon/importexporttools-ng/) as an example (since it's the one we did troubleshooting on in the [issue](https://github.com/purarue/HPI/issues/15)). To match the format `my.mail.imap` expects, select the folder you want to export, then use `Tools > ImportExportToolsNg > Export all messages in the Folder > Plain Text Format`, and export it to a folder somewhere. Then, in your config file, set up the block to point it at that path:
16 |
17 | ```python
18 | class mail:
19 | class imap:
20 |         # path[s]/glob to the mailboxes/IMAP files
21 | # you could also do something like:
22 | # mailboxes = "~/Documents/mbsync/*@*"
23 | # to match any files in that directory with '@' in them
24 | mailboxes = "~/Documents/ExportPlaintext/"
25 |
26 | # filter function which filters the input paths
27 | filter_path: Optional[Callable[[Path], bool]]
28 | ```
29 |
30 | To verify it's finding your files, you can use `hpi query my.mail.imap.files -s` -- that'll print all the matched files
31 |
32 | That may be fine to parse an archive (a backup of some email you don't use anymore), but otherwise you'd need to continuously create new archives/delete old ones.
33 |
34 | Recently, ImportExportTools NG added support for periodic backups, but only in MBOX format. So -->
35 |
36 | ## `my.mail.mbox`
37 |
38 | If you already have access to an mbox file you can skip this -- the following setup is just an example:
39 |
40 | ### Thunderbird add-on
41 |
42 | In `Tools > ImportExportToolsNg > Options > Backup scheduling`, set the `Destination` and `Enable Frequency` to backup once per day, selecting `Just mail files`
43 |
44 | You can force a backup with `Tools > ImportExportToolsNg > Backup`
45 |
46 | Note: you can set the `Overwrite the mbox files with the same name in the destination directory` option to overwrite your backup. Alternatively, since `my.config` is a python script, you could write some custom python function to parse the timestamp from the exported filepath, and then pass those to `mailboxes` in your `my.config`, using only the latest exports as the input (see the sketch below). Though, if you're overwriting the `mbox` files while HPI is trying to parse them, HPI may fail.
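A sketch of what that helper might look like (the filename pattern and paths here are made up -- adjust them to however your export names its files):

```python
# hypothetical snippet for your my.config -- keep only the newest mbox backup
from pathlib import Path


def _latest_mbox(backup_dir: str = "~/Documents/mboxExport") -> str:
    files = sorted(
        Path(backup_dir).expanduser().glob("*.mbox"),
        key=lambda p: p.stat().st_mtime,
    )
    return str(files[-1]) if files else ""


class mail:
    # (plus the imap/mbox classes shown in the next section)
    class mbox:
        mailboxes = _latest_mbox()
```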
47 |
48 | ### Setup mbox
49 |
50 | Once you've exported, set up your configuration to point at the directory. Note that since this uses `my.mail.imap` to parse the messages, you may have to set up a basic config with no files so that module does not fail:
51 |
52 | ```python
53 | class mail:
54 |
55 | class imap:
56 | # signifies no files
57 | mailboxes = ''
58 |
59 | class mbox:
60 |
61 | # paths/glob to the mbox directory -- searches recursively
62 | mailboxes = "~/Documents/mboxExport"
63 |
64 | # additional extensions to ignore
65 |         exclude_extensions = (".sbd",)
66 | ```
67 |
68 | ## `my.mail.all`
69 |
70 | You can also use both of these at the same time -- if you have some exported as individual text files and others as mbox files, set up a config like above, specifying `mailboxes` for both `imap` and `mbox`
71 |
72 | Then -- you can just use the `my.mail.all.mail` function, which returns unique messages from both sources
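As a quick sanity check (assuming the config above is in place), you can call it from the REPL:

```python
>>> import my.mail.all
>>> messages = list(my.mail.all.mail())
>>> len(messages)  # count of unique messages across both sources
```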
73 |
74 | ## Testing
75 |
76 | To make sure this works, you can use the `doctor` and `query` commands, to make sure there are no config errors and it parses your mail properly:
77 |
78 | ```bash
79 | hpi --debug doctor --verbose my.mail.all
80 | hpi --debug doctor --verbose my.mail.imap
81 | hpi --debug doctor --verbose my.mail.mbox
82 | ```
83 |
84 | ```bash
85 | hpi --debug query my.mail.all --stream
86 | hpi --debug query my.mail.imap --stream
87 | hpi --debug query my.mail.mbox --stream
88 | ```
89 |
--------------------------------------------------------------------------------
/doc/TROUBLESHOOTING_INSTALLS.md:
--------------------------------------------------------------------------------
1 | It seems that sometimes installing from git has weird side effects with upgrading?
2 |
3 | If you're having issues -- try doing the following
4 |
5 | I'll use my promnesia modules (at ) as an example.
6 |
7 | Note: though the repository is `promnesia`, the module it installs is `promnesia_pura`. For python packages in general, it's not necessary for the module name to match the repository name (that's just where it's hosted). To figure out what the name of the package is, use `python3 -m pip list`. For this HPI repository, it installs as `HPI-purarue`, so it's possible to differentiate between this and upstream HPI.
8 |
9 | These are directions for installing a package as non-editable (into your python `site-packages`), though it covers uninstalling editable packages -- in case your path is misconfigured in some way. If you want to install as editable, see [reorder_editable](https://github.com/purarue/reorder_editable) and the [install section](https://github.com/purarue/HPI#install) of the README for issues you may run into, or see the [editable](#editable) section of this doc
10 |
11 | Whenever there are directions to use `pip` to do something -- it's safer to do `python3 -m pip` (unless you know exactly what you're doing with managing multiple python installs on your system). That makes sure the `pip` that is being used is the same version as when you use `python3`
12 |
13 | Uninstall the package you're using:
14 |
15 | ```bash
16 | python3 -m pip uninstall -y promnesia_pura
17 | ```
18 |
19 | Make sure it's actually uninstalled -- this should error:
20 |
21 | ```bash
22 | $ python3 -c "import promnesia_pura"
23 | Traceback (most recent call last):
24 |   File "<string>", line 1, in <module>
25 | ModuleNotFoundError: No module named 'promnesia_pura'
26 | ```
27 |
28 | Note: For `HPI` in particular (since it's a namespace package), if you're trying to uninstall my modules but leave `karlicoss`'s (the core) modules installed, `import my` won't error. Instead, try something like `import my.trakt.export`, since that would only appear in my modules.
29 |
30 | If that still imports, you likely have files leftover in your site packages. To find that directory, you can use:
31 |
32 | ```bash
33 | $ python3 -m site
34 | sys.path = [
35 | '/home/username',
36 | '/usr/lib/python310.zip',
37 | '/usr/lib/python3.10',
38 | '/usr/lib/python3.10/lib-dynload',
39 | '/home/username/.local/lib/python3.10/site-packages',
40 | '/home/username/Repos/my_feed/src',
41 | '/home/username/Repos/albums',
42 | '/home/username/Repos/mint/budget',
43 | '/home/username/Repos/HPI-personal',
44 | '/home/username/Repos/HPI',
45 | '/home/username/Repos/HPI-karlicoss',
46 | '/home/username/Repos/promnesia-fork/src',
47 | '/usr/lib/python3.10/site-packages',
48 | ]
49 | USER_BASE: '/home/username/.local' (exists)
50 | USER_SITE: '/home/username/.local/lib/python3.10/site-packages' (exists)
51 | ENABLE_USER_SITE: True
52 | ```
53 |
54 | That should let you see which directories python is scanning for imports. Check any of the `site-packages` directories for files like:
55 |
56 | ```
57 | promnesia_pura
58 | promnesia_pura-0.0.0.dist-info
59 | ```
60 |
61 | and remove those (this is essentially a 'manually uninstall' of a broken package)
62 |
63 | If you've previously installed this as editable, review your editable installs to make sure it's not still there:
64 |
65 | ```bash
66 | python3 -m pip install reorder_editable
67 | python3 -m reorder_editable locate # should show you where editable installs are placing .egg-link files
68 | python3 -m reorder_editable cat
69 | ```
70 |
71 | Refer to the [reorder_editable](https://github.com/purarue/reorder_editable) README for more info on that.
72 |
73 | You should now be able to confirm it errors, like:
74 |
75 | ```bash
76 | $ python3 -c "import promnesia_pura"
77 | Traceback (most recent call last):
78 |   File "<string>", line 1, in <module>
79 | ModuleNotFoundError: No module named 'promnesia_pura'
80 | ```
81 |
82 | Now -- to install it again!
83 |
84 | Instead of installing from git (since that can sometimes cache the result and run into other issues), clone it to some local directory:
85 |
86 | ```bash
87 | git clone https://github.com/purarue/promnesia ./promnesia_pura
88 | ```
89 |
90 | Then, you can install it by pointing it at the directory which includes the `setup.py` file, like: `python3 -m pip install --user ./promnesia_pura`
91 |
92 | You should now be able to confirm it imports properly:
93 |
94 | ```bash
95 | python3 -c "import promnesia_pura"
96 | ```
97 |
98 | ### Editable
99 |
100 | Alternatively, since you already have it locally, you can install it as editable:
101 |
102 | ```bash
103 | python3 -m pip install --user -e ./promnesia_pura
104 | ```
105 |
106 | That should modify your `sys.path` (run `python3 -m site`; and you'll see that directory appear on your path)
107 |
108 | That has the added benefit that whenever you want to update `promnesia_pura`, you can just:
109 |
110 | ```bash
111 | cd /path/to/promnesia_pura
112 | git pull
113 | ```
114 |
--------------------------------------------------------------------------------
/install:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # If the 'CI' environment variable is set, this runs
3 | # like it would on the CI. To try and test that locally,
4 | # can do: CI_SKIP_INSTALL=1 CI=1 ./install
5 | set -o pipefail
6 |
7 | ci() {
8 | [[ -n "${CI}" ]]
9 | }
10 |
11 | ci && set -x
12 |
13 | # script to setup HPI
14 | # - installs karlicoss/HPI as an editable namespace package,
15 | # - installs this repo
16 | # - installs additional python packages for modules
17 | # - checks for any required external commands
18 |
19 | # cd to base directory
20 | BASE_DIR="$(dirname "${BASH_SOURCE[0]}")"
21 | cd "${BASE_DIR}" || exit 1
22 | printf 'In: %s\n' "$(pwd)"
23 |
24 | # function to verify an external command is installed
25 | havecmd() {
26 | local BINARY ERRMSG script_name
27 | script_name='HPI'
28 | # error if first argument isn't provided
29 | BINARY="${1:?Must provide command to check}"
30 | # the command exists, exit with 0 (success!)
31 | if command -v "${BINARY}" >/dev/null 2>&1; then
32 | return 0
33 | else
34 | # construct error message
35 | ERRMSG="'${script_name}' requires '${BINARY}', could not find that on your \$PATH"
36 | if [[ -n "$2" ]]; then
37 | ERRMSG="$ERRMSG. $2"
38 | fi
39 | printf '%s\n' "$ERRMSG" 1>&2
40 | return 1
41 | fi
42 | } && export -f havecmd
43 |
44 | maybe_boxes() {
45 | # Print a fancy box, if boxes is installed
46 | # http://boxes.thomasjensen.com/
47 | if havecmd boxes >/dev/null 2>&1; then
48 | boxes -dshell -pv1h2
49 | else
50 | cat
51 | fi
52 | } && export -f maybe_boxes
53 |
54 | prompt_if_fails() {
55 | ci && return 1
56 | printf "Command failed... Hit Enter to continue, Ctrl+C to exit"
57 | read -r
58 | } && export -f prompt_if_fails
59 |
60 | pipm() {
61 | python3 -m pip "$@"
62 | } && export -f pipm
63 |
64 | pip_install() {
65 | local -a ARGS=()
66 | # only use --user when not in a virtual environment
67 | [[ -z "$VIRTUAL_ENV" ]] && ARGS+=("--user")
68 | ARGS+=("$@")
69 | pipm install "${ARGS[@]}"
70 | } && export -f pip_install
71 |
72 | # install dependencies (with pip) for this module
73 | # this works by traversing the AST/looking for
74 | # a 'REQUIRES' global variable in the
75 | # corresponding file
76 | hpi_module() {
77 | local -a ARGS=()
78 | [[ -z "$VIRTUAL_ENV" ]] && ARGS+=("--user")
79 | ARGS+=("$@")
80 | python3 -m my.core module install "${ARGS[@]}"
81 | } && export -f hpi_module
82 |
83 | ci_install_deps() {
84 | if [[ -n "${CI}" && -z "${CI_SKIP_INSTALL}" ]]; then
85 | # install OS specific stuff here
86 | if [[ "$OSTYPE" == "darwin"* ]]; then
87 | # macos ci
88 | brew install coreutils || return $?
89 | brew install boxes || return $?
90 | else
91 | # ubuntu ci
92 | sudo apt update || return $?
93 | sudo apt install boxes || return $?
94 | fi
95 | pip_install -U pip wheel setuptools || return $?
96 | fi
97 | }
98 |
99 | install_dependencies() {
100 | echo 'Installing dependencies...' | maybe_boxes
101 | if ci; then
102 | ci_install_deps || return $?
103 | fi
104 | }
105 |
106 | required_commands() {
107 | echo 'Checking if required commands are installed...' | maybe_boxes
108 | havecmd realpath || return $?
109 | havecmd git || return $?
110 | }
111 |
112 | ci_realpath() {
113 | if ci; then
114 | if [[ "$OSTYPE" == "darwin"* ]]; then
115 | # need to use g-prefixed things on CI
116 | grealpath "$@"
117 | return $?
118 | fi
119 | fi
120 | realpath "$@"
121 | }
122 |
123 | setup_fork() {
124 | local FORK_ABBREV UPSTREAM_URL UPSTREAM_DIR
125 | echo 'Setting up upstream fork...' | maybe_boxes
126 |
127 | FORK_ABBREV="${HPI_UPSTREAM_FOLDER_NAME:-HPI-karlicoss}"
128 | UPSTREAM_URL='https://github.com/karlicoss/HPI'
129 |
130 | UPSTREAM_DIR="$(ci_realpath "../${FORK_ABBREV}")"
131 |
132 | # clone my fork one level up from here if it does not exist
133 | if [[ ! -e "${UPSTREAM_DIR}" ]]; then
134 | git clone "${UPSTREAM_URL}" "${UPSTREAM_DIR}"
135 | else
136 | echo "Path already exists, skipping clone..."
137 | fi
138 |
139 | # install upstream/core HPI
140 | cd "${UPSTREAM_DIR}" || return $?
141 |
142 | if ci; then
143 | pip_install -e '.[optional,testing]' || return $?
144 | else
145 | pip_install -e '.' || return $?
146 | fi
147 |
148 | # cd back to here, to install this
149 | cd "${BASE_DIR}" || return $?
150 | pip_install -e '.' || return $?
151 | }
152 |
153 | module_dependencies() {
154 | if ! ci; then
155 | printf 'Install all module dependencies? [y/N] '
156 | read -r || return $?
157 | case "${REPLY}" in
158 | y | Y) ;;
159 | *)
160 | return 0
161 | ;;
162 | esac
163 | fi
164 | echo "Installing module dependencies" | maybe_boxes
165 | hpi_module my.activitywatch.active_window || return $?
166 | hpi_module my.discord.data_export || return $?
167 | hpi_module my.todotxt.git_history || return $?
168 | hpi_module my.ip.all || return $?
169 | hpi_module my.linkedin.privacy_export || return $?
170 | hpi_module my.chess.export || return $?
171 | hpi_module my.mpv.history_daemon || return $?
172 | hpi_module my.league.export || return $?
173 | hpi_module my.scramble.history || return $?
174 | hpi_module my.trakt.export || return $?
175 | hpi_module my.mail.all || return $?
176 | hpi_module my.piazza.scraper || return $?
177 | hpi_module my.apple.privacy_export || return $?
178 | hpi_module my.grouvee.export || return $?
179 | hpi_module my.offline.listens || return $?
180 | hpi_module my.mal.export || return $?
181 | hpi_module my.listenbrainz.export || return $?
182 | hpi_module my.skype.gdpr || return $?
183 | }
184 |
185 | ci_config() {
186 | CONF="$(ci_realpath "${BASE_DIR}/tests/my")" || return $?
187 | MY_CONFIG="${CONF}" python3 -m my.core config check 1>&2 || return $?
188 | echo "${CONF}"
189 | }
190 |
191 | ci_tests() {
192 | echo 'Running tests' | maybe_boxes
193 | ci && unset HPI_TESTS_PURA
194 | python3 -m pytest ./tests || return $?
195 | }
196 |
197 | ci_mypy() {
198 | echo 'Checking types' | maybe_boxes
199 | pip_install 'git+https://github.com/python/mypy'
200 | pip_install types-urllib3 types-simplejson types-setuptools types-PyYAML types-pytz types-dateparser types-requests lxml-stubs
201 | local mypy_args=(-p tests --txt-report .coverage.mypy)
202 | while read -r pkg; do
203 | mypy_args+=(-p "${pkg}")
204 | done < <(find my -name '*.py' | grep -v '__' | sed -e 's|\.\/||' -e 's|/|.|g' -e 's/\.py$//g' | sort)
205 | python3 -m mypy "${mypy_args[@]}" || return $?
206 | }
207 |
208 | ci_lint() {
209 | pip_install flake8 || return $?
210 | echo 'Linting...' | maybe_boxes
211 | python3 -m flake8 ./my || return $?
212 | }
213 |
214 | ci_run() {
215 | ci || return 0
216 | CONF="$(ci_config)" || return $?
217 | MY_CONFIG="${CONF}" ci_tests || return $?
218 | MY_CONFIG="${CONF}" ci_mypy || return $?
219 | MY_CONFIG="${CONF}" ci_lint || return $?
220 | }
221 |
222 | main() {
223 | install_dependencies || prompt_if_fails || return $?
224 | required_commands || prompt_if_fails || return $?
225 | # use realpath to make sure BASE_DIR is set properly
226 | BASE_DIR="$(ci_realpath "${BASE_DIR}")" || return $?
227 | (setup_fork) || prompt_if_fails || return $?
228 | module_dependencies || prompt_if_fails || return $?
229 | ci_run || return $?
230 | }
231 |
232 | # if user isn't running this directly
233 | # source the exported functions into the current bash environment
234 | if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then
235 | :
236 | else
237 | # otherwise, run main as usual
238 | main "$@" || exit $?
239 | fi
240 |
--------------------------------------------------------------------------------
/my/activitywatch/active_window.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses history from https://github.com/purarue/aw-watcher-window
3 | using https://github.com/purarue/active_window
4 | """
5 |
6 | REQUIRES = [
7 | "git+https://github.com/purarue/aw-watcher-window",
8 | "git+https://github.com/purarue/active_window",
9 | ]
10 |
11 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
12 | from my.config import activitywatch as user_config # type: ignore[attr-defined]
13 |
14 | from pathlib import Path
15 | from typing import Iterator, Sequence, Union
16 | from functools import partial
17 | from itertools import chain
18 |
19 | from dataclasses import dataclass
20 | from my.core import get_files, Stats, Paths, make_logger, make_config
21 |
22 | from more_itertools import unique_everseen
23 |
24 | import active_window.parse as AW
25 |
26 | logger = make_logger(__name__)
27 |
28 |
29 | @dataclass
30 | class window_config(user_config.active_window):
31 | # path[s]/glob to the backed up aw-window JSON/window_watcher CSV history files
32 | export_path: Paths
33 | error_policy: AW.ErrorPolicy = "drop"
34 |
35 |
36 | config = make_config(window_config)
37 |
38 |
39 | Result = Union[AW.AWAndroidEvent, AW.AWComputerEvent, AW.AWWindowWatcherEvent]
40 | Results = Iterator[Result]
41 |
42 |
43 | def inputs() -> Sequence[Path]:
44 | return get_files(config.export_path)
45 |
46 |
47 | def history() -> Results:
48 | yield from unique_everseen(
49 | chain(
50 | *map(
51 | partial(
52 | AW.parse_window_events,
53 | logger=logger,
54 | error_policy=config.error_policy,
55 | ),
56 | inputs(),
57 | )
58 | ),
59 | key=lambda e: e.timestamp,
60 | )
61 |
62 |
63 | def stats() -> Stats:
64 | from my.core import stat
65 |
66 | return {**stat(history)}
67 |
--------------------------------------------------------------------------------
/my/apple/privacy_export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the apple privacy Export
3 | https://privacy.apple.com/
4 | """
5 |
6 | REQUIRES = ["lxml"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import apple as user_config # type: ignore[attr-defined]
10 | from dataclasses import dataclass
11 | from my.core import PathIsh
12 |
13 |
14 | @dataclass
15 | class config(user_config.privacy_export):
16 | # path to unpacked GDPR archive
17 | gdpr_dir: PathIsh
18 |
19 |
20 | import os
21 | import json
22 | from datetime import datetime, timezone
23 | from pathlib import Path
24 | from typing import Iterator, Dict, Any, NamedTuple, Union, Optional, Sequence
25 |
26 | from lxml import etree # type: ignore[import]
27 | from lxml.etree import _Element
28 | from more_itertools import sliced, first
29 |
30 | Element = Union[_Element, None]
31 |
32 | from my.core import Stats, Res, make_logger
33 | from my.core.cachew import mcachew
34 |
35 |
36 | logger = make_logger(__name__)
37 |
38 |
39 | class Game(NamedTuple):
40 | name: str
41 | last_played: datetime
42 |
43 |
44 | # some duplication here to allow cachew usage
45 | class GameLeaderboardData(NamedTuple):
46 | game_name: str
47 | title: str
48 | dt: datetime
49 | rank: int
50 |
51 |
52 | class GameAchievement(NamedTuple):
53 | dt: datetime
54 | percentage: int
55 | game_name: str
56 | title: str
57 |
58 | @property
59 | def achieved(self) -> bool:
60 | return self.percentage == 100
61 |
62 |
63 | class Location(NamedTuple):
64 | lng: float
65 | lat: float
66 | dt: datetime
67 | name: str
68 | address: Optional[str]
69 |
70 |
71 | Event = Union[
72 | Game,
73 | GameLeaderboardData,
74 | GameAchievement,
75 | Location,
76 | ]
77 |
78 | Results = Iterator[Res[Event]]
79 |
80 |
81 | def _depends_on() -> Sequence[Path]:
82 | return sorted(Path(config.gdpr_dir).expanduser().absolute().rglob("*"))
83 |
84 |
85 | @mcachew(depends_on=_depends_on, logger=logger)
86 | def events() -> Results:
87 | gdpr_dir = Path(config.gdpr_dir).expanduser().absolute() # expand path
88 | handler_map = {
89 | "Apple ID account and device information": None,
90 | "Apple Online and Retail Stores": None,
91 | "iCloud Bookmarks": None, # TODO: parse,
92 | "Wallet Activity": None,
93 | "Game Center/Game Center Data.json": _parse_game_center,
94 | "iCloud Notes": None, # TODO: parse/copy?
95 | "Marketing communications": None,
96 | "iCloud Contacts": None,
97 | "iCloud Calendars and Reminders": None, # TODO: parse
98 | "Other data/Apple Features Using iCloud/EventKit/Locations.xml": _parse_locations,
99 | "Other data/Apple Features Using iCloud/Calendar/": _parse_calendar_recents,
100 | "Other data/Apple Features Using iCloud/Mail": None, # probably better to just do an IMAP sync and get all the data
101 | "Other data/": None, # ignore anything else in this directory
102 | }
103 | for f in gdpr_dir.rglob("*"):
104 | handler: Any
105 | for prefix, h in handler_map.items():
106 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)):
107 | continue
108 | handler = h
109 | break
110 | else:
111 | if f.is_dir():
112 | # rglob("*") matches directories, ignore those
113 | continue
114 | else:
115 | e = RuntimeError(f"Unhandled file: {f}")
116 | logger.debug(str(e))
117 | yield e
118 | continue
119 |
120 | if handler is None:
121 | # explicitly ignored
122 | continue
123 |
124 | yield from handler(f)
125 |
126 |
127 | def stats() -> Stats:
128 | from my.core import stat
129 |
130 | return {
131 | **stat(events),
132 | }
133 |
134 |
135 | def _parse_game_center(
136 | f: Path,
137 | ) -> Iterator[Union[Game, GameLeaderboardData, GameAchievement]]:
138 | for gme in json.loads(f.read_text())["games_state"]:
139 | yield Game(
140 | name=gme["game_name"],
141 | last_played=_parse_apple_utc_date(gme["last_played_utc"]),
142 | )
143 | for lb_inf in gme["leaderboard"]:
144 | for lb_val in lb_inf["leaderboard_score"]:
145 | yield GameLeaderboardData(
146 | game_name=gme["game_name"],
147 | title=lb_inf["leaderboard_title"],
148 | rank=lb_val["rank"],
149 | dt=_parse_apple_utc_date(lb_val["submitted_time_utc"]),
150 | )
151 | for ach_info in gme["achievements"]:
152 | yield GameAchievement(
153 | dt=_parse_apple_utc_date(ach_info["last_update_utc"]),
154 | game_name=gme["game_name"],
155 | percentage=ach_info["percentage_complete"],
156 | title=ach_info["achievements_title"],
157 | )
158 |
159 |
160 | def _parse_locations(f: Path) -> Iterator[Location]:
161 | tr = etree.parse(str(f))
162 | for location in _parse_apple_xml_val(tr.find("array")):
163 | loc_data: Dict[str, Any] = first(list(location.values()))
164 | if "t" in loc_data:
165 | for tstamp in loc_data["t"]:
166 | yield Location(
167 | lng=loc_data["map location"]["longitude"],
168 | lat=loc_data["map location"]["latitude"],
169 | name=loc_data["display name"],
170 | address=loc_data["address"],
171 | dt=tstamp,
172 | )
173 |
174 |
175 | def _parse_calendar_recents(f: Path) -> Iterator[Location]:
176 | tr = etree.parse(str(f))
177 | for location in _parse_apple_xml_val(tr.find("array")):
178 | loc_data: Dict[str, Any] = first(list(location.values()))
179 | if "map location" in loc_data:
180 | if "t" in loc_data:
181 | for tstamp in loc_data["t"]:
182 | yield Location(
183 | lng=loc_data["map location"]["longitude"],
184 | lat=loc_data["map location"]["latitude"],
185 | name=loc_data["display name"],
186 | address=first(loc_data.get("addressArray", []), None),
187 | dt=tstamp,
188 | )
189 |
190 |
191 | # parses Apple's XML file format, which specifies what should be JSON as XML
192 | def _parse_apple_xml_val(xml_el: Element) -> Any:
193 | if xml_el is None:
194 | return None
195 | if xml_el.tag == "array":
196 | return [_parse_apple_xml_val(el) for el in xml_el]
197 | elif xml_el.tag == "dict":
198 | return {
199 | key.text: _parse_apple_xml_val(val) for key, val in sliced(list(xml_el), 2)
200 | }
201 | elif xml_el.tag == "string":
202 | return xml_el.text
203 | elif xml_el.tag == "integer":
204 | assert xml_el.text is not None, f"integer tag has no text: {xml_el}"
205 | return int(xml_el.text)
206 | elif xml_el.tag == "real":
207 | assert xml_el.text is not None, f"real tag has no text: {xml_el}"
208 | return float(xml_el.text)
209 | elif xml_el.tag == "date":
210 | # TODO: make sure this is parsing dates properly
211 | # is this UTC? probably, since others are
212 | assert xml_el.text is not None, f"date tag has no text: {xml_el}"
213 | return datetime.astimezone(
214 | datetime.fromisoformat(xml_el.text.rstrip("Z")), tz=timezone.utc
215 | )
216 | elif xml_el.tag == "data":
217 | return xml_el.text # BASE64 data, dont think I need this
218 | else:
219 | raise RuntimeError(f"Unknown tag: {xml_el.tag}")
220 |
221 |
222 | def _parse_apple_utc_date(dstr: str) -> datetime:
223 | return datetime.astimezone(
224 | datetime.strptime(dstr.rstrip("Z"), r"%m/%d/%Y %H:%M:%S"), tz=timezone.utc
225 | )
226 |
--------------------------------------------------------------------------------
/my/bash.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses bash history (mostly from servers/vps I run)
3 | using the following bootstrap script:
4 | https://github.com/purarue/bootstrap/
5 |
6 | This parses bash history with the following configuration:
7 |
8 | export HISTTIMEFORMAT="%s "
9 | export HISTFILESIZE=-1
10 | export HISTSIZE=-1
11 |     shopt -s histappend # don't overwrite history
12 |     shopt -s cmdhist # save all lines of multi-line commands in the same entry
13 | shopt -s lithist # embedded newlines for multi-line commands
14 |
15 | That adds timestamps to history, making it look like:
16 |
17 | #1620931766
18 | command ls
19 | #1620931767
20 | command ls -al
21 | #1620931737
22 | which ls
23 | """
24 |
25 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
26 | from my.config import bash as user_config # type: ignore[attr-defined]
27 |
28 | from pathlib import Path
29 | from typing import Sequence, List
30 | from datetime import datetime
31 | from typing import NamedTuple, Iterator, Optional
32 | from itertools import chain
33 |
34 | from more_itertools import unique_everseen
35 |
36 | from dataclasses import dataclass
37 | from my.core import get_files, Stats, make_logger, Paths
38 | from my.core.cachew import mcachew
39 | from my.utils.time import parse_datetime_sec
40 |
41 |
42 | @dataclass
43 | class config(user_config):
44 | # path[s]/glob to the exported bash history files
45 | export_path: Paths
46 |
47 |
48 | logger = make_logger(__name__)
49 |
50 |
51 | def inputs() -> Sequence[Path]:
52 | return get_files(config.export_path)
53 |
54 |
55 | # represents one history entry (command)
56 | class Entry(NamedTuple):
57 | dt: datetime
58 | command: str
59 |
60 |
61 | Results = Iterator[Entry]
62 |
63 |
64 | def _cachew_depends_on() -> List[float]:
65 | return [p.stat().st_mtime for p in inputs()]
66 |
67 |
68 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
69 | def history() -> Results:
70 | yield from unique_everseen(
71 | chain(*map(_parse_file, inputs())),
72 | key=lambda h: (
73 | h.dt,
74 | h.command,
75 | ),
76 | )
77 |
78 |
79 | def _parse_file(histfile: Path) -> Results:
80 | dt: Optional[datetime] = None
81 | command_buf = "" # current command
82 | for line in histfile.open(encoding="latin-1"):
83 | if line.startswith("#"):
84 | # parse lines like '#1620931766'
85 | # possible string datetime
86 | sdt = line[1:].strip() # remove newline
87 | try:
88 | newdt = parse_datetime_sec(sdt)
89 | except Exception as e:
90 | logger.debug(f"Error while parsing datetime {e}")
91 | else:
92 | # this case happens when we successfully parse a datetime line
93 | # yield old data, then set newly parsed data to next items datetime
94 | if dt is not None:
95 | # rstrip \n gets rid of the last newline for each command
96 | yield Entry(dt=dt, command=command_buf.rstrip("\n"))
97 | # set new datetime for next entry
98 | dt = newdt
99 | # overwrite command buffer
100 | command_buf = ""
101 | continue
102 | # otherwise, append. this already includes newline
103 | command_buf += line
104 | # yield final command
105 | if dt is not None and command_buf.strip():
106 | yield Entry(dt=dt, command=command_buf.rstrip("\n"))
107 |
108 |
109 | def stats() -> Stats:
110 | from my.core import stat
111 |
112 | return {**stat(history)}
113 |
--------------------------------------------------------------------------------
/my/blizzard/gdpr.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses generic event data from my parsed GDPR data
3 | from: https://github.com/purarue/blizzard_gdpr_parser
4 | """
5 |
6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
7 | from my.config import blizzard as user_config # type: ignore[attr-defined]
8 | from dataclasses import dataclass
9 | from my.core import PathIsh, make_logger
10 | from my.core.cachew import mcachew
11 |
12 |
13 | @dataclass
14 | class config(user_config.gdpr):
15 | # path to the exported data
16 | export_path: PathIsh
17 |
18 |
19 | import json
20 | from pathlib import Path
21 | from datetime import datetime
22 | from typing import NamedTuple, Iterator, Sequence, List
23 | from itertools import chain
24 |
25 | from my.core import get_files, Stats
26 | from my.utils.time import parse_datetime_sec
27 |
28 |
29 | logger = make_logger(__name__)
30 |
31 |
32 | def inputs() -> Sequence[Path]:
33 | return get_files(config.export_path)
34 |
35 |
36 | def _cachew_depends_on() -> List[float]:
37 | return [p.stat().st_mtime for p in inputs()]
38 |
39 |
40 | class Event(NamedTuple):
41 | dt: datetime
42 | event_tag: str
43 | metadata: List[str]
44 |
45 |
46 | Results = Iterator[Event]
47 |
48 |
49 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
50 | def events() -> Results:
51 | yield from chain(*map(_parse_json_file, inputs()))
52 |
53 |
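# judging from the parser below, each export file is a JSON list of
# [timestamp, [tag, "field1|field2|..."]] pairs, e.g. (illustrative values):
#   [[1620931766, ["Activity History", "Login|..."]]]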
54 | def _parse_json_file(p: Path) -> Results:
55 | for e_info in json.loads(p.read_text()):
56 | dt, meta_tuple = e_info
57 | meta_tag, meta_joined = meta_tuple
58 | yield Event(
59 | dt=parse_datetime_sec(dt),
60 | event_tag=meta_tag,
61 | metadata=meta_joined.split("|"),
62 | )
63 |
64 |
65 | def stats() -> Stats:
66 | from my.core import stat
67 |
68 | return {**stat(events)}
69 |
--------------------------------------------------------------------------------
/my/chess/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses chess games from chess.com/lichess.org using
3 | https://github.com/purarue/chess_export
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/chess_export"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import chess as user_config # type: ignore[attr-defined]
10 |
11 |
12 | from pathlib import Path
13 | from typing import Iterator, Sequence, List, Union
14 | from itertools import chain
15 |
16 | import chess_export.chessdotcom.model as cmodel
17 | import chess_export.lichess.model as lmodel
18 | from more_itertools import unique_everseen
19 |
20 | from dataclasses import dataclass
21 | from my.core import get_files, Stats, make_logger, Paths
22 | from my.core.cachew import mcachew
23 |
24 |
25 | @dataclass
26 | class config(user_config.export):
27 | # path[s]/glob to the exported data. These are the resulting JSON files from 'chess_export ... export'
28 | export_path: Paths
29 |
30 |
31 | logger = make_logger(__name__)
32 |
33 |
34 | def inputs() -> Sequence[Path]:
35 | return get_files(config.export_path)
36 |
37 |
38 | # TODO: make extendible? Not sure if anyone has any other things they need to include here though...
39 | Results = Iterator[Union[cmodel.ChessDotComGame, lmodel.LichessGame]]
40 |
41 |
42 | def _cachew_depends_on() -> List[float]:
43 | return [p.stat().st_mtime for p in inputs()]
44 |
45 |
46 | def _parse_export_file(p: Path) -> Results:
47 | # try one, else the other
48 | # typically this raises a KeyError since the JSON didn't match
49 | # what the NamedTuple expects
50 | try:
51 | yield from lmodel.from_export(p)
52 | except Exception:
53 | yield from cmodel.from_export(p)
54 |
55 |
56 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
57 | def history() -> Results:
58 | yield from unique_everseen(
59 | chain(*(_parse_export_file(p) for p in inputs())), key=lambda g: g.end_time
60 | )
61 |
62 |
63 | def stats() -> Stats:
64 | from my.core import stat
65 |
66 | return {**stat(history)}
67 |
--------------------------------------------------------------------------------
/my/discord/data_export.py:
--------------------------------------------------------------------------------
1 | """
2 | Discord Data: messages and events data
3 | """
4 |
5 | REQUIRES = [
6 | "git+https://github.com/purarue/discord_data",
7 | "urlextract",
8 | ]
9 |
10 |
11 | from pathlib import Path
12 | from typing import List
13 | from dataclasses import dataclass
14 |
15 | from my.config import discord as user_config # type: ignore[attr-defined]
16 | from my.core import PathIsh, make_config
17 | from my.core.cachew import mcachew
18 |
19 |
20 | @dataclass
21 | class discord_config(user_config.data_export):
22 | # path to the top level discord export directory
23 | # see https://github.com/purarue/discord_data for more info
24 | export_path: PathIsh
25 |
26 | # whether to guess the compression of the files in the export_path
27 | # this uses kompress.ZipPath, which is a bit experimental
28 | #
29 | # NOTE: before adding this config flag, this was enabled,
30 | # since guess_compression=True on get_files by default
31 | _use_zippath: bool = True
32 |
33 |
34 | config = make_config(discord_config)
35 |
36 |
37 | from typing import Iterator, Optional, Tuple, Set, NamedTuple
38 | from datetime import datetime
39 |
40 | from my.core import make_logger, Stats, get_files
41 | from my.core.structure import match_structure
42 | from discord_data.parse import parse_messages, parse_activity
43 | from discord_data.model import Activity, Message
44 | from urlextract import URLExtract # type: ignore[import]
45 |
46 |
47 | logger = make_logger(__name__)
48 |
49 |
50 | def _remove_suppression(text: str, first_index: int, second_index: int) -> str:
51 |     # add spaces so that adjacent suppressed links like '<link1><link2>'
52 |     # don't get converted into one long link
53 | return (
54 | text[:first_index] # before URL
55 | + " "
56 | + text[first_index + 1 : second_index] # URL itself
57 | + " "
58 | + text[second_index + 1 :] # after URL
59 | )
60 |
61 |
62 | extractor = URLExtract()
63 |
64 |
65 | def _remove_link_suppression(
66 | content: str, *, urls: Optional[List[Tuple[str, Tuple[int, int]]]] = None
67 | ) -> str:
68 |     # fix content to remove discord link suppression if any links had it
69 |     # e.g. a link wrapped in angle brackets like '<link>' is a suppressed link
70 |
71 | if urls is None:
72 | urls = extractor.find_urls(content, get_indices=True)
73 |
74 | if not urls:
75 | return content.strip()
76 |
77 | for _, (start_index, end_index) in urls:
78 | before_ind = start_index - 1
79 | after_ind = end_index
80 | try:
81 | if content[before_ind] == "<" and content[after_ind] == ">":
82 | content = _remove_suppression(content, before_ind, after_ind)
83 | # could happen if the url didn't have braces and we hit the end of a string
84 | except IndexError:
85 | continue
86 | return content.strip()
87 |
88 |
89 | def test_remove_link_suppression() -> None:
90 |     content = "<test>"
91 | left = content.index("<")
92 | right = content.index(">")
93 | assert _remove_suppression(content, left, right) == " test "
94 |
95 | # shouldn't affect this at all
96 | content = "https://urlextract.readthedocs.io"
97 | assert _remove_link_suppression(content) == content
98 |
99 |     content = "<https://urlextract.readthedocs.io>"
100 | expected = content.strip("<").strip(">")
101 | assert _remove_link_suppression(content) == expected
102 |
103 |     content = "Here is some text<https://urlextract.readthedocs.io>"
104 | expected = "Here is some text https://urlextract.readthedocs.io"
105 | assert _remove_link_suppression(content) == expected
106 |
107 |     content = "text<https://urlextract.readthedocs.io>other text"
108 | expected = "text https://urlextract.readthedocs.io other text"
109 | assert _remove_link_suppression(content) == expected
110 |
111 | content = (
112 | "t other f "
113 | )
114 | expected = (
115 | "t https://urlextract.readthedocs.io other github.com f other.website"
116 | )
117 | assert _remove_link_suppression(content) == expected
118 |
119 |     content = "t<https://urlextract.readthedocs.io>other.website"
120 | expected = "t https://urlextract.readthedocs.io other.website"
121 | assert _remove_link_suppression(content) == expected
122 |
123 |
124 | def _cachew_depends_on() -> List[str]:
125 | return [str(p) for p in get_files(config.export_path)]
126 |
127 |
128 | EXPECTED_DISCORD_STRUCTURE = ("messages/index.json", "account/user.json")
129 |
130 |
131 | def get_discord_exports() -> Iterator[Path]:
132 | for exp in get_files(config.export_path, guess_compression=config._use_zippath):
133 |         # weak type check here; ZipPath is a bit experimental, so we don't want to import it just for an isinstance check
134 | # see https://github.com/karlicoss/HPI/blob/master/my/core/kompress.py#L160
135 | if type(exp).__name__ == "ZipPath":
136 | yield exp
137 | continue
138 | with match_structure(
139 | exp, expected=EXPECTED_DISCORD_STRUCTURE
140 | ) as discord_export:
141 | yield from discord_export
142 |
143 |
144 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
145 | def messages() -> Iterator[Message]:
146 | emitted: Set[int] = set()
147 | for discord_export in get_discord_exports():
148 | message_dir = discord_export / "messages"
149 | for msg in parse_messages(message_dir):
150 | if isinstance(msg, Exception):
151 | logger.warning(msg)
152 | continue
153 | if msg.message_id in emitted:
154 | continue
155 | yield Message(
156 | message_id=msg.message_id,
157 | timestamp=msg.timestamp,
158 | channel=msg.channel,
159 | content=_remove_link_suppression(msg.content),
160 | attachments=msg.attachments,
161 | )
162 | emitted.add(msg.message_id)
163 |
164 |
165 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
166 | def activity() -> Iterator[Activity]:
167 | emitted: Set[str] = set()
168 | for discord_export in get_discord_exports():
169 | activity_dir = discord_export / "activity"
170 | for act in parse_activity(activity_dir):
171 | if isinstance(act, Exception):
172 | logger.warning(act)
173 | continue
174 | if act.event_id in emitted:
175 | continue
176 | yield act
177 | emitted.add(act.event_id)
178 |
179 |
180 | class Reaction(NamedTuple):
181 | message_id: int
182 | emote: str
183 | timestamp: datetime
184 |
185 |
186 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
187 | def reactions() -> Iterator[Reaction]:
188 | for act in activity():
189 | jd = act.json_data
190 | if "emoji_name" in jd and "message_id" in jd:
191 | yield Reaction(
192 | message_id=int(jd["message_id"]),
193 | emote=jd["emoji_name"],
194 | timestamp=act.timestamp,
195 | )
196 |
197 |
198 | class AppLaunch(NamedTuple):
199 | name: str
200 | timestamp: datetime
201 |
202 |
203 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
204 | def app_launches() -> Iterator[AppLaunch]:
205 | for act in activity():
206 | jd = act.json_data
207 | name = jd.get("game") or jd.get("application")
208 | if name is not None:
209 | yield AppLaunch(
210 | name=name,
211 | timestamp=act.timestamp,
212 | )
213 |
214 |
215 | def stats() -> Stats:
216 | from my.core import stat
217 |
218 | return {
219 | **stat(messages),
220 | **stat(activity),
221 | **stat(reactions),
222 | **stat(app_launches),
223 | }
224 |
--------------------------------------------------------------------------------
/my/facebook/gdpr.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the facebook GDPR Export
3 | """
4 |
5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
6 | from my.config import facebook as user_config # type: ignore[attr-defined]
7 | from dataclasses import dataclass
8 | from my.core import PathIsh
9 |
10 |
11 | @dataclass
12 | class config(user_config.gdpr):
13 | gdpr_dir: PathIsh # path to unpacked GDPR archive
14 |
15 |
16 | import os
17 | import json
18 | from datetime import datetime
19 | from pathlib import Path
20 | from itertools import chain
21 | from typing import Iterator, Dict, Any, NamedTuple, Union, Optional, List
22 |
23 |
24 | from my.core import get_files, Stats, Res, Json, make_logger
25 | from my.utils.time import parse_datetime_sec
26 |
27 |
28 | logger = make_logger(__name__)
29 |
30 | FacebookJson = Dict[Any, Any]
31 |
32 |
33 | class Contact(NamedTuple):
34 | name: str
35 | phone_number: str
36 | created: datetime
37 | updated: datetime
38 |
39 |
40 | class Action(NamedTuple):
41 | description: str
42 | dt: datetime
43 | metadata: Json = {}
44 |
45 |
46 | # (logs/account activity)
47 | class AdminAction(NamedTuple):
48 | description: str
49 | dt: datetime
50 | ip: str
51 | user_agent: str
52 | metadata: Json = {}
53 |
54 |
55 | class Search(NamedTuple):
56 | query: str
57 | dt: datetime
58 |
59 |
60 | class UploadedPhoto(NamedTuple):
61 | dt: datetime
62 | ip: str
63 |
64 |
65 | class Post(NamedTuple):
66 | content: str
67 | dt: datetime
68 | action: Optional[str]
69 |
70 |
71 | class Comment(NamedTuple):
72 | action: str
73 | dt: datetime
74 | content: str
75 | metadata: Optional[str]
76 |
77 |
78 | class AcceptedEvent(NamedTuple):
79 | name: str
80 | starts_dt: datetime
81 | ends_dt: datetime
82 |
83 |
84 | class Friend(NamedTuple):
85 | name: str
86 | dt: datetime
87 | added: bool # whether this was when I added a friend or removed one
88 |
89 |
90 | # i.e. a PM
91 | class Message(NamedTuple):
92 | author: str
93 | dt: datetime
94 | content: str
95 | metadata: Optional[str] = None
96 |
97 |
98 | # a chain of messages back and forth, with one or more people
99 | class Conversation(NamedTuple):
100 | title: str
101 | participants: List[str]
102 | messages: List[Message]
103 |
104 |
105 | Event = Union[
106 | Contact,
107 | Conversation,
108 | Friend,
109 | UploadedPhoto,
110 | AcceptedEvent,
111 | Action,
112 | Post,
113 | Comment,
114 | Search,
115 | AdminAction,
116 | Contact,
117 | ]
118 |
119 | Results = Iterator[Res[Event]]
120 |
121 |
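# handler_map below maps path prefixes (relative to the gdpr_dir) to parser functions;
# None means the file is recognized but intentionally ignored. for example, a file like
# <gdpr_dir>/friends/friends.json would be routed to _parse_friends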
122 | def events() -> Results:
123 | # get files 2 levels deep into the export
124 | gdpr_dir = str(Path(config.gdpr_dir).expanduser().absolute()) # expand path
125 | files = chain(*map(lambda f: f.rglob("*"), get_files(config.gdpr_dir)))
126 | handler_map = {
127 | "about_you/face_recog": None,
128 | "about_you/friend_peer": None,
129 | "about_you/your_address_books": _parse_address_book,
130 | "ads": None,
131 | "apps_and_websites/apps_and_websites": _parse_installed_apps,
132 | "apps_and_websites/posts_from_apps_and_websites": _parse_app_posts,
133 | "comments/comments": _parse_group_comments,
134 | "events/event_invitations": None, # just parse the ones I accepted
135 | "events/your_event_responses": _parse_joined_events,
136 | "following_and": None, # I have no data here
137 | "friends/friends": _parse_friends,
138 | "friends/received_friend_requests": None, # Not interested
139 | "friends/rejected_friend": None, # Not interested
140 | "friends/sent_friend": None, # Not interested
141 | "friends/removed_": _parse_deleted_friends,
142 | "groups/your_group_membership": _parse_group_activity,
143 | "groups/your_posts_and_comments": _parse_group_posts,
144 | "likes_and_reactions/pages": _parse_page_likes,
145 | "likes_and_reactions/posts_and_comments": _parse_reactions,
146 | "location": None, # No data
147 | "marketplace": None,
148 | "other_activity": None,
149 | "pages": None,
150 | "payment_history": None,
151 | "photos_and_videos/album": _parse_photo_ips, # ip info for where images were uplodaed from
152 | "photos_and_videos/": None, # pull these out in my/photos.py
153 | "profile_information/profile_information.json": None,
154 | "saved_items": None,
155 | "stories": None,
156 | "your_places": None,
157 | "posts/your_posts": _parse_posts,
158 | "search_history": _parse_search_history,
159 | "profile_information/profile_update_history": _parse_posts,
160 | "messages/stickers_used": None, # no one needs stickers o_o
161 | "messages/": _parse_conversation,
162 | "security_and_login_information/account_activity": _parse_account_activity,
163 | "security_and_login_information/authorized_logins": _parse_authorized_logins,
164 | "security_and_login_information/administrative_records": _parse_admin_records,
165 | "security_and_login_information/where_you": None,
166 | "security_and_login_information/used_ip_addresses": None,
167 | "security_and_login_information/account_status_changes": None,
168 | "security_and_login_information/logins_and_logouts": None,
169 | "security_and_login_information/login_protection": None,
170 | "security_and_login_information/datr_cookie": None,
171 | "posts/other_people's_posts_to_your_timeline": None, # maybe implement this? OtherComment NamedTuple? Comment should just be mine
172 | }
173 | for f in files:
174 | handler: Any
175 | for prefix, h in handler_map.items():
176 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)):
177 | continue
178 | handler = h
179 | break
180 | else:
181 | if f.is_dir():
182 |                 # rglob("*") matches directories, as well as any subdirectories/json files in those
183 | # this is here exclusively for the messages dir, which has a larger structure
184 | # json files from inside the dirs are still picked up by rglob
185 | continue
186 | else:
187 | e = RuntimeError(f"Unhandled file: {f}")
188 | logger.debug(str(e))
189 | yield e
190 | continue
191 |
192 | if handler is None:
193 | # explicitly ignored
194 | continue
195 |
196 | if f.suffix != ".json":
197 | continue
198 |
199 | j = json.loads(f.read_text())
200 | yield from handler(j)
201 |
202 |
203 | def _parse_address_book(d: FacebookJson) -> Iterator[Contact]:
204 | # remove top-level address book name
205 | for addr_book_top in d.values():
206 | for addr_book_list in addr_book_top.values():
207 | for contact in addr_book_list:
208 | yield Contact(
209 | name=contact["name"],
210 | phone_number=contact["details"][0]["contact_point"],
211 | created=parse_datetime_sec(contact["created_timestamp"]),
212 | updated=parse_datetime_sec(contact["updated_timestamp"]),
213 | )
214 |
215 |
216 | def _parse_installed_apps(d: FacebookJson) -> Iterator[Action]:
217 | for app in d["installed_apps"]:
218 | yield Action(
219 | description="{} was installed".format(app["name"]),
220 | dt=parse_datetime_sec(app["added_timestamp"]),
221 | )
222 |
223 |
224 | def _parse_app_posts(d: FacebookJson) -> Iterator[Action]:
225 | for post in d["app_posts"]:
226 | yield Action(
227 | description=post["title"], dt=parse_datetime_sec(post["timestamp"])
228 | )
229 |
230 |
231 | def _parse_photo_ips(d: FacebookJson) -> Iterator[UploadedPhoto]:
232 | for photo_info in d["photos"]:
233 | if (
234 | "media_metadata" in photo_info
235 | and "photo_metadata" in photo_info["media_metadata"]
236 | and "upload_ip" in photo_info["media_metadata"]["photo_metadata"]
237 | ):
238 | yield UploadedPhoto(
239 | dt=parse_datetime_sec(photo_info["creation_timestamp"]),
240 | ip=photo_info["media_metadata"]["photo_metadata"]["upload_ip"],
241 | )
242 |
243 |
244 | def _parse_group_comments(d: FacebookJson) -> Iterator[Comment]:
245 | for comment in d["comments"]:
246 | yield Comment(
247 | content=comment["data"][0]["comment"]["comment"],
248 | action=comment["title"],
249 | dt=parse_datetime_sec(comment["timestamp"]),
250 | metadata=comment["data"][0]["comment"]["group"],
251 | )
252 |
253 |
254 | def _parse_joined_events(d: FacebookJson) -> Iterator[AcceptedEvent]:
255 | for event in d["event_responses"]["events_joined"]:
256 | yield AcceptedEvent(
257 | name=event["name"],
258 | starts_dt=parse_datetime_sec(event["start_timestamp"]),
259 | ends_dt=parse_datetime_sec(event["end_timestamp"]),
260 | )
261 |
262 |
263 | def _parse_friends(d: FacebookJson) -> Iterator[Friend]:
264 | for friend in d["friends"]:
265 | yield Friend(
266 | name=friend["name"], dt=parse_datetime_sec(friend["timestamp"]), added=True
267 | )
268 |
269 |
270 | def _parse_deleted_friends(d: FacebookJson) -> Iterator[Friend]:
271 | for friend in d["deleted_friends"]:
272 | yield Friend(
273 | name=friend["name"], dt=parse_datetime_sec(friend["timestamp"]), added=False
274 | )
275 |
276 |
277 | def _parse_group_activity(d: FacebookJson) -> Iterator[Action]:
278 | for gr in d["groups_joined"]:
279 | yield Action(
280 | description=gr["title"],
281 | dt=parse_datetime_sec(gr["timestamp"]),
282 | )
283 |
284 |
285 | def _parse_group_posts(d: FacebookJson) -> Iterator[Union[Comment, Post]]:
286 | for log_data_list in d.values():
287 | for comm_list in log_data_list.values():
288 | for comm in comm_list:
289 | data_keys = comm["data"][0].keys()
290 | if "comment" in data_keys:
291 | yield Comment(
292 | content=comm["data"][0]["comment"]["comment"],
293 | action=comm["title"],
294 | dt=parse_datetime_sec(comm["timestamp"]),
295 | metadata=comm["data"][0]["comment"]["group"],
296 | )
297 | else:
298 | yield Post(
299 | content=comm["data"][0]["post"],
300 | action=comm["title"],
301 | dt=parse_datetime_sec(comm["timestamp"]),
302 | )
303 |
304 |
305 | def _parse_page_likes(d: FacebookJson) -> Iterator[Action]:
306 | for page in d["page_likes"]:
307 | yield Action(
308 | description="Liked Page {}".format(page["name"]),
309 | dt=parse_datetime_sec(page["timestamp"]),
310 | )
311 |
312 |
313 | def _parse_reactions(d: FacebookJson) -> Iterator[Action]:
314 | for react in d["reactions"]:
315 | yield Action(
316 | description=react["title"], dt=parse_datetime_sec(react["timestamp"])
317 | )
318 |
319 |
320 | def _parse_search_history(d: FacebookJson) -> Iterator[Search]:
321 | for search in d["searches"]:
322 | assert len(search["data"]) == 1
323 | yield Search(
324 | query=search["data"][0]["text"], dt=parse_datetime_sec(search["timestamp"])
325 | )
326 |
327 |
328 | def _parse_conversation(
329 | d: FacebookJson,
330 | ) -> Iterator[Res[Conversation]]: # will only return 1 convo
331 | participants: List[str] = [p["name"] for p in d["participants"]]
332 | good_messages: List[Message] = []
333 | for m in _parse_messages_in_conversation(d["messages"]):
334 | # propagate up exception if one exists
335 | if isinstance(m, Exception):
336 | yield m
337 | else:
338 | good_messages.append(m)
339 | yield Conversation(
340 | participants=participants,
341 | title=d["title"],
342 | messages=good_messages,
343 | )
344 |
345 |
346 | def _parse_messages_in_conversation(
347 | messages: List[FacebookJson],
348 | ) -> Iterator[Res[Message]]:
349 | for m in messages:
350 | timestamp = parse_datetime_sec(m["timestamp_ms"] / 1000)
351 | author = m["sender_name"]
352 | if m["type"] == "Unsubscribe":
353 | continue
354 | elif m["type"] in ["Generic", "Share"]:
355 |             # eh, I don't care that much about these in context, can do analysis on my/photos.py on its own
356 | if any([k in m for k in ["photos", "sticker"]]):
357 | continue
358 | elif "content" in m:
359 | yield Message(
360 | dt=timestamp,
361 | author=author,
362 | content=m["content"],
363 | metadata=m.get("share"),
364 | )
365 | # if this just actually does not have a field with content for some reason, ignore it
366 | elif set(m.keys()).issubset(set(["sender_name", "timestamp_ms", "type"])):
367 | continue
368 | else:
369 | yield RuntimeError(
370 | "Not sure how to parse message without 'photos' or 'content': {}".format(
371 | m
372 | )
373 | )
374 | else:
375 | yield RuntimeError("Not sure how to parse message for type: {}".format(m))
376 |
377 |
378 | # yikes. this is pretty much whenever I posted *anything*, or a third party app communicated
379 | # back to facebook that I listened to something/played a game, so it has like 5000 events
380 | #
381 | # not sure if I hit all the types, but this yields RuntimeErrors if it can't parse something,
382 | # so just check hpi doctor to make sure it's all gooood
383 | # or
384 | # list(filter(lambda e: isinstance(e, Exception), events())),
385 | # throw an 'import pdb; pdb.set_trace()' at where it's throwing the error
386 | # and add a new case for a new type of post
387 | def _parse_posts(d: FacebookJson) -> Iterator[Res[Union[Post, Action]]]:
388 | all_posts = d
389 | # handle both profile updates and posts
390 | if isinstance(all_posts, dict) and "profile_updates" in all_posts:
391 | all_posts = all_posts["profile_updates"]
392 | for post in all_posts:
393 | if "attachments" in post:
394 | att = post["attachments"]
395 | # e.g. photo with a description
396 | # make sure the structure looks like a media post
397 | # traverse into the image metadata post to see if we can find a description
398 | if len(att) >= 1 and "data" in att[0] and len(att[0]["data"]) >= 1:
399 | # make sure each data item has only one item of media
400 | if all([len(attach["data"]) == 1 for attach in att]):
401 | att_data = [attach["data"][0] for attach in att]
402 |                     # switch over posts that have descriptions (e.g. me describing what the photo is), and posts that don't
403 | for dat in att_data:
404 | if "media" in dat:
405 | mdat = dat["media"]
406 | # image where I described something
407 | if "description" in mdat:
408 | yield Action(
409 | description=mdat["description"],
410 | dt=parse_datetime_sec(post["timestamp"]),
411 | metadata=mdat,
412 | )
413 |                             # image when I just posted to an album
414 | elif "title" in mdat:
415 | yield Action(
416 | description="Posted to Album {}".format(
417 | mdat["title"]
418 | ),
419 | dt=parse_datetime_sec(post["timestamp"]),
420 | metadata=mdat,
421 | )
422 | else:
423 | yield RuntimeError(
424 | "No known way to parse image post {}".format(post)
425 | )
426 | elif "place" in dat:
427 | # check-in into place
428 | if "name" in dat["place"]:
429 | yield Action(
430 | description="Visited {}".format(
431 | dat["place"]["name"]
432 | ),
433 | dt=parse_datetime_sec(post["timestamp"]),
434 | metadata=dat,
435 | )
436 | else:
437 | yield RuntimeError(
438 | "No known way to parse location post {}".format(
439 | post
440 | )
441 | )
442 | elif "life_event" in dat:
443 | # started high school etc.
444 | ddat = dat["life_event"]
445 | yield Action(
446 | description=ddat["title"],
447 | dt=parse_datetime_sec(post["timestamp"]),
448 | metadata=ddat,
449 | )
450 | # third party app event (e.g. Listened to Spotify Song)
451 | elif "title" in post:
452 | if "external_context" in dat:
453 | if "title" in post:
454 | yield Action(
455 | description=post["title"],
456 | dt=parse_datetime_sec(post["timestamp"]),
457 | metadata=dat,
458 | )
459 |                         # seems like bad data handling on facebook's part.
460 | # these are still events,
461 | # but it does not have an external context,
462 |                         # it's like a stringified version of the data
463 | elif "text" in dat:
464 | yield Action(
465 | description=post["title"],
466 | dt=parse_datetime_sec(post["timestamp"]),
467 | metadata=dat,
468 | )
469 | else:
470 | yield RuntimeError(
471 | "No known way to parse attachment post with title {}".format(
472 | post
473 | )
474 | )
475 | else: # unknown data type
476 | yield RuntimeError(
477 | "No known way to parse data type with attachment {}".format(
478 | post
479 | )
480 | )
481 | else: # unknown structure
482 | yield RuntimeError(
483 | "No known way to parse data from post {}".format(post)
484 | )
485 | else:
486 | yield RuntimeError(
487 | "No known way to parse attachment post {}".format(post)
488 | )
489 | elif "data" in post and len(post["data"]) == 1:
490 | dat = post["data"][0]
491 | # basic post I wrote on my timeline
492 | if "post" in dat and isinstance(dat["post"], str) and "title" in post:
493 | yield Post(
494 | content=dat["post"],
495 | dt=parse_datetime_sec(post["timestamp"]),
496 | action=post["title"],
497 | )
498 | elif "profile_update" in dat:
499 | yield Action(
500 | description="Updated Profile",
501 | dt=parse_datetime_sec(post["timestamp"]),
502 | metadata=dat["profile_update"],
503 | )
504 | else:
505 | yield RuntimeError("No known way to parse basic post {}".format(post))
506 | # post without any actual content (e.g. {'timestamp': 1334515711, 'title': 'purarue posted in club'})
507 | # treat this as an action since I have no content here
508 | elif set(("timestamp", "title")) == set(post.keys()):
509 | yield Action(
510 | description=post["title"], dt=parse_datetime_sec(post["timestamp"])
511 | )
512 | else:
513 | yield RuntimeError("No known way to parse post {}".format(post))
514 |
515 |
516 | def _parse_account_activity(d: FacebookJson) -> Iterator[AdminAction]:
517 | for ac in d["account_activity"]:
518 | yield AdminAction(
519 | description=ac["action"],
520 | dt=parse_datetime_sec(ac["timestamp"]),
521 | ip=ac["ip_address"],
522 | user_agent=ac["user_agent"],
523 | )
524 |
525 |
526 | def _parse_authorized_logins(d: FacebookJson) -> Iterator[AdminAction]:
527 | for ac in d["recognized_devices"]:
528 | metadata = {}
529 | if "updated_timestamp" in ac:
530 | metadata["updated_at"] = parse_datetime_sec(ac["updated_timestamp"])
531 | yield AdminAction(
532 | description="Known Device: {}".format(ac["name"]),
533 | dt=parse_datetime_sec(ac["created_timestamp"]),
534 | ip=ac["ip_address"],
535 | user_agent=ac["user_agent"],
536 | metadata=metadata,
537 | )
538 |
539 |
540 | def _parse_admin_records(d: FacebookJson) -> Iterator[AdminAction]:
541 | for rec in d["admin_records"]:
542 | s = rec["session"]
543 | yield AdminAction(
544 | description=rec["event"],
545 | dt=parse_datetime_sec(s["created_timestamp"]),
546 | ip=s["ip_address"],
547 | user_agent=s["user_agent"],
548 | )
549 |
550 |
551 | def stats() -> Stats:
552 | from my.core import stat
553 |
554 | return {
555 | **stat(events),
556 | }
557 |
--------------------------------------------------------------------------------
/my/grouvee/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the CSV export from https://www.grouvee.com/
3 | """
4 |
5 | REQUIRES = ["git+https://github.com/purarue/grouvee_export"]
6 |
7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
8 | from my.config import grouvee as user_config # type: ignore[attr-defined]
9 |
10 | from pathlib import Path
11 | from typing import Iterator, List
12 | from functools import lru_cache
13 |
14 | from more_itertools import last
15 | import grouvee_export.dal as G
16 |
17 | from dataclasses import dataclass
18 | from my.core import get_files, Stats, Paths
19 |
20 |
21 | @dataclass
22 | class config(user_config.export):
23 | # path[s]/glob to the exported CSV files
24 | export_path: Paths
25 |
26 |
27 | def _latest_input() -> Path:
28 | """Since the exports are complete exports, can just use the most recent export"""
29 | return last(sorted(get_files(config.export_path), key=lambda p: p.stat().st_mtime))
30 |
31 |
32 | # should typically only parse the latest dump
33 | @lru_cache(maxsize=None)
34 | def _read_grouvee_export(p: Path) -> List[G.Game]:
35 | return list(G.parse_export(p))
36 |
37 |
38 | def games() -> Iterator[G.Game]:
39 | yield from _read_grouvee_export(_latest_input())
40 |
41 |
42 | def _filter_games_for_shelf(name: str) -> Iterator[G.Game]:
43 | for game in games():
44 | if name in (s.name for s in game.shelves):
45 | yield game
46 |
47 |
48 | def played() -> Iterator[G.Game]:
49 | """Games I've Played"""
50 | yield from _filter_games_for_shelf("Played")
51 |
52 |
53 | def watched() -> Iterator[G.Game]:
54 | """Games I've watched, not played"""
55 | yield from _filter_games_for_shelf("Watched")
56 |
57 |
58 | def backlog() -> Iterator[G.Game]:
59 | """Games on my backlog"""
60 | yield from _filter_games_for_shelf("Backlog")
61 |
62 |
63 | def wish_list() -> Iterator[G.Game]:
64 | """Games on my wish list"""
65 | yield from _filter_games_for_shelf("Wish List")
66 |
67 |
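# for example, a shelf can be dumped from the command line via HPI's query
# interface (assuming the hpi CLI is installed):
#   hpi query my.grouvee.export.played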
68 | def stats() -> Stats:
69 | from my.core import stat
70 |
71 | return {
72 | **stat(played),
73 | **stat(watched),
74 | **stat(backlog),
75 | **stat(wish_list),
76 | }
77 |
--------------------------------------------------------------------------------
/my/ip/all.py:
--------------------------------------------------------------------------------
1 | """
2 | Combines IPs from data exports which include IP addresses
3 | """
4 |
5 | REQUIRES = ["git+https://github.com/purarue/ipgeocache"]
6 |
7 | from typing import Iterator
8 |
9 | from my.ip.common import IP # type: ignore[import]
10 |
11 | from my.core import make_logger, Stats
12 |
13 | logger = make_logger(__name__)
14 |
15 |
16 | # can add more sources here, or disable them through core.disabled_modules
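# e.g. a rough sketch of disabling a source in your my.config (using the
# core.disabled_modules mechanism mentioned above):
#   class core:
#       disabled_modules = ["my.ip.discord"]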
17 | def ips() -> Iterator[IP]:
18 | from . import facebook
19 | from . import discord
20 | from . import blizzard
21 |
22 | yield from facebook.ips()
23 | yield from discord.ips()
24 | yield from blizzard.ips()
25 |
26 |
27 | def stats() -> Stats:
28 | from my.core import stat
29 |
30 | return {**stat(ips)}
31 |
--------------------------------------------------------------------------------
/my/ip/blizzard.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 |
3 | from my.core import Stats
4 | from my.core.source import import_source
5 | from my.ip.common import IP # type: ignore[import]
6 |
7 |
8 | @import_source(module_name="my.blizzard.gdpr")
9 | def ips() -> Iterator[IP]:
10 | from my.blizzard.gdpr import events as blizzard_events
11 |
12 | for e in blizzard_events():
13 | if e.event_tag == "Activity History":
14 | yield IP(dt=e.dt, addr=e.metadata[-2])
15 |
16 |
17 | def stats() -> Stats:
18 | from my.core import stat
19 |
20 | return {**stat(ips)}
21 |
--------------------------------------------------------------------------------
/my/ip/discord.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 |
3 | from my.ip.common import IP, drop_private # type: ignore[import]
4 |
5 |
6 | from my.core import make_logger, Stats
7 | from my.core.cachew import mcachew
8 | from my.core.source import import_source
9 |
10 | logger = make_logger(__name__)
11 |
12 |
13 | @import_source(module_name="my.discord.data_export")
14 | def ips() -> Iterator[IP]:
15 | from my.discord.data_export import activity, _cachew_depends_on
16 |
17 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
18 | def _discord_ips() -> Iterator[IP]:
19 | for a in activity():
20 | if a.fingerprint.ip is not None:
21 | yield IP(dt=a.timestamp, addr=a.fingerprint.ip)
22 |
23 | yield from drop_private(_discord_ips())
24 |
25 |
26 | def stats() -> Stats:
27 | from my.core import stat
28 |
29 | return {**stat(ips)}
30 |
--------------------------------------------------------------------------------
/my/ip/facebook.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator, List
2 | from pathlib import Path
3 |
4 | from my.ip.common import IP, drop_private # type: ignore[import]
5 |
6 | from my.core import make_logger, Stats
7 | from my.core.source import import_source
8 | from my.core.cachew import mcachew
9 |
10 |
11 | logger = make_logger(__name__)
12 |
13 |
14 | def _cachew_depends_on() -> List[float]:
15 | from my.facebook.gdpr import config as facebook_config
16 |
17 | return [p.stat().st_mtime for p in Path(facebook_config.gdpr_dir).rglob("*")]
18 |
19 |
20 | @import_source(module_name="my.facebook.gdpr")
21 | def ips() -> Iterator[IP]:
22 | from my.facebook.gdpr import (
23 | AdminAction,
24 | UploadedPhoto,
25 | events as facebook_events,
26 | )
27 |
28 | @mcachew(
29 | depends_on=_cachew_depends_on,
30 | logger=logger,
31 | )
32 | def _facebook_ips() -> Iterator[IP]:
33 | for e in facebook_events():
34 | if isinstance(e, AdminAction) or isinstance(e, UploadedPhoto):
35 | if not isinstance(e, Exception):
36 | yield IP(dt=e.dt, addr=e.ip)
37 |
38 | yield from drop_private(_facebook_ips())
39 |
40 |
41 | def stats() -> Stats:
42 | from my.core import stat
43 |
44 | return {**stat(ips)}
45 |
--------------------------------------------------------------------------------
/my/ipython.py:
--------------------------------------------------------------------------------
1 | """
2 | Get IPython (REPL) History with datetimes
3 | https://ipython.readthedocs.io/en/stable/api/generated/IPython.core.history.html?highlight=hist#IPython.core.history.HistoryAccessor.__init__
4 |
5 | In order to save python history with timestamps, I define the following in my zshrc:
6 |
7 |     # if I type python without any arguments, launch ipython instead
8 | python() { python3 "$@" }
9 | python3() {
10 | if (( $# == 0 )); then
11 | echo -e "$(tput setaf 2)Launching ipython instead...$(tput sgr0)"
12 | ipython
13 | else
14 | /usr/bin/python3 "$@"
15 | fi
16 | }
17 | """
18 |
19 | REQUIRES = ["ipython>=8.5.0"]
20 |
21 |
22 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
23 | from my.config import ipython as user_config # type: ignore[attr-defined]
24 |
25 | from dataclasses import dataclass
26 | from my.core import Paths
27 |
28 |
29 | @dataclass
30 | class config(user_config):
31 | # path[s]/glob to the exported ipython sqlite databases
32 | export_path: Paths
33 |
34 |
35 | from pathlib import Path
36 | from datetime import datetime
37 | from typing import Iterable, NamedTuple, Iterator, Optional
38 | from itertools import chain
39 |
40 | from more_itertools import unique_everseen
41 | from IPython.core.history import HistoryAccessor
42 |
43 | from my.core import get_files, Stats, make_logger
44 |
45 | logger = make_logger(__name__)
46 |
47 |
48 | class Command(NamedTuple):
49 | dt: datetime
50 | command: str
51 |
52 |
53 | Results = Iterator[Command]
54 |
55 |
56 | # Return backed up sqlite databases
57 | def inputs() -> Iterable[Path]:
58 | yield from get_files(config.export_path)
59 |
60 |
61 | def _live_history() -> Results:
62 | # the empty string makes IPython use the live history file ~/.local/share/ipython/.../history.sqlite
63 | # instead of one of the files from the export backup
64 | # merge histories combines those
65 | #
66 | # seems that this has the possibility to fail to locate your live
67 |     # history file if it's being run in the background? unsure why
68 | try:
69 | yield from _parse_database(sqlite_database="")
70 | except Exception as e:
71 | logger.warning(f"Failed to get data from current ipython database: {e}")
72 | return
73 |
74 |
75 | def history() -> Results:
76 | yield from unique_everseen(
77 | chain(*(_parse_database(str(p)) for p in inputs()), _live_history()),
78 | key=lambda h: (h.command, h.dt),
79 | )
80 |
81 |
82 | def _parse_database(sqlite_database: str) -> Results:
83 | hist = HistoryAccessor(hist_file=sqlite_database) # type: ignore[no-untyped-call]
84 | try:
85 | total_sessions: Optional[int] = hist.get_last_session_id()
86 | except Exception as e:
87 | logger.warning(f"Failed to get last session id: {e}")
88 | # if database is corrupt/fails to compute sessions, skip
89 | return
90 | if total_sessions is None:
91 | return
92 | # yes, these start at 1
93 | for sess in range(1, total_sessions + 1):
94 | # get when this session started, use that as timestamp
95 | session_info = hist.get_session_info(sess)
96 | assert len(session_info) == 5 # sanity checks
97 | start_time = session_info[1]
98 | assert isinstance(start_time, datetime)
99 | for msg in hist.get_range(sess).fetchall(): # sqlite cursor
100 | assert len(msg) == 3
101 | assert isinstance(msg[-1], str)
102 | yield Command(command=msg[-1], dt=start_time)
103 |
104 |
105 | def stats() -> Stats:
106 | from my.core import stat
107 |
108 | return {**stat(history)}
109 |
--------------------------------------------------------------------------------
/my/league/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses league of legend history from https://github.com/purarue/lolexport
3 | """
4 |
5 | REQUIRES = ["git+https://github.com/purarue/lolexport"]
6 |
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import league as user_config # type: ignore[attr-defined]
10 | from dataclasses import dataclass
11 | from my.core import Paths
12 |
13 |
14 | @dataclass
15 | class config(user_config.export):
16 |     # path[s]/glob to the exported data. These are the resulting JSON files from 'lolexport parse', or v5 exports
17 | export_path: Paths
18 |
19 | # league of legends username
20 | username: str
21 |
22 |
23 | from pathlib import Path
24 | from typing import Iterator, Sequence, Optional
25 |
26 | from my.core import get_files, Stats, Res, make_logger
27 |
28 | from lolexport.merge import Game, merge_game_histories
29 | import lolexport.log as llog
30 | from logzero import setup_logger # type: ignore[import]
31 |
32 | logger = make_logger(__name__)
33 |
34 | # configure logs
35 | llog.logger = setup_logger(name="lolexport", level=logger.level)
36 |
37 |
38 | def inputs() -> Sequence[Path]:
39 | return get_files(config.export_path)
40 |
41 |
42 | Results = Iterator[Res[Game]]
43 |
44 |
45 | def history(summoner_name: Optional[str] = None) -> Results:
46 | sname = summoner_name or config.username
47 | for g in merge_game_histories(list(inputs()), username=sname):
48 | try:
49 | g._serialize() # try parsing the data from this
50 | yield g
51 | except Exception as ex:
52 | yield ex
53 |
54 |
55 | def stats() -> Stats:
56 | from my.core import stat
57 |
58 | return {**stat(history)}
59 |
--------------------------------------------------------------------------------
/my/linkedin/privacy_export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the linkedin privacy/export
3 | https://www.linkedin.com/help/linkedin/answer/50191/downloading-your-account-data?lang=en
4 | """
5 |
6 | REQUIRES = ["dateparser"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import linkedin as user_config # type: ignore[attr-defined]
10 |
11 | from dataclasses import dataclass
12 | from my.core import PathIsh
13 |
14 |
15 | @dataclass
16 | class config(user_config.privacy_export):
17 | # path to unpacked privacy export, or a zip
18 | gdpr_dir: PathIsh
19 |
20 |
21 | import csv
22 | from datetime import datetime, timezone
23 | from pathlib import Path
24 | from typing import Iterator, Dict, cast, Optional
25 | from io import StringIO
26 |
27 | import dateparser
28 |
29 | from my.core import Stats, make_logger
30 | from my.core.structure import match_structure
31 |
32 |
33 | logger = make_logger(__name__)
34 |
35 |
36 | EXPECTED = (
37 | "Registration.csv",
38 | "messages.csv",
39 | "Jobs",
40 | "Profile.csv",
41 | )
42 |
43 |
44 | def input() -> Path:
45 | return Path(config.gdpr_dir).expanduser().absolute()
46 |
47 |
48 | DATELIKE_KEYS = {"date", "time"}
49 | ENDSWITH_KEYS = {" on", " at"}
50 |
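# so, for example, CSV headers like "Connected On" or "Invitation Date" (illustrative
# names) are treated as datetime-like fields by Event.iter_dts below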
51 |
52 | def _dateparser_to_utc(val: str) -> Optional[datetime]:
53 | dt_data = dateparser.parse(val)
54 | if dt_data is not None:
55 | return datetime.fromtimestamp(dt_data.timestamp(), tz=timezone.utc)
56 | return None
57 |
58 |
59 | @dataclass
60 | class Event:
61 | data: Dict[str, str]
62 | event_type: str # file name this was read from
63 |
64 | def iter_dts(self) -> Iterator[datetime]:
65 | for k, v in self.data.items():
66 | kl = k.lower()
67 | for en in ENDSWITH_KEYS:
68 | if kl.endswith(en):
69 | data = _dateparser_to_utc(v)
70 | if data is not None:
71 | yield data
72 | for dk in DATELIKE_KEYS:
73 | if dk in kl:
74 | data = _dateparser_to_utc(v)
75 | if data is not None:
76 | yield data
77 |
78 | @property
79 | def dt(self) -> Optional[datetime]:
80 | """Try to parse a datetime from this event"""
81 | if hasattr(self, "_dt"):
82 | return cast(datetime, getattr(self, "_dt"))
83 | dts = list(set(self.iter_dts()))
84 | if len(dts) >= 1:
85 | if len(dts) >= 2:
86 | logger.debug(f"Parsed multiple dates from {self.data}: {dts}")
87 | setattr(self, "_dt", dts[0])
88 | return dts[0]
89 | return None
90 |
91 |
92 | Results = Iterator[Event]
93 |
94 |
95 | def events() -> Iterator[Event]:
96 | with match_structure(input(), expected=EXPECTED, partial=True) as exports:
97 | for exp in exports:
98 | for csv_file in exp.rglob("*"):
99 | if not csv_file.is_file():
100 | continue
101 | yield from _csv_to_json(csv_file)
102 |
103 |
104 | # TODO: cache?
105 | def connections() -> Iterator[Event]:
106 | yield from filter(lambda f: f.event_type == "connections", events())
107 |
108 |
109 | def _csv_to_json(p: Path) -> Iterator[Event]:
110 | event_type = p.stem.strip().casefold().replace(" ", "_")
111 | text = p.read_text()
112 | # some items have 'Notes:' at the top, which are useless when parsing
113 | if text.startswith("Notes:\n"):
114 |         # hopefully this is robust enough? -- seems to always be another line after the note
115 | if "\n\n" in text.strip():
116 | text = text.split("\n\n", maxsplit=1)[1]
117 | reader = csv.reader(StringIO(text))
118 | header = next(reader)
119 | header_mapping = {i: t for i, t in enumerate(header)}
120 | for line in reader:
121 | # ignore empty lines -- not sure why they're present sometimes
122 | if "".join(line).strip() == "":
123 | continue
124 | yield Event(
125 | event_type=event_type,
126 | data={header_mapping[i]: line[i] for i in header_mapping},
127 | )
128 |
129 |
130 | def stats() -> Stats:
131 | from my.core import stat
132 |
133 | return {
134 | **stat(events),
135 | **stat(connections),
136 | }
137 |
--------------------------------------------------------------------------------
/my/listenbrainz/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses scrobbles from https://listenbrainz.org/ using
3 | https://github.com/purarue/listenbrainz_export
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/listenbrainz_export"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import listenbrainz as user_config # type: ignore[attr-defined]
10 |
11 |
12 | from pathlib import Path
13 | from typing import Iterator, Sequence
14 | from itertools import chain
15 |
16 | from listenbrainz_export.parse import Listen, iter_listens
17 | from more_itertools import unique_everseen
18 |
19 | from dataclasses import dataclass
20 | from my.core import get_files, Stats, make_logger, Paths
21 |
22 |
23 | @dataclass
24 | class config(user_config.export):
25 | # path[s]/glob to the exported data
26 | export_path: Paths
27 |
28 |
29 | logger = make_logger(__name__)
30 |
31 |
32 | def inputs() -> Sequence[Path]:
33 | return get_files(config.export_path)
34 |
35 |
36 | Results = Iterator[Listen]
37 |
38 |
39 | def _parse_export_file(p: Path) -> Results:
40 | # remove any items which have null as listen date
41 | # (may have been listening to something when export happened)
42 | yield from filter(lambda lst: lst.listened_at is not None, iter_listens(p))
43 |
44 |
45 | def history() -> Results:
46 | yield from unique_everseen(
47 | chain(*(_parse_export_file(p) for p in inputs())),
48 | key=lambda lst: lst.listened_at,
49 | )
50 |
51 |
52 | def stats() -> Stats:
53 | from my.core import stat
54 |
55 | return {**stat(history)}
56 |
--------------------------------------------------------------------------------
/my/location/apple.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 |
3 | from my.core.source import import_source
4 | from my.location.common import Location # type: ignore[import]
5 |
6 |
7 | @import_source(module_name="my.apple.privacy_export")
8 | def locations() -> Iterator[Location]:
9 | from my.apple.privacy_export import events, Location as AppleLocation
10 |
11 | for a in events():
12 | if isinstance(a, AppleLocation) and not isinstance(a, Exception):
13 | yield Location(
14 | lon=a.lng,
15 | lat=a.lat,
16 | dt=a.dt,
17 | accuracy=50.0,
18 | elevation=None,
19 | datasource="apple",
20 | )
21 |
--------------------------------------------------------------------------------
/my/mail/all.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 | from itertools import chain
3 |
4 | from my.core import Stats
5 | from my.core.source import import_source
6 |
7 | REQUIRES = ["mail-parser", "dateparser"]
8 |
9 | MAIL_HELP = "https://github.com/purarue/HPI/blob/master/doc/MAIL_SETUP.md"
10 |
11 | src_imap = import_source(module_name="my.mail.imap", help_url=MAIL_HELP)
12 | src_mbox = import_source(module_name="my.mail.mbox", help_url=MAIL_HELP)
13 |
14 |
15 | # top-level import -- this whole module requires mail-parser/dateparser
16 | from .common import Email, unique_mail, MessagePart
17 |
18 |
19 | @src_imap
20 | def _mail_imap() -> Iterator[Email]:
21 | from . import imap
22 |
23 | return imap.mail()
24 |
25 |
26 | @src_mbox
27 | def _mail_mbox() -> Iterator[Email]:
28 | from . import mbox
29 |
30 | return mbox.mail()
31 |
32 |
33 | # NOTE: you can comment out the sources you don't want
34 | def mail() -> Iterator[Email]:
35 | yield from unique_mail(
36 | chain(
37 | _mail_mbox(),
38 | _mail_imap(),
39 | )
40 | )
41 |
42 |
43 | def mail_subparts() -> Iterator[MessagePart]:
44 | for m in mail():
45 | yield from m.subparts
46 |
47 |
48 | def stats() -> Stats:
49 | from my.core import stat
50 |
51 | return {**stat(mail)}
52 |
--------------------------------------------------------------------------------
/my/mail/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from email.message import Message
4 | from typing import (
5 | List,
6 | Tuple,
7 | TextIO,
8 | Iterator,
9 | Optional,
10 | Union,
11 | Dict,
12 | Any,
13 | cast,
14 | )
15 | from datetime import datetime
16 | from dataclasses import dataclass
17 |
18 | import dateparser
19 | from mailparser import MailParser # type: ignore[import]
20 | from mailparser.exceptions import MailParserReceivedParsingError # type: ignore[import]
21 | from more_itertools import unique_everseen
22 |
23 | from my.core import make_logger, __NOT_HPI_MODULE__ # noqa: F401
24 |
25 | from .parse_parts import tag_message_subparts
26 |
27 | REQUIRES = ["mail-parser", "dateparser"]
28 |
29 | # silence all mailparser logs
30 | # https://stackoverflow.com/a/55396144
31 | mlog = logging.getLogger("mailparser")
32 | for handler in mlog.handlers.copy():
33 | mlog.removeHandler(handler)
34 | mlog.addHandler(logging.NullHandler())
35 | mlog.propagate = False
36 |
37 | logger = make_logger(__name__)
38 |
39 |
40 | @dataclass
41 | class MessagePart:
42 | content_type: str
43 | payload: Any
44 | _email: "Email"
45 |
46 |
47 | class Email(MailParser):
48 | """
49 | subclass of the mailparser which
50 | supports serialization by my.core.serialize
51 | along with a few other convenience functions
52 | """
53 |
54 | # note: The 'message' property on this class
55 | # is the stdlib email.Message class:
56 | # https://docs.python.org/3/library/email.message.html#module-email.message
57 | def __init__(self, message: Message) -> None:
58 | super().__init__(message=message)
59 | self.filepath: Optional[Path] = None
60 | self._dt: Optional[datetime] = None # property to cache datetime result
61 | self._dateparser_failed: bool = False # if dateparser previously failed
62 |
63 | @property
64 | def dt(self) -> Optional[datetime]:
65 | """
66 | Try to parse datetime if mail date wasn't in RFC 2822 format
67 | """
68 | if self._dt is not None:
69 | return self._dt
70 | if self._dateparser_failed:
71 | return None
72 | # If date was parsed properly by mailparser
73 | d = self.date
74 | if isinstance(d, datetime):
75 | self._dt = d
76 | return self._dt
77 | if "Date" in self.headers:
78 | dateparser_res: Optional[datetime] = dateparser.parse(self.headers["Date"])
79 | # if this failed to parse, save it on the object
80 | if dateparser_res is None:
81 | self._dateparser_failed = True
82 | return None
83 | else:
84 | self._dt = dateparser_res
85 | return self._dt
86 | return None
87 |
88 | def _serialize(self) -> Dict[str, Any]:
89 | return {
90 | "filepath": self.filepath,
91 | "bcc": self.bcc,
92 | "cc": self.cc,
93 | "date": self.dt,
94 | "date_utc": self.date_utc,
95 | "delivered_to": self.delivered_to,
96 | "from": self.from_,
97 | "message_id": self.message_id,
98 | "received": self.received,
99 | "reply_to": self.reply_to,
100 | "subject": self.subject,
101 | "to": self.to,
102 | "by": self.by,
103 | "envelope_from": self.envelope_from,
104 | "envelope_sender": self.envelope_sender,
105 | "for": getattr(self, "for"),
106 | "hop": self.hop,
107 | "with": getattr(self, "with"),
108 | "body": self.body,
109 | "body_html": self.body_html,
110 | "body_plain": self.body_plain,
111 | "attachments": self.attachments,
112 | "sender_ip_address": self.sender_ip_address,
113 | "to_domains": self.to_domains,
114 | }
115 |
116 | @property
117 | def description(self) -> str:
118 | return f"""From: {describe_persons(self.from_)}
119 | To: {describe_persons(self.to)}
120 | Subject: {self.subject}"""
121 |
122 | @classmethod
123 | def safe_parse(
124 | cls, fp: Union[str, bytes, Message, TextIO], display_filename: Path
125 | ) -> Optional["Email"]:
126 | try:
127 | if isinstance(fp, bytes):
128 | m = cls.from_bytes(fp)
129 | elif isinstance(fp, str):
130 | m = cls.from_string(fp)
131 | elif isinstance(fp, Message):
132 | # convert the email.Message (or a subclass) to this class
133 | m = cls(message=fp)
134 | else:
135 | m = cls.from_file_obj(fp)
136 | return cast(Email, m)
137 | except UnicodeDecodeError as e:
138 | logger.debug(f"While parsing {display_filename}: {e}")
139 | except MailParserReceivedParsingError as e:
140 | logger.debug(f"While parsing {display_filename}: {e}")
141 | except AttributeError as e:
142 | # error in the 'find_between' function when
143 |             # the epilogue fails to be parsed
144 | if str(e) == "'NoneType' object has no attribute 'index'":
145 | logger.debug(
146 | f"While parsing {display_filename}, epilogue failed to be parsed: {e}"
147 | )
148 | else:
149 | logger.debug(
150 | f"Unknown error while parsing {display_filename}: {e}, skipping...",
151 | exc_info=e,
152 | )
153 | except Exception as e:
154 | logger.warning(
155 | f"Unknown error while parsing {display_filename}: {e}, skipping...",
156 | exc_info=e,
157 | )
158 | return None
159 |
160 | @classmethod
161 | def safe_parse_path(cls, path: Path) -> Optional["Email"]:
162 | with path.open("rb") as bf:
163 | m = cls.safe_parse(try_decode_buf(bf.read()), display_filename=path)
164 | if m is None:
165 | return None
166 | m.filepath = path
167 | return m
168 |
169 | @property
170 | def subparts(self) -> Iterator[MessagePart]:
171 | for payload, content_type in tag_message_subparts(self.message):
172 | yield MessagePart(
173 | content_type=content_type,
174 | payload=payload,
175 | _email=self,
176 | )
177 |
178 |
179 | def unique_mail(emails: Iterator[Email]) -> Iterator[Email]:
180 | # remove duplicates (from a file being
181 | # in multiple boxes and the 'default' inbox)
182 | # some formats won't have a message id,
183 | # but hopefully the date/subject creates a unique
184 | # key in that case
185 | yield from unique_everseen(
186 | emails,
187 | key=lambda m: (
188 | m.subject_json,
189 | m.message_id_json,
190 | m.dt,
191 | ),
192 | )
193 |
194 |
195 | def try_decode_buf(buf: bytes) -> str:
196 | try:
197 | return buf.decode("utf-8")
198 | except UnicodeDecodeError:
199 | try:
200 | return buf.decode("iso-8859-1")
201 | except UnicodeDecodeError:
202 | return buf.decode("latin-1")
203 |
204 |
205 | def describe_person(p: Tuple[str, str]) -> str:
206 | """
207 | (
208 | "Person",
209 | "emailhere@gmail.com"
210 | )
211 | converts to
212 |     Person <emailhere@gmail.com>
213 | if there's no 'Person' text, it
214 | just becomes:
215 | emailhere@gmail.com
216 | """
217 | if p[0].strip():
218 | return f"{p[0]} <{p[1]}>"
219 | else:
220 | return p[1]
221 |
222 |
223 | def describe_persons(m: List[Tuple[str, str]]) -> str:
224 | """
225 |     >>> describe_persons([('Google', 'no-reply@accounts.google.com'), ('Github', 'no-reply@github.com')])
226 |     'Google <no-reply@accounts.google.com>, Github <no-reply@github.com>'
227 | """
228 | return ", ".join([describe_person(p) for p in m])
229 |
--------------------------------------------------------------------------------
/my/mail/imap.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses my locally synced IMAP email files, using mbsync
3 | https://isync.sourceforge.io/mbsync.html
4 | Uses https://github.com/SpamScope/mail-parser to parse the mail
5 | """
6 |
7 | REQUIRES = ["mail-parser", "dateparser"]
8 |
9 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
10 | from my.config import mail as user_config # type: ignore[attr-defined]
11 |
12 | from pathlib import Path
13 | from typing import (
14 | Iterator,
15 | Callable,
16 | Optional,
17 | List,
18 | )
19 |
20 |
21 | from dataclasses import dataclass
22 | from my.core import Stats, Paths, get_files, make_config
23 | from .common import Email, unique_mail
24 |
25 |
26 | @dataclass
27 | class imap_conf(user_config.imap):
28 |     # path[s]/glob to the individual email files -- searches recursively
29 | mailboxes: Paths
30 |
31 | # filter function which filters the input paths
32 | filter_path: Optional[Callable[[Path], bool]] = None
33 |
34 |
35 | config = make_config(imap_conf)
36 |
37 |
38 | def mailboxes() -> List[Path]:
39 | return list(get_files(config.mailboxes))
40 |
41 |
42 | def _files() -> Iterator[Path]:
43 | for box in mailboxes():
44 | for path in box.rglob("*"):
45 | if not path.is_file():
46 | continue
47 | if path.stem.startswith("."):
48 | continue
49 | yield path
50 |
51 |
52 | def files() -> Iterator[Path]:
53 | if config.filter_path is None:
54 | yield from _files()
55 | else:
56 | assert callable(config.filter_path)
57 | yield from filter(config.filter_path, _files())
58 |
59 |
60 | def raw_mail() -> Iterator[Email]:
61 | for m in map(Email.safe_parse_path, files()):
62 | if m is not None:
63 | yield m
64 |
65 |
66 | def mail() -> Iterator[Email]:
67 | yield from unique_mail(raw_mail())
68 |
69 |
70 | def stats() -> Stats:
71 | from my.core import stat
72 |
73 | return {**stat(mail)}
74 |
--------------------------------------------------------------------------------
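
For reference, a minimal sketch of the matching config block (the paths are illustrative, not a required layout):

    # in my/config/__init__.py
    class mail:
        class imap:
            # glob to the mbsync-synced mail directories
            mailboxes = "~/.mail/*"
            # optional: a Callable[[Path], bool] to skip some files/folders
            filter_path = None
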
/my/mail/mbox.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses local mbox files
3 | """
4 |
5 | REQUIRES = ["mail-parser", "dateparser"]
6 |
7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
8 | from my.config import mail as user_config # type: ignore[attr-defined]
9 |
10 | import mailbox
11 | from pathlib import Path
12 | from typing import List, Iterator, Optional, Sequence, IO, Any
13 |
14 | from dataclasses import dataclass
15 | from my.core import Stats, Paths, get_files
16 | from my.core import make_logger
17 |
18 | from .common import Email, unique_mail, try_decode_buf
19 |
20 |
21 | logger = make_logger(__name__)
22 |
23 |
24 | @dataclass
25 | class config(user_config.mbox):
26 | # path[s]/glob to the mbox file directory
27 | mailboxes: Paths
28 |
29 | # any additional extensions to ignore -- by default includes .msf, .dat, .log
30 | exclude_extensions: Optional[Sequence[str]] = None
31 |
32 |
33 | def mailboxes() -> List[Path]:
34 | return list(get_files(config.mailboxes))
35 |
36 |
37 | DEFAULT_EXCLUDED_EXTENSIONS = {
38 | ".msf",
39 | ".log",
40 | ".dat",
41 | }
42 |
43 |
44 | def files() -> Iterator[Path]:
45 | excluded_ext = set(DEFAULT_EXCLUDED_EXTENSIONS)
46 | if config.exclude_extensions:
47 | for ext in config.exclude_extensions:
48 | excluded_ext.add(ext)
49 |
50 | for box in mailboxes():
51 | for path in box.rglob("*"):
52 | if path.stem.startswith("."):
53 | continue
54 | if path.is_file():
55 | if path.suffix not in excluded_ext:
56 | yield path
57 |
58 |
59 | def _decode_msg(msg: IO[Any]) -> mailbox.mboxMessage:
60 | """
61 | Custom decode function
62 |
63 |     by default mailbox decodes messages as 'ascii', which can
64 |     raise fatal UnicodeDecodeErrors on non-ascii mail
65 | """
66 | msg_str = try_decode_buf(msg.read())
67 | return mailbox.mboxMessage(mailbox.Message(msg_str))
68 |
69 |
70 | def _iter_mailbox(file: Path) -> Iterator[Email]:
71 | mbox = mailbox.mbox(
72 | str(file),
73 | factory=_decode_msg,
74 | create=False,
75 | )
76 | mbox_itr = iter(mbox)
77 | while True:
78 | try:
79 | mbox_message = next(mbox_itr)
80 | email = Email.safe_parse(mbox_message, display_filename=file)
81 | if email is not None:
82 | email.filepath = file
83 | yield email
84 | except StopIteration:
85 | break
86 | except Exception as ex:
87 | logger.warning(
88 | f"Unexpected error while parsing {file}: {ex}... no way to continue parsing mbox file...",
89 | exc_info=ex,
90 | )
91 |
92 |
93 | def raw_mail() -> Iterator[Email]:
94 | for file in files():
95 |         assert file.exists()  # sanity check -- make sure we're not creating mboxes
96 | yield from _iter_mailbox(file)
97 |
98 |
99 | def mail() -> Iterator[Email]:
100 | yield from unique_mail(raw_mail())
101 |
102 |
103 | def stats() -> Stats:
104 | from my.core import stat
105 |
106 | return {**stat(mail)}
107 |
--------------------------------------------------------------------------------
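
A small usage sketch (assuming the mbox config above is set up): raw_mail yields every message parsed out of every mbox file, while mail wraps it in unique_mail so messages that show up in several mailboxes are only yielded once.

    from my.mail.mbox import raw_mail, mail

    total = sum(1 for _ in raw_mail())
    deduped = sum(1 for _ in mail())
    assert deduped <= total
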
/my/mail/parse_parts.py:
--------------------------------------------------------------------------------
1 | """
2 | Some helper functions/constants for parsing message subparts/ignoring certain content types
3 | """
4 |
5 | from typing import Iterator, Tuple, Set, Union, Any, Literal
6 | from email.message import Message
7 |
8 | # explicitly recognized content types that we don't try to extract text from; these are yielded with their raw content type
9 | IGNORED_CONTENT_TYPES = {
10 | "text/calendar",
11 | "application/ics",
12 | "application/pdf",
13 | "application/octet-stream",
14 | "application/octetstream",
15 | "text/csv",
16 | "application/json",
17 | "application/zip",
18 | "application/x-zip-compressed",
19 | "application/msword",
20 | "multipart/alternative",
21 | "application/postscript",
22 | "text/x-vcard",
23 | "multipart/parallel", # not sure what the best way to parse this is
24 | }
25 |
26 | IGNORED_CONTENT_PREFIXES: Set[str] = {
27 | "application/vnd",
28 | "application/x-apple",
29 | "application/x-iwork",
30 | "image",
31 | "audio",
32 | "video",
33 | }
34 |
35 |
36 | def get_message_parts(m: Message) -> Iterator[Message]:
37 | # since walk returns both multiparts and their children
38 | # we can ignore the multipart and return all individual parts
39 | #
40 | # if single type, it just returns the message itself
41 | for part in m.walk():
42 | if not part.is_multipart():
43 | yield part
44 |
45 |
46 | EmailText = Literal["html", "text"]
47 |
48 |
49 | EmailTextOrContentType = Union[EmailText, str]
50 |
51 |
52 | def tag_message_subparts(
53 | msg: Message,
54 | ) -> Iterator[Tuple[Any, EmailTextOrContentType]]:
55 | for message_part in get_message_parts(msg):
56 | content_type = message_part.get_content_type()
57 | payload = message_part.get_payload()
58 |
59 |         # known ignored content types -- yield with the raw content type and move on
60 |         if content_type in IGNORED_CONTENT_TYPES:
61 |             yield payload, content_type
62 |         # known ignored content prefixes -- same treatment
63 |         elif any(
64 |             content_type.startswith(prefix) for prefix in IGNORED_CONTENT_PREFIXES
65 |         ):
66 |             yield payload, content_type
67 |         # otherwise tag text parts as 'html'/'text'
68 |         elif content_type.startswith("text") and "html" in content_type:
69 |             yield payload, "html"
70 |         elif content_type == "text/plain":
71 |             yield payload, "text"
72 |         else:
73 |             # unknown content type -- yield it as-is so the caller can decide what to do
74 |             yield payload, content_type
75 |
--------------------------------------------------------------------------------
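
A small sketch of how the tagging behaves on a typical multipart/alternative message (constructed here with the stdlib just for illustration): the multipart container is dropped by get_message_parts, and the two leaf parts come back tagged 'text' and 'html'.

    from email.message import EmailMessage
    from my.mail.parse_parts import tag_message_subparts

    msg = EmailMessage()
    msg.set_content("plain body")
    msg.add_alternative("<p>html body</p>", subtype="html")
    print([tag for _, tag in tag_message_subparts(msg)])  # ['text', 'html']
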
/my/mal/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the data directory for my MAL export
3 | Uses https://github.com/purarue/malexport/
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/malexport"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import mal as user_config # type: ignore[attr-defined]
10 |
11 | from pathlib import Path
12 | from datetime import datetime
13 | from typing import Iterator, List, Tuple, NamedTuple, Optional
14 | from functools import lru_cache
15 |
16 | from dataclasses import dataclass
17 | from my.core import Stats, make_logger, PathIsh, make_config, get_files
18 | from my.core.structure import match_structure
19 |
20 | from malexport.paths import LocalDir
21 | from malexport.parse.combine import combine, AnimeData, MangaData
22 | from malexport.parse.forum import Post, iter_forum_posts
23 | from malexport.parse.friends import Friend, iter_friends
24 | from malexport.parse.messages import Thread, Message, iter_user_threads
25 | from malexport.parse.recover_deleted_entries import recover_deleted as rec_del, Approved
26 |
27 |
28 | @dataclass
29 | class mal_config(user_config.export):
30 | # path[s]/glob to the exported data
31 | export_path: PathIsh
32 |
33 | # this should be the top level directory, not the zip files or username directories
34 | # see https://github.com/purarue/malexport/#recover_deleted
35 | zip_backup_path: Optional[PathIsh] = None
36 |
37 |
38 | config = make_config(mal_config)
39 |
40 |
41 | logger = make_logger(__name__)
42 |
43 |
44 | # malexport supports multiple accounts
45 | # in its data directory structure
46 | @lru_cache(maxsize=1)
47 | def export_dirs() -> List[Path]:
48 | base: Path = Path(config.export_path).expanduser().absolute()
49 | with match_structure(base, expected="animelist.xml") as matches:
50 | return list(matches)
51 |
52 |
53 | Export = Tuple[List[AnimeData], List[MangaData]]
54 |
55 |
56 | @lru_cache(maxsize=2)
57 | def _read_malexport_aux(username: str, *, mtimes: Tuple[float, ...]) -> Export:
58 | logger.debug(f"reading {username}; cache miss: {mtimes}")
59 | return combine(username)
60 |
61 |
62 | def _read_malexport(username: str) -> Export:
63 | paths = LocalDir.from_username(username).data_dir.rglob("*")
64 | return _read_malexport_aux(
65 | username, mtimes=tuple(sorted(map(lambda f: f.stat().st_mtime, paths)))
66 | )
67 |
68 |
69 | @lru_cache(maxsize=None)
70 | def _find_deleted_aux(username: str, zips: Tuple[Path, ...]) -> Export:
71 | return rec_del(
72 | approved=Approved.parse_from_git_dir(),
73 | username=username,
74 | backups=list(zips),
75 | filter_with_activity=False,
76 | logger=logger,
77 | )
78 |
79 |
80 | def _find_deleted_inputs(username: str) -> Tuple[Path, ...]:
81 | if config.zip_backup_path is None:
82 | return tuple()
83 | directory_for_user: Path = Path(config.zip_backup_path) / username
84 | return get_files(directory_for_user, sort=True, glob="*.zip")
85 |
86 |
87 | def _find_deleted(username: str) -> Optional[Export]:
88 | return _find_deleted_aux(username, _find_deleted_inputs(username))
89 |
90 |
91 | ### Expose all the parsed information from malexport
92 |
93 |
94 | def anime() -> Iterator[AnimeData]:
95 | for path in export_dirs():
96 | anime, _ = _read_malexport(path.stem)
97 | yield from anime
98 |
99 |
100 | def manga() -> Iterator[MangaData]:
101 | for path in export_dirs():
102 | _, manga = _read_malexport(path.stem)
103 | yield from manga
104 |
105 |
106 | def deleted_anime() -> Iterator[AnimeData]:
107 | for path in export_dirs():
108 | if export := _find_deleted(path.stem):
109 | anime, _ = export
110 | yield from anime
111 |
112 |
113 | def deleted_manga() -> Iterator[MangaData]:
114 | for path in export_dirs():
115 | if export := _find_deleted(path.stem):
116 | _, manga = export
117 | yield from manga
118 |
119 |
120 | class Episode(NamedTuple):
121 | mal_id: int
122 | title: str
123 | episode: int
124 | at: datetime
125 |
126 |
127 | # use the combined data when reading history
128 | # since it removes entries you may have deleted
129 | # which still have local history files left over
130 | def episodes() -> Iterator[Episode]:
131 | for path in export_dirs():
132 | anime, _ = _read_malexport(path.stem)
133 | for a in anime:
134 | for h in a.history:
135 | yield Episode(
136 | mal_id=a.id,
137 | title=a.XMLData.title,
138 | episode=h.number,
139 | at=h.at,
140 | )
141 |
142 |
143 | class Chapter(NamedTuple):
144 | mal_id: int
145 | title: str
146 | chapter: int
147 | at: datetime
148 |
149 |
150 | def chapters() -> Iterator[Chapter]:
151 | for path in export_dirs():
152 | _, manga = _read_malexport(path.stem)
153 | for m in manga:
154 | for h in m.history:
155 | yield Chapter(
156 | mal_id=m.id,
157 | title=m.XMLData.title,
158 | chapter=h.number,
159 | at=h.at,
160 | )
161 |
162 |
163 | def posts() -> Iterator[Post]:
164 | for path in export_dirs():
165 | yield from iter_forum_posts(path.stem)
166 |
167 |
168 | def threads() -> Iterator[Thread]:
169 | for path in export_dirs():
170 | yield from iter_user_threads(path.stem)
171 |
172 |
173 | def messages() -> Iterator[Message]:
174 | for t in threads():
175 | yield from t.messages
176 |
177 |
178 | def friends() -> Iterator[Friend]:
179 | for path in export_dirs():
180 | yield from iter_friends(path.stem)
181 |
182 |
183 | def stats() -> Stats:
184 | from my.core import stat
185 |
186 | return {
187 | **stat(anime),
188 | **stat(manga),
189 | **stat(chapters),
190 | **stat(episodes),
191 | **stat(posts),
192 | **stat(friends),
193 | }
194 |
--------------------------------------------------------------------------------
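
The _read_malexport/_read_malexport_aux pair above is a small cache-busting trick: the expensive combine() call sits behind lru_cache, and the tuple of file mtimes is part of the cache key, so touching any file under the export directory invalidates the cached result. A generic sketch of the same pattern (expensive_parse is a hypothetical stand-in):

    from functools import lru_cache
    from pathlib import Path

    @lru_cache(maxsize=2)
    def _load_aux(name: str, *, mtimes: tuple) -> list:
        return expensive_parse(name)  # hypothetical expensive call

    def load(name: str, data_dir: Path) -> list:
        # any changed/added file produces a different mtimes tuple -> cache miss -> re-parse
        mtimes = tuple(sorted(p.stat().st_mtime for p in data_dir.rglob("*")))
        return _load_aux(name, mtimes=mtimes)
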
/my/minecraft/advancements.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses achievement data/timestamps from local minecraft worlds
3 | Copied from the ~/.minecraft directory, one for each world
4 | Backed up with:
5 | https://github.com/purarue/HPI-personal/blob/master/scripts/backup_minecraft_advancements
6 | """
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import minecraft as user_config # type: ignore[attr-defined]
10 |
11 | from dataclasses import dataclass
12 | from my.core import Paths
13 |
14 |
15 | @dataclass
16 | class config(user_config.advancements):
17 | # path[s]/glob to the backup directory
18 | export_path: Paths
19 |
20 |
21 | import json
22 | from pathlib import Path
23 | from typing import Sequence, NamedTuple, Iterator, List, Any, Dict
24 | from datetime import datetime
25 | from itertools import chain
26 |
27 | from my.core import get_files, Stats
28 | from my.core.structure import match_structure
29 |
30 | from more_itertools import unique_everseen
31 |
32 | EXPECTED = ("advancements",)
33 |
34 |
35 | def _advancement_json_files(world_dir: Path) -> List[Path]:
36 | d = (world_dir / "advancements").absolute()
37 | if not d.exists():
38 | return []
39 | return list(d.rglob("*.json"))
40 |
41 |
42 | def worlds() -> Sequence[Path]:
43 | found = []
44 | for f in get_files(config.export_path):
45 | with match_structure(f, EXPECTED) as match:
46 | for m in match:
47 | if _advancement_json_files(m):
48 | found.append(m.absolute())
49 | return found
50 |
51 |
52 | class Advancement(NamedTuple):
53 | advancement_id: str
54 | world_name: str
55 | dt: datetime
56 |
57 |
58 | Results = Iterator[Advancement]
59 |
60 |
61 | def advancements() -> Results:
62 | yield from unique_everseen(chain(*map(_parse_world, worlds())))
63 |
64 |
65 | DATE_REGEX = r"%Y-%m-%d %H:%M:%S %z"
66 |
67 |
68 | def _parse_world(world_dir: Path) -> Results:
69 | """
70 |     An example of a key/value pair this is trying to parse:
71 |
72 | "minecraft:nether/obtain_crying_obsidian": {
73 | "criteria": {
74 | "crying_obsidian": "2022-06-17 22:48:18 -0700"
75 | },
76 | "done": true
77 | },
78 | """
79 |
80 | for f in _advancement_json_files(world_dir):
81 | data = json.loads(f.read_text())
82 | for key, val in data.items():
83 |             # ignore advancements for crafting recipes
84 | # and random non-dict values (version numbers etc.)
85 | if key.startswith("minecraft:recipes") or not isinstance(val, dict):
86 | continue
87 | # if just a marker and not 'done', don't include
88 | if "done" in val and val["done"] is False:
89 | continue
90 | possible_date_blobs: List[Dict[Any, Any]] = [
91 | v for v in val.values() if isinstance(v, dict)
92 | ]
93 | for blob in possible_date_blobs:
94 | for datestr in filter(lambda s: isinstance(s, str), blob.values()):
95 | try:
96 | parsed_date = datetime.strptime(datestr, DATE_REGEX)
97 | except ValueError:
98 | continue
99 | yield Advancement(
100 | advancement_id=key, world_name=world_dir.stem, dt=parsed_date
101 | )
102 |
103 |
104 | def stats() -> Stats:
105 | from my.core import stat
106 |
107 | return {**stat(advancements)}
108 |
--------------------------------------------------------------------------------
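
The timestamps in those advancement blobs look like the "2022-06-17 22:48:18 -0700" string from the docstring above; a quick check of the format string used here:

    from datetime import datetime
    datetime.strptime("2022-06-17 22:48:18 -0700", "%Y-%m-%d %H:%M:%S %z")
    # -> 2022-06-17 22:48:18-07:00 (timezone-aware)
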
/my/mpv/history_daemon.py:
--------------------------------------------------------------------------------
1 | """
2 | Any Media being played on my computer with mpv
3 | Uses my mpv-history-daemon
4 | https://github.com/purarue/mpv-history-daemon
5 | """
6 |
7 | REQUIRES = ["git+https://github.com/purarue/mpv-history-daemon"]
8 |
9 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
10 | from my.config import mpv as user_config # type: ignore[attr-defined]
11 |
12 | from typing import Iterator, Sequence, Optional
13 | from dataclasses import dataclass
14 | from my.core import Paths, make_config
15 |
16 |
17 | @dataclass
18 | class mpv_config(user_config.history_daemon):
19 |     # glob to the JSON files that the daemon writes whenever I'm using mpv
20 | export_path: Paths
21 |
22 |     # fraction of a song I need to have listened to for it to qualify as a listen (e.g. 0.5, 0.75)
23 | require_percent: Optional[float] = None
24 |
25 |
26 | config = make_config(mpv_config)
27 |
28 |
29 | import itertools
30 | from pathlib import Path
31 |
32 | from mpv_history_daemon.events import (
33 | Media,
34 | all_history as M_all_history,
35 | _actually_listened_to,
36 | )
37 |
38 | from my.core import get_files, Stats, make_logger
39 |
40 |
41 | logger = make_logger(__name__)
42 |
43 | # monkey patch logs
44 | import mpv_history_daemon.events
45 |
46 | mpv_history_daemon.events.logger = mpv_history_daemon.events.setup_logger(
47 | name="mpv_history_events", level=logger.level
48 | )
49 |
50 | Results = Iterator[Media]
51 |
52 |
53 | def stats() -> Stats:
54 | from my.core import stat
55 |
56 | return {
57 | **stat(history),
58 | }
59 |
60 |
61 | def inputs() -> Sequence[Path]:
62 |     # this takes the files and sorts them so merged event files
63 |     # are returned first, then the individual event ones
64 |     # that keeps history roughly chronologically sorted (it may not be exact if you
65 |     # opened 2 mpv instances and listened to something while another was paused),
66 |     # because the merged files are ordered by keyname
67 | files = list(get_files(config.export_path, sort=True))
68 | groups = {
69 | k: list(g)
70 | for k, g in itertools.groupby(files, key=lambda f: "merged" in f.stem)
71 | }
72 | # merged files, then raw event files
73 | return list(itertools.chain(groups.get(True, []), groups.get(False, [])))
74 |
75 |
76 | def _filter_by(m: Media) -> bool:
77 | if m.is_stream:
78 | return True
79 | # if duration is under 10 minutes, but listen_time is over
80 | # 3 hours, probably a broken item, caused by hanging mpv/socket?
81 | # I only have 2 of these, in the 13,000 or so history items
82 | if m.media_duration is not None and m.media_duration < 600:
83 | if m.listen_time > 10800:
84 | logger.debug(f"Assuming this is a broken file: {str(m)}")
85 | return False
86 | perc = config.require_percent or 0.75
87 | # fallback to library func
88 | return _actually_listened_to(m, require_listened_to_percent=perc)
89 |
90 |
91 | def all_history() -> Results:
92 | yield from M_all_history(list(inputs()))
93 |
94 |
95 | def history() -> Results:
96 | yield from filter(_filter_by, all_history())
97 |
--------------------------------------------------------------------------------
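
To make the ordering in inputs() concrete, here is the same sort-then-group step on some made-up filenames; merged event files end up first, then the raw per-instance event files:

    import itertools

    files = ["1640000000000.json", "merged-2021.json", "merged-2022.json"]  # illustrative names
    files.sort()
    groups = {k: list(g) for k, g in itertools.groupby(files, key=lambda f: "merged" in f)}
    print(list(itertools.chain(groups.get(True, []), groups.get(False, []))))
    # ['merged-2021.json', 'merged-2022.json', '1640000000000.json']
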
/my/offline/listens.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses scrobbles from https://github.com/purarue/offline_listens
3 | """
4 |
5 | REQUIRES = ["git+https://github.com/purarue/offline_listens"]
6 |
7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
8 | from my.config import offline as user_config # type: ignore[attr-defined]
9 |
10 |
11 | from pathlib import Path
12 | from typing import Iterator, Sequence
13 |
14 | from offline_listens.listens import Listen
15 | from offline_listens.parse import iter_dir, parse_file
16 |
17 | from dataclasses import dataclass
18 | from my.core import get_files, Stats, Paths
19 |
20 |
21 | @dataclass
22 | class config(user_config.listens):
23 | # path[s]/glob to the exported data
24 | export_path: Paths
25 |
26 |
27 | def inputs() -> Sequence[Path]:
28 | return get_files(config.export_path)
29 |
30 |
31 | Results = Iterator[Listen]
32 |
33 |
34 | def history() -> Results:
35 | for f in inputs():
36 | if f.is_dir():
37 | yield from iter_dir(f)
38 | else:
39 | yield from parse_file(f)
40 |
41 |
42 | def stats() -> Stats:
43 | from my.core import stat
44 |
45 | return {**stat(history)}
46 |
--------------------------------------------------------------------------------
/my/piazza/scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses piazza posts scraped by
3 | https://github.com/purarue/piazza-scraper
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/piazza-scraper"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import piazza as user_config # type: ignore[attr-defined]
10 | from dataclasses import dataclass
11 | from my.core import Paths
12 |
13 |
14 | @dataclass
15 | class config(user_config.scraper):
16 | # path to the exported data
17 | export_path: Paths
18 |
19 |
20 | import os
21 | from pathlib import Path
22 | from typing import Iterator, Sequence, Optional
23 |
24 | from my.core import get_files, Stats
25 |
26 | from piazza_scraper.parse import Post, Export
27 |
28 |
29 | def inputs() -> Sequence[Path]:
30 | return get_files(config.export_path)
31 |
32 |
33 | def classes() -> Iterator[Export]:
34 | for file in inputs():
35 | yield Export.parse_file(file)
36 |
37 |
38 | def _all_posts() -> Iterator[Post]:
39 | for exp in classes():
40 | for post in exp.posts:
41 | yield from post.walk_posts()
42 |
43 |
44 | def posts() -> Iterator[Post]:
45 | """
46 | Infer my user id by checking the stats/users area
47 | Parse all posts, and return ones made by me
48 | """
49 | for exp in classes():
50 | # hmm -- it seems that I'm always the only user in this?
51 | # will check an envvar in case someone else has issues configuring this/has different results
52 | # feel free to open an issue
53 | user_id: Optional[str] = os.environ.get("PIAZZA_UID")
54 | if user_id is None:
55 | assert (
56 | len(exp.users) > 0
57 |             ), "Could not infer user id; set the PIAZZA_UID environment variable to your user's uid"
58 | user_id = exp.users[0].uid
59 |
60 | assert user_id is not None
61 | for post in exp.posts:
62 | yield from post.walk_posts_by_me(user_id)
63 |
64 |
65 | def stats() -> Stats:
66 | from my.core import stat
67 |
68 | return {
69 | **stat(posts),
70 | }
71 |
--------------------------------------------------------------------------------
/my/project_euler.py:
--------------------------------------------------------------------------------
1 | """
2 | When I completed https://projecteuler.net problems
3 |
4 | This information has to be updated manually; I do it once
5 | every few months/years, depending on how many of these I keep
6 | solving
7 |
8 | To download, log in to your Project Euler account
9 | (in your browser), and then go to:
10 | https://projecteuler.net/history
11 |
12 | That txt file is what this accepts as input (can accept multiple)
13 | """
14 |
15 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
16 | from my.config import project_euler as user_config # type: ignore[attr-defined]
17 |
18 | from dataclasses import dataclass
19 | from my.core import Paths
20 |
21 |
22 | @dataclass
23 | class config(user_config):
24 | # path[s]/glob to the .txt export files
25 | export_path: Paths
26 |
27 |
28 | import re
29 | import csv
30 | from pathlib import Path
31 | from datetime import datetime, timezone
32 | from typing import Sequence, Iterator, NamedTuple, Optional, List, Dict
33 | from itertools import chain, groupby
34 |
35 | from my.core import get_files, Stats
36 |
37 |
38 | class Solution(NamedTuple):
39 | problem: int
40 | dt: datetime
41 | name: Optional[str]
42 |
43 |
44 | def inputs() -> Sequence[Path]:
45 | return get_files(config.export_path)
46 |
47 |
48 | def history() -> Iterator[Solution]:
49 | # need to sort here to dedupe accurately
50 | items: List[Solution] = sorted(
51 | chain(*map(_parse_file, inputs())), key=lambda s: s.problem
52 | )
53 |     # group by problem number; if there are multiple entries, return the one with a name
54 |     # (or just the first entry if none of them have a name)
55 | grouped: Dict[int, List[Solution]] = {
56 | num: list(problems) for num, problems in groupby(items, lambda s: s.problem)
57 | }
58 | for items in grouped.values():
59 | for item in items:
60 | if item.name is not None:
61 | yield item
62 | break # break out of the inner loop
63 | else:
64 | # no name on item, just yield the first
65 | yield items[0]
66 |
67 |
68 | # Example line:
69 | # 037: 07 Nov 14 (13:46)
70 | # project euler was started in early 2000s,
71 | # so no need to support 19XX
72 | # '14' means 2014
73 | OLD_LINE_REGEX = re.compile(r"(\d+):\s*(\d+)\s*(\w+)\s*(\d+)\s*\((\d+):(\d+)\)")
74 |
75 | # hardcoding instead of using the calendar module to avoid possible issues with locale
76 | MONTHS = [
77 | "jan",
78 | "feb",
79 | "mar",
80 | "apr",
81 | "may",
82 | "jun",
83 | "jul",
84 | "aug",
85 | "sep",
86 | "oct",
87 | "nov",
88 | "dec",
89 | ]
90 |
91 |
92 | def _parse_file(p: Path) -> Iterator[Solution]:
93 | for line in p.open():
94 | m = OLD_LINE_REGEX.match(line)
95 | if m:
96 | # old format
97 | problem, day, month_desc, year_short, hour, minute = m.groups()
98 | month_lowered = month_desc.lower()
99 | assert month_lowered in MONTHS, f"Couldn't find {month_lowered} in {MONTHS}"
100 | # datetimes in the file are UTC time
101 | yield Solution(
102 | problem=int(problem),
103 | dt=datetime(
104 | year=int(f"20{year_short}"),
105 | month=MONTHS.index(month_lowered) + 1,
106 | day=int(day),
107 | hour=int(hour),
108 | minute=int(minute),
109 | tzinfo=timezone.utc,
110 | ),
111 | name=None,
112 | )
113 | else:
114 | # new format
115 | csv_reader = csv.reader([line])
116 | row = next(csv_reader)
117 | dt = datetime.strptime(row[0], "%d %b %y (%H:%M)")
118 | yield Solution(problem=int(row[1]), dt=dt, name=row[2])
119 |
120 |
121 | def stats() -> Stats:
122 | from my.core import stat
123 |
124 | return {**stat(history)}
125 |
--------------------------------------------------------------------------------
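
Using the example line from the comment above, the old-format regex pulls out the fields like this (the groups are then turned into a UTC datetime):

    import re

    OLD_LINE_REGEX = re.compile(r"(\d+):\s*(\d+)\s*(\w+)\s*(\d+)\s*\((\d+):(\d+)\)")
    print(OLD_LINE_REGEX.match("037: 07 Nov 14 (13:46)").groups())
    # ('037', '07', 'Nov', '14', '13', '46')  -> problem 37, 2014-11-07 13:46 UTC
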
/my/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/my/py.typed
--------------------------------------------------------------------------------
/my/rss/newsboat/git_history.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses when I added/removed newsboat subscriptions
3 | """
4 |
5 | REQUIRES = ["git+https://github.com/purarue/git_doc_history"]
6 |
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import rss as user_config # type: ignore[attr-defined]
10 |
11 |
12 | from pathlib import Path
13 | from datetime import datetime
14 | from typing import (
15 | Iterator,
16 | List,
17 | )
18 |
19 | from git_doc_history import (
20 | DocHistory,
21 | parse_snapshot_diffs,
22 | Diff,
23 | )
24 |
25 | from dataclasses import dataclass
26 | from my.core import Stats, PathIsh
27 |
28 |
29 | @dataclass
30 | class config(user_config.newsboat.git_history):
31 | # path to the git backup directory
32 | export_path: PathIsh
33 |
34 |
35 | RSS_FILES = ["urls"]
36 |
37 |
38 | def input() -> DocHistory:
39 | return DocHistory(
40 | backup_dir=Path(config.export_path).expanduser().absolute(),
41 | copy_files=RSS_FILES,
42 | )
43 |
44 |
45 | Results = Iterator[str]
46 |
47 |
48 | def _parse_buffer(buf: bytes) -> List[str]:
49 | return buf.decode("utf-8").strip().splitlines()
50 |
51 |
52 | def subscriptions() -> Results:
53 | yield from _parse_buffer(input().extract_buffer_at(RSS_FILES[0], at=datetime.now()))
54 |
55 |
56 | def events() -> Iterator[Diff]:
57 | yield from parse_snapshot_diffs(
58 | input(),
59 | file=RSS_FILES[0],
60 | )
61 |
62 |
63 | def stats() -> Stats:
64 | from my.core import stat
65 |
66 | return {
67 | **stat(subscriptions),
68 | **stat(events),
69 | }
70 |
--------------------------------------------------------------------------------
/my/runelite/screenshots.py:
--------------------------------------------------------------------------------
1 | """
2 | Extracts metadata from the automatic runelite (OldSchool RuneScape Client) screenshots
3 | that happen when you finish quests/gain levels
4 | https://github.com/runelite/runelite/wiki/Screenshot
5 | """
6 |
7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
8 | from my.config import runelite as user_config # type: ignore[attr-defined]
9 |
10 | from dataclasses import dataclass
11 | from my.core import Paths
12 |
13 |
14 | @dataclass
15 | class config(user_config.screenshots):
16 |     # path[s]/glob to the base screenshot directory or to each username's directory
17 | # this can be some rsynced folder (like my jobs/computer/runelite_screenshots.job does)
18 | # or the .runelite folder itself
19 | export_path: Paths
20 |
21 |
22 | import re
23 | from pathlib import Path
24 | from typing import Sequence, Union, NamedTuple, Iterator, Tuple
25 | from datetime import datetime
26 |
27 | from my.core import get_files, Stats
28 | from my.core.structure import match_structure
29 |
30 | EXPECTED = ("Levels", "Quests")
31 |
32 |
33 | def accounts() -> Sequence[Path]:
34 | accounts = []
35 | for f in get_files(config.export_path):
36 | with match_structure(f, EXPECTED) as match:
37 | accounts.extend(list(match))
38 | return accounts
39 |
40 |
41 | class Level(NamedTuple):
42 | skill: str
43 | level: int
44 |
45 |
46 | Description = Union[Level, str]
47 |
48 |
49 | class Screenshot(NamedTuple):
50 | """represents one screenshot (quest/level etc.)"""
51 |
52 | dt: datetime
53 | path: Path
54 | screenshot_type: str # Level/Quest etc
55 | description: Description
56 | username: str
57 |
58 |
59 | Results = Iterator[Screenshot]
60 |
61 |
62 | def screenshots() -> Results:
63 | for acc in accounts():
64 | for p in acc.iterdir():
65 | if p.is_dir():
66 | yield from _parse_subdir(p, username=acc.stem)
67 |
68 |
69 | DT_REGEX = r"%Y-%m-%d_%H-%M-%S"
70 |
71 | # TODO: use tz module to optionally figure out what timezone I was
72 | # when the file was created, so I can make sure the info in the filename
73 | # being a naive date isn't an issue if I'm ever in another timezone
74 |
75 |
76 | def _extract_info_from_filename(p: Path) -> Tuple[str, datetime]:
77 | desc, _, dstr = p.stem.rpartition(" ")
78 | return desc.strip(), datetime.strptime(dstr, DT_REGEX)
79 |
80 |
81 | def _parse_subdir(p: Path, username: str) -> Results:
82 | if p.stem == "Levels":
83 | yield from _parse_level_dir(p, username=username)
84 | elif p.stem == "Quests":
85 | yield from _parse_quest_dir(p, username=username)
86 | else:
87 | yield from _parse_other_dir(p, username=username)
88 |
89 |
90 | QUEST_REGEX = re.compile(r"^Quest\((.*?)\)$")
91 |
92 |
93 | def _parse_quest_dir(p: Path, username: str) -> Results:
94 | for img in p.rglob("*.png"):
95 | desc, dt = _extract_info_from_filename(img)
96 | m = re.match(QUEST_REGEX, desc)
97 | assert m, f"Couldn't extract quest name from {desc}"
98 | yield Screenshot(
99 | dt=dt,
100 | path=img,
101 | screenshot_type="Quest",
102 | description=m.group(1),
103 | username=username,
104 | )
105 |
106 |
107 | LEVEL_REGEX = re.compile(r"^([\w ]+)\((\d+)\)$")
108 |
109 |
110 | def _parse_level_dir(p: Path, username: str) -> Results:
111 | for img in p.rglob("*.png"):
112 | desc, dt = _extract_info_from_filename(img)
113 | m = re.match(LEVEL_REGEX, desc)
114 | assert m, f"Could not match levels out of {desc}"
115 | skill_name, level = m.groups()
116 | yield Screenshot(
117 | dt=dt,
118 | path=img,
119 | screenshot_type="Level",
120 | description=Level(skill=skill_name, level=int(level)),
121 | username=username,
122 | )
123 |
124 |
125 | def _parse_other_dir(p: Path, username: str) -> Results:
126 | for img in p.rglob("*.png"):
127 | desc, dt = _extract_info_from_filename(img)
128 | yield Screenshot(
129 | dt=dt, path=img, screenshot_type=p.stem, description=desc, username=username
130 | )
131 |
132 |
133 | def stats() -> Stats:
134 | from my.core import stat
135 |
136 | return {**stat(screenshots)}
137 |
--------------------------------------------------------------------------------
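
For a hypothetical screenshot name like "Attack(70) 2022-06-17_22-48-18.png" (the exact naming comes from runelite; this is just to show the split), _extract_info_from_filename rpartitions on the last space and the Levels regex then picks the description apart:

    import re
    from datetime import datetime
    from pathlib import Path

    stem = Path("Attack(70) 2022-06-17_22-48-18.png").stem
    desc, _, dstr = stem.rpartition(" ")
    print(datetime.strptime(dstr, "%Y-%m-%d_%H-%M-%S"))     # 2022-06-17 22:48:18 (naive)
    print(re.match(r"^([\w ]+)\((\d+)\)$", desc).groups())  # ('Attack', '70')
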
/my/scramble/history.py:
--------------------------------------------------------------------------------
1 | """
2 | Timed Rubiks Cube Solve History from multiple sources using
3 | https://github.com/purarue/scramble-history
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/scramble-history"]
7 |
8 | from pathlib import Path
9 | from typing import Optional
10 | from dataclasses import dataclass
11 | from my.core import PathIsh, make_config
12 |
13 | from my.config import scramble as user_config # type: ignore[attr-defined]
14 |
15 |
16 | @dataclass
17 | class scramble_config(user_config.history):
18 | config_dir: Optional[PathIsh] = None
19 |
20 |
21 | config = make_config(scramble_config)
22 |
23 | from typing import Iterator
24 |
25 | from scramble_history.__main__ import (
26 | scramble_history_config_dir,
27 | conf_name,
28 | sourcemap_name,
29 | )
30 | from scramble_history.config import parse_config_file
31 | from scramble_history.models import Solve
32 | from scramble_history.source_merger import merge as merge_solves
33 |
34 | config_dir = Path(config.config_dir or scramble_history_config_dir).expanduser()
35 |
36 |
37 | parsed_conf = parse_config_file(config_dir / conf_name)
38 |
39 |
40 | def solves() -> Iterator[Solve]:
41 | yield from merge_solves(
42 | sourcemap_file=config_dir / sourcemap_name, conf=parsed_conf
43 | )
44 |
--------------------------------------------------------------------------------
/my/skype/gdpr.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse Message Dates from Skype's GDPR JSON export
3 | """
4 |
5 | REQUIRES = ["dateparser"]
6 |
7 | # There isn't a lot of data here; it seems a lot of the old
8 | # data is gone. This only parses a couple of messages, but I might
9 | # as well use the datetimes for context on when I
10 | # was using skype
11 |
12 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
13 | from my.config import skype as user_config # type: ignore[attr-defined]
14 |
15 | from dataclasses import dataclass
16 | from my.core import Paths, Stats
17 |
18 |
19 | @dataclass
20 | class config(user_config.gdpr):
21 | # path[s]/glob to the skype JSON files
22 | export_path: Paths
23 |
24 |
25 | import json
26 | from pathlib import Path
27 | from datetime import datetime
28 | from typing import Iterator, Sequence
29 | from itertools import chain
30 |
31 | import dateparser
32 |
33 | from my.core import get_files, make_logger
34 |
35 | logger = make_logger(__name__)
36 |
37 |
38 | Results = Iterator[datetime]
39 |
40 |
41 | def inputs() -> Sequence[Path]:
42 | return get_files(config.export_path)
43 |
44 |
45 | def timestamps() -> Results:
46 | yield from chain(*map(_parse_file, inputs()))
47 |
48 |
49 | def _parse_file(post_file: Path) -> Results:
50 | items = json.loads(post_file.read_text())
51 | for conv in items["conversations"]:
52 | for msg in conv["MessageList"]:
53 | d = dateparser.parse(msg["originalarrivaltime"].rstrip("Z"))
54 | if d is not None:
55 | yield d
56 |
57 |
58 | def stats() -> Stats:
59 | from my.core import stat
60 |
61 | return {**stat(timestamps)}
62 |
--------------------------------------------------------------------------------
/my/spotify/gdpr.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the spotify GDPR Export
3 | """
4 |
5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
6 | from my.config import spotify as user_config # type: ignore[attr-defined]
7 |
8 | from dataclasses import dataclass
9 | from my.core import PathIsh, Stats
10 |
11 |
12 | @dataclass
13 | class config(user_config.gdpr):
14 | gdpr_dir: PathIsh # path to unpacked GDPR archive
15 |
16 |
17 | import os
18 | import json
19 | from datetime import date
20 | from pathlib import Path
21 | from typing import Iterator, Any, NamedTuple, List, Set, Tuple, Sequence, Optional
22 |
23 | from my.core import Res, get_files, make_logger, Json
24 |
25 | logger = make_logger(__name__)
26 |
27 |
28 | class Song(NamedTuple):
29 | name: str
30 | artist: str
31 | album: str
32 |
33 |
34 | class Playlist(NamedTuple):
35 | name: str
36 | last_modified: date
37 | songs: List[Song]
38 |
39 |
40 | Playlists = Iterator[Res[Playlist]]
41 | Songs = Iterator[Res[Song]]
42 |
43 |
44 | def inputs(gdpr_dir: Optional[PathIsh] = None) -> Sequence[Path]:
45 | chosen: PathIsh = gdpr_dir if gdpr_dir is not None else config.gdpr_dir
46 | echosen = Path(chosen).expanduser().absolute()
47 | return get_files(echosen, glob="*.json")
48 |
49 |
50 | def playlists() -> Playlists:
51 | gdpr_dir = str(Path(config.gdpr_dir).expanduser().absolute()) # expand path
52 | files = inputs(gdpr_dir)
53 | handler_map = {
54 | "Follow": None,
55 | "Inferences": None,
56 | "Payments": None,
57 | "Playlist": _filter_playlists,
58 | "StreamingHistory": None, # does not save any of the old play history, not worth parsing
59 | "Userdata": None,
60 | "YourLibrary": None,
61 | }
62 | for f in files:
63 | handler: Any
64 | for prefix, h in handler_map.items():
65 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)):
66 | continue
67 | handler = h
68 | break
69 | else:
70 | if f.is_dir():
71 | continue
72 | else:
73 | e = RuntimeError(f"Unhandled file: {f}")
74 | logger.debug(str(e))
75 | yield e
76 | continue
77 |
78 | if handler is None:
79 | # explicitly ignored
80 | continue
81 |
82 | if f.suffix != ".json":
83 | continue
84 |
85 | j = json.loads(f.read_text())
86 | yield from handler(j)
87 |
88 |
89 | def songs() -> Songs:
90 | emitted: Set[Tuple[str, str, str]] = set()
91 | for p in playlists():
92 | if isinstance(p, Exception):
93 | yield p
94 | continue
95 | for song in p.songs:
96 | key = (song.name, song.artist, song.album)
97 | if key in emitted:
98 | continue
99 | yield song
100 | emitted.add(key)
101 |
102 |
103 | def stats() -> Stats:
104 | from my.core import stat
105 |
106 | return {
107 | **stat(playlists),
108 | **stat(songs),
109 | }
110 |
111 |
112 | def _filter_playlists(d: Json) -> Iterator[Playlist]:
113 | # parse, then filter
114 | # make sure this playlist has more than one artist
115 |     # if it's just one artist, it's probably just an album
116 | # that's been classified as a playlist
117 | for p in _parse_all_playlists(d):
118 | if len(set([s.artist for s in p.songs])) > 1:
119 | yield p
120 |
121 |
122 | def _parse_all_playlists(d: Json) -> Iterator[Playlist]:
123 | for plist in d["playlists"]:
124 | if plist["numberOfFollowers"] > 50:
125 | logger.debug(
126 | f"Ignoring playlist: {plist['name']}, too many followers to be one of my playlists"
127 | )
128 | continue
129 | songs: List[Song] = [_parse_song(b) for b in plist["items"]]
130 | yield Playlist(
131 | name=plist["name"],
132 | last_modified=_parse_date(plist["lastModifiedDate"]),
133 | songs=songs,
134 | )
135 |
136 |
137 | def _parse_song(song_info: Json) -> Song:
138 | tr: Json = song_info["track"]
139 | return Song(
140 | name=tr["trackName"],
141 | artist=tr["artistName"],
142 | album=tr["albumName"],
143 | )
144 |
145 |
146 | def _parse_date(date_str: str) -> date:
147 | date_info: List[int] = list(map(int, date_str.split("-")))
148 | return date(year=date_info[0], month=date_info[1], day=date_info[2])
149 |
--------------------------------------------------------------------------------
/my/steam/scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses steam game/achievement data scraped with
3 | https://github.com/purarue/steamscraper
4 | """
5 |
6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
7 | from my.config import steam as user_config # type: ignore[attr-defined]
8 | from dataclasses import dataclass
9 | from my.core import Paths
10 |
11 |
12 | @dataclass
13 | class config(user_config.scraper):
14 | # path to the exported data
15 | export_path: Paths
16 |
17 |
18 | import json
19 | from functools import partial
20 | from pathlib import Path
21 | from datetime import datetime
22 | from typing import NamedTuple, Iterator, Sequence, Dict, List, Optional, Any
23 | from itertools import groupby
24 |
25 | from my.core import get_files, Stats, Res
26 | from my.utils.time import parse_datetime_sec
27 |
28 |
29 | def inputs() -> Sequence[Path]:
30 | return get_files(config.export_path)
31 |
32 |
33 | class Achievement(NamedTuple):
34 | title: str
35 | description: str
36 | achieved: bool
37 | game_name: str
38 | achieved_on: Optional[datetime]
39 | icon: Optional[str]
40 |
41 |
42 | class Game(NamedTuple):
43 | id: int
44 | name: str
45 | hours_played: float
46 | achievements: List[Achievement]
47 | image_url: Optional[str]
48 |
49 | @property
50 | def achieved(self) -> int:
51 | return list(map(lambda g: g.achieved, self.achievements)).count(True)
52 |
53 | @property
54 | def achievement_count(self) -> int:
55 | return len(self.achievements)
56 |
57 | @property
58 | def achievement_percentage(self) -> float:
59 | return self.achieved / self.achievement_count
60 |
61 |
62 | Results = Iterator[Res[Game]]
63 | AchievementResults = Iterator[Res[Achievement]]
64 |
65 |
66 | def games() -> Results:
67 | """only ones I've played"""
68 | for game in all_games():
69 | if isinstance(game, Exception):
70 | yield game
71 | else:
72 | if game.hours_played > 0.0:
73 | yield game
74 |
75 |
76 | def all_games() -> Results:
77 | # combine the results from multiple files
78 | games_no_exc: List[Game] = []
79 | for json_file in inputs():
80 | for g in _read_parsed_json(json_file):
81 | if isinstance(g, Exception):
82 | yield g
83 | else:
84 | assert isinstance(g, Game)
85 | games_no_exc.append(g)
86 |
87 |     # if there are duplicates, only return the single entry with the most achievements unlocked
88 | for _, gm in groupby(sorted(games_no_exc, key=lambda x: x.id), lambda x: x.id):
89 | yield max(gm, key=lambda gmo: gmo.achieved)
90 |
91 |
92 | def all_achievements() -> AchievementResults:
93 | # combine the results from multiple achievement lists
94 | for game in all_games():
95 | if isinstance(game, Exception):
96 | yield game
97 | else:
98 | yield from game.achievements
99 |
100 |
101 | # only ones which I've actually achieved
102 | def achievements() -> AchievementResults:
103 | for ach in all_achievements():
104 | if isinstance(ach, Exception):
105 | yield ach
106 | else:
107 | if ach.achieved:
108 | yield ach
109 |
110 |
111 | def _read_parsed_json(p: Path) -> Results:
112 | items = json.loads(p.read_text())
113 | for _, game in items.items():
114 | ach_lambda = partial(_parse_achievement, game_name=game["name"])
115 | try:
116 | yield Game(
117 | id=game["id"],
118 | name=game["name"],
119 | hours_played=game["hours"],
120 | image_url=game["image"],
121 | achievements=list(map(ach_lambda, game["achievements"])),
122 | )
123 | except TypeError as e:
124 | # error creating datetime?
125 | yield e
126 |
127 |
128 | def _parse_achievement(ach: Dict[str, Any], game_name: str) -> Achievement:
129 | achieved = ach["progress"]["unlocked"]
130 | achieved_on = None
131 | # parse datetime if it has it
132 |     # could possibly throw an error, but it's caught above
133 | if achieved:
134 | achieved_on = parse_datetime_sec(ach["progress"]["data"])
135 | return Achievement(
136 | title=ach["title"],
137 | description=ach["description"],
138 | game_name=game_name,
139 | achieved=achieved,
140 | achieved_on=achieved_on,
141 | icon=ach.get("icon"),
142 | )
143 |
144 |
145 | def stats() -> Stats:
146 | from my.core import stat
147 |
148 | return {
149 | **stat(games),
150 | **stat(achievements),
151 | }
152 |
--------------------------------------------------------------------------------
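
The duplicate handling in all_games() (the same game id appearing in multiple export files) boils down to sort by id, group, and keep the entry with the most achievements unlocked; a stripped-down version of that step on made-up (id, achieved) pairs:

    from itertools import groupby

    games = [(440, 10), (570, 3), (440, 12)]  # illustrative (id, achieved) pairs
    print([max(g, key=lambda x: x[1]) for _, g in groupby(sorted(games), key=lambda x: x[0])])
    # [(440, 12), (570, 3)]
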
/my/todotxt/active.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses your active todotxt (http://todotxt.org/) done.txt and todo.txt
3 | """
4 |
5 | REQUIRES = ["pytodotxt>=1.5.0"]
6 |
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import todotxt as user_config # type: ignore[attr-defined]
10 |
11 |
12 | from pathlib import Path
13 | from typing import (
14 | Tuple,
15 | Iterator,
16 | )
17 |
18 | from dataclasses import dataclass
19 | from my.core import Stats, PathIsh
20 | from .common import Todo, TODOTXT_FILES, parse_todotxt_buffer
21 |
22 |
23 | @dataclass
24 | class config(user_config.active):
25 | # path to your active todo.txt directory
26 | # this is the same place todo.sh stores your files
27 | export_path: PathIsh
28 |
29 |
30 | def inputs() -> Tuple[Path, Path]:
31 | p = Path(config.export_path).expanduser().absolute()
32 | if not p.exists():
33 | raise FileNotFoundError(f"todotxt export path {p} doesn't exist")
34 | # todo.txt, done.txt
35 | return (
36 | p / TODOTXT_FILES[0],
37 | p / TODOTXT_FILES[1],
38 | )
39 |
40 |
41 | Results = Iterator[Todo]
42 |
43 |
44 | def done() -> Results:
45 | df = inputs()[1]
46 |     if not df.exists():
47 |         return
48 |     yield from parse_todotxt_buffer(df.read_text())
49 |
50 |
51 | def todos() -> Results:
52 | tf = inputs()[0]
53 | if not tf.exists():
54 | return
55 | yield from parse_todotxt_buffer(tf.read_text())
56 |
57 |
58 | def stats() -> Stats:
59 | from my.core import stat
60 |
61 | return {
62 | **stat(todos),
63 | **stat(done),
64 | }
65 |
--------------------------------------------------------------------------------
/my/todotxt/common.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, cast, Optional, List, Union
2 | from datetime import datetime
3 |
4 | from pytodotxt import Task, TodoTxtParser # type: ignore[import]
5 | from my.core import __NOT_HPI_MODULE__ # noqa: F401
6 |
7 | REQUIRES = ["pytodotxt>=1.5.0"]
8 |
9 |
10 | class Todo(Task):
11 | # support serializing with hpi query
12 | def _serialize(self) -> Dict[str, Any]:
13 | assert self._raw is not None
14 | return {
15 | "completed": self.is_completed,
16 | "completion_date": self.completion_date,
17 | "deadline": self.deadline,
18 | "creation_date": self.creation_date,
19 | "priority": self.priority,
20 | "text": self.bare_description(),
21 | "projects": self.projects,
22 | "contexts": self.contexts,
23 | "attributes": self.attributes,
24 | "raw": self._raw,
25 | }
26 |
27 | @property
28 | def bare(self) -> str:
29 | return cast(str, self.bare_description())
30 |
31 | # parse the deadline created by https://github.com/purarue/full_todotxt
32 | # this is optional, so if it fails, just return None
33 | @property
34 | def deadline(self) -> Optional[datetime]:
35 | attrs = self.attributes
36 | if not attrs:
37 | return None
38 | if not isinstance(attrs, dict):
39 | return None
40 | if "deadline" in attrs:
41 | try:
42 | data = attrs["deadline"][0]
43 | parsed = datetime.strptime(data, "%Y-%m-%dT%H-%M%z")
44 | return parsed
45 | except ValueError:
46 | pass
47 | return None
48 |
49 | def __eq__(self, other: Any) -> bool:
50 | if not isinstance(other, Task):
51 | return False
52 | return cast(bool, self._raw == other._raw)
53 |
54 | def __ne__(self, other: Any) -> bool:
55 | return not self.__eq__(other)
56 |
57 | def __hash__(self) -> int:
58 | return hash(self._raw)
59 |
60 |
61 | TODOTXT_FILES = ["todo.txt", "done.txt"]
62 |
63 |
64 | def parse_todotxt_buffer(data: Union[str, bytes]) -> List[Todo]:
65 | return cast(List[Todo], TodoTxtParser(task_type=Todo).parse(data))
66 |
--------------------------------------------------------------------------------
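
A tiny sketch of parse_todotxt_buffer on a standard todo.txt line (the line itself is illustrative); the resulting Todo exposes the same fields _serialize uses above:

    todos = parse_todotxt_buffer("(A) 2022-01-01 write docs +hpi @computer\n")
    t = todos[0]
    print(t.priority, t.projects, t.contexts)  # priority / +projects / @contexts
    print(t.bare)                              # the description without the tags
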
/my/todotxt/git_history.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses todotxt (http://todotxt.org/) done.txt and todo.txt history
3 | from https://github.com/purarue/git_doc_history backups
4 | """
5 |
6 | REQUIRES = [
7 | "pytodotxt>=1.5.0",
8 | "git+https://github.com/purarue/git_doc_history",
9 | ]
10 |
11 |
12 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
13 | from my.config import todotxt as user_config # type: ignore[attr-defined]
14 |
15 |
16 | from pathlib import Path
17 | from datetime import datetime, timezone
18 | from typing import Iterator
19 |
20 | from git_doc_history import DocHistory, parse_snapshot_diffs, Action
21 |
22 | from dataclasses import dataclass
23 | from my.core import Stats, PathIsh
24 | from .common import Todo, TODOTXT_FILES, parse_todotxt_buffer
25 |
26 |
27 | @dataclass
28 | class config(user_config.git_history):
29 | # path to the git backup directory
30 | export_path: PathIsh
31 |
32 |
33 | def input() -> DocHistory:
34 | return DocHistory(
35 | backup_dir=Path(config.export_path).expanduser().absolute(),
36 | copy_files=TODOTXT_FILES,
37 | )
38 |
39 |
40 | Results = Iterator[Todo]
41 |
42 |
43 | # These work by grabbing the latest version of the file
44 | # from the git repo, so they may not always be up to date
45 | # if you don't update git_doc_history often enough
46 | def done() -> Results:
47 | yield from parse_todotxt_buffer(
48 | input().extract_buffer_at("done.txt", at=datetime.now())
49 | )
50 |
51 |
52 | def todos() -> Results:
53 | yield from parse_todotxt_buffer(
54 | input().extract_buffer_at("todo.txt", at=datetime.now())
55 | )
56 |
57 |
58 | @dataclass
59 | class TodoEvent:
60 | todo: Todo
61 | dt: datetime
62 | action: Action
63 |
64 |
65 | def events() -> Iterator[TodoEvent]:
66 | """
67 |     Keeps track of when I added/completed todos
68 | """
69 | for diff in parse_snapshot_diffs(
70 | input(),
71 | file="todo.txt",
72 | parse_func=lambda doc: parse_todotxt_buffer(doc.data),
73 | ):
74 | yield TodoEvent(
75 | todo=diff.data,
76 | dt=datetime.fromtimestamp(diff.epoch_time, tz=timezone.utc),
77 | action=diff.action,
78 | )
79 |
80 |
81 | def stats() -> Stats:
82 | from my.core import stat
83 |
84 | return {
85 | **stat(todos),
86 | **stat(done),
87 | **stat(events),
88 | }
89 |
--------------------------------------------------------------------------------
/my/trakt/export.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the dump of my movies/tv shows history and watchlist from https://trakt.tv/
3 | Uses https://github.com/purarue/traktexport
4 | """
5 |
6 | REQUIRES = ["git+https://github.com/purarue/traktexport"]
7 |
8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
9 | from my.config import trakt as user_config # type: ignore[attr-defined]
10 |
11 | from pathlib import Path
12 | from typing import Iterator, Dict, Any, Sequence, List
13 | from functools import lru_cache
14 |
15 | import traktexport.dal as D
16 | from traktexport.merge import read_and_merge_exports
17 |
18 | from dataclasses import dataclass
19 | from my.core import get_files, Stats, make_logger, Paths
20 | from my.core.cachew import mcachew
21 |
22 |
23 | @dataclass
24 | class config(user_config.export):
25 |     # path[s]/glob to the exported data. These are the resulting json files from 'traktexport export'
26 | export_path: Paths
27 |
28 |
29 | logger = make_logger(__name__)
30 |
31 |
32 | def inputs() -> Sequence[Path]:
33 | return get_files(config.export_path)
34 |
35 |
36 | def _cachew_depends_on() -> List[float]:
37 | return [Path(f).lstat().st_mtime for f in sorted(inputs())]
38 |
39 |
40 | @lru_cache(maxsize=None)
41 | def _read_trakt_exports() -> D.FullTraktExport:
42 | return read_and_merge_exports(list(map(str, inputs())))
43 |
44 |
45 | ### Expose all the parsed information from traktexport.dal
46 |
47 |
48 | def profile_stats() -> Dict[str, Any]:
49 | # read the 'stats' key directly from the JSON file
50 | return _read_trakt_exports().stats
51 |
52 |
53 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
54 | def followers() -> Iterator[D.Follow]:
55 | yield from _read_trakt_exports().followers
56 |
57 |
58 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
59 | def likes() -> Iterator[D.Like]:
60 | yield from _read_trakt_exports().likes
61 |
62 |
63 | # TODO: hmm, cachew seems to fail with this one, not sure why
64 | # @mcachew(depends_on=_cachew_depends_on, logger=logger)
65 | def watchlist() -> Iterator[D.WatchListEntry]:
66 | yield from _read_trakt_exports().watchlist
67 |
68 |
69 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
70 | def ratings() -> Iterator[D.Rating]:
71 | yield from _read_trakt_exports().ratings
72 |
73 |
74 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
75 | def history() -> Iterator[D.HistoryEntry]:
76 | yield from _read_trakt_exports().history
77 |
78 |
79 | def stats() -> Stats:
80 | from my.core import stat
81 |
82 | return {
83 | **stat(followers),
84 | **stat(likes),
85 | **stat(watchlist),
86 | **stat(ratings),
87 | **stat(history),
88 | }
89 |
--------------------------------------------------------------------------------
/my/ttt.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses history from https://github.com/purarue/ttt
3 | """
4 |
5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
6 | from my.config import ttt as user_config # type: ignore[attr-defined]
7 |
8 | import csv
9 | from pathlib import Path
10 | from datetime import datetime
11 | from io import StringIO
12 | from typing import (
13 | NamedTuple,
14 | Iterator,
15 | Sequence,
16 | Optional,
17 | )
18 | from itertools import chain
19 | from functools import partial
20 |
21 | from more_itertools import unique_everseen
22 |
23 | from dataclasses import dataclass
24 | from my.core import get_files, Stats, Paths, make_logger
25 | from my.utils.time import parse_datetime_sec
26 | from my.utils.parse_csv import parse_csv_file
27 |
28 | logger = make_logger(__name__)
29 |
30 |
31 | @dataclass
32 | class config(user_config):
33 | # path[s]/glob to the backed up ttt history files
34 | # (can be a list if you want to provide the live file)
35 | export_path: Paths
36 |
37 |
38 | def inputs() -> Sequence[Path]:
39 | return get_files(config.export_path)
40 |
41 |
42 | # represents one history entry (command)
43 | class Entry(NamedTuple):
44 | dt: datetime
45 | command: str
46 | directory: Optional[str]
47 |
48 |
49 | Results = Iterator[Entry]
50 |
51 |
52 | def history() -> Results:
53 | func = partial(parse_csv_file, parse_function=_parse_text, logger=logger)
54 | yield from unique_everseen(
55 | chain(*map(func, inputs())),
56 | key=lambda e: (
57 | e.dt,
58 | e.command,
59 | ),
60 | )
61 |
62 |
63 | def _parse_text(data: str) -> Results:
64 | csv_reader = csv.reader(
65 | StringIO(data), delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
66 | )
67 | for row in csv_reader:
68 | yield Entry(
69 | dt=parse_datetime_sec(row[0]),
70 | command=row[2],
71 | directory=None if row[1] == "-" else row[1],
72 | )
73 |
74 |
75 | def stats() -> Stats:
76 | from my.core import stat
77 |
78 | return {**stat(history)}
79 |
--------------------------------------------------------------------------------
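
Each row in a ttt history file is just timestamp,directory,command (the row below is made up); _parse_text turns it into an Entry, mapping '-' in the directory column to None:

    import csv
    from io import StringIO

    row = next(csv.reader(StringIO('1640000000,-,"echo hello"')))
    # row[0] -> epoch seconds, row[1] -> directory ('-' means none), row[2] -> the command
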
/my/twitch/all.py:
--------------------------------------------------------------------------------
1 | from .common import Results
2 |
3 |
4 | def events() -> Results:
5 | # comment out any sources you're not using
6 | from .gdpr import events as gdpr_events
7 | from .overrustle_logs import events as chatlog_events
8 |
9 | yield from chatlog_events()
10 | yield from gdpr_events()
11 |
12 |
13 | from my.core import Stats
14 |
15 |
16 | def stats() -> Stats:
17 | from my.core import stat
18 |
19 | return {**stat(events)}
20 |
--------------------------------------------------------------------------------
/my/twitch/common.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import NamedTuple, Union, Iterator
3 |
4 | from my.core import __NOT_HPI_MODULE__ # noqa: F401
5 |
6 |
7 | class Event(NamedTuple):
8 | event_type: str
9 | dt: datetime
10 | channel: str
11 | # e.g., additional data/chatlog message
12 | context: Union[str, int]
13 |
14 |
15 | Results = Iterator[Event]
16 |
--------------------------------------------------------------------------------
/my/twitch/gdpr.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses the twitch GDPR data request
3 | https://www.twitch.tv/p/en/legal/privacy-choices/#user-privacy-requests
4 | """
5 |
6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
7 | from my.config import twitch as user_config # type: ignore[attr-defined]
8 |
9 | from dataclasses import dataclass
10 | from my.core import PathIsh
11 |
12 |
13 | @dataclass
14 | class config(user_config.gdpr):
15 | gdpr_dir: PathIsh # path to unpacked GDPR archive
16 |
17 |
18 | import csv
19 | from datetime import datetime
20 | from pathlib import Path
21 | from typing import Iterator, Union, Sequence, List
22 |
23 | from .common import Event, Results
24 |
25 | from my.core import make_logger
26 | from my.core.cachew import mcachew
27 | from my.core.common import get_files
28 |
29 | logger = make_logger(__name__)
30 |
31 |
32 | def inputs() -> Sequence[Path]:
33 | return get_files(config.gdpr_dir, glob="*.csv")
34 |
35 |
36 | def _cachew_depends_on() -> List[float]:
37 | return [p.stat().st_mtime for p in inputs()]
38 |
39 |
40 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
41 | def events() -> Results:
42 | for file in inputs():
43 | yield from _parse_csv_file(file)
44 |
45 |
46 | def _parse_csv_file(p: Path) -> Iterator[Event]:
47 | with p.open("r") as f:
48 | reader = csv.reader(f)
49 | next(reader) # ignore header
50 | for line in reader:
51 | context: Union[str, int]
52 | context = line[6]
53 | if context.isdigit():
54 | context = int(line[6])
55 | yield Event(
56 | event_type=line[0],
57 | dt=datetime.fromisoformat(line[1]),
58 | channel=line[5],
59 | context=context,
60 | )
61 |
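62 | # Example (hypothetical row; the indices follow _parse_csv_file above):
63 | # a CSV line whose columns 0, 1, 5 and 6 are
64 | #   watch, 2020-07-14T02:17:51+00:00, somechannel, 120
65 | # yields
66 | #   Event(event_type="watch", dt=datetime(2020, 7, 14, 2, 17, 51, tzinfo=timezone.utc), channel="somechannel", context=120)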
--------------------------------------------------------------------------------
/my/twitch/overrustle_logs.py:
--------------------------------------------------------------------------------
1 | """
2 | Reads parsed information from the overrustle logs dump
3 | https://github.com/purarue/overrustle_parser
4 | """
5 |
6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
7 | from my.config import twitch as user_config # type: ignore[attr-defined]
8 |
9 | from dataclasses import dataclass
10 | from my.core import Paths
11 |
12 |
13 | @dataclass
14 | class config(user_config.overrustle):
15 | export_path: Paths # parsed overrustle_parser json files
16 |
17 |
18 | import json
19 | from pathlib import Path
20 | from typing import Sequence, List
21 |
22 | from my.core import make_logger
23 | from my.core.cachew import mcachew
24 | from my.core.common import get_files
25 | from my.utils.time import parse_datetime_sec
26 |
27 | from .common import Event, Results
28 |
29 | logger = make_logger(__name__)
30 |
31 |
32 | def inputs() -> Sequence[Path]:
33 | return get_files(config.export_path)
34 |
35 |
36 | def _cachew_depends_on() -> List[float]:
37 | return [p.stat().st_mtime for p in inputs()]
38 |
39 |
40 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
41 | def events() -> Results:
42 | for file in inputs():
43 | yield from _parse_json_dump(file)
44 |
45 |
46 | def _parse_json_dump(p: Path) -> Results:
47 | for blob in json.loads(p.read_text()):
48 | yield Event(
49 | event_type="chatlog",
50 | dt=parse_datetime_sec(blob["dt"]),
51 | channel=blob["channel"],
52 | context=blob["message"],
53 | )
54 |
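55 | # Example (hypothetical blob from a parsed overrustle_parser JSON file):
56 | #   {"dt": 1594693071, "channel": "somechannel", "message": "hello chat"}
57 | # yields
58 | #   Event(event_type="chatlog", dt=datetime(2020, 7, 14, 2, 17, 51, tzinfo=timezone.utc), channel="somechannel", context="hello chat")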
--------------------------------------------------------------------------------
/my/utils/backup_to/__main__.py:
--------------------------------------------------------------------------------
1 | from os import environ, path
2 | from pathlib import Path
3 |
4 | import click
5 |
6 | from my.core import __NOT_HPI_MODULE__ # noqa: F401
7 |
8 | # if the HPIDATA environment variable is set (it points to my data directory),
9 | # use that; otherwise default to ~/data
10 | BASE_PREFIX: Path = Path(environ.get("HPIDATA", path.expanduser("~/data")))
11 |
12 |
13 | def get_dir(name: str) -> Path:
14 | to = (BASE_PREFIX / name).absolute()
15 | to.mkdir(parents=True, exist_ok=True)
16 | return to
17 |
18 |
19 | @click.command()
20 | @click.argument("NAME")
21 | def main(name: str) -> None:
22 | """
23 | Helper script to locate a directory to back up to
24 | """
25 | click.echo(str(get_dir(name)))
26 |
27 |
28 | if __name__ == "__main__":
29 | main(prog_name="backup_to")
30 |
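31 | # Example usage (hypothetical paths): with HPIDATA unset and HOME=/home/username,
32 | #   python3 -m my.utils.backup_to zsh
33 | # creates /home/username/data/zsh (if missing) and prints that path, so a backup
34 | # job can do something like: cp ~/.zsh_history "$(backup_to zsh)"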
--------------------------------------------------------------------------------
/my/utils/parse_csv.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import logging
3 |
4 | from pathlib import Path
5 | from typing import Callable, Iterator, TypeVar, Optional
6 |
7 | T = TypeVar("T")
8 |
9 |
10 | def parse_csv_file(
11 | histfile: Path,
12 | parse_function: Callable[[str], Iterator[T]],
13 | logger: Optional[logging.Logger] = None,
14 | ) -> Iterator[T]:
15 | """
16 | Parses a CSV file using parse_function, yielding results from that function.
17 |
18 | If the CSV file contains NUL bytes, replace those and try again.
19 | """
20 | with histfile.open("r", encoding="utf-8", newline="") as f:
21 | data = f.read()
22 | try:
23 | yield from parse_function(data)
24 | except (csv.Error, ValueError) as e:
25 | if "\0" not in data:
26 | raise RuntimeError(f"Could not parse {histfile}: {e}") from e
27 | else:
28 | if logger:
29 | logger.warning("Found NUL byte in %s: %s", histfile, e)
30 | yield from parse_function(data.replace("\0", ""))
31 |
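32 | # Example usage (hypothetical caller; any Callable[[str], Iterator[T]] works):
33 | #   entries = list(parse_csv_file(Path("backup.csv"), parse_function=my_parse_function))
34 | # see my.ttt, which passes its _parse_text function as the parse_function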
--------------------------------------------------------------------------------
/my/utils/time.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from datetime import datetime, timezone
3 |
4 | from my.core import __NOT_HPI_MODULE__ # noqa: F401
5 |
6 | # TODO: maybe this should be PR'd to master/put into
7 | # my.time.tz/utils?
8 |
9 |
10 | def parse_datetime_sec(d: Union[str, float, int]) -> datetime:
11 | return datetime.fromtimestamp(int(d), tz=timezone.utc)
12 |
13 |
14 | def parse_datetime_millis(d: Union[str, float, int]) -> datetime:
15 | return parse_datetime_sec(int(d) / 1000)
16 |
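17 | # e.g. parse_datetime_sec(1594693071) and parse_datetime_millis(1594693071000)
18 | # both return datetime(2020, 7, 14, 2, 17, 51, tzinfo=timezone.utc)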
--------------------------------------------------------------------------------
/my/zsh.py:
--------------------------------------------------------------------------------
1 | """
2 | Parses ZSH history (uses exports from ./job/zsh_history.job) and current zsh history (from $ZDOTDIR)
3 |
4 | This parses the zsh history format I've configured; zsh is heavily configurable.
5 | Mine looks like:
6 | : 1598471925:470;python3
7 | : <datetime>:<duration>;<command>
8 |
9 | My config looks like:
10 |
11 | HISTFILE="${ZDOTDIR}/.zsh_history"
12 | HISTSIZE=1000000
13 | SAVEHIST=1000000
14 | setopt APPEND_HISTORY # append to history file instead of replacing
15 | setopt HIST_REDUCE_BLANKS # delete empty lines from history file
16 | setopt HIST_IGNORE_SPACE # ignore lines that start with space
17 | setopt HIST_NO_STORE # Do not add history and fc commands to the history
18 | setopt EXTENDED_HISTORY # save time/duration to history file
19 | """
20 |
21 | # if on multiple computers, the zsh histories can be copied into the zsh.export_path
22 | # and it will merge everything without duplicates
23 |
24 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
25 | from my.config import zsh as user_config # type: ignore[attr-defined]
26 |
27 | from pathlib import Path
28 | from typing import Sequence, Optional
29 | from functools import lru_cache
30 |
31 | from dataclasses import dataclass
32 | from my.core import (
33 | get_files,
34 | warn_if_empty,
35 | Stats,
36 | make_logger,
37 | PathIsh,
38 | Paths,
39 | )
40 | from my.core.cachew import mcachew
41 | from my.core.warnings import low
42 | from my.utils.time import parse_datetime_sec
43 |
44 | from more_itertools import unique_everseen
45 |
46 |
47 | @dataclass
48 | class config(user_config):
49 | # path[s]/glob to the exported zsh history files
50 | export_path: Paths
51 |
52 | # path to current zsh history (i.e. the live file)
53 | live_file: Optional[PathIsh]
54 |
55 |
56 | logger = make_logger(__name__)
57 |
58 |
59 | def backup_inputs() -> Sequence[Path]:
60 | return list(get_files(config.export_path))
61 |
62 |
63 | @lru_cache(1)
64 | def _live_file() -> Optional[Path]:
65 | if config.live_file is not None:
66 | p: Path = Path(config.live_file).expanduser().absolute()
67 | if p.exists():
68 | return p
69 | else:
70 | low(f"'live_file' provided {config.live_file} but that file doesn't exist.")
71 | return None
72 | return None
73 |
74 |
75 | import re
76 |
77 | from datetime import datetime
78 | from typing import NamedTuple, Iterator, Tuple
79 | from itertools import chain
80 |
81 |
82 | # represents one history entry (command)
83 | class Entry(NamedTuple):
84 | dt: datetime
85 | duration: int
86 | command: str
87 |
88 |
89 | Results = Iterator[Entry]
90 |
91 |
92 | def history() -> Results:
93 | lf = _live_file()
94 | if lf is not None:
95 | yield from _merge_histories(_history_from_backups(), _parse_file(lf))
96 | else:
97 | # if we're not merging in the live history file,
98 | # we don't need to spend time on the additional _merge_histories call
99 | yield from _history_from_backups()
100 |
101 |
102 | def _cachew_depends_on() -> Sequence[Path]:
103 | return sorted(backup_inputs())
104 |
105 |
106 | @mcachew(depends_on=_cachew_depends_on, logger=logger)
107 | def _history_from_backups() -> Results:
108 | yield from _merge_histories(*map(_parse_file, backup_inputs()))
109 |
110 |
111 | @warn_if_empty
112 | def _merge_histories(*sources: Results) -> Results:
113 | yield from unique_everseen(
114 | chain(*sources),
115 | key=lambda e: (
116 | e.dt,
117 | e.command,
118 | ),
119 | )
120 |
121 |
122 | def _parse_file(histfile: Path) -> Results:
123 | dt: Optional[datetime] = None
124 | dur: Optional[int] = None
125 | command: str = ""
126 | # can't parse line by line since some commands are multiline
127 | # sort of structured like a do-while loop
128 | for line in histfile.open(encoding="latin-1"):
129 | r = _parse_metadata(line)
130 | # if the regex didn't match, this line is a continuation of a multiline command
131 | if r is None:
132 | command += "\n" + line
133 | else:
134 | # this 'if' is needed for the first item (since it's not set on the first loop iteration)
135 | # yield the previously accumulated command
136 | if dt is not None and dur is not None:
137 | yield Entry(
138 | dt=dt,
139 | duration=dur,
140 | command=command,
141 | )
142 | # set 'current' dt, dur, command to matched groups
143 | dt, dur, command = r
144 | # yield the last entry
145 | if command:
146 | yield Entry(
147 | dt=dt, # type: ignore[arg-type]
148 | duration=dur, # type: ignore[arg-type]
149 | command=command,
150 | )
151 |
152 |
153 | PATTERN = re.compile(r"^: (\d+):(\d+);(.*)$")
154 |
155 |
156 | def _parse_metadata(histline: str) -> Optional[Tuple[datetime, int, str]]:
157 | """
158 | parse the date, duration, and command from a line
159 | """
160 | matches = PATTERN.match(histline)
161 | if matches:
162 | g = matches.groups()
163 | return (parse_datetime_sec(g[0]), int(g[1]), g[2])
164 | return None
165 |
166 |
167 | def stats() -> Stats:
168 | from my.core import stat
169 |
170 | return {**stat(history)}
171 |
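172 | # Example (the line below appears in tests/testdata/zsh/zsh_history):
173 | #   _parse_metadata(": 1594693071:0;ls\n")
174 | # returns
175 | #   (datetime(2020, 7, 14, 2, 17, 51, tzinfo=timezone.utc), 0, "ls")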
--------------------------------------------------------------------------------
/scripts/functions.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # These should work with both bash and zsh
3 | #
4 | # To use these, put 'source /path/to/this/repo/functions.sh'
5 | # in your shell profile
6 | #
7 | # these use a bunch of common-ish shell tools
8 | # to interact with the JSON output from 'hpi query'
9 | # jq: https://github.com/stedolan/jq
10 | # fzf: https://github.com/junegunn/fzf
11 |
12 | # helpers used across multiple functions
13 | alias mpv-from-stdin='mpv --playlist=- --no-audio-display --msg-level=file=error'
14 | filter_unique() {
15 | awk '!seen[$0]++'
16 | }
17 |
18 | ###################
19 | # my.listenbrainz
20 | ###################
21 |
22 | scrobbles() {
23 | hpi query my.listenbrainz.export.history -s "$@"
24 | }
25 | scrobble-describe() {
26 | jq -r '"\(.listened_at) \(.artist_name) - \(.track_name)"'
27 | }
28 |
29 | ##########
30 | # my.mpv
31 | ##########
32 |
33 | # functions to replay music I've listened to recently
34 | mpv-recent() {
35 | local args=()
36 | if [[ -n "$1" ]]; then
37 | args+=("--limit" "$1")
38 |
39 | fi
40 | hpi query my.mpv.history_daemon.history --order-type datetime --reverse -s "${args[@]}"
41 | }
42 | mpv-recent-path() {
43 | mpv-recent "$1" | jq -r .path
44 | }
45 | replay() {
46 | mpv-recent-path | exists | grep --max-count=1 "$XDG_MUSIC_DIR" | mpv-from-stdin
47 | }
48 | # requires https://github.com/purarue/exists, https://github.com/purarue/pura-utils
49 | replay-recent() {
50 | mpv-recent-path "$1" | exists | unique | head -n "${1:-$LINES}" | fzf --select-1 | mpv-from-stdin
51 | }
52 |
53 | ##########
54 | # my.zsh
55 | ##########
56 |
57 | # decode with 'jq -r' only at the end so newlines in multiline commands survive filter_unique
58 | alias zsh-unique="hpi query -s my.zsh.history | jq '.command' | filter_unique | jq -r"
59 | alias zsh-unique-fzf='zsh-unique | fzf'
60 |
61 | ############
62 | # my.trakt
63 | ############
64 |
65 | # e.g. trakt-movies --recent 4w | trakt-describe-movie
66 | trakt-movies() {
67 | hpi query 'my.trakt.export.history' -s "$@" | trakt-filter-movies
68 | }
69 |
70 | # e.g. trakt-episodes --recent 4w | trakt-describe-episode
71 | trakt-episodes() {
72 | hpi query 'my.trakt.export.history' -s "$@" | trakt-filter-episodes
73 | }
74 |
75 | trakt-filter-movies() {
76 | jq 'select(.media_type == "movie")'
77 | }
78 |
79 | trakt-filter-episodes() {
80 | jq 'select(.media_type == "episode")'
81 | }
82 |
83 | trakt-describe-movie() {
84 | jq -r '"\(.media_data.title) (\(.media_data.year))"'
85 | }
86 |
87 | trakt-describe-episode() {
88 | jq -r '"\(.media_data.show.title) (\(.media_data.show.year)) - S\(.media_data.season)E\(.media_data.episode) \(.media_data.title)"'
89 | }
90 |
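91 | # example pipelines (assuming the corresponding HPI modules are configured):
92 | #   scrobbles --recent 1w | scrobble-describe
93 | #   replay-recent 25   # pick one of the last 25 played tracks with fzf and play it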
--------------------------------------------------------------------------------
/scripts/lint:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # wrapper around linter/formatters
3 | # pauses at each step if there are errors
4 | # this is run locally to autoformat/lint code
5 |
6 | set -o pipefail
7 |
8 | # get the name of this script
9 | declare script_name
10 | script_name="$(basename "${BASH_SOURCE[0]}")"
11 |
12 | # function to verify an external command is installed
13 | havecmd() {
14 | local BINARY ERRMSG
15 | # error if first argument isn't provided
16 | BINARY="${1:?Must provide command to check}"
17 | # the command exists, exit with 0 (success!)
18 | if command -v "${BINARY}" >/dev/null 2>&1; then
19 | return 0
20 | else
21 | # construct error message
22 | ERRMSG="'${script_name}' requires '${BINARY}', could not find that on your \$PATH"
23 | if [[ -n "$2" ]]; then
24 | ERRMSG="${ERRMSG}. $2"
25 | fi
26 | printf '%s\n' "${ERRMSG}" 1>&2
27 | return 1
28 | fi
29 | }
30 |
31 | set -e
32 | havecmd shellcheck
33 | havecmd exists 'See https://github.com/purarue/exists'
34 | havecmd rifleman 'See https://github.com/purarue/rifleman'
35 | havecmd pytest
36 | havecmd jq
37 | havecmd tput
38 | havecmd mypy
39 | havecmd flake8
40 | havecmd black
41 | havecmd shfmt
42 | set +e
43 |
44 | # cd to base directory
45 | BASE_DIR="$(realpath "$(dirname "${BASH_SOURCE[0]}")"/..)"
46 | readonly BASE_DIR
47 | cd "${BASE_DIR}" || exit 1
48 | printf 'In: %s\n' "$(pwd)"
49 |
50 | # call shellcheck on all the scripts
51 | shellcheck_scripts() {
52 | git ls-files | exists | rifleman - -a lint -j | jq -r 'to_entries[] | select(.key|startswith("shellcheck")) | .value | .[]' | xargs shellcheck
53 | }
54 |
55 | prompt() {
56 | local MESSAGE
57 | MESSAGE='Hit enter to continue > '
58 | [[ -n "$1" ]] && MESSAGE="$1"
59 | echo -en "$(tput setaf 1)${MESSAGE}$(tput sgr0)"
60 | read -r # if no variable is specified, sets the REPLY environment variable
61 | }
62 |
63 | update_fork() {
64 | local FORK_LOCATION
65 | FORK_LOCATION="$(realpath ../HPI-karlicoss/)"
66 | cd "${FORK_LOCATION}" || return $?
67 | git checkout master
68 | git pull upstream master
69 | }
70 |
71 | main() {
72 | (update_fork) # cd in subshell
73 | python3 -m pytest "$@" || prompt ''
74 | echo "Running mypy..."
75 | MY_CONFIG="${BASE_DIR}/tests/my" mypy --install-types --non-interactive --color-output -p my || prompt ''
76 | python3 -m mypy ~/.config/my/my/config/ || prompt ''
77 | python3 -m flake8 ./my || prompt ''
78 | echo "Running shellcheck..."
79 | shellcheck_scripts
80 | # format everything in the repo
81 | git ls-files | exists | rifleman -
82 | echo -e "$(tput setaf 2)Done!$(tput sgr0)"
83 | git status
84 | }
85 |
86 | main "$@"
87 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = HPI_purarue
3 | version = 0.0.1
4 | description = A Python interface to my life
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/purarue/HPI
8 | author = purarue
9 | license = MIT
10 | license_files = LICENSE
11 | classifiers =
12 | License :: OSI Approved :: MIT License
13 | Operating System :: OS Independent
14 | Programming Language :: Python :: 3
15 | Programming Language :: Python :: 3 :: Only
16 | Programming Language :: Python :: 3.10
17 | Programming Language :: Python :: 3.11
18 | Programming Language :: Python :: 3.12
19 | Programming Language :: Python :: 3.13
20 |
21 | [options]
22 | python_requires = >=3.10
23 | include_package_data = True
24 | zip_safe = False
25 |
26 | [options.entry_points]
27 | console_scripts =
28 | backup_to = my.utils.backup_to.__main__:main
29 |
30 | [options.package_data]
31 | my = py.typed
32 |
33 | [flake8]
34 | ignore = E501,E402,W503,E266,E203
35 |
36 | [mypy]
37 | pretty = True
38 | show_error_context = True
39 | show_error_codes = True
40 | check_untyped_defs = True
41 | namespace_packages = True
42 | disallow_incomplete_defs = True
43 | no_implicit_optional = True
44 | disallow_any_generics = True
45 | disallow_untyped_calls = True
46 | warn_redundant_casts = True
47 | warn_return_any = True
48 | warn_unreachable = True
49 |
50 | [tool:pytest]
51 | addopts =
52 | --verbose
53 | tests
54 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from typing import Iterator
2 | from setuptools import setup, find_namespace_packages # type: ignore[import]
3 |
4 |
5 | def subpackages() -> Iterator[str]:
6 | # make sure subpackages are only in the my/ folder (not in tests or other folders here)
7 | for p in find_namespace_packages("."):
8 | if p.startswith("my"):
9 | yield p
10 |
11 |
12 | if __name__ == "__main__":
13 | setup(packages=list(subpackages()))
14 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/tests/__init__.py
--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | import pytest
6 |
7 | V = "HPI_TESTS_PURA"
8 |
9 | skip_if_not_pura = pytest.mark.skipif(
10 | V not in os.environ,
11 | reason=f"test on runs on @purarue data for now. Set envvar {V}=true to override",
12 | )
13 |
14 |
15 | def data(file: Optional[str]) -> Path:
16 | d = Path(__file__).absolute().parent / "testdata"
17 | if file:
18 | d = d / file
19 | assert d.exists()
20 | return d
21 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.fixture(autouse=True)
5 | def without_cachew():
6 | from my.core.cachew import disabled_cachew
7 |
8 | with disabled_cachew():
9 | yield
10 |
--------------------------------------------------------------------------------
/tests/my/my/config/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Config file used for testing in CI; so that config is defined
3 | """
4 |
5 | import tempfile
6 | from pathlib import Path
7 | from typing import Optional, Sequence, Callable
8 |
9 | from os import environ, path
10 |
11 | from my.core.common import PathIsh, Paths
12 |
13 |
14 | class core:
15 | cache_dir: PathIsh = path.join(environ["HOME"], ".cache", "cachew")
16 | tmp_dir: PathIsh = path.join(tempfile.gettempdir(), "HPI-tempdir")
17 | enabled_modules: Sequence[str] = []
18 | disabled_modules: Sequence[str] = []
19 |
20 |
21 | class mail:
22 | class imap:
23 | mailboxes: Paths = ""
24 | # optional predicate used to filter the input paths
25 | filter_path: Optional[Callable[[Path], bool]] = None
26 |
27 | class mbox:
28 | mailboxes: Paths = ""
29 | exclude_extensions = ()
30 |
31 |
32 | class zsh:
33 | export_path: Paths = ""
34 | live_file: Optional[PathIsh] = ""
35 |
36 |
37 | class bash:
38 | export_path: Paths = ""
39 |
40 |
41 | class todotxt:
42 | class git_history:
43 | # path to git_doc_history directory
44 | export_path: Optional[PathIsh] = None
45 |
46 | class active:
47 | # path to your active todo.txt directory
48 | export_path: PathIsh = ""
49 | error_policy = "drop"
50 |
51 |
52 | class rss:
53 | class newsboat:
54 | class git_history:
55 | export_path: Paths = ""
56 |
57 |
58 | class mpv:
59 | class history_daemon:
60 | export_path: Paths = ""
61 | require_percent: Optional[float] = 0.75
62 |
63 |
64 | class league:
65 | class export:
66 | export_path: Paths = ""
67 | username = ""
68 |
69 |
70 | class chess:
71 | class export:
72 | export_path: Paths = ""
73 |
74 |
75 | class listenbrainz:
76 | class export:
77 | export_path: Paths = ""
78 |
79 |
80 | class trakt:
81 | class export:
82 | export_path: Paths = ""
83 |
84 |
85 | class mal:
86 | class export:
87 | export_path: PathIsh = ""
88 | zip_backup_path: Optional[PathIsh] = ""
89 |
90 |
91 | class grouvee:
92 | class export:
93 | export_path: Paths = ""
94 |
95 |
96 | class nextalbums:
97 | export_path: Paths = ""
98 |
99 |
100 | class steam:
101 | class scraper:
102 | export_path: Paths = ""
103 |
104 |
105 | class piazza:
106 | class scraper:
107 | export_path: Paths = ""
108 |
109 |
110 | class blizzard:
111 | class gdpr:
112 | export_path: Paths = ""
113 |
114 |
115 | class project_euler:
116 | export_path: Paths = ""
117 |
118 |
119 | class skype:
120 | class gdpr:
121 | export_path: Paths = ""
122 |
123 |
124 | class facebook:
125 | class gdpr:
126 | gdpr_dir: PathIsh = ""
127 |
128 |
129 | class spotify:
130 | class gdpr:
131 | gdpr_dir: PathIsh = ""
132 |
133 |
134 | class twitch:
135 | class overrustle:
136 | export_path: Paths = ""
137 |
138 | class gdpr:
139 | gdpr_dir: PathIsh = ""
140 |
141 |
142 | class ipython:
143 | export_path: Paths = ""
144 |
145 |
146 | class ttt:
147 | export_path: Paths = ""
148 |
149 |
150 | class activitywatch:
151 | class active_window:
152 | export_path: Paths = ""
153 |
154 |
155 | class apple:
156 | class privacy_export:
157 | gdpr_dir: PathIsh = ""
158 |
159 |
160 | class linkedin:
161 | class privacy_export:
162 | gdpr_dir: PathIsh = ""
163 |
164 |
165 | class scramble:
166 | class history:
167 | config_dir: Optional[PathIsh] = None
168 |
169 |
170 | class discord:
171 | class data_export:
172 | export_path: Paths = ""
173 |
174 |
175 | class runelite:
176 | class screenshots:
177 | export_path: Paths = ""
178 |
179 |
180 | class minecraft:
181 | class advancements:
182 | export_path: Paths = ""
183 |
184 |
185 | class offline:
186 | class listens:
187 | export_path: Paths = ""
188 |
189 |
190 | class time:
191 | class tz:
192 | policy = "convert"
193 |
--------------------------------------------------------------------------------
/tests/test_apple.py:
--------------------------------------------------------------------------------
1 | from more_itertools import ilen
2 |
3 |
4 | from .common import skip_if_not_pura
5 |
6 |
7 | @skip_if_not_pura
8 | def test_apple_types() -> None:
9 | from my.apple.privacy_export import (
10 | events,
11 | Game,
12 | GameAchievement,
13 | GameLeaderboardData,
14 | Location,
15 | )
16 |
17 | all_ev = list(events())
18 | assert len(all_ev) > 10
19 | all_types = set([Game, GameAchievement, GameLeaderboardData, Location])
20 | assert all_types == set(map(type, all_ev))
21 | # make sure we parsed everything without errors
22 | assert ilen(filter(lambda e: isinstance(e, Exception), all_ev)) == 0
23 |
--------------------------------------------------------------------------------
/tests/test_bash.py:
--------------------------------------------------------------------------------
1 | from my.bash import _parse_file
2 |
3 | from .common import data
4 |
5 |
6 | def test_single_file() -> None:
7 | history_file = data("bash/history")
8 | history = list(_parse_file(history_file))
9 | assert len(history) == 4
10 | assert history[0].command == "ls"
11 | assert history[1].command == "git status"
12 | assert (
13 | history[2].command
14 | == '''echo "$(
15 | date
16 | uname
17 | )"'''
18 | )
19 |
20 | assert history[3].command == "ls"
21 |
--------------------------------------------------------------------------------
/tests/test_commits.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from more_itertools import ilen
4 |
5 | from .common import skip_if_not_pura
6 |
7 |
8 | def file_count(dir_name: Path) -> int:
9 | return ilen(dir_name.rglob("*"))
10 |
11 |
12 | @skip_if_not_pura
13 | def test_commits() -> None:
14 | from my.coding.commits import repos, _cached_commits, Commit
15 |
16 | all_repos = list(repos())
17 | assert len(all_repos) > 1
18 | # get a repo which has lots of files
19 | # probably has a couple commits
20 | for r in sorted(all_repos):
21 | if file_count(r) > 50:
22 | biggest_repo = r
23 | break
24 | else:
25 | raise RuntimeError("Couldn't find a repo with more than 50 files!")
26 | commits_for_repo = list(_cached_commits(biggest_repo))
27 | assert len(commits_for_repo) >= 1
28 | assert isinstance(commits_for_repo[0], Commit)
29 |
--------------------------------------------------------------------------------
/tests/test_games.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from more_itertools import ilen
4 | from my.core.error import raise_exceptions
5 |
6 | from .common import skip_if_not_pura
7 |
8 |
9 | @skip_if_not_pura
10 | def test_league() -> None:
11 | from my.league.export import history, Game
12 |
13 | gs: List[Game] = list(raise_exceptions(history()))
14 | assert len(gs) > 50
15 |
16 |
17 | @skip_if_not_pura
18 | def test_steam() -> None:
19 | from my.steam.scraper import games, achievements, Achievement
20 |
21 | assert ilen(games()) > 10
22 | ach: List[Achievement] = list(raise_exceptions(achievements()))
23 | assert any([a.game_name == "Counter-Strike: Global Offensive" for a in ach])
24 |
--------------------------------------------------------------------------------
/tests/test_ipython.py:
--------------------------------------------------------------------------------
1 | from my.ipython import _parse_database
2 |
3 | from .common import data
4 |
5 | import pytest
6 |
7 | from IPython.core.history import HistoryAccessor
8 |
9 |
10 | # https://github.com/ipython/ipython/issues/13666
11 | def accessor_works() -> bool:
12 | ipython_db = str(data("ipython.sqlite"))
13 | hist = HistoryAccessor(hist_file=ipython_db) # type: ignore[no-untyped-call]
14 | try:
15 | hist.get_last_session_id()
16 | return True
17 | except Exception:
18 | return False
19 |
20 |
21 | @pytest.mark.skipif(not accessor_works(), reason="ipython historyaccessor failed")
22 | def test_ipython() -> None:
23 | ipython_db = str(data("ipython.sqlite"))
24 | cmds = list(_parse_database(ipython_db))
25 | assert len(cmds) == 13
26 | item = cmds[1]
27 | assert not isinstance(item, Exception)
28 | assert item.command == "fac(121)"
29 |
--------------------------------------------------------------------------------
/tests/test_my.py:
--------------------------------------------------------------------------------
1 | from my.discord.data_export import test_remove_link_suppression # noqa
2 |
--------------------------------------------------------------------------------
/tests/test_zsh.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Callable, Iterator
3 | from itertools import chain
4 |
5 | from my.zsh import _parse_file, _merge_histories, Entry
6 |
7 | from .common import data
8 |
9 | history_file = data("zsh/zsh_history")
10 | overlap_file = data("zsh/overlap_history")
11 |
12 |
13 | def _parse_and_merge(inputs: Callable[[], Iterator[Path]]) -> Iterator[Entry]:
14 | yield from _merge_histories(*chain(map(_parse_file, inputs())))
15 |
16 |
17 | def test_single_file() -> None:
18 | """
19 | test that parsing a single zsh file works and that a known entry appears in the history
20 | """
21 |
22 | def zsh_small_test():
23 | yield Path(history_file)
24 |
25 | items = list(_parse_and_merge(inputs=zsh_small_test))
26 | assert len(items) == 11
27 |
28 | from datetime import datetime, timezone
29 |
30 | # this entry exists in the test history file, so it's fine to hardcode here
31 | e = Entry(
32 | dt=datetime(
33 | year=2020,
34 | month=7,
35 | day=14,
36 | hour=2,
37 | minute=21,
38 | second=37,
39 | tzinfo=timezone.utc,
40 | ),
41 | duration=0,
42 | command="ls",
43 | )
44 | assert e in items
45 |
46 |
47 | def test_overlap() -> None:
48 | """
49 | To make sure that duplicates are removed
50 | """
51 |
52 | def zsh_multiple_tests():
53 | yield Path(history_file)
54 | yield Path(overlap_file)
55 |
56 | items = list(_parse_and_merge(inputs=zsh_multiple_tests))
57 | assert len(items) == 11
58 |
--------------------------------------------------------------------------------
/tests/testdata/bash/history:
--------------------------------------------------------------------------------
1 | #1616723205
2 | ls
3 | #1616723205
4 | git status
5 | #1616723206
6 | echo "$(
7 | date
8 | uname
9 | )"
10 | #1616723207
11 | ls
12 |
--------------------------------------------------------------------------------
/tests/testdata/ipython.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/tests/testdata/ipython.sqlite
--------------------------------------------------------------------------------
/tests/testdata/zsh/overlap_history:
--------------------------------------------------------------------------------
1 | : 1594693239:2;{ for i in $(seq 10); do echo $i; sleep 1; done }
2 | : 1594693242:10;{ for i in $(seq 10); do echo $i; sleep 1; done } | tac
3 | : 1594693261:29;man tac
4 | : 1594693293:0;which parallel-moreutils
5 |
--------------------------------------------------------------------------------
/tests/testdata/zsh/zsh_history:
--------------------------------------------------------------------------------
1 | : 1594693071:0;ls
2 | : 1594693154:2;while true; do genpasswd -n $(tput cols); sleep 0.1; done
3 | : 1594693172:3;while true; do\
4 | for i in {1.."$(tput cols)"}; do genpasswd -n "$i"; done \
5 | for i in $(seq 1 "$(tput cols)" | tac); do genpasswd -n "$i"; done\
6 | done
7 | : 1594693184:2;while true; do\
8 | for i in {1.."$(tput cols)"}; do genpasswd -n "$i"; done \
9 | for i in $(seq 1 "$(tput cols)" | tac); do genpasswd -n "$i"; done\
10 | done
11 | : 1594693220:0;{ for i in 1..10; do echo $i; done }
12 | : 1594693231:0;{ for i in $(seq 10); do echo $i; done }
13 | : 1594693239:2;{ for i in $(seq 10); do echo $i; sleep 1; done }
14 | : 1594693242:10;{ for i in $(seq 10); do echo $i; sleep 1; done } | tac
15 | : 1594693261:29;man tac
16 | : 1594693293:0;which parallel-moreutils
17 | : 1594693297:0;ls
18 | : 1594693297:0;ls
19 |
--------------------------------------------------------------------------------