├── .github └── workflows │ └── main.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── doc ├── MAIL_SETUP.md └── TROUBLESHOOTING_INSTALLS.md ├── install ├── my ├── activitywatch │ └── active_window.py ├── apple │ └── privacy_export.py ├── bash.py ├── blizzard │ └── gdpr.py ├── chess │ └── export.py ├── discord │ └── data_export.py ├── facebook │ └── gdpr.py ├── grouvee │ └── export.py ├── ip │ ├── all.py │ ├── blizzard.py │ ├── discord.py │ └── facebook.py ├── ipython.py ├── league │ └── export.py ├── linkedin │ └── privacy_export.py ├── listenbrainz │ └── export.py ├── location │ └── apple.py ├── mail │ ├── all.py │ ├── common.py │ ├── imap.py │ ├── mbox.py │ └── parse_parts.py ├── mal │ └── export.py ├── minecraft │ └── advancements.py ├── mpv │ └── history_daemon.py ├── offline │ └── listens.py ├── piazza │ └── scraper.py ├── project_euler.py ├── py.typed ├── rss │ └── newsboat │ │ └── git_history.py ├── runelite │ └── screenshots.py ├── scramble │ └── history.py ├── skype │ └── gdpr.py ├── spotify │ └── gdpr.py ├── steam │ └── scraper.py ├── todotxt │ ├── active.py │ ├── common.py │ └── git_history.py ├── trakt │ └── export.py ├── ttt.py ├── twitch │ ├── all.py │ ├── common.py │ ├── gdpr.py │ └── overrustle_logs.py ├── utils │ ├── backup_to │ │ └── __main__.py │ ├── parse_csv.py │ └── time.py └── zsh.py ├── scripts ├── functions.sh └── lint ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── common.py ├── conftest.py ├── my └── my │ └── config │ └── __init__.py ├── test_apple.py ├── test_bash.py ├── test_commits.py ├── test_games.py ├── test_ipython.py ├── test_my.py ├── test_zsh.py └── testdata ├── bash └── history ├── ipython.sqlite └── zsh ├── overlap_history └── zsh_history /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: ["*"] 5 | pull_request: # needed to trigger on others' PRs 6 | workflow_dispatch: # needed to trigger workflows manually 7 | 8 | jobs: 9 | build: 10 | strategy: 11 | matrix: 12 | platform: [ubuntu-latest, macos-latest] 13 | python-version: [ "3.10", "3.11", "3.12", "3.13" ] 14 | exclude: 15 | [ 16 | { platform: macos-latest, python-version: "3.11" }, 17 | { platform: macos-latest, python-version: "3.12" }, 18 | ] 19 | 20 | runs-on: ${{ matrix.platform }} 21 | 22 | steps: 23 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 24 | 25 | - uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 # nicer to have all git history when debugging/for tests 32 | 33 | - run: ./install 34 | 35 | - uses: actions/upload-artifact@v4 36 | with: 37 | name: .coverage.mypy-${{ matrix.platform }}_${{ matrix.python-version }} 38 | path: .coverage.mypy/ 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tags* 2 | *.priv.job 3 | /*.pdf 4 | Pipfile* 5 | 6 | 7 | # Created by https://www.gitignore.io/api/python,emacs 8 | # Edit at https://www.gitignore.io/?templates=python,emacs 9 | 10 | ### Emacs ### 11 | # -*- mode: gitignore; -*- 12 | *~ 13 | \#*\# 14 | /.emacs.desktop 15 | /.emacs.desktop.lock 16 | *.elc 17 | auto-save-list 18 | tramp 19 | .\#* 20 | 21 | # Org-mode 22 | .org-id-locations 23 | *_archive 24 | 25 | # flymake-mode 26 | *_flymake.* 27 | 28 | # eshell files 29 | /eshell/history 30 | /eshell/lastdir 31 | 32 | # elpa 
packages 33 | /elpa/ 34 | 35 | # reftex files 36 | *.rel 37 | 38 | # AUCTeX auto folder 39 | /auto/ 40 | 41 | # cask packages 42 | .cask/ 43 | dist/ 44 | 45 | # Flycheck 46 | flycheck_*.el 47 | 48 | # server auth directory 49 | /server/ 50 | 51 | # projectiles files 52 | .projectile 53 | 54 | # directory configuration 55 | .dir-locals.el 56 | 57 | # network security 58 | /network-security.data 59 | 60 | 61 | ### Python ### 62 | # Byte-compiled / optimized / DLL files 63 | __pycache__/ 64 | *.py[cod] 65 | *$py.class 66 | 67 | # C extensions 68 | *.so 69 | 70 | # Distribution / packaging 71 | .Python 72 | build/ 73 | develop-eggs/ 74 | downloads/ 75 | eggs/ 76 | .eggs/ 77 | lib/ 78 | lib64/ 79 | parts/ 80 | sdist/ 81 | var/ 82 | wheels/ 83 | pip-wheel-metadata/ 84 | share/python-wheels/ 85 | *.egg-info/ 86 | .installed.cfg 87 | *.egg 88 | MANIFEST 89 | 90 | # PyInstaller 91 | # Usually these files are written by a python script from a template 92 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 93 | *.manifest 94 | *.spec 95 | 96 | # Installer logs 97 | pip-log.txt 98 | pip-delete-this-directory.txt 99 | 100 | # Unit test / coverage reports 101 | htmlcov/ 102 | .tox/ 103 | .nox/ 104 | .coverage 105 | .coverage.* 106 | .cache 107 | nosetests.xml 108 | coverage.xml 109 | *.cover 110 | .hypothesis/ 111 | .pytest_cache/ 112 | 113 | # Translations 114 | *.mo 115 | *.pot 116 | 117 | # Scrapy stuff: 118 | .scrapy 119 | 120 | # Sphinx documentation 121 | docs/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # pyenv 127 | .python-version 128 | 129 | # pipenv 130 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 131 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 132 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 133 | # install all needed dependencies. 134 | #Pipfile.lock 135 | 136 | # celery beat schedule file 137 | celerybeat-schedule 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # Mr Developer 150 | .mr.developer.cfg 151 | .project 152 | .pydevproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # End of https://www.gitignore.io/api/python,emacs 166 | 167 | cov/ 168 | *.png 169 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 2024-10-13 2 | 3 | Removed the `my.utils.input_source` code, it doesn't work great with cachew, or in general (passing a custom function didn't always invalidate caching). 4 | 5 | ### 2022-03-20 6 | 7 | See [#33](https://github.com/purarue/HPI/pull/33) 8 | 9 | Replaced `file_backups` modules with `git_history`, using [`git_doc_history`](https://github.com/purarue/git_doc_history) 10 | 11 | I don't expect anyone else was using these modules, but there's a script [here](https://github.com/purarue/git_doc_history/blob/master/bin/file_backups_to_doc_history) to convert from the old format to new. 
Feel free to open an issue if you were using these -- could maintain them in a separate HPI repo as old repositories 12 | 13 | ### 2022-01-30 14 | 15 | [Relevant PR](https://github.com/purarue/HPI/pull/18); If you're having issues with the `my.config` blocks, compare yours to [mine](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py) 16 | 17 | Renamed some modules to allow for future extension, and less possibilities for conflicts with other related HPI modules under a particular company/service/source 18 | 19 | For reference, [here is the current directory structure](https://github.com/purarue/HPI/tree/eb425e653918d68eb9d41da29e791fe1ba554dc7/my) as of this commit 20 | 21 | In particular, anything with a `gdpr`/`data_export`/`privacy_export` is named to be that, instead of globally squashing the namespace module to the single `modulename.py` file 22 | 23 | Converting a single-file module to a namespace module [is always a breaking change](https://github.com/karlicoss/promnesia/pull/225#issuecomment-819773697), and though [one can do hacky traceback introspection](https://github.com/karlicoss/HPI/blob/master/my/reddit/__init__.py) (to possible delay the change from a non-namespace package to a namespace package. However, if anyone else is using this code, its likely in the background through promnesia, so most likely situation is that they don't see that till I deprecate it anyways), but its only a temporary solution until the `__init__.py`/`module.py` file is eventually removed -- so better to do them all now instead of waiting till it becomes 'too late' 24 | 25 | A user (or me) may want to write their own module with the same name, meaning they can't use both at the same time if mine is just `my.module_name.py`, since my module existing means any other namespace packages can't have the same base name (see [reorder_editable](https://github.com/purarue/reorder_editable) for an explanation) 26 | 27 | The one motivating this change is `apple.py`, since the old `apple.py` was parsing the privacy export, but I wanted to add something to parse [`imessage`](https://github.com/purarue/HPI/commit/e361ce8182d8be8b331875078ad17605d3f80a50) files. Someone else may want to add other `apple/file.py` files to parse other parts of apple/mac behaviour, but me having the single `apple.py` either means they have to have their repo before mine on their path (but doing so means they overwrite the current `apple.py` file, so they can't use that to parse their privacy export, even if they were trying to do something else entirely), or they have to rename their code to something like `my_apple/file.py` to create a new namespace module 28 | 29 | Possible 'Exceptions' to this: 30 | 31 | - For some files, if the possibility for conflict is low (I can't imagine anyone exporting data from the source in any other way, e.g., `ipython`, `project_euler`) or the name is so specific to the source that its not needed (e.g. `ttt`, `window_watcher`) 32 | - For files where I can't imagine you'd want both mine and your/custom implementation the same time, e.g. 
if you override `bash`, `zsh`, you're probably creating your own solution to parse that source, and don't need mine (If that's not the case, feel free to open an issue) 33 | - For some of my modules, I've renamed them from what they do to their service/project names instead (`albums` to `nextalbums`; `money` to `mint`), so I'm not holding the generic name of some function when I don't really need to (have since moved those to [HPI-personal](https://github.com/purarue/HPI-personal)) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2024 purarue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **TLDR**: I'm using `HPI`(Human Programming Interface) package as a means of unifying, accessing and interacting with all of my personal data. 2 | 3 | It's a Python library (named `my`), a collection of modules for: 4 | 5 | - social networks: posts, comments, favorites, searches 6 | - shell/program histories (zsh, bash, python, mpv, firefox) 7 | - programming (github/commits) 8 | - instant messaging 9 | - media histories (movies, TV shows, music, video game achievements/history); see 10 | 11 | [_Why?_](https://github.com/karlicoss/HPI#why) 12 | 13 | This is built on top of [`karlicoss/HPI`](https://github.com/karlicoss/HPI). 
These are all additional modules which aren't present in that repository - this is installed alongside the upstream repository (meaning _you can use both modules from upstream and here simultaneously_), see [#install](#install) 14 | 15 | ### My Modules 16 | 17 | - `my.zsh` and `my.bash`, access to my shell history w/ timestamps 18 | - `my.mail.imap` and `my.mail.mbox` to parse local IMAP sync's of my mail/mbox files -- see [doc/MAIL_SETUP.md](doc/MAIL_SETUP.md) 19 | - `my.mpv.history_daemon`, accesses movies/music w/ activity/metadata that have played on my machine, facilitated by a [mpv history daemon](https://github.com/purarue/mpv-history-daemon) 20 | - `my.discord.data_export`, parses ~1,000,000 messages/events from the discord data export, parser [here](https://github.com/purarue/discord_data) 21 | - `my.todotxt.active` to parse my current [todo.txt](https://github.com/todotxt/todo.txt-cli) file; `my.todotxt.git_history` tracks my history using backups of those files in [`git_doc_history`](https://github.com/purarue/git_doc_history) 22 | - `my.rss.newsboat`, keeps track of when I added/removed RSS feeds (for [`newsboat`](https://newsboat.org/)) 23 | - `my.ipython`, for timestamped python REPL history 24 | - `my.ttt`, to parse shell/system history tracked by [`ttt`](https://github.com/purarue/ttt) 25 | - `my.activitywatch.active_window`, to parse active window events (what application I'm using/what the window title is) using [`window_watcher`](https://github.com/purarue/aw-watcher-window) and [activitywatch](https://activitywatch.net/) on android 26 | - `my.chess.export`, to track my [chess.com](https://www.chess.com)/[lichess.org](https://lichess.org/) games, using [`chess_export`](https://github.com/purarue/chess_export) 27 | - `my.trakt.export`, providing me a history/my ratings for Movies/TV Show (episodes) using [`traktexport`](https://github.com/purarue/traktexport) 28 | - `my.listenbrainz.export`, exporting my music listening history from [ListenBrainz](https://listenbrainz.org/) (open-source Last.fm) using [`listenbrainz_export`](https://github.com/purarue/listenbrainz_export) 29 | - `my.offline.listens`, for offline music listen history, using [offline_listens](https://github.com/purarue/offline_listens) 30 | - `my.mal.export`, for anime/manga history using [`malexport`](https://github.com/purarue/malexport) 31 | - `my.grouvee.export`, for my video game history/backlog using [`grouvee_export`](https://github.com/purarue/grouvee_export) 32 | - `my.runelite.screenshots`, parses data from the [automatic runelite screenshots](https://github.com/runelite/runelite/wiki/Screenshot) 33 | - `my.minecraft.advancements`, parses advancement (local achievement data) from the `~/.minecraft` directory 34 | - `my.project_euler`, when I solved [Project Euler](https://projecteuler.net/) problems 35 | - `my.linkedin.privacy_export`, to parse the [privacy export](https://www.linkedin.com/help/linkedin/answer/50191/downloading-your-account-data?lang=en) from linkedin 36 | - `my.scramble.history` for merged (timed) rubiks cube solves from multiple sources, using [scramble_history](https://github.com/purarue/scramble-history) 37 | 38 | #### 'Historical' Modules 39 | 40 | These are modules to parse GDPR exports/data from services I used to use, but don't anymore. They're here to provide more context into the past. 
41 | 42 | - `my.apple.privacy_export`, parses Game Center and location data from the [apple privacy export](https://privacy.apple.com/) 43 | - `my.facebook.gdpr`, to parse the GDPR export from Facebook 44 | - `my.league.export`, gives League of Legends game history using [`lolexport`](https://github.com/purarue/lolexport) 45 | - `my.steam.scraper`, for steam achievement data and game playtime using [`steamscraper`](https://github.com/purarue/steamscraper) 46 | - `my.piazza.scraper`, parsing [piazza](https://piazza.com/) (university forum) posts using [`piazza-scraper`](https://github.com/purarue/piazza-scraper) 47 | - `my.blizzard.gdpr`, for general battle.net event data [parsed from a GDPR export](https://github.com/purarue/blizzard_gdpr_parser) 48 | - `my.skype.gdpr` to parse a couple datetimes from the Skype GDPR export (seems all my data from years ago is long gone) 49 | - `my.spotify.gdpr`, to parse the GDPR export from Spotify, mostly to access songs from my playlists from years ago 50 | - `my.twitch`, merging the [data export](https://www.twitch.tv/p/en/legal/privacy-choices/#user-privacy-requests) and my messages parsed from the [overrustle logs dump](https://github.com/purarue/overrustle_parser) 51 | 52 | See [here](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py) for my `HPI` config 53 | 54 | [Promnesia `Source`s for these `HPI` modules](https://github.com/purarue/promnesia) 55 | 56 | I also have some more personal scripts/modules in a separate repo; [`HPI-personal`](https://github.com/purarue/HPI-personal) 57 | 58 | ### In-use from [karlicoss/HPI](https://github.com/karlicoss/HPI) 59 | 60 | - `my.browser`, to parse browser history using [`browserexport`](https://github.com/purarue/browserexport) 61 | - `my.google.takeout.parser`, parses lots of (~500,000) events (youtube, searches, phone usage, comments, location history) from [google takeouts](https://takeout.google.com/), using [`google_takeout_parser`](https://github.com/purarue/google_takeout_parser) 62 | - `my.coding.commits` to track git commits across the system 63 | - `my.github` to track github events/commits and parse the GDPR export, using [`ghexport`](https://github.com/karlicoss/ghexport) 64 | - `my.reddit`, get saved posts, comments. Uses [`rexport`](https://github.com/karlicoss/rexport) to create backups of recent activity periodically, and [`pushshift`](https://github.com/purarue/pushshift_comment_export) to get old comments. 65 | - `my.smscalls`, exports call/sms history using [SMS Backup & Restore](https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore&hl=en_US) 66 | - `my.stackexchange.stexport`, for stackexchange data using [`stexport`](https://github.com/karlicoss/stexport) 67 | 68 | #### Partially in-use/with overrides: 69 | 70 | - `my.location`, though since I also have some locations from `apple.privacy_export`, I have a [`my.location.apple`](./my/location/apple.py) which I then merge into `my.location.all` in my overridden [`all.py`](https://github.com/purarue/HPI-personal/blob/master/my/location/all.py) file on my personal repo 71 | - similarly, I do use `my.ip` and `my.location.via_ip` from upstream, but I have [overridden `all.py` and module files here](https://github.com/purarue/HPI/tree/master/my/ip) 72 | 73 | 'Overriding' an `all.py` file means replacing the `all.py` from upstream repo (this means it can use my sources here to grab more locations/ips, since those don't exist in the upstream). 
For more info see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages), and the [module design](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#adding-new-modules) docs for HPI, but you might be able to get the gist by comparing: 74 | 75 | - [my.location.all](https://github.com/karlicoss/HPI/blob/master/my/location/all.py) in `karlicoss/HPI` 76 | - [my.location.all](https://github.com/purarue/HPI-personal/blob/master/my/location/all.py) in `purarue/HPI-personal` 77 | 78 | Since I've mangled my `PYTHONPATH` (see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages)), it imports from my repo instead of `karlicoss/HPI`. `all.py` files tend to pretty small -- so overriding/changing a line to add a source is the whole point. 79 | 80 | ### Companion Tools/Libraries 81 | 82 | Disregarding tools which actively collect data (like [`ttt`](https://github.com/purarue/ttt)/[`window_watcher`](https://github.com/purarue/aw-watcher-window)) or repositories which have their own exporter/parsers which are used here, there are a couple other tools/libraries I've created for this project: 83 | 84 | - [`ipgeocache`](https://github.com/purarue/ipgeocache) - for any IPs gathered from data exports, provides geolocation info, so I have partial location info going back to 2013 85 | - [`sqlite_backup`](https://github.com/purarue/sqlite_backup) - to safely copy/backup application sqlite databases that may currently be in use 86 | - [`git_doc_history`](https://github.com/purarue/git_doc_history) - a bash script to copy/backup files into git history, with a python library to help traverse and create a history/parse diffs between commits 87 | - [`HPI_API`](https://github.com/purarue/HPI_API) - automatically creates a JSON API/server for HPI modules 88 | - [`url_metadata`](https://github.com/purarue/url_metadata) - caches youtube subtitles, url metadata (title, description, image links), and a html/plaintext summary for any URL 89 | 90 | I also use this in [`my_feed`](https://github.com/purarue/my_feed), which creates a feed of media/data using `HPI`, live at 91 | 92 | ### Ad-hoc and interactive 93 | 94 | Some basic examples. 95 | 96 | When was I most using reddit? 97 | 98 | ```python 99 | >>> import collections, my.reddit.all, pprint 100 | >>> pprint.pprint(collections.Counter([c.created.year for c in my.reddit.all.comments()])) 101 | Counter({2016: 3288, 102 | 2017: 801, 103 | 2015: 523, 104 | 2018: 209, 105 | 2019: 65, 106 | 2014: 4, 107 | 2020: 3}) 108 | ``` 109 | 110 | Most common shell commands? 111 | 112 | ```python 113 | >>> import collections, pprint, my.zsh 114 | # lots of these are git-related aliases 115 | >>> pprint.pprint(collections.Counter([c.command for c in my.zsh.history()]).most_common(10)) 116 | [('ls', 51059), 117 | ('gst', 11361), 118 | ('ranger', 6530), 119 | ('yst', 4630), 120 | ('gds', 3919), 121 | ('ec', 3808), 122 | ('clear', 3651), 123 | ('cd', 2111), 124 | ('yds', 1647), 125 | ('ga -A', 1333)] 126 | ``` 127 | 128 | What websites do I visit most? 129 | 130 | ```python 131 | >>> import collections, pprint, my.browser.export, urllib 132 | >>> pprint.pprint(collections.Counter([urllib.parse.urlparse(h.url).netloc for h in my.browser.export.history()]).most_common(5)) 133 | [('github.com', 20953), 134 | ('duckduckgo.com', 10146), 135 | ('www.youtube.com', 10126), 136 | ('discord.com', 8425), 137 | ('stackoverflow.com', 2906)] 138 | ``` 139 | 140 | Song I've listened to most? 
141 |
142 | ```python
143 | >>> import collections, my.mpv.history_daemon
144 | >>> collections.Counter([m.path for m in my.mpv.history_daemon.history()]).most_common(1)[0][0]
145 | '/home/username/Music/JPEFMAFIA/JPEGMAFIA - LP! - 2021 - V0/JPEGMAFIA - LP! - 05 HAZARD DUTY PAY!.mp3'
146 | ```
147 |
148 | Movie I've watched most?
149 |
150 | ```python
151 | >>> import my.trakt.export; from collections import Counter
152 | >>> Counter(e.media_data.title for e in my.trakt.export.history()).most_common(1)
153 | [('Up', 92)] # (the pixar movie)
154 | ```
155 |
156 | `hpi` also has a JSON query interface, so I can do quick computations using shell tools like:
157 |
158 | ```bash
159 | # how many calories have I eaten today (from https://github.com/purarue/ttally)
160 | $ hpi query ttally.__main__.food --recent 1d -s | jq -r '(.quantity)*(.calories)' | datamash sum 1
161 | 2258.5
162 | ```
163 |
164 | ### Install
165 |
166 | For the basic setup, I recommend you clone both repositories and install them as editable packages:
167 |
168 | ```bash
169 | # clone and install upstream as an editable package
170 | git clone https://github.com/karlicoss/HPI ./HPI-karlicoss
171 | python3 -m pip install --user -e ./HPI-karlicoss
172 |
173 | # clone and install my repository as an editable package
174 | git clone https://github.com/purarue/HPI ./HPI-pura
175 | python3 -m pip install --user -e ./HPI-pura
176 | ```
177 |
178 | An editable install means any changes to the python files are reflected immediately, which is very convenient for debugging and developing new modules. To update, you can just `git pull` in those directories.
179 |
180 | If you care about [overriding modules](https://github.com/purarue/HPI#partially-in-usewith-overrides), to make sure your `easy-install.pth` is ordered correctly:
181 |
182 | ```bash
183 | python3 -m pip install --user reorder_editable
184 | python3 -m reorder_editable reorder ./HPI-pura ./HPI-karlicoss
185 | ```
186 |
187 | Then, you likely need to run `hpi module install` for any modules you plan on using -- this can be done incrementally as you set up new modules. E.g.:
188 |
189 | - `hpi module install my.trakt.export` to install dependencies
190 | - Check the [stub config](./tests/my/my/config/__init__.py) or [my config](https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py) and set up the config block in your HPI configuration file (a small example config sketch is shown at the end of this section)
191 | - Run `hpi doctor my.trakt.export` to check for any possible config issues/whether your data is being loaded properly
192 |
193 | (The [install](./install) script does that for all my modules, but you likely don't want to do that)
194 |
195 | It's possible to install both `my` packages because `HPI` is a namespace package. For more information on that, and some of the complications one can run into, see [reorder_editable](https://github.com/purarue/reorder_editable#editable-namespace-packages), and the [module design](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#adding-new-modules) docs for HPI.
196 |
197 | If you're having issues installing/re-installing, check the [TROUBLESHOOTING_INSTALLS.md](doc/TROUBLESHOOTING_INSTALLS.md) file.
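As a concrete example of the "set up the config block" step above, here is a minimal sketch of what a couple of config blocks might look like, using `my.bash` and `my.chess.export` (whose expected field -- an `export_path` of `Paths` -- is visible in `my/bash.py` and `my/chess/export.py` in this repo). The paths below are placeholders; point them at wherever your own exports live, and check the stub config linked above for the exact layout expected by other modules:

```python
# sketch of part of an HPI configuration file (my/config/__init__.py)
# the export_path values are hypothetical -- use your own paths/globs

class bash:
    # path[s]/glob to exported bash history files
    export_path = "~/data/bash_history/*"


class chess:
    class export:
        # JSON files produced by 'chess_export ... export'
        export_path = "~/data/chess/*.json"
```

After adding a block like this, running `hpi doctor my.chess.export` (as described above) should tell you whether the files are being found and parsed.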
198 | 199 | If you recently updated and it seems like something has broke, check the [CHANGELOG](CHANGELOG.md) for any possible breaking changes 200 | -------------------------------------------------------------------------------- /doc/MAIL_SETUP.md: -------------------------------------------------------------------------------- 1 | This is a distillation of the steps described in [this issue](https://github.com/purarue/HPI/issues/15) 2 | 3 | There are two mail parsing modules here -- `my.mail.imap` and `my.mail.mbox`. An [`mbox` file](https://docs.python.org/3/library/mailbox.html) is just a collection of email messages in a single text file 4 | 5 | You can also use both modules at the same time -- see `my.mail.all` below 6 | 7 | Remember to first run: `hpi module install my.mail.imap` to install the necessary dependencies 8 | 9 | Note: There are _lots of_ different ways email clients/websites will export messages, so any mention of thunderbird add-ons or syncing tools used in particular to back up mail are just examples. Anything that gives you access to either the raw email files or an mbox should do. 10 | 11 | ## `my.mail.imap` 12 | 13 | Personally, I use `my.mail.imap`. To sync my mail, I use [`mutt-wizard`](https://github.com/LukeSmithxyz/mutt-wizard/), which uses `mbsync` under the hood to saves a bunch of individual mail files in `~/.local/share/mail` -- updating every 5 minutes. 14 | 15 | There are - of course - hundreds of ways to save your mail locally. Lets take [the ImportTools thunderbird add-on](https://addons.thunderbird.net/en-US/thunderbird/addon/importexporttools-ng/) as an example (since its the one we did troubleshooting on in the [issue](https://github.com/purarue/HPI/issues/15)). To match the format `my.mail.imap` expects, select the folder you want to export, then use `Tools > ImportExportToolsNg > Export all messages in the Folder > Plain Text Format`, and export it to a folder somewhere. Then, in your config file, setup the block to point it at that path: 16 | 17 | ```python 18 | class mail: 19 | class imap: 20 | # path[s]/glob to the the mailboxes/IMAP files 21 | # you could also do something like: 22 | # mailboxes = "~/Documents/mbsync/*@*" 23 | # to match any files in that directory with '@' in them 24 | mailboxes = "~/Documents/ExportPlaintext/" 25 | 26 | # filter function which filters the input paths 27 | filter_path: Optional[Callable[[Path], bool]] 28 | ``` 29 | 30 | To verify its finding your files, you can use `hpi query my.mail.imap.files -s` -- that'll print all the matched files 31 | 32 | That may be fine to parse an archive (a backup of some email you don't use anymore), but you need to continuously create new archives/delete old ones. 33 | 34 | Recently, ImportToolsExports has added support for periodic backups, but only in MBOX format. So --> 35 | 36 | ## `my.mail.mbox` 37 | 38 | If you already have access to an mbox file, you can skip this setup, is just an example: 39 | 40 | ### Thunderbird add-on 41 | 42 | In `Tools > ImportExportToolsNg > Options > Backup scheduling`, set the `Destination` and `Enable Frequency` to backup once per day, selecting `Just mail files` 43 | 44 | You can force a backup with `Tools > ImportExportToolsNg > Backup` 45 | 46 | Note: you can set the `Overwrite the mbox files with the same name in the destination directory` to overwrite your backup. 
Alternatively, since `my.config` is a python script, you could write some custom python function to parse the timestamp from the exported filepath, and then pass those to `mailboxes` in your `my.config`, using only using the latest exports as the input. Though, If you're overwriting the `mbox` files while HPI is trying to parse the files, HPI may fail. 47 | 48 | ### Setup mbox 49 | 50 | Once you've exported, setup your configuration to point at the directory. Note that since this uses `my.mail.imap` to parse the messages, you may have to setup a basic config with no files so that module does not fail: 51 | 52 | ```python 53 | class mail: 54 | 55 | class imap: 56 | # signifies no files 57 | mailboxes = '' 58 | 59 | class mbox: 60 | 61 | # paths/glob to the mbox directory -- searches recursively 62 | mailboxes = "~/Documents/mboxExport" 63 | 64 | # additional extensions to ignore 65 | exclude_extensions = (".sbd") 66 | ``` 67 | 68 | ## `my.mail.all` 69 | 70 | You can also use both of these at the same time -- if you have some exported as individual text files and others as mbox files, setup a config like above, specifying `mailboxes` from both `imap` and `mbox` 71 | 72 | Then -- you can just use the `my.mbox.all.mail` function, which returns unique messages from both sources 73 | 74 | ## Testing 75 | 76 | To make sure this works, you can use the `doctor` and `query` commands, to make sure there are no config errors and it parses your mail properly: 77 | 78 | ```bash 79 | hpi --debug doctor --verobose my.mail.all 80 | hpi --debug doctor --verbose my.mail.imap 81 | hpi --debug doctor --verbose my.mail.mbox 82 | ``` 83 | 84 | ```bash 85 | hpi --debug query my.mail.all --stream 86 | hpi --debug query my.mail.imap --stream 87 | hpi --debug query my.mail.mbox --stream 88 | ``` 89 | -------------------------------------------------------------------------------- /doc/TROUBLESHOOTING_INSTALLS.md: -------------------------------------------------------------------------------- 1 | It seems that sometimes installing from git has weird side effects with upgrading? 2 | 3 | If you're having issues -- try doing the following 4 | 5 | I'll use my promnesia modules (at ) as an example. 6 | 7 | Note: though the repository is `promnesia`, the module it installs is `promnesia_pura`. In python packages in general, its not necessary for the module name to match the repository (that's just where its hosted). To figure out what the name of the package is, use `python3 -m pip list`. For this HPI repository, it installs as `HPI-purarue`, so its possible to differentiate between this and upstream HPI. 8 | 9 | These are directions for installing a package as non-editable (into your python `site-packages`), though it covers uninstalling editable packages -- in case your path is misconfigured in some way. If you want to install as editable, see [reorder_editable](https://github.com/purarue/reorder_editable) and the [install section](https://github.com/purarue/HPI#install) of the README for issues you may run into, or see the [editable](#editable) section of this doc 10 | 11 | Whenever there are directions to use `pip` to do something -- its safer to do `python3 -m pip` (unless you know exactly what you're doing with managing multiple python installs on your system). 
That makes sure the `pip` that is being used is the same version as when you use `python3` 12 | 13 | Uninstall the package you're using: 14 | 15 | ```bash 16 | python3 -m pip uninstall -y promnesia_pura 17 | ``` 18 | 19 | Make sure its actually uninstalled -- this should error: 20 | 21 | ```bash 22 | $ python3 -c "import promnesia_pura" 23 | Traceback (most recent call last): 24 | File "", line 1, in 25 | ModuleNotFoundError: No module named 'promnesia_pura' 26 | ``` 27 | 28 | Note: For `HPI` in particular (since its a namespace package), if you're trying to uninstall my modules but leaves `karlicoss`'s (the core) modules installed, `import my` won't error. Instead, try something like `import my.trakt.export`, since that would only appear in my modules. 29 | 30 | If that still imports, you likely have files leftover in your site packages. To find that directory, you can use: 31 | 32 | ```bash 33 | $ python3 -m site 34 | sys.path = [ 35 | '/home/username', 36 | '/usr/lib/python310.zip', 37 | '/usr/lib/python3.10', 38 | '/usr/lib/python3.10/lib-dynload', 39 | '/home/username/.local/lib/python3.10/site-packages', 40 | '/home/username/Repos/my_feed/src', 41 | '/home/username/Repos/albums', 42 | '/home/username/Repos/mint/budget', 43 | '/home/username/Repos/HPI-personal', 44 | '/home/username/Repos/HPI', 45 | '/home/username/Repos/HPI-karlicoss', 46 | '/home/username/Repos/promnesia-fork/src', 47 | '/usr/lib/python3.10/site-packages', 48 | ] 49 | USER_BASE: '/home/username/.local' (exists) 50 | USER_SITE: '/home/username/.local/lib/python3.10/site-packages' (exists) 51 | ENABLE_USER_SITE: True 52 | ``` 53 | 54 | That should let you which directories python is scanning for imports. Check any of the `site-packages` directories, for files like: 55 | 56 | ``` 57 | promnesia_prua 58 | promnesia_pura-0.0.0.dist-info 59 | ``` 60 | 61 | and remove those (this is essentially a 'manually uninstall' of a broken package) 62 | 63 | If you've previously installed this as editable, review your editable installs to make sure its still not there: 64 | 65 | ```bash 66 | python3 -m pip install reorder_editable 67 | python3 -m reorder_editable locate # should show you where which editable installs are placing .egg-link files 68 | python3 -m reorder_editable cat 69 | ``` 70 | 71 | Refer to the [reorder_editable](https://github.com/purarue/reorder_editable) README for more info on that. 72 | 73 | You should now be able to confirm it errors, like: 74 | 75 | ```bash 76 | $ python3 -c "import promnesia_pura" 77 | Traceback (most recent call last): 78 | File "", line 1, in 79 | ModuleNotFoundError: No module named 'promnesia_pura' 80 | ``` 81 | 82 | Now -- to install it again! 
83 | 84 | Instead of installing from git (since that can sometimes cache the result and run into other issues), clone it to some local directory: 85 | 86 | ```bash 87 | git clone https://github.com/purarue/promnesia ./promnesia_pura 88 | ``` 89 | 90 | Then, you can install it by pointing it at the directory with includes the `setup.py` file, like: `python3 -m pip install --user ./promnesia_pura` 91 | 92 | You should now be able to confirm it imports properly: 93 | 94 | ```python3 95 | python3 -c "import promnesia_pura" 96 | ``` 97 | 98 | ### Editable 99 | 100 | Alternatively, since you already have it locally, you can install it as editable: 101 | 102 | ```bash 103 | python3 -m pip install --user -e ./promnesia_pura 104 | ``` 105 | 106 | That should modify your `sys.path` (run `python3 -m site`; and you'll see that directory appear on your path) 107 | 108 | That has the added benefit that whenever you want to update `promnesia_pura`, you can just: 109 | 110 | ```bash 111 | cd /path/to/promnesia_pura 112 | git pull 113 | ``` 114 | -------------------------------------------------------------------------------- /install: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # If the 'CI' environment variable is set, this runs 3 | # like it would on the CI. To try and test that locally, 4 | # can do: CI_SKIP_INSTALL=1 CI=1 ./install 5 | set -o pipefail 6 | 7 | ci() { 8 | [[ -n "${CI}" ]] 9 | } 10 | 11 | ci && set -x 12 | 13 | # script to setup HPI 14 | # - installs karlicoss/HPI as an editable namespace package, 15 | # - installs this repo 16 | # - installs additional python packages for modules 17 | # - checks for any required external commands 18 | 19 | # cd to base directory 20 | BASE_DIR="$(dirname "${BASH_SOURCE[0]}")" 21 | cd "${BASE_DIR}" || exit 1 22 | printf 'In: %s\n' "$(pwd)" 23 | 24 | # function to verify an external command is installed 25 | havecmd() { 26 | local BINARY ERRMSG script_name 27 | script_name='HPI' 28 | # error if first argument isn't provided 29 | BINARY="${1:?Must provide command to check}" 30 | # the command exists, exit with 0 (success!) 31 | if command -v "${BINARY}" >/dev/null 2>&1; then 32 | return 0 33 | else 34 | # construct error message 35 | ERRMSG="'${script_name}' requires '${BINARY}', could not find that on your \$PATH" 36 | if [[ -n "$2" ]]; then 37 | ERRMSG="$ERRMSG. $2" 38 | fi 39 | printf '%s\n' "$ERRMSG" 1>&2 40 | return 1 41 | fi 42 | } && export -f havecmd 43 | 44 | maybe_boxes() { 45 | # Print a fancy box, if boxes is installed 46 | # http://boxes.thomasjensen.com/ 47 | if havecmd boxes >/dev/null 2>&1; then 48 | boxes -dshell -pv1h2 49 | else 50 | cat 51 | fi 52 | } && export -f maybe_boxes 53 | 54 | prompt_if_fails() { 55 | ci && return 1 56 | printf "Command failed... 
Hit Enter to continue, Ctrl+C to exit" 57 | read -r 58 | } && export -f prompt_if_fails 59 | 60 | pipm() { 61 | python3 -m pip "$@" 62 | } && export -f pipm 63 | 64 | pip_install() { 65 | local -a ARGS=() 66 | # only use --user when not in a virtual environment 67 | [[ -z "$VIRTUAL_ENV" ]] && ARGS+=("--user") 68 | ARGS+=("$@") 69 | pipm install "${ARGS[@]}" 70 | } && export -f pip_install 71 | 72 | # install dependencies (with pip) for this module 73 | # this works by traversing the AST/looking for 74 | # a 'REQUIRES' global variable in the 75 | # corresponding file 76 | hpi_module() { 77 | local -a ARGS=() 78 | [[ -z "$VIRTUAL_ENV" ]] && ARGS+=("--user") 79 | ARGS+=("$@") 80 | python3 -m my.core module install "${ARGS[@]}" 81 | } && export -f hpi_module 82 | 83 | ci_install_deps() { 84 | if [[ -n "${CI}" && -z "${CI_SKIP_INSTALL}" ]]; then 85 | # install OS specific stuff here 86 | if [[ "$OSTYPE" == "darwin"* ]]; then 87 | # macos ci 88 | brew install coreutils || return $? 89 | brew install boxes || return $? 90 | else 91 | # ubuntu ci 92 | sudo apt update || return $? 93 | sudo apt install boxes || return $? 94 | fi 95 | pip_install -U pip wheel setuptools || return $? 96 | fi 97 | } 98 | 99 | install_dependencies() { 100 | echo 'Installing dependencies...' | maybe_boxes 101 | if ci; then 102 | ci_install_deps || return $? 103 | fi 104 | } 105 | 106 | required_commands() { 107 | echo 'Checking if required commands are installed...' | maybe_boxes 108 | havecmd realpath || return $? 109 | havecmd git || return $? 110 | } 111 | 112 | ci_realpath() { 113 | if ci; then 114 | if [[ "$OSTYPE" == "darwin"* ]]; then 115 | # need to use g-prefixed things on CI 116 | grealpath "$@" 117 | return $? 118 | fi 119 | fi 120 | realpath "$@" 121 | } 122 | 123 | setup_fork() { 124 | local FORK_ABBREV UPSTREAM_URL UPSTREAM_DIR 125 | echo 'Setting up upstream fork...' | maybe_boxes 126 | 127 | FORK_ABBREV="${HPI_UPSTREAM_FOLDER_NAME:-HPI-karlicoss}" 128 | UPSTREAM_URL='https://github.com/karlicoss/HPI' 129 | 130 | UPSTREAM_DIR="$(ci_realpath "../${FORK_ABBREV}")" 131 | 132 | # clone my fork one level up from here if it does not exist 133 | if [[ ! -e "${UPSTREAM_DIR}" ]]; then 134 | git clone "${UPSTREAM_URL}" "${UPSTREAM_DIR}" 135 | else 136 | echo "Path already exists, skipping clone..." 137 | fi 138 | 139 | # install upstream/core HPI 140 | cd "${UPSTREAM_DIR}" || return $? 141 | 142 | if ci; then 143 | pip_install -e '.[optional,testing]' || return $? 144 | else 145 | pip_install -e '.' || return $? 146 | fi 147 | 148 | # cd back to here, to install this 149 | cd "${BASE_DIR}" || return $? 150 | pip_install -e '.' || return $? 151 | } 152 | 153 | module_dependencies() { 154 | if ! ci; then 155 | printf 'Install all module dependencies? [y/N] ' 156 | read -r || return $? 157 | case "${REPLY}" in 158 | y | Y) ;; 159 | *) 160 | return 0 161 | ;; 162 | esac 163 | fi 164 | echo "Installing module dependencies" | maybe_boxes 165 | hpi_module my.activitywatch.active_window || return $? 166 | hpi_module my.discord.data_export || return $? 167 | hpi_module my.todotxt.git_history || return $? 168 | hpi_module my.ip.all || return $? 169 | hpi_module my.linkedin.privacy_export || return $? 170 | hpi_module my.chess.export || return $? 171 | hpi_module my.mpv.history_daemon || return $? 172 | hpi_module my.league.export || return $? 173 | hpi_module my.scramble.history || return $? 174 | hpi_module my.trakt.export || return $? 175 | hpi_module my.mail.all || return $? 176 | hpi_module my.piazza.scraper || return $? 
177 | hpi_module my.apple.privacy_export || return $? 178 | hpi_module my.grouvee.export || return $? 179 | hpi_module my.offline.listens || return $? 180 | hpi_module my.mal.export || return $? 181 | hpi_module my.listenbrainz.export || return $? 182 | hpi_module my.skype.gdpr || return $? 183 | } 184 | 185 | ci_config() { 186 | CONF="$(ci_realpath "${BASE_DIR}/tests/my")" || return $? 187 | MY_CONFIG="${CONF}" python3 -m my.core config check 1>&2 || return $? 188 | echo "${CONF}" 189 | } 190 | 191 | ci_tests() { 192 | echo 'Running tests' | maybe_boxes 193 | ci && unset HPI_TESTS_PURA 194 | python3 -m pytest ./tests || return $? 195 | } 196 | 197 | ci_mypy() { 198 | echo 'Checking types' | maybe_boxes 199 | pip_install 'git+https://github.com/python/mypy' 200 | pip_install types-urllib3 types-simplejson types-setuptools types-PyYAML types-pytz types-dateparser types-requests lxml-stubs 201 | local mypy_args=(-p tests --txt-report .coverage.mypy) 202 | while read -r pkg; do 203 | mypy_args+=(-p "${pkg}") 204 | done < <(find my -name '*.py' | grep -v '__' | sed -e 's|\.\/||' -e 's|/|.|g' -e 's/\.py$//g' | sort) 205 | python3 -m mypy "${mypy_args[@]}" || return $? 206 | } 207 | 208 | ci_lint() { 209 | pip_install flake8 || return $? 210 | echo 'Linting...' | maybe_boxes 211 | python3 -m flake8 ./my || return $? 212 | } 213 | 214 | ci_run() { 215 | ci || return 0 216 | CONF="$(ci_config)" || return $? 217 | MY_CONFIG="${CONF}" ci_tests || return $? 218 | MY_CONFIG="${CONF}" ci_mypy || return $? 219 | MY_CONFIG="${CONF}" ci_lint || return $? 220 | } 221 | 222 | main() { 223 | install_dependencies || prompt_if_fails || return $? 224 | required_commands || prompt_if_fails || return $? 225 | # use realpath to make sure BASE_DIR is set properly 226 | BASE_DIR="$(ci_realpath "${BASE_DIR}")" || return $? 227 | (setup_fork) || prompt_if_fails || return $? 228 | module_dependencies || prompt_if_fails || return $? 229 | ci_run || return $? 230 | } 231 | 232 | # if user isn't running this directly 233 | # source the exported functions into the current bash environment 234 | if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then 235 | : 236 | else 237 | # otherwise, run main as usual 238 | main "$@" || exit $? 
239 | fi 240 | -------------------------------------------------------------------------------- /my/activitywatch/active_window.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses history from https://github.com/purarue/aw-watcher-window 3 | using https://github.com/purarue/active_window 4 | """ 5 | 6 | REQUIRES = [ 7 | "git+https://github.com/purarue/aw-watcher-window", 8 | "git+https://github.com/purarue/active_window", 9 | ] 10 | 11 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 12 | from my.config import activitywatch as user_config # type: ignore[attr-defined] 13 | 14 | from pathlib import Path 15 | from typing import Iterator, Sequence, Union 16 | from functools import partial 17 | from itertools import chain 18 | 19 | from dataclasses import dataclass 20 | from my.core import get_files, Stats, Paths, make_logger, make_config 21 | 22 | from more_itertools import unique_everseen 23 | 24 | import active_window.parse as AW 25 | 26 | logger = make_logger(__name__) 27 | 28 | 29 | @dataclass 30 | class window_config(user_config.active_window): 31 | # path[s]/glob to the backed up aw-window JSON/window_watcher CSV history files 32 | export_path: Paths 33 | error_policy: AW.ErrorPolicy = "drop" 34 | 35 | 36 | config = make_config(window_config) 37 | 38 | 39 | Result = Union[AW.AWAndroidEvent, AW.AWComputerEvent, AW.AWWindowWatcherEvent] 40 | Results = Iterator[Result] 41 | 42 | 43 | def inputs() -> Sequence[Path]: 44 | return get_files(config.export_path) 45 | 46 | 47 | def history() -> Results: 48 | yield from unique_everseen( 49 | chain( 50 | *map( 51 | partial( 52 | AW.parse_window_events, 53 | logger=logger, 54 | error_policy=config.error_policy, 55 | ), 56 | inputs(), 57 | ) 58 | ), 59 | key=lambda e: e.timestamp, 60 | ) 61 | 62 | 63 | def stats() -> Stats: 64 | from my.core import stat 65 | 66 | return {**stat(history)} 67 | -------------------------------------------------------------------------------- /my/apple/privacy_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the apple privacy Export 3 | https://privacy.apple.com/ 4 | """ 5 | 6 | REQUIRES = ["lxml"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import apple as user_config # type: ignore[attr-defined] 10 | from dataclasses import dataclass 11 | from my.core import PathIsh 12 | 13 | 14 | @dataclass 15 | class config(user_config.privacy_export): 16 | # path to unpacked GDPR archive 17 | gdpr_dir: PathIsh 18 | 19 | 20 | import os 21 | import json 22 | from datetime import datetime, timezone 23 | from pathlib import Path 24 | from typing import Iterator, Dict, Any, NamedTuple, Union, Optional, Sequence 25 | 26 | from lxml import etree # type: ignore[import] 27 | from lxml.etree import _Element 28 | from more_itertools import sliced, first 29 | 30 | Element = Union[_Element, None] 31 | 32 | from my.core import Stats, Res, make_logger 33 | from my.core.cachew import mcachew 34 | 35 | 36 | logger = make_logger(__name__) 37 | 38 | 39 | class Game(NamedTuple): 40 | name: str 41 | last_played: datetime 42 | 43 | 44 | # some duplication here to allow cachew usage 45 | class GameLeaderboardData(NamedTuple): 46 | game_name: str 47 | title: str 48 | dt: datetime 49 | rank: int 50 | 51 | 52 | class GameAchievement(NamedTuple): 53 | dt: datetime 54 | percentage: int 55 | game_name: str 56 | 
title: str 57 | 58 | @property 59 | def achieved(self) -> bool: 60 | return self.percentage == 100 61 | 62 | 63 | class Location(NamedTuple): 64 | lng: float 65 | lat: float 66 | dt: datetime 67 | name: str 68 | address: Optional[str] 69 | 70 | 71 | Event = Union[ 72 | Game, 73 | GameLeaderboardData, 74 | GameAchievement, 75 | Location, 76 | ] 77 | 78 | Results = Iterator[Res[Event]] 79 | 80 | 81 | def _depends_on() -> Sequence[Path]: 82 | return sorted(Path(config.gdpr_dir).expanduser().absolute().rglob("*")) 83 | 84 | 85 | @mcachew(depends_on=_depends_on, logger=logger) 86 | def events() -> Results: 87 | gdpr_dir = Path(config.gdpr_dir).expanduser().absolute() # expand path 88 | handler_map = { 89 | "Apple ID account and device information": None, 90 | "Apple Online and Retail Stores": None, 91 | "iCloud Bookmarks": None, # TODO: parse, 92 | "Wallet Activity": None, 93 | "Game Center/Game Center Data.json": _parse_game_center, 94 | "iCloud Notes": None, # TODO: parse/copy? 95 | "Marketing communications": None, 96 | "iCloud Contacts": None, 97 | "iCloud Calendars and Reminders": None, # TODO: parse 98 | "Other data/Apple Features Using iCloud/EventKit/Locations.xml": _parse_locations, 99 | "Other data/Apple Features Using iCloud/Calendar/": _parse_calendar_recents, 100 | "Other data/Apple Features Using iCloud/Mail": None, # probably better to just do an IMAP sync and get all the data 101 | "Other data/": None, # ignore anything else in this directory 102 | } 103 | for f in gdpr_dir.rglob("*"): 104 | handler: Any 105 | for prefix, h in handler_map.items(): 106 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)): 107 | continue 108 | handler = h 109 | break 110 | else: 111 | if f.is_dir(): 112 | # rglob("*") matches directories, ignore those 113 | continue 114 | else: 115 | e = RuntimeError(f"Unhandled file: {f}") 116 | logger.debug(str(e)) 117 | yield e 118 | continue 119 | 120 | if handler is None: 121 | # explicitly ignored 122 | continue 123 | 124 | yield from handler(f) 125 | 126 | 127 | def stats() -> Stats: 128 | from my.core import stat 129 | 130 | return { 131 | **stat(events), 132 | } 133 | 134 | 135 | def _parse_game_center( 136 | f: Path, 137 | ) -> Iterator[Union[Game, GameLeaderboardData, GameAchievement]]: 138 | for gme in json.loads(f.read_text())["games_state"]: 139 | yield Game( 140 | name=gme["game_name"], 141 | last_played=_parse_apple_utc_date(gme["last_played_utc"]), 142 | ) 143 | for lb_inf in gme["leaderboard"]: 144 | for lb_val in lb_inf["leaderboard_score"]: 145 | yield GameLeaderboardData( 146 | game_name=gme["game_name"], 147 | title=lb_inf["leaderboard_title"], 148 | rank=lb_val["rank"], 149 | dt=_parse_apple_utc_date(lb_val["submitted_time_utc"]), 150 | ) 151 | for ach_info in gme["achievements"]: 152 | yield GameAchievement( 153 | dt=_parse_apple_utc_date(ach_info["last_update_utc"]), 154 | game_name=gme["game_name"], 155 | percentage=ach_info["percentage_complete"], 156 | title=ach_info["achievements_title"], 157 | ) 158 | 159 | 160 | def _parse_locations(f: Path) -> Iterator[Location]: 161 | tr = etree.parse(str(f)) 162 | for location in _parse_apple_xml_val(tr.find("array")): 163 | loc_data: Dict[str, Any] = first(list(location.values())) 164 | if "t" in loc_data: 165 | for tstamp in loc_data["t"]: 166 | yield Location( 167 | lng=loc_data["map location"]["longitude"], 168 | lat=loc_data["map location"]["latitude"], 169 | name=loc_data["display name"], 170 | address=loc_data["address"], 171 | dt=tstamp, 172 | ) 173 | 174 | 175 | def 
_parse_calendar_recents(f: Path) -> Iterator[Location]: 176 | tr = etree.parse(str(f)) 177 | for location in _parse_apple_xml_val(tr.find("array")): 178 | loc_data: Dict[str, Any] = first(list(location.values())) 179 | if "map location" in loc_data: 180 | if "t" in loc_data: 181 | for tstamp in loc_data["t"]: 182 | yield Location( 183 | lng=loc_data["map location"]["longitude"], 184 | lat=loc_data["map location"]["latitude"], 185 | name=loc_data["display name"], 186 | address=first(loc_data.get("addressArray", []), None), 187 | dt=tstamp, 188 | ) 189 | 190 | 191 | # parses apples XML file format, specifies what should be JSON as XML 192 | def _parse_apple_xml_val(xml_el: Element) -> Any: 193 | if xml_el is None: 194 | return None 195 | if xml_el.tag == "array": 196 | return [_parse_apple_xml_val(el) for el in xml_el] 197 | elif xml_el.tag == "dict": 198 | return { 199 | key.text: _parse_apple_xml_val(val) for key, val in sliced(list(xml_el), 2) 200 | } 201 | elif xml_el.tag == "string": 202 | return xml_el.text 203 | elif xml_el.tag == "integer": 204 | assert xml_el.text is not None, f"integer tag has no text: {xml_el}" 205 | return int(xml_el.text) 206 | elif xml_el.tag == "real": 207 | assert xml_el.text is not None, f"real tag has no text: {xml_el}" 208 | return float(xml_el.text) 209 | elif xml_el.tag == "date": 210 | # TODO: make sure this is parsing dates properly 211 | # is this UTC? probably, since others are 212 | assert xml_el.text is not None, f"date tag has no text: {xml_el}" 213 | return datetime.astimezone( 214 | datetime.fromisoformat(xml_el.text.rstrip("Z")), tz=timezone.utc 215 | ) 216 | elif xml_el.tag == "data": 217 | return xml_el.text # BASE64 data, dont think I need this 218 | else: 219 | raise RuntimeError(f"Unknown tag: {xml_el.tag}") 220 | 221 | 222 | def _parse_apple_utc_date(dstr: str) -> datetime: 223 | return datetime.astimezone( 224 | datetime.strptime(dstr.rstrip("Z"), r"%m/%d/%Y %H:%M:%S"), tz=timezone.utc 225 | ) 226 | -------------------------------------------------------------------------------- /my/bash.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses bash history (mostly from servers/vps I run) 3 | using the following bootstrap script: 4 | https://github.com/purarue/bootstrap/ 5 | 6 | This parses bash history with the following configuration: 7 | 8 | export HISTTIMEFORMAT="%s " 9 | export HISTFILESIZE=-1 10 | export HISTSIZE=-1 11 | shopt -s histappend # dont overwrite history 12 | shopt -s cmdhist # save al-lines of multi-line commands in the same entry 13 | shopt -s lithist # embedded newlines for multi-line commands 14 | 15 | That adds timestamps to history, making it look like: 16 | 17 | #1620931766 18 | command ls 19 | #1620931767 20 | command ls -al 21 | #1620931737 22 | which ls 23 | """ 24 | 25 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 26 | from my.config import bash as user_config # type: ignore[attr-defined] 27 | 28 | from pathlib import Path 29 | from typing import Sequence, List 30 | from datetime import datetime 31 | from typing import NamedTuple, Iterator, Optional 32 | from itertools import chain 33 | 34 | from more_itertools import unique_everseen 35 | 36 | from dataclasses import dataclass 37 | from my.core import get_files, Stats, make_logger, Paths 38 | from my.core.cachew import mcachew 39 | from my.utils.time import parse_datetime_sec 40 | 41 | 42 | @dataclass 43 | class config(user_config): 44 | # path[s]/glob to 
the exported bash history files 45 | export_path: Paths 46 | 47 | 48 | logger = make_logger(__name__) 49 | 50 | 51 | def inputs() -> Sequence[Path]: 52 | return get_files(config.export_path) 53 | 54 | 55 | # represents one history entry (command) 56 | class Entry(NamedTuple): 57 | dt: datetime 58 | command: str 59 | 60 | 61 | Results = Iterator[Entry] 62 | 63 | 64 | def _cachew_depends_on() -> List[float]: 65 | return [p.stat().st_mtime for p in inputs()] 66 | 67 | 68 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 69 | def history() -> Results: 70 | yield from unique_everseen( 71 | chain(*map(_parse_file, inputs())), 72 | key=lambda h: ( 73 | h.dt, 74 | h.command, 75 | ), 76 | ) 77 | 78 | 79 | def _parse_file(histfile: Path) -> Results: 80 | dt: Optional[datetime] = None 81 | command_buf = "" # current command 82 | for line in histfile.open(encoding="latin-1"): 83 | if line.startswith("#"): 84 | # parse lines like '#1620931766' 85 | # possible string datetime 86 | sdt = line[1:].strip() # remove newline 87 | try: 88 | newdt = parse_datetime_sec(sdt) 89 | except Exception as e: 90 | logger.debug(f"Error while parsing datetime {e}") 91 | else: 92 | # this case happens when we successfully parse a datetime line 93 | # yield old data, then set newly parsed data to next items datetime 94 | if dt is not None: 95 | # rstrip \n gets rid of the last newline for each command 96 | yield Entry(dt=dt, command=command_buf.rstrip("\n")) 97 | # set new datetime for next entry 98 | dt = newdt 99 | # overwrite command buffer 100 | command_buf = "" 101 | continue 102 | # otherwise, append. this already includes newline 103 | command_buf += line 104 | # yield final command 105 | if dt is not None and command_buf.strip(): 106 | yield Entry(dt=dt, command=command_buf.rstrip("\n")) 107 | 108 | 109 | def stats() -> Stats: 110 | from my.core import stat 111 | 112 | return {**stat(history)} 113 | -------------------------------------------------------------------------------- /my/blizzard/gdpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses generic event data from my parsed GDPR data 3 | from: https://github.com/purarue/blizzard_gdpr_parser 4 | """ 5 | 6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 7 | from my.config import blizzard as user_config # type: ignore[attr-defined] 8 | from dataclasses import dataclass 9 | from my.core import PathIsh, make_logger 10 | from my.core.cachew import mcachew 11 | 12 | 13 | @dataclass 14 | class config(user_config.gdpr): 15 | # path to the exported data 16 | export_path: PathIsh 17 | 18 | 19 | import json 20 | from pathlib import Path 21 | from datetime import datetime 22 | from typing import NamedTuple, Iterator, Sequence, List 23 | from itertools import chain 24 | 25 | from my.core import get_files, Stats 26 | from my.utils.time import parse_datetime_sec 27 | 28 | 29 | logger = make_logger(__name__) 30 | 31 | 32 | def inputs() -> Sequence[Path]: 33 | return get_files(config.export_path) 34 | 35 | 36 | def _cachew_depends_on() -> List[float]: 37 | return [p.stat().st_mtime for p in inputs()] 38 | 39 | 40 | class Event(NamedTuple): 41 | dt: datetime 42 | event_tag: str 43 | metadata: List[str] 44 | 45 | 46 | Results = Iterator[Event] 47 | 48 | 49 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 50 | def events() -> Results: 51 | yield from chain(*map(_parse_json_file, inputs())) 52 | 53 | 54 | def _parse_json_file(p: Path) -> Results: 55 | for 
e_info in json.loads(p.read_text()): 56 | dt, meta_tuple = e_info 57 | meta_tag, meta_joined = meta_tuple 58 | yield Event( 59 | dt=parse_datetime_sec(dt), 60 | event_tag=meta_tag, 61 | metadata=meta_joined.split("|"), 62 | ) 63 | 64 | 65 | def stats() -> Stats: 66 | from my.core import stat 67 | 68 | return {**stat(events)} 69 | -------------------------------------------------------------------------------- /my/chess/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses chess games from chess.com/lichess.org using 3 | https://github.com/purarue/chess_export 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/chess_export"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import chess as user_config # type: ignore[attr-defined] 10 | 11 | 12 | from pathlib import Path 13 | from typing import Iterator, Sequence, List, Union 14 | from itertools import chain 15 | 16 | import chess_export.chessdotcom.model as cmodel 17 | import chess_export.lichess.model as lmodel 18 | from more_itertools import unique_everseen 19 | 20 | from dataclasses import dataclass 21 | from my.core import get_files, Stats, make_logger, Paths 22 | from my.core.cachew import mcachew 23 | 24 | 25 | @dataclass 26 | class config(user_config.export): 27 | # path[s]/glob to the exported data. These are the resulting JSON files from 'chess_export ... export' 28 | export_path: Paths 29 | 30 | 31 | logger = make_logger(__name__) 32 | 33 | 34 | def inputs() -> Sequence[Path]: 35 | return get_files(config.export_path) 36 | 37 | 38 | # TODO: make extendible? Not sure if anyone has any other things they need to include here though... 39 | Results = Iterator[Union[cmodel.ChessDotComGame, lmodel.LichessGame]] 40 | 41 | 42 | def _cachew_depends_on() -> List[float]: 43 | return [p.stat().st_mtime for p in inputs()] 44 | 45 | 46 | def _parse_export_file(p: Path) -> Results: 47 | # try one, else the other 48 | # typically this raises a KeyError since the JSON didn't match 49 | # what the NamedTuple expects 50 | try: 51 | yield from lmodel.from_export(p) 52 | except Exception: 53 | yield from cmodel.from_export(p) 54 | 55 | 56 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 57 | def history() -> Results: 58 | yield from unique_everseen( 59 | chain(*(_parse_export_file(p) for p in inputs())), key=lambda g: g.end_time 60 | ) 61 | 62 | 63 | def stats() -> Stats: 64 | from my.core import stat 65 | 66 | return {**stat(history)} 67 | -------------------------------------------------------------------------------- /my/discord/data_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Discord Data: messages and events data 3 | """ 4 | 5 | REQUIRES = [ 6 | "git+https://github.com/purarue/discord_data", 7 | "urlextract", 8 | ] 9 | 10 | 11 | from pathlib import Path 12 | from typing import List 13 | from dataclasses import dataclass 14 | 15 | from my.config import discord as user_config # type: ignore[attr-defined] 16 | from my.core import PathIsh, make_config 17 | from my.core.cachew import mcachew 18 | 19 | 20 | @dataclass 21 | class discord_config(user_config.data_export): 22 | # path to the top level discord export directory 23 | # see https://github.com/purarue/discord_data for more info 24 | export_path: PathIsh 25 | 26 | # whether to guess the compression of the files in the export_path 27 | # this uses kompress.ZipPath, which is a bit 
experimental 28 | # 29 | # NOTE: before adding this config flag, this was enabled, 30 | # since guess_compression=True on get_files by default 31 | _use_zippath: bool = True 32 | 33 | 34 | config = make_config(discord_config) 35 | 36 | 37 | from typing import Iterator, Optional, Tuple, Set, NamedTuple 38 | from datetime import datetime 39 | 40 | from my.core import make_logger, Stats, get_files 41 | from my.core.structure import match_structure 42 | from discord_data.parse import parse_messages, parse_activity 43 | from discord_data.model import Activity, Message 44 | from urlextract import URLExtract # type: ignore[import] 45 | 46 | 47 | logger = make_logger(__name__) 48 | 49 | 50 | def _remove_suppression(text: str, first_index: int, second_index: int) -> str: 51 | # add spaces so that text like 52 | # don't get converted into one long link 53 | return ( 54 | text[:first_index] # before URL 55 | + " " 56 | + text[first_index + 1 : second_index] # URL itself 57 | + " " 58 | + text[second_index + 1 :] # after URL 59 | ) 60 | 61 | 62 | extractor = URLExtract() 63 | 64 | 65 | def _remove_link_suppression( 66 | content: str, *, urls: Optional[List[Tuple[str, Tuple[int, int]]]] = None 67 | ) -> str: 68 | # fix content to remove discord link suppression if any links had any 69 | # e.g. this is a suppressed link 70 | 71 | if urls is None: 72 | urls = extractor.find_urls(content, get_indices=True) 73 | 74 | if not urls: 75 | return content.strip() 76 | 77 | for _, (start_index, end_index) in urls: 78 | before_ind = start_index - 1 79 | after_ind = end_index 80 | try: 81 | if content[before_ind] == "<" and content[after_ind] == ">": 82 | content = _remove_suppression(content, before_ind, after_ind) 83 | # could happen if the url didn't have braces and we hit the end of a string 84 | except IndexError: 85 | continue 86 | return content.strip() 87 | 88 | 89 | def test_remove_link_suppression() -> None: 90 | content = "" 91 | left = content.index("<") 92 | right = content.index(">") 93 | assert _remove_suppression(content, left, right) == " test " 94 | 95 | # shouldn't affect this at all 96 | content = "https://urlextract.readthedocs.io" 97 | assert _remove_link_suppression(content) == content 98 | 99 | content = "" 100 | expected = content.strip("<").strip(">") 101 | assert _remove_link_suppression(content) == expected 102 | 103 | content = "Here is some text " 104 | expected = "Here is some text https://urlextract.readthedocs.io" 105 | assert _remove_link_suppression(content) == expected 106 | 107 | content = "text other text" 108 | expected = "text https://urlextract.readthedocs.io other text" 109 | assert _remove_link_suppression(content) == expected 110 | 111 | content = ( 112 | "t other f " 113 | ) 114 | expected = ( 115 | "t https://urlextract.readthedocs.io other github.com f other.website" 116 | ) 117 | assert _remove_link_suppression(content) == expected 118 | 119 | content = "t " 120 | expected = "t https://urlextract.readthedocs.io other.website" 121 | assert _remove_link_suppression(content) == expected 122 | 123 | 124 | def _cachew_depends_on() -> List[str]: 125 | return [str(p) for p in get_files(config.export_path)] 126 | 127 | 128 | EXPECTED_DISCORD_STRUCTURE = ("messages/index.json", "account/user.json") 129 | 130 | 131 | def get_discord_exports() -> Iterator[Path]: 132 | for exp in get_files(config.export_path, guess_compression=config._use_zippath): 133 | # weak type check here, ZipPath is a bit experimental, so don't want a dependency 134 | # see 
https://github.com/karlicoss/HPI/blob/master/my/core/kompress.py#L160 135 | if type(exp).__name__ == "ZipPath": 136 | yield exp 137 | continue 138 | with match_structure( 139 | exp, expected=EXPECTED_DISCORD_STRUCTURE 140 | ) as discord_export: 141 | yield from discord_export 142 | 143 | 144 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 145 | def messages() -> Iterator[Message]: 146 | emitted: Set[int] = set() 147 | for discord_export in get_discord_exports(): 148 | message_dir = discord_export / "messages" 149 | for msg in parse_messages(message_dir): 150 | if isinstance(msg, Exception): 151 | logger.warning(msg) 152 | continue 153 | if msg.message_id in emitted: 154 | continue 155 | yield Message( 156 | message_id=msg.message_id, 157 | timestamp=msg.timestamp, 158 | channel=msg.channel, 159 | content=_remove_link_suppression(msg.content), 160 | attachments=msg.attachments, 161 | ) 162 | emitted.add(msg.message_id) 163 | 164 | 165 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 166 | def activity() -> Iterator[Activity]: 167 | emitted: Set[str] = set() 168 | for discord_export in get_discord_exports(): 169 | activity_dir = discord_export / "activity" 170 | for act in parse_activity(activity_dir): 171 | if isinstance(act, Exception): 172 | logger.warning(act) 173 | continue 174 | if act.event_id in emitted: 175 | continue 176 | yield act 177 | emitted.add(act.event_id) 178 | 179 | 180 | class Reaction(NamedTuple): 181 | message_id: int 182 | emote: str 183 | timestamp: datetime 184 | 185 | 186 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 187 | def reactions() -> Iterator[Reaction]: 188 | for act in activity(): 189 | jd = act.json_data 190 | if "emoji_name" in jd and "message_id" in jd: 191 | yield Reaction( 192 | message_id=int(jd["message_id"]), 193 | emote=jd["emoji_name"], 194 | timestamp=act.timestamp, 195 | ) 196 | 197 | 198 | class AppLaunch(NamedTuple): 199 | name: str 200 | timestamp: datetime 201 | 202 | 203 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 204 | def app_launches() -> Iterator[AppLaunch]: 205 | for act in activity(): 206 | jd = act.json_data 207 | name = jd.get("game") or jd.get("application") 208 | if name is not None: 209 | yield AppLaunch( 210 | name=name, 211 | timestamp=act.timestamp, 212 | ) 213 | 214 | 215 | def stats() -> Stats: 216 | from my.core import stat 217 | 218 | return { 219 | **stat(messages), 220 | **stat(activity), 221 | **stat(reactions), 222 | **stat(app_launches), 223 | } 224 | -------------------------------------------------------------------------------- /my/facebook/gdpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the facebook GPDR Export 3 | """ 4 | 5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 6 | from my.config import facebook as user_config # type: ignore[attr-defined] 7 | from dataclasses import dataclass 8 | from my.core import PathIsh 9 | 10 | 11 | @dataclass 12 | class config(user_config.gdpr): 13 | gdpr_dir: PathIsh # path to unpacked GDPR archive 14 | 15 | 16 | import os 17 | import json 18 | from datetime import datetime 19 | from pathlib import Path 20 | from itertools import chain 21 | from typing import Iterator, Dict, Any, NamedTuple, Union, Optional, List 22 | 23 | 24 | from my.core import get_files, Stats, Res, Json, make_logger 25 | from my.utils.time import parse_datetime_sec 26 | 27 | 28 | logger = make_logger(__name__) 29 | 30 | FacebookJson = Dict[Any, Any] 31 
| 32 | 33 | class Contact(NamedTuple): 34 | name: str 35 | phone_number: str 36 | created: datetime 37 | updated: datetime 38 | 39 | 40 | class Action(NamedTuple): 41 | description: str 42 | dt: datetime 43 | metadata: Json = {} 44 | 45 | 46 | # (logs/account activity) 47 | class AdminAction(NamedTuple): 48 | description: str 49 | dt: datetime 50 | ip: str 51 | user_agent: str 52 | metadata: Json = {} 53 | 54 | 55 | class Search(NamedTuple): 56 | query: str 57 | dt: datetime 58 | 59 | 60 | class UploadedPhoto(NamedTuple): 61 | dt: datetime 62 | ip: str 63 | 64 | 65 | class Post(NamedTuple): 66 | content: str 67 | dt: datetime 68 | action: Optional[str] 69 | 70 | 71 | class Comment(NamedTuple): 72 | action: str 73 | dt: datetime 74 | content: str 75 | metadata: Optional[str] 76 | 77 | 78 | class AcceptedEvent(NamedTuple): 79 | name: str 80 | starts_dt: datetime 81 | ends_dt: datetime 82 | 83 | 84 | class Friend(NamedTuple): 85 | name: str 86 | dt: datetime 87 | added: bool # whether this was when I added a friend or removed one 88 | 89 | 90 | # i.e. a PM 91 | class Message(NamedTuple): 92 | author: str 93 | dt: datetime 94 | content: str 95 | metadata: Optional[str] = None 96 | 97 | 98 | # a chain of messages back and forth, with one or more people 99 | class Conversation(NamedTuple): 100 | title: str 101 | participants: List[str] 102 | messages: List[Message] 103 | 104 | 105 | Event = Union[ 106 | Contact, 107 | Conversation, 108 | Friend, 109 | UploadedPhoto, 110 | AcceptedEvent, 111 | Action, 112 | Post, 113 | Comment, 114 | Search, 115 | AdminAction, 116 | Contact, 117 | ] 118 | 119 | Results = Iterator[Res[Event]] 120 | 121 | 122 | def events() -> Results: 123 | # get files 2 levels deep into the export 124 | gdpr_dir = str(Path(config.gdpr_dir).expanduser().absolute()) # expand path 125 | files = chain(*map(lambda f: f.rglob("*"), get_files(config.gdpr_dir))) 126 | handler_map = { 127 | "about_you/face_recog": None, 128 | "about_you/friend_peer": None, 129 | "about_you/your_address_books": _parse_address_book, 130 | "ads": None, 131 | "apps_and_websites/apps_and_websites": _parse_installed_apps, 132 | "apps_and_websites/posts_from_apps_and_websites": _parse_app_posts, 133 | "comments/comments": _parse_group_comments, 134 | "events/event_invitations": None, # just parse the ones I accepted 135 | "events/your_event_responses": _parse_joined_events, 136 | "following_and": None, # I have no data here 137 | "friends/friends": _parse_friends, 138 | "friends/received_friend_requests": None, # Not interested 139 | "friends/rejected_friend": None, # Not interested 140 | "friends/sent_friend": None, # Not interested 141 | "friends/removed_": _parse_deleted_friends, 142 | "groups/your_group_membership": _parse_group_activity, 143 | "groups/your_posts_and_comments": _parse_group_posts, 144 | "likes_and_reactions/pages": _parse_page_likes, 145 | "likes_and_reactions/posts_and_comments": _parse_reactions, 146 | "location": None, # No data 147 | "marketplace": None, 148 | "other_activity": None, 149 | "pages": None, 150 | "payment_history": None, 151 | "photos_and_videos/album": _parse_photo_ips, # ip info for where images were uplodaed from 152 | "photos_and_videos/": None, # pull these out in my/photos.py 153 | "profile_information/profile_information.json": None, 154 | "saved_items": None, 155 | "stories": None, 156 | "your_places": None, 157 | "posts/your_posts": _parse_posts, 158 | "search_history": _parse_search_history, 159 | "profile_information/profile_update_history": _parse_posts, 160 | 
"messages/stickers_used": None, # no one needs stickers o_o 161 | "messages/": _parse_conversation, 162 | "security_and_login_information/account_activity": _parse_account_activity, 163 | "security_and_login_information/authorized_logins": _parse_authorized_logins, 164 | "security_and_login_information/administrative_records": _parse_admin_records, 165 | "security_and_login_information/where_you": None, 166 | "security_and_login_information/used_ip_addresses": None, 167 | "security_and_login_information/account_status_changes": None, 168 | "security_and_login_information/logins_and_logouts": None, 169 | "security_and_login_information/login_protection": None, 170 | "security_and_login_information/datr_cookie": None, 171 | "posts/other_people's_posts_to_your_timeline": None, # maybe implement this? OtherComment NamedTuple? Comment should just be mine 172 | } 173 | for f in files: 174 | handler: Any 175 | for prefix, h in handler_map.items(): 176 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)): 177 | continue 178 | handler = h 179 | break 180 | else: 181 | if f.is_dir(): 182 | # rglob("*") matches directories, as well as any subredirectories/json files in those 183 | # this is here exclusively for the messages dir, which has a larger structure 184 | # json files from inside the dirs are still picked up by rglob 185 | continue 186 | else: 187 | e = RuntimeError(f"Unhandled file: {f}") 188 | logger.debug(str(e)) 189 | yield e 190 | continue 191 | 192 | if handler is None: 193 | # explicitly ignored 194 | continue 195 | 196 | if f.suffix != ".json": 197 | continue 198 | 199 | j = json.loads(f.read_text()) 200 | yield from handler(j) 201 | 202 | 203 | def _parse_address_book(d: FacebookJson) -> Iterator[Contact]: 204 | # remove top-level address book name 205 | for addr_book_top in d.values(): 206 | for addr_book_list in addr_book_top.values(): 207 | for contact in addr_book_list: 208 | yield Contact( 209 | name=contact["name"], 210 | phone_number=contact["details"][0]["contact_point"], 211 | created=parse_datetime_sec(contact["created_timestamp"]), 212 | updated=parse_datetime_sec(contact["updated_timestamp"]), 213 | ) 214 | 215 | 216 | def _parse_installed_apps(d: FacebookJson) -> Iterator[Action]: 217 | for app in d["installed_apps"]: 218 | yield Action( 219 | description="{} was installed".format(app["name"]), 220 | dt=parse_datetime_sec(app["added_timestamp"]), 221 | ) 222 | 223 | 224 | def _parse_app_posts(d: FacebookJson) -> Iterator[Action]: 225 | for post in d["app_posts"]: 226 | yield Action( 227 | description=post["title"], dt=parse_datetime_sec(post["timestamp"]) 228 | ) 229 | 230 | 231 | def _parse_photo_ips(d: FacebookJson) -> Iterator[UploadedPhoto]: 232 | for photo_info in d["photos"]: 233 | if ( 234 | "media_metadata" in photo_info 235 | and "photo_metadata" in photo_info["media_metadata"] 236 | and "upload_ip" in photo_info["media_metadata"]["photo_metadata"] 237 | ): 238 | yield UploadedPhoto( 239 | dt=parse_datetime_sec(photo_info["creation_timestamp"]), 240 | ip=photo_info["media_metadata"]["photo_metadata"]["upload_ip"], 241 | ) 242 | 243 | 244 | def _parse_group_comments(d: FacebookJson) -> Iterator[Comment]: 245 | for comment in d["comments"]: 246 | yield Comment( 247 | content=comment["data"][0]["comment"]["comment"], 248 | action=comment["title"], 249 | dt=parse_datetime_sec(comment["timestamp"]), 250 | metadata=comment["data"][0]["comment"]["group"], 251 | ) 252 | 253 | 254 | def _parse_joined_events(d: FacebookJson) -> Iterator[AcceptedEvent]: 255 | for 
event in d["event_responses"]["events_joined"]: 256 | yield AcceptedEvent( 257 | name=event["name"], 258 | starts_dt=parse_datetime_sec(event["start_timestamp"]), 259 | ends_dt=parse_datetime_sec(event["end_timestamp"]), 260 | ) 261 | 262 | 263 | def _parse_friends(d: FacebookJson) -> Iterator[Friend]: 264 | for friend in d["friends"]: 265 | yield Friend( 266 | name=friend["name"], dt=parse_datetime_sec(friend["timestamp"]), added=True 267 | ) 268 | 269 | 270 | def _parse_deleted_friends(d: FacebookJson) -> Iterator[Friend]: 271 | for friend in d["deleted_friends"]: 272 | yield Friend( 273 | name=friend["name"], dt=parse_datetime_sec(friend["timestamp"]), added=False 274 | ) 275 | 276 | 277 | def _parse_group_activity(d: FacebookJson) -> Iterator[Action]: 278 | for gr in d["groups_joined"]: 279 | yield Action( 280 | description=gr["title"], 281 | dt=parse_datetime_sec(gr["timestamp"]), 282 | ) 283 | 284 | 285 | def _parse_group_posts(d: FacebookJson) -> Iterator[Union[Comment, Post]]: 286 | for log_data_list in d.values(): 287 | for comm_list in log_data_list.values(): 288 | for comm in comm_list: 289 | data_keys = comm["data"][0].keys() 290 | if "comment" in data_keys: 291 | yield Comment( 292 | content=comm["data"][0]["comment"]["comment"], 293 | action=comm["title"], 294 | dt=parse_datetime_sec(comm["timestamp"]), 295 | metadata=comm["data"][0]["comment"]["group"], 296 | ) 297 | else: 298 | yield Post( 299 | content=comm["data"][0]["post"], 300 | action=comm["title"], 301 | dt=parse_datetime_sec(comm["timestamp"]), 302 | ) 303 | 304 | 305 | def _parse_page_likes(d: FacebookJson) -> Iterator[Action]: 306 | for page in d["page_likes"]: 307 | yield Action( 308 | description="Liked Page {}".format(page["name"]), 309 | dt=parse_datetime_sec(page["timestamp"]), 310 | ) 311 | 312 | 313 | def _parse_reactions(d: FacebookJson) -> Iterator[Action]: 314 | for react in d["reactions"]: 315 | yield Action( 316 | description=react["title"], dt=parse_datetime_sec(react["timestamp"]) 317 | ) 318 | 319 | 320 | def _parse_search_history(d: FacebookJson) -> Iterator[Search]: 321 | for search in d["searches"]: 322 | assert len(search["data"]) == 1 323 | yield Search( 324 | query=search["data"][0]["text"], dt=parse_datetime_sec(search["timestamp"]) 325 | ) 326 | 327 | 328 | def _parse_conversation( 329 | d: FacebookJson, 330 | ) -> Iterator[Res[Conversation]]: # will only return 1 convo 331 | participants: List[str] = [p["name"] for p in d["participants"]] 332 | good_messages: List[Message] = [] 333 | for m in _parse_messages_in_conversation(d["messages"]): 334 | # propagate up exception if one exists 335 | if isinstance(m, Exception): 336 | yield m 337 | else: 338 | good_messages.append(m) 339 | yield Conversation( 340 | participants=participants, 341 | title=d["title"], 342 | messages=good_messages, 343 | ) 344 | 345 | 346 | def _parse_messages_in_conversation( 347 | messages: List[FacebookJson], 348 | ) -> Iterator[Res[Message]]: 349 | for m in messages: 350 | timestamp = parse_datetime_sec(m["timestamp_ms"] / 1000) 351 | author = m["sender_name"] 352 | if m["type"] == "Unsubscribe": 353 | continue 354 | elif m["type"] in ["Generic", "Share"]: 355 | # eh, I dont care that much about these in context, can do analysis on my/photos.py on its own 356 | if any([k in m for k in ["photos", "sticker"]]): 357 | continue 358 | elif "content" in m: 359 | yield Message( 360 | dt=timestamp, 361 | author=author, 362 | content=m["content"], 363 | metadata=m.get("share"), 364 | ) 365 | # if this just actually does not 
have a field with content for some reason, ignore it 366 | elif set(m.keys()).issubset(set(["sender_name", "timestamp_ms", "type"])): 367 | continue 368 | else: 369 | yield RuntimeError( 370 | "Not sure how to parse message without 'photos' or 'content': {}".format( 371 | m 372 | ) 373 | ) 374 | else: 375 | yield RuntimeError("Not sure how to parse message for type: {}".format(m)) 376 | 377 | 378 | # yikes. this is pretty much whenever I posted *anything*, or a third party app communicated 379 | # back to facebook that I listened to something/played a game, so it has like 5000 events 380 | # 381 | # not sure if I hit all the types, but this yields RuntimeErrors if it can't parse something, 382 | # so just check hpi doctor to make sure its all gooood 383 | # or 384 | # list(filter(lambda e: isinstance(e, Exception), events())), 385 | # throw a 'import pdb; pdb.set_trace()' at where its throwing the error 386 | # and add a new case for a new type of post 387 | def _parse_posts(d: FacebookJson) -> Iterator[Res[Union[Post, Action]]]: 388 | all_posts = d 389 | # handle both profile updates and posts 390 | if isinstance(all_posts, dict) and "profile_updates" in all_posts: 391 | all_posts = all_posts["profile_updates"] 392 | for post in all_posts: 393 | if "attachments" in post: 394 | att = post["attachments"] 395 | # e.g. photo with a description 396 | # make sure the structure looks like a media post 397 | # traverse into the image metadata post to see if we can find a description 398 | if len(att) >= 1 and "data" in att[0] and len(att[0]["data"]) >= 1: 399 | # make sure each data item has only one item of media 400 | if all([len(attach["data"]) == 1 for attach in att]): 401 | att_data = [attach["data"][0] for attach in att] 402 | # switch, over posts that have descriptions (e.g. me describing what the photo is), and posts that dont 403 | for dat in att_data: 404 | if "media" in dat: 405 | mdat = dat["media"] 406 | # image where I described something 407 | if "description" in mdat: 408 | yield Action( 409 | description=mdat["description"], 410 | dt=parse_datetime_sec(post["timestamp"]), 411 | metadata=mdat, 412 | ) 413 | # image when I just posted to a album 414 | elif "title" in mdat: 415 | yield Action( 416 | description="Posted to Album {}".format( 417 | mdat["title"] 418 | ), 419 | dt=parse_datetime_sec(post["timestamp"]), 420 | metadata=mdat, 421 | ) 422 | else: 423 | yield RuntimeError( 424 | "No known way to parse image post {}".format(post) 425 | ) 426 | elif "place" in dat: 427 | # check-in into place 428 | if "name" in dat["place"]: 429 | yield Action( 430 | description="Visited {}".format( 431 | dat["place"]["name"] 432 | ), 433 | dt=parse_datetime_sec(post["timestamp"]), 434 | metadata=dat, 435 | ) 436 | else: 437 | yield RuntimeError( 438 | "No known way to parse location post {}".format( 439 | post 440 | ) 441 | ) 442 | elif "life_event" in dat: 443 | # started high school etc. 444 | ddat = dat["life_event"] 445 | yield Action( 446 | description=ddat["title"], 447 | dt=parse_datetime_sec(post["timestamp"]), 448 | metadata=ddat, 449 | ) 450 | # third party app event (e.g. Listened to Spotify Song) 451 | elif "title" in post: 452 | if "external_context" in dat: 453 | if "title" in post: 454 | yield Action( 455 | description=post["title"], 456 | dt=parse_datetime_sec(post["timestamp"]), 457 | metadata=dat, 458 | ) 459 | # seems like bad data handling on facebooks part. 
460 | # these are still events, 461 | # but it does not have an external context, 462 | # its like a stringified version of the data 463 | elif "text" in dat: 464 | yield Action( 465 | description=post["title"], 466 | dt=parse_datetime_sec(post["timestamp"]), 467 | metadata=dat, 468 | ) 469 | else: 470 | yield RuntimeError( 471 | "No known way to parse attachment post with title {}".format( 472 | post 473 | ) 474 | ) 475 | else: # unknown data type 476 | yield RuntimeError( 477 | "No known way to parse data type with attachment {}".format( 478 | post 479 | ) 480 | ) 481 | else: # unknown structure 482 | yield RuntimeError( 483 | "No known way to parse data from post {}".format(post) 484 | ) 485 | else: 486 | yield RuntimeError( 487 | "No known way to parse attachment post {}".format(post) 488 | ) 489 | elif "data" in post and len(post["data"]) == 1: 490 | dat = post["data"][0] 491 | # basic post I wrote on my timeline 492 | if "post" in dat and isinstance(dat["post"], str) and "title" in post: 493 | yield Post( 494 | content=dat["post"], 495 | dt=parse_datetime_sec(post["timestamp"]), 496 | action=post["title"], 497 | ) 498 | elif "profile_update" in dat: 499 | yield Action( 500 | description="Updated Profile", 501 | dt=parse_datetime_sec(post["timestamp"]), 502 | metadata=dat["profile_update"], 503 | ) 504 | else: 505 | yield RuntimeError("No known way to parse basic post {}".format(post)) 506 | # post without any actual content (e.g. {'timestamp': 1334515711, 'title': 'purarue posted in club'}) 507 | # treat this as an action since I have no content here 508 | elif set(("timestamp", "title")) == set(post.keys()): 509 | yield Action( 510 | description=post["title"], dt=parse_datetime_sec(post["timestamp"]) 511 | ) 512 | else: 513 | yield RuntimeError("No known way to parse post {}".format(post)) 514 | 515 | 516 | def _parse_account_activity(d: FacebookJson) -> Iterator[AdminAction]: 517 | for ac in d["account_activity"]: 518 | yield AdminAction( 519 | description=ac["action"], 520 | dt=parse_datetime_sec(ac["timestamp"]), 521 | ip=ac["ip_address"], 522 | user_agent=ac["user_agent"], 523 | ) 524 | 525 | 526 | def _parse_authorized_logins(d: FacebookJson) -> Iterator[AdminAction]: 527 | for ac in d["recognized_devices"]: 528 | metadata = {} 529 | if "updated_timestamp" in ac: 530 | metadata["updated_at"] = parse_datetime_sec(ac["updated_timestamp"]) 531 | yield AdminAction( 532 | description="Known Device: {}".format(ac["name"]), 533 | dt=parse_datetime_sec(ac["created_timestamp"]), 534 | ip=ac["ip_address"], 535 | user_agent=ac["user_agent"], 536 | metadata=metadata, 537 | ) 538 | 539 | 540 | def _parse_admin_records(d: FacebookJson) -> Iterator[AdminAction]: 541 | for rec in d["admin_records"]: 542 | s = rec["session"] 543 | yield AdminAction( 544 | description=rec["event"], 545 | dt=parse_datetime_sec(s["created_timestamp"]), 546 | ip=s["ip_address"], 547 | user_agent=s["user_agent"], 548 | ) 549 | 550 | 551 | def stats() -> Stats: 552 | from my.core import stat 553 | 554 | return { 555 | **stat(events), 556 | } 557 | -------------------------------------------------------------------------------- /my/grouvee/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the CSV export from https://www.grouvee.com/ 3 | """ 4 | 5 | REQUIRES = ["git+https://github.com/purarue/grouvee_export"] 6 | 7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 8 | from my.config import grouvee as 
user_config # type: ignore[attr-defined] 9 | 10 | from pathlib import Path 11 | from typing import Iterator, List 12 | from functools import lru_cache 13 | 14 | from more_itertools import last 15 | import grouvee_export.dal as G 16 | 17 | from dataclasses import dataclass 18 | from my.core import get_files, Stats, Paths 19 | 20 | 21 | @dataclass 22 | class config(user_config.export): 23 | # path[s]/glob to the exported CSV files 24 | export_path: Paths 25 | 26 | 27 | def _latest_input() -> Path: 28 | """Since the exports are complete exports, can just use the most recent export""" 29 | return last(sorted(get_files(config.export_path), key=lambda p: p.stat().st_mtime)) 30 | 31 | 32 | # should typically only parse the latest dump 33 | @lru_cache(maxsize=None) 34 | def _read_grouvee_export(p: Path) -> List[G.Game]: 35 | return list(G.parse_export(p)) 36 | 37 | 38 | def games() -> Iterator[G.Game]: 39 | yield from _read_grouvee_export(_latest_input()) 40 | 41 | 42 | def _filter_games_for_shelf(name: str) -> Iterator[G.Game]: 43 | for game in games(): 44 | if name in (s.name for s in game.shelves): 45 | yield game 46 | 47 | 48 | def played() -> Iterator[G.Game]: 49 | """Games I've Played""" 50 | yield from _filter_games_for_shelf("Played") 51 | 52 | 53 | def watched() -> Iterator[G.Game]: 54 | """Games I've watched, not played""" 55 | yield from _filter_games_for_shelf("Watched") 56 | 57 | 58 | def backlog() -> Iterator[G.Game]: 59 | """Games on my backlog""" 60 | yield from _filter_games_for_shelf("Backlog") 61 | 62 | 63 | def wish_list() -> Iterator[G.Game]: 64 | """Games on my wish list""" 65 | yield from _filter_games_for_shelf("Wish List") 66 | 67 | 68 | def stats() -> Stats: 69 | from my.core import stat 70 | 71 | return { 72 | **stat(played), 73 | **stat(watched), 74 | **stat(backlog), 75 | **stat(wish_list), 76 | } 77 | -------------------------------------------------------------------------------- /my/ip/all.py: -------------------------------------------------------------------------------- 1 | """ 2 | Combines IPs from data exports which include IP addresses 3 | """ 4 | 5 | REQUIRES = ["git+https://github.com/purarue/ipgeocache"] 6 | 7 | from typing import Iterator 8 | 9 | from my.ip.common import IP # type: ignore[import] 10 | 11 | from my.core import make_logger, Stats 12 | 13 | logger = make_logger(__name__) 14 | 15 | 16 | # can add more sources here, or disable them through core.disabled_modules 17 | def ips() -> Iterator[IP]: 18 | from . import facebook 19 | from . import discord 20 | from . 
import blizzard 21 | 22 | yield from facebook.ips() 23 | yield from discord.ips() 24 | yield from blizzard.ips() 25 | 26 | 27 | def stats() -> Stats: 28 | from my.core import stat 29 | 30 | return {**stat(ips)} 31 | -------------------------------------------------------------------------------- /my/ip/blizzard.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | 3 | from my.core import Stats 4 | from my.core.source import import_source 5 | from my.ip.common import IP # type: ignore[import] 6 | 7 | 8 | @import_source(module_name="my.blizzard.gdpr") 9 | def ips() -> Iterator[IP]: 10 | from my.blizzard.gdpr import events as blizzard_events 11 | 12 | for e in blizzard_events(): 13 | if e.event_tag == "Activity History": 14 | yield IP(dt=e.dt, addr=e.metadata[-2]) 15 | 16 | 17 | def stats() -> Stats: 18 | from my.core import stat 19 | 20 | return {**stat(ips)} 21 | -------------------------------------------------------------------------------- /my/ip/discord.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | 3 | from my.ip.common import IP, drop_private # type: ignore[import] 4 | 5 | 6 | from my.core import make_logger, Stats 7 | from my.core.cachew import mcachew 8 | from my.core.source import import_source 9 | 10 | logger = make_logger(__name__) 11 | 12 | 13 | @import_source(module_name="my.discord.data_export") 14 | def ips() -> Iterator[IP]: 15 | from my.discord.data_export import activity, _cachew_depends_on 16 | 17 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 18 | def _discord_ips() -> Iterator[IP]: 19 | for a in activity(): 20 | if a.fingerprint.ip is not None: 21 | yield IP(dt=a.timestamp, addr=a.fingerprint.ip) 22 | 23 | yield from drop_private(_discord_ips()) 24 | 25 | 26 | def stats() -> Stats: 27 | from my.core import stat 28 | 29 | return {**stat(ips)} 30 | -------------------------------------------------------------------------------- /my/ip/facebook.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, List 2 | from pathlib import Path 3 | 4 | from my.ip.common import IP, drop_private # type: ignore[import] 5 | 6 | from my.core import make_logger, Stats 7 | from my.core.source import import_source 8 | from my.core.cachew import mcachew 9 | 10 | 11 | logger = make_logger(__name__) 12 | 13 | 14 | def _cachew_depends_on() -> List[float]: 15 | from my.facebook.gdpr import config as facebook_config 16 | 17 | return [p.stat().st_mtime for p in Path(facebook_config.gdpr_dir).rglob("*")] 18 | 19 | 20 | @import_source(module_name="my.facebook.gdpr") 21 | def ips() -> Iterator[IP]: 22 | from my.facebook.gdpr import ( 23 | AdminAction, 24 | UploadedPhoto, 25 | events as facebook_events, 26 | ) 27 | 28 | @mcachew( 29 | depends_on=_cachew_depends_on, 30 | logger=logger, 31 | ) 32 | def _facebook_ips() -> Iterator[IP]: 33 | for e in facebook_events(): 34 | if isinstance(e, AdminAction) or isinstance(e, UploadedPhoto): 35 | if not isinstance(e, Exception): 36 | yield IP(dt=e.dt, addr=e.ip) 37 | 38 | yield from drop_private(_facebook_ips()) 39 | 40 | 41 | def stats() -> Stats: 42 | from my.core import stat 43 | 44 | return {**stat(ips)} 45 | -------------------------------------------------------------------------------- /my/ipython.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get IPython (REPL) History with datetimes 3 | 
https://ipython.readthedocs.io/en/stable/api/generated/IPython.core.history.html?highlight=hist#IPython.core.history.HistoryAccessor.__init__ 4 | 5 | In order to save python history with timestamps, I define the following in my zshrc: 6 | 7 | # if I type python without any arguments, launch ipython instead 8 | python() { python3 "$@" } 9 | python3() { 10 | if (( $# == 0 )); then 11 | echo -e "$(tput setaf 2)Launching ipython instead...$(tput sgr0)" 12 | ipython 13 | else 14 | /usr/bin/python3 "$@" 15 | fi 16 | } 17 | """ 18 | 19 | REQUIRES = ["ipython>=8.5.0"] 20 | 21 | 22 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 23 | from my.config import ipython as user_config # type: ignore[attr-defined] 24 | 25 | from dataclasses import dataclass 26 | from my.core import Paths 27 | 28 | 29 | @dataclass 30 | class config(user_config): 31 | # path[s]/glob to the exported ipython sqlite databases 32 | export_path: Paths 33 | 34 | 35 | from pathlib import Path 36 | from datetime import datetime 37 | from typing import Iterable, NamedTuple, Iterator, Optional 38 | from itertools import chain 39 | 40 | from more_itertools import unique_everseen 41 | from IPython.core.history import HistoryAccessor 42 | 43 | from my.core import get_files, Stats, make_logger 44 | 45 | logger = make_logger(__name__) 46 | 47 | 48 | class Command(NamedTuple): 49 | dt: datetime 50 | command: str 51 | 52 | 53 | Results = Iterator[Command] 54 | 55 | 56 | # Return backed up sqlite databases 57 | def inputs() -> Iterable[Path]: 58 | yield from get_files(config.export_path) 59 | 60 | 61 | def _live_history() -> Results: 62 | # the empty string makes IPython use the live history file ~/.local/share/ipython/.../history.sqlite 63 | # instead of one of the files from the export backup 64 | # merge histories combines those 65 | # 66 | # seems that this has the possibility to fail to locate your live 67 | # history file if it's being run in the background? 
unsure why 68 | try: 69 | yield from _parse_database(sqlite_database="") 70 | except Exception as e: 71 | logger.warning(f"Failed to get data from current ipython database: {e}") 72 | return 73 | 74 | 75 | def history() -> Results: 76 | yield from unique_everseen( 77 | chain(*(_parse_database(str(p)) for p in inputs()), _live_history()), 78 | key=lambda h: (h.command, h.dt), 79 | ) 80 | 81 | 82 | def _parse_database(sqlite_database: str) -> Results: 83 | hist = HistoryAccessor(hist_file=sqlite_database) # type: ignore[no-untyped-call] 84 | try: 85 | total_sessions: Optional[int] = hist.get_last_session_id() 86 | except Exception as e: 87 | logger.warning(f"Failed to get last session id: {e}") 88 | # if database is corrupt/fails to compute sessions, skip 89 | return 90 | if total_sessions is None: 91 | return 92 | # yes, these start at 1 93 | for sess in range(1, total_sessions + 1): 94 | # get when this session started, use that as timestamp 95 | session_info = hist.get_session_info(sess) 96 | assert len(session_info) == 5 # sanity checks 97 | start_time = session_info[1] 98 | assert isinstance(start_time, datetime) 99 | for msg in hist.get_range(sess).fetchall(): # sqlite cursor 100 | assert len(msg) == 3 101 | assert isinstance(msg[-1], str) 102 | yield Command(command=msg[-1], dt=start_time) 103 | 104 | 105 | def stats() -> Stats: 106 | from my.core import stat 107 | 108 | return {**stat(history)} 109 | -------------------------------------------------------------------------------- /my/league/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses league of legend history from https://github.com/purarue/lolexport 3 | """ 4 | 5 | REQUIRES = ["git+https://github.com/purarue/lolexport"] 6 | 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import league as user_config # type: ignore[attr-defined] 10 | from dataclasses import dataclass 11 | from my.core import Paths 12 | 13 | 14 | @dataclass 15 | class config(user_config.export): 16 | # path[s]/glob to the exported data. 
These are the resulting json file from 'lolexport parse', or v5 exports 17 | export_path: Paths 18 | 19 | # league of legends username 20 | username: str 21 | 22 | 23 | from pathlib import Path 24 | from typing import Iterator, Sequence, Optional 25 | 26 | from my.core import get_files, Stats, Res, make_logger 27 | 28 | from lolexport.merge import Game, merge_game_histories 29 | import lolexport.log as llog 30 | from logzero import setup_logger # type: ignore[import] 31 | 32 | logger = make_logger(__name__) 33 | 34 | # configure logs 35 | llog.logger = setup_logger(name="lolexport", level=logger.level) 36 | 37 | 38 | def inputs() -> Sequence[Path]: 39 | return get_files(config.export_path) 40 | 41 | 42 | Results = Iterator[Res[Game]] 43 | 44 | 45 | def history(summoner_name: Optional[str] = None) -> Results: 46 | sname = summoner_name or config.username 47 | for g in merge_game_histories(list(inputs()), username=sname): 48 | try: 49 | g._serialize() # try parsing the data from this 50 | yield g 51 | except Exception as ex: 52 | yield ex 53 | 54 | 55 | def stats() -> Stats: 56 | from my.core import stat 57 | 58 | return {**stat(history)} 59 | -------------------------------------------------------------------------------- /my/linkedin/privacy_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the linkedin privacy/export 3 | https://www.linkedin.com/help/linkedin/answer/50191/downloading-your-account-data?lang=en 4 | """ 5 | 6 | REQUIRES = ["dateparser"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import linkedin as user_config # type: ignore[attr-defined] 10 | 11 | from dataclasses import dataclass 12 | from my.core import PathIsh 13 | 14 | 15 | @dataclass 16 | class config(user_config.privacy_export): 17 | # path to unpacked privacy export, or a zip 18 | gdpr_dir: PathIsh 19 | 20 | 21 | import csv 22 | from datetime import datetime, timezone 23 | from pathlib import Path 24 | from typing import Iterator, Dict, cast, Optional 25 | from io import StringIO 26 | 27 | import dateparser 28 | 29 | from my.core import Stats, make_logger 30 | from my.core.structure import match_structure 31 | 32 | 33 | logger = make_logger(__name__) 34 | 35 | 36 | EXPECTED = ( 37 | "Registration.csv", 38 | "messages.csv", 39 | "Jobs", 40 | "Profile.csv", 41 | ) 42 | 43 | 44 | def input() -> Path: 45 | return Path(config.gdpr_dir).expanduser().absolute() 46 | 47 | 48 | DATELIKE_KEYS = {"date", "time"} 49 | ENDSWITH_KEYS = {" on", " at"} 50 | 51 | 52 | def _dateparser_to_utc(val: str) -> Optional[datetime]: 53 | dt_data = dateparser.parse(val) 54 | if dt_data is not None: 55 | return datetime.fromtimestamp(dt_data.timestamp(), tz=timezone.utc) 56 | return None 57 | 58 | 59 | @dataclass 60 | class Event: 61 | data: Dict[str, str] 62 | event_type: str # file name this was read from 63 | 64 | def iter_dts(self) -> Iterator[datetime]: 65 | for k, v in self.data.items(): 66 | kl = k.lower() 67 | for en in ENDSWITH_KEYS: 68 | if kl.endswith(en): 69 | data = _dateparser_to_utc(v) 70 | if data is not None: 71 | yield data 72 | for dk in DATELIKE_KEYS: 73 | if dk in kl: 74 | data = _dateparser_to_utc(v) 75 | if data is not None: 76 | yield data 77 | 78 | @property 79 | def dt(self) -> Optional[datetime]: 80 | """Try to parse a datetime from this event""" 81 | if hasattr(self, "_dt"): 82 | return cast(datetime, getattr(self, "_dt")) 83 | dts = list(set(self.iter_dts())) 84 | if 
len(dts) >= 1: 85 | if len(dts) >= 2: 86 | logger.debug(f"Parsed multiple dates from {self.data}: {dts}") 87 | setattr(self, "_dt", dts[0]) 88 | return dts[0] 89 | return None 90 | 91 | 92 | Results = Iterator[Event] 93 | 94 | 95 | def events() -> Iterator[Event]: 96 | with match_structure(input(), expected=EXPECTED, partial=True) as exports: 97 | for exp in exports: 98 | for csv_file in exp.rglob("*"): 99 | if not csv_file.is_file(): 100 | continue 101 | yield from _csv_to_json(csv_file) 102 | 103 | 104 | # TODO: cache? 105 | def connections() -> Iterator[Event]: 106 | yield from filter(lambda f: f.event_type == "connections", events()) 107 | 108 | 109 | def _csv_to_json(p: Path) -> Iterator[Event]: 110 | event_type = p.stem.strip().casefold().replace(" ", "_") 111 | text = p.read_text() 112 | # some items have 'Notes:' at the top, which are useless when parsing 113 | if text.startswith("Notes:\n"): 114 | # hopefully this is robust enough? -- seems to always be nother line after the note 115 | if "\n\n" in text.strip(): 116 | text = text.split("\n\n", maxsplit=1)[1] 117 | reader = csv.reader(StringIO(text)) 118 | header = next(reader) 119 | header_mapping = {i: t for i, t in enumerate(header)} 120 | for line in reader: 121 | # ignore empty lines -- not sure why they're present sometimes 122 | if "".join(line).strip() == "": 123 | continue 124 | yield Event( 125 | event_type=event_type, 126 | data={header_mapping[i]: line[i] for i in header_mapping}, 127 | ) 128 | 129 | 130 | def stats() -> Stats: 131 | from my.core import stat 132 | 133 | return { 134 | **stat(events), 135 | **stat(connections), 136 | } 137 | -------------------------------------------------------------------------------- /my/listenbrainz/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses scrobbles from https://listenbrainz.org/ using 3 | https://github.com/purarue/listenbrainz_export 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/listenbrainz_export"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import listenbrainz as user_config # type: ignore[attr-defined] 10 | 11 | 12 | from pathlib import Path 13 | from typing import Iterator, Sequence 14 | from itertools import chain 15 | 16 | from listenbrainz_export.parse import Listen, iter_listens 17 | from more_itertools import unique_everseen 18 | 19 | from dataclasses import dataclass 20 | from my.core import get_files, Stats, make_logger, Paths 21 | 22 | 23 | @dataclass 24 | class config(user_config.export): 25 | # path[s]/glob to the exported data 26 | export_path: Paths 27 | 28 | 29 | logger = make_logger(__name__) 30 | 31 | 32 | def inputs() -> Sequence[Path]: 33 | return get_files(config.export_path) 34 | 35 | 36 | Results = Iterator[Listen] 37 | 38 | 39 | def _parse_export_file(p: Path) -> Results: 40 | # remove any items which have null as listen date 41 | # (may have been listening to something when export happened) 42 | yield from filter(lambda lst: lst.listened_at is not None, iter_listens(p)) 43 | 44 | 45 | def history() -> Results: 46 | yield from unique_everseen( 47 | chain(*(_parse_export_file(p) for p in inputs())), 48 | key=lambda lst: lst.listened_at, 49 | ) 50 | 51 | 52 | def stats() -> Stats: 53 | from my.core import stat 54 | 55 | return {**stat(history)} 56 | -------------------------------------------------------------------------------- /my/location/apple.py: 
-------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | 3 | from my.core.source import import_source 4 | from my.location.common import Location # type: ignore[import] 5 | 6 | 7 | @import_source(module_name="my.apple.privacy_export") 8 | def locations() -> Iterator[Location]: 9 | from my.apple.privacy_export import events, Location as AppleLocation 10 | 11 | for a in events(): 12 | if isinstance(a, AppleLocation) and not isinstance(a, Exception): 13 | yield Location( 14 | lon=a.lng, 15 | lat=a.lat, 16 | dt=a.dt, 17 | accuracy=50.0, 18 | elevation=None, 19 | datasource="apple", 20 | ) 21 | -------------------------------------------------------------------------------- /my/mail/all.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | from itertools import chain 3 | 4 | from my.core import Stats 5 | from my.core.source import import_source 6 | 7 | REQUIRES = ["mail-parser", "dateparser"] 8 | 9 | MAIL_HELP = "https://github.com/purarue/HPI/blob/master/doc/MAIL_SETUP.md" 10 | 11 | src_imap = import_source(module_name="my.mail.imap", help_url=MAIL_HELP) 12 | src_mbox = import_source(module_name="my.mail.mbox", help_url=MAIL_HELP) 13 | 14 | 15 | # top-level import -- this whole module requires mail-parser/dateparser 16 | from .common import Email, unique_mail, MessagePart 17 | 18 | 19 | @src_imap 20 | def _mail_imap() -> Iterator[Email]: 21 | from . import imap 22 | 23 | return imap.mail() 24 | 25 | 26 | @src_mbox 27 | def _mail_mbox() -> Iterator[Email]: 28 | from . import mbox 29 | 30 | return mbox.mail() 31 | 32 | 33 | # NOTE: you can comment out the sources you don't want 34 | def mail() -> Iterator[Email]: 35 | yield from unique_mail( 36 | chain( 37 | _mail_mbox(), 38 | _mail_imap(), 39 | ) 40 | ) 41 | 42 | 43 | def mail_subparts() -> Iterator[MessagePart]: 44 | for m in mail(): 45 | yield from m.subparts 46 | 47 | 48 | def stats() -> Stats: 49 | from my.core import stat 50 | 51 | return {**stat(mail)} 52 | -------------------------------------------------------------------------------- /my/mail/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from email.message import Message 4 | from typing import ( 5 | List, 6 | Tuple, 7 | TextIO, 8 | Iterator, 9 | Optional, 10 | Union, 11 | Dict, 12 | Any, 13 | cast, 14 | ) 15 | from datetime import datetime 16 | from dataclasses import dataclass 17 | 18 | import dateparser 19 | from mailparser import MailParser # type: ignore[import] 20 | from mailparser.exceptions import MailParserReceivedParsingError # type: ignore[import] 21 | from more_itertools import unique_everseen 22 | 23 | from my.core import make_logger, __NOT_HPI_MODULE__ # noqa: F401 24 | 25 | from .parse_parts import tag_message_subparts 26 | 27 | REQUIRES = ["mail-parser", "dateparser"] 28 | 29 | # silence all mailparser logs 30 | # https://stackoverflow.com/a/55396144 31 | mlog = logging.getLogger("mailparser") 32 | for handler in mlog.handlers.copy(): 33 | mlog.removeHandler(handler) 34 | mlog.addHandler(logging.NullHandler()) 35 | mlog.propagate = False 36 | 37 | logger = make_logger(__name__) 38 | 39 | 40 | @dataclass 41 | class MessagePart: 42 | content_type: str 43 | payload: Any 44 | _email: "Email" 45 | 46 | 47 | class Email(MailParser): 48 | """ 49 | subclass of the mailparser which 50 | supports serialization by my.core.serialize 51 | along with a few other 
convenience functions 52 | """ 53 | 54 | # note: The 'message' property on this class 55 | # is the stdlib email.Message class: 56 | # https://docs.python.org/3/library/email.message.html#module-email.message 57 | def __init__(self, message: Message) -> None: 58 | super().__init__(message=message) 59 | self.filepath: Optional[Path] = None 60 | self._dt: Optional[datetime] = None # property to cache datetime result 61 | self._dateparser_failed: bool = False # if dateparser previously failed 62 | 63 | @property 64 | def dt(self) -> Optional[datetime]: 65 | """ 66 | Try to parse datetime if mail date wasn't in RFC 2822 format 67 | """ 68 | if self._dt is not None: 69 | return self._dt 70 | if self._dateparser_failed: 71 | return None 72 | # If date was parsed properly by mailparser 73 | d = self.date 74 | if isinstance(d, datetime): 75 | self._dt = d 76 | return self._dt 77 | if "Date" in self.headers: 78 | dateparser_res: Optional[datetime] = dateparser.parse(self.headers["Date"]) 79 | # if this failed to parse, save it on the object 80 | if dateparser_res is None: 81 | self._dateparser_failed = True 82 | return None 83 | else: 84 | self._dt = dateparser_res 85 | return self._dt 86 | return None 87 | 88 | def _serialize(self) -> Dict[str, Any]: 89 | return { 90 | "filepath": self.filepath, 91 | "bcc": self.bcc, 92 | "cc": self.cc, 93 | "date": self.dt, 94 | "date_utc": self.date_utc, 95 | "delivered_to": self.delivered_to, 96 | "from": self.from_, 97 | "message_id": self.message_id, 98 | "received": self.received, 99 | "reply_to": self.reply_to, 100 | "subject": self.subject, 101 | "to": self.to, 102 | "by": self.by, 103 | "envelope_from": self.envelope_from, 104 | "envelope_sender": self.envelope_sender, 105 | "for": getattr(self, "for"), 106 | "hop": self.hop, 107 | "with": getattr(self, "with"), 108 | "body": self.body, 109 | "body_html": self.body_html, 110 | "body_plain": self.body_plain, 111 | "attachments": self.attachments, 112 | "sender_ip_address": self.sender_ip_address, 113 | "to_domains": self.to_domains, 114 | } 115 | 116 | @property 117 | def description(self) -> str: 118 | return f"""From: {describe_persons(self.from_)} 119 | To: {describe_persons(self.to)} 120 | Subject: {self.subject}""" 121 | 122 | @classmethod 123 | def safe_parse( 124 | cls, fp: Union[str, bytes, Message, TextIO], display_filename: Path 125 | ) -> Optional["Email"]: 126 | try: 127 | if isinstance(fp, bytes): 128 | m = cls.from_bytes(fp) 129 | elif isinstance(fp, str): 130 | m = cls.from_string(fp) 131 | elif isinstance(fp, Message): 132 | # convert the email.Message (or a subclass) to this class 133 | m = cls(message=fp) 134 | else: 135 | m = cls.from_file_obj(fp) 136 | return cast(Email, m) 137 | except UnicodeDecodeError as e: 138 | logger.debug(f"While parsing {display_filename}: {e}") 139 | except MailParserReceivedParsingError as e: 140 | logger.debug(f"While parsing {display_filename}: {e}") 141 | except AttributeError as e: 142 | # error in the 'find_between' function when 143 | # the epilogue fails to be parse 144 | if str(e) == "'NoneType' object has no attribute 'index'": 145 | logger.debug( 146 | f"While parsing {display_filename}, epilogue failed to be parsed: {e}" 147 | ) 148 | else: 149 | logger.debug( 150 | f"Unknown error while parsing {display_filename}: {e}, skipping...", 151 | exc_info=e, 152 | ) 153 | except Exception as e: 154 | logger.warning( 155 | f"Unknown error while parsing {display_filename}: {e}, skipping...", 156 | exc_info=e, 157 | ) 158 | return None 159 | 160 | 
@classmethod 161 | def safe_parse_path(cls, path: Path) -> Optional["Email"]: 162 | with path.open("rb") as bf: 163 | m = cls.safe_parse(try_decode_buf(bf.read()), display_filename=path) 164 | if m is None: 165 | return None 166 | m.filepath = path 167 | return m 168 | 169 | @property 170 | def subparts(self) -> Iterator[MessagePart]: 171 | for payload, content_type in tag_message_subparts(self.message): 172 | yield MessagePart( 173 | content_type=content_type, 174 | payload=payload, 175 | _email=self, 176 | ) 177 | 178 | 179 | def unique_mail(emails: Iterator[Email]) -> Iterator[Email]: 180 | # remove duplicates (from a file being 181 | # in multiple boxes and the 'default' inbox) 182 | # some formats won't have a message id, 183 | # but hopefully the date/subject creates a unique 184 | # key in that case 185 | yield from unique_everseen( 186 | emails, 187 | key=lambda m: ( 188 | m.subject_json, 189 | m.message_id_json, 190 | m.dt, 191 | ), 192 | ) 193 | 194 | 195 | def try_decode_buf(buf: bytes) -> str: 196 | try: 197 | return buf.decode("utf-8") 198 | except UnicodeDecodeError: 199 | try: 200 | return buf.decode("iso-8859-1") 201 | except UnicodeDecodeError: 202 | return buf.decode("latin-1") 203 | 204 | 205 | def describe_person(p: Tuple[str, str]) -> str: 206 | """ 207 | ( 208 | "Person", 209 | "emailhere@gmail.com" 210 | ) 211 | converts to 212 | Person 213 | if there's no 'Person' text, it 214 | just becomes: 215 | emailhere@gmail.com 216 | """ 217 | if p[0].strip(): 218 | return f"{p[0]} <{p[1]}>" 219 | else: 220 | return p[1] 221 | 222 | 223 | def describe_persons(m: List[Tuple[str, str]]) -> str: 224 | """ 225 | >>> [('Google', 'no-reply@accounts.google.com'), ('Github', 'no-reply@github.com')] 226 | 'Google , Github ' 227 | """ 228 | return ", ".join([describe_person(p) for p in m]) 229 | -------------------------------------------------------------------------------- /my/mail/imap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses my locally synced IMAP email files, using mbsync 3 | https://isync.sourceforge.io/mbsync.html 4 | Uses https://github.com/SpamScope/mail-parser to parse the mail 5 | """ 6 | 7 | REQUIRES = ["mail-parser", "dateparser"] 8 | 9 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 10 | from my.config import mail as user_config # type: ignore[attr-defined] 11 | 12 | from pathlib import Path 13 | from typing import ( 14 | Iterator, 15 | Callable, 16 | Optional, 17 | List, 18 | ) 19 | 20 | 21 | from dataclasses import dataclass 22 | from my.core import Stats, Paths, get_files, make_config 23 | from .common import Email, unique_mail 24 | 25 | 26 | @dataclass 27 | class imap_conf(user_config.imap): 28 | # path[s]/glob to the the individual email files -- searches recursively 29 | mailboxes: Paths 30 | 31 | # filter function which filters the input paths 32 | filter_path: Optional[Callable[[Path], bool]] = None 33 | 34 | 35 | config = make_config(imap_conf) 36 | 37 | 38 | def mailboxes() -> List[Path]: 39 | return list(get_files(config.mailboxes)) 40 | 41 | 42 | def _files() -> Iterator[Path]: 43 | for box in mailboxes(): 44 | for path in box.rglob("*"): 45 | if not path.is_file(): 46 | continue 47 | if path.stem.startswith("."): 48 | continue 49 | yield path 50 | 51 | 52 | def files() -> Iterator[Path]: 53 | if config.filter_path is None: 54 | yield from _files() 55 | else: 56 | assert callable(config.filter_path) 57 | yield from filter(config.filter_path, 
_files()) 58 | 59 | 60 | def raw_mail() -> Iterator[Email]: 61 | for m in map(Email.safe_parse_path, files()): 62 | if m is not None: 63 | yield m 64 | 65 | 66 | def mail() -> Iterator[Email]: 67 | yield from unique_mail(raw_mail()) 68 | 69 | 70 | def stats() -> Stats: 71 | from my.core import stat 72 | 73 | return {**stat(mail)} 74 | -------------------------------------------------------------------------------- /my/mail/mbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses local mbox files 3 | """ 4 | 5 | REQUIRES = ["mail-parser", "dateparser"] 6 | 7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 8 | from my.config import mail as user_config # type: ignore[attr-defined] 9 | 10 | import mailbox 11 | from pathlib import Path 12 | from typing import List, Iterator, Optional, Sequence, IO, Any 13 | 14 | from dataclasses import dataclass 15 | from my.core import Stats, Paths, get_files 16 | from my.core import make_logger 17 | 18 | from .common import Email, unique_mail, try_decode_buf 19 | 20 | 21 | logger = make_logger(__name__) 22 | 23 | 24 | @dataclass 25 | class config(user_config.mbox): 26 | # path[s]/glob to the mbox file directory 27 | mailboxes: Paths 28 | 29 | # any additional extensions to ignore -- by default includes .msf, .dat, .log 30 | exclude_extensions: Optional[Sequence[str]] = None 31 | 32 | 33 | def mailboxes() -> List[Path]: 34 | return list(get_files(config.mailboxes)) 35 | 36 | 37 | DEFAULT_EXCLUDED_EXTENSIONS = { 38 | ".msf", 39 | ".log", 40 | ".dat", 41 | } 42 | 43 | 44 | def files() -> Iterator[Path]: 45 | excluded_ext = set(DEFAULT_EXCLUDED_EXTENSIONS) 46 | if config.exclude_extensions: 47 | for ext in config.exclude_extensions: 48 | excluded_ext.add(ext) 49 | 50 | for box in mailboxes(): 51 | for path in box.rglob("*"): 52 | if path.stem.startswith("."): 53 | continue 54 | if path.is_file(): 55 | if path.suffix not in excluded_ext: 56 | yield path 57 | 58 | 59 | def _decode_msg(msg: IO[Any]) -> mailbox.mboxMessage: 60 | """ 61 | Custom decode function 62 | 63 | by default this uses 'ascii' which can cause fatal errors 64 | on UnicodeDecodeErrors 65 | """ 66 | msg_str = try_decode_buf(msg.read()) 67 | return mailbox.mboxMessage(mailbox.Message(msg_str)) 68 | 69 | 70 | def _iter_mailbox(file: Path) -> Iterator[Email]: 71 | mbox = mailbox.mbox( 72 | str(file), 73 | factory=_decode_msg, 74 | create=False, 75 | ) 76 | mbox_itr = iter(mbox) 77 | while True: 78 | try: 79 | mbox_message = next(mbox_itr) 80 | email = Email.safe_parse(mbox_message, display_filename=file) 81 | if email is not None: 82 | email.filepath = file 83 | yield email 84 | except StopIteration: 85 | break 86 | except Exception as ex: 87 | logger.warning( 88 | f"Unexpected error while parsing {file}: {ex}... 
no way to continue parsing mbox file...", 89 | exc_info=ex, 90 | ) 91 | 92 | 93 | def raw_mail() -> Iterator[Email]: 94 | for file in files(): 95 | assert file.exists() # sanity check -- make sure were not creating mboxes 96 | yield from _iter_mailbox(file) 97 | 98 | 99 | def mail() -> Iterator[Email]: 100 | yield from unique_mail(raw_mail()) 101 | 102 | 103 | def stats() -> Stats: 104 | from my.core import stat 105 | 106 | return {**stat(mail)} 107 | -------------------------------------------------------------------------------- /my/mail/parse_parts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some helper functions/constants for parsing message subparts/ignoring certain content types 3 | """ 4 | 5 | from typing import Iterator, Tuple, Set, Union, Any, Literal 6 | from email.message import Message 7 | 8 | # explicitly ignored types, anything else sends a warning 9 | IGNORED_CONTENT_TYPES = { 10 | "text/calendar", 11 | "application/ics", 12 | "application/pdf", 13 | "application/octet-stream", 14 | "application/octetstream", 15 | "text/csv", 16 | "application/json", 17 | "application/zip", 18 | "application/x-zip-compressed", 19 | "application/msword", 20 | "multipart/alternative", 21 | "application/postscript", 22 | "text/x-vcard", 23 | "multipart/parallel", # not sure what the best way to parse this is 24 | } 25 | 26 | IGNORED_CONTENT_PREFIXES: Set[str] = { 27 | "application/vnd", 28 | "application/x-apple", 29 | "application/x-iwork", 30 | "image", 31 | "audio", 32 | "video", 33 | } 34 | 35 | 36 | def get_message_parts(m: Message) -> Iterator[Message]: 37 | # since walk returns both multiparts and their children 38 | # we can ignore the multipart and return all individual parts 39 | # 40 | # if single type, it just returns the message itself 41 | for part in m.walk(): 42 | if not part.is_multipart(): 43 | yield part 44 | 45 | 46 | EmailText = Literal["html", "text"] 47 | 48 | 49 | EmailTextOrContentType = Union[EmailText, str] 50 | 51 | 52 | def tag_message_subparts( 53 | msg: Message, 54 | ) -> Iterator[Tuple[Any, EmailTextOrContentType]]: 55 | for message_part in get_message_parts(msg): 56 | content_type = message_part.get_content_type() 57 | payload = message_part.get_payload() 58 | 59 | # known ignored content types 60 | if content_type in IGNORED_CONTENT_TYPES: 61 | yield payload, content_type 62 | 63 | if any( 64 | [content_type.startswith(prefix) for prefix in IGNORED_CONTENT_PREFIXES] 65 | ): 66 | yield payload, content_type 67 | 68 | if content_type.startswith("text") and "html" in content_type: 69 | yield payload, "html" 70 | elif content_type == "text/plain": 71 | yield payload, "text" 72 | else: 73 | # unknown ignored content types 74 | yield payload, content_type 75 | -------------------------------------------------------------------------------- /my/mal/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the data directory for my MAL export 3 | Uses https://github.com/purarue/malexport/ 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/malexport"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import mal as user_config # type: ignore[attr-defined] 10 | 11 | from pathlib import Path 12 | from datetime import datetime 13 | from typing import Iterator, List, Tuple, NamedTuple, Optional 14 | from functools import lru_cache 15 | 16 | from dataclasses import dataclass 17 | from 
my.core import Stats, make_logger, PathIsh, make_config, get_files 18 | from my.core.structure import match_structure 19 | 20 | from malexport.paths import LocalDir 21 | from malexport.parse.combine import combine, AnimeData, MangaData 22 | from malexport.parse.forum import Post, iter_forum_posts 23 | from malexport.parse.friends import Friend, iter_friends 24 | from malexport.parse.messages import Thread, Message, iter_user_threads 25 | from malexport.parse.recover_deleted_entries import recover_deleted as rec_del, Approved 26 | 27 | 28 | @dataclass 29 | class mal_config(user_config.export): 30 | # path[s]/glob to the exported data 31 | export_path: PathIsh 32 | 33 | # this should be the top level directory, not the zip files or username directories 34 | # see https://github.com/purarue/malexport/#recover_deleted 35 | zip_backup_path: Optional[PathIsh] = None 36 | 37 | 38 | config = make_config(mal_config) 39 | 40 | 41 | logger = make_logger(__name__) 42 | 43 | 44 | # malexport supports multiple accounts 45 | # in its data directory structure 46 | @lru_cache(maxsize=1) 47 | def export_dirs() -> List[Path]: 48 | base: Path = Path(config.export_path).expanduser().absolute() 49 | with match_structure(base, expected="animelist.xml") as matches: 50 | return list(matches) 51 | 52 | 53 | Export = Tuple[List[AnimeData], List[MangaData]] 54 | 55 | 56 | @lru_cache(maxsize=2) 57 | def _read_malexport_aux(username: str, *, mtimes: Tuple[float, ...]) -> Export: 58 | logger.debug(f"reading {username}; cache miss: {mtimes}") 59 | return combine(username) 60 | 61 | 62 | def _read_malexport(username: str) -> Export: 63 | paths = LocalDir.from_username(username).data_dir.rglob("*") 64 | return _read_malexport_aux( 65 | username, mtimes=tuple(sorted(map(lambda f: f.stat().st_mtime, paths))) 66 | ) 67 | 68 | 69 | @lru_cache(maxsize=None) 70 | def _find_deleted_aux(username: str, zips: Tuple[Path, ...]) -> Export: 71 | return rec_del( 72 | approved=Approved.parse_from_git_dir(), 73 | username=username, 74 | backups=list(zips), 75 | filter_with_activity=False, 76 | logger=logger, 77 | ) 78 | 79 | 80 | def _find_deleted_inputs(username: str) -> Tuple[Path, ...]: 81 | if config.zip_backup_path is None: 82 | return tuple() 83 | directory_for_user: Path = Path(config.zip_backup_path) / username 84 | return get_files(directory_for_user, sort=True, glob="*.zip") 85 | 86 | 87 | def _find_deleted(username: str) -> Optional[Export]: 88 | return _find_deleted_aux(username, _find_deleted_inputs(username)) 89 | 90 | 91 | ### Expose all the parsed information from malexport 92 | 93 | 94 | def anime() -> Iterator[AnimeData]: 95 | for path in export_dirs(): 96 | anime, _ = _read_malexport(path.stem) 97 | yield from anime 98 | 99 | 100 | def manga() -> Iterator[MangaData]: 101 | for path in export_dirs(): 102 | _, manga = _read_malexport(path.stem) 103 | yield from manga 104 | 105 | 106 | def deleted_anime() -> Iterator[AnimeData]: 107 | for path in export_dirs(): 108 | if export := _find_deleted(path.stem): 109 | anime, _ = export 110 | yield from anime 111 | 112 | 113 | def deleted_manga() -> Iterator[MangaData]: 114 | for path in export_dirs(): 115 | if export := _find_deleted(path.stem): 116 | _, manga = export 117 | yield from manga 118 | 119 | 120 | class Episode(NamedTuple): 121 | mal_id: int 122 | title: str 123 | episode: int 124 | at: datetime 125 | 126 | 127 | # use the combined data when reading history 128 | # since it removes entries you may have deleted 129 | # which still have local history files left over 130 
| def episodes() -> Iterator[Episode]: 131 | for path in export_dirs(): 132 | anime, _ = _read_malexport(path.stem) 133 | for a in anime: 134 | for h in a.history: 135 | yield Episode( 136 | mal_id=a.id, 137 | title=a.XMLData.title, 138 | episode=h.number, 139 | at=h.at, 140 | ) 141 | 142 | 143 | class Chapter(NamedTuple): 144 | mal_id: int 145 | title: str 146 | chapter: int 147 | at: datetime 148 | 149 | 150 | def chapters() -> Iterator[Chapter]: 151 | for path in export_dirs(): 152 | _, manga = _read_malexport(path.stem) 153 | for m in manga: 154 | for h in m.history: 155 | yield Chapter( 156 | mal_id=m.id, 157 | title=m.XMLData.title, 158 | chapter=h.number, 159 | at=h.at, 160 | ) 161 | 162 | 163 | def posts() -> Iterator[Post]: 164 | for path in export_dirs(): 165 | yield from iter_forum_posts(path.stem) 166 | 167 | 168 | def threads() -> Iterator[Thread]: 169 | for path in export_dirs(): 170 | yield from iter_user_threads(path.stem) 171 | 172 | 173 | def messages() -> Iterator[Message]: 174 | for t in threads(): 175 | yield from t.messages 176 | 177 | 178 | def friends() -> Iterator[Friend]: 179 | for path in export_dirs(): 180 | yield from iter_friends(path.stem) 181 | 182 | 183 | def stats() -> Stats: 184 | from my.core import stat 185 | 186 | return { 187 | **stat(anime), 188 | **stat(manga), 189 | **stat(chapters), 190 | **stat(episodes), 191 | **stat(posts), 192 | **stat(friends), 193 | } 194 | -------------------------------------------------------------------------------- /my/minecraft/advancements.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses achievement data/timestamps from local minecraft worlds 3 | Copied from the ~/.minecraft directory, one for each world 4 | Backed up with: 5 | https://github.com/purarue/HPI-personal/blob/master/scripts/backup_minecraft_advancements 6 | """ 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import minecraft as user_config # type: ignore[attr-defined] 10 | 11 | from dataclasses import dataclass 12 | from my.core import Paths 13 | 14 | 15 | @dataclass 16 | class config(user_config.advancements): 17 | # path[s]/glob to the backup directory 18 | export_path: Paths 19 | 20 | 21 | import json 22 | from pathlib import Path 23 | from typing import Sequence, NamedTuple, Iterator, List, Any, Dict 24 | from datetime import datetime 25 | from itertools import chain 26 | 27 | from my.core import get_files, Stats 28 | from my.core.structure import match_structure 29 | 30 | from more_itertools import unique_everseen 31 | 32 | EXPECTED = ("advancements",) 33 | 34 | 35 | def _advancement_json_files(world_dir: Path) -> List[Path]: 36 | d = (world_dir / "advancements").absolute() 37 | if not d.exists(): 38 | return [] 39 | return list(d.rglob("*.json")) 40 | 41 | 42 | def worlds() -> Sequence[Path]: 43 | found = [] 44 | for f in get_files(config.export_path): 45 | with match_structure(f, EXPECTED) as match: 46 | for m in match: 47 | if _advancement_json_files(m): 48 | found.append(m.absolute()) 49 | return found 50 | 51 | 52 | class Advancement(NamedTuple): 53 | advancement_id: str 54 | world_name: str 55 | dt: datetime 56 | 57 | 58 | Results = Iterator[Advancement] 59 | 60 | 61 | def advancements() -> Results: 62 | yield from unique_everseen(chain(*map(_parse_world, worlds()))) 63 | 64 | 65 | DATE_REGEX = r"%Y-%m-%d %H:%M:%S %z" 66 | 67 | 68 | def _parse_world(world_dir: Path) -> Results: 69 | """ 70 | An example of a 
key, val this is trying to parse: 71 | 72 | "minecraft:nether/obtain_crying_obsidian": { 73 | "criteria": { 74 | "crying_obsidian": "2022-06-17 22:48:18 -0700" 75 | }, 76 | "done": true 77 | }, 78 | """ 79 | 80 | for f in _advancement_json_files(world_dir): 81 | data = json.loads(f.read_text()) 82 | for key, val in data.items(): 83 | # ignore advanced in crafting recipes 84 | # and random non-dict values (version numbers etc.) 85 | if key.startswith("minecraft:recipes") or not isinstance(val, dict): 86 | continue 87 | # if just a marker and not 'done', don't include 88 | if "done" in val and val["done"] is False: 89 | continue 90 | possible_date_blobs: List[Dict[Any, Any]] = [ 91 | v for v in val.values() if isinstance(v, dict) 92 | ] 93 | for blob in possible_date_blobs: 94 | for datestr in filter(lambda s: isinstance(s, str), blob.values()): 95 | try: 96 | parsed_date = datetime.strptime(datestr, DATE_REGEX) 97 | except ValueError: 98 | continue 99 | yield Advancement( 100 | advancement_id=key, world_name=world_dir.stem, dt=parsed_date 101 | ) 102 | 103 | 104 | def stats() -> Stats: 105 | from my.core import stat 106 | 107 | return {**stat(advancements)} 108 | -------------------------------------------------------------------------------- /my/mpv/history_daemon.py: -------------------------------------------------------------------------------- 1 | """ 2 | Any Media being played on my computer with mpv 3 | Uses my mpv-history-daemon 4 | https://github.com/purarue/mpv-history-daemon 5 | """ 6 | 7 | REQUIRES = ["git+https://github.com/purarue/mpv-history-daemon"] 8 | 9 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 10 | from my.config import mpv as user_config # type: ignore[attr-defined] 11 | 12 | from typing import Iterator, Sequence, Optional 13 | from dataclasses import dataclass 14 | from my.core import Paths, make_config 15 | 16 | 17 | @dataclass 18 | class mpv_config(user_config.history_daemon): 19 | # glob to the JSON files that the daemon writes whenever Im using mpv 20 | export_path: Paths 21 | 22 | # amount of song I should have listened to to qualify it as a listen (e.g. 
0.5, 0.75) 23 | require_percent: Optional[float] = None 24 | 25 | 26 | config = make_config(mpv_config) 27 | 28 | 29 | import itertools 30 | from pathlib import Path 31 | 32 | from mpv_history_daemon.events import ( 33 | Media, 34 | all_history as M_all_history, 35 | _actually_listened_to, 36 | ) 37 | 38 | from my.core import get_files, Stats, make_logger 39 | 40 | 41 | logger = make_logger(__name__) 42 | 43 | # monkey patch logs 44 | import mpv_history_daemon.events 45 | 46 | mpv_history_daemon.events.logger = mpv_history_daemon.events.setup_logger( 47 | name="mpv_history_events", level=logger.level 48 | ) 49 | 50 | Results = Iterator[Media] 51 | 52 | 53 | def stats() -> Stats: 54 | from my.core import stat 55 | 56 | return { 57 | **stat(history), 58 | } 59 | 60 | 61 | def inputs() -> Sequence[Path]: 62 | # this takes the files, sorts it so merged event files 63 | # are returned first, then the individual event ones 64 | # this makes it so that history is close to (it may not be if you opened 2 mpv 65 | # instances and listened to something while another was paused) chronologically sorted, 66 | # because the merged files are ordered by keyname 67 | files = list(get_files(config.export_path, sort=True)) 68 | groups = { 69 | k: list(g) 70 | for k, g in itertools.groupby(files, key=lambda f: "merged" in f.stem) 71 | } 72 | # merged files, then raw event files 73 | return list(itertools.chain(groups.get(True, []), groups.get(False, []))) 74 | 75 | 76 | def _filter_by(m: Media) -> bool: 77 | if m.is_stream: 78 | return True 79 | # if duration is under 10 minutes, but listen_time is over 80 | # 3 hours, probably a broken item, caused by hanging mpv/socket? 81 | # I only have 2 of these, in the 13,000 or so history items 82 | if m.media_duration is not None and m.media_duration < 600: 83 | if m.listen_time > 10800: 84 | logger.debug(f"Assuming this is a broken file: {str(m)}") 85 | return False 86 | perc = config.require_percent or 0.75 87 | # fallback to library func 88 | return _actually_listened_to(m, require_listened_to_percent=perc) 89 | 90 | 91 | def all_history() -> Results: 92 | yield from M_all_history(list(inputs())) 93 | 94 | 95 | def history() -> Results: 96 | yield from filter(_filter_by, all_history()) 97 | -------------------------------------------------------------------------------- /my/offline/listens.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses scrobbles from https://github.com/purarue/offline_listens 3 | """ 4 | 5 | REQUIRES = ["git+https://github.com/purarue/offline_listens"] 6 | 7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 8 | from my.config import offline as user_config # type: ignore[attr-defined] 9 | 10 | 11 | from pathlib import Path 12 | from typing import Iterator, Sequence 13 | 14 | from offline_listens.listens import Listen 15 | from offline_listens.parse import iter_dir, parse_file 16 | 17 | from dataclasses import dataclass 18 | from my.core import get_files, Stats, Paths 19 | 20 | 21 | @dataclass 22 | class config(user_config.listens): 23 | # path[s]/glob to the exported data 24 | export_path: Paths 25 | 26 | 27 | def inputs() -> Sequence[Path]: 28 | return get_files(config.export_path) 29 | 30 | 31 | Results = Iterator[Listen] 32 | 33 | 34 | def history() -> Results: 35 | for f in inputs(): 36 | if f.is_dir(): 37 | yield from iter_dir(f) 38 | else: 39 | yield from parse_file(f) 40 | 41 | 42 | def stats() -> Stats: 43 | from my.core import stat 
44 | 45 | return {**stat(history)} 46 | -------------------------------------------------------------------------------- /my/piazza/scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses piazza posts scraped by 3 | https://github.com/purarue/piazza-scraper 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/piazza-scraper"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import piazza as user_config # type: ignore[attr-defined] 10 | from dataclasses import dataclass 11 | from my.core import Paths 12 | 13 | 14 | @dataclass 15 | class config(user_config.scraper): 16 | # path to the exported data 17 | export_path: Paths 18 | 19 | 20 | import os 21 | from pathlib import Path 22 | from typing import Iterator, Sequence, Optional 23 | 24 | from my.core import get_files, Stats 25 | 26 | from piazza_scraper.parse import Post, Export 27 | 28 | 29 | def inputs() -> Sequence[Path]: 30 | return get_files(config.export_path) 31 | 32 | 33 | def classes() -> Iterator[Export]: 34 | for file in inputs(): 35 | yield Export.parse_file(file) 36 | 37 | 38 | def _all_posts() -> Iterator[Post]: 39 | for exp in classes(): 40 | for post in exp.posts: 41 | yield from post.walk_posts() 42 | 43 | 44 | def posts() -> Iterator[Post]: 45 | """ 46 | Infer my user id by checking the stats/users area 47 | Parse all posts, and return ones made by me 48 | """ 49 | for exp in classes(): 50 | # hmm -- it seems that I'm always the only user in this? 51 | # will check an envvar in case someone else has issues configuring this/has different results 52 | # feel free to open an issue 53 | user_id: Optional[str] = os.environ.get("PIAZZA_UID") 54 | if user_id is None: 55 | assert ( 56 | len(exp.users) > 0 57 | ), "Could not infer user id, set the PIAZZA_UID environment variable to your users' uid" 58 | user_id = exp.users[0].uid 59 | 60 | assert user_id is not None 61 | for post in exp.posts: 62 | yield from post.walk_posts_by_me(user_id) 63 | 64 | 65 | def stats() -> Stats: 66 | from my.core import stat 67 | 68 | return { 69 | **stat(posts), 70 | } 71 | -------------------------------------------------------------------------------- /my/project_euler.py: -------------------------------------------------------------------------------- 1 | """ 2 | When I completed https://projecteuler.net problems 3 | 4 | This information has to be updated manually, I do it once 5 | every few months/years depending on how many of these I keep 6 | solving 7 | 8 | To download, log in to your Project Euler account 9 | (in your browser), and then go to: 10 | https://projecteuler.net/history 11 | 12 | That txt file is what this accepts as input (can accept multiple) 13 | """ 14 | 15 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 16 | from my.config import project_euler as user_config # type: ignore[attr-defined] 17 | 18 | from dataclasses import dataclass 19 | from my.core import Paths 20 | 21 | 22 | @dataclass 23 | class config(user_config): 24 | # path[s]/glob to the .txt export files 25 | export_path: Paths 26 | 27 | 28 | import re 29 | import csv 30 | from pathlib import Path 31 | from datetime import datetime, timezone 32 | from typing import Sequence, Iterator, NamedTuple, Optional, List, Dict 33 | from itertools import chain, groupby 34 | 35 | from my.core import get_files, Stats 36 | 37 | 38 | class Solution(NamedTuple): 39 | problem: int 40 | dt: 
datetime 41 | name: Optional[str] 42 | 43 | 44 | def inputs() -> Sequence[Path]: 45 | return get_files(config.export_path) 46 | 47 | 48 | def history() -> Iterator[Solution]: 49 | # need to sort here to dedupe accurately 50 | items: List[Solution] = sorted( 51 | chain(*map(_parse_file, inputs())), key=lambda s: s.problem 52 | ) 53 | # group by items, and if there are multiple return the one with the name 54 | # (or None if there is no name) 55 | grouped: Dict[int, List[Solution]] = { 56 | num: list(problems) for num, problems in groupby(items, lambda s: s.problem) 57 | } 58 | for items in grouped.values(): 59 | for item in items: 60 | if item.name is not None: 61 | yield item 62 | break # break out of the inner loop 63 | else: 64 | # no name on item, just yield the first 65 | yield items[0] 66 | 67 | 68 | # Example line: 69 | # 037: 07 Nov 14 (13:46) 70 | # project euler was started in early 2000s, 71 | # so no need to support 19XX 72 | # '14' means 2014 73 | OLD_LINE_REGEX = re.compile(r"(\d+):\s*(\d+)\s*(\w+)\s*(\d+)\s*\((\d+):(\d+)\)") 74 | 75 | # hardcoding instead of using calendar module avoid possible issues with locale 76 | MONTHS = [ 77 | "jan", 78 | "feb", 79 | "mar", 80 | "apr", 81 | "may", 82 | "jun", 83 | "jul", 84 | "aug", 85 | "sep", 86 | "oct", 87 | "nov", 88 | "dec", 89 | ] 90 | 91 | 92 | def _parse_file(p: Path) -> Iterator[Solution]: 93 | for line in p.open(): 94 | m = OLD_LINE_REGEX.match(line) 95 | if m: 96 | # old format 97 | problem, day, month_desc, year_short, hour, minute = m.groups() 98 | month_lowered = month_desc.lower() 99 | assert month_lowered in MONTHS, f"Couldn't find {month_lowered} in {MONTHS}" 100 | # datetimes in the file are UTC time 101 | yield Solution( 102 | problem=int(problem), 103 | dt=datetime( 104 | year=int(f"20{year_short}"), 105 | month=MONTHS.index(month_lowered) + 1, 106 | day=int(day), 107 | hour=int(hour), 108 | minute=int(minute), 109 | tzinfo=timezone.utc, 110 | ), 111 | name=None, 112 | ) 113 | else: 114 | # new format 115 | csv_reader = csv.reader([line]) 116 | row = next(csv_reader) 117 | dt = datetime.strptime(row[0], "%d %b %y (%H:%M)") 118 | yield Solution(problem=int(row[1]), dt=dt, name=row[2]) 119 | 120 | 121 | def stats() -> Stats: 122 | from my.core import stat 123 | 124 | return {**stat(history)} 125 | -------------------------------------------------------------------------------- /my/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/my/py.typed -------------------------------------------------------------------------------- /my/rss/newsboat/git_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses when I added/removed newsboat subscriptions 3 | """ 4 | 5 | REQUIRES = ["git+https://github.com/purarue/git_doc_history"] 6 | 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import rss as user_config # type: ignore[attr-defined] 10 | 11 | 12 | from pathlib import Path 13 | from datetime import datetime 14 | from typing import ( 15 | Iterator, 16 | List, 17 | ) 18 | 19 | from git_doc_history import ( 20 | DocHistory, 21 | parse_snapshot_diffs, 22 | Diff, 23 | ) 24 | 25 | from dataclasses import dataclass 26 | from my.core import Stats, PathIsh 27 | 28 | 29 | @dataclass 30 | class config(user_config.newsboat.git_history): 31 | # path to the git backup directory 32 | 
export_path: PathIsh 33 | 34 | 35 | RSS_FILES = ["urls"] 36 | 37 | 38 | def input() -> DocHistory: 39 | return DocHistory( 40 | backup_dir=Path(config.export_path).expanduser().absolute(), 41 | copy_files=RSS_FILES, 42 | ) 43 | 44 | 45 | Results = Iterator[str] 46 | 47 | 48 | def _parse_buffer(buf: bytes) -> List[str]: 49 | return buf.decode("utf-8").strip().splitlines() 50 | 51 | 52 | def subscriptions() -> Results: 53 | yield from _parse_buffer(input().extract_buffer_at(RSS_FILES[0], at=datetime.now())) 54 | 55 | 56 | def events() -> Iterator[Diff]: 57 | yield from parse_snapshot_diffs( 58 | input(), 59 | file=RSS_FILES[0], 60 | ) 61 | 62 | 63 | def stats() -> Stats: 64 | from my.core import stat 65 | 66 | return { 67 | **stat(subscriptions), 68 | **stat(events), 69 | } 70 | -------------------------------------------------------------------------------- /my/runelite/screenshots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extracts metadata from the automatic runelite (OldSchool RuneScape Client) screenshots 3 | that happen when you finish quests/gain levels 4 | https://github.com/runelite/runelite/wiki/Screenshot 5 | """ 6 | 7 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 8 | from my.config import runelite as user_config # type: ignore[attr-defined] 9 | 10 | from dataclasses import dataclass 11 | from my.core import Paths 12 | 13 | 14 | @dataclass 15 | class config(user_config.screenshots): 16 | # path[s]/glob to the base screenshot directory or each username 17 | # this can be some rsynced folder (like my jobs/computer/runelite_screenshots.job does) 18 | # or the .runelite folder itself 19 | export_path: Paths 20 | 21 | 22 | import re 23 | from pathlib import Path 24 | from typing import Sequence, Union, NamedTuple, Iterator, Tuple 25 | from datetime import datetime 26 | 27 | from my.core import get_files, Stats 28 | from my.core.structure import match_structure 29 | 30 | EXPECTED = ("Levels", "Quests") 31 | 32 | 33 | def accounts() -> Sequence[Path]: 34 | accounts = [] 35 | for f in get_files(config.export_path): 36 | with match_structure(f, EXPECTED) as match: 37 | accounts.extend(list(match)) 38 | return accounts 39 | 40 | 41 | class Level(NamedTuple): 42 | skill: str 43 | level: int 44 | 45 | 46 | Description = Union[Level, str] 47 | 48 | 49 | class Screenshot(NamedTuple): 50 | """represents one screenshot (quest/level etc.)""" 51 | 52 | dt: datetime 53 | path: Path 54 | screenshot_type: str # Level/Quest etc 55 | description: Description 56 | username: str 57 | 58 | 59 | Results = Iterator[Screenshot] 60 | 61 | 62 | def screenshots() -> Results: 63 | for acc in accounts(): 64 | for p in acc.iterdir(): 65 | if p.is_dir(): 66 | yield from _parse_subdir(p, username=acc.stem) 67 | 68 | 69 | DT_REGEX = r"%Y-%m-%d_%H-%M-%S" 70 | 71 | # TODO: use tz module to optionally figure out what timezone I was 72 | # when the file was created, so I can make sure the info in the filename 73 | # being a naive date isn't an issue if I'm ever in another timezone 74 | 75 | 76 | def _extract_info_from_filename(p: Path) -> Tuple[str, datetime]: 77 | desc, _, dstr = p.stem.rpartition(" ") 78 | return desc.strip(), datetime.strptime(dstr, DT_REGEX) 79 | 80 | 81 | def _parse_subdir(p: Path, username: str) -> Results: 82 | if p.stem == "Levels": 83 | yield from _parse_level_dir(p, username=username) 84 | elif p.stem == "Quests": 85 | yield from _parse_quest_dir(p, username=username) 86 | else: 87 | 
yield from _parse_other_dir(p, username=username) 88 | 89 | 90 | QUEST_REGEX = re.compile(r"^Quest\((.*?)\)$") 91 | 92 | 93 | def _parse_quest_dir(p: Path, username: str) -> Results: 94 | for img in p.rglob("*.png"): 95 | desc, dt = _extract_info_from_filename(img) 96 | m = re.match(QUEST_REGEX, desc) 97 | assert m, f"Couldn't extract quest name from {desc}" 98 | yield Screenshot( 99 | dt=dt, 100 | path=img, 101 | screenshot_type="Quest", 102 | description=m.group(1), 103 | username=username, 104 | ) 105 | 106 | 107 | LEVEL_REGEX = re.compile(r"^([\w ]+)\((\d+)\)$") 108 | 109 | 110 | def _parse_level_dir(p: Path, username: str) -> Results: 111 | for img in p.rglob("*.png"): 112 | desc, dt = _extract_info_from_filename(img) 113 | m = re.match(LEVEL_REGEX, desc) 114 | assert m, f"Could not match levels out of {desc}" 115 | skill_name, level = m.groups() 116 | yield Screenshot( 117 | dt=dt, 118 | path=img, 119 | screenshot_type="Level", 120 | description=Level(skill=skill_name, level=int(level)), 121 | username=username, 122 | ) 123 | 124 | 125 | def _parse_other_dir(p: Path, username: str) -> Results: 126 | for img in p.rglob("*.png"): 127 | desc, dt = _extract_info_from_filename(img) 128 | yield Screenshot( 129 | dt=dt, path=img, screenshot_type=p.stem, description=desc, username=username 130 | ) 131 | 132 | 133 | def stats() -> Stats: 134 | from my.core import stat 135 | 136 | return {**stat(screenshots)} 137 | -------------------------------------------------------------------------------- /my/scramble/history.py: -------------------------------------------------------------------------------- 1 | """ 2 | Timed Rubiks Cube Solve History from multiple sources using 3 | https://github.com/purarue/scramble-history 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/scramble-history"] 7 | 8 | from pathlib import Path 9 | from typing import Optional 10 | from dataclasses import dataclass 11 | from my.core import PathIsh, make_config 12 | 13 | from my.config import scramble as user_config # type: ignore[attr-defined] 14 | 15 | 16 | @dataclass 17 | class scramble_config(user_config.history): 18 | config_dir: Optional[PathIsh] = None 19 | 20 | 21 | config = make_config(scramble_config) 22 | 23 | from typing import Iterator 24 | 25 | from scramble_history.__main__ import ( 26 | scramble_history_config_dir, 27 | conf_name, 28 | sourcemap_name, 29 | ) 30 | from scramble_history.config import parse_config_file 31 | from scramble_history.models import Solve 32 | from scramble_history.source_merger import merge as merge_solves 33 | 34 | config_dir = Path(config.config_dir or scramble_history_config_dir).expanduser() 35 | 36 | 37 | parsed_conf = parse_config_file(config_dir / conf_name) 38 | 39 | 40 | def solves() -> Iterator[Solve]: 41 | yield from merge_solves( 42 | sourcemap_file=config_dir / sourcemap_name, conf=parsed_conf 43 | ) 44 | -------------------------------------------------------------------------------- /my/skype/gdpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parse Message Dates from Skypes GDPR JSON export 3 | """ 4 | 5 | REQUIRES = ["dateparser"] 6 | 7 | # Isn't a lot of data here, seems a lot of the old 8 | # data is gone. 
Only parses a couple messages, might 9 | # as well use the datetimes for context on when I 10 | # was using skype 11 | 12 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 13 | from my.config import skype as user_config # type: ignore[attr-defined] 14 | 15 | from dataclasses import dataclass 16 | from my.core import Paths, Stats 17 | 18 | 19 | @dataclass 20 | class config(user_config.gdpr): 21 | # path[s]/glob to the skype JSON files 22 | export_path: Paths 23 | 24 | 25 | import json 26 | from pathlib import Path 27 | from datetime import datetime 28 | from typing import Iterator, Sequence 29 | from itertools import chain 30 | 31 | import dateparser 32 | 33 | from my.core import get_files, make_logger 34 | 35 | logger = make_logger(__name__) 36 | 37 | 38 | Results = Iterator[datetime] 39 | 40 | 41 | def inputs() -> Sequence[Path]: 42 | return get_files(config.export_path) 43 | 44 | 45 | def timestamps() -> Results: 46 | yield from chain(*map(_parse_file, inputs())) 47 | 48 | 49 | def _parse_file(post_file: Path) -> Results: 50 | items = json.loads(post_file.read_text()) 51 | for conv in items["conversations"]: 52 | for msg in conv["MessageList"]: 53 | d = dateparser.parse(msg["originalarrivaltime"].rstrip("Z")) 54 | if d is not None: 55 | yield d 56 | 57 | 58 | def stats() -> Stats: 59 | from my.core import stat 60 | 61 | return {**stat(timestamps)} 62 | -------------------------------------------------------------------------------- /my/spotify/gdpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the spotify GPDR Export 3 | """ 4 | 5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 6 | from my.config import spotify as user_config # type: ignore[attr-defined] 7 | 8 | from dataclasses import dataclass 9 | from my.core import PathIsh, Stats 10 | 11 | 12 | @dataclass 13 | class config(user_config.gdpr): 14 | gdpr_dir: PathIsh # path to unpacked GDPR archive 15 | 16 | 17 | import os 18 | import json 19 | from datetime import date 20 | from pathlib import Path 21 | from typing import Iterator, Any, NamedTuple, List, Set, Tuple, Sequence, Optional 22 | 23 | from my.core import Res, get_files, make_logger, Json 24 | 25 | logger = make_logger(__name__) 26 | 27 | 28 | class Song(NamedTuple): 29 | name: str 30 | artist: str 31 | album: str 32 | 33 | 34 | class Playlist(NamedTuple): 35 | name: str 36 | last_modified: date 37 | songs: List[Song] 38 | 39 | 40 | Playlists = Iterator[Res[Playlist]] 41 | Songs = Iterator[Res[Song]] 42 | 43 | 44 | def inputs(gdpr_dir: Optional[PathIsh] = None) -> Sequence[Path]: 45 | chosen: PathIsh = gdpr_dir if gdpr_dir is not None else config.gdpr_dir 46 | echosen = Path(chosen).expanduser().absolute() 47 | return get_files(echosen, glob="*.json") 48 | 49 | 50 | def playlists() -> Playlists: 51 | gdpr_dir = str(Path(config.gdpr_dir).expanduser().absolute()) # expand path 52 | files = inputs(gdpr_dir) 53 | handler_map = { 54 | "Follow": None, 55 | "Inferences": None, 56 | "Payments": None, 57 | "Playlist": _filter_playlists, 58 | "StreamingHistory": None, # does not save any of the old play history, not worth parsing 59 | "Userdata": None, 60 | "YourLibrary": None, 61 | } 62 | for f in files: 63 | handler: Any 64 | for prefix, h in handler_map.items(): 65 | if not str(f).startswith(os.path.join(gdpr_dir, prefix)): 66 | continue 67 | handler = h 68 | break 69 | else: 70 | if f.is_dir(): 71 | continue 72 | 
else: 73 | e = RuntimeError(f"Unhandled file: {f}") 74 | logger.debug(str(e)) 75 | yield e 76 | continue 77 | 78 | if handler is None: 79 | # explicitly ignored 80 | continue 81 | 82 | if f.suffix != ".json": 83 | continue 84 | 85 | j = json.loads(f.read_text()) 86 | yield from handler(j) 87 | 88 | 89 | def songs() -> Songs: 90 | emitted: Set[Tuple[str, str, str]] = set() 91 | for p in playlists(): 92 | if isinstance(p, Exception): 93 | yield p 94 | continue 95 | for song in p.songs: 96 | key = (song.name, song.artist, song.album) 97 | if key in emitted: 98 | continue 99 | yield song 100 | emitted.add(key) 101 | 102 | 103 | def stats() -> Stats: 104 | from my.core import stat 105 | 106 | return { 107 | **stat(playlists), 108 | **stat(songs), 109 | } 110 | 111 | 112 | def _filter_playlists(d: Json) -> Iterator[Playlist]: 113 | # parse, then filter 114 | # make sure this playlist has more than one artist 115 | # if its just one artist, its probably just an album 116 | # that's been classified as a playlist 117 | for p in _parse_all_playlists(d): 118 | if len(set([s.artist for s in p.songs])) > 1: 119 | yield p 120 | 121 | 122 | def _parse_all_playlists(d: Json) -> Iterator[Playlist]: 123 | for plist in d["playlists"]: 124 | if plist["numberOfFollowers"] > 50: 125 | logger.debug( 126 | f"Ignoring playlist: {plist['name']}, too many followers to be one of my playlists" 127 | ) 128 | continue 129 | songs: List[Song] = [_parse_song(b) for b in plist["items"]] 130 | yield Playlist( 131 | name=plist["name"], 132 | last_modified=_parse_date(plist["lastModifiedDate"]), 133 | songs=songs, 134 | ) 135 | 136 | 137 | def _parse_song(song_info: Json) -> Song: 138 | tr: Json = song_info["track"] 139 | return Song( 140 | name=tr["trackName"], 141 | artist=tr["artistName"], 142 | album=tr["albumName"], 143 | ) 144 | 145 | 146 | def _parse_date(date_str: str) -> date: 147 | date_info: List[int] = list(map(int, date_str.split("-"))) 148 | return date(year=date_info[0], month=date_info[1], day=date_info[2]) 149 | -------------------------------------------------------------------------------- /my/steam/scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses steam game/achievement data scraped with 3 | https://github.com/purarue/steamscraper 4 | """ 5 | 6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 7 | from my.config import steam as user_config # type: ignore[attr-defined] 8 | from dataclasses import dataclass 9 | from my.core import Paths 10 | 11 | 12 | @dataclass 13 | class config(user_config.scraper): 14 | # path to the exported data 15 | export_path: Paths 16 | 17 | 18 | import json 19 | from functools import partial 20 | from pathlib import Path 21 | from datetime import datetime 22 | from typing import NamedTuple, Iterator, Sequence, Dict, List, Optional, Any 23 | from itertools import groupby 24 | 25 | from my.core import get_files, Stats, Res 26 | from my.utils.time import parse_datetime_sec 27 | 28 | 29 | def inputs() -> Sequence[Path]: 30 | return get_files(config.export_path) 31 | 32 | 33 | class Achievement(NamedTuple): 34 | title: str 35 | description: str 36 | achieved: bool 37 | game_name: str 38 | achieved_on: Optional[datetime] 39 | icon: Optional[str] 40 | 41 | 42 | class Game(NamedTuple): 43 | id: int 44 | name: str 45 | hours_played: float 46 | achievements: List[Achievement] 47 | image_url: Optional[str] 48 | 49 | @property 50 | def achieved(self) -> int: 51 | return 
list(map(lambda g: g.achieved, self.achievements)).count(True) 52 | 53 | @property 54 | def achievement_count(self) -> int: 55 | return len(self.achievements) 56 | 57 | @property 58 | def achievement_percentage(self) -> float: 59 | return self.achieved / self.achievement_count 60 | 61 | 62 | Results = Iterator[Res[Game]] 63 | AchievementResults = Iterator[Res[Achievement]] 64 | 65 | 66 | def games() -> Results: 67 | """only ones I've played""" 68 | for game in all_games(): 69 | if isinstance(game, Exception): 70 | yield game 71 | else: 72 | if game.hours_played > 0.0: 73 | yield game 74 | 75 | 76 | def all_games() -> Results: 77 | # combine the results from multiple files 78 | games_no_exc: List[Game] = [] 79 | for json_file in inputs(): 80 | for g in _read_parsed_json(json_file): 81 | if isinstance(g, Exception): 82 | yield g 83 | else: 84 | assert isinstance(g, Game) 85 | games_no_exc.append(g) 86 | 87 | # only return the single game with the most achievement count if there are duplicates 88 | for _, gm in groupby(sorted(games_no_exc, key=lambda x: x.id), lambda x: x.id): 89 | yield max(gm, key=lambda gmo: gmo.achieved) 90 | 91 | 92 | def all_achievements() -> AchievementResults: 93 | # combine the results from multiple achievement lists 94 | for game in all_games(): 95 | if isinstance(game, Exception): 96 | yield game 97 | else: 98 | yield from game.achievements 99 | 100 | 101 | # only ones which Ive actually achieved 102 | def achievements() -> AchievementResults: 103 | for ach in all_achievements(): 104 | if isinstance(ach, Exception): 105 | yield ach 106 | else: 107 | if ach.achieved: 108 | yield ach 109 | 110 | 111 | def _read_parsed_json(p: Path) -> Results: 112 | items = json.loads(p.read_text()) 113 | for _, game in items.items(): 114 | ach_lambda = partial(_parse_achievement, game_name=game["name"]) 115 | try: 116 | yield Game( 117 | id=game["id"], 118 | name=game["name"], 119 | hours_played=game["hours"], 120 | image_url=game["image"], 121 | achievements=list(map(ach_lambda, game["achievements"])), 122 | ) 123 | except TypeError as e: 124 | # error creating datetime? 
125 | yield e 126 | 127 | 128 | def _parse_achievement(ach: Dict[str, Any], game_name: str) -> Achievement: 129 | achieved = ach["progress"]["unlocked"] 130 | achieved_on = None 131 | # parse datetime if it has it 132 | # could possibly throw an error, but its caught above 133 | if achieved: 134 | achieved_on = parse_datetime_sec(ach["progress"]["data"]) 135 | return Achievement( 136 | title=ach["title"], 137 | description=ach["description"], 138 | game_name=game_name, 139 | achieved=achieved, 140 | achieved_on=achieved_on, 141 | icon=ach.get("icon"), 142 | ) 143 | 144 | 145 | def stats() -> Stats: 146 | from my.core import stat 147 | 148 | return { 149 | **stat(games), 150 | **stat(achievements), 151 | } 152 | -------------------------------------------------------------------------------- /my/todotxt/active.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses your active todotxt (http://todotxt.org/) done.txt and todo.txt 3 | """ 4 | 5 | REQUIRES = ["pytodotxt>=1.5.0"] 6 | 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import todotxt as user_config # type: ignore[attr-defined] 10 | 11 | 12 | from pathlib import Path 13 | from typing import ( 14 | Tuple, 15 | Iterator, 16 | ) 17 | 18 | from dataclasses import dataclass 19 | from my.core import Stats, PathIsh 20 | from .common import Todo, TODOTXT_FILES, parse_todotxt_buffer 21 | 22 | 23 | @dataclass 24 | class config(user_config.active): 25 | # path to your active todo.txt directory 26 | # this is the same place todo.sh stores your files 27 | export_path: PathIsh 28 | 29 | 30 | def inputs() -> Tuple[Path, Path]: 31 | p = Path(config.export_path).expanduser().absolute() 32 | if not p.exists(): 33 | raise FileNotFoundError(f"todotxt export path {p} doesn't exist") 34 | # todo.txt, done.txt 35 | return ( 36 | p / TODOTXT_FILES[0], 37 | p / TODOTXT_FILES[1], 38 | ) 39 | 40 | 41 | Results = Iterator[Todo] 42 | 43 | 44 | def done() -> Results: 45 | df = inputs()[1] 46 | if not Path(df).exists(): 47 | return 48 | yield from parse_todotxt_buffer(Path(df).read_text()) 49 | 50 | 51 | def todos() -> Results: 52 | tf = inputs()[0] 53 | if not tf.exists(): 54 | return 55 | yield from parse_todotxt_buffer(tf.read_text()) 56 | 57 | 58 | def stats() -> Stats: 59 | from my.core import stat 60 | 61 | return { 62 | **stat(todos), 63 | **stat(done), 64 | } 65 | -------------------------------------------------------------------------------- /my/todotxt/common.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, cast, Optional, List, Union 2 | from datetime import datetime 3 | 4 | from pytodotxt import Task, TodoTxtParser # type: ignore[import] 5 | from my.core import __NOT_HPI_MODULE__ # noqa: F401 6 | 7 | REQUIRES = ["pytodotxt>=1.5.0"] 8 | 9 | 10 | class Todo(Task): 11 | # support serializing with hpi query 12 | def _serialize(self) -> Dict[str, Any]: 13 | assert self._raw is not None 14 | return { 15 | "completed": self.is_completed, 16 | "completion_date": self.completion_date, 17 | "deadline": self.deadline, 18 | "creation_date": self.creation_date, 19 | "priority": self.priority, 20 | "text": self.bare_description(), 21 | "projects": self.projects, 22 | "contexts": self.contexts, 23 | "attributes": self.attributes, 24 | "raw": self._raw, 25 | } 26 | 27 | @property 28 | def bare(self) -> str: 29 | return cast(str, self.bare_description()) 30 | 31 | # parse 
the deadline created by https://github.com/purarue/full_todotxt 32 | # this is optional, so if it fails, just return None 33 | @property 34 | def deadline(self) -> Optional[datetime]: 35 | attrs = self.attributes 36 | if not attrs: 37 | return None 38 | if not isinstance(attrs, dict): 39 | return None 40 | if "deadline" in attrs: 41 | try: 42 | data = attrs["deadline"][0] 43 | parsed = datetime.strptime(data, "%Y-%m-%dT%H-%M%z") 44 | return parsed 45 | except ValueError: 46 | pass 47 | return None 48 | 49 | def __eq__(self, other: Any) -> bool: 50 | if not isinstance(other, Task): 51 | return False 52 | return cast(bool, self._raw == other._raw) 53 | 54 | def __ne__(self, other: Any) -> bool: 55 | return not self.__eq__(other) 56 | 57 | def __hash__(self) -> int: 58 | return hash(self._raw) 59 | 60 | 61 | TODOTXT_FILES = ["todo.txt", "done.txt"] 62 | 63 | 64 | def parse_todotxt_buffer(data: Union[str, bytes]) -> List[Todo]: 65 | return cast(List[Todo], TodoTxtParser(task_type=Todo).parse(data)) 66 | -------------------------------------------------------------------------------- /my/todotxt/git_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses todotxt (http://todotxt.org/) done.txt and todo.txt history 3 | from https://github.com/purarue/git_doc_history backups 4 | """ 5 | 6 | REQUIRES = [ 7 | "pytodotxt>=1.5.0", 8 | "git+https://github.com/purarue/git_doc_history", 9 | ] 10 | 11 | 12 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 13 | from my.config import todotxt as user_config # type: ignore[attr-defined] 14 | 15 | 16 | from pathlib import Path 17 | from datetime import datetime, timezone 18 | from typing import Iterator 19 | 20 | from git_doc_history import DocHistory, parse_snapshot_diffs, Action 21 | 22 | from dataclasses import dataclass 23 | from my.core import Stats, PathIsh 24 | from .common import Todo, TODOTXT_FILES, parse_todotxt_buffer 25 | 26 | 27 | @dataclass 28 | class config(user_config.git_history): 29 | # path to the git backup directory 30 | export_path: PathIsh 31 | 32 | 33 | def input() -> DocHistory: 34 | return DocHistory( 35 | backup_dir=Path(config.export_path).expanduser().absolute(), 36 | copy_files=TODOTXT_FILES, 37 | ) 38 | 39 | 40 | Results = Iterator[Todo] 41 | 42 | 43 | # These work by grabbing the latest version of the file 44 | # from the git repo, so they may not always be up to date 45 | # if you don't update git_doc_history often enough 46 | def done() -> Results: 47 | yield from parse_todotxt_buffer( 48 | input().extract_buffer_at("done.txt", at=datetime.now()) 49 | ) 50 | 51 | 52 | def todos() -> Results: 53 | yield from parse_todotxt_buffer( 54 | input().extract_buffer_at("todo.txt", at=datetime.now()) 55 | ) 56 | 57 | 58 | @dataclass 59 | class TodoEvent: 60 | todo: Todo 61 | dt: datetime 62 | action: Action 63 | 64 | 65 | def events() -> Iterator[TodoEvent]: 66 | """ 67 | Keeps track when I added/completed todos 68 | """ 69 | for diff in parse_snapshot_diffs( 70 | input(), 71 | file="todo.txt", 72 | parse_func=lambda doc: parse_todotxt_buffer(doc.data), 73 | ): 74 | yield TodoEvent( 75 | todo=diff.data, 76 | dt=datetime.fromtimestamp(diff.epoch_time, tz=timezone.utc), 77 | action=diff.action, 78 | ) 79 | 80 | 81 | def stats() -> Stats: 82 | from my.core import stat 83 | 84 | return { 85 | **stat(todos), 86 | **stat(done), 87 | **stat(events), 88 | } 89 | 
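A minimal, hypothetical usage sketch (editor's addition, not part of the repository) of how the events() stream from my/todotxt/git_history.py above could be consumed; it relies only on the TodoEvent fields (todo, dt, action) defined in that module and assumes the todotxt.git_history config is set up as described there:

# editor's sketch, not repository code -- assumes my.config.todotxt.git_history is configured
from my.todotxt.git_history import events

for ev in events():
    # ev.dt is timezone-aware (UTC); ev.todo.bare is the plain description text
    print(ev.dt.isoformat(), ev.action, ev.todo.bare)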
-------------------------------------------------------------------------------- /my/trakt/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the dump of my movies/tv shows history and watchlist from https://trakt.tv/ 3 | Uses https://github.com/purarue/traktexport 4 | """ 5 | 6 | REQUIRES = ["git+https://github.com/purarue/traktexport"] 7 | 8 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 9 | from my.config import trakt as user_config # type: ignore[attr-defined] 10 | 11 | from pathlib import Path 12 | from typing import Iterator, Dict, Any, Sequence, List 13 | from functools import lru_cache 14 | 15 | import traktexport.dal as D 16 | from traktexport.merge import read_and_merge_exports 17 | 18 | from dataclasses import dataclass 19 | from my.core import get_files, Stats, make_logger, Paths 20 | from my.core.cachew import mcachew 21 | 22 | 23 | @dataclass 24 | class config(user_config.export): 25 | # path[s]/glob to the exported data. These are the resulting json file from 'traktexport export' 26 | export_path: Paths 27 | 28 | 29 | logger = make_logger(__name__) 30 | 31 | 32 | def inputs() -> Sequence[Path]: 33 | return get_files(config.export_path) 34 | 35 | 36 | def _cachew_depends_on() -> List[float]: 37 | return [Path(f).lstat().st_mtime for f in sorted(inputs())] 38 | 39 | 40 | @lru_cache(maxsize=None) 41 | def _read_trakt_exports() -> D.FullTraktExport: 42 | return read_and_merge_exports(list(map(str, inputs()))) 43 | 44 | 45 | ### Expose all the parsed information from traktexport.dal 46 | 47 | 48 | def profile_stats() -> Dict[str, Any]: 49 | # read the 'stats' key directly from the JSON file 50 | return _read_trakt_exports().stats 51 | 52 | 53 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 54 | def followers() -> Iterator[D.Follow]: 55 | yield from _read_trakt_exports().followers 56 | 57 | 58 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 59 | def likes() -> Iterator[D.Like]: 60 | yield from _read_trakt_exports().likes 61 | 62 | 63 | # TODO: hmm, cachew seems to fail with this one, not sure why 64 | # @mcachew(depends_on=_cachew_depends_on, logger=logger) 65 | def watchlist() -> Iterator[D.WatchListEntry]: 66 | yield from _read_trakt_exports().watchlist 67 | 68 | 69 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 70 | def ratings() -> Iterator[D.Rating]: 71 | yield from _read_trakt_exports().ratings 72 | 73 | 74 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 75 | def history() -> Iterator[D.HistoryEntry]: 76 | yield from _read_trakt_exports().history 77 | 78 | 79 | def stats() -> Stats: 80 | from my.core import stat 81 | 82 | return { 83 | **stat(followers), 84 | **stat(likes), 85 | **stat(watchlist), 86 | **stat(ratings), 87 | **stat(history), 88 | } 89 | -------------------------------------------------------------------------------- /my/ttt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses history from https://github.com/purarue/ttt 3 | """ 4 | 5 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 6 | from my.config import ttt as user_config # type: ignore[attr-defined] 7 | 8 | import csv 9 | from pathlib import Path 10 | from datetime import datetime 11 | from io import StringIO 12 | from typing import ( 13 | NamedTuple, 14 | Iterator, 15 | Sequence, 16 | Optional, 17 | ) 18 | from itertools import chain 19 | from 
functools import partial 20 | 21 | from more_itertools import unique_everseen 22 | 23 | from dataclasses import dataclass 24 | from my.core import get_files, Stats, Paths, make_logger 25 | from my.utils.time import parse_datetime_sec 26 | from my.utils.parse_csv import parse_csv_file 27 | 28 | logger = make_logger(__name__) 29 | 30 | 31 | @dataclass 32 | class config(user_config): 33 | # path[s]/glob to the backed up ttt history files 34 | # (can be a list if you want to provide the live file) 35 | export_path: Paths 36 | 37 | 38 | def inputs() -> Sequence[Path]: 39 | return get_files(config.export_path) 40 | 41 | 42 | # represents one history entry (command) 43 | class Entry(NamedTuple): 44 | dt: datetime 45 | command: str 46 | directory: Optional[str] 47 | 48 | 49 | Results = Iterator[Entry] 50 | 51 | 52 | def history() -> Results: 53 | func = partial(parse_csv_file, parse_function=_parse_text, logger=logger) 54 | yield from unique_everseen( 55 | chain(*map(func, inputs())), 56 | key=lambda e: ( 57 | e.dt, 58 | e.command, 59 | ), 60 | ) 61 | 62 | 63 | def _parse_text(data: str) -> Results: 64 | csv_reader = csv.reader( 65 | StringIO(data), delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL 66 | ) 67 | for row in csv_reader: 68 | yield Entry( 69 | dt=parse_datetime_sec(row[0]), 70 | command=row[2], 71 | directory=None if row[1] == "-" else row[1], 72 | ) 73 | 74 | 75 | def stats() -> Stats: 76 | from my.core import stat 77 | 78 | return {**stat(history)} 79 | -------------------------------------------------------------------------------- /my/twitch/all.py: -------------------------------------------------------------------------------- 1 | from .common import Results 2 | 3 | 4 | def events() -> Results: 5 | # comment out any sources you're not using 6 | from .gdpr import events as gdpr_events 7 | from .overrustle_logs import events as chatlog_events 8 | 9 | yield from chatlog_events() 10 | yield from gdpr_events() 11 | 12 | 13 | from my.core import Stats 14 | 15 | 16 | def stats() -> Stats: 17 | from my.core import stat 18 | 19 | return {**stat(events)} 20 | -------------------------------------------------------------------------------- /my/twitch/common.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple, Union, Iterator 3 | 4 | from my.core import __NOT_HPI_MODULE__ # noqa: F401 5 | 6 | 7 | class Event(NamedTuple): 8 | event_type: str 9 | dt: datetime 10 | channel: str 11 | # e.g., additional data/chatlog message 12 | context: Union[str, int] 13 | 14 | 15 | Results = Iterator[Event] 16 | -------------------------------------------------------------------------------- /my/twitch/gdpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the twitch GDPR data request 3 | https://www.twitch.tv/p/en/legal/privacy-choices/#user-privacy-requests 4 | """ 5 | 6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 7 | from my.config import twitch as user_config # type: ignore[attr-defined] 8 | 9 | from dataclasses import dataclass 10 | from my.core import PathIsh 11 | 12 | 13 | @dataclass 14 | class config(user_config.gdpr): 15 | gdpr_dir: PathIsh # path to unpacked GDPR archive 16 | 17 | 18 | import csv 19 | from datetime import datetime 20 | from pathlib import Path 21 | from typing import Iterator, Union, Sequence, List 22 | 23 | from .common import Event, Results 24 | 25 | from my.core 
import make_logger 26 | from my.core.cachew import mcachew 27 | from my.core.common import get_files 28 | 29 | logger = make_logger(__name__) 30 | 31 | 32 | def inputs() -> Sequence[Path]: 33 | return get_files(config.gdpr_dir, glob="*.csv") 34 | 35 | 36 | def _cachew_depends_on() -> List[float]: 37 | return [p.stat().st_mtime for p in inputs()] 38 | 39 | 40 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 41 | def events() -> Results: 42 | for file in inputs(): 43 | yield from _parse_csv_file(file) 44 | 45 | 46 | def _parse_csv_file(p: Path) -> Iterator[Event]: 47 | with p.open("r") as f: 48 | reader = csv.reader(f) 49 | next(reader) # ignore header 50 | for line in reader: 51 | context: Union[str, int] 52 | context = line[6] 53 | if context.isdigit(): 54 | context = int(line[6]) 55 | yield Event( 56 | event_type=line[0], 57 | dt=datetime.fromisoformat(line[1]), 58 | channel=line[5], 59 | context=context, 60 | ) 61 | -------------------------------------------------------------------------------- /my/twitch/overrustle_logs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reads parsed information from the overrustle logs dump 3 | https://github.com/purarue/overrustle_parser 4 | """ 5 | 6 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 7 | from my.config import twitch as user_config # type: ignore[attr-defined] 8 | 9 | from dataclasses import dataclass 10 | from my.core import Paths 11 | 12 | 13 | @dataclass 14 | class config(user_config.overrustle): 15 | export_path: Paths # parsed overrustle_parser json files 16 | 17 | 18 | import json 19 | from pathlib import Path 20 | from typing import Sequence, List 21 | 22 | from my.core import make_logger 23 | from my.core.cachew import mcachew 24 | from my.core.common import get_files 25 | from my.utils.time import parse_datetime_sec 26 | 27 | from .common import Event, Results 28 | 29 | logger = make_logger(__name__) 30 | 31 | 32 | def inputs() -> Sequence[Path]: 33 | return get_files(config.export_path) 34 | 35 | 36 | def _cachew_depends_on() -> List[float]: 37 | return [p.stat().st_mtime for p in inputs()] 38 | 39 | 40 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 41 | def events() -> Results: 42 | for file in inputs(): 43 | yield from _parse_json_dump(file) 44 | 45 | 46 | def _parse_json_dump(p: Path) -> Results: 47 | for blob in json.loads(p.read_text()): 48 | yield Event( 49 | event_type="chatlog", 50 | dt=parse_datetime_sec(blob["dt"]), 51 | channel=blob["channel"], 52 | context=blob["message"], 53 | ) 54 | -------------------------------------------------------------------------------- /my/utils/backup_to/__main__.py: -------------------------------------------------------------------------------- 1 | from os import environ, path 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | from my.core import __NOT_HPI_MODULE__ # noqa: F401 7 | 8 | # if the HPIDATA environment variable is set (which points to my data) 9 | # use that. 
Else, just default to ~/data 10 | BASE_PREFIX: Path = Path(environ.get("HPIDATA", path.expanduser("~/data"))) 11 | 12 | 13 | def get_dir(name: str) -> Path: 14 | to = (BASE_PREFIX / name).absolute() 15 | to.mkdir(parents=True, exist_ok=True) 16 | return to 17 | 18 | 19 | @click.command() 20 | @click.argument("NAME") 21 | def main(name: str) -> None: 22 | """ 23 | Helper script to locate a directory to backup to 24 | """ 25 | click.echo(str(get_dir(name))) 26 | 27 | 28 | if __name__ == "__main__": 29 | main(prog_name="backup_to") 30 | -------------------------------------------------------------------------------- /my/utils/parse_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | 4 | from pathlib import Path 5 | from typing import Callable, Iterator, TypeVar, Optional 6 | 7 | T = TypeVar("T") 8 | 9 | 10 | def parse_csv_file( 11 | histfile: Path, 12 | parse_function: Callable[[str], Iterator[T]], 13 | logger: Optional[logging.Logger] = None, 14 | ) -> Iterator[T]: 15 | """ 16 | Parses a CSV file using parse_function, yield results from that function. 17 | 18 | If the CSV file contains NUL bytes, replace those and try again. 19 | """ 20 | with histfile.open("r", encoding="utf-8", newline="") as f: 21 | data = f.read() 22 | try: 23 | yield from parse_function(data) 24 | except (csv.Error, ValueError) as e: 25 | if "\0" not in data: 26 | raise RuntimeError(f"Could not parse {histfile}: {e}") from e 27 | else: 28 | if logger: 29 | logger.warning("Found NUL byte in %s: %s", histfile, e) 30 | yield from parse_function(data.replace("\0", "")) 31 | -------------------------------------------------------------------------------- /my/utils/time.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from datetime import datetime, timezone 3 | 4 | from my.core import __NOT_HPI_MODULE__ # noqa: F401 5 | 6 | # TODO: maybe this should be PR'd to master/put into 7 | # my.time.tz/utils? 
8 | 9 | 10 | def parse_datetime_sec(d: Union[str, float, int]) -> datetime: 11 | return datetime.fromtimestamp(int(d), tz=timezone.utc) 12 | 13 | 14 | def parse_datetime_millis(d: Union[str, float, int]) -> datetime: 15 | return parse_datetime_sec(int(d) / 1000) 16 | -------------------------------------------------------------------------------- /my/zsh.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses ZSH history (uses exports from ./job/zsh_history.job) and current zsh history (from $ZDOTDIR) 3 | 4 | This parses the zsh format I've configured, zsh is heavily configurable 5 | Mine looks like: 6 | : 1598471925:470;python3 7 | : datetime:duration:command 8 | 9 | My config looks like: 10 | 11 | HISTFILE="${ZDOTDIR}/.zsh_history" 12 | HISTSIZE=1000000 13 | SAVEHIST=1000000 14 | setopt APPEND_HISTORY # append to history file instead of replacing 15 | setopt HIST_REDUCE_BLANKS # delete empty lines from history file 16 | setopt HIST_IGNORE_SPACE # ignore lines that start with space 17 | setopt HIST_NO_STORE # Do not add history and fc commands to the history 18 | setopt EXTENDED_HISTORY # save time/duration to history file 19 | """ 20 | 21 | # if on multiple computers, the zsh histories can be copied into the zsh.export_path 22 | # and it will merge everything without duplicates 23 | 24 | # see https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example 25 | from my.config import zsh as user_config # type: ignore[attr-defined] 26 | 27 | from pathlib import Path 28 | from typing import Sequence, Optional 29 | from functools import lru_cache 30 | 31 | from dataclasses import dataclass 32 | from my.core import ( 33 | get_files, 34 | warn_if_empty, 35 | Stats, 36 | make_logger, 37 | PathIsh, 38 | Paths, 39 | ) 40 | from my.core.cachew import mcachew 41 | from my.core.warnings import low 42 | from my.utils.time import parse_datetime_sec 43 | 44 | from more_itertools import unique_everseen 45 | 46 | 47 | @dataclass 48 | class config(user_config): 49 | # path[s]/glob to the exported zsh history files 50 | export_path: Paths 51 | 52 | # path to current zsh history (i.e. 
the live file) 53 | live_file: Optional[PathIsh] 54 | 55 | 56 | logger = make_logger(__name__) 57 | 58 | 59 | def backup_inputs() -> Sequence[Path]: 60 | return list(get_files(config.export_path)) 61 | 62 | 63 | @lru_cache(1) 64 | def _live_file() -> Optional[Path]: 65 | if config.live_file is not None: 66 | p: Path = Path(config.live_file).expanduser().absolute() 67 | if p.exists(): 68 | return p 69 | else: 70 | low(f"'live_file' provided {config.live_file} but that file doesn't exist.") 71 | return None 72 | return None 73 | 74 | 75 | import re 76 | 77 | from datetime import datetime 78 | from typing import NamedTuple, Iterator, Tuple 79 | from itertools import chain 80 | 81 | 82 | # represents one history entry (command) 83 | class Entry(NamedTuple): 84 | dt: datetime 85 | duration: int 86 | command: str 87 | 88 | 89 | Results = Iterator[Entry] 90 | 91 | 92 | def history() -> Results: 93 | lf = _live_file() 94 | if lf is not None: 95 | yield from _merge_histories(_history_from_backups(), _parse_file(lf)) 96 | else: 97 | # if we're not merging the live history file 98 | # dont need to spend the time doing the additional _merge_histories 99 | yield from _history_from_backups() 100 | 101 | 102 | def _cachew_depends_on() -> Sequence[Path]: 103 | return sorted(backup_inputs()) 104 | 105 | 106 | @mcachew(depends_on=_cachew_depends_on, logger=logger) 107 | def _history_from_backups() -> Results: 108 | yield from _merge_histories(*map(_parse_file, backup_inputs())) 109 | 110 | 111 | @warn_if_empty 112 | def _merge_histories(*sources: Results) -> Results: 113 | yield from unique_everseen( 114 | chain(*sources), 115 | key=lambda e: ( 116 | e.dt, 117 | e.command, 118 | ), 119 | ) 120 | 121 | 122 | def _parse_file(histfile: Path) -> Results: 123 | dt: Optional[datetime] = None 124 | dur: Optional[int] = None 125 | command: str = "" 126 | # can't parse line by line since some commands are multiline 127 | # sort of structured like a do-while loop 128 | for line in histfile.open(encoding="latin-1"): 129 | r = _parse_metadata(line) 130 | # if regex didn't match, this is a multi line command string 131 | if r is None: 132 | command += "\n" + line 133 | else: 134 | # this 'if' is needed for the first item (since its not set on the first loop) 135 | # yield the last command 136 | if dt is not None and dur is not None: 137 | yield Entry( 138 | dt=dt, 139 | duration=dur, 140 | command=command, 141 | ) 142 | # set 'current' dt, dur, command to matched groups 143 | dt, dur, command = r 144 | # yield the last entry 145 | if command: 146 | yield Entry( 147 | dt=dt, # type: ignore[arg-type] 148 | duration=dur, # type: ignore[arg-type] 149 | command=command, 150 | ) 151 | 152 | 153 | PATTERN = re.compile(r"^: (\d+):(\d+);(.*)$") 154 | 155 | 156 | def _parse_metadata(histline: str) -> Optional[Tuple[datetime, int, str]]: 157 | """ 158 | parse the date, duration, and command from a line 159 | """ 160 | matches = PATTERN.match(histline) 161 | if matches: 162 | g = matches.groups() 163 | return (parse_datetime_sec(g[0]), int(g[1]), g[2]) 164 | return None 165 | 166 | 167 | def stats() -> Stats: 168 | from my.core import stat 169 | 170 | return {**stat(history)} 171 | -------------------------------------------------------------------------------- /scripts/functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # These should work with both bash and zsh 3 | # 4 | # To use these, put 'source /path/to/this/repo/functions.sh' 5 | # in your shell profile 6 
| # 7 | # these use a bunch of common-ish shell tools 8 | # to interact with the hpi query JSON API 9 | # jq: https://github.com/stedolan/jq 10 | # fzf: https://github.com/junegunn/fzf 11 | 12 | # helpers used across multiple functions 13 | alias mpv-from-stdin='mpv --playlist=- --no-audio-display --msg-level=file=error' 14 | filter_unique() { 15 | awk '!seen[$0]++' 16 | } 17 | 18 | ################### 19 | # my.listenbrainz 20 | ################### 21 | 22 | scrobbles() { 23 | hpi query my.listenbrainz.export.history -s "$@" 24 | } 25 | scrobble-describe() { 26 | jq -r '"\(.listened_at) \(.artist_name) - \(.track_name)"' 27 | } 28 | 29 | ########## 30 | # my.mpv 31 | ########## 32 | 33 | # functions to replay music I've listened to recently 34 | mpv-recent() { 35 | local args=() 36 | if [[ -n "$1" ]]; then 37 | args+=("--limit" "$1") 38 | 39 | fi 40 | hpi query my.mpv.history_daemon.history --order-type datetime --reverse -s "${args[@]}" 41 | } 42 | mpv-recent-path() { 43 | mpv-recent "$1" | jq -r .path 44 | } 45 | replay() { 46 | mpv-recent-path | exists | grep --max-count=1 "$XDG_MUSIC_DIR" | mpv-from-stdin 47 | } 48 | # requires https://github.com/purarue/exists, https://github.com/purarue/pura-utils 49 | replay-recent() { 50 | mpv-recent-path "$1" | exists | unique | head -n "${1:-$LINES}" | fzf --select-1 | mpv-from-stdin 51 | } 52 | 53 | ########## 54 | # my.zsh 55 | ########## 56 | 57 | # jq later to preserve newlines in commands 58 | alias zsh-unique="hpi query -s my.zsh.history | jq '.command' | filter_unique | jq -r" 59 | alias zsh-unique-fzf='zsh-unique | fzf' 60 | 61 | ############ 62 | # my.trakt 63 | ############ 64 | 65 | # e.g. trakt-movies --recent 4w | trakt-describe-movie 66 | trakt-movies() { 67 | hpi query 'my.trakt.export.history' -s "$@" | trakt-filter-movies 68 | } 69 | 70 | # e.g. trakt-episodes --recent 4w | trakt-describe-episode 71 | trakt-episodes() { 72 | hpi query 'my.trakt.export.history' -s "$@" | trakt-filter-episodes 73 | } 74 | 75 | trakt-filter-movies() { 76 | jq 'select(.media_type == "movie")' 77 | } 78 | 79 | trakt-filter-episodes() { 80 | jq 'select(.media_type == "episode")' 81 | } 82 | 83 | trakt-describe-movie() { 84 | jq -r '"\(.media_data.title) (\(.media_data.year))"' 85 | } 86 | 87 | trakt-describe-episode() { 88 | jq -r '"\(.media_data.show.title) (\(.media_data.show.year)) - S\(.media_data.season)E\(.media_data.episode) \(.media_data.title)"' 89 | } 90 | -------------------------------------------------------------------------------- /scripts/lint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # wrapper around linter/formatters 3 | # pauses at each step if there are errors 4 | # this is run locally to autoformat/lint code 5 | 6 | set -o pipefail 7 | 8 | # get the name of this script 9 | declare script_name 10 | script_name="$(basename "${BASH_SOURCE[0]}")" 11 | 12 | # function to verify an external command is installed 13 | havecmd() { 14 | local BINARY ERRMSG 15 | # error if first argument isn't provided 16 | BINARY="${1:?Must provide command to check}" 17 | # the command exists, exit with 0 (success!) 18 | if command -v "${BINARY}" >/dev/null 2>&1; then 19 | return 0 20 | else 21 | # construct error message 22 | ERRMSG="'${script_name}' requires '${BINARY}', could not find that on your \$PATH" 23 | if [[ -n "$2" ]]; then 24 | ERRMSG="${ERRMSG}. 
$2" 25 | fi 26 | printf '%s\n' "${ERRMSG}" 1>&2 27 | return 1 28 | fi 29 | } 30 | 31 | set -e 32 | havecmd shellcheck 33 | havecmd exists 'See https://github.com/purarue/exists' 34 | havecmd rifleman 'See https://github.com/purarue/rifleman' 35 | havecmd pytest 36 | havecmd jq 37 | havecmd tput 38 | havecmd mypy 39 | havecmd flake8 40 | havecmd black 41 | havecmd shfmt 42 | set +e 43 | 44 | # cd to base directory 45 | BASE_DIR="$(realpath "$(dirname "${BASH_SOURCE[0]}")"/..)" 46 | readonly BASE_DIR 47 | cd "${BASE_DIR}" || exit 1 48 | printf 'In: %s\n' "$(pwd)" 49 | 50 | # call shellcheck on all the scripts 51 | shellcheck_scripts() { 52 | git ls-files | exists | rifleman - -a lint -j | jq -r 'to_entries[] | select(.key|startswith("shellcheck")) | .value | .[]' | xargs shellcheck 53 | } 54 | 55 | prompt() { 56 | local MESSAGE 57 | MESSAGE='Hit enter to continue > ' 58 | [[ -n "$1" ]] && MESSAGE="$1" 59 | echo -en "$(tput setaf 1)${MESSAGE}$(tput sgr0)" 60 | read -r # if no variable is specified, sets the REPLY environment variable 61 | } 62 | 63 | update_fork() { 64 | local FORK_LOCATION 65 | FORK_LOCATION="$(realpath ../HPI-karlicoss/)" 66 | cd "${FORK_LOCATION}" || return $? 67 | git checkout master 68 | git pull upstream master 69 | } 70 | 71 | main() { 72 | (update_fork) # cd in subshell 73 | python3 -m pytest "$@" || prompt '' 74 | echo "Running mypy..." 75 | MY_CONFIG="${BASE_DIR}/tests/my" mypy --install-types --non-interactive --color-output -p my || prompt '' 76 | python3 -m mypy ~/.config/my/my/config/ || prompt '' 77 | python3 -m flake8 ./my || prompt '' 78 | echo "Running shellcheck..." 79 | shellcheck_scripts 80 | # format everything in the repo 81 | git ls-files | exists | rifleman - 82 | echo -e "$(tput setaf 2)Done!$(tput sgr0)" 83 | git status 84 | } 85 | 86 | main "$@" 87 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = HPI_purarue 3 | version = 0.0.1 4 | description = "A Python interface to my life" 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/purarue/HPI 8 | author = "purarue" 9 | license = MIT 10 | license_files = LICENSE 11 | classifiers = 12 | License :: OSI Approved :: MIT License 13 | Operating System :: OS Independent 14 | Programming Language :: Python :: 3 15 | Programming Language :: Python :: 3 :: Only 16 | Programming Language :: Python :: 3.10 17 | Programming Language :: Python :: 3.11 18 | Programming Language :: Python :: 3.12 19 | Programming Language :: Python :: 3.13 20 | 21 | [options] 22 | python_requires = >3.9 23 | include_package_data = True 24 | zip_safe = False 25 | 26 | [options.entry_points] 27 | console_scripts = 28 | backup_to = my.utils.backup_to.__main__:main 29 | 30 | [options.package_data] 31 | my = py.typed 32 | 33 | [flake8] 34 | ignore = E501,E402,W503,E266,E203 35 | 36 | [mypy] 37 | pretty = True 38 | show_error_context = True 39 | show_error_codes = True 40 | check_untyped_defs = True 41 | namespace_packages = True 42 | disallow_incomplete_defs = True 43 | no_implicit_optional = True 44 | disallow_any_generics = True 45 | disallow_untyped_calls = True 46 | warn_redundant_casts = True 47 | warn_return_any = True 48 | warn_unreachable = True 49 | 50 | [tool:pytest] 51 | addopts = 52 | --verbose 53 | tests 54 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | from setuptools import setup, find_namespace_packages # type: ignore[import] 3 | 4 | 5 | def subpackages() -> Iterator[str]: 6 | # make sure subpackages are only in the my/ folder (not in tests or other folders here) 7 | for p in find_namespace_packages("."): 8 | if p.startswith("my"): 9 | yield p 10 | 11 | 12 | if __name__ == "__main__": 13 | setup(packages=list(subpackages())) 14 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/tests/__init__.py -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import pytest 6 | 7 | V = "HPI_TESTS_PURA" 8 | 9 | skip_if_not_pura = pytest.mark.skipif( 10 | V not in os.environ, 11 | reason=f"test on runs on @purarue data for now. Set envvar {V}=true to override", 12 | ) 13 | 14 | 15 | def data(file: Optional[str]) -> Path: 16 | d = Path(__file__).absolute().parent / "testdata" 17 | if file: 18 | d = d / file 19 | assert d.exists() 20 | return d 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(autouse=True) 5 | def without_cachew(): 6 | from my.core.cachew import disabled_cachew 7 | 8 | with disabled_cachew(): 9 | yield 10 | -------------------------------------------------------------------------------- /tests/my/my/config/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Config file used for testing in CI; so that config is defined 3 | """ 4 | 5 | import tempfile 6 | from pathlib import Path 7 | from typing import Optional, Sequence, Callable 8 | 9 | from os import environ, path 10 | 11 | from my.core.common import PathIsh, Paths 12 | 13 | 14 | class core: 15 | cache_dir: PathIsh = path.join(environ["HOME"], ".cache", "cachew") 16 | tmp_dir: PathIsh = path.join(tempfile.gettempdir(), "HPI-tempdir") 17 | enabled_modules: Sequence[str] = [] 18 | disabled_modules: Sequence[str] = [] 19 | 20 | 21 | class mail: 22 | class imap: 23 | mailboxes: Paths = "" 24 | # filter function which filters the input paths 25 | filter_path: Optional[Callable[[Path], bool]] = None 26 | 27 | class mbox: 28 | mailboxes: Paths = "" 29 | exclude_extensions = () 30 | 31 | 32 | class zsh: 33 | export_path: Paths = "" 34 | live_file: Optional[PathIsh] = "" 35 | 36 | 37 | class bash: 38 | export_path: Paths = "" 39 | 40 | 41 | class todotxt: 42 | class git_history: 43 | # path to git_doc_history directory 44 | export_path: Optional[PathIsh] = None 45 | 46 | class active: 47 | # path to your active todo.txt directory 48 | export_path: PathIsh = "" 49 | error_policy = "drop" 50 | 51 | 52 | class rss: 53 | class newsboat: 54 | class git_history: 55 | export_path: Paths = "" 56 | 57 | 58 | class mpv: 59 | class history_daemon: 60 | export_path: Paths = "" 61 | require_percent: Optional[float] = 0.75 62 | 63 | 64 | class league: 65 | class export: 66 | export_path: Paths = "" 67 | username = "" 68 | 69 | 70 | class 
chess: 71 | class export: 72 | export_path: Paths = "" 73 | 74 | 75 | class listenbrainz: 76 | class export: 77 | export_path: Paths = "" 78 | 79 | 80 | class trakt: 81 | class export: 82 | export_path: Paths = "" 83 | 84 | 85 | class mal: 86 | class export: 87 | export_path: PathIsh = "" 88 | zip_backup_path: Optional[PathIsh] = "" 89 | 90 | 91 | class grouvee: 92 | class export: 93 | export_path: Paths = "" 94 | 95 | 96 | class nextalbums: 97 | export_path: Paths = "" 98 | 99 | 100 | class steam: 101 | class scraper: 102 | export_path: Paths = "" 103 | 104 | 105 | class piazza: 106 | class scraper: 107 | export_path: Paths = "" 108 | 109 | 110 | class blizzard: 111 | class gdpr: 112 | export_path: Paths = "" 113 | 114 | 115 | class project_euler: 116 | export_path: Paths = "" 117 | 118 | 119 | class skype: 120 | class gdpr: 121 | export_path: Paths = "" 122 | 123 | 124 | class facebook: 125 | class gdpr: 126 | gdpr_dir: PathIsh = "" 127 | 128 | 129 | class spotify: 130 | class gdpr: 131 | gdpr_dir: PathIsh = "" 132 | 133 | 134 | class twitch: 135 | class overrustle: 136 | export_path: Paths = "" 137 | 138 | class gdpr: 139 | gdpr_dir: PathIsh = "" 140 | 141 | 142 | class ipython: 143 | export_path: Paths = "" 144 | 145 | 146 | class ttt: 147 | export_path: Paths = "" 148 | 149 | 150 | class activitywatch: 151 | class active_window: 152 | export_path: Paths = "" 153 | 154 | 155 | class apple: 156 | class privacy_export: 157 | gdpr_dir: PathIsh = "" 158 | 159 | 160 | class linkedin: 161 | class privacy_export: 162 | gdpr_dir: PathIsh = "" 163 | 164 | 165 | class scramble: 166 | class history: 167 | config_dir: Optional[PathIsh] = None 168 | 169 | 170 | class discord: 171 | class data_export: 172 | export_path: Paths = "" 173 | 174 | 175 | class runelite: 176 | class screenshots: 177 | export_path: Paths = "" 178 | 179 | 180 | class minecraft: 181 | class advancements: 182 | export_path: Paths = "" 183 | 184 | 185 | class offline: 186 | class listens: 187 | export_path: Paths = "" 188 | 189 | 190 | class time: 191 | class tz: 192 | policy = "convert" 193 | -------------------------------------------------------------------------------- /tests/test_apple.py: -------------------------------------------------------------------------------- 1 | from more_itertools import ilen 2 | 3 | 4 | from .common import skip_if_not_pura 5 | 6 | 7 | @skip_if_not_pura 8 | def test_apple_types() -> None: 9 | from my.apple.privacy_export import ( 10 | events, 11 | Game, 12 | GameAchievement, 13 | GameLeaderboardData, 14 | Location, 15 | ) 16 | 17 | all_ev = list(events()) 18 | assert len(all_ev) > 10 19 | all_types = set([Game, GameAchievement, GameLeaderboardData, Location]) 20 | assert all_types == set(map(type, all_ev)) 21 | # make sure we parsed everything without errors 22 | assert ilen(filter(lambda e: isinstance(e, Exception), all_ev)) == 0 23 | -------------------------------------------------------------------------------- /tests/test_bash.py: -------------------------------------------------------------------------------- 1 | from my.bash import _parse_file 2 | 3 | from .common import data 4 | 5 | 6 | def test_single_file() -> None: 7 | history_file = data("bash/history") 8 | history = list(_parse_file(history_file)) 9 | assert len(history) == 4 10 | assert history[0].command == "ls" 11 | assert history[1].command == "git status" 12 | assert ( 13 | history[2].command 14 | == '''echo "$( 15 | date 16 | uname 17 | )"''' 18 | ) 19 | 20 | assert history[3].command == "ls" 21 | 
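# Editor's note -- an illustrative sketch, not code from this repository: the bash
# fixture parsed above stores each entry as an epoch-seconds comment ("#1616723205")
# followed by one or more command lines. A minimal stand-alone parser for that layout
# (my.bash._parse_file itself is not shown here and may behave differently) could be:

from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, List, NamedTuple, Optional


class HistEntry(NamedTuple):
    dt: datetime
    command: str


def parse_timestamped_history(histfile: Path) -> Iterator[HistEntry]:
    dt: Optional[datetime] = None
    buf: List[str] = []
    for line in histfile.read_text().splitlines():
        if line.startswith("#") and line[1:].isdigit():
            # a new timestamp line: flush the previous (possibly multi-line) command
            if dt is not None and buf:
                yield HistEntry(dt=dt, command="\n".join(buf))
            dt = datetime.fromtimestamp(int(line[1:]), tz=timezone.utc)
            buf = []
        else:
            buf.append(line)
    # flush the final buffered command
    if dt is not None and buf:
        yield HistEntry(dt=dt, command="\n".join(buf))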
-------------------------------------------------------------------------------- /tests/test_commits.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from more_itertools import ilen 4 | 5 | from .common import skip_if_not_pura 6 | 7 | 8 | def file_count(dir_name: Path) -> int: 9 | return ilen(dir_name.rglob("*")) 10 | 11 | 12 | @skip_if_not_pura 13 | def test_commits() -> None: 14 | from my.coding.commits import repos, _cached_commits, Commit 15 | 16 | all_repos = list(repos()) 17 | assert len(all_repos) > 1 18 | # get a repo which has lots of files 19 | # probably has a couple commits 20 | for r in sorted(all_repos): 21 | if file_count(r) > 50: 22 | biggest_repo = r 23 | break 24 | else: 25 | raise RuntimeError("Couldn't find a repo with more than 100 files!") 26 | commits_for_repo = list(_cached_commits(biggest_repo)) 27 | assert len(commits_for_repo) >= 1 28 | assert isinstance(commits_for_repo[0], Commit) 29 | -------------------------------------------------------------------------------- /tests/test_games.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from more_itertools import ilen 4 | from my.core.error import raise_exceptions 5 | 6 | from .common import skip_if_not_pura 7 | 8 | 9 | @skip_if_not_pura 10 | def test_league() -> None: 11 | from my.league.export import history, Game 12 | 13 | gs: List[Game] = list(raise_exceptions(history())) 14 | assert len(gs) > 50 15 | 16 | 17 | @skip_if_not_pura 18 | def test_steam() -> None: 19 | from my.steam.scraper import games, achievements, Achievement 20 | 21 | assert ilen(games()) > 10 22 | ach: List[Achievement] = list(raise_exceptions(achievements())) 23 | assert any([a.game_name == "Counter-Strike: Global Offensive" for a in ach]) 24 | -------------------------------------------------------------------------------- /tests/test_ipython.py: -------------------------------------------------------------------------------- 1 | from my.ipython import _parse_database 2 | 3 | from .common import data 4 | 5 | import pytest 6 | 7 | from IPython.core.history import HistoryAccessor 8 | 9 | 10 | # https://github.com/ipython/ipython/issues/13666 11 | def accessor_works() -> bool: 12 | ipython_db = str(data("ipython.sqlite")) 13 | hist = HistoryAccessor(hist_file=ipython_db) # type: ignore[no-untyped-call] 14 | try: 15 | hist.get_last_session_id() 16 | return True 17 | except Exception: 18 | return False 19 | 20 | 21 | @pytest.mark.skipif(not accessor_works(), reason="ipython historyaccessor failed") 22 | def test_ipython() -> None: 23 | ipython_db = str(data("ipython.sqlite")) 24 | cmds = list(_parse_database(ipython_db)) 25 | assert len(cmds) == 13 26 | item = cmds[1] 27 | assert not isinstance(item, Exception) 28 | assert item.command == "fac(121)" 29 | -------------------------------------------------------------------------------- /tests/test_my.py: -------------------------------------------------------------------------------- 1 | from my.discord.data_export import test_remove_link_suppression # noqa 2 | -------------------------------------------------------------------------------- /tests/test_zsh.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Callable, Iterator 3 | from itertools import chain 4 | 5 | from my.zsh import _parse_file, _merge_histories, Entry 6 | 7 | from .common import data 8 | 9 | history_file = 
data("zsh/zsh_history") 10 | overlap_file = data("zsh/overlap_history") 11 | 12 | 13 | def _parse_and_merge(inputs: Callable[[], Iterator[Path]]) -> Iterator[Entry]: 14 | yield from _merge_histories(*chain(map(_parse_file, inputs()))) 15 | 16 | 17 | def test_single_file() -> None: 18 | """ 19 | test that a single zsh parse works and for an entry in the history 20 | """ 21 | 22 | def zsh_small_test(): 23 | yield Path(history_file) 24 | 25 | items = list(_parse_and_merge(inputs=zsh_small_test)) 26 | assert len(items) == 11 27 | 28 | from datetime import datetime, timezone 29 | 30 | # from the test history file, fine to do 31 | e = Entry( 32 | dt=datetime( 33 | year=2020, 34 | month=7, 35 | day=14, 36 | hour=2, 37 | minute=21, 38 | second=37, 39 | tzinfo=timezone.utc, 40 | ), 41 | duration=0, 42 | command="ls", 43 | ) 44 | assert e in items 45 | 46 | 47 | def test_overlap() -> None: 48 | """ 49 | To make sure that duplicates are removed 50 | """ 51 | 52 | def zsh_multiple_tests(): 53 | yield Path(history_file) 54 | yield Path(overlap_file) 55 | 56 | items = list(_parse_and_merge(inputs=zsh_multiple_tests)) 57 | assert len(items) == 11 58 | -------------------------------------------------------------------------------- /tests/testdata/bash/history: -------------------------------------------------------------------------------- 1 | #1616723205 2 | ls 3 | #1616723205 4 | git status 5 | #1616723206 6 | echo "$( 7 | date 8 | uname 9 | )" 10 | #1616723207 11 | ls 12 | -------------------------------------------------------------------------------- /tests/testdata/ipython.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purarue/HPI/d17f7355e88f97ce3750d903106c6dad0063c6ab/tests/testdata/ipython.sqlite -------------------------------------------------------------------------------- /tests/testdata/zsh/overlap_history: -------------------------------------------------------------------------------- 1 | : 1594693239:2;{ for i in $(seq 10); do echo $i; sleep 1; done } 2 | : 1594693242:10;{ for i in $(seq 10); do echo $i; sleep 1; done } | tac 3 | : 1594693261:29;man tac 4 | : 1594693293:0;which parallel-moreutils 5 | -------------------------------------------------------------------------------- /tests/testdata/zsh/zsh_history: -------------------------------------------------------------------------------- 1 | : 1594693071:0;ls 2 | : 1594693154:2;while true; do genpasswd -n $(tput cols); sleep 0.1; done 3 | : 1594693172:3;while true; do\ 4 | for i in {1.."$(tput cols)"}; do genpasswd -n "$i"; done \ 5 | for i in $(seq 1 "$(tput cols)" | tac); do genpasswd -n "$i"; done\ 6 | done 7 | : 1594693184:2;while true; do\ 8 | for i in {1.."$(tput cols)"}; do genpasswd -n "$i"; done \ 9 | for i in $(seq 1 "$(tput cols)" | tac); do genpasswd -n "$i"; done\ 10 | done 11 | : 1594693220:0;{ for i in 1..10; do echo $i; done } 12 | : 1594693231:0;{ for i in $(seq 10); do echo $i; done } 13 | : 1594693239:2;{ for i in $(seq 10); do echo $i; sleep 1; done } 14 | : 1594693242:10;{ for i in $(seq 10); do echo $i; sleep 1; done } | tac 15 | : 1594693261:29;man tac 16 | : 1594693293:0;which parallel-moreutils 17 | : 1594693297:0;ls 18 | : 1594693297:0;ls 19 | --------------------------------------------------------------------------------