├── roam_to_git ├── __init__.py ├── fs.py ├── __main__.py ├── formatter.py └── scrapping.py ├── setup.cfg ├── .gitignore ├── requirements.txt ├── env.template ├── .github ├── workflows │ └── test.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── LICENSE.txt ├── setup.py ├── tests.py └── README.md /roam_to_git/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | notes 3 | downloads 4 | venv/ 5 | env/ 6 | **.pyc 7 | .env 8 | .mypy_cache/ 9 | MANIFEST 10 | dist/ 11 | *.ipynb 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # WARNING: don't forget to update setup.py 2 | gitpython>=3.1.* 3 | loguru==0.4.* 4 | pyppeteer>=0.0.25 5 | python-dotenv>=0.10.* 6 | psutil>=5.6.0 7 | -------------------------------------------------------------------------------- /env.template: -------------------------------------------------------------------------------- 1 | # Copy this file to ".env" and fill the values, or configure it on Github secrets if using Github actions 2 | ROAMRESEARCH_USER="YOUR_EMAIL" 3 | ROAMRESEARCH_PASSWORD="YOUR_PASSWORD" 4 | # find it here https://user-images.githubusercontent.com/656694/84388282-98136800-abf4-11ea-84c1-85ffc59b30b0.png 5 | ROAMRESEARCH_DATABASE="YOUR_DATABASE" 6 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "roam-to-git tests.py" 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - dev 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | name: Test 13 | timeout-minutes: 15 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.8 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: 3.8 20 | 21 | - name: Setup dependencies 22 | run: | 23 | pip install -r requirements.txt 24 | pip install mypy 25 | - name: Run backup 26 | run: ./tests.py 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: MatthieuBizien 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2018 YOUR NAME 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: MatthieuBizien 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Traceback** 24 | Please use http://gist.github.com/ or similar, and report the last line here. 25 | 26 | **Run `roam-to-git --debug notes/` and report what you get.** 27 | It should open a Chrome front-end and do the scraping. The repository content will not be modified. If applicable, add screenshots to help explain your problem. 28 | 29 | **Please complete the following information:** 30 | - OS: [e.g. macOS, Linux] 31 | - Do you use Github Actions? 32 | - Do you use multiple Roam databases? 33 | - Did roam-to-git use to work for you? When precisely did it stop working? 34 | - Are some backup runs still working? 35 | 36 | **Additional context** 37 | Add any other context about the problem here.
38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | from pkg_resources import parse_requirements 4 | 5 | setup( 6 | name='roam_to_git', 7 | packages=['roam_to_git'], 8 | version='0.1', 9 | license='MIT', 10 | description='Automatic RoamResearch backup to Git', 11 | author='Matthieu Bizien', # Type in your name 12 | author_email='oao2005@gmail.com', # Type in your E-Mail 13 | url='https://github.com/MatthieuBizien/roam-to-git', 14 | download_url='https://github.com/MatthieuBizien/roam-to-git/archive/v0.1.tar.gz', 15 | keywords=['ROAMRESEARCH', 'GIT', 'BACKUP'], 16 | install_requires=[str(requirement) for requirement in 17 | parse_requirements(open("requirements.txt"))], 18 | classifiers=[ 19 | 'Development Status :: 3 - Alpha', 20 | 'Intended Audience :: Developers', 21 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Wiki', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 3', 24 | 'Programming Language :: Python :: 3.6', 25 | 'Programming Language :: Python :: 3.7', 26 | 'Programming Language :: Python :: 3.8', 27 | ], 28 | entry_points={ 29 | 'console_scripts': ['roam-to-git=roam_to_git.__main__:main'], 30 | } 31 | ) 32 | -------------------------------------------------------------------------------- /roam_to_git/fs.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import zipfile 4 | from pathlib import Path 5 | from typing import List, Dict 6 | 7 | import git 8 | from loguru import logger 9 | 10 | 11 | def get_zip_path(zip_dir_path: Path) -> Path: 12 | """Return the path to the single zip file in a directory, and fail if there is not one single 13 | zip file""" 14 | zip_files = list(zip_dir_path.iterdir()) 15 | zip_files = [f for f in zip_files if f.name.endswith(".zip")] 16 | assert len(zip_files) == 1, (zip_files, zip_dir_path) 17 | zip_path, = zip_files 18 | return zip_path 19 | 20 | 21 | def reset_git_directory(git_path: Path, skip=(".git",)): 22 | """Remove all files in a git directory""" 23 | to_remove: List[Path] = [] 24 | for file in git_path.glob("**/*"): 25 | if any(skip_item in file.parts for skip_item in skip): 26 | continue 27 | to_remove.append(file) 28 | # Now we remove starting from the end to remove childs before parents 29 | to_remove = sorted(set(to_remove))[::-1] 30 | for file in to_remove: 31 | if file.is_file(): 32 | file.unlink() 33 | elif file.is_dir(): 34 | if list(file.iterdir()): 35 | logger.debug("Impossible to remove directory {}", file) 36 | else: 37 | file.rmdir() 38 | 39 | 40 | def unzip_markdown_archive(zip_dir_path: Path): 41 | zip_path = get_zip_path(zip_dir_path) 42 | with zipfile.ZipFile(zip_path) as zip_file: 43 | contents = {file.filename: zip_file.read(file.filename).decode() 44 | for file in zip_file.infolist() 45 | if not file.is_dir()} 46 | return contents 47 | 48 | 49 | def save_markdowns(directory: Path, contents: Dict[str, str]): 50 | logger.debug("Saving markdown to {}", directory) 51 | # Format and write the markdown files 52 | for file_name, content in contents.items(): 53 | dest = (directory / file_name) 54 | dest.parent.mkdir(parents=True, exist_ok=True) # Needed if a new directory is used 55 | # We have to specify encoding because crontab on Mac don't use UTF-8 56 | # https://stackoverflow.com/questions/11735363/python3-unicodeencodeerror-crontab 57 | 
with dest.open("w", encoding="utf-8") as f: 58 | f.write(content) 59 | 60 | 61 | def unzip_and_save_json_archive(zip_dir_path: Path, directory: Path): 62 | logger.debug("Saving json to {}", directory) 63 | directory.mkdir(exist_ok=True) 64 | zip_path = get_zip_path(zip_dir_path) 65 | with zipfile.ZipFile(zip_path) as zip_file: 66 | files = list(zip_file.namelist()) 67 | for file in files: 68 | assert file.endswith(".json") 69 | content = json.loads(zip_file.read(file).decode()) 70 | with open(directory / file, "w") as f: 71 | json.dump(content, f, sort_keys=True, indent=2, ensure_ascii=True) 72 | 73 | 74 | def commit_git_directory(repo: git.Repo): 75 | """Add an automatic commit in a git directory if it has changed, and push it""" 76 | if not repo.is_dirty() and not repo.untracked_files: 77 | # No change, nothing to do 78 | return 79 | logger.debug("Committing git repository {}", repo.git_dir) 80 | repo.git.add(A=True) # https://github.com/gitpython-developers/GitPython/issues/292 81 | repo.index.commit(f"Automatic commit {datetime.datetime.now().isoformat()}") 82 | 83 | 84 | def push_git_repository(repo: git.Repo): 85 | logger.debug("Pushing to origin") 86 | origin = repo.remote(name='origin') 87 | origin.push() 88 | -------------------------------------------------------------------------------- /roam_to_git/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | import tempfile 6 | import time 7 | from pathlib import Path 8 | 9 | import git 10 | from dotenv import load_dotenv 11 | from loguru import logger 12 | 13 | from roam_to_git.formatter import read_markdown_directory, format_markdown 14 | from roam_to_git.fs import reset_git_directory, unzip_markdown_archive, \ 15 | unzip_and_save_json_archive, commit_git_directory, push_git_repository, save_markdowns 16 | from roam_to_git.scrapping import patch_pyppeteer, scrap, Config 17 | 18 | 19 | @logger.catch(reraise=True) 20 | def main(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("directory", default=None, nargs="?", 23 | help="Directory where your notes are stored. Defaults to notes/") 24 | parser.add_argument("--debug", action="store_true", 25 | help="Help debug by opening the browser in the foreground. Note that the " 26 | "git repository will not be updated with that option.") 27 | parser.add_argument("--database", default=None, 28 | help="If you have multiple Roam databases, select the one you want to save. " 29 | "Can also be configured with the env variable ROAMRESEARCH_DATABASE.") 30 | parser.add_argument("--skip-git", action="store_true", 31 | help="Consider the repository as just a directory, and don't do any " 32 | "git-related action.") 33 | parser.add_argument("--skip-push", action="store_true", 34 | help="Don't git push after commit.") 35 | parser.add_argument("--skip-fetch", action="store_true", 36 | help="Do not download the data from Roam, just update the formatting.") 37 | parser.add_argument("--sleep-duration", type=float, default=2., 38 | help="Duration to wait for the interface. We wait 100x that duration for " 39 | "Roam to load. 
Increase it if Roam servers are slow, but be careful" 40 | "with the free tier of Github Actions.") 41 | args = parser.parse_args() 42 | 43 | patch_pyppeteer() 44 | if args.directory is None: 45 | git_path = Path("notes").absolute() 46 | else: 47 | git_path = Path(args.directory).absolute() 48 | 49 | if (git_path / ".env").exists(): 50 | logger.info("Loading secrets from {}", git_path / ".env") 51 | load_dotenv(git_path / ".env", override=True) 52 | else: 53 | logger.debug("No secret found at {}", git_path / ".env") 54 | if "ROAMRESEARCH_USER" not in os.environ or "ROAMRESEARCH_PASSWORD" not in os.environ: 55 | logger.error("Please define ROAMRESEARCH_USER and ROAMRESEARCH_PASSWORD, " 56 | "in the .env file of your notes repository, or in environment variables") 57 | sys.exit(1) 58 | config = Config(args.database, debug=args.debug, sleep_duration=float(args.sleep_duration)) 59 | 60 | if args.skip_git: 61 | repo = None 62 | else: 63 | repo = git.Repo(git_path) 64 | assert not repo.bare # Fail fast if it's not a repo 65 | 66 | reset_git_directory(git_path / "formatted") 67 | if not args.skip_fetch: 68 | reset_git_directory(git_path / "json") 69 | reset_git_directory(git_path / "markdown") 70 | 71 | with tempfile.TemporaryDirectory() as markdown_zip_path, \ 72 | tempfile.TemporaryDirectory() as json_zip_path: 73 | markdown_zip_path = Path(markdown_zip_path) 74 | json_zip_path = Path(json_zip_path) 75 | 76 | scrap(markdown_zip_path, json_zip_path, config) 77 | if config.debug: 78 | logger.debug("waiting for the download...") 79 | time.sleep(20) 80 | return 81 | raws = unzip_markdown_archive(markdown_zip_path) 82 | save_markdowns(git_path / "markdown", raws) 83 | unzip_and_save_json_archive(json_zip_path, git_path / "json") 84 | 85 | formatted = format_markdown(read_markdown_directory(git_path / "markdown")) 86 | save_markdowns(git_path / "formatted", formatted) 87 | 88 | if repo is not None: 89 | commit_git_directory(repo) 90 | if not args.skip_push: 91 | push_git_repository(repo) 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import unittest 3 | from pathlib import Path 4 | from typing import List 5 | 6 | import mypy.api 7 | 8 | from roam_to_git.formatter import extract_links, format_link, format_to_do 9 | 10 | 11 | class TestFormatTodo(unittest.TestCase): 12 | def test_empty(self): 13 | self.assertEqual(format_to_do(""), "") 14 | 15 | def test_no_link(self): 16 | self.assertEqual(format_to_do("string"), "string") 17 | 18 | def test_to_do(self): 19 | self.assertEqual(format_to_do("a\n- {{[[TODO]]}}string"), "a\n- [ ] string") 20 | 21 | def test_done(self): 22 | self.assertEqual(format_to_do("a\n- {{[[DONE]]}}string"), "a\n- [x] string") 23 | 24 | def test_something_else(self): 25 | self.assertEqual(format_to_do("a\n- {{[[ZZZ]]}}string"), "a\n- {{[[ZZZ]]}}string") 26 | 27 | 28 | class TestFormatLinks(unittest.TestCase): 29 | """Test that we correctly format the links""" 30 | 31 | def test_empty(self): 32 | self.assertEqual(format_link(""), "") 33 | 34 | def test_no_link(self): 35 | self.assertEqual(format_link("string"), "string") 36 | 37 | def test_one_link(self): 38 | self.assertEqual(format_link("string [[link]]."), "string [link]().") 39 | 40 | def test_one_link_prefix(self): 41 | self.assertEqual(format_link("string [[link]].", link_prefix="../../"), 42 | "string 
[link](<../../link.md>).") 43 | 44 | def test_two_links(self): 45 | self.assertEqual(format_link("[[link]] [[other]]"), 46 | "[link]() [other]()") 47 | 48 | def test_one_hashtag(self): 49 | self.assertEqual(format_link("string #link."), "string [link]().") 50 | 51 | def test_two_hashtag(self): 52 | self.assertEqual(format_link("#link #other"), 53 | "[link]() [other]()") 54 | 55 | def test_attribute(self): 56 | self.assertEqual(format_link(" - string:: link"), " - **[string]():** link") 57 | 58 | def test_attribute_then_attribute_like(self): 59 | self.assertEqual(format_link("- attrib:: string:: val"), 60 | "- **[attrib]():** string:: val") 61 | 62 | def test_attribute_with_colon(self): 63 | self.assertEqual(format_link("- attrib:is:: string"), 64 | "- **[attrib:is]():** string") 65 | 66 | def test_attribute_new_line(self): 67 | self.assertEqual(format_link(" - attrib:: string\n " 68 | "- attrib:: string"), 69 | " - **[attrib]():** string\n " 70 | " - **[attrib]():** string") 71 | 72 | def _extract_links(string) -> List[str]: 73 | return [m.group(1) for m in extract_links(string)] 74 | 75 | 76 | class TestExtractLinks(unittest.TestCase): 77 | """Test that we correctly extract the links, for backreference""" 78 | def test_empty(self): 79 | self.assertEqual(_extract_links(""), []) 80 | 81 | def test_no_link(self): 82 | self.assertEqual(_extract_links("string"), []) 83 | 84 | def test_one_link(self): 85 | self.assertEqual(_extract_links("string [[link]]."), ["link"]) 86 | 87 | def test_two_links(self): 88 | self.assertEqual(_extract_links("[[link]] [[other]]"), ["link", "other"]) 89 | 90 | def test_one_hashtag(self): 91 | self.assertEqual(_extract_links("string [[link]]."), ["link"]) 92 | 93 | def test_two_hashtag(self): 94 | self.assertEqual(_extract_links("[[link]] [[other]]"), ["link", "other"]) 95 | 96 | def test_no_attribute(self): 97 | self.assertEqual(_extract_links(" - string: link"), []) 98 | 99 | def test_attribute(self): 100 | self.assertEqual(_extract_links(" - attrib:: link"), ["attrib"]) 101 | 102 | def test_attribute_then_attribute_like(self): 103 | self.assertEqual(_extract_links("- attrib:: link:: val"), ["attrib"]) 104 | 105 | def test_attribute_with_colon(self): 106 | self.assertEqual(_extract_links("- attrib:is:: link"), ["attrib:is"]) 107 | 108 | def test_attribute_new_line(self): 109 | self.assertEqual(_extract_links(" - attrib:: link\n " 110 | "- attrib2:: link"), 111 | ["attrib", "attrib2"]) 112 | 113 | 114 | class TestMypy(unittest.TestCase): 115 | def _test_mypy(self, files: List[str]): 116 | stdout, stderr, exit_status = mypy.api.run(["--ignore-missing-imports", *files]) 117 | self.assertEqual(exit_status, 0) 118 | 119 | def test_mypy_rtg(self): 120 | self._test_mypy(["roam_to_git"]) 121 | 122 | def test_mypy_rtg_and_tests(self): 123 | self._test_mypy(["roam_to_git", "tests.py"]) 124 | 125 | def test_mypy_all(self): 126 | self._test_mypy([str(f) for f in Path(__file__).parent.iterdir() 127 | if f.is_file() and f.name.endswith(".py")]) 128 | 129 | 130 | if __name__ == "__main__": 131 | unittest.main() 132 | -------------------------------------------------------------------------------- /roam_to_git/formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import defaultdict 4 | from itertools import takewhile 5 | from pathlib import Path 6 | from typing import Dict, List, Match, Tuple 7 | 8 | 9 | def read_markdown_directory(raw_directory: Path) -> Dict[str, str]: 10 | contents = {} 
11 | for file in raw_directory.iterdir(): 12 | if file.is_dir(): 13 | # We recursively add the content of sub-directories. 14 | # They exist when there is a / in the note name. 15 | for child_name, content in read_markdown_directory(file).items(): 16 | contents[f"{file.name}/{child_name}"] = content 17 | if not file.is_file(): 18 | continue 19 | with file.open(encoding="utf-8") as f: 20 | content = f.read() 21 | parts = file.parts[len(raw_directory.parts):] 22 | file_name = os.path.join(*parts) 23 | contents[file_name] = content 24 | return contents 25 | 26 | 27 | def get_back_links(contents: Dict[str, str]) -> Dict[str, List[Tuple[str, Match]]]: 28 | # Extract backlinks from the markdown 29 | forward_links = {file_name: extract_links(content) for file_name, content in contents.items()} 30 | back_links: Dict[str, List[Tuple[str, Match]]] = defaultdict(list) 31 | for file_name, links in forward_links.items(): 32 | for link in links: 33 | back_links[f"{link.group(1)}.md"].append((file_name, link)) 34 | return back_links 35 | 36 | 37 | def format_markdown(contents: Dict[str, str]) -> Dict[str, str]: 38 | back_links = get_back_links(contents) 39 | # Format the markdown files 40 | out = {} 41 | for file_name, content in contents.items(): 42 | # We add the backlinks first, because they use the positions of the characters 43 | # of the regex matches 44 | content = add_back_links(content, back_links[file_name]) 45 | 46 | # Format content. Backlinks content will be formatted automatically. 47 | content = format_to_do(content) 48 | link_prefix = "../" * file_name.count("/") 49 | content = format_link(content, link_prefix=link_prefix) 50 | if len(content) > 0: 51 | out[file_name] = content 52 | 53 | return out 54 | 55 | 56 | def format_to_do(contents: str): 57 | contents = re.sub(r"{{\[\[TODO\]\]}} *", r"[ ] ", contents) 58 | contents = re.sub(r"{{\[\[DONE\]\]}} *", r"[x] ", contents) 59 | return contents 60 | 61 | 62 | def extract_links(string: str) -> List[Match]: 63 | out = list(re.finditer(r"\[\[" 64 | r"([^\]\n]+)" 65 | r"\]\]", string)) 66 | # Match attributes 67 | out.extend(re.finditer(r"(?:^|\n) *- " 68 | r"((?:[^:\n]|:[^:\n])+)" # Match everything except :: 69 | r"::", string)) 70 | return out 71 | 72 | 73 | def add_back_links(content: str, back_links: List[Tuple[str, Match]]) -> str: 74 | if not back_links: 75 | return content 76 | files = sorted(set((file_name[:-3], match) for file_name, match in back_links), 77 | key=lambda e: (e[0], e[1].start())) 78 | new_lines = [] 79 | file_before = None 80 | for file, match in files: 81 | if file != file_before: 82 | new_lines.append(f"## [{file}](<{file}.md>)") 83 | file_before = file 84 | 85 | start_context_ = list(takewhile(lambda c: c != "\n", match.string[:match.start()][::-1])) 86 | start_context = "".join(start_context_[::-1]) 87 | 88 | middle_context = match.string[match.start():match.end()] 89 | 90 | end_context_ = takewhile(lambda c: c != "\n", match.string[match.end():]) 91 | end_context = "".join(end_context_) 92 | 93 | context = (start_context + middle_context + end_context).strip() 94 | new_lines.extend([context, ""]) 95 | backlinks_str = "\n".join(new_lines) 96 | return f"{content}\n# Backlinks\n{backlinks_str}\n" 97 | 98 | 99 | def format_link(string: str, link_prefix="") -> str: 100 | """Transform a RoamResearch-like link to a Markdown link. 101 | 102 | @param link_prefix: Add the given prefix before all links. 103 | WARNING: not robust to special characters.
104 | """ 105 | # Regex are read-only and can't parse [[[[recursive]] [[links]]]], but they do the job. 106 | # We use a special syntax for links that can have SPACES in them 107 | # Format internal reference: [[mynote]] 108 | string = re.sub(r"\[\[" # We start with [[ 109 | # TODO: manage a single ] in the tag 110 | r"([^\]\n]+)" # Everything except ] 111 | r"\]\]", 112 | rf"[\1](<{link_prefix}\1.md>)", 113 | string, flags=re.MULTILINE) 114 | 115 | # Format hashtags: #mytag 116 | string = re.sub(r"#([a-zA-Z-_0-9]+)", 117 | rf"[\1](<{link_prefix}\1.md>)", 118 | string, flags=re.MULTILINE) 119 | 120 | # Format attributes 121 | string = re.sub(r"(^ *- )" # Match the beginning, like ' - ' 122 | r"(([^:\n]|:[^:\n])+)" # Match everything except :: 123 | r"::", 124 | rf"\1**[\2](<{link_prefix}\2.md>):**", # Format Markdown link 125 | string, flags=re.MULTILINE) 126 | return string 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automatic RoamResearch backup 2 | 3 | [![Roam Research backup](https://github.com/MatthieuBizien/roam-to-git-demo/workflows/Roam%20Research%20backup/badge.svg)](https://github.com/MatthieuBizien/roam-to-git-demo/actions) 4 | [![roam-to-git tests.py](https://github.com/MatthieuBizien/roam-to-git/workflows/roam-to-git%20tests.py/badge.svg)](https://github.com/MatthieuBizien/roam-to-git/actions) 5 | 6 | This script helps you backup your [RoamResearch](https://roamresearch.com/) graphs! 7 | 8 | This script automatically 9 | - Downloads a markdown archive of your RoamResearch workspace 10 | - Downloads a json archive of your RoamResearch workspace 11 | - Unzips them to your git directory 12 | - Commits and pushes the difference to Github 13 | 14 | # Demo 15 | [See it in action!](https://github.com/MatthieuBizien/roam-to-git-demo). This repo is updated using roam-to-git. 16 | 17 | # Why to use it 18 | 19 | - You have a backup if RoamResearch loses some of your data. 20 | - You have a history of your notes. 
21 | - You can browse your Github repository easily with a mobile device 22 | 23 | 24 | # Use it with Github Actions (recommended) 25 | 26 | ## Create a (private) Github repository for all your notes 27 | 28 | With [gh](https://github.com/cli/cli): `gh repo create notes` (yes, it's private) 29 | 30 | Or [manually](https://help.github.com/en/github/getting-started-with-github/create-a-repo) 31 | 32 | ## Configure Github secrets 33 | 34 | - Go to github.com/your/repository/settings/secrets 35 | 36 | ### 37 | 38 | Add 3 (separate) secrets whose names are 39 | 40 | `ROAMRESEARCH_USER` 41 | 42 | `ROAMRESEARCH_PASSWORD` 43 | 44 | `ROAMRESEARCH_DATABASE` 45 | 46 | - Refer to [env.template](env.template) for more information 47 | 48 | - When inserting the values, there is no need for quotation marks or assignments 49 | 50 | ![image](https://user-images.githubusercontent.com/173090/90904133-2cf1c900-e3cf-11ea-960d-71d0543b8158.png) 51 | 52 | 53 | ## Add GitHub action 54 | 55 | ``` 56 | cd notes 57 | mkdir -p .github/workflows/ 58 | curl https://raw.githubusercontent.com/MatthieuBizien/roam-to-git-demo/master/.github/workflows/main.yml > \ 59 | .github/workflows/main.yml 60 | git add .github/workflows/main.yml 61 | git commit -m "Add github/workflows/main.yml" 62 | git push --set-upstream origin master 63 | ``` 64 | 65 | ## Check that the Github Action works 66 | 67 | - Go to github.com/your/repository/actions 68 | - Your CI job should start in a few seconds 69 | 70 | ### Note: 71 | 72 | If the backup does not start automatically, try pushing to the repository again 73 | 74 | 75 | # Use it locally 76 | 77 | **Note**: if your file system is not case-sensitive, you will not back up notes that have the same name in different 78 | cases 79 | 80 | ## Install Roam-To-Git 81 | With [pipx](https://github.com/pipxproject/pipx) 82 | (if you don't know pipx, you should look at it, it's wonderful!) 83 | 84 | `pipx install git+https://github.com/MatthieuBizien/roam-to-git.git` 85 | 86 | ## Create a (private) Github repository for all your notes 87 | 88 | With [gh](https://github.com/cli/cli): `gh repo create notes` (yes, it's private) 89 | 90 | Or [manually](https://help.github.com/en/github/getting-started-with-github/create-a-repo) 91 | 92 | Then run `git push --set-upstream origin master` 93 | 94 | ## Configure environment variables 95 | 96 | - `curl https://raw.githubusercontent.com/MatthieuBizien/roam-to-git/master/env.template > notes/.env` 97 | - Fill the .env file: `vi notes/.env` 98 | - Ignore it: `echo .env > notes/.gitignore; cd notes; git add .gitignore; git commit -m "Initial commit"` 99 | 100 | ## Manual backup 101 | 102 | - Run the script: `roam-to-git notes/` 103 | - Check your Github repository, it should be filled with your notes :) 104 | 105 | ## Automatic backup 106 | 107 | One-liner to run it with a [cron](https://en.wikipedia.org/wiki/Cron) every hour: 108 | `echo "0 * * * * '$(which roam-to-git)' '$(pwd)/notes'" | crontab -` 109 | 110 | NB: there are [issues](https://github.com/MatthieuBizien/roam-to-git/issues/43) on Mac with a crontab. 111 | 112 | # Debug 113 | 114 | Making `roam-to-git` foolproof is hard, as it depends on Roam, on Github Actions or the local environment, 115 | on software that is not very stable (`pyppeteer`, we still love you 😉), 116 | and on the correct user configuration.
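A quick way to confirm that the credentials are actually visible to the script is to load the `.env` file the same way `roam-to-git` does. The snippet below is a minimal sketch, assuming your notes live in `notes/` and `python-dotenv` is installed; it only reports whether each required variable is set:

```
import os
from dotenv import load_dotenv

# Load the same file roam-to-git reads, then check each required variable.
load_dotenv("notes/.env")
for key in ("ROAMRESEARCH_USER", "ROAMRESEARCH_PASSWORD", "ROAMRESEARCH_DATABASE"):
    print(key, "is set" if os.environ.get(key) else "is MISSING")
```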
117 | 118 | For debugging, please try the following: 119 | 120 | - Check that the environment variables `ROAMRESEARCH_USER`, `ROAMRESEARCH_PASSWORD`, `ROAMRESEARCH_DATABASE` are correctly set up 121 | - Log in to Roam using the username and the password. 122 | You may want to request a new password if you have enabled Google Login, as this has solved problems for some users. 123 | - Run `roam-to-git --debug` to check that the authentication and download work 124 | - Look at the traceback 125 | - Look for similar issues 126 | - If nothing else works, create a new issue with as many details as possible. 127 | I will try my best to understand and help you, no SLA promised 😇 128 | 129 | # Task list 130 | 131 | ## Backup all RoamResearch data 132 | 133 | - [x] Download automatically from RoamResearch 134 | - [x] Create Cron 135 | - [x] Write detailed README 136 | - [x] Publish the repository on Github 137 | - [ ] Download images (they are currently visible in Github, but not in the archive, so they are not saved in the repository 😕) 138 | 139 | ## Format the backup to have a good UI 140 | 141 | ### Link formatting to be compatible with Github markdown 142 | - [x] Format `[[links]]` 143 | - [x] Format `#links` 144 | - [x] Format `attribute::` 145 | - [ ] Format `[[ [[link 1]] [[link 2]] ]]` 146 | - [ ] Format `((link))` 147 | 148 | ### Backlink formatting 149 | - [x] Add backlink references to the note files 150 | - [x] Integrate the context into the backlink 151 | - [x] Manage `/` in file names 152 | 153 | ### Other formatting 154 | - [x] Format `{{TODO}}` to be compatible with Github markdown 155 | - [ ] Format `{{query}}` 156 | 157 | ## Make it for others 158 | - [x] Push it to Github 159 | - [x] Add example repository 160 | - [x] Make the backup directory configurable 161 | - [ ] Publicize it 162 | - [x] [RoamResearch Slack](https://roamresearch.slack.com/) [thread](https://roamresearch.slack.com/archives/CN5MK4D2M/p1588670473431200) 163 | - [ ] [RoamResearch Reddit](https://www.reddit.com/r/RoamResearch/) 164 | - [ ] Twitter 165 | 166 | ## Some ideas I don't need, but PR welcome 😀 167 | - [ ] Test it/make it work on Windows 168 | - [x] Pre-configure a CI server so it can run every hour without a computer 169 | Thanks @Stvad for [#4](https://github.com/MatthieuBizien/roam-to-git/issues/4)!
170 | -------------------------------------------------------------------------------- /roam_to_git/scrapping.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import atexit 3 | import os 4 | import sys 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | import psutil 9 | import pyppeteer.connection 10 | from loguru import logger 11 | from pyppeteer.page import Page 12 | 13 | 14 | def patch_pyppeteer(): 15 | """Fix https://github.com/miyakogi/pyppeteer/issues/178""" 16 | import pyppeteer.connection 17 | original_method = pyppeteer.connection.websockets.client.connect 18 | 19 | def new_method(*args, **kwargs): 20 | kwargs['ping_interval'] = None 21 | kwargs['ping_timeout'] = None 22 | return original_method(*args, **kwargs) 23 | 24 | pyppeteer.connection.websockets.client.connect = new_method 25 | 26 | 27 | async def get_text(page, b, norm=True): 28 | """Get the inner text of an element""" 29 | text = await page.evaluate('(element) => element.textContent', b) 30 | if norm: 31 | text = text.lower().strip() 32 | return text 33 | 34 | 35 | class Config: 36 | def __init__(self, database: Optional[str], debug: bool, sleep_duration: float = 2.): 37 | self.user = os.environ["ROAMRESEARCH_USER"] 38 | self.password = os.environ["ROAMRESEARCH_PASSWORD"] 39 | assert self.user 40 | assert self.password 41 | if database: 42 | self.database: Optional[str] = database 43 | else: 44 | self.database = os.environ["ROAMRESEARCH_DATABASE"] 45 | assert self.database, "Please define the Roam database you want to backup." 46 | self.debug = debug 47 | self.sleep_duration = sleep_duration 48 | 49 | 50 | async def download_rr_archive(output_type: str, 51 | output_directory: Path, 52 | config: Config, 53 | slow_motion=10, 54 | ): 55 | logger.debug("Creating browser") 56 | browser = await pyppeteer.launch(devtools=config.debug, 57 | slowMo=slow_motion, 58 | autoClose=False, 59 | ) 60 | if config.debug: 61 | # We want the browser to stay open for debugging the interface 62 | pages = await browser.pages() 63 | document = pages[0] 64 | return await _download_rr_archive(document, output_type, output_directory, config) 65 | 66 | try: 67 | pages = await browser.pages() 68 | document = pages[0] 69 | return await _download_rr_archive(document, output_type, output_directory, config) 70 | except (KeyboardInterrupt, SystemExit): 71 | logger.debug("Closing browser on interrupt {}", output_type) 72 | await browser.close() 73 | logger.debug("Closed browser {}", output_type) 74 | raise 75 | finally: 76 | logger.debug("Closing browser {}", output_type) 77 | await browser.close() 78 | logger.debug("Closed browser {}", output_type) 79 | 80 | 81 | async def _download_rr_archive(document: Page, 82 | output_type: str, 83 | output_directory: Path, 84 | config: Config, 85 | ): 86 | """Download an archive in RoamResearch. 
87 | 88 | :param output_type: Download JSON or Markdown 89 | :param output_directory: Directory where the outputs are stored 90 | """ 91 | if not config.debug: 92 | logger.debug("Configure downloads to {}", output_directory) 93 | cdp = await document.target.createCDPSession() 94 | await cdp.send('Page.setDownloadBehavior', 95 | {'behavior': 'allow', 'downloadPath': str(output_directory)}) 96 | 97 | await signin(document, config, sleep_duration=config.sleep_duration) 98 | 99 | if config.database: 100 | await go_to_database(document, config.database) 101 | 102 | logger.debug("Wait for interface to load") 103 | dot_button = None 104 | for _ in range(100): 105 | # Starting is a little bit slow, so we wait for the button that signals it's ok 106 | await asyncio.sleep(config.sleep_duration) 107 | dot_button = await document.querySelector(".bp3-icon-more") 108 | if dot_button is not None: 109 | break 110 | 111 | # If we have multiple databases, we will be stuck. Let's detect that. 112 | await asyncio.sleep(config.sleep_duration) 113 | strong = await document.querySelector("strong") 114 | if strong: 115 | if "database's you are an admin of" == await get_text(document, strong): 116 | logger.error( 117 | "You seem to have multiple databases. Please select one with the option " 118 | "--database") 119 | sys.exit(1) 120 | 121 | assert dot_button is not None, "All roads lead to Roam, but that one is too long. Try " \ 122 | "again when Roam servers are faster." 123 | 124 | # Click on something empty to dismiss the potential popup 125 | # "Sync Quick Capture Notes with Workspace" 126 | await document.mouse.click(0, 0) 127 | 128 | await dot_button.click() 129 | 130 | logger.debug("Launch download popup") 131 | divs_pb3 = await document.querySelectorAll(".bp3-fill") 132 | export_all, = [b for b in divs_pb3 if await get_text(document, b) == 'export all'] 133 | await export_all.click() 134 | await asyncio.sleep(config.sleep_duration) 135 | 136 | async def get_dropdown_button(): 137 | dropdown_button = await document.querySelector(".bp3-dialog .bp3-button-text") 138 | assert dropdown_button is not None 139 | dropdown_button_text = await get_text(document, dropdown_button) 140 | # Defensive check in case the interface changes 141 | assert dropdown_button_text in ["markdown", "json"], dropdown_button_text 142 | return dropdown_button, dropdown_button_text 143 | 144 | logger.debug("Checking download type") 145 | button, button_text = await get_dropdown_button() 146 | 147 | if button_text != output_type: 148 | logger.debug("Changing output type to {}", output_type) 149 | await button.click() 150 | await asyncio.sleep(config.sleep_duration) 151 | output_type_elems = await document.querySelectorAll(".bp3-text-overflow-ellipsis") 152 | output_type_elem, = [e for e in output_type_elems if await get_text(document, e) == output_type] 153 | await output_type_elem.click() 154 | 155 | # Defensive check 156 | await asyncio.sleep(config.sleep_duration) 157 | _, button_text_ = await get_dropdown_button() 158 | assert button_text_ == output_type, (button_text_, output_type) 159 | 160 | logger.debug("Downloading output of type {}", output_type) 161 | buttons = await document.querySelectorAll('button') 162 | export_all_confirm, = [b for b in buttons if await get_text(document, b) == 'export all'] 163 | await export_all_confirm.click() 164 | 165 | logger.debug("Waiting for download of {} to {}", output_type, output_directory) 166 | if config.debug: 167 | # No way to check because the download location is not specified 168 | return 169 | for i in 
range(1, 60 * 10): 170 | await asyncio.sleep(1) 171 | if i % 60 == 0: 172 | logger.debug("Keep waiting for {}, {}s elapsed", output_type, i) 173 | for file in output_directory.iterdir(): 174 | if file.name.endswith(".zip"): 175 | logger.debug("File {} found for {}", file, output_type) 176 | await asyncio.sleep(1) 177 | return 178 | logger.debug("Waited too long for the download of {}", output_type) 179 | raise FileNotFoundError(f"Impossible to download {output_type} in {output_directory}") 180 | 181 | 182 | async def signin(document, config: Config, sleep_duration=1.): 183 | """Sign in to Roam""" 184 | logger.debug("Opening signin page") 185 | await document.goto('https://roamresearch.com/#/signin') 186 | await asyncio.sleep(sleep_duration) 187 | 188 | logger.debug("Fill email '{}'", config.user) 189 | email_elem = await document.querySelector("input[name='email']") 190 | await email_elem.click() 191 | await email_elem.type(config.user) 192 | 193 | logger.debug("Fill password") 194 | passwd_elem = await document.querySelector("input[name='password']") 195 | await passwd_elem.click() 196 | await passwd_elem.type(config.password) 197 | 198 | logger.debug("Click on sign-in") 199 | buttons = await document.querySelectorAll('button') 200 | signin_confirm, = [b for b in buttons if await get_text(document, b) == 'sign in'] 201 | await signin_confirm.click() 202 | await asyncio.sleep(sleep_duration) 203 | 204 | 205 | async def go_to_database(document, database): 206 | """Go to the database page""" 207 | url = f'https://roamresearch.com/#/app/{database}' 208 | logger.debug(f"Load database from url '{url}'") 209 | await document.goto(url) 210 | 211 | 212 | def _kill_child_process(timeout=50): 213 | procs = psutil.Process().children(recursive=True) 214 | if not procs: 215 | return 216 | logger.debug("Terminate child processes {}", procs) 217 | for p in procs: 218 | try: 219 | p.terminate() 220 | except psutil.NoSuchProcess: 221 | pass 222 | gone, still_alive = psutil.wait_procs(procs, timeout=timeout) 223 | if still_alive: 224 | logger.warning(f"Kill child processes {still_alive} that were still alive after " 225 | f"'timeout={timeout}' from the 'terminate()' command") 226 | for p in still_alive: 227 | try: 228 | p.kill() 229 | except psutil.NoSuchProcess: 230 | pass 231 | 232 | 233 | def scrap(markdown_zip_path: Path, json_zip_path: Path, config: Config): 234 | # Just to make running from the CLI easier 235 | markdown_zip_path = Path(markdown_zip_path) 236 | json_zip_path = Path(json_zip_path) 237 | 238 | tasks = [download_rr_archive("markdown", Path(markdown_zip_path), config=config), 239 | download_rr_archive("json", Path(json_zip_path), config=config), 240 | ] 241 | # Register a handler to always kill child processes when the script closes, so we don't 242 | # leave zombie processes. Because of https://github.com/miyakogi/pyppeteer/issues/274, 243 | # without this patch it happens a lot. 244 | if not config.debug: 245 | atexit.register(_kill_child_process) 246 | if config.debug: 247 | for task in tasks: 248 | # Run sequentially for easier debugging 249 | asyncio.get_event_loop().run_until_complete(task) 250 | logger.warning("Exiting without updating the git repository, " 251 | "because we can't get the downloads with the option --debug") 252 | else: 253 | asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) 254 | logger.debug("Scraping finished") 255 | --------------------------------------------------------------------------------