├── .github
└── workflows
│ ├── codeql-analysis.yml
│ └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
├── src
└── notion_df
│ ├── __init__.py
│ ├── _pandas.py
│ ├── agent.py
│ ├── base.py
│ ├── blocks.py
│ ├── configs.py
│ ├── constants.py
│ ├── utils.py
│ └── values.py
└── tests
├── test_agent.py
└── test_base.py
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master ]
20 | schedule:
21 | - cron: '29 20 * * 6'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v2
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v1
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v1
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v1
71 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | release-pypi:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - name: Set up Python
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: '3.x'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
24 | run: |
25 | python setup.py sdist bdist_wheel
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | .gitattributes
3 | .last_checked
4 | .gitconfig
5 | *.bak
6 | *.log
7 | *~
8 | ~*
9 | _tmp*
10 | tmp*
11 | tags
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | env/
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 |
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 |
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 |
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | .hypothesis/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # dotenv
95 | .env
96 |
97 | # virtualenv
98 | .venv
99 | venv/
100 | ENV/
101 |
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 |
106 | # Rope project settings
107 | .ropeproject
108 |
109 | # mkdocs documentation
110 | /site
111 |
112 | # mypy
113 | .mypy_cache/
114 |
115 | .vscode
116 | *.swp
117 |
118 | # osx generated files
119 | .DS_Store
120 | .DS_Store?
121 | .Trashes
122 | ehthumbs.db
123 | Thumbs.db
124 | .idea
125 |
126 | # pytest
127 | .pytest_cache
128 |
129 | # tools/trust-doc-nbs
130 | docs_src/.last_checked
131 |
132 | # symlinks to fastai
133 | docs_src/fastai
134 | tools/fastai
135 |
136 | # link checker
137 | checklink/cookies.txt
138 |
139 | # .gitconfig is now autogenerated
140 | .gitconfig
141 |
142 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Shannon Shen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `notion-df`: Seamlessly Connecting Notion Database with Pandas DataFrame
2 |
3 | *Please Note: This project is currently in pre-alpha stage. The code is not appropriately documented and tested. Please report any issues you find. Thanks!*
4 |
5 | ## Installation
6 |
7 | ```bash
8 | pip install notion-df
9 | ```
10 |
11 | ## Usage
12 |
13 | - Before starting, please follow the instructions to [create a new integration](https://www.notion.so/my-integrations) and [add it to your Notion page or database](https://developers.notion.com/docs/getting-started#step-2-share-a-database-with-your-integration).
14 | - We'll refer `Internal Integration Token` as the `api_key` below.
15 |
16 | - Pandas-flavored APIs: Just need to add two additional lines of code:
17 | ```python
18 | import notion_df
19 | notion_df.pandas() #That's it!
20 |
21 | page_url = "paste your page url from Notion"
22 | api_key = "paste your api key (internal integration key)"
23 |
24 | import pandas as pd
25 | df = pd.read_notion(page_url, api_key=api_key)
26 | df.to_notion(page_url, api_key=api_key)
27 | ```
28 |
29 | - Download your Notion table as a pandas DataFrame
30 | ```python
31 | import notion_df
32 | df = notion_df.download(notion_database_url, api_key=api_key)
33 | # Equivalent to: df = pd.read_notion(notion_database_url, api_key=api_key)
34 | df.head()
35 | ```
36 |
37 | Only download the first `nrows` rows from a database
38 |
39 | ```python
40 | df = notion_df.download(notion_database_url, nrows=nrows) #e.g., 10
41 | ```
42 |
43 |
44 |
45 |
46 | What if your table has a relation column?
47 |
48 | ```python
49 | df = notion_df.download(notion_database_url,
50 | resolve_relation_values=True)
51 | ```
52 | The `resolve_relation_values=True` will automatically resolve the linking for all the relation columns whose target can be accessed by the current notion integration.
53 |
54 | In detail, let's say the `"test"` column in df is a relation column in Notion.
55 | 1. When `resolve_relation_values=False`, the return results for that column will be a list of UUIDs of the target page: `['65e04f11-xxxx', 'b0ffcb4b-xxxx', ]`.
56 | 2. When `resolve_relation_values=True`, the return results for that column will be a list of regular strings corresponding to the name column of the target pages: `['page1', 'page2', ]`.
57 |
58 |
59 |
60 | - Append a local `df` to a Notion database:
61 |
62 | ```python
63 | import notion_df
64 | notion_df.upload(df, notion_database_url, title="page-title", api_key=api_key)
65 | # Equivalent to: df.to_notion(notion_database_url, title="page-title", api_key=api_key)
66 | ```
67 |
68 | - Upload a local `df` to a newly created database in a Notion page:
69 |
70 | ```python
71 | import notion_df
72 | notion_df.upload(df, notion_page_url, title="page-title", api_key=api_key)
73 | # Equivalent to: df.to_notion(notion_page_url, title="page-title", api_key=api_key)
74 | ```
75 |
76 | - Tired of typing `api_key=api_key` each time?
77 |
78 | ```python
79 | import notion_df
80 | notion_df.config(api_key=api_key) # Or set an environment variable `NOTION_API_KEY`
81 | df = notion_df.download(notion_database_url)
82 | notion_df.upload(df, notion_page_url, title="page-title")
83 | # Similarly in pandas APIs: df.to_notion(notion_page_url, title="page-title")
84 | ```
85 |
86 | ## Development
87 |
88 | 1. Clone the repo and install the dependencies:
89 | ```bash
90 | git clone git@github.com:lolipopshock/notion-df.git
91 | cd notion-df
92 | pip install -e .[dev]
93 | ```
94 | 2. How to run tests?
95 | ```bash
96 | NOTION_API_KEY="" pytest tests/
97 | ```
98 | The tests are dependent on a list of Notion databases, specified by the following environment variables:
99 |
100 | | Environment Variable | Description |
101 | | --------------------------- | --------------------------------------- |
102 | | `NOTION_API_KEY` | The API key for your Notion integration |
103 | | `NOTION_ROLLUP_DF` | - |
104 | | `NOTION_FILES_DF` | - |
105 | | `NOTION_FORMULA_DF` | - |
106 | | `NOTION_RELATION_DF` | - |
107 | | `NOTION_RELATION_TARGET_DF` | - |
108 | | `NOTION_LONG_STRING_DF` | - |
109 | | `NOTION_RICH_TEXT_DF` | - |
110 |
111 |
112 | ## TODOs
113 |
114 | - [ ] Add tests for
115 | - [ ] `load`
116 | - [ ] `upload`
117 | - [ ] `values.py`
118 | - [ ] `configs.py`
119 | - [ ] `base.py`
120 | - [ ] Better class organizations/namings for `*Configs` and `*Values`
121 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | notion-client>=0.8.0
2 | pydantic~=1.9.0
3 | pandas
4 | dataclasses
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import os
3 |
4 |
5 | def get_requirements(req_file):
6 | reqs = []
7 | with open(req_file, "r") as fp:
8 | for line in fp.readlines():
9 | if line.startswith("#") or line.strip() == "":
10 | continue
11 | else:
12 | reqs.append(line.strip())
13 | return reqs
14 |
15 |
16 | # A trick from https://github.com/jina-ai/jina/blob/79b302c93b01689e82cf4b52f46522eb7497c404/setup.py#L20
17 | libinfo_py = os.path.join("src", "notion_df", "__init__.py")
18 | libinfo_content = open(libinfo_py, "r", encoding="utf8").readlines()
19 | version_line = [l.strip() for l in libinfo_content if l.startswith("__version__")][0]
20 | exec(version_line) # gives __version__
21 |
22 | setup(
23 | name="notion-df",
24 | version=__version__,
25 | description="Notion-DF: Seamlessly Connecting Notion Database with Pandas DataFrame",
26 | author="Zejiang Shen",
27 | author_email="zejiangshen@gmail.com",
28 | license="MIT",
29 | url="https://github.com/lolipopshock/notion-df",
30 | package_dir={"": "src"},
31 | packages=find_packages("src"),
32 | long_description=open("README.md", "r", encoding="utf-8").read(),
33 | long_description_content_type="text/markdown",
34 | python_requires=">=3.6",
35 | install_requires=get_requirements("requirements.txt"),
36 | extras_require={
37 | "dev": [
38 | "black==21.12b0",
39 | "pytest",
40 | ],
41 | }
42 | )
--------------------------------------------------------------------------------
/src/notion_df/__init__.py:
--------------------------------------------------------------------------------
1 | from notion_df.agent import download, upload, config
2 | from notion_df._pandas import pandas
3 |
4 | __version__ = "0.0.5"
5 |
--------------------------------------------------------------------------------
/src/notion_df/_pandas.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from notion_df import upload, download
3 |
4 |
5 | def read_notion(
6 | notion_url: str,
7 | nrows: Optional[int] = None,
8 | resolve_relation_values: bool = False,
9 | errors: str = "strict",
10 | api_key: str = None,
11 | ) -> "pd.DataFrame":
12 | """Download a Notion database as a pandas DataFrame.
13 |
14 | Args:
15 | notion_url (str):
16 | The URL of the Notion database to download from.
17 | nrows (int, optional):
18 | Number of rows of file to read. Useful for reading
19 | pieces of large files.
20 | resolve_relation_values (bool, optional):
21 | By default, when downloading relation columns, notion-df
22 | will just download the object ids. If set `resolve_relation_values`
23 | to `True`, notion-df will try to pull the values of the title
24 | column from the target table and map the object ids to those values.
25 | Defaults to False.
26 | errors (str, optional):
27 | You can specify how to handle errors during downloading. There
28 | are several options:
29 | 1. "strict": raise an error when there is one.
30 | 2. "ignore": ignore errors.
31 | 3. "warn": print the error message.
32 | Defaults to "strict".
33 | api_key (str, optional):
34 | The API key of the Notion integration.
35 | Defaults to None.
36 | Returns:
37 | pd.DataFrame: the loaded dataframe.
38 | """
39 | return download(
40 | notion_url,
41 | nrows=nrows,
42 | resolve_relation_values=resolve_relation_values,
43 | errors=errors,
44 | api_key=api_key,
45 | )
46 |
47 |
48 | def to_notion(
49 | self,
50 | notion_url: str,
51 | schema=None,
52 | mode: str = "a",
53 | title: str = "",
54 | title_col: str = "",
55 | errors: str = "strict",
56 | resolve_relation_values: bool = False,
57 | create_new_rows_in_relation_target: bool = False,
58 | return_response: bool = False,
59 | api_key: str = None,
60 | ):
61 |
62 | """Upload a dataframe to the specified Notion database.
63 |
64 | Args:
65 | df (pd.DataFrame):
66 | The dataframe to upload.
67 | notion_url (str):
68 | The URL of the Notion page to upload to.
69 | If it is a notion page, then it will create a new database
70 | under that page and upload the dataframe to it.
71 | schema (DatabaseSchema, optional):
72 | The schema of the Notion database.
73 | When not set, it will be inferred from (1) the target
74 | notion database (if it is one) then (2) the dataframe itself.
75 | mode (str, optional):
76 | (the function is not supported yet.)
77 | Whether to append to the database or overwrite.
78 | Defaults to "a".
79 | title (str, optional):
80 | The title of the Notion database.
81 | Defaults to "".
82 | title_col (str, optional):
83 | Every Notion database requires a "title" column.
84 | When the schema is not set, by default it infers the first
85 | column of uploaded dataframe as the title column. You can
86 | set this value to specify the title column.
87 | Defaults to "".
88 | errors (str, optional):
89 | Since we upload the dataframe to Notion row by row, you
90 | can specify how to handle errors during uploading. There
91 | are several options:
92 | 1. "strict": raise an error when there is one.
93 | 2. "ignore": ignore errors and continue uploading
94 | subsequent rows.
95 | 3. "warn": print the error message and continue uploading
96 | Defaults to "strict".
97 | resolve_relation_values (bool, optional):
98 | If `True`, notion-df assumes the items in any relation columns
99 | are not notion object ids, but the value of the corresponding
100 | "title column" in the target table. It will try to convert the
101 | relation column to notion object ids by looking up the value.
102 | Defaults to False.
103 | create_new_rows_in_relation_target (bool, optional):
104 | This argument is used in conjunction with `resolve_relation_values`.
105 | If True, then notion-df will try to create new rows in the
106 | target relation table if the relation column value is not found there.
107 | Defaults to False.
108 | return_response (bool, optional):
109 | If True, then the function will return a list of responses for
110 | the updates from Notion.
111 | api_key (str, optional):
112 | The API key of the Notion integration.
113 | Defaults to None.
114 | """
115 |
116 | return upload(
117 | df=self,
118 | notion_url=notion_url,
119 | schema=schema,
120 | mode=mode,
121 | title=title,
122 | title_col=title_col,
123 | errors=errors,
124 | resolve_relation_values=resolve_relation_values,
125 | create_new_rows_in_relation_target=create_new_rows_in_relation_target,
126 | return_response=return_response,
127 | api_key=api_key,
128 | )
129 |
130 |
131 | def pandas():
132 | import pandas as pd
133 |
134 | pd.read_notion = read_notion
135 | pd.DataFrame.to_notion = to_notion
136 |
--------------------------------------------------------------------------------
/src/notion_df/agent.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional, Union, Tuple
2 | from datetime import datetime
3 | import warnings
4 | import os
5 | from functools import wraps
6 |
7 | import pandas as pd
8 | from httpx import HTTPStatusError
9 | from notion_client import Client
10 | from notion_client.helpers import get_id
11 |
12 | from notion_df.values import PageProperties, PageProperty
13 | from notion_df.configs import DatabaseSchema, NON_EDITABLE_TYPES
14 | from notion_df.utils import is_uuid, flatten_dict
15 | from notion_df.blocks import parse_blocks, BaseNotionBlock
16 |
17 | API_KEY = None
18 | NOT_REVERSE_DATAFRAME = -1
19 | # whether to reverse the dataframe when performing uploading.
20 | # for some reason, notion will reverse the order of dataframe
21 | # when uploading.
22 | # -1 for reversing, 1 for not reversing
23 | NOTION_DEFAULT_PAGE_SIZE = 100
24 | NOTION_MAX_PAGE_SIZE = 100
25 |
26 |
27 | def config(api_key: str):
28 | global API_KEY
29 | API_KEY = api_key
30 |
31 |
32 | def _load_api_key(api_key: str) -> str:
33 | if api_key is not None:
34 | return api_key
35 | elif API_KEY is not None:
36 | return API_KEY
37 | elif os.environ.get("NOTION_API_KEY") is not None:
38 | return os.environ.get("NOTION_API_KEY")
39 | else:
40 | raise ValueError("No API key provided")
41 |
42 |
43 | def _is_notion_database(notion_url):
44 | return "?v=" in notion_url.split("/")[-1]
45 |
46 |
47 | def use_client(func):
48 | @wraps(func)
49 | def wrapper(*args, **kwargs):
50 | orig_client = client = kwargs.pop("client", None)
51 |
52 | if client is None:
53 | api_key = _load_api_key(kwargs.pop("api_key", None))
54 | client = Client(auth=api_key)
55 | out = func(client=client, *args, **kwargs)
56 |
57 | if orig_client is None:
58 | # Automatically close the client if it was not passed in
59 | client.close()
60 | return out
61 |
62 | return wrapper
63 |
64 |
65 | def query_database(
66 | database_id: str,
67 | client: Client,
68 | start_cursor: Optional[str] = None,
69 | page_size: int = NOTION_DEFAULT_PAGE_SIZE,
70 | ):
71 | query_dict = {"database_id": database_id, "page_size": page_size}
72 | if start_cursor is not None:
73 | query_dict["start_cursor"] = start_cursor
74 | # For now, Notion API doesn't allow start_cursor='null'
75 |
76 | query_results = client.databases.query(**query_dict)
77 |
78 | assert query_results["object"] == "list"
79 | return query_results
80 |
81 |
82 | def load_df_from_queries(
83 | database_query_results: List[Dict],
84 | ):
85 | properties = PageProperties.from_raw(database_query_results)
86 | df = properties.to_frame()
87 |
88 | with warnings.catch_warnings():
89 | warnings.simplefilter("ignore")
90 | # TODO: figure out a better solution
91 | # When doing the following, Pandas may think you are trying
92 | # to add a new column to the dataframe; it will show the warnings,
93 | # but it will not actually add the column. So we use catch_warnings
94 | # to hide the warnings.
95 | # However this might not be the best way to do so. Some alternatives
96 | # include setting df.attrs https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.attrs.html
97 | # Or even use something like multi-level index for saving notion_ids.
98 | # Nevertheless, all of them seems not that perfect -- for example,
99 | # after copying or slicing, the values will disappear.
100 | # Should try to figure out a better solution in the future.
101 | df.notion_urls = pd.Series([ele["url"] for ele in database_query_results])
102 | df.notion_ids = pd.Series([ele["id"] for ele in database_query_results])
103 | df.notion_query_results = database_query_results
104 | # TODO: Rethink if this should be private
105 |
106 | return df
107 |
108 |
109 | def download_df_from_database(
110 | notion_url: str,
111 | client: Client,
112 | nrows: Optional[int] = None,
113 | errors: str = "strict",
114 | ) -> pd.DataFrame:
115 | """Download a Notion database as a pandas DataFrame.
116 |
117 | Args:
118 | notion_url (str):
119 | The URL of the Notion database to download from.
120 | nrows (int, optional):
121 | Number of rows of file to read. Useful for reading
122 | pieces of large files.
123 | api_key (str, optional):
124 | The API key of the Notion integration.
125 | Defaults to None.
126 | client (Client, optional):
127 | The notion client.
128 | Defaults to None.
129 | Returns:
130 | pd.DataFrame: the loaded dataframe.
131 | """
132 | if not is_uuid(notion_url):
133 | assert _is_notion_database(notion_url)
134 | database_id = get_id(notion_url)
135 | else:
136 | database_id = notion_url
137 |
138 | # Check the if the id is a database first
139 | try:
140 | retrieve_results = client.databases.retrieve(database_id=database_id)
141 | schema = DatabaseSchema.from_raw(retrieve_results["properties"])
142 | except HTTPStatusError:
143 | error_msg = (
144 | f"The object {database_id} might not be a notion database, "
145 | "or integration associated with the API key don't have access "
146 | "to it."
147 | )
148 | if errors == "strict":
149 | raise ValueError(error_msg)
150 | elif errors == "warn":
151 | warnings.warn(error_msg)
152 | return None
153 | elif errors == "ignore":
154 | return None
155 |
156 | downloaded_rows = []
157 |
158 | page_size = NOTION_MAX_PAGE_SIZE
159 | if nrows is not None:
160 | if nrows <= NOTION_MAX_PAGE_SIZE:
161 | page_size = nrows
162 |
163 | query_results = query_database(database_id, client, page_size=page_size)
164 | downloaded_rows.extend(query_results["results"])
165 |
166 | while query_results["has_more"]:
167 | if nrows is not None:
168 | if len(downloaded_rows) >= nrows:
169 | break
170 | else:
171 | page_size = nrows - len(downloaded_rows)
172 | else:
173 | page_size = NOTION_MAX_PAGE_SIZE
174 |
175 | query_results = query_database(
176 | database_id,
177 | client,
178 | start_cursor=query_results["next_cursor"],
179 | page_size=page_size,
180 | )
181 | downloaded_rows.extend(query_results["results"])
182 |
183 | df = load_df_from_queries(downloaded_rows)
184 | df = schema.create_df(df)
185 | return df
186 |
187 |
188 | @use_client
189 | def download(
190 | notion_url: str,
191 | nrows: Optional[int] = None,
192 | resolve_relation_values: Optional[bool] = False,
193 | errors: str = "strict",
194 | *,
195 | api_key: str = None,
196 | client: Client = None,
197 | ):
198 | df = download_df_from_database(
199 | notion_url=notion_url,
200 | nrows=nrows,
201 | client=client,
202 | errors=errors,
203 | )
204 | if resolve_relation_values:
205 | for col in df.columns:
206 | if df.schema[col].type == "relation":
207 | relation_df = download_df_from_database(
208 | df.schema[col].relation.database_id,
209 | errors="warn",
210 | client=client,
211 | )
212 | if relation_df is not None:
213 | rel_title_col = relation_df.schema.title_column
214 | obj_id_to_string = {
215 | obj_id: obj_title
216 | for obj_id, obj_title in zip(
217 | relation_df.notion_ids, relation_df[rel_title_col]
218 | )
219 | }
220 | df[col] = df[col].apply(
221 | lambda row: [obj_id_to_string[ele] for ele in row]
222 | )
223 | return df
224 |
225 |
226 | def create_database(
227 | page_id: str, client: Client, schema: DatabaseSchema, title: str = ""
228 | ):
229 | response = client.databases.create(
230 | parent={"type": "page_id", "page_id": page_id},
231 | title=[{"type": "text", "text": {"content": title}}],
232 | properties=schema.query_dict(),
233 | )
234 | assert response["object"] == "database"
235 | return response
236 |
237 |
238 | def upload_row_to_database(row, database_id, schema, children, client) -> Dict:
239 |
240 | properties = PageProperty.from_series(row, schema).query_dict()
241 | if children:
242 | if not isinstance(children, list):
243 | children = [children]
244 | for cid in range(len(children)):
245 | if isinstance(children[cid], BaseNotionBlock):
246 | children[cid] = flatten_dict(children[cid].dict())
247 |
248 | response = client.pages.create(
249 | parent={"database_id": database_id}, properties=properties, children=children
250 | )
251 | else:
252 | response = client.pages.create(
253 | parent={"database_id": database_id}, properties=properties,
254 | )
255 | return response
256 |
257 |
258 | def upload_to_database(df, databse_id, schema, client, errors, children) -> List[Dict]:
259 | all_response = []
260 | if children is not None:
261 | assert len(children) == len(df)
262 | children = children[::NOT_REVERSE_DATAFRAME]
263 |
264 | for idx, (_, row) in enumerate(df[::NOT_REVERSE_DATAFRAME].iterrows(), ):
265 | try:
266 | child = children[idx] if children is not None else None
267 | response = upload_row_to_database(row, databse_id, schema, child, client)
268 | all_response.append(response)
269 | except Exception as e:
270 | if errors == "strict":
271 | raise e
272 | elif errors == "warn":
273 | warnings.warn(f"Encountered errors {e} while uploading row: {row}")
274 | elif errors == "ignore":
275 | continue
276 | return all_response[::NOT_REVERSE_DATAFRAME]
277 |
278 |
279 | def load_database_schema(database_id, client):
280 | return DatabaseSchema.from_raw(
281 | client.databases.retrieve(database_id=database_id)["properties"]
282 | )
283 |
284 |
285 | @use_client
286 | def upload(
287 | df: pd.DataFrame,
288 | notion_url: str,
289 | schema: DatabaseSchema = None,
290 | mode: str = "a",
291 | title: str = "",
292 | title_col: str = "",
293 | errors: str = "strict",
294 | resolve_relation_values: bool = False,
295 | create_new_rows_in_relation_target: bool = False,
296 | children: List[Union[Dict, BaseNotionBlock]] = None,
297 | return_response: bool = False,
298 | *,
299 | api_key: str = None,
300 | client: Client = None,
301 | ) -> Union[str, Tuple[str, List[Dict]]]:
302 | """Upload a dataframe to the specified Notion database.
303 |
304 | Args:
305 | df (pd.DataFrame):
306 | The dataframe to upload.
307 | notion_url (str):
308 | The URL of the Notion page to upload to.
309 | If it is a notion page, then it will create a new database
310 | under that page and upload the dataframe to it.
311 | schema (DatabaseSchema, optional):
312 | The schema of the Notion database.
313 | When not set, it will be inferred from (1) the target
314 | notion database (if it is one) then (2) the dataframe itself.
315 | mode (str, optional):
316 | (the function is not supported yet.)
317 | Whether to append to the database or overwrite.
318 | Defaults to "a".
319 | title (str, optional):
320 | The title of the Notion database.
321 | Defaults to "".
322 | title_col (str, optional):
323 | Every Notion database requires a "title" column.
324 | When the schema is not set, by default it infers the first
325 | column of uploaded dataframe as the title column. You can
326 | set this value to specify the title column.
327 | Defaults to "".
328 | errors (str, optional):
329 | Since we upload the dataframe to Notion row by row, you
330 | can specify how to handle errors during uploading. There
331 | are several options:
332 | 1. "strict": raise an error when there is one.
333 | 2. "ignore": ignore errors and continue uploading
334 | subsequent rows.
335 | 3. "warn": print the error message and continue uploading
336 | Defaults to "strict".
337 | children (List[Union[Dict, BaseNotionBlock]], optional):
338 | The corresponding children of the uploaded Notion page. It should be
339 | a list of the same length as the dataframe.
340 | resolve_relation_values (bool, optional):
341 | If `True`, notion-df assumes the items in any relation columns
342 | are not notion object ids, but the value of the corresponding
343 | "title column" in the target table. It will try to convert the
344 | relation column to notion object ids by looking up the value.
345 | Defaults to False.
346 | create_new_rows_in_relation_target (bool, optional):
347 | This argument is used in conjunction with `resolve_relation_values`.
348 | If True, then notion-df will try to create new rows in the
349 | target relation table if the relation column value is not found there.
350 | Defaults to False.
351 | return_response (bool, optional):
352 | If True, then the function will return a list of responses for
353 | the updates from Notion.
354 | api_key (str, optional):
355 | The API key of the Notion integration.
356 | Defaults to None.
357 | client (Client, optional):
358 | The notion client.
359 | Defaults to None.
360 | """
361 | if schema is None:
362 | if hasattr(df, "schema"):
363 | schema = df.schema
364 |
365 | if not _is_notion_database(notion_url):
366 | if schema is None:
367 | schema = DatabaseSchema.from_df(df, title_col=title_col)
368 | database_properties = create_database(get_id(notion_url), client, schema, title)
369 | databse_id = database_properties["id"]
370 | notion_url = database_properties["url"]
371 | else:
372 | databse_id = get_id(notion_url)
373 | if schema is None:
374 | schema = load_database_schema(databse_id, client)
375 |
376 | # At this stage, we should have the appropriate schema
377 | assert schema is not None
378 |
379 | if not schema.is_df_compatible(df):
380 | raise ValueError(
381 | "The dataframe is not compatible with the database schema."
382 | "The df contains columns that are not in the databse: "
383 | + f"{[col for col in df.columns if col not in schema.configs.keys()]}"
384 | )
385 |
386 | if mode not in ("a", "append"):
387 | raise NotImplementedError
388 | # TODO: clean the current values in the notion database (if any)
389 |
390 | df = schema.transform(df, remove_non_editables=True)
391 |
392 | # Assumes the notion database is created and has the appropriate schema
393 | if resolve_relation_values:
394 | for col in df.columns:
395 | if schema[col].type == "relation":
396 |
397 | if df[col].apply(lambda row: all([is_uuid(ele) for ele in row])).all():
398 | # The column is all in uuid, we don't need to resolve it
399 | continue
400 |
401 | # Try to download the target_relation_df
402 | relation_db_id = schema[col].relation.database_id
403 | relation_df = download_df_from_database(
404 | relation_db_id,
405 | errors="warn",
406 | client=client,
407 | )
408 |
409 | if relation_df is not None:
410 | rel_title_col = relation_df.schema.title_column
411 | obj_string_to_id = {
412 | obj_title: obj_id
413 | for obj_id, obj_title in zip(
414 | relation_df.notion_ids, relation_df[rel_title_col]
415 | )
416 | }
417 |
418 | all_unique_obj_strings_in_relation_df = set(
419 | relation_df[rel_title_col].tolist()
420 | )
421 | all_unique_obj_strings_in_df = set(sum(df[col].tolist(), []))
422 | # This assumes the column has been transformed to a list of lists;
423 | # which is a true assumption given the transformation for the relation
424 | # column (LIST_TRANSFORM).
425 | new_object_strings = all_unique_obj_strings_in_df.difference(
426 | all_unique_obj_strings_in_relation_df
427 | )
428 |
429 | if create_new_rows_in_relation_target and len(new_object_strings) > 0:
430 | new_relation_df = pd.DataFrame(
431 | list(new_object_strings), columns=[rel_title_col]
432 | )
433 | responses = upload_to_database(
434 | new_relation_df,
435 | relation_db_id,
436 | relation_df.schema,
437 | client,
438 | "warn",
439 | )
440 | appended_relation_df = load_df_from_queries(responses)
441 | obj_string_to_id.update(
442 | {
443 | obj_title: obj_id
444 | for obj_id, obj_title in zip(
445 | appended_relation_df.notion_ids,
446 | appended_relation_df[rel_title_col],
447 | )
448 | }
449 | )
450 |
451 | df[col] = df[col].apply(
452 | lambda row: [obj_string_to_id[ele] for ele in row if ele in obj_string_to_id]
453 | )
454 |
455 | response = upload_to_database(df, databse_id, schema, client, errors, children)
456 |
457 | print(f"Your dataframe has been uploaded to the Notion page: {notion_url} .")
458 | if return_response:
459 | return notion_url, response
460 | return notion_url
461 |
@use_client
def download_page_children(
    notion_url: str,
    api_key: str = None,
    client: Client = None,
):
    """Download all children blocks of a Notion page.

    Follows the Notion API cursor-based pagination (``has_more`` /
    ``next_cursor``) so pages with more than one page of top-level blocks
    are fully retrieved, then parses the results recursively.

    Args:
        notion_url (str):
            The url of the Notion page.
        api_key (str, optional):
            The API key of the Notion integration.
            Defaults to None.
        client (Client, optional):
            The notion client.
            Defaults to None.

    Returns:
        List[BaseNotionBlock]: the parsed children blocks of the page.
    """
    page_id = get_id(notion_url)

    results = []
    start_cursor = None
    while True:
        kwargs = {} if start_cursor is None else {"start_cursor": start_cursor}
        r = client.blocks.children.list(block_id=page_id, **kwargs)
        results.extend(r["results"])
        if not r.get("has_more"):
            break
        start_cursor = r.get("next_cursor")

    return parse_blocks(results, recursive=True, client=client)
--------------------------------------------------------------------------------
/src/notion_df/base.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional, Any
2 | from enum import Enum
3 | from pydantic import BaseModel, validator, root_validator
4 | import pandas as pd
5 |
6 | from notion_df.utils import is_time_string, is_uuid
7 | from notion_df.constants import RICH_TEXT_CONTENT_MAX_LENGTH
8 |
9 | ### All colors supported in NOTION
10 |
11 |
class NotionColorEnum(str, Enum):
    """Foreground colors accepted by Notion (e.g. for select options)."""
    Default = "default"
    Gray = "gray"
    Brown = "brown"
    Orange = "orange"
    Yellow = "yellow"
    Green = "green"
    Blue = "blue"
    Purple = "purple"
    Pink = "pink"
    Red = "red"


class NotionExtendedColorEnum(str, Enum):
    """All Notion colors, including the ``*_background`` variants."""
    Default = "default"
    Gray = "gray"
    Brown = "brown"
    Orange = "orange"
    Yellow = "yellow"
    Green = "green"
    Blue = "blue"
    Purple = "purple"
    Pink = "pink"
    Red = "red"
    GrayBackground = "gray_background"
    BrownBackground = "brown_background"
    OrangeBackground = "orange_background"
    YellowBackground = "yellow_background"
    GreenBackground = "green_background"
    BlueBackground = "blue_background"
    PurpleBackground = "purple_background"
    PinkBackground = "pink_background"
    RedBackground = "red_background"


class RichTextTypeEnum(str, Enum):
    """The three rich-text variants supported by the Notion API."""
    Text = "text"
    Mention = "mention"
    Equation = "equation"
51 |
52 |
class SelectOption(BaseModel):
    """A single option of a select / multi-select property."""

    id: Optional[str]
    name: str
    color: Optional[NotionColorEnum]

    @classmethod
    def from_value(cls, value: str):
        """Build an option named ``value`` with id and color unset."""
        return cls(name=value)

    @validator("name")
    def name_cannot_contain_comma(cls, v):
        # Option names containing a comma are rejected.
        if "," not in v:
            return v
        raise ValueError(f"Invalid option name {v} that contains comma")
67 |
68 |
class SelectOptions(BaseModel):
    """The option list of a select / multi-select property config."""

    options: Optional[List[SelectOption]]

    @classmethod
    def from_value(cls, values: List[str]):
        """Wrap each string in ``values`` into a SelectOption."""
        wrapped = [SelectOption.from_value(item) for item in values]
        return cls(options=wrapped)
75 |
76 |
class RelationObject(BaseModel):
    """A relation value: the id of the related Notion page."""

    id: str
    # TODO: Change this to UUID validation

    @classmethod
    def from_value(cls, value: str):
        """Build a relation pointing at the page with id ``value``."""
        return cls(id=value)

    @validator("id")
    def id_must_be_uuid(cls, v):
        # ids must be UUID strings.
        if is_uuid(v):
            return v
        raise ValueError(f"Invalid id {v}")
90 |
91 |
class UserObject(BaseModel):
    """A Notion user; ``object`` is fixed to the literal "user"."""

    object: str = "user"
    id: str
    type: Optional[str]
    name: Optional[str]
    avatar_url: Optional[str]

    @classmethod
    def from_value(cls, value: str):
        """Build a user reference from a user id."""
        return cls(id=value)

    @validator("object")
    def object_is_name(cls, v):
        # Only the literal "user" is a valid object discriminator here.
        if v == "user":
            return v
        raise ValueError(f"Invalid user object value {v}")

    @property
    def value(self):
        """The user's display name (may be None)."""
        return self.name
112 |
113 |
class NumberFormat(BaseModel):
    """Display format string of a number property."""
    format: str


class FormulaProperty(BaseModel):
    """Formula property configuration: the formula expression string."""
    expression: str


class RelationProperty(BaseModel):
    """Relation property configuration pointing at a target database."""
    database_id: str
    # TODO: Change this to UUID validation
    synced_property_name: Optional[str]
    synced_property_id: Optional[str]
127 |
128 |
class DateObject(BaseModel):
    """A Notion date value with optional end date and time zone.

    ``start``/``end`` are kept as strings; both are validated to parse as
    date strings.
    """

    start: Optional[str] = None
    end: Optional[str] = None
    time_zone: Optional[str] = None

    # One shared validator for both fields (the original duplicated the same
    # check for start and end); ``field.name`` keeps the messages identical.
    @validator("start", "end")
    def is_ISO8601(cls, v, field):
        # TODO: Currently it cannot support time ranges
        if v is not None:
            if not is_time_string(v):
                raise ValueError(
                    f"The data {field.name} is not appropriately formatted as an ISO 8601 date string."
                )
        return v

    @classmethod
    def from_value(cls, value: str):
        """Build a date object with only ``start`` set."""
        return cls(start=value)
        # TODO: Now we assume the value has already been formatted as strings
        # But we should parse them into appropriate formats.

    @property
    def value(self):
        """``start`` parsed into a pandas Timestamp."""
        return pd.to_datetime(self.start)
        # TODO: what should the data structure be if self.end is not None?
163 |
164 |
class RollupProperty(BaseModel):
    """Configuration of a rollup property (source relation + aggregation)."""
    relation_property_name: Optional[str]
    relation_property_id: Optional[str]
    rollup_property_name: Optional[str]
    rollup_property_id: Optional[str]
    function: str
    # TODO: Change this to ENUM - https://developers.notion.com/reference/create-a-database#rollup-configuration
172 |
173 |
class RollupObject(BaseModel):
    """A rollup value; the field matching ``type`` carries the payload."""

    type: str
    # TODO: Change this to ENUM - https://developers.notion.com/reference/property-value-object#rollup-property-values
    number: Optional[float]
    date: Optional[DateObject]
    array: Optional[List[Any]]
    # Based on the description in https://developers.notion.com/reference/property-value-object#rollup-property-value-element
    # Each element is exactly like property value object, but without the "id" key.
    # As there's a preprocess step in RollupValues, each item of the array must
    # be a property value object.
    function: Optional[str]
    # Though the function param doesn't appear in the documentation, it exists
    # in the return values of the API. Set it as optional for future compatibility.
    # TODO: check in the future if the function param should be updated.

    @validator("type")
    def ensure_non_empty_data(cls, v):
        # Reject missing or unsupported rollup types.
        data_type = v
        if data_type is None:
            raise ValueError("RollupObject must have a type.")
        if data_type not in ["number", "date", "array"]:
            raise ValueError(f"RollupObject type {data_type} is invalid.")
        return v

    @property
    def value(self):
        """The payload for the declared ``type`` (None when absent)."""
        if self.type == "number":
            return self.number
        if self.type == "date":
            if self.date is not None:
                return self.date.value
        if self.type == "array":
            # Guard against a missing payload instead of raising TypeError
            # when iterating None.
            if self.array is None:
                return None
            return [ele.value for ele in self.array]
207 |
208 |
class FileTargetObject(BaseModel):
    """A file location: its URL and an optional expiry time."""
    url: str
    expiry_time: Optional[str]

    @property
    def value(self):
        # Expose the URL as this object's scalar value.
        return self.url
216 |
217 |
class FileObject(BaseModel):
    """A file attachment, either Notion-hosted ("file") or external."""

    name: Optional[str]  # TODO: Figure out why this is not required...
    type: str
    file: Optional[FileTargetObject]
    external: Optional[FileTargetObject]

    @property
    def value(self):
        """URL of the underlying file target, or None when unset."""
        target = self.file if self.type == "file" else self.external
        if target is not None:
            return target.value
232 |
233 |
class FormulaObject(BaseModel):
    """A computed formula result; the field matching ``type`` is populated."""

    type: str
    string: Optional[str]
    number: Optional[float]
    boolean: Optional[bool]
    date: Optional[DateObject]

    @property
    def value(self):
        """The result value for the declared ``type`` (None otherwise)."""
        kind = self.type
        if kind == "string":
            return self.string
        if kind == "number":
            return self.number
        if kind == "boolean":
            return self.boolean
        if kind == "date":
            return self.date.value if self.date is not None else None
252 |
253 |
class AnnotationObject(BaseModel):
    """Styling flags applied to a rich-text span."""
    bold: bool
    italic: bool
    strikethrough: bool
    underline: bool
    code: bool
    color: NotionExtendedColorEnum


class TextLinkObject(BaseModel):
    """A hyperlink attached to a text object."""
    type: Optional[str] = "url"
    url: str


class TextObject(BaseModel):
    """Plain text content with an optional link."""
    content: str
    link: Optional[TextLinkObject]


class PageReferenceObject(BaseModel):
    """Reference to a page or database by its id."""
    id: str


class LinkPreviewMentionObject(BaseModel):
    """Mention payload for a link preview."""
    url: str


class MentionObject(BaseModel):
    """A rich-text mention; the field matching ``type`` is populated."""
    type: str
    user: Optional[UserObject]
    page: Optional[PageReferenceObject]
    database: Optional[PageReferenceObject]
    date: Optional[DateObject]
    link_preview: Optional[LinkPreviewMentionObject]


class EquationObject(BaseModel):
    """An inline equation expression."""
    expression: str
292 |
293 |
class BaseRichTextObject(BaseModel):
    """Fields common to every rich-text fragment."""
    plain_text: Optional[str]
    # TODO: The Optional[plain_text] is used when creating property values
    href: Optional[str] = None
    annotations: Optional[AnnotationObject] = None
    type: Optional[RichTextTypeEnum]

    @property
    def value(self):
        # The human-readable text of this fragment.
        return self.plain_text
304 |
305 |
class RichTextObject(BaseRichTextObject):
    """A rich-text fragment: text, mention, or equation variant."""

    text: Optional[TextObject]
    mention: Optional[MentionObject]
    equation: Optional[EquationObject]

    @classmethod
    def from_value(cls, value: str):
        """Wrap a plain string into a single text-type rich-text object."""
        return cls(text=TextObject(content=value))

    @classmethod
    def encode_string(cls, value: str) -> List["RichTextObject"]:
        """Split ``value`` into API-sized chunks, one object per chunk."""
        limit = RICH_TEXT_CONTENT_MAX_LENGTH
        pieces = [value[start : start + limit] for start in range(0, len(value), limit)]
        return [cls(text=TextObject(content=piece)) for piece in pieces]
322 |
323 |
class EmojiObject(BaseModel):
    """An emoji icon value."""
    type: str = "emoji"
    emoji: str
327 |
--------------------------------------------------------------------------------
/src/notion_df/blocks.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import List, Union, Dict, Any, Tuple, Optional, Union
3 |
4 | from notion_client import Client
5 | from pydantic import BaseModel, parse_obj_as, validator, root_validator
6 |
7 | from notion_df.base import (
8 | RichTextObject,
9 | SelectOption,
10 | DateObject,
11 | RelationObject,
12 | UserObject,
13 | RollupObject,
14 | FileObject,
15 | EmojiObject,
16 | FormulaObject,
17 | NotionExtendedColorEnum,
18 | )
19 |
20 |
class ParentObject(BaseModel):
    """The parent of a block; the id field matching ``type`` is populated."""
    type: str
    database_id: Optional[str]
    page_id: Optional[str]
    workspace: Optional[bool]
    block_id: Optional[str]
27 |
28 |
# BaseClasses
class BaseAttributes(BaseModel):
    """Marker base class for block-type attribute payloads."""
    pass


class BaseAttributeWithChildren(BaseModel):
    """Attribute payload for block types that may hold children blocks."""
    children: Optional[List["BaseNotionBlock"]]
36 |
37 |
class TextBlockAttributes(BaseAttributeWithChildren):
    """Payload shared by paragraph/quote/list-item/toggle blocks."""
    rich_text: List[RichTextObject]
    color: Optional[NotionExtendedColorEnum]


class HeadingBlockAttributes(BaseAttributeWithChildren):
    """Payload of heading_1/2/3 blocks."""
    rich_text: List[RichTextObject]
    color: Optional[NotionExtendedColorEnum]
    is_toggleable: bool
    # Whether or not the heading block is a toggle heading or not. If true, the heading block has toggle and can support children. If false, the heading block is a normal heading block.


class CalloutBlockAttributes(BaseAttributeWithChildren):
    """Payload of callout blocks (text plus an optional icon)."""
    rich_text: List[RichTextObject]
    icon: Optional[Union[FileObject, EmojiObject]]
    color: Optional[NotionExtendedColorEnum]


class ToDoBlockAttributes(BaseAttributeWithChildren):
    """Payload of to_do blocks (text plus a checked flag)."""
    rich_text: List[RichTextObject]
    color: Optional[NotionExtendedColorEnum]
    checked: Optional[bool]


class CodeBlockAttributes(BaseAttributes):
    """Payload of code blocks."""
    rich_text: List[RichTextObject]
    caption: Optional[List[RichTextObject]]
    language: Optional[str]  # TODO: it's actually an enum


class ChildPageAttributes(BaseAttributes):
    """Payload of child_page / child_database blocks: the title only."""
    title: List[RichTextObject]


class EmbedBlockAttributes(BaseAttributes):
    """Payload of embed blocks: the embedded URL."""
    url: str


class ImageBlockAttributes(BaseAttributes, FileObject):
    """Payload of image blocks: a file object plus an optional caption."""
    caption: Optional[List[RichTextObject]]
    # This is not listed in the docs, but it is in the API response (Nov 2022)


class VideoBlockAttributes(BaseAttributes):
    """Payload of video blocks."""
    video: FileObject


class FileBlockAttributes(BaseAttributes):
    """Payload of file blocks."""
    file: FileObject
    caption: Optional[List[RichTextObject]]


class PdfBlockAttributes(BaseAttributes):
    """Payload of pdf blocks."""
    pdf: FileObject


class BookmarkBlockAttributes(BaseAttributes):
    """Payload of bookmark blocks."""
    url: str
    caption: Optional[List[RichTextObject]]


class EquationBlockAttributes(BaseAttributes):
    """Payload of equation blocks."""
    expression: str


class TableOfContentsAttributes(BaseAttributes):
    """Payload of table_of_contents blocks."""
    color: Optional[NotionExtendedColorEnum]


class LinkPreviewAttributes(BaseAttributes):
    """Payload of link_preview blocks."""
    url: str


class LinkToPageAttributes(BaseAttributes):
    """Payload of link_to_page blocks; the id matching ``type`` is set."""
    type: str
    page_id: Optional[str]
    database_id: Optional[str]
115 |
116 |
# Maps attribute class name -> class, collected from both attribute bases.
ATTRIBUTES_MAPPING = {
    _cls.__name__: _cls
    for _cls in BaseAttributes.__subclasses__()
    + BaseAttributeWithChildren.__subclasses__()
}
122 |
123 |
class BaseNotionBlock(BaseModel):
    """Fields shared by every Notion block object.

    Each subclass sets ``type`` to its block-type string and declares a
    field of the same name holding the type-specific attributes.
    """

    object: str = "block"
    parent: Optional[ParentObject]
    id: Optional[str]
    # The original declared `type` twice (Optional[str], then str at the
    # bottom); the later declaration won, so a single required field is kept.
    type: str
    created_time: Optional[str]
    # created_by
    last_edited_time: Optional[str]
    # last_edited_by
    has_children: Optional[bool]
    archived: Optional[bool]

    @property
    def children(self):
        # The children live on the type-specific attribute object.
        return self.__getattribute__(self.type).children

    def set_children(self, value: Any):
        """Set the children list on the type-specific attribute object."""
        self.__getattribute__(self.type).children = value
143 |
144 |
# One subclass per Notion block type: each fixes the `type` discriminator
# and adds a same-named field holding that block type's attribute payload.
class ParagraphBlock(BaseNotionBlock):
    type: str = "paragraph"
    paragraph: TextBlockAttributes


class HeadingOneBlock(BaseNotionBlock):
    type: str = "heading_1"
    heading_1: HeadingBlockAttributes


class HeadingTwoBlock(BaseNotionBlock):
    type: str = "heading_2"
    heading_2: HeadingBlockAttributes


class HeadingThreeBlock(BaseNotionBlock):
    type: str = "heading_3"
    heading_3: HeadingBlockAttributes


class CalloutBlock(BaseNotionBlock):
    type: str = "callout"
    callout: CalloutBlockAttributes


class QuoteBlock(BaseNotionBlock):
    type: str = "quote"
    quote: TextBlockAttributes


class BulletedListItemBlock(BaseNotionBlock):
    type: str = "bulleted_list_item"
    bulleted_list_item: TextBlockAttributes


class NumberedListItemBlock(BaseNotionBlock):
    type: str = "numbered_list_item"
    numbered_list_item: TextBlockAttributes


class ToDoBlock(BaseNotionBlock):
    type: str = "to_do"
    to_do: ToDoBlockAttributes


class ToggleBlock(BaseNotionBlock):
    type: str = "toggle"
    toggle: TextBlockAttributes


class CodeBlock(BaseNotionBlock):
    type: str = "code"
    code: CodeBlockAttributes


class ChildPageBlock(BaseNotionBlock):
    type: str = "child_page"
    child_page: ChildPageAttributes


class ChildDatabaseBlock(BaseNotionBlock):
    type: str = "child_database"
    child_database: ChildPageAttributes


class EmbedBlock(BaseNotionBlock):
    type: str = "embed"
    embed: EmbedBlockAttributes


class ImageBlock(BaseNotionBlock):
    type: str = "image"
    image: ImageBlockAttributes


class VideoBlock(BaseNotionBlock):
    type: str = "video"
    video: VideoBlockAttributes


class FileBlock(BaseNotionBlock):
    type: str = "file"
    file: FileBlockAttributes


class PdfBlock(BaseNotionBlock):
    type: str = "pdf"
    pdf: PdfBlockAttributes


class BookmarkBlock(BaseNotionBlock):
    type: str = "bookmark"
    bookmark: BookmarkBlockAttributes


class EquationBlock(BaseNotionBlock):
    type: str = "equation"
    equation: EquationBlockAttributes


class DividerBlock(BaseNotionBlock):
    type: str = "divider"
    # The divider payload is an empty dict in the API response.
    divider: Optional[Dict]


class TableOfContentsBlock(BaseNotionBlock):
    type: str = "table_of_contents"
    table_of_contents: TableOfContentsAttributes


class BreadcrumbBlock(BaseNotionBlock):
    type: str = "breadcrumb"
    # The breadcrumb payload is an empty dict in the API response.
    breadcrumb: Optional[Dict]


# TODO: Column List and Column Blocks


class LinkPreviewBlock(BaseNotionBlock):
    type: str = "link_preview"
    link_preview: LinkPreviewAttributes


# TODO: Template blocks


class LinkToPageBlock(BaseNotionBlock):
    type: str = "link_to_page"
    link_to_page: LinkToPageAttributes


# TODO: Synced Block blocks

# TODO: Table blocks

# TODO: Table row blocks
281 |
# Maps the block-type string (e.g. "paragraph") to its model class.
# Keyed by the `type` field's declared default on each subclass, instead of
# relying on the positional order of `__fields__` (the original used the
# last field's name, which breaks if a subclass ever adds another field).
BLOCKS_MAPPING = {
    _cls.__fields__["type"].default: _cls for _cls in BaseNotionBlock.__subclasses__()
}
285 |
286 |
def parse_one_block(data: Dict) -> BaseNotionBlock:
    """Parse one raw block dict into its typed model; None when unknown."""
    block_cls = BLOCKS_MAPPING.get(data["type"])
    if block_cls is None:
        # Unsupported block type: warn and signal the caller with None.
        warnings.warn(f"Unknown block type: {data['type']}")
        return None
    return parse_obj_as(block_cls, data)
293 |
294 |
def parse_blocks(
    data: List[Dict], recursive: bool = False, client: Client = None
) -> List[BaseNotionBlock]:
    """Parse raw block dicts into models, optionally fetching children.

    Args:
        data: raw block dicts as returned by the Notion API.
        recursive: when True (and a client is given), fetch and parse the
            children of every block that reports ``has_children``.
        client: Notion client used for the recursive children fetches.

    Returns:
        The parsed blocks; unknown block types are skipped.
    """
    all_blocks = []
    for block_data in data:
        block = parse_one_block(block_data)
        if block is None:
            # parse_one_block warned about the unknown type; skipping here
            # avoids an AttributeError on `block.has_children` below.
            continue
        if block.has_children and recursive and client:
            block.set_children(
                parse_blocks(
                    client.blocks.children.list(block_id=block.id)["results"],
                    recursive=recursive,
                    client=client,
                )
            )
        all_blocks.append(block)
    return all_blocks
311 |
--------------------------------------------------------------------------------
/src/notion_df/configs.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional, Callable, Tuple
2 | import warnings
3 | import itertools
4 | from dataclasses import dataclass
5 |
6 | from pydantic import BaseModel, validator, parse_obj_as
7 | from pandas.api.types import (
8 | is_datetime64_any_dtype,
9 | is_numeric_dtype,
10 | is_object_dtype,
11 | is_bool_dtype,
12 | is_categorical_dtype,
13 | is_list_like,
14 | )
15 |
16 | from notion_df.base import (
17 | SelectOptions,
18 | NumberFormat,
19 | RollupProperty,
20 | FormulaProperty,
21 | RelationProperty,
22 | )
23 | from notion_df.utils import (
24 | flatten_dict,
25 | IDENTITY_TRANSFORM,
26 | REMOVE_EMPTY_STR_TRANSFORM,
27 | SECURE_STR_TRANSFORM,
28 | SECURE_BOOL_TRANSFORM,
29 | SECURE_TIME_TRANSFORM,
30 | LIST_TRANSFORM,
31 | )
32 |
33 |
class BasePropertyConfig(BaseModel):
    """Base class for database property configurations.

    By convention each subclass declares its type-specific field last;
    that field's name doubles as the Notion ``type`` string.
    """

    id: Optional[str]
    type: Optional[str]

    def query_dict(self):
        # Drop None entries so the payload is suitable for API requests.
        return flatten_dict(self.dict())

    @validator("type", always=True)
    def automatically_set_type_value(cls, v):
        # The last declared field's name is the Notion type string; fill in
        # `type` from it, or assert consistency when it was given explicitly.
        _type = list(cls.__fields__.keys())[-1]
        if v is None:
            return _type
        else:
            assert _type == v, f"{_type} != {v}"
            return _type
49 |
50 |
class TitleConfig(BasePropertyConfig):
    """Property config for the title column (payload must be empty)."""
    title: Dict = {}

    # TODO: Make the validator automatically geneerated
    @validator("title")
    def title_is_empty_dict(cls, v):
        if v:
            raise ValueError("The title dict must be empty")
        return v
60 |
61 |
class RichTextConfig(BasePropertyConfig):
    """Property config for a rich_text column (payload must be empty)."""

    rich_text: Dict = {}

    # Renamed from the copy-pasted `title_is_empty_dict` so the validator
    # name matches the field it actually validates.
    @validator("rich_text")
    def rich_text_is_empty_dict(cls, v):
        if v:
            raise ValueError("The rich_text dict must be empty")
        return v
70 |
71 |
class NumberConfig(BasePropertyConfig):
    """Property config for a number column, with its display format."""
    number: NumberFormat

    # TODO:Add enum based on https://developers.notion.com/reference/create-a-database#number-configuration


class SelectConfig(BasePropertyConfig):
    """Property config for a select column, with its allowed options."""
    select: Optional[SelectOptions]


class MultiSelectConfig(BasePropertyConfig):
    """Property config for a multi_select column, with its allowed options."""
    multi_select: Optional[SelectOptions]
84 |
85 |
# The validators below were all copy-pasted as `title_is_empty_dict`; they
# are renamed to match the field each one actually validates.
class DateConfig(BasePropertyConfig):
    """Property config for a date column (payload must be empty)."""

    date: Dict = {}

    @validator("date")
    def date_is_empty_dict(cls, v):
        if v:
            raise ValueError("The date dict must be empty")
        return v


class PeopleConfig(BasePropertyConfig):
    """Property config for a people column (payload must be empty)."""

    people: Dict = {}

    @validator("people")
    def people_is_empty_dict(cls, v):
        if v:
            raise ValueError("The people dict must be empty")
        return v


class FilesConfig(BasePropertyConfig):
    """Property config for a files column (payload must be empty)."""

    files: Dict = {}

    @validator("files")
    def files_is_empty_dict(cls, v):
        if v:
            raise ValueError("The files dict must be empty")
        return v


class CheckboxConfig(BasePropertyConfig):
    """Property config for a checkbox column (payload must be empty)."""

    checkbox: Dict = {}

    @validator("checkbox")
    def checkbox_is_empty_dict(cls, v):
        if v:
            raise ValueError("The checkbox dict must be empty")
        return v


class URLConfig(BasePropertyConfig):
    """Property config for a url column (payload must be empty)."""

    url: Dict = {}

    @validator("url")
    def url_is_empty_dict(cls, v):
        if v:
            raise ValueError("The url dict must be empty")
        return v


class EmailConfig(BasePropertyConfig):
    """Property config for an email column (payload must be empty)."""

    email: Dict = {}

    @validator("email")
    def email_is_empty_dict(cls, v):
        if v:
            raise ValueError("The email dict must be empty")
        return v


class PhoneNumberConfig(BasePropertyConfig):
    """Property config for a phone_number column (payload must be empty)."""

    phone_number: Dict = {}

    @validator("phone_number")
    def phone_number_is_empty_dict(cls, v):
        if v:
            raise ValueError("The phone_number dict must be empty")
        return v
154 |
155 |
class FormulaConfig(BasePropertyConfig):
    """Property config for a formula column."""
    formula: FormulaProperty


class RelationConfig(BasePropertyConfig):
    """Property config for a relation column."""
    relation: RelationProperty


class RollupConfig(BasePropertyConfig):
    """Property config for a rollup column."""
    rollup: RollupProperty
166 |
167 |
# As above, the copy-pasted `title_is_empty_dict` validators are renamed to
# match the field each one validates.
class CreatedTimeConfig(BasePropertyConfig):
    """Property config for a created_time column (payload must be empty)."""

    created_time: Dict = {}

    @validator("created_time")
    def created_time_is_empty_dict(cls, v):
        if v:
            raise ValueError("The created_time dict must be empty")
        return v


class CreatedByConfig(BasePropertyConfig):
    """Property config for a created_by column (payload must be empty)."""

    created_by: Dict = {}

    @validator("created_by")
    def created_by_is_empty_dict(cls, v):
        if v:
            raise ValueError("The created_by dict must be empty")
        return v


class LastEditedTimeConfig(BasePropertyConfig):
    """Property config for a last_edited_time column (payload must be empty)."""

    last_edited_time: Dict = {}

    @validator("last_edited_time")
    def last_edited_time_is_empty_dict(cls, v):
        if v:
            raise ValueError("The last_edited_time dict must be empty")
        return v


class LastEditedByConfig(BasePropertyConfig):
    """Property config for a last_edited_by column (payload must be empty)."""

    last_edited_by: Dict = {}

    @validator("last_edited_by")
    def last_edited_by_is_empty_dict(cls, v):
        if v:
            raise ValueError("The last_edited_by dict must be empty")
        return v
206 |
207 |
208 | def _convert_classname_to_typename(s):
209 | import re
210 |
211 | s = s.replace("Config", "").replace("URL", "Url")
212 | return re.sub(r"(? BasePropertyConfig:
232 | return parse_obj_as(CONFIGS_MAPPING[data["type"]], data)
233 |
234 |
# Maps property type -> callable used to coerce a df column's cell values
# before upload; None means the values are used as-is.
CONFIGS_DF_TRANSFORMER = {
    "title": SECURE_STR_TRANSFORM,
    "rich_text": SECURE_STR_TRANSFORM,
    "number": None,
    "select": REMOVE_EMPTY_STR_TRANSFORM,
    "multi_select": lambda lst: [str(ele) for ele in lst]
    if is_list_like(lst)
    else str(lst),
    "date": SECURE_TIME_TRANSFORM,
    "checkbox": SECURE_BOOL_TRANSFORM,
    ### Notion-specific Properties ###
    # Currently we don't automatically convert these properties
    # We assume the users will use the correct type and we don't need to perform any transformation
    "people": IDENTITY_TRANSFORM,
    "relation": LIST_TRANSFORM,
    "url": REMOVE_EMPTY_STR_TRANSFORM,
    "email": REMOVE_EMPTY_STR_TRANSFORM,
    ### TODO: check the following ###
    "files": SECURE_STR_TRANSFORM,
    "phone_number": SECURE_STR_TRANSFORM,
    "formula": SECURE_STR_TRANSFORM,
    "rollup": SECURE_STR_TRANSFORM,
    "created_time": SECURE_STR_TRANSFORM,
    "created_by": SECURE_STR_TRANSFORM,
    "last_edited_time": SECURE_STR_TRANSFORM,
    "last_edited_by": SECURE_STR_TRANSFORM,
}
262 |
263 |
def _infer_series_config(column: "pd.Series") -> BasePropertyConfig:
    """Infer a Notion property config from a pandas Series.

    Returns None when no suitable config can be inferred from the dtype.
    """
    dtype = column.dtype

    if is_object_dtype(dtype):
        if all(is_list_like(ele) for ele in column):
            # Columns of lists become a multi-select over all seen values.
            all_possible_values = set(
                list(itertools.chain.from_iterable(column.to_list()))
            )
            all_possible_values = [str(ele) for ele in all_possible_values]
            return MultiSelectConfig(
                multi_select=SelectOptions.from_value(all_possible_values),
            )
        else:
            return RichTextConfig()
    if is_numeric_dtype(dtype):
        return NumberConfig(number=NumberFormat(format="number"))
    if is_bool_dtype(dtype):
        return CheckboxConfig()
    if is_categorical_dtype(dtype):
        # BUG FIX: the original wrote `[str for cat in dtype.categories]`,
        # producing a list of the `str` type object instead of the
        # stringified category values.
        return SelectConfig(
            select=SelectOptions.from_value([str(cat) for cat in dtype.categories]),
        )
    if is_datetime64_any_dtype(dtype):
        return DateConfig()

    return None
290 |
291 |
@dataclass
class DatabaseSchema:
    """The schema of a Notion database: column name -> property config."""

    configs: Dict[str, BasePropertyConfig]

    @classmethod
    def from_raw(cls, configs: Dict) -> "DatabaseSchema":
        """Parse raw Notion property dicts into typed config objects."""
        configs = {key: parse_single_config(config) for key, config in configs.items()}
        return cls(configs)

    def __getitem__(self, key: str):
        # Look up the property config for a column name.
        return self.configs[key]

    def query_dict(self) -> Dict:
        # Serialized (None-free) configs, suitable for API payloads.
        return {key: config.query_dict() for key, config in self.configs.items()}

    @classmethod
    def from_df(
        cls, df: "pd.DataFrame", title_col: Optional[str] = None
    ) -> "DatabaseSchema":
        """Automatically infer the schema from a pandas dataframe"""
        df = df.infer_objects()

        configs = {}
        for col in df.columns:
            config = _infer_series_config(df[col])
            configs[col] = config

        # The title column defaults to the first df column when unspecified.
        if title_col is not None:
            configs[title_col] = TitleConfig()
        else:
            configs[df.columns[0]] = TitleConfig()

        return cls(configs)

    @property
    def title_column(self) -> Optional[str]:
        """The name of the title column, or None when there is none."""
        for key, config in self.configs.items():
            if isinstance(config, TitleConfig) or config.type == "title":
                # TODO: Rethink this
                return key

    def create_df(self, df) -> "pd.DataFrame":
        """Copy ``df`` restricted to the schema's columns, carrying over the
        notion_* metadata attributes and attaching this schema."""

        notion_urls = df.notion_urls
        notion_ids = df.notion_ids
        notion_query_results = df.notion_query_results

        df = df.copy()
        # Ensure the column integrity
        # See the issue mentioned in https://github.com/lolipopshock/notion-df/issues/17
        columns = [col for col in df.columns if col in self.configs]
        df = df[columns]

        df.schema = self

        with warnings.catch_warnings():
            # Silence pandas' warnings about attaching non-column attributes.
            warnings.simplefilter("ignore")
            df.notion_urls = notion_urls
            df.notion_ids = notion_ids
            df.notion_query_results = notion_query_results

        return df

    def is_df_compatible(self, df: "pd.DataFrame") -> bool:
        """Validate the dataframe against the schema"""

        if hasattr(df, "schema"):
            if not df.schema == self:
                return False

            # TODO: There might miss one thing: if the rollup is not configured
            # the database reterive result will be empty for that column.
            # But the database query will return the value for that column
            # (even if that's empty). So this would miss this check...
        else:
            for col in df.columns:
                if col not in self.configs.keys():
                    return False

        # TODO: Add more advanced check on datatypes
        return True

    def transform(
        self, df: "pd.DataFrame", remove_non_editables=False
    ) -> "pd.DataFrame":
        """Transform the df such that the data values are compatible with the schema.
        It assumes the df has already been validated against the schema.
        """
        df = df.copy()
        used_columns = []
        for col in df.columns:
            if self[col].type in NON_EDITABLE_TYPES:
                continue  # Skip non-editable columns

            # Apply the per-type value transformer (None means identity).
            transform = CONFIGS_DF_TRANSFORMER[self[col].type]
            if transform is not None:
                df[col] = df[col].apply(transform)
            used_columns.append(col)
        if remove_non_editables:
            return df[used_columns]
        return df
395 |
--------------------------------------------------------------------------------
/src/notion_df/constants.py:
--------------------------------------------------------------------------------
# See https://developers.notion.com/reference/request-limits

# Maximum characters in one rich-text content chunk.
RICH_TEXT_CONTENT_MAX_LENGTH = 2000
# Maximum characters in a rich-text link URL.
RICH_TEXT_LINK_MAX_LENGTH = 1000
# Maximum characters in an equation expression.
EQUATION_EXPRESSION_MAX_LENGTH = 1000
--------------------------------------------------------------------------------
/src/notion_df/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional, Union, Any
2 | from datetime import datetime
3 | from dateutil.parser import parse
4 | from uuid import UUID
5 |
6 | import pandas as pd
7 | from pandas.api.types import is_array_like, is_datetime64_any_dtype, is_list_like
8 |
9 |
def flatten_dict(data: Dict):
    """Recursively drop dict entries whose values are None.

    Mappings are rebuilt without ``None``-valued keys; lists and tuples are
    rebuilt as lists with each element processed recursively (``None``
    elements *inside* sequences are kept); any other value is returned as-is.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if value is not None:
                cleaned[key] = flatten_dict(value)
        return cleaned
    if isinstance(data, (list, tuple)):
        return [flatten_dict(item) for item in data]
    return data
20 |
21 |
def is_item_empty(item: Any) -> bool:
    """Return a truthy value when ``item`` holds no usable data.

    ``None`` and the empty list count as empty; everything else defers to
    ``pd.isna``, reduced with ``.all()`` when the answer is array-like.
    """
    if item is None:
        return True
    if item == []:
        return True

    missing = pd.isna(item)
    if is_array_like(missing):
        # Collapse the element-wise answer to a single flag.
        # TODO: Rethink it is all or any
        missing = missing.all()

    return missing
33 |
34 |
def is_time_string(s: str) -> bool:
    """Return True when ``s`` can be parsed as a date/time string.

    Ref https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format

    ``dateutil.parser.parse`` raises ``ValueError`` (its ``ParserError`` is a
    subclass) for unparseable text, but it can also raise ``OverflowError``
    for numeric values outside the platform date range; both mean "not a
    usable time string" here, so both are caught.
    """
    try:
        parse(s)
        return True
    except (ValueError, OverflowError):
        return False
43 |
44 |
def is_uuid(s: str) -> bool:
    """Return True when ``str(s)`` is a valid UUID literal.

    Kind of an OK solution.. But can be further improved?
    """
    try:
        UUID(str(s))
    except ValueError:
        return False
    return True
52 |
53 |
54 | ISO8601_REGEX = r"^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$"
55 | # See https://stackoverflow.com/questions/41129921/validate-an-iso-8601-datetime-string-in-python
56 | ISO8601_STRFTIME_TRANSFORM = lambda ele: ele.strftime("%Y-%m-%dT%H:%M:%SZ")
57 |
58 | strtime_transform = lambda ele: parse(ele).strftime("%Y-%m-%dT%H:%M:%SZ")
59 | datetime_transform = lambda ele: ele.strftime("%Y-%m-%dT%H:%M:%SZ")
60 |
61 |
def transform_time(s: Any) -> str:
    """Convert ``s`` to an ISO-8601 "...Z" string, or None when not possible.

    Strings are parsed first; ``datetime`` objects and datetime64-dtyped
    values are formatted directly.  Empty items (per ``is_item_empty``) and
    unrecognised types fall through, yielding an implicit None.
    """
    if is_item_empty(s):
        return None
    if isinstance(s, str):
        return strtime_transform(s)
    if isinstance(s, datetime) or is_datetime64_any_dtype(s):
        return datetime_transform(s)
70 |
71 |
# Passes a value through unchanged.
IDENTITY_TRANSFORM = lambda ele: ele
# Stringifies a value, mapping empty items to "".
SECURE_STR_TRANSFORM = lambda ele: str(ele) if not is_item_empty(ele) else ""
# Wraps a scalar into a one-element list; list-likes pass through unchanged.
LIST_TRANSFORM = lambda ele: ele if is_list_like(ele) else [ele]
# Maps "" / None / NaN to None and everything else to its string form.
# NOTE(review): pd.isna returns an array for list inputs, which would raise
# here -- presumably only scalars reach this transform; confirm with callers.
REMOVE_EMPTY_STR_TRANSFORM = (
    lambda ele: None if ele == "" or ele is None or pd.isna(ele) else SECURE_STR_TRANSFORM(ele)
)
# Coerces a non-empty value to bool; empty items become None.
SECURE_BOOL_TRANSFORM = lambda ele: bool(ele) if not is_item_empty(ele) else None
# Time normalisation delegates to transform_time above.
SECURE_TIME_TRANSFORM = transform_time
80 |
--------------------------------------------------------------------------------
/src/notion_df/values.py:
--------------------------------------------------------------------------------
1 | ### Referring to https://developers.notion.com/reference/page#property-value-object
2 |
3 | from typing import List, Dict, Optional, Union, Any
4 | from dataclasses import dataclass
5 | from copy import deepcopy
6 | import numbers
7 |
8 | from pydantic import BaseModel, parse_obj_as, validator, root_validator
9 | import pandas as pd
10 | from pandas.api.types import is_array_like
11 |
12 | from notion_df.base import (
13 | RichTextObject,
14 | SelectOption,
15 | DateObject,
16 | RelationObject,
17 | UserObject,
18 | RollupObject,
19 | FileObject,
20 | FormulaObject
21 | )
22 | from notion_df.utils import (
23 | flatten_dict,
24 | is_list_like
25 | )
26 |
27 |
class BasePropertyValues(BaseModel):
    """Base class for Notion page property values.

    Each concrete subclass declares a third field named after its Notion
    property type (``title``, ``number``, ...); ``VALUES_MAPPING`` below
    keys on that field name.
    """

    id: Optional[str] # TODO: Rethink whether we can do this
    # The Optional[id] is used when creating property values
    type: Optional[str]

    # TODO: Add abstractmethods for them
    @classmethod
    def from_value(cls, value):
        """Build an instance from a plain Python value (placeholder)."""
        pass

    @property
    def value(self):
        """Plain-Python view of the stored property (placeholder)."""
        pass

    def query_dict(self):
        """Serialize to a dict with None entries removed, for API payloads."""
        return flatten_dict(self.dict())
44 |
45 |
class TitleValues(BasePropertyValues):
    """The ``title`` property: a list of rich-text fragments."""

    title: List[RichTextObject]

    @property
    def value(self) -> Optional[str]:
        # Join all fragments with spaces; None when there is no title text.
        return (
            None
            if len(self.title) == 0
            else " ".join([text.value for text in self.title])
        )

    @classmethod
    def from_value(cls, value):
        return cls(title=RichTextObject.encode_string(value))
        # TODO: Rethink whether we should split input string to multiple elements in the list
61 |
62 |
class RichTextValues(BasePropertyValues):
    """The ``rich_text`` property: a list of rich-text fragments."""

    rich_text: List[RichTextObject]

    @property
    def value(self) -> Optional[str]:
        # Join all fragments with spaces; None when the list is empty.
        return (
            None
            if len(self.rich_text) == 0
            else " ".join([text.value for text in self.rich_text])
        )

    @classmethod
    def from_value(cls, value: str):
        return cls(rich_text=RichTextObject.encode_string(value))
77 |
78 |
class NumberValues(BasePropertyValues):
    """The ``number`` property."""

    number: Optional[Union[float, int]]

    @property
    def value(self) -> Optional[Union[float, int]]:
        # Annotation corrected: this returns the stored number (or None),
        # not a str as previously annotated.
        return self.number

    @classmethod
    def from_value(cls, value: Union[float, int]):
        return cls(number=value)
89 |
90 |
class SelectValues(BasePropertyValues):
    """The ``select`` property: at most one option."""

    select: Optional[SelectOption]

    @property
    def value(self) -> Optional[str]:
        # The option name, or None when nothing is selected.
        return self.select.name if self.select else None

    @classmethod
    def from_value(cls, value: str):
        return cls(select=SelectOption.from_value(value))
101 |
102 |
class MultiSelectValues(BasePropertyValues):
    """The ``multi_select`` property: zero or more select options."""

    multi_select: List[SelectOption]

    @property
    def value(self) -> List[str]:
        """Option names, in stored order."""
        return [option.name for option in self.multi_select]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        """Build from a single option name or a list-like of names."""
        if not is_list_like(values):
            values = [values]
        return cls(multi_select=[SelectOption.from_value(item) for item in values])
118 |
119 |
class DateValues(BasePropertyValues):
    """The ``date`` property."""

    date: Optional[DateObject]

    @property
    def value(self) -> Optional[str]:
        # Annotation corrected to Optional: None when no date is set.
        return self.date.value if self.date else None

    @classmethod
    def from_value(cls, value: str):
        return cls(date=DateObject.from_value(value))
130 |
131 |
class FormulaValues(BasePropertyValues):
    """The read-only ``formula`` property; delegates to the formula object."""

    formula: FormulaObject

    @property
    def value(self):
        return self.formula.value
138 |
139 |
class RelationValues(BasePropertyValues):
    """The ``relation`` property: links to pages of a related database."""

    relation: List[RelationObject]

    @property
    def value(self) -> List[str]:
        """IDs of the related pages."""
        return [item.id for item in self.relation]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        """Build from a single page id or a list-like of ids."""
        if not is_list_like(values):
            values = [values]
        return cls(relation=[RelationObject.from_value(item) for item in values])
153 |
154 |
class PeopleValues(BasePropertyValues):
    """The ``people`` property: a list of Notion users."""

    people: List[UserObject]

    @property
    def value(self) -> List[str]:
        """IDs of the assigned users."""
        return [user.id for user in self.people]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        """Build from a single user id or a list-like of ids."""
        if not is_list_like(values):
            values = [values]
        return cls(people=[UserObject.from_value(item) for item in values])
168 |
169 |
class FilesValues(BasePropertyValues):
    """The ``files`` property (read-oriented: no ``from_value`` provided)."""

    files: List[FileObject]

    @property
    def value(self) -> List[str]:
        return [file.value for file in self.files]
176 |
class CheckboxValues(BasePropertyValues):
    """The ``checkbox`` property."""

    checkbox: Optional[bool]

    @property
    def value(self) -> Optional[bool]:
        return self.checkbox

    @classmethod
    def from_value(cls, value: bool):
        return cls(checkbox=value)
187 |
188 |
class URLValues(BasePropertyValues):
    """The ``url`` property."""

    url: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.url

    @classmethod
    def from_value(cls, value: Optional[str]):
        return cls(url=value)

    def query_dict(self):
        """Serialize for the API, keeping the mandatory ``url`` key.

        ``flatten_dict`` drops None entries, but the Notion API requires the
        url value to be present even when it is None -- restore it.
        """
        payload = flatten_dict(self.dict())
        payload.setdefault("url", None)
        return payload
206 |
207 |
class EmailValues(BasePropertyValues):
    """The ``email`` property."""

    email: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.email

    @classmethod
    def from_value(cls, value: str):
        return cls(email=value)
218 |
219 |
class PhoneNumberValues(BasePropertyValues):
    """The ``phone_number`` property."""

    phone_number: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.phone_number

    @classmethod
    def from_value(cls, value: str):
        return cls(phone_number=value)
230 |
231 |
class CreatedTimeValues(BasePropertyValues):
    """The read-only ``created_time`` property (ISO timestamp string)."""

    created_time: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.created_time

    @classmethod
    def from_value(cls, value: str):
        return cls(created_time=value)
242 |
243 |
class CreatedByValues(BasePropertyValues):
    """The read-only ``created_by`` property."""

    created_by: UserObject

    @property
    def value(self) -> List[str]:
        # NOTE(review): annotated List[str] but simply forwards
        # UserObject.value -- confirm that value is actually a list.
        return self.created_by.value
250 |
251 |
class LastEditedTimeValues(BasePropertyValues):
    """The read-only ``last_edited_time`` property (ISO timestamp string)."""

    last_edited_time: str

    @property
    def value(self) -> Optional[str]:
        return self.last_edited_time

    @classmethod
    def from_value(cls, value: str):
        return cls(last_edited_time=value)
262 |
263 |
class LastEditedByValues(BasePropertyValues):
    """The read-only ``last_edited_by`` property."""

    last_edited_by: UserObject

    @property
    def value(self) -> List[str]:
        # NOTE(review): annotated List[str] but simply forwards
        # UserObject.value -- confirm that value is actually a list.
        return self.last_edited_by.value
270 |
271 |
# Map each property-type field name ("title", "number", ...) to its value
# class.  Relies on each subclass declaring its type-specific field last
# (after the inherited ``id`` and ``type``), so ``__fields__`` has exactly
# three entries and its final key names the Notion type.
VALUES_MAPPING = {
    list(_cls.__fields__.keys())[-1]: _cls
    for _cls in BasePropertyValues.__subclasses__()
    if len(_cls.__fields__)
    == 3  # TODO: When all classes have been implemented, we can just remove this check
}
278 |
279 |
class RollupValues(BasePropertyValues):
    """The ``rollup`` property.

    Defined after VALUES_MAPPING is built because the validator below uses
    that mapping to parse the nested ``array`` entries into typed values.
    """

    rollup: RollupObject

    @validator("rollup", pre=True)
    def check_rollup_values(cls, val):
        # Pre-parse raw "array" entries into their concrete value classes;
        # deepcopy keeps the caller's payload untouched.
        val = deepcopy(val)
        if val.get("array") is not None:
            val["array"] = [
                parse_obj_as(VALUES_MAPPING[data["type"]], data)
                for data in val["array"]
            ]
        return val

    @property
    def value(self):
        return self.rollup.value
296 |
297 |
298 | VALUES_MAPPING["rollup"] = RollupValues
299 |
300 |
def parse_single_values(data: Dict) -> BasePropertyValues:
    """Parse one raw property payload into its typed value object."""
    value_cls = VALUES_MAPPING[data["type"]]
    return parse_obj_as(value_cls, data)
303 |
304 |
def _guess_value_schema(val: Any) -> object:
    """Infer the value class for ``val`` when no schema is available.

    ``bool`` must be tested before ``numbers.Number``: in Python ``bool``
    is a subclass of ``int`` (and hence a ``Number``), so the original
    ordering routed booleans to NumberValues and made the checkbox branch
    unreachable.

    Raises ValueError for any other type.
    """
    if isinstance(val, str):
        return RichTextValues
    elif isinstance(val, bool):
        return CheckboxValues
    elif isinstance(val, numbers.Number):
        return NumberValues
    else:
        raise ValueError(f"Unknown value type: {type(val)}")
315 |
316 |
317 | def _is_item_empty(item):
318 |
319 | if item is None or item == []:
320 | return True
321 |
322 | isna = pd.isna(item)
323 | if is_array_like(isna):
324 | isna = isna.all()
325 | # TODO: Rethink it is all or any
326 |
327 | return isna
328 |
329 |
330 | RESERVED_VALUES = ["url"]
331 | # Even if the value is none, we still want to keep it in the dataframe
332 |
333 |
334 | def _is_reserved_value(key, schema):
335 | return schema[key].type in RESERVED_VALUES
336 |
337 |
def parse_value_with_schema(
    idx: int, key: str, value: Any, schema: "DatabaseSchema"
) -> BasePropertyValues:
    """Convert one (column, value) pair into a typed property value.

    With a schema, the column's configured type picks the value class.
    Without one, column 0 is forced to be the title and other columns are
    guessed from the Python type of ``value``.
    """
    # TODO: schema shouldn't be allowed to be empty in the future version
    # schema should be determined at the dataframe level.
    if schema is not None:
        value_cls = VALUES_MAPPING[schema[key].type]
    elif idx == 0:
        # TODO: Brutally enforce the first one to be the title, though
        # should be optimized in future versions
        value_cls = TitleValues
        value = str(value)
    else:
        value_cls = _guess_value_schema(value)

    return value_cls.from_value(value)
356 |
357 |
@dataclass
class PageProperty:
    """This class is used to parse properties of a single Notion Page.

    :: example:

    >>> data = \
    {"Description": {"id": "ji%3Dc", "type": "rich_text", "rich_text": []},
    "Created": {"id": "mbOA", "type": "date", "date": None},
    "Title": {"id": "title", "type": "title", "title": []}}
    >>> property = PageProperty.from_raw(data)
    """

    # Column name -> typed property value.
    properties: Dict[str, BasePropertyValues]

    @classmethod
    def from_raw(cls, properties: Dict) -> "PageProperty":
        """Parse the raw ``properties`` payload of one page API response."""
        properties = {k: parse_single_values(v) for k, v in properties.items()}
        return cls(properties)

    def __getitem__(self, key):
        return self.properties[key]

    def to_series(self):
        """Flatten to a pandas Series of plain-Python values, keyed by column."""
        return pd.Series(
            {key: property.value for key, property in self.properties.items()}
        )

    @classmethod
    def from_series(
        cls, series: pd.Series, schema: "DatabaseSchema" = None
    ) -> "PageProperty":
        """Build property values from one DataFrame row.

        Empty items are skipped unless their column type is reserved.
        NOTE(review): with the default ``schema=None``, an empty item still
        reaches ``_is_reserved_value(key, None)`` -- verify that helper
        tolerates a None schema.
        """
        return cls(
            {
                key: parse_value_with_schema(idx, key, val, schema)
                for idx, (key, val) in enumerate(series.items())
                if not _is_item_empty(val) or _is_reserved_value(key, schema)
            }
        )

    def query_dict(self) -> Dict:
        """Per-column API payloads, ready to send as a page ``properties`` body."""
        return {key: property.query_dict() for key, property in self.properties.items()}
400 |
401 |
@dataclass
class PageProperties:
    """Parses the property payloads of multiple pages within a database.

    Example::

        >>> raw = [
        ...     {"object": "page", "id": "xxxx",
        ...      "created_time": "2032-01-03T00:00:00.000Z",
        ...      "properties": {"Title": {"id": "title", "type": "title", "title": []}}},
        ... ]
        >>> properties = PageProperties.from_raw(raw)
    """

    # One parsed PageProperty per page, in API order.
    page_properties: List[PageProperty]

    @classmethod
    def from_raw(cls, properties: List[Dict]) -> "PageProperties":
        """Parse the ``properties`` field of every raw page object."""
        parsed = [PageProperty.from_raw(page["properties"]) for page in properties]
        return cls(parsed)

    def __getitem__(self, key: int):
        return self.page_properties[key]

    def to_frame(self):
        """Stack every page into a DataFrame, one row per page."""
        rows = [page.to_series() for page in self.page_properties]
        return pd.DataFrame(rows)
448 |
--------------------------------------------------------------------------------
/tests/test_agent.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | from notion_df.agent import download
4 |
# Credentials / fixture ids come from the environment so CI without
# secrets simply skips these tests.
NOTION_API_KEY = os.environ.get("NOTION_API_KEY")
NOTION_LARGE_DF = os.environ.get("NOTION_LARGE_DF")
# Known row count of the NOTION_LARGE_DF fixture database.
NOTION_LARGE_DF_ROWS = 150

def test_nrows():
    """download() honours ``nrows`` and otherwise fetches the full table."""
    if not NOTION_LARGE_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    df = download(NOTION_LARGE_DF, api_key=NOTION_API_KEY)
    assert len(df) == NOTION_LARGE_DF_ROWS

    # 101 rows -- presumably chosen to cross the API pagination boundary;
    # confirm against the agent's page size.
    df = download(NOTION_LARGE_DF, nrows=101, api_key=NOTION_API_KEY)
    assert len(df) == 101

    df = download(NOTION_LARGE_DF, nrows=15, api_key=NOTION_API_KEY)
    assert len(df) == 15
--------------------------------------------------------------------------------
/tests/test_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import pytest
4 | import notion_df
5 | import pandas as pd
6 | from pydantic import ValidationError
7 | from notion_df.agent import download, upload
8 |
NOTION_API_KEY = os.environ.get("NOTION_API_KEY")


def test_select_option():
    """Multi-select transform stringifies elements; commas are rejected."""
    schema = notion_df.configs.DatabaseSchema(
        {"options": notion_df.configs.MultiSelectConfig()}
    )

    # Plain scalar options are accepted after transformation.
    frame = pd.DataFrame([{"options": [1, 2, 3]}])
    transformed = schema.transform(frame)
    notion_df.values.PageProperty.from_series(transformed.iloc[0], schema)

    # Not working because of commas in the option string
    frame = pd.DataFrame([{"options": ["a,b", "c,d"]}])
    transformed = schema.transform(frame)
    with pytest.raises(ValidationError):
        notion_df.values.PageProperty.from_series(transformed.iloc[0], schema)

    # The following also checks whether it can convert elements into strings
    frame = pd.DataFrame([{"options": [[1, 2, 3], [4, 5, 6]]}])
    transformed = schema.transform(frame)
    with pytest.raises(ValidationError):
        notion_df.values.PageProperty.from_series(transformed.iloc[0], schema)
32 |
33 |
def test_rollup():
    """Rollup columns survive a download followed by a partial upload."""
    NOTION_ROLLUP_DF = os.environ.get("NOTION_ROLLUP_DF")

    if not NOTION_ROLLUP_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    # Ensure the rollup values can be downloaded and uploaded
    df = download(NOTION_ROLLUP_DF, api_key=NOTION_API_KEY)
    upload(df[:2], NOTION_ROLLUP_DF, api_key=NOTION_API_KEY)
    # TODO: Add remove rollup values
44 |
45 |
def test_files_edit_by():
    """Smoke test: the files fixture database downloads without error."""
    NOTION_FILES_DF = os.environ.get("NOTION_FILES_DF")

    if not NOTION_FILES_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    df = download(NOTION_FILES_DF, api_key=NOTION_API_KEY)
53 |
54 |
def test_formula():
    """Smoke test: the formula fixture database downloads without error."""
    NOTION_FORMULA_DF = os.environ.get("NOTION_FORMULA_DF")

    if not NOTION_FORMULA_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    df = download(NOTION_FORMULA_DF, api_key=NOTION_API_KEY)
62 |
63 |
def test_relation():
    """Round-trip relation columns through download/upload.

    Covers resolve/non-resolve combinations of ``resolve_relation_values``
    on download vs upload, and verifies that uploading a brand-new relation
    key creates a row in the target database.
    """
    NOTION_RELATION_DF = os.environ.get("NOTION_RELATION_DF")
    NOTION_RELATION_TARGET_DF = os.environ.get("NOTION_RELATION_TARGET_DF")

    if not NOTION_RELATION_DF or not NOTION_RELATION_TARGET_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    # download: resolve
    # upload: resolve
    df = download(
        NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=True
    )
    df_target = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY)

    assert "private_page" not in df.columns
    # See https://github.com/lolipopshock/notion-df/issues/17

    ## without a new key
    upload(
        df[:1],
        NOTION_RELATION_DF,
        api_key=NOTION_API_KEY,  # fix: was omitted, unlike every download call
        resolve_relation_values=True,
        create_new_rows_in_relation_target=True,
    )
    df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY)
    assert len(df_target_new) == len(df_target)

    ## with a new key
    rint = random.randint(0, 100000)
    df.at[0, "Related to Tasks"] = [f"test {rint}"]
    upload(
        df[:1],
        NOTION_RELATION_DF,
        api_key=NOTION_API_KEY,
        resolve_relation_values=True,
        create_new_rows_in_relation_target=True,
    )
    df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY)
    assert len(df_target_new) == len(df_target) + 1
    # fix: this comparison was a bare expression (no assert) and never checked
    assert df_target_new.iloc[-1]["name"] == f"test {rint}"

    # download: not-resolve
    # upload: resolve
    # Avoids creating new rows for uuid only lists
    df = download(
        NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=False
    )
    df_target = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY)

    upload(
        df[:1],
        NOTION_RELATION_DF,
        api_key=NOTION_API_KEY,
        resolve_relation_values=True,
        create_new_rows_in_relation_target=True,
    )
    df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY)
    assert len(df_target_new) == len(df_target)

    # download: resolve
    # upload: not-resolve
    # Raises error
    df = download(
        NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=True
    )

    with pytest.raises(ValidationError):
        upload(
            df[:1],
            NOTION_RELATION_DF,
            api_key=NOTION_API_KEY,
            resolve_relation_values=False,
        )
134 |
def test_long_string():
    """Long rich-text values round-trip through download/upload.

    The fixture cell is 7721 characters -- presumably relates to the
    2000-character rich-text chunk limit in constants.py; confirm.
    """
    NOTION_LONG_STRING_DF = os.environ.get("NOTION_LONG_STRING_DF")

    if not NOTION_LONG_STRING_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    df = download(NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY)
    assert len(df.iloc[0,1]) == 7721

    upload(df[:1], NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY)
    df_new = download(NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY)
    # assert len(df_new.iloc[0,1]) == 7721
    # This might not be true -- understand why?
148 |
def test_rich_text():
    """Smoke test: the rich-text fixture database downloads without error."""
    NOTION_RICH_TEXT_DF = os.environ.get("NOTION_RICH_TEXT_DF")

    if not NOTION_RICH_TEXT_DF or not NOTION_API_KEY:
        pytest.skip("API key not provided")

    df = download(NOTION_RICH_TEXT_DF, api_key=NOTION_API_KEY)