├── .github └── workflows │ ├── codeql-analysis.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── setup.py ├── src └── notion_df │ ├── __init__.py │ ├── _pandas.py │ ├── agent.py │ ├── base.py │ ├── blocks.py │ ├── configs.py │ ├── constants.py │ ├── utils.py │ └── values.py └── tests ├── test_agent.py └── test_base.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '29 20 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 
49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | release-pypi: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .gitattributes 3 | .last_checked 4 | 
.gitconfig 5 | *.bak 6 | *.log 7 | *~ 8 | ~* 9 | _tmp* 10 | tmp* 11 | tags 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | .vscode 116 | *.swp 117 | 118 | # osx generated files 119 | .DS_Store 120 | .DS_Store? 
121 | .Trashes 122 | ehthumbs.db 123 | Thumbs.db 124 | .idea 125 | 126 | # pytest 127 | .pytest_cache 128 | 129 | # tools/trust-doc-nbs 130 | docs_src/.last_checked 131 | 132 | # symlinks to fastai 133 | docs_src/fastai 134 | tools/fastai 135 | 136 | # link checker 137 | checklink/cookies.txt 138 | 139 | # .gitconfig is now autogenerated 140 | .gitconfig 141 | 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Shannon Shen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `notion-df`: Seamlessly Connecting Notion Database with Pandas DataFrame 2 | 3 | *Please Note: This project is currently in pre-alpha stage. The code is not appropriately documented or tested. Please report any issues you find. Thanks!* 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install notion-df 9 | ``` 10 | 11 | ## Usage 12 | 13 | - Before starting, please follow the instructions to [create a new integration](https://www.notion.so/my-integrations) and [add it to your Notion page or database](https://developers.notion.com/docs/getting-started#step-2-share-a-database-with-your-integration). 14 | - We'll refer to the `Internal Integration Token` as the `api_key` below. 15 | 16 | - Pandas-flavored APIs: you just need to add two additional lines of code: 17 | ```python 18 | import notion_df 19 | notion_df.pandas() #That's it! 20 | 21 | page_url = "paste your page url from Notion" 22 | api_key = "paste your api key (internal integration key)" 23 | 24 | import pandas as pd 25 | df = pd.read_notion(page_url, api_key=api_key) 26 | df.to_notion(page_url, api_key=api_key) 27 | ``` 28 | 29 | - Download your Notion table as a pandas DataFrame 30 | ```python 31 | import notion_df 32 | df = notion_df.download(notion_database_url, api_key=api_key) 33 | # Equivalent to: df = pd.read_notion(notion_database_url, api_key=api_key) 34 | df.head() 35 | ``` 36 |
37 | Only downloading the first `nrows` rows from a database 38 | 39 | ```python 40 | df = notion_df.download(notion_database_url, nrows=nrows) #e.g., 10 41 | ``` 42 | 43 |
44 | 45 |
46 | What if your table has a relation column? 47 | 48 | ```python 49 | df = notion_df.download(notion_database_url, 50 | resolve_relation_values=True) 51 | ``` 52 | Setting `resolve_relation_values=True` will automatically resolve the links for all the relation columns whose target can be accessed by the current notion integration. 53 | 54 | In detail, let's say the `"test"` column in df is a relation column in Notion. 55 | 1. When `resolve_relation_values=False`, the return results for that column will be a list of UUIDs of the target page: `['65e04f11-xxxx', 'b0ffcb4b-xxxx', ]`. 56 | 2. When `resolve_relation_values=True`, the return results for that column will be a list of regular strings corresponding to the name column of the target pages: `['page1', 'page2', ]`. 57 | 58 |
59 | 60 | - Append a local `df` to a Notion database: 61 | 62 | ```python 63 | import notion_df 64 | notion_df.upload(df, notion_database_url, title="page-title", api_key=api_key) 65 | # Equivalent to: df.to_notion(notion_database_url, title="page-title", api_key=api_key) 66 | ``` 67 | 68 | - Upload a local `df` to a newly created database in a Notion page: 69 | 70 | ```python 71 | import notion_df 72 | notion_df.upload(df, notion_page_url, title="page-title", api_key=api_key) 73 | # Equivalent to: df.to_notion(notion_page_url, title="page-title", api_key=api_key) 74 | ``` 75 | 76 | - Tired of typing `api_key=api_key` each time? 77 | 78 | ```python 79 | import notion_df 80 | notion_df.config(api_key=api_key) # Or set an environment variable `NOTION_API_KEY` 81 | df = notion_df.download(notion_database_url) 82 | notion_df.upload(df, notion_page_url, title="page-title") 83 | # Similarly in pandas APIs: df.to_notion(notion_page_url, title="page-title") 84 | ``` 85 | 86 | ## Development 87 | 88 | 1. Clone the repo and install the dependencies: 89 | ```bash 90 | git clone git@github.com:lolipopshock/notion-df.git 91 | cd notion-df 92 | pip install -e .[dev] 93 | ``` 94 | 2. How to run tests? 
95 | ```bash 96 | NOTION_API_KEY="" pytest tests/ 97 | ``` 98 | The tests depend on a set of Notion databases, specified by the following environment variables: 99 | 100 | | Environment Variable | Description | 101 | | --------------------------- | --------------------------------------- | 102 | | `NOTION_API_KEY` | The API key for your Notion integration | 103 | | `NOTION_ROLLUP_DF` | - | 104 | | `NOTION_FILES_DF` | - | 105 | | `NOTION_FORMULA_DF` | - | 106 | | `NOTION_RELATION_DF` | - | 107 | | `NOTION_RELATION_TARGET_DF` | - | 108 | | `NOTION_LONG_STRING_DF` | - | 109 | | `NOTION_RICH_TEXT_DF` | - | 110 | 111 | 112 | ## TODOs 113 | 114 | - [ ] Add tests for 115 | - [ ] `load` 116 | - [ ] `upload` 117 | - [ ] `values.py` 118 | - [ ] `configs.py` 119 | - [ ] `base.py` 120 | - [ ] Better class organizations/namings for `*Configs` and `*Values` 121 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | notion-client>=0.8.0 2 | pydantic~=1.9.0 3 | pandas 4 | dataclasses -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | 5 | def get_requirements(req_file): 6 | reqs = [] 7 | with open(req_file, "r") as fp: 8 | for line in fp.readlines(): 9 | if line.startswith("#") or line.strip() == "": 10 | continue 11 | else: 12 | reqs.append(line.strip()) 13 | return reqs 14 | 15 | 16 | # A trick from https://github.com/jina-ai/jina/blob/79b302c93b01689e82cf4b52f46522eb7497c404/setup.py#L20 17 | libinfo_py = os.path.join("src", "notion_df", "__init__.py") 18 | libinfo_content = open(libinfo_py, "r", encoding="utf8").readlines() 19 | version_line = [l.strip() for l in libinfo_content if l.startswith("__version__")][0] 20 | exec(version_line) # gives __version__ 21
| 22 | setup( 23 | name="notion-df", 24 | version=__version__, 25 | description="Notion-DF: Seamlessly Connecting Notion Database with Pandas DataFrame", 26 | author="Zejiang Shen", 27 | author_email="zejiangshen@gmail.com", 28 | license="MIT", 29 | url="https://github.com/lolipopshock/notion-df", 30 | package_dir={"": "src"}, 31 | packages=find_packages("src"), 32 | long_description=open("README.md", "r", encoding="utf-8").read(), 33 | long_description_content_type="text/markdown", 34 | python_requires=">=3.6", 35 | install_requires=get_requirements("requirements.txt"), 36 | extras_require={ 37 | "dev": [ 38 | "black==21.12b0", 39 | "pytest", 40 | ], 41 | } 42 | ) -------------------------------------------------------------------------------- /src/notion_df/__init__.py: -------------------------------------------------------------------------------- 1 | from notion_df.agent import download, upload, config 2 | from notion_df._pandas import pandas 3 | 4 | __version__ = "0.0.5" 5 | -------------------------------------------------------------------------------- /src/notion_df/_pandas.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from notion_df import upload, download 3 | 4 | 5 | def read_notion( 6 | notion_url: str, 7 | nrows: Optional[int] = None, 8 | resolve_relation_values: bool = False, 9 | errors: str = "strict", 10 | api_key: str = None, 11 | ) -> "pd.DataFrame": 12 | """Download a Notion database as a pandas DataFrame. 13 | 14 | Args: 15 | notion_url (str): 16 | The URL of the Notion database to download from. 17 | nrows (int, optional): 18 | Number of rows of file to read. Useful for reading 19 | pieces of large files. 20 | resolve_relation_values (bool, optional): 21 | By default, when downloading relation columns, notion-df 22 | will just download the object ids. 
If set `resolve_relation_values` 23 | to `True`, notion-df will try to pull the values of the title 24 | column from the target table and map the object ids to those values. 25 | Defaults to False. 26 | errors (str, optional): 27 | You can specify how to handle errors during downloading. There 28 | are several options: 29 | 1. "strict": raise an error when there is one. 30 | 2. "ignore": ignore errors. 31 | 3. "warn": print the error message. 32 | Defaults to "strict". 33 | api_key (str, optional): 34 | The API key of the Notion integration. 35 | Defaults to None. 36 | Returns: 37 | pd.DataFrame: the loaded dataframe. 38 | """ 39 | return download( 40 | notion_url, 41 | nrows=nrows, 42 | resolve_relation_values=resolve_relation_values, 43 | errors=errors, 44 | api_key=api_key, 45 | ) 46 | 47 | 48 | def to_notion( 49 | self, 50 | notion_url: str, 51 | schema=None, 52 | mode: str = "a", 53 | title: str = "", 54 | title_col: str = "", 55 | errors: str = "strict", 56 | resolve_relation_values: bool = False, 57 | create_new_rows_in_relation_target: bool = False, 58 | return_response: bool = False, 59 | api_key: str = None, 60 | ): 61 | 62 | """Upload a dataframe to the specified Notion database. 63 | 64 | Args: 65 | df (pd.DataFrame): 66 | The dataframe to upload. 67 | notion_url (str): 68 | The URL of the Notion page to upload to. 69 | If it is a notion page, then it will create a new database 70 | under that page and upload the dataframe to it. 71 | schema (DatabaseSchema, optional): 72 | The schema of the Notion database. 73 | When not set, it will be inferred from (1) the target 74 | notion database (if it is) then (2) the dataframe itself. 75 | mode (str, optional): 76 | (the function is not supported yet.) 77 | Whether to append to the database or overwrite. 78 | Defaults to "a". 79 | title (str, optional): 80 | The title of the Notion database. 81 | Defaults to "". 82 | title_col (str, optional): 83 | Every Notion database requires a "title" column. 
84 | When the schema is not set, by default it infers the first 85 | column of uploaded dataframe as the title column. You can 86 | set this value to specify the title column. 87 | Defaults to "". 88 | errors (str, optional): 89 | Since we upload the dataframe to Notion row by row, you 90 | can specify how to handle errors during uploading. There 91 | are several options: 92 | 1. "strict": raise an error when there is one. 93 | 2. "ignore": ignore errors and continue uploading 94 | subsequent rows. 95 | 3. "warn": print the error message and continue uploading 96 | Defaults to "strict". 97 | resolve_relation_values (bool, optional): 98 | If `True`, notion-df assumes the items in any relation columns 99 | are not notion object ids, but the value of the corresponding 100 | "title column" in the target table. It will try to convert the 101 | relation column to notion object ids by looking up the value. 102 | Defaults to False. 103 | create_new_rows_in_relation_target (bool, optional): 104 | This argument is used in conjunction with `resolve_relation_values`. 105 | If True, then notion-df will try to create new rows in the target 106 | the relation table if the relation column value is not found there. 107 | Defaults to False. 108 | return_response (bool, optional): 109 | If True, then the function will return a list of responses for 110 | the updates from Notion. 111 | api_key (str, optional): 112 | The API key of the Notion integration. 113 | Defaults to None. 
114 | """ 115 | 116 | return upload( 117 | df=self, 118 | notion_url=notion_url, 119 | schema=schema, 120 | mode=mode, 121 | title=title, 122 | title_col=title_col, 123 | errors=errors, 124 | resolve_relation_values=resolve_relation_values, 125 | create_new_rows_in_relation_target=create_new_rows_in_relation_target, 126 | return_response=return_response, 127 | api_key=api_key, 128 | ) 129 | 130 | 131 | def pandas(): 132 | import pandas as pd 133 | 134 | pd.read_notion = read_notion 135 | pd.DataFrame.to_notion = to_notion 136 | -------------------------------------------------------------------------------- /src/notion_df/agent.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional, Union, Tuple 2 | from datetime import datetime 3 | import warnings 4 | import os 5 | from functools import wraps 6 | 7 | import pandas as pd 8 | from httpx import HTTPStatusError 9 | from notion_client import Client 10 | from notion_client.helpers import get_id 11 | 12 | from notion_df.values import PageProperties, PageProperty 13 | from notion_df.configs import DatabaseSchema, NON_EDITABLE_TYPES 14 | from notion_df.utils import is_uuid, flatten_dict 15 | from notion_df.blocks import parse_blocks, BaseNotionBlock 16 | 17 | API_KEY = None 18 | NOT_REVERSE_DATAFRAME = -1 19 | # whether to reverse the dataframe when performing uploading. 20 | # for some reason, notion will reverse the order of dataframe 21 | # when uploading. 
22 | # -1 for reversing, 1 for not reversing 23 | NOTION_DEFAULT_PAGE_SIZE = 100 24 | NOTION_MAX_PAGE_SIZE = 100 25 | 26 | 27 | def config(api_key: str): 28 | global API_KEY 29 | API_KEY = api_key 30 | 31 | 32 | def _load_api_key(api_key: str) -> str: 33 | if api_key is not None: 34 | return api_key 35 | elif API_KEY is not None: 36 | return API_KEY 37 | elif os.environ.get("NOTION_API_KEY") is not None: 38 | return os.environ.get("NOTION_API_KEY") 39 | else: 40 | raise ValueError("No API key provided") 41 | 42 | 43 | def _is_notion_database(notion_url): 44 | return "?v=" in notion_url.split("/")[-1] 45 | 46 | 47 | def use_client(func): 48 | @wraps(func) 49 | def wrapper(*args, **kwargs): 50 | orig_client = client = kwargs.pop("client", None) 51 | 52 | if client is None: 53 | api_key = _load_api_key(kwargs.pop("api_key", None)) 54 | client = Client(auth=api_key) 55 | out = func(client=client, *args, **kwargs) 56 | 57 | if orig_client is None: 58 | # Automatically close the client if it was not passed in 59 | client.close() 60 | return out 61 | 62 | return wrapper 63 | 64 | 65 | def query_database( 66 | database_id: str, 67 | client: Client, 68 | start_cursor: Optional[str] = None, 69 | page_size: int = NOTION_DEFAULT_PAGE_SIZE, 70 | ): 71 | query_dict = {"database_id": database_id, "page_size": page_size} 72 | if start_cursor is not None: 73 | query_dict["start_cursor"] = start_cursor 74 | # For now, Notion API doesn't allow start_cursor='null' 75 | 76 | query_results = client.databases.query(**query_dict) 77 | 78 | assert query_results["object"] == "list" 79 | return query_results 80 | 81 | 82 | def load_df_from_queries( 83 | database_query_results: List[Dict], 84 | ): 85 | properties = PageProperties.from_raw(database_query_results) 86 | df = properties.to_frame() 87 | 88 | with warnings.catch_warnings(): 89 | warnings.simplefilter("ignore") 90 | # TODO: figure out a better solution 91 | # When doing the following, Pandas may think you are trying 92 | # to add a
new column to the dataframe; it will show the warnings, 93 | # but it will not actually add the column. So we use catch_warnings 94 | # to hide the warnings. 95 | # However this might not be the best way to do so. Some alternatives 96 | # include setting df.attrs https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.attrs.html 97 | # Or even use something like multi-level index for saving notion_ids. 98 | # Nevertheless, all of them seems not that perfect -- for example, 99 | # after copying or slicing, the values will disappear. 100 | # Should try to figure out a better solution in the future. 101 | df.notion_urls = pd.Series([ele["url"] for ele in database_query_results]) 102 | df.notion_ids = pd.Series([ele["id"] for ele in database_query_results]) 103 | df.notion_query_results = database_query_results 104 | # TODO: Rethink if this should be private 105 | 106 | return df 107 | 108 | 109 | def download_df_from_database( 110 | notion_url: str, 111 | client: Client, 112 | nrows: Optional[int] = None, 113 | errors: str = "strict", 114 | ) -> pd.DataFrame: 115 | """Download a Notion database as a pandas DataFrame. 116 | 117 | Args: 118 | notion_url (str): 119 | The URL of the Notion database to download from. 120 | nrows (int, optional): 121 | Number of rows of file to read. Useful for reading 122 | pieces of large files. 123 | api_key (str, optional): 124 | The API key of the Notion integration. 125 | Defaults to None. 126 | client (Client, optional): 127 | The notion client. 128 | Defaults to None. 129 | Returns: 130 | pd.DataFrame: the loaded dataframe. 
131 | """ 132 | if not is_uuid(notion_url): 133 | assert _is_notion_database(notion_url) 134 | database_id = get_id(notion_url) 135 | else: 136 | database_id = notion_url 137 | 138 | # Check the if the id is a database first 139 | try: 140 | retrieve_results = client.databases.retrieve(database_id=database_id) 141 | schema = DatabaseSchema.from_raw(retrieve_results["properties"]) 142 | except HTTPStatusError: 143 | error_msg = ( 144 | f"The object {database_id} might not be a notion database, " 145 | "or integration associated with the API key don't have access " 146 | "to it." 147 | ) 148 | if errors == "strict": 149 | raise ValueError(error_msg) 150 | elif errors == "warn": 151 | warnings.warn(error_msg) 152 | return None 153 | elif errors == "ignore": 154 | return None 155 | 156 | downloaded_rows = [] 157 | 158 | page_size = NOTION_MAX_PAGE_SIZE 159 | if nrows is not None: 160 | if nrows <= NOTION_MAX_PAGE_SIZE: 161 | page_size = nrows 162 | 163 | query_results = query_database(database_id, client, page_size=page_size) 164 | downloaded_rows.extend(query_results["results"]) 165 | 166 | while query_results["has_more"]: 167 | if nrows is not None: 168 | if len(downloaded_rows) >= nrows: 169 | break 170 | else: 171 | page_size = nrows - len(downloaded_rows) 172 | else: 173 | page_size = NOTION_MAX_PAGE_SIZE 174 | 175 | query_results = query_database( 176 | database_id, 177 | client, 178 | start_cursor=query_results["next_cursor"], 179 | page_size=page_size, 180 | ) 181 | downloaded_rows.extend(query_results["results"]) 182 | 183 | df = load_df_from_queries(downloaded_rows) 184 | df = schema.create_df(df) 185 | return df 186 | 187 | 188 | @use_client 189 | def download( 190 | notion_url: str, 191 | nrows: Optional[int] = None, 192 | resolve_relation_values: Optional[bool] = False, 193 | errors: str = "strict", 194 | *, 195 | api_key: str = None, 196 | client: Client = None, 197 | ): 198 | df = download_df_from_database( 199 | notion_url=notion_url, 200 | nrows=nrows, 
201 | client=client, 202 | errors=errors, 203 | ) 204 | if resolve_relation_values: 205 | for col in df.columns: 206 | if df.schema[col].type == "relation": 207 | relation_df = download_df_from_database( 208 | df.schema[col].relation.database_id, 209 | errors="warn", 210 | client=client, 211 | ) 212 | if relation_df is not None: 213 | rel_title_col = relation_df.schema.title_column 214 | obj_id_to_string = { 215 | obj_id: obj_title 216 | for obj_id, obj_title in zip( 217 | relation_df.notion_ids, relation_df[rel_title_col] 218 | ) 219 | } 220 | df[col] = df[col].apply( 221 | lambda row: [obj_id_to_string[ele] for ele in row] 222 | ) 223 | return df 224 | 225 | 226 | def create_database( 227 | page_id: str, client: Client, schema: DatabaseSchema, title: str = "" 228 | ): 229 | response = client.databases.create( 230 | parent={"type": "page_id", "page_id": page_id}, 231 | title=[{"type": "text", "text": {"content": title}}], 232 | properties=schema.query_dict(), 233 | ) 234 | assert response["object"] == "database" 235 | return response 236 | 237 | 238 | def upload_row_to_database(row, database_id, schema, children, client) -> Dict: 239 | 240 | properties = PageProperty.from_series(row, schema).query_dict() 241 | if children: 242 | if not isinstance(children, list): 243 | children = [children] 244 | for cid in range(len(children)): 245 | if isinstance(children[cid], BaseNotionBlock): 246 | children[cid] = flatten_dict(children[cid].dict()) 247 | 248 | response = client.pages.create( 249 | parent={"database_id": database_id}, properties=properties, children=children 250 | ) 251 | else: 252 | response = client.pages.create( 253 | parent={"database_id": database_id}, properties=properties, 254 | ) 255 | return response 256 | 257 | 258 | def upload_to_database(df, databse_id, schema, client, errors, children) -> List[Dict]: 259 | all_response = [] 260 | if children is not None: 261 | assert len(children) == len(df) 262 | children = children[::NOT_REVERSE_DATAFRAME] 263 | 
264 | for idx, (_, row) in enumerate(df[::NOT_REVERSE_DATAFRAME].iterrows(), ): 265 | try: 266 | child = children[idx] if children is not None else None 267 | response = upload_row_to_database(row, databse_id, schema, child, client) 268 | all_response.append(response) 269 | except Exception as e: 270 | if errors == "strict": 271 | raise e 272 | elif errors == "warn": 273 | warnings.warn(f"Encountered errors {e} while uploading row: {row}") 274 | elif errors == "ignore": 275 | continue 276 | return all_response[::NOT_REVERSE_DATAFRAME] 277 | 278 | 279 | def load_database_schema(database_id, client): 280 | return DatabaseSchema.from_raw( 281 | client.databases.retrieve(database_id=database_id)["properties"] 282 | ) 283 | 284 | 285 | @use_client 286 | def upload( 287 | df: pd.DataFrame, 288 | notion_url: str, 289 | schema: DatabaseSchema = None, 290 | mode: str = "a", 291 | title: str = "", 292 | title_col: str = "", 293 | errors: str = "strict", 294 | resolve_relation_values: bool = False, 295 | create_new_rows_in_relation_target: bool = False, 296 | children: List[Union[Dict, BaseNotionBlock]] = None, 297 | return_response: bool = False, 298 | *, 299 | api_key: str = None, 300 | client: Client = None, 301 | ) -> Union[str, Tuple[str, List[Dict]]]: 302 | """Upload a dataframe to the specified Notion database. 303 | 304 | Args: 305 | df (pd.DataFrame): 306 | The dataframe to upload. 307 | notion_url (str): 308 | The URL of the Notion page to upload to. 309 | If it is a notion page, then it will create a new database 310 | under that page and upload the dataframe to it. 311 | schema (DatabaseSchema, optional): 312 | The schema of the Notion database. 313 | When not set, it will be inferred from (1) the target 314 | notion database (if it is) then (2) the dataframe itself. 315 | mode (str, optional): 316 | (the function is not supported yet.) 317 | Whether to append to the database or overwrite. 318 | Defaults to "a". 
319 | title (str, optional): 320 | The title of the Notion database. 321 | Defaults to "". 322 | title_col (str, optional): 323 | Every Notion database requires a "title" column. 324 | When the schema is not set, by default it infers the first 325 | column of uploaded dataframe as the title column. You can 326 | set this value to specify the title column. 327 | Defaults to "". 328 | errors (str, optional): 329 | Since we upload the dataframe to Notion row by row, you 330 | can specify how to handle errors during uploading. There 331 | are several options: 332 | 1. "strict": raise an error when there is one. 333 | 2. "ignore": ignore errors and continue uploading 334 | subsequent rows. 335 | 3. "warn": print the error message and continue uploading 336 | Defaults to "strict". 337 | children (List[Union[Dict, BaseNotionBlock]], optional): 338 | The corresponding children of the uploaded Notion page. It should be 339 | a list of the same length as the dataframe. 340 | resolve_relation_values (bool, optional): 341 | If `True`, notion-df assumes the items in any relation columns 342 | are not notion object ids, but the value of the corresponding 343 | "title column" in the target table. It will try to convert the 344 | relation column to notion object ids by looking up the value. 345 | Defaults to False. 346 | create_new_rows_in_relation_target (bool, optional): 347 | This argument is used in conjunction with `resolve_relation_values`. 348 | If True, then notion-df will try to create new rows in the target 349 | the relation table if the relation column value is not found there. 350 | Defaults to False. 351 | return_response (bool, optional): 352 | If True, then the function will return a list of responses for 353 | the updates from Notion. 354 | api_key (str, optional): 355 | The API key of the Notion integration. 356 | Defaults to None. 357 | client (Client, optional): 358 | The notion client. 359 | Defaults to None. 
360 | """ 361 | if schema is None: 362 | if hasattr(df, "schema"): 363 | schema = df.schema 364 | 365 | if not _is_notion_database(notion_url): 366 | if schema is None: 367 | schema = DatabaseSchema.from_df(df, title_col=title_col) 368 | database_properties = create_database(get_id(notion_url), client, schema, title) 369 | databse_id = database_properties["id"] 370 | notion_url = database_properties["url"] 371 | else: 372 | databse_id = get_id(notion_url) 373 | if schema is None: 374 | schema = load_database_schema(databse_id, client) 375 | 376 | # At this stage, we should have the appropriate schema 377 | assert schema is not None 378 | 379 | if not schema.is_df_compatible(df): 380 | raise ValueError( 381 | "The dataframe is not compatible with the database schema." 382 | "The df contains columns that are not in the databse: " 383 | + f"{[col for col in df.columns if col not in schema.configs.keys()]}" 384 | ) 385 | 386 | if mode not in ("a", "append"): 387 | raise NotImplementedError 388 | # TODO: clean the current values in the notion database (if any) 389 | 390 | df = schema.transform(df, remove_non_editables=True) 391 | 392 | # Assumes the notion database is created and has the appropriate schema 393 | if resolve_relation_values: 394 | for col in df.columns: 395 | if schema[col].type == "relation": 396 | 397 | if df[col].apply(lambda row: all([is_uuid(ele) for ele in row])).all(): 398 | # The column is all in uuid, we don't need to resolve it 399 | continue 400 | 401 | # Try to download the target_relation_df 402 | relation_db_id = schema[col].relation.database_id 403 | relation_df = download_df_from_database( 404 | relation_db_id, 405 | errors="warn", 406 | client=client, 407 | ) 408 | 409 | if relation_df is not None: 410 | rel_title_col = relation_df.schema.title_column 411 | obj_string_to_id = { 412 | obj_title: obj_id 413 | for obj_id, obj_title in zip( 414 | relation_df.notion_ids, relation_df[rel_title_col] 415 | ) 416 | } 417 | 418 | 
all_unique_obj_strings_in_relation_df = set( 419 | relation_df[rel_title_col].tolist() 420 | ) 421 | all_unique_obj_strings_in_df = set(sum(df[col].tolist(), [])) 422 | # This assumes the column has been transformed to a list of lists; 423 | # which is a true assumption given the transformation for the relation 424 | # column (LIST_TRANSFORM). 425 | new_object_strings = all_unique_obj_strings_in_df.difference( 426 | all_unique_obj_strings_in_relation_df 427 | ) 428 | 429 | if create_new_rows_in_relation_target and len(new_object_strings) > 0: 430 | new_relation_df = pd.DataFrame( 431 | list(new_object_strings), columns=[rel_title_col] 432 | ) 433 | responses = upload_to_database( 434 | new_relation_df, 435 | relation_db_id, 436 | relation_df.schema, 437 | client, 438 | "warn", 439 | ) 440 | appended_relation_df = load_df_from_queries(responses) 441 | obj_string_to_id.update( 442 | { 443 | obj_title: obj_id 444 | for obj_id, obj_title in zip( 445 | appended_relation_df.notion_ids, 446 | appended_relation_df[rel_title_col], 447 | ) 448 | } 449 | ) 450 | 451 | df[col] = df[col].apply( 452 | lambda row: [obj_string_to_id[ele] for ele in row if ele in obj_string_to_id] 453 | ) 454 | 455 | response = upload_to_database(df, databse_id, schema, client, errors, children) 456 | 457 | print(f"Your dataframe has been uploaded to the Notion page: {notion_url} .") 458 | if return_response: 459 | return notion_url, response 460 | return notion_url 461 | 462 | @use_client 463 | def download_page_children( 464 | notion_url: str, 465 | api_key: str = None, 466 | client: Client = None, 467 | ): 468 | """Download the children of a Notion page. 469 | 470 | Args: 471 | notion_url (str): 472 | The url of the Notion page. 473 | api_key (str, optional): 474 | The API key of the Notion integration. 475 | Defaults to None. 476 | client (Client, optional): 477 | The notion client. 478 | Defaults to None. 
    """
    page_id = get_id(notion_url)
    r = client.blocks.children.list(block_id=page_id)
    return parse_blocks(r['results'], recursive=True, client=client)
--------------------------------------------------------------------------------
 /src/notion_df/base.py:
--------------------------------------------------------------------------------
from typing import List, Dict, Optional, Any
from enum import Enum
from pydantic import BaseModel, validator, root_validator
import pandas as pd

from notion_df.utils import is_time_string, is_uuid
from notion_df.constants import RICH_TEXT_CONTENT_MAX_LENGTH

### All colors supported in NOTION


class NotionColorEnum(str, Enum):
    # Basic color names Notion accepts, e.g. for select options.
    Default = "default"
    Gray = "gray"
    Brown = "brown"
    Orange = "orange"
    Yellow = "yellow"
    Green = "green"
    Blue = "blue"
    Purple = "purple"
    Pink = "pink"
    Red = "red"


class NotionExtendedColorEnum(str, Enum):
    # Same palette as NotionColorEnum plus the "*_background" variants
    # used by rich-text annotations.
    Default = "default"
    Gray = "gray"
    Brown = "brown"
    Orange = "orange"
    Yellow = "yellow"
    Green = "green"
    Blue = "blue"
    Purple = "purple"
    Pink = "pink"
    Red = "red"
    GrayBackground = "gray_background"
    BrownBackground = "brown_background"
    OrangeBackground = "orange_background"
    YellowBackground = "yellow_background"
    GreenBackground = "green_background"
    BlueBackground = "blue_background"
    PurpleBackground = "purple_background"
    PinkBackground = "pink_background"
    RedBackground = "red_background"


class RichTextTypeEnum(str, Enum):
    # The three rich-text element kinds in the Notion API.
    Text = "text"
    Mention = "mention"
    Equation = "equation"


class SelectOption(BaseModel):
    """One option of a (multi-)select property."""

    id: Optional[str]
    name: str
    color: Optional[NotionColorEnum]

    @classmethod
    def from_value(cls, value: str):
        """Build an option from its display name only."""
        return cls(name=value)

    @validator("name")
    def name_cannot_contain_comma(cls, v):
        # Notion rejects option names that contain commas.
        if "," in v:
            raise ValueError(f"Invalid option name {v} that contains comma")
        return v


class SelectOptions(BaseModel):
    """The option list carried by a select/multi_select configuration."""

    options: Optional[List[SelectOption]]

    @classmethod
    def from_value(cls, values: List[str]):
        return cls(options=[SelectOption.from_value(value) for value in values])


class RelationObject(BaseModel):
    """A reference to a page in a related database, by page id."""

    id: str
    # TODO: Change this to UUID validation

    @classmethod
    def from_value(cls, value: str):
        return cls(id=value)

    @validator("id")
    def id_must_be_uuid(cls, v):
        if not is_uuid(v):
            raise ValueError(f"Invalid id {v}")
        return v


class UserObject(BaseModel):
    """A Notion user reference (used by people properties)."""

    object: str = "user"
    id: str
    type: Optional[str]
    name: Optional[str]
    avatar_url: Optional[str]

    @classmethod
    def from_value(cls, value: str):
        return cls(id=value)

    @validator("object")
    def object_is_name(cls, v):
        # The `object` discriminator must literally be "user".
        if v != "user":
            raise ValueError(f"Invalid user object value {v}")
        return v

    @property
    def value(self):
        return self.name


class NumberFormat(BaseModel):
    format: str


class FormulaProperty(BaseModel):
    expression: str


class RelationProperty(BaseModel):
    """Configuration payload for a relation column."""

    database_id: str
    # TODO: Change this to UUID validation
    synced_property_name: Optional[str]
    synced_property_id: Optional[str]


class DateObject(BaseModel):
    """A Notion date value: a start, an optional end, optional time zone."""

    start: Optional[str] = None
    end: Optional[str] = None
    time_zone: Optional[str] = None

    @validator("start")
    def is_start_ISO8601(cls, v):
        # TODO: Currently it cannot suport time ranges
        if v is not None:
            if not is_time_string(v):
                raise ValueError(
                    "The data start is not appropriately formatted as an ISO 8601 date string."
                )
        return v

    @validator("end")
    def is_end_ISO8601(cls, v):
        if v is not None:
            if not is_time_string(v):
                raise ValueError(
                    "The data end is not appropriately formatted as an ISO 8601 date string."
                )
        return v

    @classmethod
    def from_value(cls, value: str):
        return cls(start=value)
        # TODO: Now we assume the value has already been formated as strings
        # But we should parse them into appropriate formats.

    @property
    def value(self):
        return pd.to_datetime(self.start)
        # TODO: what should the data structure be if self.end is not None?


class RollupProperty(BaseModel):
    """Configuration payload for a rollup column."""

    relation_property_name: Optional[str]
    relation_property_id: Optional[str]
    rollup_property_name: Optional[str]
    rollup_property_id: Optional[str]
    function: str
    # TODO: Change this to ENUM - https://developers.notion.com/reference/create-a-database#rollup-configuration


class RollupObject(BaseModel):
    """A rollup property *value*; the payload lives under the field named by `type`."""

    type: str
    # TODO: Change this to ENUM - https://developers.notion.com/reference/property-value-object#rollup-property-values
    number: Optional[float]
    date: Optional[DateObject]
    array: Optional[List[Any]]
    # Based on the description in https://developers.notion.com/reference/property-value-object#rollup-property-value-element
    # Each element is exactly like property value object, but without the "id" key.
    # As there's a preprocess step in RollupValues, each item of the array must
    # be a property value object.
    function: Optional[str]
    # Though the function param doesn't appear in the documentation, it exists
    # in the return values of the API. Set it as optional for future compatibility.
    # TODO: check in the future if the function param should be updated.
    @validator("type")
    def ensure_non_empty_data(cls, v):
        # Only these three carriers are valid rollup payload types.
        data_type = v
        if data_type is None:
            raise ValueError("RollupObject must have a type.")
        if data_type not in ["number", "date", "array"]:
            raise ValueError(f"RollupObject type {data_type} is invalid.")
        return v

    @property
    def value(self):
        # Dispatch on `type` to the matching payload field.
        if self.type == "number":
            return self.number
        if self.type == "date":
            if self.date is not None:
                return self.date.value
        if self.type == "array":
            return [ele.value for ele in self.array]


class FileTargetObject(BaseModel):
    """The url payload of a file, either Notion-hosted or external."""

    url: str
    expiry_time: Optional[str]

    @property
    def value(self):
        return self.url


class FileObject(BaseModel):
    name: Optional[str]  # TODO: Figure out why this is not required...
    type: str
    file: Optional[FileTargetObject]
    external: Optional[FileTargetObject]

    @property
    def value(self):
        # type == "file" -> Notion-hosted; otherwise fall back to external.
        if self.type == "file":
            if self.file is not None:
                return self.file.value
        else:
            if self.external is not None:
                return self.external.value


class FormulaObject(BaseModel):
    """A computed formula value; the payload lives under the field named by `type`."""

    type: str
    string: Optional[str]
    number: Optional[float]
    boolean: Optional[bool]
    date: Optional[DateObject]

    @property
    def value(self):
        if self.type == "string":
            return self.string
        elif self.type == "number":
            return self.number
        elif self.type == "boolean":
            return self.boolean
        elif self.type == "date":
            if self.date is not None:
                return self.date.value


class AnnotationObject(BaseModel):
    """Styling flags attached to a rich-text element."""

    bold: bool
    italic: bool
    strikethrough: bool
    underline: bool
    code: bool
    color: NotionExtendedColorEnum


class TextLinkObject(BaseModel):
    type: Optional[str] = "url"
    url: str


class TextObject(BaseModel):
    content: str
    link: Optional[TextLinkObject]


class PageReferenceObject(BaseModel):
    id: str


class LinkPreviewMentionObject(BaseModel):
    url: str


class MentionObject(BaseModel):
    """An @-mention inside rich text; payload field selected by `type`."""

    type: str
    user: Optional[UserObject]
    page: Optional[PageReferenceObject]
    database: Optional[PageReferenceObject]
    date: Optional[DateObject]
    link_preview: Optional[LinkPreviewMentionObject]


class EquationObject(BaseModel):
    expression: str


class BaseRichTextObject(BaseModel):
    plain_text: Optional[str]
    # TODO: The Optional[plain_text] is used when creating property values
    href: Optional[str] = None
    annotations: Optional[AnnotationObject] = None
    type: Optional[RichTextTypeEnum]

    @property
    def value(self):
        return self.plain_text


class RichTextObject(BaseRichTextObject):
    """A full rich-text element: text, mention, or equation."""

    text: Optional[TextObject]
    mention: Optional[MentionObject]
    equation: Optional[EquationObject]

    @classmethod
    def from_value(cls, value: str):
        return cls(text=TextObject(content=value))

    @classmethod
    def encode_string(cls, value: str) -> List["RichTextObject"]:
        # Notion caps each rich_text element at RICH_TEXT_CONTENT_MAX_LENGTH
        # characters, so long strings are split into consecutive chunks.
        chunk_size = RICH_TEXT_CONTENT_MAX_LENGTH
        return [
            cls(text=TextObject(content=value[idx : idx + chunk_size]))
            for idx in range(0, len(value), chunk_size)
        ]


class EmojiObject(BaseModel):
    type: str = "emoji"
    emoji: str
--------------------------------------------------------------------------------
 /src/notion_df/blocks.py:
--------------------------------------------------------------------------------
import warnings
from typing import List, Union, Dict, Any, Tuple, Optional, Union

from notion_client import Client
from pydantic import BaseModel, parse_obj_as, validator, root_validator
6 | 7 | from notion_df.base import ( 8 | RichTextObject, 9 | SelectOption, 10 | DateObject, 11 | RelationObject, 12 | UserObject, 13 | RollupObject, 14 | FileObject, 15 | EmojiObject, 16 | FormulaObject, 17 | NotionExtendedColorEnum, 18 | ) 19 | 20 | 21 | class ParentObject(BaseModel): 22 | type: str 23 | database_id: Optional[str] 24 | page_id: Optional[str] 25 | workspace: Optional[bool] 26 | block_id: Optional[str] 27 | 28 | 29 | # BaseClasses 30 | class BaseAttributes(BaseModel): 31 | pass 32 | 33 | 34 | class BaseAttributeWithChildren(BaseModel): 35 | children: Optional[List["BaseNotionBlock"]] 36 | 37 | 38 | class TextBlockAttributes(BaseAttributeWithChildren): 39 | rich_text: List[RichTextObject] 40 | color: Optional[NotionExtendedColorEnum] 41 | 42 | 43 | class HeadingBlockAttributes(BaseAttributeWithChildren): 44 | rich_text: List[RichTextObject] 45 | color: Optional[NotionExtendedColorEnum] 46 | is_toggleable: bool 47 | # Whether or not the heading block is a toggle heading or not. If true, the heading block has toggle and can support children. If false, the heading block is a normal heading block. 
48 | 49 | 50 | class CalloutBlockAttributes(BaseAttributeWithChildren): 51 | rich_text: List[RichTextObject] 52 | icon: Optional[Union[FileObject, EmojiObject]] 53 | color: Optional[NotionExtendedColorEnum] 54 | 55 | 56 | class ToDoBlockAttributes(BaseAttributeWithChildren): 57 | rich_text: List[RichTextObject] 58 | color: Optional[NotionExtendedColorEnum] 59 | checked: Optional[bool] 60 | 61 | 62 | class CodeBlockAttributes(BaseAttributes): 63 | rich_text: List[RichTextObject] 64 | caption: Optional[List[RichTextObject]] 65 | language: Optional[str] # TODO: it's actually an enum 66 | 67 | 68 | class ChildPageAttributes(BaseAttributes): 69 | title: List[RichTextObject] 70 | 71 | 72 | class EmbedBlockAttributes(BaseAttributes): 73 | url: str 74 | 75 | 76 | class ImageBlockAttributes(BaseAttributes, FileObject): 77 | caption: Optional[List[RichTextObject]] 78 | # This is not listed in the docs, but it is in the API response (Nov 2022) 79 | 80 | 81 | class VideoBlockAttributes(BaseAttributes): 82 | video: FileObject 83 | 84 | 85 | class FileBlockAttributes(BaseAttributes): 86 | file: FileObject 87 | caption: Optional[List[RichTextObject]] 88 | 89 | 90 | class PdfBlockAttributes(BaseAttributes): 91 | pdf: FileObject 92 | 93 | 94 | class BookmarkBlockAttributes(BaseAttributes): 95 | url: str 96 | caption: Optional[List[RichTextObject]] 97 | 98 | 99 | class EquationBlockAttributes(BaseAttributes): 100 | expression: str 101 | 102 | 103 | class TableOfContentsAttributes(BaseAttributes): 104 | color: Optional[NotionExtendedColorEnum] 105 | 106 | 107 | class LinkPreviewAttributes(BaseAttributes): 108 | url: str 109 | 110 | 111 | class LinkToPageAttributes(BaseAttributes): 112 | type: str 113 | page_id: Optional[str] 114 | database_id: Optional[str] 115 | 116 | 117 | ATTRIBUTES_MAPPING = { 118 | _cls.__name__: _cls 119 | for _cls in BaseAttributes.__subclasses__() 120 | + BaseAttributeWithChildren.__subclasses__() 121 | } 122 | 123 | 124 | class BaseNotionBlock(BaseModel): 
125 | object: str = "block" 126 | parent: Optional[ParentObject] 127 | id: Optional[str] 128 | type: Optional[str] 129 | created_time: Optional[str] 130 | # created_by 131 | last_edited_time: Optional[str] 132 | # created_by 133 | has_children: Optional[bool] 134 | archived: Optional[bool] 135 | type: str 136 | 137 | @property 138 | def children(self): 139 | return self.__getattribute__(self.type).children 140 | 141 | def set_children(self, value: Any): 142 | self.__getattribute__(self.type).children = value 143 | 144 | 145 | class ParagraphBlock(BaseNotionBlock): 146 | type: str = "paragraph" 147 | paragraph: TextBlockAttributes 148 | 149 | 150 | class HeadingOneBlock(BaseNotionBlock): 151 | type: str = "heading_1" 152 | heading_1: HeadingBlockAttributes 153 | 154 | 155 | class HeadingTwoBlock(BaseNotionBlock): 156 | type: str = "heading_2" 157 | heading_2: HeadingBlockAttributes 158 | 159 | 160 | class HeadingThreeBlock(BaseNotionBlock): 161 | type: str = "heading_3" 162 | heading_3: HeadingBlockAttributes 163 | 164 | 165 | class CalloutBlock(BaseNotionBlock): 166 | type: str = "callout" 167 | callout: CalloutBlockAttributes 168 | 169 | 170 | class QuoteBlock(BaseNotionBlock): 171 | type: str = "quote" 172 | quote: TextBlockAttributes 173 | 174 | 175 | class BulletedListItemBlock(BaseNotionBlock): 176 | type: str = "bulleted_list_item" 177 | bulleted_list_item: TextBlockAttributes 178 | 179 | 180 | class NumberedListItemBlock(BaseNotionBlock): 181 | type: str = "numbered_list_item" 182 | numbered_list_item: TextBlockAttributes 183 | 184 | 185 | class ToDoBlock(BaseNotionBlock): 186 | type: str = "to_do" 187 | to_do: ToDoBlockAttributes 188 | 189 | 190 | class ToggleBlock(BaseNotionBlock): 191 | type: str = "toggle" 192 | toggle: TextBlockAttributes 193 | 194 | 195 | class CodeBlock(BaseNotionBlock): 196 | type: str = "code" 197 | code: CodeBlockAttributes 198 | 199 | 200 | class ChildPageBlock(BaseNotionBlock): 201 | type: str = "child_page" 202 | child_page: 
ChildPageAttributes 203 | 204 | 205 | class ChildDatabaseBlock(BaseNotionBlock): 206 | type: str = "child_database" 207 | child_database: ChildPageAttributes 208 | 209 | 210 | class EmbedBlock(BaseNotionBlock): 211 | type: str = "embed" 212 | embed: EmbedBlockAttributes 213 | 214 | 215 | class ImageBlock(BaseNotionBlock): 216 | type: str = "image" 217 | image: ImageBlockAttributes 218 | 219 | 220 | class VideoBlock(BaseNotionBlock): 221 | type: str = "video" 222 | video: VideoBlockAttributes 223 | 224 | 225 | class FileBlock(BaseNotionBlock): 226 | type: str = "file" 227 | file: FileBlockAttributes 228 | 229 | 230 | class PdfBlock(BaseNotionBlock): 231 | type: str = "pdf" 232 | pdf: PdfBlockAttributes 233 | 234 | 235 | class BookmarkBlock(BaseNotionBlock): 236 | type: str = "bookmark" 237 | bookmark: BookmarkBlockAttributes 238 | 239 | 240 | class EquationBlock(BaseNotionBlock): 241 | type: str = "equation" 242 | equation: EquationBlockAttributes 243 | 244 | 245 | class DividerBlock(BaseNotionBlock): 246 | type: str = "divider" 247 | divider: Optional[Dict] 248 | 249 | 250 | class TableOfContentsBlock(BaseNotionBlock): 251 | type: str = "table_of_contents" 252 | table_of_contents: TableOfContentsAttributes 253 | 254 | 255 | class BreadcrumbBlock(BaseNotionBlock): 256 | type: str = "breadcrumb" 257 | breadcrumb: Optional[Dict] 258 | 259 | 260 | # TODO: Column List and Column Blocks 261 | 262 | 263 | class LinkPreviewBlock(BaseNotionBlock): 264 | type: str = "link_preview" 265 | link_preview: LinkPreviewAttributes 266 | 267 | 268 | # TODO: Template blocks 269 | 270 | 271 | class LinkToPageBlock(BaseNotionBlock): 272 | type: str = "link_to_page" 273 | link_to_page: LinkToPageAttributes 274 | 275 | 276 | # TODO: Synced Block blocks 277 | 278 | # TODO: Table blocks 279 | 280 | # TODO: Table row blocks 281 | 282 | BLOCKS_MAPPING = { 283 | list(_cls.__fields__.keys())[-1]: _cls for _cls in BaseNotionBlock.__subclasses__() 284 | } 285 | 286 | 287 | def 
parse_one_block(data: Dict) -> BaseNotionBlock: 288 | if data["type"] not in BLOCKS_MAPPING: 289 | warnings.warn(f"Unknown block type: {data['type']}") 290 | return None 291 | 292 | return parse_obj_as(BLOCKS_MAPPING[data["type"]], data) 293 | 294 | 295 | def parse_blocks( 296 | data: List[Dict], recursive: bool = False, client: Client = None 297 | ) -> List[BaseNotionBlock]: 298 | all_blocks = [] 299 | for block_data in data: 300 | block = parse_one_block(block_data) 301 | if block.has_children and recursive and client: 302 | block.set_children( 303 | parse_blocks( 304 | client.blocks.children.list(block_id=block.id)["results"], 305 | recursive=recursive, 306 | client=client, 307 | ) 308 | ) 309 | all_blocks.append(block) 310 | return all_blocks 311 | -------------------------------------------------------------------------------- /src/notion_df/configs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional, Callable, Tuple 2 | import warnings 3 | import itertools 4 | from dataclasses import dataclass 5 | 6 | from pydantic import BaseModel, validator, parse_obj_as 7 | from pandas.api.types import ( 8 | is_datetime64_any_dtype, 9 | is_numeric_dtype, 10 | is_object_dtype, 11 | is_bool_dtype, 12 | is_categorical_dtype, 13 | is_list_like, 14 | ) 15 | 16 | from notion_df.base import ( 17 | SelectOptions, 18 | NumberFormat, 19 | RollupProperty, 20 | FormulaProperty, 21 | RelationProperty, 22 | ) 23 | from notion_df.utils import ( 24 | flatten_dict, 25 | IDENTITY_TRANSFORM, 26 | REMOVE_EMPTY_STR_TRANSFORM, 27 | SECURE_STR_TRANSFORM, 28 | SECURE_BOOL_TRANSFORM, 29 | SECURE_TIME_TRANSFORM, 30 | LIST_TRANSFORM, 31 | ) 32 | 33 | 34 | class BasePropertyConfig(BaseModel): 35 | id: Optional[str] 36 | type: Optional[str] 37 | 38 | def query_dict(self): 39 | return flatten_dict(self.dict()) 40 | 41 | @validator("type", always=True) 42 | def automatically_set_type_value(cls, v): 43 | _type = 
list(cls.__fields__.keys())[-1] 44 | if v is None: 45 | return _type 46 | else: 47 | assert _type == v, f"{_type} != {v}" 48 | return _type 49 | 50 | 51 | class TitleConfig(BasePropertyConfig): 52 | title: Dict = {} 53 | 54 | # TODO: Make the validator automatically geneerated 55 | @validator("title") 56 | def title_is_empty_dict(cls, v): 57 | if v: 58 | raise ValueError("The title dict must be empty") 59 | return v 60 | 61 | 62 | class RichTextConfig(BasePropertyConfig): 63 | rich_text: Dict = {} 64 | 65 | @validator("rich_text") 66 | def title_is_empty_dict(cls, v): 67 | if v: 68 | raise ValueError("The rich_text dict must be empty") 69 | return v 70 | 71 | 72 | class NumberConfig(BasePropertyConfig): 73 | number: NumberFormat 74 | 75 | # TODO:Add enum based on https://developers.notion.com/reference/create-a-database#number-configuration 76 | 77 | 78 | class SelectConfig(BasePropertyConfig): 79 | select: Optional[SelectOptions] 80 | 81 | 82 | class MultiSelectConfig(BasePropertyConfig): 83 | multi_select: Optional[SelectOptions] 84 | 85 | 86 | class DateConfig(BasePropertyConfig): 87 | date: Dict = {} 88 | 89 | @validator("date") 90 | def title_is_empty_dict(cls, v): 91 | if v: 92 | raise ValueError("The date dict must be empty") 93 | return v 94 | 95 | 96 | class PeopleConfig(BasePropertyConfig): 97 | people: Dict = {} 98 | 99 | @validator("people") 100 | def title_is_empty_dict(cls, v): 101 | if v: 102 | raise ValueError("The people dict must be empty") 103 | return v 104 | 105 | 106 | class FilesConfig(BasePropertyConfig): 107 | files: Dict = {} 108 | 109 | @validator("files") 110 | def title_is_empty_dict(cls, v): 111 | if v: 112 | raise ValueError("The files dict must be empty") 113 | return v 114 | 115 | 116 | class CheckboxConfig(BasePropertyConfig): 117 | checkbox: Dict = {} 118 | 119 | @validator("checkbox") 120 | def title_is_empty_dict(cls, v): 121 | if v: 122 | raise ValueError("The checkbox dict must be empty") 123 | return v 124 | 125 | 126 | class 
URLConfig(BasePropertyConfig): 127 | url: Dict = {} 128 | 129 | @validator("url") 130 | def title_is_empty_dict(cls, v): 131 | if v: 132 | raise ValueError("The url dict must be empty") 133 | return v 134 | 135 | 136 | class EmailConfig(BasePropertyConfig): 137 | email: Dict = {} 138 | 139 | @validator("email") 140 | def title_is_empty_dict(cls, v): 141 | if v: 142 | raise ValueError("The email dict must be empty") 143 | return v 144 | 145 | 146 | class PhoneNumberConfig(BasePropertyConfig): 147 | phone_number: Dict = {} 148 | 149 | @validator("phone_number") 150 | def title_is_empty_dict(cls, v): 151 | if v: 152 | raise ValueError("The phone_number dict must be empty") 153 | return v 154 | 155 | 156 | class FormulaConfig(BasePropertyConfig): 157 | formula: FormulaProperty 158 | 159 | 160 | class RelationConfig(BasePropertyConfig): 161 | relation: RelationProperty 162 | 163 | 164 | class RollupConfig(BasePropertyConfig): 165 | rollup: RollupProperty 166 | 167 | 168 | class CreatedTimeConfig(BasePropertyConfig): 169 | created_time: Dict = {} 170 | 171 | @validator("created_time") 172 | def title_is_empty_dict(cls, v): 173 | if v: 174 | raise ValueError("The created_time dict must be empty") 175 | return v 176 | 177 | 178 | class CreatedByConfig(BasePropertyConfig): 179 | created_by: Dict = {} 180 | 181 | @validator("created_by") 182 | def title_is_empty_dict(cls, v): 183 | if v: 184 | raise ValueError("The created_by dict must be empty") 185 | return v 186 | 187 | 188 | class LastEditedTimeConfig(BasePropertyConfig): 189 | last_edited_time: Dict = {} 190 | 191 | @validator("last_edited_time") 192 | def title_is_empty_dict(cls, v): 193 | if v: 194 | raise ValueError("The last_edited_time dict must be empty") 195 | return v 196 | 197 | 198 | class LastEditedByConfig(BasePropertyConfig): 199 | last_edited_by: Dict = {} 200 | 201 | @validator("last_edited_by") 202 | def title_is_empty_dict(cls, v): 203 | if v: 204 | raise ValueError("The last_edited_by dict must be 
empty") 205 | return v 206 | 207 | 208 | def _convert_classname_to_typename(s): 209 | import re 210 | 211 | s = s.replace("Config", "").replace("URL", "Url") 212 | return re.sub(r"(? BasePropertyConfig: 232 | return parse_obj_as(CONFIGS_MAPPING[data["type"]], data) 233 | 234 | 235 | CONFIGS_DF_TRANSFORMER = { 236 | "title": SECURE_STR_TRANSFORM, 237 | "rich_text": SECURE_STR_TRANSFORM, 238 | "number": None, 239 | "select": REMOVE_EMPTY_STR_TRANSFORM, 240 | "multi_select": lambda lst: [str(ele) for ele in lst] 241 | if is_list_like(lst) 242 | else str(lst), 243 | "date": SECURE_TIME_TRANSFORM, 244 | "checkbox": SECURE_BOOL_TRANSFORM, 245 | ### Notion-specific Properties ### 246 | # Currently we don't automatically convert these properties 247 | # We assume the users will use the correct type and we don't need to perform any transformation 248 | "people": IDENTITY_TRANSFORM, 249 | "relation": LIST_TRANSFORM, 250 | "url": REMOVE_EMPTY_STR_TRANSFORM, 251 | "email": REMOVE_EMPTY_STR_TRANSFORM, 252 | ### TODO: check the following ### 253 | "files": SECURE_STR_TRANSFORM, 254 | "phone_number": SECURE_STR_TRANSFORM, 255 | "formula": SECURE_STR_TRANSFORM, 256 | "rollup": SECURE_STR_TRANSFORM, 257 | "created_time": SECURE_STR_TRANSFORM, 258 | "created_by": SECURE_STR_TRANSFORM, 259 | "last_edited_time": SECURE_STR_TRANSFORM, 260 | "last_edited_by": SECURE_STR_TRANSFORM, 261 | } 262 | 263 | 264 | def _infer_series_config(column: "pd.Series") -> BasePropertyConfig: 265 | dtype = column.dtype 266 | 267 | if is_object_dtype(dtype): 268 | if all(is_list_like(ele) for ele in column): 269 | all_possible_values = set( 270 | list(itertools.chain.from_iterable(column.to_list())) 271 | ) 272 | all_possible_values = [str(ele) for ele in all_possible_values] 273 | return MultiSelectConfig( 274 | multi_select=SelectOptions.from_value(all_possible_values), 275 | ) 276 | else: 277 | return RichTextConfig() 278 | if is_numeric_dtype(dtype): 279 | return 
NumberConfig(number=NumberFormat(format="number")) 280 | if is_bool_dtype(dtype): 281 | return CheckboxConfig() 282 | if is_categorical_dtype(dtype): 283 | return SelectConfig( 284 | select=SelectOptions.from_value([str for cat in dtype.categories]), 285 | ) 286 | if is_datetime64_any_dtype(dtype): 287 | return DateConfig() 288 | 289 | return None 290 | 291 | 292 | @dataclass 293 | class DatabaseSchema: 294 | 295 | configs: Dict[str, BasePropertyConfig] 296 | 297 | @classmethod 298 | def from_raw(cls, configs: Dict) -> "DatabaseSchema": 299 | 300 | configs = {key: parse_single_config(config) for key, config in configs.items()} 301 | return cls(configs) 302 | 303 | def __getitem__(self, key: int): 304 | return self.configs[key] 305 | 306 | def query_dict(self) -> Dict: 307 | return {key: config.query_dict() for key, config in self.configs.items()} 308 | 309 | @classmethod 310 | def from_df( 311 | cls, df: "pd.DataFrame", title_col: Optional[str] = None 312 | ) -> "DatabaseSchema": 313 | """Automatically infer the schema from a pandas dataframe""" 314 | df = df.infer_objects() 315 | 316 | configs = {} 317 | for col in df.columns: 318 | config = _infer_series_config(df[col]) 319 | configs[col] = config 320 | 321 | if title_col is not None: 322 | configs[title_col] = TitleConfig() 323 | else: 324 | configs[df.columns[0]] = TitleConfig() 325 | 326 | return cls(configs) 327 | 328 | @property 329 | def title_column(self) -> Optional[str]: 330 | for key, config in self.configs.items(): 331 | if isinstance(config, TitleConfig) or config.type == "title": 332 | # TODO: Rethink this 333 | return key 334 | 335 | def create_df(self, df) -> "pd.DataFrame": 336 | 337 | notion_urls = df.notion_urls 338 | notion_ids = df.notion_ids 339 | notion_query_results = df.notion_query_results 340 | 341 | df = df.copy() 342 | # Ensure the column integrity 343 | # See the issue mentioned in https://github.com/lolipopshock/notion-df/issues/17 344 | columns = [col for col in df.columns if col in 
self.configs] 345 | df = df[columns] 346 | 347 | df.schema = self 348 | 349 | with warnings.catch_warnings(): 350 | warnings.simplefilter("ignore") 351 | df.notion_urls = notion_urls 352 | df.notion_ids = notion_ids 353 | df.notion_query_results = notion_query_results 354 | 355 | return df 356 | 357 | def is_df_compatible(self, df: "pd.DataFrame") -> bool: 358 | """Validate the dataframe against the schema""" 359 | 360 | if hasattr(df, "schema"): 361 | if not df.schema == self: 362 | return False 363 | 364 | # TODO: There might miss one thing: if the rollup is not configured 365 | # the database reterive result will be empty for that column. 366 | # But the database query will return the value for that column 367 | # (even if that's empty). So this would miss this check... 368 | else: 369 | for col in df.columns: 370 | if col not in self.configs.keys(): 371 | return False 372 | 373 | # TODO: Add more advanced check on datatypes 374 | return True 375 | 376 | def transform( 377 | self, df: "pd.DataFrame", remove_non_editables=False 378 | ) -> "pd.DataFrame": 379 | """Transform the df such that the data values are compatible with the schema. 380 | It assumes the df has already been validated against the schema. 
381 | """ 382 | df = df.copy() 383 | used_columns = [] 384 | for col in df.columns: 385 | if self[col].type in NON_EDITABLE_TYPES: 386 | continue # Skip non-editable columns 387 | 388 | transform = CONFIGS_DF_TRANSFORMER[self[col].type] 389 | if transform is not None: 390 | df[col] = df[col].apply(transform) 391 | used_columns.append(col) 392 | if remove_non_editables: 393 | return df[used_columns] 394 | return df 395 | -------------------------------------------------------------------------------- /src/notion_df/constants.py: -------------------------------------------------------------------------------- 1 | # See https://developers.notion.com/reference/request-limits 2 | 3 | RICH_TEXT_CONTENT_MAX_LENGTH = 2000 4 | RICH_TEXT_LINK_MAX_LENGTH = 1000 5 | EQUATION_EXPRESSION_MAX_LENGTH = 1000 -------------------------------------------------------------------------------- /src/notion_df/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional, Union, Any 2 | from datetime import datetime 3 | from dateutil.parser import parse 4 | from uuid import UUID 5 | 6 | import pandas as pd 7 | from pandas.api.types import is_array_like, is_datetime64_any_dtype, is_list_like 8 | 9 | 10 | def flatten_dict(data: Dict): 11 | """Remove entries in dict whose values are None""" 12 | if isinstance(data, dict): 13 | return { 14 | key: flatten_dict(value) for key, value in data.items() if value is not None 15 | } 16 | elif isinstance(data, list) or isinstance(data, tuple): 17 | return [flatten_dict(value) for value in data] 18 | else: 19 | return data 20 | 21 | 22 | def is_item_empty(item: Any) -> bool: 23 | 24 | if item is None or item == []: 25 | return True 26 | 27 | isna = pd.isna(item) 28 | if is_array_like(isna): 29 | isna = isna.all() 30 | # TODO: Rethink it is all or any 31 | 32 | return isna 33 | 34 | 35 | def is_time_string(s: str) -> bool: 36 | 37 | # Ref 
https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format 38 | try: 39 | parse(s) 40 | return True 41 | except ValueError: 42 | return False 43 | 44 | 45 | def is_uuid(s: str) -> bool: 46 | # Kind of an OK solution.. But can be further improved? 47 | try: 48 | UUID(str(s)) 49 | return True 50 | except ValueError: 51 | return False 52 | 53 | 54 | ISO8601_REGEX = r"^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$" 55 | # See https://stackoverflow.com/questions/41129921/validate-an-iso-8601-datetime-string-in-python 56 | ISO8601_STRFTIME_TRANSFORM = lambda ele: ele.strftime("%Y-%m-%dT%H:%M:%SZ") 57 | 58 | strtime_transform = lambda ele: parse(ele).strftime("%Y-%m-%dT%H:%M:%SZ") 59 | datetime_transform = lambda ele: ele.strftime("%Y-%m-%dT%H:%M:%SZ") 60 | 61 | 62 | def transform_time(s: Any) -> str: 63 | if not is_item_empty(s): 64 | if isinstance(s, str): 65 | return strtime_transform(s) 66 | elif isinstance(s, datetime): 67 | return datetime_transform(s) 68 | elif is_datetime64_any_dtype(s): 69 | return datetime_transform(s) 70 | 71 | 72 | IDENTITY_TRANSFORM = lambda ele: ele 73 | SECURE_STR_TRANSFORM = lambda ele: str(ele) if not is_item_empty(ele) else "" 74 | LIST_TRANSFORM = lambda ele: ele if is_list_like(ele) else [ele] 75 | REMOVE_EMPTY_STR_TRANSFORM = ( 76 | lambda ele: None if ele == "" or ele is None or pd.isna(ele) else SECURE_STR_TRANSFORM(ele) 77 | ) 78 | SECURE_BOOL_TRANSFORM = lambda ele: bool(ele) if not is_item_empty(ele) else None 79 | SECURE_TIME_TRANSFORM = transform_time 80 | -------------------------------------------------------------------------------- /src/notion_df/values.py: -------------------------------------------------------------------------------- 1 | ### Referring to https://developers.notion.com/reference/page#property-value-object 2 | 3 | from typing import List, Dict, Optional, Union, Any 4 
from dataclasses import dataclass
from copy import deepcopy
import numbers

from pydantic import BaseModel, parse_obj_as, validator, root_validator
import pandas as pd
from pandas.api.types import is_array_like

from notion_df.base import (
    RichTextObject,
    SelectOption,
    DateObject,
    RelationObject,
    UserObject,
    RollupObject,
    FileObject,
    FormulaObject
)
from notion_df.utils import (
    flatten_dict,
    is_list_like
)


class BasePropertyValues(BaseModel):
    """Base class for property-value objects; payload lives in the subclass field."""

    id: Optional[str]  # TODO: Rethink whether we can do this
    # The Optional[id] is used when creating property values
    type: Optional[str]

    # TODO: Add abstractmethods for them
    @classmethod
    def from_value(cls, value):
        """Construct the property value from a plain Python value."""
        pass

    @property
    def value(self):
        """The plain Python value stored in this property."""
        pass

    def query_dict(self):
        # Drop None entries so the payload is accepted by the Notion API.
        return flatten_dict(self.dict())


class TitleValues(BasePropertyValues):
    title: List[RichTextObject]

    @property
    def value(self) -> Optional[str]:
        # Join multi-chunk rich text with spaces; None when empty.
        return (
            None
            if len(self.title) == 0
            else " ".join([text.value for text in self.title])
        )

    @classmethod
    def from_value(cls, value):
        return cls(title=RichTextObject.encode_string(value))
        # TODO: Rethink whether we should split input string to multiple elements in the list


class RichTextValues(BasePropertyValues):
    rich_text: List[RichTextObject]

    @property
    def value(self) -> Optional[str]:
        return (
            None
            if len(self.rich_text) == 0
            else " ".join([text.value for text in self.rich_text])
        )

    @classmethod
    def from_value(cls, value: str):
        return cls(rich_text=RichTextObject.encode_string(value))


class NumberValues(BasePropertyValues):
    number: Optional[Union[float, int]]

    @property
    def value(self) -> Optional[Union[float, int]]:
        # NOTE: annotation fixed — this returns the numeric payload, not str.
        return self.number

    @classmethod
    def from_value(cls, value: Union[float, int]):
        return cls(number=value)


class SelectValues(BasePropertyValues):
    select: Optional[SelectOption]

    @property
    def value(self) -> Optional[str]:
        return self.select.name if self.select else None

    @classmethod
    def from_value(cls, value: str):
        return cls(select=SelectOption.from_value(value))


class MultiSelectValues(BasePropertyValues):
    multi_select: List[SelectOption]

    @property
    def value(self) -> List[str]:
        return [select.name for select in self.multi_select]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        # A scalar is wrapped into a single-option list.
        if is_list_like(values):
            return cls(
                multi_select=[SelectOption.from_value(value) for value in values]
            )
        else:
            return cls(multi_select=[SelectOption.from_value(values)])


class DateValues(BasePropertyValues):
    date: Optional[DateObject]

    @property
    def value(self) -> str:
        # NOTE(review): actually returns DateObject.value (a pandas timestamp)
        # or None, not str — confirm and fix the annotation upstream.
        return self.date.value if self.date else None

    @classmethod
    def from_value(cls, value: str):
        return cls(date=DateObject.from_value(value))


class FormulaValues(BasePropertyValues):
    formula: FormulaObject

    @property
    def value(self):
        return self.formula.value


class RelationValues(BasePropertyValues):
    relation: List[RelationObject]

    @property
    def value(self) -> List[str]:
        return [relation.id for relation in self.relation]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        if is_list_like(values):
            return cls(relation=[RelationObject.from_value(value) for value in values])
        else:
            return cls(relation=[RelationObject.from_value(values)])


class PeopleValues(BasePropertyValues):
    people: List[UserObject]

    @property
    def value(self) -> List[str]:
        return [people.id for people in self.people]

    @classmethod
    def from_value(cls, values: Union[List[str], str]):
        if is_list_like(values):
            return cls(people=[UserObject.from_value(value) for value in values])
        else:
            return cls(people=[UserObject.from_value(values)])


class FilesValues(BasePropertyValues):
    files: List[FileObject]

    @property
    def value(self) -> List[str]:
        return [file.value for file in self.files]


class CheckboxValues(BasePropertyValues):
    checkbox: Optional[bool]

    @property
    def value(self) -> Optional[bool]:
        return self.checkbox

    @classmethod
    def from_value(cls, value: bool):
        return cls(checkbox=value)


class URLValues(BasePropertyValues):
    url: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.url

    @classmethod
    def from_value(cls, value: Optional[str]):
        return cls(url=value)

    def query_dict(self):
        res = flatten_dict(self.dict())
        if "url" not in res:
            res["url"] = None
            # The url value is required by the notion API
        return res


class EmailValues(BasePropertyValues):
    email: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.email

    @classmethod
    def from_value(cls, value: str):
        return cls(email=value)


class PhoneNumberValues(BasePropertyValues):
    phone_number: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return self.phone_number

    @classmethod
    def from_value(cls, value: str):
        return cls(phone_number=value)


class CreatedTimeValues(BasePropertyValues):
    created_time: Optional[str]

    @property
    def value(self) -> Optional[str]:
        return
self.created_time 238 | 239 | @classmethod 240 | def from_value(cls, value: str): 241 | return cls(created_time=value) 242 | 243 | 244 | class CreatedByValues(BasePropertyValues): 245 | created_by: UserObject 246 | 247 | @property 248 | def value(self) -> List[str]: 249 | return self.created_by.value 250 | 251 | 252 | class LastEditedTimeValues(BasePropertyValues): 253 | last_edited_time: str 254 | 255 | @property 256 | def value(self) -> Optional[str]: 257 | return self.last_edited_time 258 | 259 | @classmethod 260 | def from_value(cls, value: str): 261 | return cls(last_edited_time=value) 262 | 263 | 264 | class LastEditedByValues(BasePropertyValues): 265 | last_edited_by: UserObject 266 | 267 | @property 268 | def value(self) -> List[str]: 269 | return self.last_edited_by.value 270 | 271 | 272 | VALUES_MAPPING = { 273 | list(_cls.__fields__.keys())[-1]: _cls 274 | for _cls in BasePropertyValues.__subclasses__() 275 | if len(_cls.__fields__) 276 | == 3 # TODO: When all classes have been implemented, we can just remove this check 277 | } 278 | 279 | 280 | class RollupValues(BasePropertyValues): 281 | rollup: RollupObject 282 | 283 | @validator("rollup", pre=True) 284 | def check_rollup_values(cls, val): 285 | val = deepcopy(val) 286 | if val.get("array") is not None: 287 | val["array"] = [ 288 | parse_obj_as(VALUES_MAPPING[data["type"]], data) 289 | for data in val["array"] 290 | ] 291 | return val 292 | 293 | @property 294 | def value(self): 295 | return self.rollup.value 296 | 297 | 298 | VALUES_MAPPING["rollup"] = RollupValues 299 | 300 | 301 | def parse_single_values(data: Dict) -> BasePropertyValues: 302 | return parse_obj_as(VALUES_MAPPING[data["type"]], data) 303 | 304 | 305 | def _guess_value_schema(val: Any) -> object: 306 | 307 | if isinstance(val, str): 308 | return RichTextValues 309 | elif isinstance(val, numbers.Number): 310 | return NumberValues 311 | elif isinstance(val, bool): 312 | return CheckboxValues 313 | else: 314 | raise 
ValueError(f"Unknown value type: {type(val)}") 315 | 316 | 317 | def _is_item_empty(item): 318 | 319 | if item is None or item == []: 320 | return True 321 | 322 | isna = pd.isna(item) 323 | if is_array_like(isna): 324 | isna = isna.all() 325 | # TODO: Rethink it is all or any 326 | 327 | return isna 328 | 329 | 330 | RESERVED_VALUES = ["url"] 331 | # Even if the value is none, we still want to keep it in the dataframe 332 | 333 | 334 | def _is_reserved_value(key, schema): 335 | return schema[key].type in RESERVED_VALUES 336 | 337 | 338 | def parse_value_with_schema( 339 | idx: int, key: str, value: Any, schema: "DatabaseSchema" 340 | ) -> BasePropertyValues: 341 | # TODO: schema shouldn't be allowed to be empty in the future version 342 | # schema should be determined at the dataframe level. 343 | 344 | if schema is not None: 345 | value_func = VALUES_MAPPING[schema[key].type] 346 | else: 347 | if idx == 0: 348 | # TODO: Brutally enforce the first one to be the title, though 349 | # should be optimized in future versions 350 | value_func = TitleValues 351 | value = str(value) 352 | else: 353 | value_func = _guess_value_schema(value) 354 | 355 | return value_func.from_value(value) 356 | 357 | 358 | @dataclass 359 | class PageProperty: 360 | """This class is used to parse properties of a single Notion Page. 
361 | 362 | :: example: 363 | 364 | >>> data = \ 365 | {"Description": {"id": "ji%3Dc", "type": "rich_text", "rich_text": []}, 366 | "Created": {"id": "mbOA", "type": "date", "date": None}, 367 | "Title": {"id": "title", "type": "title", "title": []}} 368 | >>> property = PageProperty.from_raw(data) 369 | """ 370 | 371 | properties: Dict[str, BasePropertyValues] 372 | 373 | @classmethod 374 | def from_raw(cls, properties: Dict) -> "PageProperty": 375 | properties = {k: parse_single_values(v) for k, v in properties.items()} 376 | return cls(properties) 377 | 378 | def __getitem__(self, key): 379 | return self.properties[key] 380 | 381 | def to_series(self): 382 | return pd.Series( 383 | {key: property.value for key, property in self.properties.items()} 384 | ) 385 | 386 | @classmethod 387 | def from_series( 388 | cls, series: pd.Series, schema: "DatabaseSchema" = None 389 | ) -> "PageProperty": 390 | return cls( 391 | { 392 | key: parse_value_with_schema(idx, key, val, schema) 393 | for idx, (key, val) in enumerate(series.items()) 394 | if not _is_item_empty(val) or _is_reserved_value(key, schema) 395 | } 396 | ) 397 | 398 | def query_dict(self) -> Dict: 399 | return {key: property.query_dict() for key, property in self.properties.items()} 400 | 401 | 402 | @dataclass 403 | class PageProperties: 404 | """This class is used to parse multiple page properties within a database 405 | 406 | :: example: 407 | 408 | >>> data = \ 409 | [ 410 | { 411 | "object": "page", 412 | "id": "xxxx", 413 | "created_time": "2032-01-03T00:00:00.000Z", 414 | "properties": { 415 | "Description": {"id": "ji%3Dc", "type": "rich_text", "rich_text": []}, 416 | "Created": {"id": "mbOA", "type": "date", "date": None}, 417 | "Title": {"id": "title", "type": "title", "title": []} 418 | } 419 | }, 420 | { 421 | "object": "page", 422 | "id": "xxxx", 423 | "created_time": "2032-01-03T00:00:01.000Z", 424 | "properties": { 425 | "Description": {"id": "ji%3Dc", "type": "rich_text", "rich_text": []}, 426 
| "Created": {"id": "mbOA", "type": "date", "date": None}, 427 | "Title": {"id": "title", "type": "title", "title": []} 428 | } 429 | } 430 | ] 431 | >>> property = PageProperties.from_raw(data) 432 | """ 433 | 434 | page_properties: List[PageProperty] 435 | 436 | @classmethod 437 | def from_raw(cls, properties: List[Dict]) -> "PageProperties": 438 | page_properties = [ 439 | PageProperty.from_raw(property["properties"]) for property in properties 440 | ] 441 | return cls(page_properties) 442 | 443 | def __getitem__(self, key: int): 444 | return self.page_properties[key] 445 | 446 | def to_frame(self): 447 | return pd.DataFrame([property.to_series() for property in self.page_properties]) 448 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from notion_df.agent import download 4 | 5 | NOTION_API_KEY = os.environ.get("NOTION_API_KEY") 6 | NOTION_LARGE_DF = os.environ.get("NOTION_LARGE_DF") 7 | NOTION_LARGE_DF_ROWS = 150 8 | 9 | def test_nrows(): 10 | if not NOTION_LARGE_DF or not NOTION_API_KEY: 11 | pytest.skip("API key not provided") 12 | 13 | df = download(NOTION_LARGE_DF, api_key=NOTION_API_KEY) 14 | assert len(df) == NOTION_LARGE_DF_ROWS 15 | 16 | df = download(NOTION_LARGE_DF, nrows=101, api_key=NOTION_API_KEY) 17 | assert len(df) == 101 18 | 19 | df = download(NOTION_LARGE_DF, nrows=15, api_key=NOTION_API_KEY) 20 | assert len(df) == 15 -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pytest 4 | import notion_df 5 | import pandas as pd 6 | from pydantic import ValidationError 7 | from notion_df.agent import download, upload 8 | 9 | NOTION_API_KEY = os.environ.get("NOTION_API_KEY") 10 | 11 | 12 | def 
test_select_option(): 13 | schema = notion_df.configs.DatabaseSchema( 14 | {"options": notion_df.configs.MultiSelectConfig()} 15 | ) 16 | 17 | df = pd.DataFrame([{"options": [1, 2, 3]}]) 18 | dff = schema.transform(df) 19 | notion_df.values.PageProperty.from_series(dff.iloc[0], schema) 20 | 21 | # Not working because of commas in the option string 22 | df = pd.DataFrame([{"options": ["a,b", "c,d"]}]) 23 | dff = schema.transform(df) 24 | with pytest.raises(ValidationError): 25 | notion_df.values.PageProperty.from_series(dff.iloc[0], schema) 26 | 27 | # The following also checks whether it can convert elements into strings 28 | df = pd.DataFrame([{"options": [[1, 2, 3], [4, 5, 6]]}]) 29 | dff = schema.transform(df) 30 | with pytest.raises(ValidationError): 31 | notion_df.values.PageProperty.from_series(dff.iloc[0], schema) 32 | 33 | 34 | def test_rollup(): 35 | NOTION_ROLLUP_DF = os.environ.get("NOTION_ROLLUP_DF") 36 | 37 | if not NOTION_ROLLUP_DF or not NOTION_API_KEY: 38 | pytest.skip("API key not provided") 39 | 40 | # Ensure the rollup values can be downloaded and uploaded 41 | df = download(NOTION_ROLLUP_DF, api_key=NOTION_API_KEY) 42 | upload(df[:2], NOTION_ROLLUP_DF, api_key=NOTION_API_KEY) 43 | # TODO: Add remove rollup values 44 | 45 | 46 | def test_files_edit_by(): 47 | NOTION_FILES_DF = os.environ.get("NOTION_FILES_DF") 48 | 49 | if not NOTION_FILES_DF or not NOTION_API_KEY: 50 | pytest.skip("API key not provided") 51 | 52 | df = download(NOTION_FILES_DF, api_key=NOTION_API_KEY) 53 | 54 | 55 | def test_formula(): 56 | NOTION_FORMULA_DF = os.environ.get("NOTION_FORMULA_DF") 57 | 58 | if not NOTION_FORMULA_DF or not NOTION_API_KEY: 59 | pytest.skip("API key not provided") 60 | 61 | df = download(NOTION_FORMULA_DF, api_key=NOTION_API_KEY) 62 | 63 | 64 | def test_relation(): 65 | NOTION_RELATION_DF = os.environ.get("NOTION_RELATION_DF") 66 | NOTION_RELATION_TARGET_DF = os.environ.get("NOTION_RELATION_TARGET_DF") 67 | 68 | if not NOTION_RELATION_DF or not 
NOTION_RELATION_TARGET_DF or not NOTION_API_KEY: 69 | pytest.skip("API key not provided") 70 | 71 | # download: resolve 72 | # upload: resolve 73 | df = download( 74 | NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=True 75 | ) 76 | df_target = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY) 77 | 78 | assert "private_page" not in df.columns 79 | # See https://github.com/lolipopshock/notion-df/issues/17 80 | 81 | ## witout a new key 82 | upload( 83 | df[:1], 84 | NOTION_RELATION_DF, 85 | resolve_relation_values=True, 86 | create_new_rows_in_relation_target=True, 87 | ) 88 | df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY) 89 | assert len(df_target_new) == len(df_target) 90 | 91 | ## with a new key 92 | rint = random.randint(0, 100000) 93 | df.at[0, "Related to Tasks"] = [f"test {rint}"] 94 | upload( 95 | df[:1], 96 | NOTION_RELATION_DF, 97 | resolve_relation_values=True, 98 | create_new_rows_in_relation_target=True, 99 | ) 100 | df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY) 101 | assert len(df_target_new) == len(df_target) + 1 102 | df_target_new.iloc[-1]["name"] == f"test {rint}" 103 | 104 | # download: not-resolve 105 | # upload: resolve 106 | # Avoids creating new rows for uuid only lists 107 | df = download( 108 | NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=False 109 | ) 110 | df_target = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY) 111 | 112 | upload( 113 | df[:1], 114 | NOTION_RELATION_DF, 115 | resolve_relation_values=True, 116 | create_new_rows_in_relation_target=True, 117 | ) 118 | df_target_new = download(NOTION_RELATION_TARGET_DF, api_key=NOTION_API_KEY) 119 | assert len(df_target_new) == len(df_target) 120 | 121 | # download: resolve 122 | # upload: not-resolve 123 | # Raises error 124 | df = download( 125 | NOTION_RELATION_DF, api_key=NOTION_API_KEY, resolve_relation_values=True 126 | ) 127 | 128 | with 
pytest.raises(ValidationError): 129 | upload( 130 | df[:1], 131 | NOTION_RELATION_DF, 132 | resolve_relation_values=False, 133 | ) 134 | 135 | def test_long_string(): 136 | NOTION_LONG_STRING_DF = os.environ.get("NOTION_LONG_STRING_DF") 137 | 138 | if not NOTION_LONG_STRING_DF or not NOTION_API_KEY: 139 | pytest.skip("API key not provided") 140 | 141 | df = download(NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY) 142 | assert len(df.iloc[0,1]) == 7721 143 | 144 | upload(df[:1], NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY) 145 | df_new = download(NOTION_LONG_STRING_DF, api_key=NOTION_API_KEY) 146 | # assert len(df_new.iloc[0,1]) == 7721 147 | # This might not be true -- understand why? 148 | 149 | def test_rich_text(): 150 | NOTION_RICH_TEXT_DF = os.environ.get("NOTION_RICH_TEXT_DF") 151 | 152 | if not NOTION_RICH_TEXT_DF or not NOTION_API_KEY: 153 | pytest.skip("API key not provided") 154 | 155 | df = download(NOTION_RICH_TEXT_DF, api_key=NOTION_API_KEY) --------------------------------------------------------------------------------