├── .github └── workflows │ ├── cd.yml │ ├── ci.yml │ └── docs.yml ├── .gitignore ├── MAINTAINERS.md ├── README.md ├── mypy.ini ├── pdoc_theme ├── syntax-highlighting.css └── theme.css ├── pinecone_datasets ├── __init__.py ├── catalog.py ├── cfg.py ├── dataset.py ├── dataset_fsreader.py ├── dataset_fswriter.py ├── dataset_metadata.py ├── fs.py ├── public.py ├── tqdm.py └── utils.py ├── pyproject.toml └── tests ├── __init__.py ├── integration ├── test_io_local.py ├── test_io_private_cloud_storage_gcs.py ├── test_list_public_datasets.py └── test_load_public_dataset.py ├── unit ├── __init__.py ├── test_basics.py ├── test_dataset_metadata.py ├── test_fs.py ├── test_private_datasets.py ├── test_schema_validation.py └── test_utils.py └── utils ├── __init__.py └── test_public_datasets.py /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | 9 | release: 10 | permissions: 11 | contents: write 12 | 13 | name: Release 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Install Poetry 19 | uses: snok/install-poetry@v1 20 | with: 21 | version: 1.5.0 22 | 23 | - name: Set Version 24 | run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV 25 | 26 | # - name: Create tag 27 | # uses: actions/github-script@v5 28 | # with: 29 | # script: | 30 | # github.rest.git.createRef({ 31 | # owner: context.repo.owner, 32 | # repo: context.repo.repo, 33 | # ref: 'refs/tags/V${{ env.VERSION }}', 34 | # sha: context.sha 35 | # }) 36 | 37 | - name: Build and publish to pypi 38 | run: | 39 | poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 40 | poetry publish --build 41 | 42 | - name: Create GH release 43 | uses: ncipollo/release-action@v1 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | with: 47 | tag: ${{ env.VERSION }} 48 | name: ${{ env.VERSION }} 49 | artifacts: "dist/*" 50 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | linting: 13 | name: Run lint and type checking 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ['3.10'] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install Poetry 27 | uses: snok/install-poetry@v1 28 | with: 29 | version: 1.5.0 30 | - name: install dependencies 31 | run: poetry install --with dev --all-extras 32 | 33 | - name: Run Black Check 34 | run: poetry run black --check . 35 | 36 | - name: Run mypy check 37 | run: poetry run mypy . 
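# NOTE: the run-tests job below exercises the unit test suite on every supported Python version and uploads an HTML report; the integration tests additionally require GCS credentials provided through repository secrets.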
38 | 39 | run-tests: 40 | name: Run tests 41 | needs: linting 42 | runs-on: ubuntu-latest 43 | strategy: 44 | fail-fast: false 45 | matrix: 46 | python-version: [3.9, '3.10', 3.11, 3.12, 3.13] 47 | 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Install Poetry 56 | uses: snok/install-poetry@v1 57 | with: 58 | version: 1.5.0 59 | - name: install dependencies 60 | run: poetry install --with dev --all-extras 61 | 62 | - name: Run pytest (unit tests) 63 | env: 64 | PY_VERSION: ${{ matrix.python-version }} 65 | # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 66 | # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 67 | run: poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/unit 68 | 69 | - name: upload pytest report.html 70 | uses: actions/upload-artifact@v4 71 | if: always() 72 | with: 73 | name: dataset-pytest-report-py${{ matrix.python-version }} 74 | path: report.html 75 | 76 | - name: Write google service account credentials to a file 77 | id: prepare-google-credentials 78 | shell: bash 79 | run: | 80 | secrets_file="$(mktemp)" 81 | echo "$GCS_SERVICE_ACCOUNT_CREDS_BASE64" | base64 -d > $secrets_file 82 | echo "google_credentials_file=$secrets_file" >> $GITHUB_OUTPUT 83 | env: 84 | GCS_SERVICE_ACCOUNT_CREDS_BASE64: '${{ secrets.GCS_SERVICE_ACCOUNT_CREDS_BASE64 }}' 85 | 86 | - name: Run pytest (integration tests) 87 | run: poetry run pytest tests/integration 88 | env: 89 | GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.prepare-google-credentials.outputs.google_credentials_file }} -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | # build the documentation whenever there are new commits on main 4 | on: workflow_dispatch 5 | 6 | # security: restrict permissions for CI jobs. 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | # Build the documentation and upload the static HTML files as an artifact. 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.10' 19 | - name: Install Poetry 20 | uses: snok/install-poetry@v1 21 | with: 22 | version: 1.5.0 23 | 24 | - run: poetry install --with dev --all-extras 25 | # ADJUST THIS: build your documentation into docs/. 26 | # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here. 27 | - run: poetry run pdoc -d google -o docs/ --logo https://d33wubrfki0l68.cloudfront.net/682006698903a55560c796b901fdfe4446c6d27a/a00ee/images/pinecone-logo.svg --search -t pdoc_theme ./pinecone_datasets 28 | 29 | - uses: actions/upload-pages-artifact@v1 30 | with: 31 | path: docs/ 32 | 33 | # This is a separate job so that only actions/deploy-pages has the necessary permissions. 
34 | deploy: 35 | needs: build 36 | runs-on: ubuntu-latest 37 | permissions: 38 | pages: write 39 | id-token: write 40 | environment: 41 | name: github-pages 42 | url: ${{ steps.deployment.outputs.page_url }} 43 | steps: 44 | - id: deployment 45 | uses: actions/deploy-pages@v1 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | examples/ 3 | internal_testing.ipynb 4 | .DS_Store 5 | dist/ 6 | .mypy_cache/ 7 | scratchpad.ipynb 8 | __pycache__/ 9 | .pytest_cache/ 10 | .coverage 11 | poetry.lock 12 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Pinecone Datasets 2 | 3 | ### Supported storage options 4 | 5 | pinecone_datasets can load datasets from Google Cloud Storage, Amazon S3, and local files. 6 | 7 | By default, the `load_dataset` and `list_datasets` functions will pull from Pinecone's public GCS bucket at `gs://pinecone-datasets-dev`, but you can interact with catalogs stored in other locations. 8 | 9 | ```python 10 | from pinecone_datasets import Catalog 11 | 12 | # Local catalog 13 | catalog = Catalog(base_path="/path/to/local/catalog") 14 | catalog.list_datasets() 15 | 16 | # Google Cloud catalog 17 | catalog = Catalog(base_path="gs://bucket-name") 18 | 19 | # S3 catalog 20 | s3_catalog = Catalog(base_path="s3://bucket-name") 21 | ``` 22 | 23 | If you are using Amazon S3 or Google Cloud to access private buckets, you can use environment variables to configure your credentials. For example, if you set a base_path starting with "gs://", the `gcsfs` package will attempt to find credentials by looking in cache locations used by `gcloud auth login` or by reading environment variables such as `GOOGLE_APPLICATION_CREDENTIALS`. 24 | 25 | ## Adding a new dataset to the public datasets repo 26 | 27 | Note: Only Pinecone employees with access to the bucket can complete this step. 28 | 29 | Prerequisites: 30 | 31 | 1. Install the Google Cloud CLI 32 | 2. Authenticate with `gcloud auth login` 33 | 34 | ```python 35 | from pinecone_datasets import Catalog, Dataset, DatasetMetadata, DenseModelMetadata 36 | 37 | # 1. Prepare pandas dataframes containing your embeddings 38 | documents_df = ... 39 | queries_df = ... 40 | 41 | # 2. Create metadata to describe the dataset 42 | from datetime import datetime 43 | metadata = DatasetMetadata( 44 | name="new-dataset-name", 45 | created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"), 46 | documents=len(documents_df), 47 | queries=len(queries_df), 48 | dense_model=DenseModelMetadata( 49 | name="ada2", 50 | dimension=2, 51 | ), 52 | ) 53 | 54 | # 3. Take all this, and instantiate a Dataset 55 | ds = Dataset.from_pandas( 56 | documents=documents_df, 57 | queries=queries_df, 58 | metadata=metadata 59 | ) 60 | 61 | # 4.
Save to catalog (requires the gcloud auth step above) 62 | catalog = Catalog(base_path="gs://pinecone-datasets-dev") 63 | catalog.save_dataset(ds) 64 | ``` 65 | 66 | Afterwards, verify that the new dataset appears in the listing and can be loaded: 67 | 68 | ```python 69 | from pinecone_datasets import list_datasets, load_dataset 70 | 71 | list_datasets(as_df=True) 72 | 73 | ds = load_dataset("new-dataset-name") 74 | ds.documents 75 | ds.head() 76 | ``` 77 | 78 | ### Expected dataset structure 79 | 80 | The package expects data to be laid out with the following directory structure: 81 | 82 | ├── my-subdir # base path under which all datasets are stored 83 | │ ├── my-dataset # name of the dataset 84 | │ │ ├── metadata.json # dataset metadata (optional; required for the dataset to be listed) 85 | │ │ ├── documents # dataset documents 86 | │ │ │ ├── file1.parquet 87 | │ │ │ └── file2.parquet 88 | │ │ ├── queries # dataset queries 89 | │ │ │ ├── file1.parquet 90 | │ │ │ └── file2.parquet 91 | └── ... 92 | 93 | The data schema is expected to be as follows: 94 | 95 | - `documents` directory contains parquet files with the following schema: 96 | - Mandatory: `id: str, values: list[float]` 97 | - Optional: `sparse_values: Dict: indices: List[int], values: List[float]`, `metadata: Dict`, `blob: dict` 98 | - note: `blob` is a dict that can contain arbitrary data. It is not returned when iterating over the dataset and is intended for storing additional data that is not part of the dataset schema, for example a document's text. In a future version this may become a first-class citizen in the dataset schema. 99 | - `queries` directory contains parquet files with the following schema: 100 | - Mandatory: `vector: list[float], top_k: int` 101 | - Optional: `sparse_vector: Dict: indices: List[int], values: List[float]`, `filter: Dict` 102 | - note: `filter` is a dict containing Pinecone filters; for more information see [here](https://docs.pinecone.io/docs/metadata-filtering) 103 | 104 | In addition, a metadata file is expected in the dataset directory, for example: `s3://my-bucket/my-dataset/metadata.json` 105 | 106 | ```python 107 | from pinecone_datasets.catalog import DatasetMetadata 108 | 109 | meta = DatasetMetadata( 110 | name="test_dataset", 111 | created_at="2023-02-17 14:17:01.481785", 112 | documents=2, 113 | queries=2, 114 | source="manual", 115 | bucket="LOCAL", 116 | task="unittests", 117 | dense_model={"name": "bert", "dimension": 3}, 118 | sparse_model={"name": "bm25"}, 119 | ) 120 | ``` 121 | 122 | The full metadata schema can be found in `pinecone_datasets.dataset_metadata.DatasetMetadata.schema` 123 | 124 | ### The 'blob' column 125 | 126 | Pinecone datasets ship with a `blob` column, which is intended for storing additional data that is not part of the dataset schema, for example a document's text. We provide a utility function to move data from the blob column to the metadata column. This is useful, for example, when upserting a dataset to an index and you want to store text data in the metadata. 127 | 128 | ```python 129 | from pinecone_datasets import import_documents_keys_from_blob_to_metadata 130 | 131 | new_dataset = import_documents_keys_from_blob_to_metadata(dataset, keys=["text"]) 132 | ``` 133 | 134 | ## Saving datasets 135 | 136 | You can save your dataset to a catalog managed by you, or directly to a local or remote path (GCS or S3).
137 | 138 | ### Saving a dataset to a Catalog 139 | 140 | To set your own catalog endpoint, set the environment variable `DATASETS_CATALOG_BASEPATH` to your bucket. Note that Pinecone Datasets uses the default authentication method for the storage type (`gcsfs` for GCS and `s3fs` for S3). 141 | 142 | After this environment variable is set, you can save your dataset to the catalog with `Catalog.save_dataset`: 143 | 144 | ```python 145 | from pinecone_datasets import Catalog, Dataset, DatasetMetadata 146 | metadata = DatasetMetadata(**{"name": "my-dataset", ...}) 147 | dataset = Dataset.from_pandas(documents, queries, metadata) 148 | catalog = Catalog()  # resolves its base path from DATASETS_CATALOG_BASEPATH 149 | catalog.save_dataset(dataset) 150 | ``` 151 | ### Saving to a path 152 | 153 | You can also save your dataset directly to a local path or a remote path (GCS or S3) with `DatasetFSWriter`. Note that Pinecone Datasets uses the default authentication method for the storage type (`gcsfs` for GCS and `s3fs` for S3). 154 | 155 | ```python 156 | from pinecone_datasets.dataset_fswriter import DatasetFSWriter 157 | dataset = Dataset.from_pandas(documents, queries, metadata) 158 | DatasetFSWriter.write_dataset("s3://my-bucket/my-subdir/my-dataset", dataset=dataset) 159 | ``` 160 | ## Running tests 161 | 162 | This project uses Poetry for dependency management. To start developing, run the following from the project root directory: 163 | 164 | ```bash 165 | poetry install --with dev 166 | ``` 167 | 168 | To run the unit tests locally, run: 169 | 170 | ```bash 171 | poetry run pytest tests/unit --cov pinecone_datasets 172 | ``` 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pinecone Datasets 2 | 3 | ## Install 4 | 5 | ```bash 6 | pip install pinecone-datasets 7 | ``` 8 | 9 | ### Loading public datasets 10 | 11 | Pinecone hosts a public catalog of datasets. You can list available datasets and load one by name using the `list_datasets` and `load_dataset` functions. These use the default catalog endpoint (currently a public GCS bucket) to list and load datasets. 12 | 13 | ```python 14 | from pinecone_datasets import list_datasets, load_dataset 15 | 16 | list_datasets() 17 | # ["quora_all-MiniLM-L6-bm25", ... ] 18 | 19 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 20 | 21 | dataset.head() 22 | 23 | # Prints 24 | # ┌─────┬───────────────────────────┬─────────────────────────────────────┬───────────────────┬──────┐ 25 | # │ id ┆ values ┆ sparse_values ┆ metadata ┆ blob │ 26 | # │ ┆ ┆ ┆ ┆ │ 27 | # │ str ┆ list[f32] ┆ struct[2] ┆ struct[3] ┆ │ 28 | # ╞═════╪═══════════════════════════╪═════════════════════════════════════╪═══════════════════╪══════╡ 29 | # │ 0 ┆ [0.118014, -0.069717, ... ┆ {[470065541, 52922727, ... 22364... ┆ {2017,12,"other"} ┆ .... │ 30 | # │ ┆ 0.0060... ┆ ┆ ┆ │ 31 | # └─────┴───────────────────────────┴─────────────────────────────────────┴───────────────────┴──────┘ 32 | ``` 33 | 34 | 35 | ## Usage - Accessing data 36 | 37 | Each dataset has three main attributes, `documents`, `queries`, and `metadata`, which are lazily loaded the first time they are accessed. You may notice a delay while the underlying parquet files are downloaded on first access. 38 | 39 | Pinecone Datasets is built on top of pandas. `documents` and `queries` are lazily-loaded pandas dataframes, which means you can use the full pandas API to access the data. In addition, we provide some helper functions to access the data in a more convenient way. 40 | 41 | The documents and queries dataframes are accessed through the `documents` and `queries` properties; the underlying data is only loaded when these properties are first accessed.
42 | 43 | ```python 44 | import pandas as pd 45 | from pinecone_datasets import list_datasets, load_dataset 46 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 47 | 48 | document_df: pd.DataFrame = dataset.documents 49 | 50 | query_df: pd.DataFrame = dataset.queries 51 | ``` 52 | 53 | 54 | ## Usage - Iterating over documents 55 | 56 | The `Dataset` class has helpers for iterating over your dataset. This is useful for upserting a dataset to an index, or for benchmarking. 57 | 58 | ```python 59 | 60 | # Batch iterator: yields lists of up to batch_size dicts with keys ("id", "values", "sparse_values", "metadata") 61 | dataset.iter_documents(batch_size=n) 62 | 63 | # Single-item iterator: yields dicts with keys ("vector", "sparse_vector", "filter", "top_k") 64 | dataset.iter_queries() 65 | ``` 66 | 67 | ### Upserting to Index 68 | 69 | To upsert data to a Pinecone index, first install the [Pinecone SDK](https://github.com/pinecone-io/pinecone-python-client) 70 | 71 | ```python 72 | from pinecone import Pinecone, ServerlessSpec 73 | from pinecone_datasets import load_dataset, list_datasets 74 | 75 | # See what datasets are available 76 | for ds in list_datasets(): 77 | print(ds) 78 | 79 | # Download embeddings data 80 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 81 | 82 | # Instantiate a Pinecone client using API key from app.pinecone.io 83 | pc = Pinecone(api_key='key') 84 | 85 | # Create a Pinecone index 86 | index_config = pc.create_index( 87 | name="demo-index", 88 | dimension=dataset.metadata.dense_model.dimension, 89 | spec=ServerlessSpec(cloud="aws", region="us-east-1") 90 | ) 91 | 92 | # Instantiate an index client 93 | index = pc.Index(host=index_config.host) 94 | 95 | # Upsert data from the dataset 96 | index.upsert_from_dataframe(df=dataset.documents) 97 | ``` 98 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | ignore_errors = True 4 | 5 | [mypy-pinecone_datasets.*] 6 | ignore_errors = False 7 | disallow_untyped_defs = True 8 | disallow_untyped_calls = True -------------------------------------------------------------------------------- /pdoc_theme/syntax-highlighting.css: -------------------------------------------------------------------------------- 1 | /* monokai color scheme, see pdoc/template/README.md */ 2 | pre { line-height: 125%; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 20px; } 4 | .pdoc-code .hll { background-color: #49483e } 5 | .pdoc-code { background: #272822; color: #f8f8f2 } 6 | .pdoc-code .c { color: #75715e } /* Comment */ 7 | .pdoc-code .err { color: #960050; background-color: #1e0010 } /* Error */ 8 | .pdoc-code .esc { color: #f8f8f2 } /* Escape */ 9 | .pdoc-code .g { color: #f8f8f2 } /* Generic */ 10 | .pdoc-code .k { color: #66d9ef } /* Keyword */ 11 | .pdoc-code .l { color: #ae81ff } /* Literal */ 12 | .pdoc-code .n { color: #f8f8f2 } /* Name */ 13 | .pdoc-code .o { color: #f92672 } /* Operator */ 14 | .pdoc-code .x { color: #f8f8f2 } /* Other */ 15 | .pdoc-code .p { color: #f8f8f2 } /* Punctuation */ 16 | .pdoc-code .ch { color: #75715e } /* Comment.Hashbang */ 17 | .pdoc-code .cm { color: #75715e } /* Comment.Multiline */ 18 | .pdoc-code .cp { color: #75715e } /* Comment.Preproc */ 19 | .pdoc-code .cpf { color: #75715e } /* Comment.PreprocFile */ 20 | .pdoc-code .c1 { color: #75715e } /* Comment.Single */ 21 | .pdoc-code .cs { color: #75715e } /* Comment.Special */ 22 |
.pdoc-code .gd { color: #f92672 } /* Generic.Deleted */ 23 | .pdoc-code .ge { color: #f8f8f2; font-style: italic } /* Generic.Emph */ 24 | .pdoc-code .gr { color: #f8f8f2 } /* Generic.Error */ 25 | .pdoc-code .gh { color: #f8f8f2 } /* Generic.Heading */ 26 | .pdoc-code .gi { color: #a6e22e } /* Generic.Inserted */ 27 | .pdoc-code .go { color: #66d9ef } /* Generic.Output */ 28 | .pdoc-code .gp { color: #f92672; font-weight: bold } /* Generic.Prompt */ 29 | .pdoc-code .gs { color: #f8f8f2; font-weight: bold } /* Generic.Strong */ 30 | .pdoc-code .gu { color: #75715e } /* Generic.Subheading */ 31 | .pdoc-code .gt { color: #f8f8f2 } /* Generic.Traceback */ 32 | .pdoc-code .kc { color: #66d9ef } /* Keyword.Constant */ 33 | .pdoc-code .kd { color: #66d9ef } /* Keyword.Declaration */ 34 | .pdoc-code .kn { color: #f92672 } /* Keyword.Namespace */ 35 | .pdoc-code .kp { color: #66d9ef } /* Keyword.Pseudo */ 36 | .pdoc-code .kr { color: #66d9ef } /* Keyword.Reserved */ 37 | .pdoc-code .kt { color: #66d9ef } /* Keyword.Type */ 38 | .pdoc-code .ld { color: #e6db74 } /* Literal.Date */ 39 | .pdoc-code .m { color: #ae81ff } /* Literal.Number */ 40 | .pdoc-code .s { color: #e6db74 } /* Literal.String */ 41 | .pdoc-code .na { color: #a6e22e } /* Name.Attribute */ 42 | .pdoc-code .nb { color: #f8f8f2 } /* Name.Builtin */ 43 | .pdoc-code .nc { color: #a6e22e } /* Name.Class */ 44 | .pdoc-code .no { color: #66d9ef } /* Name.Constant */ 45 | .pdoc-code .nd { color: #a6e22e } /* Name.Decorator */ 46 | .pdoc-code .ni { color: #f8f8f2 } /* Name.Entity */ 47 | .pdoc-code .ne { color: #a6e22e } /* Name.Exception */ 48 | .pdoc-code .nf { color: #a6e22e } /* Name.Function */ 49 | .pdoc-code .nl { color: #f8f8f2 } /* Name.Label */ 50 | .pdoc-code .nn { color: #f8f8f2 } /* Name.Namespace */ 51 | .pdoc-code .nx { color: #a6e22e } /* Name.Other */ 52 | .pdoc-code .py { color: #f8f8f2 } /* Name.Property */ 53 | .pdoc-code .nt { color: #f92672 } /* Name.Tag */ 54 | .pdoc-code .nv { color: #f8f8f2 } /* Name.Variable */ 55 | .pdoc-code .ow { color: #f92672 } /* Operator.Word */ 56 | .pdoc-code .w { color: #f8f8f2 } /* Text.Whitespace */ 57 | .pdoc-code .mb { color: #ae81ff } /* Literal.Number.Bin */ 58 | .pdoc-code .mf { color: #ae81ff } /* Literal.Number.Float */ 59 | .pdoc-code .mh { color: #ae81ff } /* Literal.Number.Hex */ 60 | .pdoc-code .mi { color: #ae81ff } /* Literal.Number.Integer */ 61 | .pdoc-code .mo { color: #ae81ff } /* Literal.Number.Oct */ 62 | .pdoc-code .sa { color: #e6db74 } /* Literal.String.Affix */ 63 | .pdoc-code .sb { color: #e6db74 } /* Literal.String.Backtick */ 64 | .pdoc-code .sc { color: #e6db74 } /* Literal.String.Char */ 65 | .pdoc-code .dl { color: #e6db74 } /* Literal.String.Delimiter */ 66 | .pdoc-code .sd { color: #e6db74 } /* Literal.String.Doc */ 67 | .pdoc-code .s2 { color: #e6db74 } /* Literal.String.Double */ 68 | .pdoc-code .se { color: #ae81ff } /* Literal.String.Escape */ 69 | .pdoc-code .sh { color: #e6db74 } /* Literal.String.Heredoc */ 70 | .pdoc-code .si { color: #e6db74 } /* Literal.String.Interpol */ 71 | .pdoc-code .sx { color: #e6db74 } /* Literal.String.Other */ 72 | .pdoc-code .sr { color: #e6db74 } /* Literal.String.Regex */ 73 | .pdoc-code .s1 { color: #e6db74 } /* Literal.String.Single */ 74 | .pdoc-code .ss { color: #e6db74 } /* Literal.String.Symbol */ 75 | .pdoc-code .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 76 | .pdoc-code .fm { color: #a6e22e } /* Name.Function.Magic */ 77 | .pdoc-code .vc { color: #f8f8f2 } /* Name.Variable.Class */ 78 | .pdoc-code .vg { 
color: #f8f8f2 } /* Name.Variable.Global */ 79 | .pdoc-code .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 80 | .pdoc-code .vm { color: #f8f8f2 } /* Name.Variable.Magic */ -------------------------------------------------------------------------------- /pdoc_theme/theme.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --pdoc-background: #212529; 3 | } 4 | 5 | .pdoc { 6 | --text: #f7f7f7; 7 | --muted: #9d9d9d; 8 | --link: #58a6ff; 9 | --link-hover: #3989ff; 10 | --code: #333; 11 | --active: #555; 12 | 13 | --accent: #343434; 14 | --accent2: #555; 15 | 16 | --nav-hover: rgba(0, 0, 0, 0.1); 17 | --name: #77C1FF; 18 | --def: #0cdd0c; 19 | --annotation: #00c037; 20 | } -------------------------------------------------------------------------------- /pinecone_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. include:: ../README.md 3 | """ 4 | 5 | __version__ = "1.0.2" 6 | 7 | 8 | from .public import list_datasets, load_dataset 9 | from .dataset_metadata import DatasetMetadata, DenseModelMetadata 10 | from .catalog import Catalog 11 | from .dataset import Dataset 12 | -------------------------------------------------------------------------------- /pinecone_datasets/catalog.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import json 4 | from typing import List, Optional, Union, TYPE_CHECKING 5 | 6 | import logging 7 | from pydantic import BaseModel, ValidationError, Field 8 | 9 | from .cfg import Storage 10 | from .fs import get_cloud_fs 11 | from .dataset import Dataset 12 | from .dataset_fswriter import DatasetFSWriter 13 | from .dataset_metadata import DatasetMetadata 14 | 15 | if TYPE_CHECKING: 16 | import pandas as pd 17 | else: 18 | pd = None 19 | 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Catalog(BaseModel): 25 | def __init__(self, base_path: Optional[str] = None, **kwargs): 26 | super().__init__(**kwargs) 27 | if base_path is None: 28 | self.base_path = os.environ.get( 29 | "DATASETS_CATALOG_BASEPATH", Storage.endpoint 30 | ) 31 | else: 32 | self.base_path = base_path 33 | 34 | base_path: str = Field(default=None) 35 | datasets: List[DatasetMetadata] = Field(default_factory=list) 36 | 37 | def load(self, **kwargs) -> "Catalog": 38 | """Loads metadata about all datasets from the catalog.""" 39 | fs = get_cloud_fs(self.base_path, **kwargs) 40 | collected_datasets = [] 41 | 42 | metadata_files_glob_path = os.path.join(self.base_path, "*", "metadata.json") 43 | for metadata_path in fs.glob(metadata_files_glob_path): 44 | with fs.open(metadata_path) as f: 45 | try: 46 | this_dataset_json = json.load(f) 47 | except json.JSONDecodeError: 48 | warnings.warn( 49 | f"Not a JSON: Invalid metadata.json for {metadata_path}, skipping" 50 | ) 51 | continue 52 | 53 | try: 54 | this_dataset = DatasetMetadata(**this_dataset_json) 55 | collected_datasets.append(this_dataset) 56 | except ValidationError as e: 57 | warnings.warn( 58 | f"metadata file for dataset: {metadata_path} is not valid, skipping: {e}" 59 | ) 60 | continue 61 | 62 | self.datasets = collected_datasets 63 | logger.info(f"Loaded {len(self.datasets)} datasets from {self.base_path}") 64 | return self 65 | 66 | def list_datasets(self, as_df: bool) -> Union[List[str], "pd.DataFrame"]: 67 | """Lists all datasets in the catalog.""" 68 | if self.datasets is None or len(self.datasets) == 0: 69 | self.load() 70 | 71 | import 
pandas as pd 72 | 73 | if as_df: 74 | return pd.DataFrame([ds.model_dump() for ds in self.datasets]) 75 | else: 76 | return [dataset.name for dataset in self.datasets] 77 | 78 | def load_dataset(self, dataset_id: str, **kwargs) -> "Dataset": 79 | """Loads the dataset from the catalog.""" 80 | ds_path = os.path.join(str(self.base_path), dataset_id) 81 | return Dataset.from_path(dataset_path=ds_path, **kwargs) 82 | 83 | def save_dataset( 84 | self, 85 | dataset: "Dataset", 86 | **kwargs, 87 | ): 88 | """ 89 | Save a dataset to the catalog. 90 | """ 91 | ds_path = os.path.join(self.base_path, dataset.metadata.name) 92 | DatasetFSWriter.write_dataset(dataset_path=ds_path, dataset=dataset, **kwargs) 93 | logger.info(f"Saved dataset {dataset.metadata.name} to {ds_path}") 94 | -------------------------------------------------------------------------------- /pinecone_datasets/cfg.py: -------------------------------------------------------------------------------- 1 | # from polars.datatypes import Utf8, Float32, List, Struct, Field, UInt32 2 | 3 | 4 | class Storage: 5 | endpoint: str = "gs://pinecone-datasets-dev" 6 | 7 | 8 | class Schema: 9 | class Names: 10 | documents = [ 11 | ("id", False, None), 12 | ("values", False, None), 13 | ("sparse_values", True, None), 14 | ("metadata", True, None), 15 | ("blob", True, None), 16 | ] 17 | queries = [ 18 | ("vector", False, None), 19 | ("sparse_vector", True, None), 20 | ("filter", True, None), 21 | ("top_k", False, 5), 22 | ("blob", True, None), 23 | ] 24 | 25 | # documents = { 26 | # "id": Utf8, 27 | # "values": List(Float32), 28 | # "sparse_values": Struct( 29 | # [Field("indices", List(UInt32)), Field("values", List(Float32))] 30 | # ), 31 | # } 32 | documents_select_columns = ["id", "values", "sparse_values", "metadata"] 33 | 34 | # queries = { 35 | # "vector": List(Float32), 36 | # "sparse_vector": Struct( 37 | # [Field("indices", List(UInt32)), Field("values", List(Float32))] 38 | # ), 39 | # "top_k": UInt32, 40 | # } 41 | queries_select_columns = ["vector", "sparse_vector", "filter", "top_k"] 42 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.parse import urlparse 3 | from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple 4 | 5 | from .cfg import Schema 6 | from .dataset_metadata import DatasetMetadata 7 | from .fs import get_cloud_fs 8 | from .utils import deprecated 9 | 10 | from typing import TYPE_CHECKING 11 | 12 | if TYPE_CHECKING: 13 | import pandas as pd 14 | from .dataset_fsreader import DatasetFSReader 15 | else: 16 | pd = None # Placeholder for runtime 17 | DatasetFSReader = None # Placeholder for runtime 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def iter_pandas_dataframe_slices( 23 | df: "pd.DataFrame", batch_size, return_indexes 24 | ) -> Generator[List[Dict[str, Any]], None, None]: 25 | for i in range(0, len(df), batch_size): 26 | if return_indexes: 27 | yield (i, df.iloc[i : i + batch_size].to_dict(orient="records")) 28 | else: 29 | yield df.iloc[i : i + batch_size].to_dict(orient="records") 30 | 31 | 32 | def iter_pandas_dataframe_single( 33 | df: "pd.DataFrame", 34 | ) -> Generator[Dict[str, Any], None, None]: 35 | for i in range(0, len(df), 1): 36 | yield df.iloc[i : i + 1].to_dict(orient="records")[0] 37 | 38 | 39 | class Dataset: 40 | @classmethod 41 | def from_path(cls, dataset_path, **kwargs): 42 | """ 43 | 
Create a Dataset object from a local or cloud storage path 44 | Args: 45 | dataset_path (str): a path to a local or cloud storage path containing a valid dataset. 46 | 47 | Returns: 48 | Dataset: a Dataset object 49 | """ 50 | return cls(dataset_path=dataset_path, **kwargs) 51 | 52 | @classmethod 53 | def from_pandas( 54 | cls, 55 | documents: "pd.DataFrame", 56 | metadata: DatasetMetadata, 57 | documents_column_mapping: Optional[Dict] = None, 58 | queries: Optional["pd.DataFrame"] = None, 59 | queries_column_mapping: Optional[Dict] = None, 60 | **kwargs, 61 | ) -> "Dataset": 62 | """ 63 | Create a Dataset object from a pandas DataFrame 64 | 65 | Args: 66 | documents (pd.DataFrame): a pandas DataFrame containing the documents 67 | documents_column_mapping (Dict): a dictionary mapping the columns of the documents DataFrame to the Pinecone Datasets Schema 68 | queries (pd.DataFrame): a pandas DataFrame containing the queries 69 | queries_column_mapping (Dict): a dictionary mapping the columns of the queries DataFrame to the Pinecone Datasets Schema 70 | 71 | Keyword Args: 72 | kwargs (Dict): additional arguments to pass to the fsspec constructor 73 | 74 | Returns: 75 | Dataset: a Dataset object 76 | """ 77 | instance = cls(dataset_path=None, **kwargs) 78 | instance._documents = cls._read_pandas_dataframe( 79 | documents, documents_column_mapping, Schema.Names.documents 80 | ) 81 | instance._queries = cls._read_pandas_dataframe( 82 | queries, queries_column_mapping, Schema.Names.queries 83 | ) 84 | instance._metadata = metadata 85 | return instance 86 | 87 | @staticmethod 88 | def _read_pandas_dataframe( 89 | df: "pd.DataFrame", 90 | column_mapping: Dict[str, str], 91 | schema: List[Tuple[str, bool, Any]], 92 | ) -> "pd.DataFrame": 93 | """ 94 | Reads a pandas DataFrame and validates it against a schema. 95 | 96 | Args: 97 | df (pd.DataFrame): the pandas DataFrame to read 98 | column_mapping (Dict[str, str]): a dictionary mapping the columns of the DataFrame to the Pinecone Datasets Schema (col_name, pinecone_name) 99 | schema (List[Tuple[str, bool, Any]]): the schema to validate against (column_name, is_nullable, null_value) 100 | 101 | Returns: 102 | pd.DataFrame: the validated, renamed DataFrame 103 | """ 104 | import pandas as pd 105 | 106 | if df is None or df.empty: 107 | return pd.DataFrame(columns=[column_name for column_name, _, _ in schema]) 108 | else: 109 | if column_mapping is not None: 110 | df.rename(columns=column_mapping, inplace=True) 111 | for column_name, is_nullable, null_value in schema: 112 | if column_name not in df.columns and not is_nullable: 113 | raise ValueError( 114 | f"error, file is not matching Pinecone Datasets Schema: {column_name} not found" 115 | ) 116 | elif column_name not in df.columns and is_nullable: 117 | df[column_name] = null_value 118 | return df[[column_name for column_name, _, _ in schema]] 119 | 120 | def __init__( 121 | self, 122 | dataset_path: str, 123 | **kwargs, 124 | ) -> None: 125 | """ 126 | Dataset class to load and query datasets from the Pinecone Datasets catalog. 127 | See `from_path` and `from_pandas` for examples of how to create a Dataset.
128 | 129 | Examples: 130 | ```python 131 | from pinecone_datasets import Dataset, load_dataset 132 | dataset = load_dataset("dataset_name") 133 | # or 134 | dataset = Dataset.from_path("gs://my-bucket/my-dataset") 135 | 136 | for doc in dataset.iter_documents(batch_size=100): 137 | index.upsert(doc) 138 | for query in dataset.iter_queries(): 139 | results = index.query(**query) 140 | # do something with the results 141 | # or 142 | dataset.documents # returns a pandas DataFrame 143 | dataset.queries # returns a pandas DataFrame 144 | ``` 145 | 146 | """ 147 | if dataset_path is not None: 148 | endpoint = urlparse(dataset_path)._replace(path="").geturl() 149 | self._fs = get_cloud_fs(endpoint, **kwargs) 150 | self._dataset_path = dataset_path 151 | if not self._fs.exists(self._dataset_path): 152 | raise FileNotFoundError( 153 | f"Dataset does not exist at path {self._dataset_path}" 154 | ) 155 | else: 156 | self._dataset_path = None 157 | self._fs = None 158 | self._documents = None 159 | self._queries = None 160 | self._metadata = None 161 | 162 | def __getitem__(self, key: str): 163 | if key in ["documents", "queries"]: 164 | return getattr(self, key) 165 | else: 166 | raise KeyError("Dataset does not have key: {}".format(key)) 167 | 168 | def __len__(self) -> int: 169 | return self.documents.shape[0] 170 | 171 | @property 172 | def documents(self) -> "pd.DataFrame": 173 | if self._documents is None and self._dataset_path is not None: 174 | from .dataset_fsreader import DatasetFSReader 175 | 176 | self._documents = DatasetFSReader.read_documents( 177 | self._fs, self._dataset_path 178 | ) 179 | return self._documents 180 | 181 | @property 182 | def queries(self) -> "pd.DataFrame": 183 | if self._queries is None and self._dataset_path is not None: 184 | from .dataset_fsreader import DatasetFSReader 185 | 186 | self._queries = DatasetFSReader.read_queries(self._fs, self._dataset_path) 187 | return self._queries 188 | 189 | @property 190 | def metadata(self) -> DatasetMetadata: 191 | if self._metadata is None and self._dataset_path is not None: 192 | from .dataset_fsreader import DatasetFSReader 193 | 194 | self._metadata = DatasetFSReader.read_metadata(self._fs, self._dataset_path) 195 | return self._metadata 196 | 197 | def iter_documents( 198 | self, batch_size: int = 1, return_indexes=False 199 | ) -> Iterator[List[Dict[str, Any]]]: 200 | """ 201 | Iterates over the documents in the dataset. 202 | 203 | Args: 204 | batch_size (int, optional): The batch size to use for the iterator. Defaults to 1. 205 | 206 | Returns: 207 | Iterator[List[Dict[str, Any]]]: An iterator over the documents in the dataset. 208 | 209 | Examples: 210 | for batch in dataset.iter_documents(batch_size=100): 211 | index.upsert(batch) 212 | """ 213 | if isinstance(batch_size, int) and batch_size > 0: 214 | return iter_pandas_dataframe_slices( 215 | df=self.documents[Schema.documents_select_columns].dropna( 216 | axis=1, how="all" 217 | ), 218 | batch_size=batch_size, 219 | return_indexes=return_indexes, 220 | ) 221 | else: 222 | raise ValueError("batch_size must be greater than 0") 223 | 224 | def iter_queries(self) -> Iterator[Dict[str, Any]]: 225 | """ 226 | Iterates over the queries in the dataset. 227 | 228 | Returns: 229 | Iterator[Dict[str, Any]]: An iterator over the queries in the dataset.
230 | 231 | Examples: 232 | for query in dataset.iter_queries(): 233 | results = index.query(**query) 234 | # do something with the results 235 | """ 236 | return iter_pandas_dataframe_single(self.queries[Schema.queries_select_columns]) 237 | 238 | def head(self, n: int = 5) -> "pd.DataFrame": 239 | return self.documents.head(n) 240 | 241 | @deprecated 242 | @classmethod 243 | def from_catalog(cls, dataset_id, catalog_base_path: str = "", **kwargs): 244 | """ 245 | DEPRECATED: This method has been removed. Please use `Catalog.load_dataset` instead. 246 | """ 247 | raise Exception( 248 | "This method has been removed. Please use `Catalog.load_dataset` instead." 249 | ) 250 | 251 | @deprecated 252 | def to_catalog( 253 | self, 254 | dataset_id: str, 255 | catalog_base_path: str = "", 256 | **kwargs, 257 | ): 258 | """ 259 | DEPRECATED: This method has been removed. Please use `Catalog.save_dataset` instead. 260 | """ 261 | raise Exception( 262 | "This method has been removed. Please use `Catalog.save_dataset` instead." 263 | ) 264 | 265 | @deprecated 266 | def to_pinecone_index(self, *args, **kwargs): 267 | """ 268 | DEPRECATED: This method has been removed. Please use the `pinecone.Index.upsert` method instead from the `pinecone` SDK package. 269 | """ 270 | raise Exception( 271 | "This method has been removed. Please use the `pinecone.Index.upsert` method instead from the `pinecone` SDK package." 272 | ) 273 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_fsreader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import warnings 5 | from typing import Literal, Optional 6 | 7 | import pandas as pd 8 | import pyarrow.parquet as pq 9 | from .tqdm import tqdm 10 | 11 | from .cfg import Schema 12 | from .dataset_metadata import DatasetMetadata 13 | from .fs import CloudOrLocalFS 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class DatasetFSReader: 19 | @staticmethod 20 | def read_documents(fs: CloudOrLocalFS, dataset_path: str) -> pd.DataFrame: 21 | logger.debug(f"reading documents from {dataset_path}") 22 | df = DatasetFSReader._safe_read_from_path(fs, dataset_path, "documents") 23 | 24 | # metadata supposed to be a dict [if legacy] or string 25 | df["metadata"] = df["metadata"].apply( 26 | DatasetFSReader._convert_metadata_from_json_to_dict 27 | ) 28 | return df 29 | 30 | @staticmethod 31 | def read_queries(fs: CloudOrLocalFS, dataset_path: str) -> pd.DataFrame: 32 | logger.debug(f"reading queries from {dataset_path}") 33 | df = DatasetFSReader._safe_read_from_path(fs, dataset_path, "queries") 34 | 35 | # filter supposed to be a dict [if legacy] or string 36 | df["filter"] = df["filter"].apply( 37 | DatasetFSReader._convert_metadata_from_json_to_dict 38 | ) 39 | 40 | return df 41 | 42 | @staticmethod 43 | def read_metadata(fs: CloudOrLocalFS, dataset_path: str) -> DatasetMetadata: 44 | logger.debug(f"reading metadata from {dataset_path}") 45 | with fs.open(os.path.join(dataset_path, "metadata.json"), "rb") as f: 46 | metadata = json.load(f) 47 | return DatasetMetadata(**metadata) 48 | 49 | @staticmethod 50 | def _convert_metadata_from_json_to_dict(metadata: Optional[str] = None) -> dict: 51 | if metadata is None: 52 | return None 53 | elif isinstance(metadata, dict): 54 | return metadata 55 | elif isinstance(metadata, str): 56 | return json.loads(metadata) 57 | else: 58 | raise TypeError("metadata must be a string or dict") 59 | 
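    # The helpers below check for the presence of a data type directory ("documents" or "queries"), load its parquet partitions into a single DataFrame, and validate the columns against the expected schema, filling missing nullable columns with their default values.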
60 | @staticmethod 61 | def _does_datatype_exist( 62 | fs: CloudOrLocalFS, 63 | dataset_path: str, 64 | data_type: Literal["documents", "queries"], 65 | ) -> bool: 66 | return fs.exists(os.path.join(dataset_path, data_type)) 67 | 68 | @staticmethod 69 | def _safe_read_from_path( 70 | fs: CloudOrLocalFS, 71 | dataset_path: str, 72 | data_type: Literal["documents", "queries"], 73 | ) -> pd.DataFrame: 74 | read_path_str = os.path.join(dataset_path, data_type, "*.parquet") 75 | read_path = fs.glob(read_path_str) 76 | if DatasetFSReader._does_datatype_exist(fs, dataset_path, data_type): 77 | # First, collect all the dataframes 78 | dfs = [] 79 | for path in tqdm(read_path, desc=f"Loading {data_type} parquet files"): 80 | piece = pq.read_pandas(path, filesystem=fs) 81 | df_piece = piece.to_pandas() 82 | dfs.append(df_piece) 83 | 84 | if not dfs: 85 | raise ValueError(f"No parquet files found in {read_path_str}") 86 | 87 | # Combine all dataframes 88 | df = pd.concat(dfs, ignore_index=True) 89 | 90 | # Validate schema 91 | dataset_schema_names = df.columns.tolist() 92 | columns_to_null = [] 93 | columns_not_null = [] 94 | for column_name, is_nullable, null_value in getattr( 95 | Schema.Names, data_type 96 | ): 97 | if column_name not in dataset_schema_names and not is_nullable: 98 | raise ValueError( 99 | f"error, file is not matching Pinecone Datasets Schema: {column_name} not found" 100 | ) 101 | elif column_name not in dataset_schema_names and is_nullable: 102 | columns_to_null.append((column_name, null_value)) 103 | else: 104 | columns_not_null.append(column_name) 105 | 106 | # Add null columns if needed 107 | for column_name, null_value in columns_to_null: 108 | df[column_name] = null_value 109 | 110 | return df[columns_not_null + [col for col, _ in columns_to_null]] 111 | 112 | else: 113 | warnings.warn( 114 | "WARNING: No data found at: {}. Returning empty dataframe".format( 115 | read_path_str 116 | ), 117 | UserWarning, 118 | stacklevel=0, 119 | ) 120 | return pd.DataFrame( 121 | columns=[col[0] for col in getattr(Schema.Names, data_type)] 122 | ) 123 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_fswriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import warnings 4 | import logging 5 | 6 | from .fs import get_cloud_fs, CloudOrLocalFS 7 | from typing import Optional, TYPE_CHECKING 8 | 9 | if TYPE_CHECKING: 10 | import pandas as pd 11 | else: 12 | pd = None 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class DatasetFSWriter: 18 | @staticmethod 19 | def write_dataset(dataset_path: str, dataset: "Dataset", **kwargs): 20 | """ 21 | Saves the dataset to a local or cloud storage path. 
22 | """ 23 | fs = get_cloud_fs(dataset_path, **kwargs) 24 | logger.debug(f"writing dataset {dataset.metadata.name} to {dataset_path}") 25 | DatasetFSWriter._write_documents(fs, dataset_path, dataset) 26 | DatasetFSWriter._write_queries(fs, dataset_path, dataset) 27 | DatasetFSWriter._write_metadata(fs, dataset_path, dataset) 28 | 29 | @staticmethod 30 | def _write_documents(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 31 | documents_path = os.path.join(dataset_path, "documents") 32 | fs.makedirs(documents_path, exist_ok=True) 33 | 34 | documents_metadta_copy = dataset.documents["metadata"].copy() 35 | try: 36 | logger.debug( 37 | f"writing dataset {dataset.metadata.name} documents to {documents_path}" 38 | ) 39 | dataset.documents["metadata"] = dataset.documents["metadata"].apply( 40 | DatasetFSWriter._convert_metadata_from_dict_to_json 41 | ) 42 | dataset.documents.to_parquet( 43 | os.path.join(documents_path, "part-0.parquet"), 44 | engine="pyarrow", 45 | index=False, 46 | filesystem=fs, 47 | ) 48 | finally: 49 | dataset.documents["metadata"] = documents_metadta_copy 50 | 51 | @staticmethod 52 | def _write_queries(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 53 | if dataset.queries.empty: 54 | warnings.warn("Queries are empty, not saving queries") 55 | else: 56 | queries_path = os.path.join(dataset_path, "queries") 57 | logger.debug( 58 | f"writing dataset {dataset.metadata.name} queries to {queries_path}" 59 | ) 60 | fs.makedirs(queries_path, exist_ok=True) 61 | queries_filter_copy = dataset.queries["filter"].copy() 62 | try: 63 | dataset.queries["filter"] = dataset.queries["filter"].apply( 64 | DatasetFSWriter._convert_metadata_from_dict_to_json 65 | ) 66 | dataset.queries.to_parquet( 67 | os.path.join(queries_path, "part-0.parquet"), 68 | engine="pyarrow", 69 | index=False, 70 | filesystem=fs, 71 | ) 72 | finally: 73 | dataset.queries["filter"] = queries_filter_copy 74 | 75 | @staticmethod 76 | def _write_metadata(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 77 | metadata_path = os.path.join(dataset_path, "metadata.json") 78 | logger.debug( 79 | f"writing dataset {dataset.metadata.name} metadata to {metadata_path}" 80 | ) 81 | with fs.open(metadata_path, "w") as f: 82 | json.dump(dataset.metadata.model_dump(), f) 83 | 84 | @staticmethod 85 | def _convert_metadata_from_dict_to_json(metadata: Optional[dict]) -> str: 86 | import pandas as pd 87 | 88 | if pd.isna(metadata): 89 | return None 90 | if metadata and not isinstance(metadata, dict): 91 | raise TypeError( 92 | f"metadata must be a dict but its {type(metadata)} meta = {metadata}" 93 | ) 94 | return json.dumps(metadata, ensure_ascii=False) 95 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Optional, Any, Dict 3 | from pydantic import BaseModel 4 | 5 | 6 | class DenseModelMetadata(BaseModel): 7 | name: str 8 | tokenizer: Optional[str] = None 9 | dimension: int 10 | 11 | 12 | class SparseModelMetdata(BaseModel): 13 | name: Optional[str] = None 14 | tokenizer: Optional[str] = None 15 | 16 | 17 | def get_time_now() -> str: 18 | return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") 19 | 20 | 21 | class DatasetMetadata(BaseModel): 22 | name: str 23 | created_at: str 24 | documents: int 25 | queries: int 26 | source: Optional[str] = None 27 | license: Optional[str] = None 28 | 
bucket: Optional[str] = None 29 | task: Optional[str] = None 30 | dense_model: DenseModelMetadata 31 | sparse_model: Optional[SparseModelMetdata] = None 32 | description: Optional[str] = None 33 | tags: Optional[List[str]] = None 34 | args: Optional[Dict[str, Any]] = None 35 | 36 | @staticmethod 37 | def empty() -> "DatasetMetadata": 38 | return DatasetMetadata( 39 | name="", 40 | created_at=get_time_now(), 41 | documents=0, 42 | queries=0, 43 | dense_model=DenseModelMetadata(name="", dimension=0), 44 | ) 45 | 46 | def is_empty(self) -> bool: 47 | return self.name == "" and self.documents == 0 and self.queries == 0 48 | -------------------------------------------------------------------------------- /pinecone_datasets/fs.py: -------------------------------------------------------------------------------- 1 | from typing import Union, TYPE_CHECKING 2 | from importlib import import_module 3 | 4 | from pinecone_datasets import cfg 5 | 6 | if TYPE_CHECKING: 7 | import gcsfs 8 | import s3fs 9 | from fsspec.implementations.local import LocalFileSystem 10 | 11 | CloudOrLocalFS = Union[gcsfs.GCSFileSystem, s3fs.S3FileSystem, LocalFileSystem] 12 | else: 13 | CloudOrLocalFS = Union[object] # type: ignore 14 | 15 | 16 | def get_cloud_fs(path: str, **kwargs) -> CloudOrLocalFS: 17 | """ 18 | returns a filesystem object for the given path, if it is a cloud storage path (gs:// or s3://) 19 | 20 | Args: 21 | path (str): the path to the file or directory 22 | **kwargs: additional arguments to pass to the filesystem constructor 23 | 24 | Returns: 25 | fs: Union[gcsfs.GCSFileSystem, s3fs.S3FileSystem] - the filesystem object 26 | """ 27 | is_anon = path == cfg.Storage.endpoint 28 | 29 | if path.startswith("gs://") or "storage.googleapis.com" in path: 30 | gcsfs = import_module("gcsfs") 31 | if kwargs.get("token", None): 32 | fs = gcsfs.GCSFileSystem(**kwargs) 33 | else: 34 | fs = gcsfs.GCSFileSystem(token="anon" if is_anon else None, **kwargs) 35 | elif path.startswith("s3://") or "s3.amazonaws.com" in path: 36 | s3fs = import_module("s3fs") 37 | fs = s3fs.S3FileSystem(anon=is_anon, **kwargs) 38 | else: 39 | local_fs = import_module("fsspec.implementations.local") 40 | fs = local_fs.LocalFileSystem() 41 | return fs 42 | -------------------------------------------------------------------------------- /pinecone_datasets/public.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | from .catalog import Catalog 3 | 4 | global catalog 5 | catalog = None 6 | 7 | 8 | def list_datasets(as_df=False, **kwargs) -> list: 9 | """ 10 | List all datasets in the catalog, optionally as a pandas DataFrame. 11 | Catalog is set using the `DATASETS_CATALOG_BASEPATH` environment variable. 12 | 13 | Args: 14 | as_df (bool, optional): Whether to return the list as a pandas DataFrame. Defaults to False. 15 | 16 | Returns: 17 | list: A list of dataset names; or 18 | df: A pandas DataFrame of dataset names and metadata 19 | 20 | Example: 21 | 22 | ```python 23 | from pinecone_datasets import list_datasets 24 | list_datasets() # -> ['dataset1', 'dataset2', ...] 
25 | list_datasets(as_df=True) # -> pandas DataFrame of dataset names and metadata 26 | ``` 27 | 28 | """ 29 | global catalog 30 | if catalog is None: 31 | catalog = Catalog() 32 | return catalog.list_datasets(as_df=as_df) 33 | 34 | 35 | def load_dataset(dataset_id: str, **kwargs) -> Dataset: 36 | """ 37 | Load a dataset from the catalog 38 | 39 | Args: 40 | dataset_id (str): The name of the dataset to load 41 | **kwargs: Additional keyword arguments to pass to the Dataset constructor, e.g. `engine='polars'` 42 | 43 | Returns: 44 | Dataset: A Dataset object 45 | 46 | Example: 47 | 48 | ```python 49 | from pinecone_datasets import load_dataset 50 | dataset = load_dataset("dataset_name") 51 | ``` 52 | """ 53 | global catalog 54 | if catalog is None: 55 | catalog = Catalog() 56 | return catalog.load_dataset(dataset_id=dataset_id, **kwargs) 57 | -------------------------------------------------------------------------------- /pinecone_datasets/tqdm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | __all__ = ["tqdm"] 4 | 5 | try: 6 | # Suppress the specific tqdm warning about IProgress 7 | with warnings.catch_warnings(): 8 | warnings.filterwarnings("ignore", category=UserWarning, module="tqdm") 9 | warnings.filterwarnings("ignore", message="IProgress not found.*") 10 | from tqdm.auto import tqdm 11 | except ImportError: 12 | # Fallback: define a dummy tqdm that supports the same interface. 13 | class tqdm: # type: ignore 14 | def __init__(self, iterable=None, total=None, desc="", **kwargs): 15 | self.iterable = iterable 16 | self.total = total 17 | self.desc = desc 18 | # You can store additional kwargs if needed 19 | 20 | def __iter__(self): 21 | # Just iterate over the underlying iterable 22 | for item in self.iterable: 23 | yield item 24 | 25 | def update(self, n=1): 26 | # No-op: This stub doesn't track progress 27 | pass 28 | 29 | def __enter__(self): 30 | # Allow use as a context manager 31 | return self 32 | 33 | def __exit__(self, exc_type, exc_value, traceback): 34 | # Nothing to cleanup 35 | pass 36 | -------------------------------------------------------------------------------- /pinecone_datasets/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import functools 3 | 4 | 5 | def deprecated(func): 6 | @functools.wraps(func) 7 | def wrapper(*args, **kwargs): 8 | warnings.warn( 9 | f"{func.__name__} is deprecated and will be removed in a future version.", 10 | category=DeprecationWarning, 11 | stacklevel=2, 12 | ) 13 | return func(*args, **kwargs) 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pinecone-datasets" 3 | version = "1.0.2" 4 | description = "Load datasets to explore Pinecone" 5 | authors = ["Pinecone Support "] 6 | maintainers = [ 7 | "Amnon Catav ", 8 | "Roy Miara ", 9 | "Jen Hamon ", 10 | ] 11 | readme = "README.md" 12 | 13 | 14 | [tool.poetry.dependencies] 15 | python = ">=3.9,<3.14" 16 | fsspec = "^2025.2.0" 17 | gcsfs = "^2025.2.0" 18 | s3fs = "^2025.2.0" 19 | pydantic = "^2.0.0" 20 | pandas = "^2.0.0" 21 | pyarrow = "^18.0.0" 22 | 23 | 24 | [tool.poetry.group.dev] 25 | optional = true 26 | 27 | [tool.poetry.group.dev.dependencies] 28 | ipykernel = "^6.21.1" 29 | black = "^23.1.0" 30 | pytest-cov = "^4.0.0" 31 | mypy = "^1.0.1" 32 | pytest = "^7.2.2" 33 | 
pytest-html = "^3.2.0" 34 | pdoc = "^13.0.0" 35 | toml = "^0.10.2" 36 | pytest-xdist = "^3.3.1" 37 | tuna = "^0.5.11" 38 | tqdm = "^4.67.1" 39 | 40 | 41 | 42 | [build-system] 43 | requires = ["poetry-core"] 44 | build-backend = "poetry.core.masonry.api" 45 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Tests 2 | -------------------------------------------------------------------------------- /tests/integration/test_io_local.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import logging 4 | from pandas.testing import assert_frame_equal as pd_assert_frame_equal 5 | 6 | from pinecone_datasets import Dataset, Catalog, DenseModelMetadata, DatasetMetadata 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | d = pd.DataFrame( 11 | [ 12 | { 13 | "id": "1", 14 | "values": [0.1, 0.2, 0.3], 15 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 16 | "metadata": {"title": "title1", "url": "url1"}, 17 | "blob": None, 18 | }, 19 | { 20 | "id": "2", 21 | "values": [0.4, 0.5, 0.6], 22 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 23 | "metadata": {"title": "title2", "url": "url2"}, 24 | "blob": None, 25 | }, 26 | ] 27 | ) 28 | 29 | q = pd.DataFrame( 30 | [ 31 | { 32 | "vector": [0.1, 0.2, 0.3], 33 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 34 | "filter": {"filter1": {"$eq": "filter1"}}, 35 | "top_k": 1, 36 | "blob": None, 37 | }, 38 | { 39 | "vector": [0.4, 0.5, 0.6], 40 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 41 | "filter": {"filter2": {"$eq": "filter2"}}, 42 | "top_k": 2, 43 | "blob": None, 44 | }, 45 | ] 46 | ) 47 | 48 | 49 | class TestLocalIO: 50 | def test_empty_catalog(self, tmpdir): 51 | catalog = Catalog(base_path=str(tmpdir.mkdir("catalog"))) 52 | assert catalog.list_datasets(as_df=False) == [] 53 | 54 | def test_io_write_to_local(self, tmpdir): 55 | dataset_name = "test_io_dataset" 56 | metadata = DatasetMetadata( 57 | name=dataset_name, 58 | created_at="2021-01-01 00:00:00.000000", 59 | documents=2, 60 | queries=2, 61 | dense_model=DenseModelMetadata( 62 | name="ada2", 63 | dimension=2, 64 | ), 65 | ) 66 | ds = Dataset.from_pandas(documents=d, queries=q, metadata=metadata) 67 | assert ds._fs is None 68 | assert ds._dataset_path is None 69 | 70 | catalog_path = tmpdir.mkdir("catalog") 71 | catalog = Catalog(base_path=str(catalog_path)) 72 | catalog.save_dataset(ds) 73 | 74 | loaded_ds = catalog.load_dataset(dataset_name) 75 | assert loaded_ds.metadata == metadata 76 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 77 | pd_assert_frame_equal(loaded_ds.queries, ds.queries) 78 | assert loaded_ds._fs is not None 79 | assert loaded_ds._dataset_path is not None 80 | 81 | def test_io_no_queries(self, tmpdir): 82 | dataset_name = "test_io_dataset_no_q" 83 | metadata = DatasetMetadata( 84 | name=dataset_name, 85 | created_at="2021-01-01 00:00:00.000000", 86 | documents=2, 87 | queries=0, 88 | dense_model=DenseModelMetadata( 89 | name="ada2", 90 | dimension=2, 91 | ), 92 | ) 93 | ds = Dataset.from_pandas(documents=d, queries=None, metadata=metadata) 94 | assert ds._fs is None 95 | assert ds._dataset_path is None 96 | 97 | catalog_path = tmpdir.mkdir("catalog") 98 | catalog = Catalog(base_path=str(catalog_path)) 99 | catalog.save_dataset(ds) 100 | 101 | loaded_ds = 
catalog.load_dataset(dataset_name) 102 | assert loaded_ds.metadata == metadata 103 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 104 | assert loaded_ds.queries.empty 105 | assert loaded_ds._fs is not None 106 | assert loaded_ds._dataset_path is not None 107 | 108 | def test_load_from_cloud_and_save_to_local(self, tmpdir): 109 | public_catalog = Catalog() 110 | ds = public_catalog.load_dataset("langchain-python-docs-text-embedding-ada-002") 111 | 112 | local_catalog_path = tmpdir.mkdir("catalog") 113 | local_catalog = Catalog(base_path=str(local_catalog_path)) 114 | local_catalog.save_dataset(ds) 115 | 116 | logger.debug(f"wrote data to local_catalog_path: {str(local_catalog_path)}") 117 | 118 | loaded_ds = local_catalog.load_dataset(ds.metadata.name) 119 | # Assert frames have the same number of rows 120 | assert loaded_ds.documents.shape[0] == ds.documents.shape[0] 121 | assert loaded_ds.queries.shape[0] == ds.queries.shape[0] 122 | # Assert frames have the same columns 123 | assert loaded_ds.documents.columns.tolist() == ds.documents.columns.tolist() 124 | assert loaded_ds.queries.columns.tolist() == ds.queries.columns.tolist() 125 | -------------------------------------------------------------------------------- /tests/integration/test_io_private_cloud_storage_gcs.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | import random 5 | from pandas.testing import assert_frame_equal as pd_assert_frame_equal 6 | 7 | from pinecone_datasets import ( 8 | Dataset, 9 | Catalog, 10 | list_datasets, 11 | DatasetMetadata, 12 | DenseModelMetadata, 13 | ) 14 | import os 15 | 16 | GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 17 | if not GOOGLE_APPLICATION_CREDENTIALS: 18 | raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set") 19 | 20 | d = pd.DataFrame( 21 | [ 22 | { 23 | "id": "1", 24 | "values": [0.1, 0.2, 0.3], 25 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 26 | "metadata": {"title": "title1", "url": "url1"}, 27 | "blob": None, 28 | }, 29 | { 30 | "id": "2", 31 | "values": [0.4, 0.5, 0.6], 32 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 33 | "metadata": {"title": "title2", "url": "url2"}, 34 | "blob": None, 35 | }, 36 | ] 37 | ) 38 | 39 | q = pd.DataFrame( 40 | [ 41 | { 42 | "vector": [0.1, 0.2, 0.3], 43 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 44 | "filter": {"filter1": {"$eq": "filter1"}}, 45 | "top_k": 1, 46 | "blob": None, 47 | }, 48 | { 49 | "vector": [0.4, 0.5, 0.6], 50 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 51 | "filter": {"filter2": {"$eq": "filter2"}}, 52 | "top_k": 2, 53 | "blob": None, 54 | }, 55 | ] 56 | ) 57 | 58 | 59 | class TestSaveDatasetToGCS: 60 | def test_io_cloud_storage(self): 61 | dataset_name = "test_io_dataset_" + str(random.randint(0, 1000000)) 62 | metadata = DatasetMetadata( 63 | name=dataset_name, 64 | created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"), 65 | documents=2, 66 | queries=2, 67 | dense_model=DenseModelMetadata( 68 | name="ada2", 69 | dimension=2, 70 | ), 71 | ) 72 | ds = Dataset.from_pandas(documents=d, queries=q, metadata=metadata) 73 | 74 | catalog = Catalog(base_path="gs://pinecone-datasets-test/catalog") 75 | catalog.save_dataset(dataset=ds) 76 | 77 | loaded_ds = catalog.load_dataset(dataset_name) 78 | print(catalog.list_datasets(as_df=True)) 79 | 80 | assert loaded_ds.metadata == metadata 
81 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 82 | pd_assert_frame_equal(loaded_ds.queries, ds.queries) 83 | -------------------------------------------------------------------------------- /tests/integration/test_list_public_datasets.py: -------------------------------------------------------------------------------- 1 | from pinecone_datasets import list_datasets 2 | 3 | 4 | class TestListDatasets: 5 | def test_list_datasets(self): 6 | datasets = list_datasets() 7 | assert len(datasets) > 0 8 | assert "quora_all-MiniLM-L6-bm25" in datasets 9 | -------------------------------------------------------------------------------- /tests/integration/test_load_public_dataset.py: -------------------------------------------------------------------------------- 1 | from pinecone_datasets import load_dataset 2 | 3 | 4 | class TestLoadDataset: 5 | def test_load_dataset(self): 6 | ds = load_dataset("langchain-python-docs-text-embedding-ada-002") 7 | assert ds is not None 8 | 9 | headdf = ds.head() 10 | assert headdf is not None 11 | assert len(headdf) > 0 12 | columns = headdf.columns.tolist() 13 | assert "id" in columns 14 | assert "values" in columns 15 | assert "sparse_values" in columns 16 | assert "metadata" in columns 17 | assert "blob" in columns 18 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/pinecone-datasets/247837d84163450f15ce7d3a9919b13a9325ac43/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_basics.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pinecone_datasets import __version__ 3 | 4 | if sys.version_info > (3, 11): 5 | import tomllib as toml 6 | 7 | with open("pyproject.toml", "rb") as f: 8 | assert toml.load(f)["tool"]["poetry"]["version"] == __version__ 9 | else: 10 | import toml 11 | 12 | with open("pyproject.toml") as f: 13 | assert toml.load(f)["tool"]["poetry"]["version"] == __version__ 14 | 15 | 16 | def test_version(): 17 | assert __version__ == "1.0.2" 18 | -------------------------------------------------------------------------------- /tests/unit/test_dataset_metadata.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pinecone_datasets.dataset_metadata import DatasetMetadata, DenseModelMetadata 4 | 5 | from pydantic import ValidationError 6 | 7 | 8 | def test_metadata_fields_minimal(): 9 | try: 10 | meta = DatasetMetadata( 11 | name="test", 12 | documents=1, 13 | created_at="2021-01-01 00:00:00.000000", 14 | queries=1, 15 | dense_model=DenseModelMetadata( 16 | name="ada2", 17 | dimension=2, 18 | ), 19 | ) 20 | except ValidationError: 21 | pytest.fail("Minimal valid metadata unexpectedly raised a ValidationError") 22 | 23 | 24 | def test_validation_error_mandatory_field(): 25 | with pytest.raises(ValidationError): 26 | meta = DatasetMetadata( 27 | documents=1, 28 | queries=1, 29 | dense_model=DenseModelMetadata( 30 | name="ada2", 31 | dimension=2, 32 | ), 33 | ) 34 | 35 | 36 | def test_validation_error_optional_field(): 37 | with pytest.raises(ValidationError): 38 | meta = DatasetMetadata( 39 | name="test", 40 | documents=1, 41 | queries=1, 42 | dense_model=DenseModelMetadata(name="ada2", dimension=2), 43 | tags="test", 44 | ) 45 | --------------------------------------------------------------------------------
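For orientation, the public-dataset and metadata tests above exercise a small surface: list_datasets() to enumerate the public catalog, load_dataset(name) to fetch a dataset, and the loaded Dataset's head() and metadata accessors. The following is a minimal usage sketch rather than repository code; it assumes network access to the public catalog and reuses the dataset names asserted in the tests above.

    from pinecone_datasets import list_datasets, load_dataset

    # Enumerate the public catalog (the tests assert this dataset is listed).
    names = list_datasets()
    print("quora_all-MiniLM-L6-bm25" in names)

    # Load one public dataset and inspect its schema and metadata.
    ds = load_dataset("langchain-python-docs-text-embedding-ada-002")
    print(ds.head().columns.tolist())  # id, values, sparse_values, metadata, blob
    print(ds.metadata.name, ds.metadata.documents, ds.metadata.dense_model.name)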
/tests/unit/test_fs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import s3fs 3 | import gcsfs 4 | 5 | from pinecone_datasets.fs import get_cloud_fs 6 | 7 | 8 | def test_get_cloud_fs_nullability(): 9 | assert get_cloud_fs("s3://pinecone-datasets") is not None 10 | assert get_cloud_fs("gs://pinecone-datasets") is not None 11 | assert get_cloud_fs("pinecone-datasets") is not None 12 | 13 | 14 | def test_get_cloud_fs_s3(): 15 | fs = get_cloud_fs("s3://not-pinecone-datasets") 16 | assert isinstance(fs, s3fs.S3FileSystem) 17 | assert fs.anon is False 18 | 19 | 20 | def test_get_cloud_fs_gs(): 21 | fs = get_cloud_fs("gs://not-pinecone-datasets") 22 | assert isinstance(fs, gcsfs.GCSFileSystem) 23 | assert fs.credentials.token is None 24 | 25 | 26 | def test_get_cloud_fs_on_pinecone_endpoint(): 27 | fs = get_cloud_fs("gs://pinecone-datasets-dev") 28 | assert isinstance(fs, gcsfs.GCSFileSystem) 29 | assert fs.credentials.token == "anon" 30 | -------------------------------------------------------------------------------- /tests/unit/test_private_datasets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from pinecone_datasets import list_datasets, load_dataset, Dataset 5 | from tests.utils.test_public_datasets import deep_list_cmp 6 | 7 | 8 | @pytest.mark.skip(reason="Need to figure out credentials to run these tests") 9 | class TestPrivateDatasets: 10 | def test_list_private_datasets(self): 11 | os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://ram-datasets" 12 | lst = list_datasets(endpoint_url="https://storage.googleapis.com") 13 | print(lst) 14 | del os.environ["DATASETS_CATALOG_BASEPATH"] 15 | assert "test_dataset" in lst 16 | 17 | def test_load_private_dataset(self): 18 | os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://ram-datasets" 19 | ds = load_dataset("test_dataset", endpoint_url="https://storage.googleapis.com") 20 | assert isinstance(ds, Dataset) 21 | assert ds.queries.shape[0] == 2 22 | assert ds.documents.shape[0] == 2 23 | assert deep_list_cmp( 24 | ds.documents.columns, ["id", "values", "sparse_values", "metadata"] 25 | ) 26 | del os.environ["DATASETS_CATALOG_BASEPATH"] 27 | 28 | def test_dataset_from_path(self): 29 | dataset_path = "s3://ram-datasets/test_dataset" 30 | ds = Dataset.from_path( 31 | dataset_path, endpoint_url="https://storage.googleapis.com" 32 | ) 33 | assert isinstance(ds, Dataset) 34 | assert ds.queries.shape[0] == 2 35 | assert ds.documents.shape[0] == 2 36 | assert deep_list_cmp( 37 | ds.documents.columns, ["id", "values", "sparse_values", "metadata"] 38 | ) 39 | -------------------------------------------------------------------------------- /tests/unit/test_schema_validation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | import pandas as pd 5 | from pydantic import ValidationError 6 | 7 | from pinecone_datasets import Dataset, DatasetMetadata, DenseModelMetadata 8 | 9 | 10 | def test_datasets_schema_name_happy(tmpdir): 11 | documents_data = [ 12 | { 13 | "id": "1", 14 | "values": [0.1, 0.2, 0.3], 15 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 16 | "metadata": {"title": "title1", "url": "url1"}, 17 | "blob": None, 18 | }, 19 | { 20 | "id": "2", 21 | "values": [0.4, 0.5, 0.6], 22 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 23 | "metadata": {"title": "title2", "url": "url2"}, 24 | "blob": None, 25 | }, 26 
| ] 27 | 28 | dataset_name = "test_dataset" 29 | dataset_path = tmpdir.mkdir(dataset_name) 30 | documents_path = dataset_path.mkdir("documents") 31 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 32 | 33 | queries_data = [ 34 | { 35 | "vector": [0.1, 0.2, 0.3], 36 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 37 | "filter": {"filter1": {"$eq": "filter1"}}, 38 | "top_k": 1, 39 | "blob": None, 40 | }, 41 | { 42 | "vector": [0.4, 0.5, 0.6], 43 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 44 | "filter": {"filter2": {"$eq": "filter2"}}, 45 | "top_k": 2, 46 | "blob": None, 47 | }, 48 | ] 49 | 50 | queries_path = dataset_path.mkdir("queries") 51 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 52 | 53 | metadata: DatasetMetadata = DatasetMetadata( 54 | name=dataset_name, 55 | created_at="2021-01-01 00:00:00.000000", 56 | documents=2, 57 | queries=2, 58 | dense_model=DenseModelMetadata( 59 | name="ada2", 60 | dimension=2, 61 | ), 62 | ) 63 | 64 | with open(dataset_path.join("metadata.json"), "w") as f: 65 | json.dump(metadata.model_dump(), f) 66 | 67 | ds = Dataset.from_path(str(dataset_path)) 68 | assert isinstance(ds, Dataset) 69 | assert ds.queries.shape[0] == 2 70 | assert ds.documents.shape[0] == 2 71 | 72 | 73 | def test_datasets_schema_name_documents_missing_property(tmpdir): 74 | documents_data = [ 75 | { 76 | "id": "1", 77 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 78 | "metadata": {"title": "title1", "url": "url1"}, 79 | "blob": None, 80 | }, 81 | { 82 | "id": "2", 83 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 84 | "metadata": {"title": "title2", "url": "url2"}, 85 | "blob": None, 86 | }, 87 | ] 88 | 89 | dataset_name = "test_dataset" 90 | dataset_path = tmpdir.mkdir(dataset_name) 91 | documents_path = dataset_path.mkdir("documents") 92 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 93 | 94 | queries_data = [ 95 | { 96 | "vector": [0.1, 0.2, 0.3], 97 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 98 | "filter": {"filter1": {"$eq": "filter1"}}, 99 | "top_k": 1, 100 | "blob": None, 101 | }, 102 | { 103 | "vector": [0.4, 0.5, 0.6], 104 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 105 | "filter": {"filter2": {"$eq": "filter2"}}, 106 | "top_k": 2, 107 | "blob": None, 108 | }, 109 | ] 110 | 111 | queries_path = dataset_path.mkdir("queries") 112 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 113 | 114 | metadata: DatasetMetadata = DatasetMetadata( 115 | name=dataset_name, 116 | created_at="2021-01-01 00:00:00.000000", 117 | documents=2, 118 | queries=2, 119 | dense_model=DenseModelMetadata( 120 | name="ada2", 121 | dimension=2, 122 | ), 123 | ) 124 | 125 | with open(dataset_path.join("metadata.json"), "w") as f: 126 | json.dump(metadata.model_dump(), f) 127 | 128 | with pytest.raises(ValueError): 129 | ds = Dataset.from_path(str(dataset_path)) 130 | assert isinstance(ds, Dataset) 131 | assert ds.queries.shape[0] == 2 132 | assert ds.documents.shape[0] == 2 133 | 134 | 135 | def test_datasets_schema_name_queries_missing_property(tmpdir): 136 | documents_data = [ 137 | { 138 | "id": "1", 139 | "values": [0.1, 0.2, 0.3], 140 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 141 | "metadata": {"title": "title1", "url": "url1"}, 142 | "blob": None, 143 | }, 144 | { 145 | "id": "2", 146 | "values": [0.4, 0.5,
0.6], 147 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 148 | "metadata": {"title": "title2", "url": "url2"}, 149 | "blob": None, 150 | }, 151 | ] 152 | 153 | dataset_name = "test_dataset" 154 | dataset_path = tmpdir.mkdir(dataset_name) 155 | documents_path = dataset_path.mkdir("documents") 156 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 157 | 158 | queries_data = [ 159 | { 160 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 161 | "filter": {"filter1": {"$eq": "filter1"}}, 162 | "top_k": 1, 163 | }, 164 | { 165 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 166 | "filter": {"filter2": {"$eq": "filter2"}}, 167 | "top_k": 2, 168 | }, 169 | ] 170 | 171 | queries_path = dataset_path.mkdir("queries") 172 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 173 | 174 | metadata: DatasetMetadata = DatasetMetadata( 175 | name=dataset_name, 176 | created_at="2021-01-01 00:00:00.000000", 177 | documents=2, 178 | queries=2, 179 | dense_model=DenseModelMetadata( 180 | name="ada2", 181 | dimension=2, 182 | ), 183 | ) 184 | 185 | with open(dataset_path.join("metadata.json"), "w") as f: 186 | json.dump(metadata.model_dump(), f) 187 | 188 | with pytest.raises(ValueError): 189 | ds = Dataset.from_path(str(dataset_path)) 190 | assert isinstance(ds, Dataset) 191 | assert ds.queries.shape[0] == 2 192 | assert ds.documents.shape[0] == 2 193 | 194 | 195 | def test_datasets_schema_metadata_wrong(tmpdir): 196 | with pytest.raises(ValidationError): 197 | metadata: DatasetMetadata = DatasetMetadata( 198 | created_at="2021-01-01 00:00:00.000000", 199 | documents=2, 200 | queries=2, 201 | dense_model=DenseModelMetadata( 202 | name="ada2", 203 | dimension=2, 204 | ), 205 | ) 206 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pytest 4 | from pinecone_datasets.dataset_fsreader import DatasetFSReader 5 | from pinecone_datasets.dataset_fswriter import DatasetFSWriter 6 | from pinecone_datasets.dataset import Dataset 7 | 8 | 9 | def test_read_pandas_dataframe(tmpdir): 10 | d = [ 11 | { 12 | "id": "1", 13 | "values": [0.1, 0.2, 0.3], 14 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 15 | "metadata": {"title": "title1", "url": "url1"}, 16 | "blob": None, 17 | }, 18 | { 19 | "id": "2", 20 | "values": [0.4, 0.5, 0.6], 21 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 22 | "metadata": {"title": "title2", "url": "url2"}, 23 | "blob": None, 24 | }, 25 | ] 26 | df = pd.DataFrame(d) 27 | 28 | schema_documents = [ 29 | ("id", False, None), 30 | ("values", False, None), 31 | ("sparse_values", True, None), 32 | ("metadata", True, None), 33 | ("blob", True, None), 34 | ] 35 | 36 | # create tempdir 37 | dataset_name = "test_read_pandas_dataframe" 38 | dataset_path = tmpdir.mkdir(dataset_name) 39 | 40 | read_df = Dataset._read_pandas_dataframe( 41 | df, column_mapping=None, schema=schema_documents 42 | ) 43 | assert isinstance(read_df, pd.DataFrame) 44 | 45 | # check if the dataframe is the same 46 | pd.testing.assert_frame_equal(df, read_df) 47 | 48 | # test None case 49 | none_df = Dataset._read_pandas_dataframe( 50 | None, column_mapping=None, schema=schema_documents 51 | ) 52 | assert none_df.empty 53 | 54 | for k, _, _ in schema_documents: 55 | assert k in 
read_df.columns 56 | assert k in none_df.columns 57 | 58 | 59 | def test_convert_metadata_from_dict_to_json(): 60 | d1 = {"a": 1, "b": 2} 61 | s1 = '{"a": 1, "b": 2}' 62 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d1) == s1 63 | assert ( 64 | DatasetFSReader._convert_metadata_from_json_to_dict( 65 | DatasetFSWriter._convert_metadata_from_dict_to_json(d1) 66 | ) 67 | == d1 68 | ) 69 | 70 | d2 = {"a": 1, "b": None} 71 | s2 = '{"a": 1, "b": null}' 72 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d2) == s2 73 | assert ( 74 | DatasetFSReader._convert_metadata_from_json_to_dict( 75 | DatasetFSWriter._convert_metadata_from_dict_to_json(d2) 76 | ) 77 | == d2 78 | ) 79 | 80 | d3 = None 81 | s3 = None 82 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d3) == s3 83 | assert ( 84 | DatasetFSReader._convert_metadata_from_json_to_dict( 85 | DatasetFSWriter._convert_metadata_from_dict_to_json(d3) 86 | ) 87 | == d3 88 | ) 89 | 90 | d4 = {"a": 1, "b": np.nan} 91 | s4 = '{"a": 1, "b": NaN}' 92 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d4) == s4 93 | 94 | # TODO: np.nan is serialized as the non-standard JSON literal NaN and parses back to float("nan"); since NaN != NaN, the round-trip equality below cannot pass as written (compare with np.isnan instead). 95 | # print({"a": 1, "b": np.nan}) 96 | # print(DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4))) 97 | # print(type(DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4))['b'])) 98 | # print(type(np.nan)) 99 | # assert DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4)) == d4 100 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/pinecone-datasets/247837d84163450f15ce7d3a9919b13a9325ac43/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_public_datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def is_dicts_equal(d1, d2): 5 | return d1.keys() == d2.keys() and recursive_dict_compare(d1, d2) 6 | 7 | 8 | def deep_list_cmp(l1, l2): 9 | same = True 10 | for l, r in zip(l1, l2): 11 | same = same and l == r 12 | return same 13 | 14 | 15 | def approx_deep_list_cmp(l1, l2): 16 | same = True 17 | for l, r in zip(l1, l2): 18 | same = same and np.isclose(l, r) 19 | return same 20 | 21 | 22 | def recursive_dict_compare(d1, d2): 23 | # Compare every key; nested dicts and list-like values are checked recursively. 24 | same = True 25 | for k, v in d1.items(): 26 | if isinstance(v, dict): 27 | same = same and recursive_dict_compare(v, d2[k]) 28 | elif isinstance(v, (list, np.ndarray)): 29 | same = same and deep_list_cmp(v, d2[k]) 30 | else: 31 | same = same and v == d2[k] 32 | return same 33 | --------------------------------------------------------------------------------
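Taken together, the IO and schema tests above describe one round trip: build a Dataset in memory with Dataset.from_pandas, persist it through a Catalog rooted at a base path, and load it back by name. Below is a minimal end-to-end sketch of that flow; it is an illustrative example rather than repository code, the dataset name "demo-dataset" is a placeholder, and per the cloud-storage test a gs:// or s3:// base path is expected to behave the same way as the local directory used here.

    import tempfile

    import pandas as pd

    from pinecone_datasets import Catalog, Dataset, DatasetMetadata, DenseModelMetadata

    # Two documents shaped like the canonical schema used throughout the tests.
    docs = pd.DataFrame(
        [
            {"id": "1", "values": [0.1, 0.2, 0.3], "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, "metadata": {"title": "title1", "url": "url1"}, "blob": None},
            {"id": "2", "values": [0.4, 0.5, 0.6], "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, "metadata": {"title": "title2", "url": "url2"}, "blob": None},
        ]
    )
    meta = DatasetMetadata(
        name="demo-dataset",
        created_at="2021-01-01 00:00:00.000000",
        documents=2,
        queries=0,
        dense_model=DenseModelMetadata(name="ada2", dimension=2),
    )

    ds = Dataset.from_pandas(documents=docs, queries=None, metadata=meta)

    catalog = Catalog(base_path=tempfile.mkdtemp())  # pre-created local directory, as in the tmpdir-based tests
    catalog.save_dataset(ds)  # persists the dataset under the catalog's base path
    loaded = catalog.load_dataset("demo-dataset")
    assert loaded.documents.shape[0] == 2 and loaded.queries.empty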