├── .github └── workflows │ ├── cd.yml │ ├── ci.yml │ └── docs.yml ├── .gitignore ├── MAINTAINERS.md ├── README.md ├── mypy.ini ├── pdoc_theme ├── syntax-highlighting.css └── theme.css ├── pinecone_datasets ├── __init__.py ├── catalog.py ├── cfg.py ├── dataset.py ├── dataset_fsreader.py ├── dataset_fswriter.py ├── dataset_metadata.py ├── fs.py ├── public.py ├── tqdm.py └── utils.py ├── pyproject.toml └── tests ├── __init__.py ├── integration ├── test_io_local.py ├── test_io_private_cloud_storage_gcs.py ├── test_list_public_datasets.py └── test_load_public_dataset.py ├── unit ├── __init__.py ├── test_basics.py ├── test_dataset_metadata.py ├── test_fs.py ├── test_private_datasets.py ├── test_schema_validation.py └── test_utils.py └── utils ├── __init__.py └── test_public_datasets.py /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | 7 | jobs: 8 | 9 | release: 10 | permissions: 11 | contents: write 12 | 13 | name: Release 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Install Poetry 19 | uses: snok/install-poetry@v1 20 | with: 21 | version: 1.5.0 22 | 23 | - name: Set Version 24 | run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV 25 | 26 | # - name: Create tag 27 | # uses: actions/github-script@v5 28 | # with: 29 | # script: | 30 | # github.rest.git.createRef({ 31 | # owner: context.repo.owner, 32 | # repo: context.repo.repo, 33 | # ref: 'refs/tags/V${{ env.VERSION }}', 34 | # sha: context.sha 35 | # }) 36 | 37 | - name: Build and publish to pypi 38 | run: | 39 | poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 40 | poetry publish --build 41 | 42 | - name: Create GH release 43 | uses: ncipollo/release-action@v1 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | with: 47 | tag: ${{ env.VERSION }} 48 | name: ${{ env.VERSION }} 49 | artifacts: "dist/*" 50 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | linting: 13 | name: Run lint and type checking 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ['3.10'] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install Poetry 27 | uses: snok/install-poetry@v1 28 | with: 29 | version: 1.5.0 30 | - name: install dependencies 31 | run: poetry install --with dev --all-extras 32 | 33 | - name: Run Black Check 34 | run: poetry run black --check . 35 | 36 | - name: Run mypy check 37 | run: poetry run mypy . 
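# NOTE: the run-tests job below exercises the unit test suite on every supported Python version and uploads an HTML report; the integration tests additionally require GCS credentials provided through repository secrets.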
38 | 39 | run-tests: 40 | name: Run tests 41 | needs: linting 42 | runs-on: ubuntu-latest 43 | strategy: 44 | fail-fast: false 45 | matrix: 46 | python-version: [3.9, '3.10', 3.11, 3.12, 3.13] 47 | 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Install Poetry 56 | uses: snok/install-poetry@v1 57 | with: 58 | version: 1.5.0 59 | - name: install dependencies 60 | run: poetry install --with dev --all-extras 61 | 62 | - name: Run pytest (unit tests) 63 | env: 64 | PY_VERSION: ${{ matrix.python-version }} 65 | # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 66 | # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 67 | run: poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/unit 68 | 69 | - name: upload pytest report.html 70 | uses: actions/upload-artifact@v4 71 | if: always() 72 | with: 73 | name: dataset-pytest-report-py${{ matrix.python-version }} 74 | path: report.html 75 | 76 | - name: Write google service account credentials to a file 77 | id: prepare-google-credentials 78 | shell: bash 79 | run: | 80 | secrets_file="$(mktemp)" 81 | echo "$GCS_SERVICE_ACCOUNT_CREDS_BASE64" | base64 -d > $secrets_file 82 | echo "google_credentials_file=$secrets_file" >> $GITHUB_OUTPUT 83 | env: 84 | GCS_SERVICE_ACCOUNT_CREDS_BASE64: '${{ secrets.GCS_SERVICE_ACCOUNT_CREDS_BASE64 }}' 85 | 86 | - name: Run pytest (integration tests) 87 | run: poetry run pytest tests/integration 88 | env: 89 | GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.prepare-google-credentials.outputs.google_credentials_file }} -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | # build the documentation whenever there are new commits on main 4 | on: workflow_dispatch 5 | 6 | # security: restrict permissions for CI jobs. 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | # Build the documentation and upload the static HTML files as an artifact. 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.10' 19 | - name: Install Poetry 20 | uses: snok/install-poetry@v1 21 | with: 22 | version: 1.5.0 23 | 24 | - run: poetry install --with dev --all-extras 25 | # ADJUST THIS: build your documentation into docs/. 26 | # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here. 27 | - run: poetry run pdoc -d google -o docs/ --logo https://d33wubrfki0l68.cloudfront.net/682006698903a55560c796b901fdfe4446c6d27a/a00ee/images/pinecone-logo.svg --search -t pdoc_theme ./pinecone_datasets 28 | 29 | - uses: actions/upload-pages-artifact@v1 30 | with: 31 | path: docs/ 32 | 33 | # This is a separate job so that only actions/deploy-pages has the necessary permissions. 
34 | deploy: 35 | needs: build 36 | runs-on: ubuntu-latest 37 | permissions: 38 | pages: write 39 | id-token: write 40 | environment: 41 | name: github-pages 42 | url: ${{ steps.deployment.outputs.page_url }} 43 | steps: 44 | - id: deployment 45 | uses: actions/deploy-pages@v1 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | examples/ 3 | internal_testing.ipynb 4 | .DS_Store 5 | dist/ 6 | .mypy_cache/ 7 | scratchpad.ipynb 8 | __pycache__/ 9 | .pytest_cache/ 10 | .coverage 11 | poetry.lock 12 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Pinecone Datasets 2 | 3 | ### Supported storage options 4 | 5 | pinecone_datasets can load datasets from Google Cloud Storage, Amazon S3, and local files. 6 | 7 | By default, the `load_dataset` and `list_datasets` functions will pull from Pinecone's public GCS bucket at `gs://pinecone-datasets-dev`, but you can interact with catalogs stored in other locations. 8 | 9 | ```python 10 | from pinecone_datasets import Catalog 11 | 12 | # Local catalog 13 | catalog = Catalog(base_path="/path/to/local/catalog") 14 | catalog.list_datasets() 15 | 16 | # Google Cloud catalog 17 | catalog = Catalog(base_path="gs://bucket-name") 18 | 19 | # S3 catalog 20 | s3_catalog = Catalog(base_path="s3://bucket-name") 21 | ``` 22 | 23 | If you are using Amazon S3 or Google Cloud to access private buckets, you can use environment variables to configure your credentials. For example, if you set a base_path starting with "gs://", the `gcsfs` package will attempt to find credentials by looking in cache locations used by `gcloud auth login` or by reading environment variables such as `GOOGLE_APPLICATION_CREDENTIALS`. 24 | 25 | ## Adding a new dataset to the public datasets repo 26 | 27 | Note: Only Pinecone employees with access to the bucket can complete this step. 28 | 29 | Prerequisites: 30 | 31 | 1. Install the Google Cloud CLI 32 | 2. Authenticate with `gcloud auth login` 33 | 34 | ```python 35 | from pinecone_datasets import Catalog, Dataset, DatasetMetadata, DenseModelMetadata 36 | 37 | # 1. Prepare pandas dataframes containing your embeddings 38 | documents_df = ... 39 | queries_df = ... 40 | 41 | # 2. Create metadata to describe the dataset 42 | from datetime import datetime 43 | metadata = DatasetMetadata( 44 | name="new-dataset-name", 45 | created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"), 46 | documents=len(documents_df), 47 | queries=len(queries_df), 48 | dense_model=DenseModelMetadata( 49 | name="ada2", 50 | dimension=2, 51 | ), 52 | ) 53 | 54 | # 3. Take all this, and instantiate a Dataset 55 | ds = Dataset.from_pandas( 56 | documents=documents_df, 57 | queries=queries_df, 58 | metadata=metadata 59 | ) 60 | 61 | # 4.
Save to catalog (requires the gcloud auth step above) 62 | catalog = Catalog(base_path="gs://pinecone-datasets-dev") 63 | catalog.save_dataset(ds) 64 | ``` 65 | 66 | Afterwards, verify that the new dataset appears in the listing and can be loaded: 67 | 68 | ```python 69 | from pinecone_datasets import list_datasets, load_dataset 70 | 71 | list_datasets(as_df=True) 72 | 73 | ds = load_dataset("new-dataset-name") 74 | ds.documents 75 | ds.head() 76 | ``` 77 | 78 | ### Expected dataset structure 79 | 80 | The package expects data to be laid out with the following directory structure: 81 | 82 | ├── my-subdir # base path under which all datasets are stored 83 | │ ├── my-dataset # name of the dataset 84 | │ │ ├── metadata.json # dataset metadata (optional; required for the dataset to be listed) 85 | │ │ ├── documents # dataset documents 86 | │ │ │ ├── file1.parquet 87 | │ │ │ └── file2.parquet 88 | │ │ ├── queries # dataset queries 89 | │ │ │ ├── file1.parquet 90 | │ │ │ └── file2.parquet 91 | └── ... 92 | 93 | The data schema is expected to be as follows: 94 | 95 | - `documents` directory contains parquet files with the following schema: 96 | - Mandatory: `id: str, values: list[float]` 97 | - Optional: `sparse_values: Dict: indices: List[int], values: List[float]`, `metadata: Dict`, `blob: dict` 98 | - note: `blob` is a dict that can contain arbitrary data. It is not returned when iterating over the dataset and is intended for storing additional data that is not part of the dataset schema, for example a document's text. In a future version this may become a first-class citizen in the dataset schema. 99 | - `queries` directory contains parquet files with the following schema: 100 | - Mandatory: `vector: list[float], top_k: int` 101 | - Optional: `sparse_vector: Dict: indices: List[int], values: List[float]`, `filter: Dict` 102 | - note: `filter` is a dict containing Pinecone filters; for more information see [here](https://docs.pinecone.io/docs/metadata-filtering) 103 | 104 | In addition, a metadata file is expected in the dataset directory, for example: `s3://my-bucket/my-dataset/metadata.json` 105 | 106 | ```python 107 | from pinecone_datasets.catalog import DatasetMetadata 108 | 109 | meta = DatasetMetadata( 110 | name="test_dataset", 111 | created_at="2023-02-17 14:17:01.481785", 112 | documents=2, 113 | queries=2, 114 | source="manual", 115 | bucket="LOCAL", 116 | task="unittests", 117 | dense_model={"name": "bert", "dimension": 3}, 118 | sparse_model={"name": "bm25"}, 119 | ) 120 | ``` 121 | 122 | The full metadata schema can be found in `pinecone_datasets.dataset_metadata.DatasetMetadata.schema` 123 | 124 | ### The 'blob' column 125 | 126 | Pinecone datasets ship with a `blob` column, which is intended for storing additional data that is not part of the dataset schema, for example a document's text. We provide a utility function to move data from the blob column to the metadata column. This is useful, for example, when upserting a dataset to an index and you want to store text data in the metadata. 127 | 128 | ```python 129 | from pinecone_datasets import import_documents_keys_from_blob_to_metadata 130 | 131 | new_dataset = import_documents_keys_from_blob_to_metadata(dataset, keys=["text"]) 132 | ``` 133 | 134 | ## Saving datasets 135 | 136 | You can save your dataset to a catalog managed by you, or directly to a local or remote path (GCS or S3).
137 | 138 | ### Saving a dataset to a Catalog 139 | 140 | To set your own catalog endpoint, set the environment variable `DATASETS_CATALOG_BASEPATH` to your bucket. Note that Pinecone Datasets uses the default authentication method for the storage type (`gcsfs` for GCS and `s3fs` for S3). 141 | 142 | After this environment variable is set, you can save your dataset to the catalog with `Catalog.save_dataset`: 143 | 144 | ```python 145 | from pinecone_datasets import Catalog, Dataset, DatasetMetadata 146 | metadata = DatasetMetadata(**{"name": "my-dataset", ...}) 147 | dataset = Dataset.from_pandas(documents, queries, metadata) 148 | catalog = Catalog()  # resolves its base path from DATASETS_CATALOG_BASEPATH 149 | catalog.save_dataset(dataset) 150 | ``` 151 | ### Saving to a path 152 | 153 | You can also save your dataset directly to a local path or a remote path (GCS or S3) with `DatasetFSWriter`. Note that Pinecone Datasets uses the default authentication method for the storage type (`gcsfs` for GCS and `s3fs` for S3). 154 | 155 | ```python 156 | from pinecone_datasets.dataset_fswriter import DatasetFSWriter 157 | dataset = Dataset.from_pandas(documents, queries, metadata) 158 | DatasetFSWriter.write_dataset("s3://my-bucket/my-subdir/my-dataset", dataset=dataset) 159 | ``` 160 | ## Running tests 161 | 162 | This project uses Poetry for dependency management. To start developing, run the following from the project root directory: 163 | 164 | ```bash 165 | poetry install --with dev 166 | ``` 167 | 168 | To run the unit tests locally, run: 169 | 170 | ```bash 171 | poetry run pytest tests/unit --cov pinecone_datasets 172 | ``` 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pinecone Datasets 2 | 3 | ## Install 4 | 5 | ```bash 6 | pip install pinecone-datasets 7 | ``` 8 | 9 | ### Loading public datasets 10 | 11 | Pinecone hosts a public catalog of datasets. You can list available datasets and load one by name using the `list_datasets` and `load_dataset` functions. These use the default catalog endpoint (currently a public GCS bucket) to list and load datasets. 12 | 13 | ```python 14 | from pinecone_datasets import list_datasets, load_dataset 15 | 16 | list_datasets() 17 | # ["quora_all-MiniLM-L6-bm25", ... ] 18 | 19 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 20 | 21 | dataset.head() 22 | 23 | # Prints 24 | # ┌─────┬───────────────────────────┬─────────────────────────────────────┬───────────────────┬──────┐ 25 | # │ id ┆ values ┆ sparse_values ┆ metadata ┆ blob │ 26 | # │ ┆ ┆ ┆ ┆ │ 27 | # │ str ┆ list[f32] ┆ struct[2] ┆ struct[3] ┆ │ 28 | # ╞═════╪═══════════════════════════╪═════════════════════════════════════╪═══════════════════╪══════╡ 29 | # │ 0 ┆ [0.118014, -0.069717, ... ┆ {[470065541, 52922727, ... 22364... ┆ {2017,12,"other"} ┆ .... │ 30 | # │ ┆ 0.0060... ┆ ┆ ┆ │ 31 | # └─────┴───────────────────────────┴─────────────────────────────────────┴───────────────────┴──────┘ 32 | ``` 33 | 34 | 35 | ## Usage - Accessing data 36 | 37 | Each dataset has three main attributes, `documents`, `queries`, and `metadata`, which are lazily loaded the first time they are accessed. You may notice a delay while the underlying parquet files are downloaded on first access. 38 | 39 | Pinecone Datasets is built on top of pandas. `documents` and `queries` are lazily-loaded pandas dataframes, which means you can use the full pandas API to access the data. In addition, we provide some helper functions to access the data in a more convenient way. 40 | 41 | The documents and queries dataframes are accessed through the `documents` and `queries` properties; the underlying data is only loaded when these properties are first accessed.
42 | 43 | ```python 44 | import pandas as pd 45 | from pinecone_datasets import list_datasets, load_dataset 46 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 47 | 48 | document_df: pd.DataFrame = dataset.documents 49 | 50 | query_df: pd.DataFrame = dataset.queries 51 | ``` 52 | 53 | 54 | ## Usage - Iterating over documents 55 | 56 | The `Dataset` class has helpers for iterating over your dataset. This is useful for upserting a dataset to an index, or for benchmarking. 57 | 58 | ```python 59 | 60 | # Batch iterator: yields lists of up to batch_size dicts with keys ("id", "values", "sparse_values", "metadata") 61 | dataset.iter_documents(batch_size=n) 62 | 63 | # Single-item iterator: yields dicts with keys ("vector", "sparse_vector", "filter", "top_k") 64 | dataset.iter_queries() 65 | ``` 66 | 67 | ### Upserting to Index 68 | 69 | To upsert data to a Pinecone index, first install the [Pinecone SDK](https://github.com/pinecone-io/pinecone-python-client) 70 | 71 | ```python 72 | from pinecone import Pinecone, ServerlessSpec 73 | from pinecone_datasets import load_dataset, list_datasets 74 | 75 | # See what datasets are available 76 | for ds in list_datasets(): 77 | print(ds) 78 | 79 | # Download embeddings data 80 | dataset = load_dataset("quora_all-MiniLM-L6-bm25") 81 | 82 | # Instantiate a Pinecone client using API key from app.pinecone.io 83 | pc = Pinecone(api_key='key') 84 | 85 | # Create a Pinecone index 86 | index_config = pc.create_index( 87 | name="demo-index", 88 | dimension=dataset.metadata.dense_model.dimension, 89 | spec=ServerlessSpec(cloud="aws", region="us-east-1") 90 | ) 91 | 92 | # Instantiate an index client 93 | index = pc.Index(host=index_config.host) 94 | 95 | # Upsert data from the dataset 96 | index.upsert_from_dataframe(df=dataset.documents) 97 | ``` 98 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | ignore_errors = True 4 | 5 | [mypy-pinecone_datasets.*] 6 | ignore_errors = False 7 | disallow_untyped_defs = True 8 | disallow_untyped_calls = True -------------------------------------------------------------------------------- /pdoc_theme/syntax-highlighting.css: -------------------------------------------------------------------------------- 1 | /* monokai color scheme, see pdoc/template/README.md */ 2 | pre { line-height: 125%; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 20px; } 4 | .pdoc-code .hll { background-color: #49483e } 5 | .pdoc-code { background: #272822; color: #f8f8f2 } 6 | .pdoc-code .c { color: #75715e } /* Comment */ 7 | .pdoc-code .err { color: #960050; background-color: #1e0010 } /* Error */ 8 | .pdoc-code .esc { color: #f8f8f2 } /* Escape */ 9 | .pdoc-code .g { color: #f8f8f2 } /* Generic */ 10 | .pdoc-code .k { color: #66d9ef } /* Keyword */ 11 | .pdoc-code .l { color: #ae81ff } /* Literal */ 12 | .pdoc-code .n { color: #f8f8f2 } /* Name */ 13 | .pdoc-code .o { color: #f92672 } /* Operator */ 14 | .pdoc-code .x { color: #f8f8f2 } /* Other */ 15 | .pdoc-code .p { color: #f8f8f2 } /* Punctuation */ 16 | .pdoc-code .ch { color: #75715e } /* Comment.Hashbang */ 17 | .pdoc-code .cm { color: #75715e } /* Comment.Multiline */ 18 | .pdoc-code .cp { color: #75715e } /* Comment.Preproc */ 19 | .pdoc-code .cpf { color: #75715e } /* Comment.PreprocFile */ 20 | .pdoc-code .c1 { color: #75715e } /* Comment.Single */ 21 | .pdoc-code .cs { color: #75715e } /* Comment.Special */ 22 |
.pdoc-code .gd { color: #f92672 } /* Generic.Deleted */ 23 | .pdoc-code .ge { color: #f8f8f2; font-style: italic } /* Generic.Emph */ 24 | .pdoc-code .gr { color: #f8f8f2 } /* Generic.Error */ 25 | .pdoc-code .gh { color: #f8f8f2 } /* Generic.Heading */ 26 | .pdoc-code .gi { color: #a6e22e } /* Generic.Inserted */ 27 | .pdoc-code .go { color: #66d9ef } /* Generic.Output */ 28 | .pdoc-code .gp { color: #f92672; font-weight: bold } /* Generic.Prompt */ 29 | .pdoc-code .gs { color: #f8f8f2; font-weight: bold } /* Generic.Strong */ 30 | .pdoc-code .gu { color: #75715e } /* Generic.Subheading */ 31 | .pdoc-code .gt { color: #f8f8f2 } /* Generic.Traceback */ 32 | .pdoc-code .kc { color: #66d9ef } /* Keyword.Constant */ 33 | .pdoc-code .kd { color: #66d9ef } /* Keyword.Declaration */ 34 | .pdoc-code .kn { color: #f92672 } /* Keyword.Namespace */ 35 | .pdoc-code .kp { color: #66d9ef } /* Keyword.Pseudo */ 36 | .pdoc-code .kr { color: #66d9ef } /* Keyword.Reserved */ 37 | .pdoc-code .kt { color: #66d9ef } /* Keyword.Type */ 38 | .pdoc-code .ld { color: #e6db74 } /* Literal.Date */ 39 | .pdoc-code .m { color: #ae81ff } /* Literal.Number */ 40 | .pdoc-code .s { color: #e6db74 } /* Literal.String */ 41 | .pdoc-code .na { color: #a6e22e } /* Name.Attribute */ 42 | .pdoc-code .nb { color: #f8f8f2 } /* Name.Builtin */ 43 | .pdoc-code .nc { color: #a6e22e } /* Name.Class */ 44 | .pdoc-code .no { color: #66d9ef } /* Name.Constant */ 45 | .pdoc-code .nd { color: #a6e22e } /* Name.Decorator */ 46 | .pdoc-code .ni { color: #f8f8f2 } /* Name.Entity */ 47 | .pdoc-code .ne { color: #a6e22e } /* Name.Exception */ 48 | .pdoc-code .nf { color: #a6e22e } /* Name.Function */ 49 | .pdoc-code .nl { color: #f8f8f2 } /* Name.Label */ 50 | .pdoc-code .nn { color: #f8f8f2 } /* Name.Namespace */ 51 | .pdoc-code .nx { color: #a6e22e } /* Name.Other */ 52 | .pdoc-code .py { color: #f8f8f2 } /* Name.Property */ 53 | .pdoc-code .nt { color: #f92672 } /* Name.Tag */ 54 | .pdoc-code .nv { color: #f8f8f2 } /* Name.Variable */ 55 | .pdoc-code .ow { color: #f92672 } /* Operator.Word */ 56 | .pdoc-code .w { color: #f8f8f2 } /* Text.Whitespace */ 57 | .pdoc-code .mb { color: #ae81ff } /* Literal.Number.Bin */ 58 | .pdoc-code .mf { color: #ae81ff } /* Literal.Number.Float */ 59 | .pdoc-code .mh { color: #ae81ff } /* Literal.Number.Hex */ 60 | .pdoc-code .mi { color: #ae81ff } /* Literal.Number.Integer */ 61 | .pdoc-code .mo { color: #ae81ff } /* Literal.Number.Oct */ 62 | .pdoc-code .sa { color: #e6db74 } /* Literal.String.Affix */ 63 | .pdoc-code .sb { color: #e6db74 } /* Literal.String.Backtick */ 64 | .pdoc-code .sc { color: #e6db74 } /* Literal.String.Char */ 65 | .pdoc-code .dl { color: #e6db74 } /* Literal.String.Delimiter */ 66 | .pdoc-code .sd { color: #e6db74 } /* Literal.String.Doc */ 67 | .pdoc-code .s2 { color: #e6db74 } /* Literal.String.Double */ 68 | .pdoc-code .se { color: #ae81ff } /* Literal.String.Escape */ 69 | .pdoc-code .sh { color: #e6db74 } /* Literal.String.Heredoc */ 70 | .pdoc-code .si { color: #e6db74 } /* Literal.String.Interpol */ 71 | .pdoc-code .sx { color: #e6db74 } /* Literal.String.Other */ 72 | .pdoc-code .sr { color: #e6db74 } /* Literal.String.Regex */ 73 | .pdoc-code .s1 { color: #e6db74 } /* Literal.String.Single */ 74 | .pdoc-code .ss { color: #e6db74 } /* Literal.String.Symbol */ 75 | .pdoc-code .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ 76 | .pdoc-code .fm { color: #a6e22e } /* Name.Function.Magic */ 77 | .pdoc-code .vc { color: #f8f8f2 } /* Name.Variable.Class */ 78 | .pdoc-code .vg { 
color: #f8f8f2 } /* Name.Variable.Global */ 79 | .pdoc-code .vi { color: #f8f8f2 } /* Name.Variable.Instance */ 80 | .pdoc-code .vm { color: #f8f8f2 } /* Name.Variable.Magic */ -------------------------------------------------------------------------------- /pdoc_theme/theme.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --pdoc-background: #212529; 3 | } 4 | 5 | .pdoc { 6 | --text: #f7f7f7; 7 | --muted: #9d9d9d; 8 | --link: #58a6ff; 9 | --link-hover: #3989ff; 10 | --code: #333; 11 | --active: #555; 12 | 13 | --accent: #343434; 14 | --accent2: #555; 15 | 16 | --nav-hover: rgba(0, 0, 0, 0.1); 17 | --name: #77C1FF; 18 | --def: #0cdd0c; 19 | --annotation: #00c037; 20 | } -------------------------------------------------------------------------------- /pinecone_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. include:: ../README.md 3 | """ 4 | 5 | __version__ = "1.0.2" 6 | 7 | 8 | from .public import list_datasets, load_dataset 9 | from .dataset_metadata import DatasetMetadata, DenseModelMetadata 10 | from .catalog import Catalog 11 | from .dataset import Dataset 12 | -------------------------------------------------------------------------------- /pinecone_datasets/catalog.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import json 4 | from typing import List, Optional, Union, TYPE_CHECKING 5 | 6 | import logging 7 | from pydantic import BaseModel, ValidationError, Field 8 | 9 | from .cfg import Storage 10 | from .fs import get_cloud_fs 11 | from .dataset import Dataset 12 | from .dataset_fswriter import DatasetFSWriter 13 | from .dataset_metadata import DatasetMetadata 14 | 15 | if TYPE_CHECKING: 16 | import pandas as pd 17 | else: 18 | pd = None 19 | 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Catalog(BaseModel): 25 | def __init__(self, base_path: Optional[str] = None, **kwargs): 26 | super().__init__(**kwargs) 27 | if base_path is None: 28 | self.base_path = os.environ.get( 29 | "DATASETS_CATALOG_BASEPATH", Storage.endpoint 30 | ) 31 | else: 32 | self.base_path = base_path 33 | 34 | base_path: str = Field(default=None) 35 | datasets: List[DatasetMetadata] = Field(default_factory=list) 36 | 37 | def load(self, **kwargs) -> "Catalog": 38 | """Loads metadata about all datasets from the catalog.""" 39 | fs = get_cloud_fs(self.base_path, **kwargs) 40 | collected_datasets = [] 41 | 42 | metadata_files_glob_path = os.path.join(self.base_path, "*", "metadata.json") 43 | for metadata_path in fs.glob(metadata_files_glob_path): 44 | with fs.open(metadata_path) as f: 45 | try: 46 | this_dataset_json = json.load(f) 47 | except json.JSONDecodeError: 48 | warnings.warn( 49 | f"Not a JSON: Invalid metadata.json for {metadata_path}, skipping" 50 | ) 51 | continue 52 | 53 | try: 54 | this_dataset = DatasetMetadata(**this_dataset_json) 55 | collected_datasets.append(this_dataset) 56 | except ValidationError as e: 57 | warnings.warn( 58 | f"metadata file for dataset: {metadata_path} is not valid, skipping: {e}" 59 | ) 60 | continue 61 | 62 | self.datasets = collected_datasets 63 | logger.info(f"Loaded {len(self.datasets)} datasets from {self.base_path}") 64 | return self 65 | 66 | def list_datasets(self, as_df: bool) -> Union[List[str], "pd.DataFrame"]: 67 | """Lists all datasets in the catalog.""" 68 | if self.datasets is None or len(self.datasets) == 0: 69 | self.load() 70 | 71 | import 
pandas as pd 72 | 73 | if as_df: 74 | return pd.DataFrame([ds.model_dump() for ds in self.datasets]) 75 | else: 76 | return [dataset.name for dataset in self.datasets] 77 | 78 | def load_dataset(self, dataset_id: str, **kwargs) -> "Dataset": 79 | """Loads the dataset from the catalog.""" 80 | ds_path = os.path.join(str(self.base_path), dataset_id) 81 | return Dataset.from_path(dataset_path=ds_path, **kwargs) 82 | 83 | def save_dataset( 84 | self, 85 | dataset: "Dataset", 86 | **kwargs, 87 | ): 88 | """ 89 | Save a dataset to the catalog. 90 | """ 91 | ds_path = os.path.join(self.base_path, dataset.metadata.name) 92 | DatasetFSWriter.write_dataset(dataset_path=ds_path, dataset=dataset, **kwargs) 93 | logger.info(f"Saved dataset {dataset.metadata.name} to {ds_path}") 94 | -------------------------------------------------------------------------------- /pinecone_datasets/cfg.py: -------------------------------------------------------------------------------- 1 | # from polars.datatypes import Utf8, Float32, List, Struct, Field, UInt32 2 | 3 | 4 | class Storage: 5 | endpoint: str = "gs://pinecone-datasets-dev" 6 | 7 | 8 | class Schema: 9 | class Names: 10 | documents = [ 11 | ("id", False, None), 12 | ("values", False, None), 13 | ("sparse_values", True, None), 14 | ("metadata", True, None), 15 | ("blob", True, None), 16 | ] 17 | queries = [ 18 | ("vector", False, None), 19 | ("sparse_vector", True, None), 20 | ("filter", True, None), 21 | ("top_k", False, 5), 22 | ("blob", True, None), 23 | ] 24 | 25 | # documents = { 26 | # "id": Utf8, 27 | # "values": List(Float32), 28 | # "sparse_values": Struct( 29 | # [Field("indices", List(UInt32)), Field("values", List(Float32))] 30 | # ), 31 | # } 32 | documents_select_columns = ["id", "values", "sparse_values", "metadata"] 33 | 34 | # queries = { 35 | # "vector": List(Float32), 36 | # "sparse_vector": Struct( 37 | # [Field("indices", List(UInt32)), Field("values", List(Float32))] 38 | # ), 39 | # "top_k": UInt32, 40 | # } 41 | queries_select_columns = ["vector", "sparse_vector", "filter", "top_k"] 42 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.parse import urlparse 3 | from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple 4 | 5 | from .cfg import Schema 6 | from .dataset_metadata import DatasetMetadata 7 | from .fs import get_cloud_fs 8 | from .utils import deprecated 9 | 10 | from typing import TYPE_CHECKING 11 | 12 | if TYPE_CHECKING: 13 | import pandas as pd 14 | from .dataset_fsreader import DatasetFSReader 15 | else: 16 | pd = None # Placeholder for runtime 17 | DatasetFSReader = None # Placeholder for runtime 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def iter_pandas_dataframe_slices( 23 | df: "pd.DataFrame", batch_size, return_indexes 24 | ) -> Generator[List[Dict[str, Any]], None, None]: 25 | for i in range(0, len(df), batch_size): 26 | if return_indexes: 27 | yield (i, df.iloc[i : i + batch_size].to_dict(orient="records")) 28 | else: 29 | yield df.iloc[i : i + batch_size].to_dict(orient="records") 30 | 31 | 32 | def iter_pandas_dataframe_single( 33 | df: "pd.DataFrame", 34 | ) -> Generator[Dict[str, Any], None, None]: 35 | for i in range(0, len(df), 1): 36 | yield df.iloc[i : i + 1].to_dict(orient="records")[0] 37 | 38 | 39 | class Dataset: 40 | @classmethod 41 | def from_path(cls, dataset_path, **kwargs): 42 | """ 43 | 
Create a Dataset object from a local or cloud storage path 44 | Args: 45 | dataset_path (str): a path to a local or cloud storage path containing a valid dataset. 46 | 47 | Returns: 48 | Dataset: a Dataset object 49 | """ 50 | return cls(dataset_path=dataset_path, **kwargs) 51 | 52 | @classmethod 53 | def from_pandas( 54 | cls, 55 | documents: "pd.DataFrame", 56 | metadata: DatasetMetadata, 57 | documents_column_mapping: Optional[Dict] = None, 58 | queries: Optional["pd.DataFrame"] = None, 59 | queries_column_mapping: Optional[Dict] = None, 60 | **kwargs, 61 | ) -> "Dataset": 62 | """ 63 | Create a Dataset object from a pandas DataFrame 64 | 65 | Args: 66 | documents (pd.DataFrame): a pandas DataFrame containing the documents 67 | documents_column_mapping (Dict): a dictionary mapping the columns of the documents DataFrame to the Pinecone Datasets Schema 68 | queries (pd.DataFrame): a pandas DataFrame containing the queries 69 | queries_column_mapping (Dict): a dictionary mapping the columns of the queries DataFrame to the Pinecone Datasets Schema 70 | 71 | Keyword Args: 72 | kwargs (Dict): additional arguments to pass to the fsspec constructor 73 | 74 | Returns: 75 | Dataset: a Dataset object 76 | """ 77 | instance = cls(dataset_path=None, **kwargs) 78 | instance._documents = cls._read_pandas_dataframe( 79 | documents, documents_column_mapping, Schema.Names.documents 80 | ) 81 | instance._queries = cls._read_pandas_dataframe( 82 | queries, queries_column_mapping, Schema.Names.queries 83 | ) 84 | instance._metadata = metadata 85 | return instance 86 | 87 | @staticmethod 88 | def _read_pandas_dataframe( 89 | df: "pd.DataFrame", 90 | column_mapping: Dict[str, str], 91 | schema: List[Tuple[str, bool, Any]], 92 | ) -> "pd.DataFrame": 93 | """ 94 | Reads a pandas DataFrame and validates it against a schema. 95 | 96 | Args: 97 | df (pd.DataFrame): the pandas DataFrame to read 98 | column_mapping (Dict[str, str]): a dictionary mapping the columns of the DataFrame to the Pinecone Datasets Schema (col_name, pinecone_name) 99 | schema (List[Tuple[str, bool, Any]]): the schema to validate against (column_name, is_nullable, null_value) 100 | 101 | Returns: 102 | pd.DataFrame: the validated, renamed DataFrame 103 | """ 104 | import pandas as pd 105 | 106 | if df is None or df.empty: 107 | return pd.DataFrame(columns=[column_name for column_name, _, _ in schema]) 108 | else: 109 | if column_mapping is not None: 110 | df.rename(columns=column_mapping, inplace=True) 111 | for column_name, is_nullable, null_value in schema: 112 | if column_name not in df.columns and not is_nullable: 113 | raise ValueError( 114 | f"error, file is not matching Pinecone Datasets Schema: {column_name} not found" 115 | ) 116 | elif column_name not in df.columns and is_nullable: 117 | df[column_name] = null_value 118 | return df[[column_name for column_name, _, _ in schema]] 119 | 120 | def __init__( 121 | self, 122 | dataset_path: str, 123 | **kwargs, 124 | ) -> None: 125 | """ 126 | Dataset class to load and query datasets from the Pinecone Datasets catalog. 127 | See `from_path` and `from_pandas` for examples of how to create a Dataset.
128 | 129 | Examples: 130 | ```python 131 | from pinecone_datasets import Dataset, load_dataset 132 | dataset = load_dataset("dataset_name") 133 | # or 134 | dataset = Dataset.from_path("gs://my-bucket/my-dataset") 135 | 136 | for doc in dataset.iter_documents(batch_size=100): 137 | index.upsert(doc) 138 | for query in dataset.iter_queries(): 139 | results = index.query(**query) 140 | # do something with the results 141 | # or 142 | dataset.documents # returns a pandas DataFrame 143 | dataset.queries # returns a pandas DataFrame 144 | ``` 145 | 146 | """ 147 | if dataset_path is not None: 148 | endpoint = urlparse(dataset_path)._replace(path="").geturl() 149 | self._fs = get_cloud_fs(endpoint, **kwargs) 150 | self._dataset_path = dataset_path 151 | if not self._fs.exists(self._dataset_path): 152 | raise FileNotFoundError( 153 | f"Dataset does not exist at path {self._dataset_path}" 154 | ) 155 | else: 156 | self._dataset_path = None 157 | self._fs = None 158 | self._documents = None 159 | self._queries = None 160 | self._metadata = None 161 | 162 | def __getitem__(self, key: str): 163 | if key in ["documents", "queries"]: 164 | return getattr(self, key) 165 | else: 166 | raise KeyError("Dataset does not have key: {}".format(key)) 167 | 168 | def __len__(self) -> int: 169 | return self.documents.shape[0] 170 | 171 | @property 172 | def documents(self) -> "pd.DataFrame": 173 | if self._documents is None and self._dataset_path is not None: 174 | from .dataset_fsreader import DatasetFSReader 175 | 176 | self._documents = DatasetFSReader.read_documents( 177 | self._fs, self._dataset_path 178 | ) 179 | return self._documents 180 | 181 | @property 182 | def queries(self) -> "pd.DataFrame": 183 | if self._queries is None and self._dataset_path is not None: 184 | from .dataset_fsreader import DatasetFSReader 185 | 186 | self._queries = DatasetFSReader.read_queries(self._fs, self._dataset_path) 187 | return self._queries 188 | 189 | @property 190 | def metadata(self) -> DatasetMetadata: 191 | if self._metadata is None and self._dataset_path is not None: 192 | from .dataset_fsreader import DatasetFSReader 193 | 194 | self._metadata = DatasetFSReader.read_metadata(self._fs, self._dataset_path) 195 | return self._metadata 196 | 197 | def iter_documents( 198 | self, batch_size: int = 1, return_indexes=False 199 | ) -> Iterator[List[Dict[str, Any]]]: 200 | """ 201 | Iterates over the documents in the dataset. 202 | 203 | Args: 204 | batch_size (int, optional): The batch size to use for the iterator. Defaults to 1. 205 | 206 | Returns: 207 | Iterator[List[Dict[str, Any]]]: An iterator over the documents in the dataset. 208 | 209 | Examples: 210 | for batch in dataset.iter_documents(batch_size=100): 211 | index.upsert(batch) 212 | """ 213 | if isinstance(batch_size, int) and batch_size > 0: 214 | return iter_pandas_dataframe_slices( 215 | df=self.documents[Schema.documents_select_columns].dropna( 216 | axis=1, how="all" 217 | ), 218 | batch_size=batch_size, 219 | return_indexes=return_indexes, 220 | ) 221 | else: 222 | raise ValueError("batch_size must be greater than 0") 223 | 224 | def iter_queries(self) -> Iterator[Dict[str, Any]]: 225 | """ 226 | Iterates over the queries in the dataset. 227 | 228 | Returns: 229 | Iterator[Dict[str, Any]]: An iterator over the queries in the dataset.
230 | 231 | Examples: 232 | for query in dataset.iter_queries(): 233 | results = index.query(**query) 234 | # do something with the results 235 | """ 236 | return iter_pandas_dataframe_single(self.queries[Schema.queries_select_columns]) 237 | 238 | def head(self, n: int = 5) -> "pd.DataFrame": 239 | return self.documents.head(n) 240 | 241 | @deprecated 242 | @classmethod 243 | def from_catalog(cls, dataset_id, catalog_base_path: str = "", **kwargs): 244 | """ 245 | DEPRECATED: This method has been removed. Please use `Catalog.load_dataset` instead. 246 | """ 247 | raise Exception( 248 | "This method has been removed. Please use `Catalog.load_dataset` instead." 249 | ) 250 | 251 | @deprecated 252 | def to_catalog( 253 | self, 254 | dataset_id: str, 255 | catalog_base_path: str = "", 256 | **kwargs, 257 | ): 258 | """ 259 | DEPRECATED: This method has been removed. Please use `Catalog.save_dataset` instead. 260 | """ 261 | raise Exception( 262 | "This method has been removed. Please use `Catalog.save_dataset` instead." 263 | ) 264 | 265 | @deprecated 266 | def to_pinecone_index(self, *args, **kwargs): 267 | """ 268 | DEPRECATED: This method has been removed. Please use the `pinecone.Index.upsert` method instead from the `pinecone` SDK package. 269 | """ 270 | raise Exception( 271 | "This method has been removed. Please use the `pinecone.Index.upsert` method instead from the `pinecone` SDK package." 272 | ) 273 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_fsreader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import warnings 5 | from typing import Literal, Optional 6 | 7 | import pandas as pd 8 | import pyarrow.parquet as pq 9 | from .tqdm import tqdm 10 | 11 | from .cfg import Schema 12 | from .dataset_metadata import DatasetMetadata 13 | from .fs import CloudOrLocalFS 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class DatasetFSReader: 19 | @staticmethod 20 | def read_documents(fs: CloudOrLocalFS, dataset_path: str) -> pd.DataFrame: 21 | logger.debug(f"reading documents from {dataset_path}") 22 | df = DatasetFSReader._safe_read_from_path(fs, dataset_path, "documents") 23 | 24 | # metadata supposed to be a dict [if legacy] or string 25 | df["metadata"] = df["metadata"].apply( 26 | DatasetFSReader._convert_metadata_from_json_to_dict 27 | ) 28 | return df 29 | 30 | @staticmethod 31 | def read_queries(fs: CloudOrLocalFS, dataset_path: str) -> pd.DataFrame: 32 | logger.debug(f"reading queries from {dataset_path}") 33 | df = DatasetFSReader._safe_read_from_path(fs, dataset_path, "queries") 34 | 35 | # filter supposed to be a dict [if legacy] or string 36 | df["filter"] = df["filter"].apply( 37 | DatasetFSReader._convert_metadata_from_json_to_dict 38 | ) 39 | 40 | return df 41 | 42 | @staticmethod 43 | def read_metadata(fs: CloudOrLocalFS, dataset_path: str) -> DatasetMetadata: 44 | logger.debug(f"reading metadata from {dataset_path}") 45 | with fs.open(os.path.join(dataset_path, "metadata.json"), "rb") as f: 46 | metadata = json.load(f) 47 | return DatasetMetadata(**metadata) 48 | 49 | @staticmethod 50 | def _convert_metadata_from_json_to_dict(metadata: Optional[str] = None) -> dict: 51 | if metadata is None: 52 | return None 53 | elif isinstance(metadata, dict): 54 | return metadata 55 | elif isinstance(metadata, str): 56 | return json.loads(metadata) 57 | else: 58 | raise TypeError("metadata must be a string or dict") 59 | 
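    # The helpers below check for the presence of a data type directory ("documents" or "queries"), load its parquet partitions into a single DataFrame, and validate the columns against the expected schema, filling missing nullable columns with their default values.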
60 | @staticmethod 61 | def _does_datatype_exist( 62 | fs: CloudOrLocalFS, 63 | dataset_path: str, 64 | data_type: Literal["documents", "queries"], 65 | ) -> bool: 66 | return fs.exists(os.path.join(dataset_path, data_type)) 67 | 68 | @staticmethod 69 | def _safe_read_from_path( 70 | fs: CloudOrLocalFS, 71 | dataset_path: str, 72 | data_type: Literal["documents", "queries"], 73 | ) -> pd.DataFrame: 74 | read_path_str = os.path.join(dataset_path, data_type, "*.parquet") 75 | read_path = fs.glob(read_path_str) 76 | if DatasetFSReader._does_datatype_exist(fs, dataset_path, data_type): 77 | # First, collect all the dataframes 78 | dfs = [] 79 | for path in tqdm(read_path, desc=f"Loading {data_type} parquet files"): 80 | piece = pq.read_pandas(path, filesystem=fs) 81 | df_piece = piece.to_pandas() 82 | dfs.append(df_piece) 83 | 84 | if not dfs: 85 | raise ValueError(f"No parquet files found in {read_path_str}") 86 | 87 | # Combine all dataframes 88 | df = pd.concat(dfs, ignore_index=True) 89 | 90 | # Validate schema 91 | dataset_schema_names = df.columns.tolist() 92 | columns_to_null = [] 93 | columns_not_null = [] 94 | for column_name, is_nullable, null_value in getattr( 95 | Schema.Names, data_type 96 | ): 97 | if column_name not in dataset_schema_names and not is_nullable: 98 | raise ValueError( 99 | f"error, file is not matching Pinecone Datasets Schema: {column_name} not found" 100 | ) 101 | elif column_name not in dataset_schema_names and is_nullable: 102 | columns_to_null.append((column_name, null_value)) 103 | else: 104 | columns_not_null.append(column_name) 105 | 106 | # Add null columns if needed 107 | for column_name, null_value in columns_to_null: 108 | df[column_name] = null_value 109 | 110 | return df[columns_not_null + [col for col, _ in columns_to_null]] 111 | 112 | else: 113 | warnings.warn( 114 | "WARNING: No data found at: {}. Returning empty dataframe".format( 115 | read_path_str 116 | ), 117 | UserWarning, 118 | stacklevel=0, 119 | ) 120 | return pd.DataFrame( 121 | columns=[col[0] for col in getattr(Schema.Names, data_type)] 122 | ) 123 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_fswriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import warnings 4 | import logging 5 | 6 | from .fs import get_cloud_fs, CloudOrLocalFS 7 | from typing import Optional, TYPE_CHECKING 8 | 9 | if TYPE_CHECKING: 10 | import pandas as pd 11 | else: 12 | pd = None 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class DatasetFSWriter: 18 | @staticmethod 19 | def write_dataset(dataset_path: str, dataset: "Dataset", **kwargs): 20 | """ 21 | Saves the dataset to a local or cloud storage path. 
22 | """ 23 | fs = get_cloud_fs(dataset_path, **kwargs) 24 | logger.debug(f"writing dataset {dataset.metadata.name} to {dataset_path}") 25 | DatasetFSWriter._write_documents(fs, dataset_path, dataset) 26 | DatasetFSWriter._write_queries(fs, dataset_path, dataset) 27 | DatasetFSWriter._write_metadata(fs, dataset_path, dataset) 28 | 29 | @staticmethod 30 | def _write_documents(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 31 | documents_path = os.path.join(dataset_path, "documents") 32 | fs.makedirs(documents_path, exist_ok=True) 33 | 34 | documents_metadta_copy = dataset.documents["metadata"].copy() 35 | try: 36 | logger.debug( 37 | f"writing dataset {dataset.metadata.name} documents to {documents_path}" 38 | ) 39 | dataset.documents["metadata"] = dataset.documents["metadata"].apply( 40 | DatasetFSWriter._convert_metadata_from_dict_to_json 41 | ) 42 | dataset.documents.to_parquet( 43 | os.path.join(documents_path, "part-0.parquet"), 44 | engine="pyarrow", 45 | index=False, 46 | filesystem=fs, 47 | ) 48 | finally: 49 | dataset.documents["metadata"] = documents_metadta_copy 50 | 51 | @staticmethod 52 | def _write_queries(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 53 | if dataset.queries.empty: 54 | warnings.warn("Queries are empty, not saving queries") 55 | else: 56 | queries_path = os.path.join(dataset_path, "queries") 57 | logger.debug( 58 | f"writing dataset {dataset.metadata.name} queries to {queries_path}" 59 | ) 60 | fs.makedirs(queries_path, exist_ok=True) 61 | queries_filter_copy = dataset.queries["filter"].copy() 62 | try: 63 | dataset.queries["filter"] = dataset.queries["filter"].apply( 64 | DatasetFSWriter._convert_metadata_from_dict_to_json 65 | ) 66 | dataset.queries.to_parquet( 67 | os.path.join(queries_path, "part-0.parquet"), 68 | engine="pyarrow", 69 | index=False, 70 | filesystem=fs, 71 | ) 72 | finally: 73 | dataset.queries["filter"] = queries_filter_copy 74 | 75 | @staticmethod 76 | def _write_metadata(fs: CloudOrLocalFS, dataset_path: str, dataset: "Dataset"): 77 | metadata_path = os.path.join(dataset_path, "metadata.json") 78 | logger.debug( 79 | f"writing dataset {dataset.metadata.name} metadata to {metadata_path}" 80 | ) 81 | with fs.open(metadata_path, "w") as f: 82 | json.dump(dataset.metadata.model_dump(), f) 83 | 84 | @staticmethod 85 | def _convert_metadata_from_dict_to_json(metadata: Optional[dict]) -> str: 86 | import pandas as pd 87 | 88 | if pd.isna(metadata): 89 | return None 90 | if metadata and not isinstance(metadata, dict): 91 | raise TypeError( 92 | f"metadata must be a dict but its {type(metadata)} meta = {metadata}" 93 | ) 94 | return json.dumps(metadata, ensure_ascii=False) 95 | -------------------------------------------------------------------------------- /pinecone_datasets/dataset_metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Optional, Any, Dict 3 | from pydantic import BaseModel 4 | 5 | 6 | class DenseModelMetadata(BaseModel): 7 | name: str 8 | tokenizer: Optional[str] = None 9 | dimension: int 10 | 11 | 12 | class SparseModelMetdata(BaseModel): 13 | name: Optional[str] = None 14 | tokenizer: Optional[str] = None 15 | 16 | 17 | def get_time_now() -> str: 18 | return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") 19 | 20 | 21 | class DatasetMetadata(BaseModel): 22 | name: str 23 | created_at: str 24 | documents: int 25 | queries: int 26 | source: Optional[str] = None 27 | license: Optional[str] = None 28 | 
bucket: Optional[str] = None 29 | task: Optional[str] = None 30 | dense_model: DenseModelMetadata 31 | sparse_model: Optional[SparseModelMetdata] = None 32 | description: Optional[str] = None 33 | tags: Optional[List[str]] = None 34 | args: Optional[Dict[str, Any]] = None 35 | 36 | @staticmethod 37 | def empty() -> "DatasetMetadata": 38 | return DatasetMetadata( 39 | name="", 40 | created_at=get_time_now(), 41 | documents=0, 42 | queries=0, 43 | dense_model=DenseModelMetadata(name="", dimension=0), 44 | ) 45 | 46 | def is_empty(self) -> bool: 47 | return self.name == "" and self.documents == 0 and self.queries == 0 48 | -------------------------------------------------------------------------------- /pinecone_datasets/fs.py: -------------------------------------------------------------------------------- 1 | from typing import Union, TYPE_CHECKING 2 | from importlib import import_module 3 | 4 | from pinecone_datasets import cfg 5 | 6 | if TYPE_CHECKING: 7 | import gcsfs 8 | import s3fs 9 | from fsspec.implementations.local import LocalFileSystem 10 | 11 | CloudOrLocalFS = Union[gcsfs.GCSFileSystem, s3fs.S3FileSystem, LocalFileSystem] 12 | else: 13 | CloudOrLocalFS = Union[object] # type: ignore 14 | 15 | 16 | def get_cloud_fs(path: str, **kwargs) -> CloudOrLocalFS: 17 | """ 18 | returns a filesystem object for the given path, if it is a cloud storage path (gs:// or s3://) 19 | 20 | Args: 21 | path (str): the path to the file or directory 22 | **kwargs: additional arguments to pass to the filesystem constructor 23 | 24 | Returns: 25 | fs: Union[gcsfs.GCSFileSystem, s3fs.S3FileSystem] - the filesystem object 26 | """ 27 | is_anon = path == cfg.Storage.endpoint 28 | 29 | if path.startswith("gs://") or "storage.googleapis.com" in path: 30 | gcsfs = import_module("gcsfs") 31 | if kwargs.get("token", None): 32 | fs = gcsfs.GCSFileSystem(**kwargs) 33 | else: 34 | fs = gcsfs.GCSFileSystem(token="anon" if is_anon else None, **kwargs) 35 | elif path.startswith("s3://") or "s3.amazonaws.com" in path: 36 | s3fs = import_module("s3fs") 37 | fs = s3fs.S3FileSystem(anon=is_anon, **kwargs) 38 | else: 39 | local_fs = import_module("fsspec.implementations.local") 40 | fs = local_fs.LocalFileSystem() 41 | return fs 42 | -------------------------------------------------------------------------------- /pinecone_datasets/public.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | from .catalog import Catalog 3 | 4 | global catalog 5 | catalog = None 6 | 7 | 8 | def list_datasets(as_df=False, **kwargs) -> list: 9 | """ 10 | List all datasets in the catalog, optionally as a pandas DataFrame. 11 | Catalog is set using the `DATASETS_CATALOG_BASEPATH` environment variable. 12 | 13 | Args: 14 | as_df (bool, optional): Whether to return the list as a pandas DataFrame. Defaults to False. 15 | 16 | Returns: 17 | list: A list of dataset names; or 18 | df: A pandas DataFrame of dataset names and metadata 19 | 20 | Example: 21 | 22 | ```python 23 | from pinecone_datasets import list_datasets 24 | list_datasets() # -> ['dataset1', 'dataset2', ...] 
25 | list_datasets(as_df=True) # -> pandas DataFrame of dataset names and metadata 26 | ``` 27 | 28 | """ 29 | global catalog 30 | if catalog is None: 31 | catalog = Catalog() 32 | return catalog.list_datasets(as_df=as_df) 33 | 34 | 35 | def load_dataset(dataset_id: str, **kwargs) -> Dataset: 36 | """ 37 | Load a dataset from the catalog 38 | 39 | Args: 40 | dataset_id (str): The name of the dataset to load 41 | **kwargs: Additional keyword arguments to pass to the Dataset constructor, e.g. `engine='polars'` 42 | 43 | Returns: 44 | Dataset: A Dataset object 45 | 46 | Example: 47 | 48 | ```python 49 | from pinecone_datasets import load_dataset 50 | dataset = load_dataset("dataset_name") 51 | ``` 52 | """ 53 | global catalog 54 | if catalog is None: 55 | catalog = Catalog() 56 | return catalog.load_dataset(dataset_id=dataset_id, **kwargs) 57 | -------------------------------------------------------------------------------- /pinecone_datasets/tqdm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | __all__ = ["tqdm"] 4 | 5 | try: 6 | # Suppress the specific tqdm warning about IProgress 7 | with warnings.catch_warnings(): 8 | warnings.filterwarnings("ignore", category=UserWarning, module="tqdm") 9 | warnings.filterwarnings("ignore", message="IProgress not found.*") 10 | from tqdm.auto import tqdm 11 | except ImportError: 12 | # Fallback: define a dummy tqdm that supports the same interface. 13 | class tqdm: # type: ignore 14 | def __init__(self, iterable=None, total=None, desc="", **kwargs): 15 | self.iterable = iterable 16 | self.total = total 17 | self.desc = desc 18 | # You can store additional kwargs if needed 19 | 20 | def __iter__(self): 21 | # Just iterate over the underlying iterable 22 | for item in self.iterable: 23 | yield item 24 | 25 | def update(self, n=1): 26 | # No-op: This stub doesn't track progress 27 | pass 28 | 29 | def __enter__(self): 30 | # Allow use as a context manager 31 | return self 32 | 33 | def __exit__(self, exc_type, exc_value, traceback): 34 | # Nothing to cleanup 35 | pass 36 | -------------------------------------------------------------------------------- /pinecone_datasets/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import functools 3 | 4 | 5 | def deprecated(func): 6 | @functools.wraps(func) 7 | def wrapper(*args, **kwargs): 8 | warnings.warn( 9 | f"{func.__name__} is deprecated and will be removed in a future version.", 10 | category=DeprecationWarning, 11 | stacklevel=2, 12 | ) 13 | return func(*args, **kwargs) 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pinecone-datasets" 3 | version = "1.0.2" 4 | description = "Load datasets to explore Pinecone" 5 | authors = ["Pinecone Support "] 6 | maintainers = [ 7 | "Amnon Catav ", 8 | "Roy Miara ", 9 | "Jen Hamon ", 10 | ] 11 | readme = "README.md" 12 | 13 | 14 | [tool.poetry.dependencies] 15 | python = ">=3.9,<3.14" 16 | fsspec = "^2025.2.0" 17 | gcsfs = "^2025.2.0" 18 | s3fs = "^2025.2.0" 19 | pydantic = "^2.0.0" 20 | pandas = "^2.0.0" 21 | pyarrow = "^18.0.0" 22 | 23 | 24 | [tool.poetry.group.dev] 25 | optional = true 26 | 27 | [tool.poetry.group.dev.dependencies] 28 | ipykernel = "^6.21.1" 29 | black = "^23.1.0" 30 | pytest-cov = "^4.0.0" 31 | mypy = "^1.0.1" 32 | pytest = "^7.2.2" 33 | 
pytest-html = "^3.2.0" 34 | pdoc = "^13.0.0" 35 | toml = "^0.10.2" 36 | pytest-xdist = "^3.3.1" 37 | tuna = "^0.5.11" 38 | tqdm = "^4.67.1" 39 | 40 | 41 | 42 | [build-system] 43 | requires = ["poetry-core"] 44 | build-backend = "poetry.core.masonry.api" 45 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Tests 2 | -------------------------------------------------------------------------------- /tests/integration/test_io_local.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import logging 4 | from pandas.testing import assert_frame_equal as pd_assert_frame_equal 5 | 6 | from pinecone_datasets import Dataset, Catalog, DenseModelMetadata, DatasetMetadata 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | d = pd.DataFrame( 11 | [ 12 | { 13 | "id": "1", 14 | "values": [0.1, 0.2, 0.3], 15 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 16 | "metadata": {"title": "title1", "url": "url1"}, 17 | "blob": None, 18 | }, 19 | { 20 | "id": "2", 21 | "values": [0.4, 0.5, 0.6], 22 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 23 | "metadata": {"title": "title2", "url": "url2"}, 24 | "blob": None, 25 | }, 26 | ] 27 | ) 28 | 29 | q = pd.DataFrame( 30 | [ 31 | { 32 | "vector": [0.1, 0.2, 0.3], 33 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 34 | "filter": {"filter1": {"$eq": "filter1"}}, 35 | "top_k": 1, 36 | "blob": None, 37 | }, 38 | { 39 | "vector": [0.4, 0.5, 0.6], 40 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 41 | "filter": {"filter2": {"$eq": "filter2"}}, 42 | "top_k": 2, 43 | "blob": None, 44 | }, 45 | ] 46 | ) 47 | 48 | 49 | class TestLocalIO: 50 | def test_empty_catalog(self, tmpdir): 51 | catalog = Catalog(base_path=str(tmpdir.mkdir("catalog"))) 52 | assert catalog.list_datasets(as_df=False) == [] 53 | 54 | def test_io_write_to_local(self, tmpdir): 55 | dataset_name = "test_io_dataset" 56 | metadata = DatasetMetadata( 57 | name=dataset_name, 58 | created_at="2021-01-01 00:00:00.000000", 59 | documents=2, 60 | queries=2, 61 | dense_model=DenseModelMetadata( 62 | name="ada2", 63 | dimension=2, 64 | ), 65 | ) 66 | ds = Dataset.from_pandas(documents=d, queries=q, metadata=metadata) 67 | assert ds._fs is None 68 | assert ds._dataset_path is None 69 | 70 | catalog_path = tmpdir.mkdir("catalog") 71 | catalog = Catalog(base_path=str(catalog_path)) 72 | catalog.save_dataset(ds) 73 | 74 | loaded_ds = catalog.load_dataset(dataset_name) 75 | assert loaded_ds.metadata == metadata 76 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 77 | pd_assert_frame_equal(loaded_ds.queries, ds.queries) 78 | assert loaded_ds._fs is not None 79 | assert loaded_ds._dataset_path is not None 80 | 81 | def test_io_no_queries(self, tmpdir): 82 | dataset_name = "test_io_dataset_no_q" 83 | metadata = DatasetMetadata( 84 | name=dataset_name, 85 | created_at="2021-01-01 00:00:00.000000", 86 | documents=2, 87 | queries=0, 88 | dense_model=DenseModelMetadata( 89 | name="ada2", 90 | dimension=2, 91 | ), 92 | ) 93 | ds = Dataset.from_pandas(documents=d, queries=None, metadata=metadata) 94 | assert ds._fs is None 95 | assert ds._dataset_path is None 96 | 97 | catalog_path = tmpdir.mkdir("catalog") 98 | catalog = Catalog(base_path=str(catalog_path)) 99 | catalog.save_dataset(ds) 100 | 101 | loaded_ds = 
catalog.load_dataset(dataset_name) 102 | assert loaded_ds.metadata == metadata 103 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 104 | assert loaded_ds.queries.empty 105 | assert loaded_ds._fs is not None 106 | assert loaded_ds._dataset_path is not None 107 | 108 | def test_load_from_cloud_and_save_to_local(self, tmpdir): 109 | public_catalog = Catalog() 110 | ds = public_catalog.load_dataset("langchain-python-docs-text-embedding-ada-002") 111 | 112 | local_catalog_path = tmpdir.mkdir("catalog") 113 | local_catalog = Catalog(base_path=str(local_catalog_path)) 114 | local_catalog.save_dataset(ds) 115 | 116 | logger.debug(f"wrote data to local_catalog_path: {str(local_catalog_path)}") 117 | 118 | loaded_ds = local_catalog.load_dataset(ds.metadata.name) 119 | # Assert frames have the same number of rows 120 | assert loaded_ds.documents.shape[0] == ds.documents.shape[0] 121 | assert loaded_ds.queries.shape[0] == ds.queries.shape[0] 122 | # Assert frames have the same columns 123 | assert loaded_ds.documents.columns.tolist() == ds.documents.columns.tolist() 124 | assert loaded_ds.queries.columns.tolist() == ds.queries.columns.tolist() 125 | -------------------------------------------------------------------------------- /tests/integration/test_io_private_cloud_storage_gcs.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | import random 5 | from pandas.testing import assert_frame_equal as pd_assert_frame_equal 6 | 7 | from pinecone_datasets import ( 8 | Dataset, 9 | Catalog, 10 | list_datasets, 11 | DatasetMetadata, 12 | DenseModelMetadata, 13 | ) 14 | import os 15 | 16 | GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 17 | if not GOOGLE_APPLICATION_CREDENTIALS: 18 | raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set") 19 | 20 | d = pd.DataFrame( 21 | [ 22 | { 23 | "id": "1", 24 | "values": [0.1, 0.2, 0.3], 25 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 26 | "metadata": {"title": "title1", "url": "url1"}, 27 | "blob": None, 28 | }, 29 | { 30 | "id": "2", 31 | "values": [0.4, 0.5, 0.6], 32 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 33 | "metadata": {"title": "title2", "url": "url2"}, 34 | "blob": None, 35 | }, 36 | ] 37 | ) 38 | 39 | q = pd.DataFrame( 40 | [ 41 | { 42 | "vector": [0.1, 0.2, 0.3], 43 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 44 | "filter": {"filter1": {"$eq": "filter1"}}, 45 | "top_k": 1, 46 | "blob": None, 47 | }, 48 | { 49 | "vector": [0.4, 0.5, 0.6], 50 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 51 | "filter": {"filter2": {"$eq": "filter2"}}, 52 | "top_k": 2, 53 | "blob": None, 54 | }, 55 | ] 56 | ) 57 | 58 | 59 | class TestSaveDatasetToGCS: 60 | def test_io_cloud_storage(self): 61 | dataset_name = "test_io_dataset_" + str(random.randint(0, 1000000)) 62 | metadata = DatasetMetadata( 63 | name=dataset_name, 64 | created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"), 65 | documents=2, 66 | queries=2, 67 | dense_model=DenseModelMetadata( 68 | name="ada2", 69 | dimension=2, 70 | ), 71 | ) 72 | ds = Dataset.from_pandas(documents=d, queries=q, metadata=metadata) 73 | 74 | catalog = Catalog(base_path="gs://pinecone-datasets-test/catalog") 75 | catalog.save_dataset(dataset=ds) 76 | 77 | loaded_ds = catalog.load_dataset(dataset_name) 78 | print(catalog.list_datasets(as_df=True)) 79 | 80 | assert loaded_ds.metadata == metadata 
81 | pd_assert_frame_equal(loaded_ds.documents, ds.documents) 82 | pd_assert_frame_equal(loaded_ds.queries, ds.queries) 83 | -------------------------------------------------------------------------------- /tests/integration/test_list_public_datasets.py: -------------------------------------------------------------------------------- 1 | from pinecone_datasets import list_datasets 2 | 3 | 4 | class TestListDatasets: 5 | def test_list_datasets(self): 6 | datasets = list_datasets() 7 | assert len(datasets) > 0 8 | assert "quora_all-MiniLM-L6-bm25" in datasets 9 | -------------------------------------------------------------------------------- /tests/integration/test_load_public_dataset.py: -------------------------------------------------------------------------------- 1 | from pinecone_datasets import load_dataset 2 | 3 | 4 | class TestLoadDataset: 5 | def test_load_dataset(self): 6 | ds = load_dataset("langchain-python-docs-text-embedding-ada-002") 7 | assert ds is not None 8 | 9 | headdf = ds.head() 10 | assert headdf is not None 11 | assert len(headdf) > 0 12 | columns = headdf.columns.tolist() 13 | assert "id" in columns 14 | assert "values" in columns 15 | assert "sparse_values" in columns 16 | assert "metadata" in columns 17 | assert "blob" in columns 18 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/pinecone-datasets/247837d84163450f15ce7d3a9919b13a9325ac43/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_basics.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pinecone_datasets import __version__ 3 | 4 | if sys.version_info > (3, 11): 5 | import tomllib as toml 6 | 7 | with open("pyproject.toml", "rb") as f: 8 | assert toml.load(f)["tool"]["poetry"]["version"] == __version__ 9 | else: 10 | import toml 11 | 12 | with open("pyproject.toml") as f: 13 | assert toml.load(f)["tool"]["poetry"]["version"] == __version__ 14 | 15 | 16 | def test_version(): 17 | assert __version__ == "1.0.2" 18 | -------------------------------------------------------------------------------- /tests/unit/test_dataset_metadata.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pinecone_datasets.dataset_metadata import DatasetMetadata, DenseModelMetadata 4 | 5 | from pydantic import ValidationError 6 | 7 | 8 | def test_metadata_fields_minimal(): 9 | try: 10 | meta = DatasetMetadata( 11 | name="test", 12 | documents=1, 13 | created_at="2021-01-01 00:00:00.000000", 14 | queries=1, 15 | dense_model=DenseModelMetadata( 16 | name="ada2", 17 | dimension=2, 18 | ), 19 | ) 20 | except ValidationError: 21 | pytest.fail("Minimal valid metadata unexpectedly raised a ValidationError") 22 | 23 | 24 | def test_validation_error_mandatory_field(): 25 | with pytest.raises(ValidationError): 26 | meta = DatasetMetadata( 27 | documents=1, 28 | queries=1, 29 | dense_model=DenseModelMetadata( 30 | name="ada2", 31 | dimension=2, 32 | ), 33 | ) 34 | 35 | 36 | def test_validation_error_optional_field(): 37 | with pytest.raises(ValidationError): 38 | meta = DatasetMetadata( 39 | name="test", 40 | documents=1, 41 | queries=1, 42 | dense_model=DenseModelMetadata(name="ada2", dimension=2), 43 | tags="test", 44 | ) 45 | --------------------------------------------------------------------------------
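For orientation, the public-dataset and metadata tests above exercise a small surface: list_datasets() to enumerate the public catalog, load_dataset(name) to fetch a dataset, and the loaded Dataset's head() and metadata accessors. The following is a minimal usage sketch rather than repository code; it assumes network access to the public catalog and reuses the dataset names asserted in the tests above.

    from pinecone_datasets import list_datasets, load_dataset

    # Enumerate the public catalog (the tests assert this dataset is listed).
    names = list_datasets()
    print("quora_all-MiniLM-L6-bm25" in names)

    # Load one public dataset and inspect its schema and metadata.
    ds = load_dataset("langchain-python-docs-text-embedding-ada-002")
    print(ds.head().columns.tolist())  # id, values, sparse_values, metadata, blob
    print(ds.metadata.name, ds.metadata.documents, ds.metadata.dense_model.name)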
/tests/unit/test_fs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import s3fs 3 | import gcsfs 4 | 5 | from pinecone_datasets.fs import get_cloud_fs 6 | 7 | 8 | def test_get_cloud_fs_nullability(): 9 | assert get_cloud_fs("s3://pinecone-datasets") is not None 10 | assert get_cloud_fs("gs://pinecone-datasets") is not None 11 | assert get_cloud_fs("pinecone-datasets") is not None 12 | 13 | 14 | def test_get_cloud_fs_s3(): 15 | fs = get_cloud_fs("s3://not-pinecone-datasets") 16 | assert isinstance(fs, s3fs.S3FileSystem) 17 | assert fs.anon is False 18 | 19 | 20 | def test_get_cloud_fs_gs(): 21 | fs = get_cloud_fs("gs://not-pinecone-datasets") 22 | assert isinstance(fs, gcsfs.GCSFileSystem) 23 | assert fs.credentials.token is None 24 | 25 | 26 | def test_get_cloud_fs_on_pinecone_endpoint(): 27 | fs = get_cloud_fs("gs://pinecone-datasets-dev") 28 | assert isinstance(fs, gcsfs.GCSFileSystem) 29 | assert fs.credentials.token == "anon" 30 | -------------------------------------------------------------------------------- /tests/unit/test_private_datasets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from pinecone_datasets import list_datasets, load_dataset, Dataset 5 | from tests.utils.test_public_datasets import deep_list_cmp 6 | 7 | 8 | @pytest.mark.skip(reason="Need to figure out credentials to run these tests") 9 | class TestPrivateDatasets: 10 | def test_list_private_datasets(self): 11 | os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://ram-datasets" 12 | lst = list_datasets(endpoint_url="https://storage.googleapis.com") 13 | print(lst) 14 | del os.environ["DATASETS_CATALOG_BASEPATH"] 15 | assert "test_dataset" in lst 16 | 17 | def test_load_private_dataset(self): 18 | os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://ram-datasets" 19 | ds = load_dataset("test_dataset", endpoint_url="https://storage.googleapis.com") 20 | assert isinstance(ds, Dataset) 21 | assert ds.queries.shape[0] == 2 22 | assert ds.documents.shape[0] == 2 23 | assert deep_list_cmp( 24 | ds.documents.columns, ["id", "values", "sparse_values", "metadata"] 25 | ) 26 | del os.environ["DATASETS_CATALOG_BASEPATH"] 27 | 28 | def test_dataset_from_path(self): 29 | dataset_path = "s3://ram-datasets/test_dataset" 30 | ds = Dataset.from_path( 31 | dataset_path, endpoint_url="https://storage.googleapis.com" 32 | ) 33 | assert isinstance(ds, Dataset) 34 | assert ds.queries.shape[0] == 2 35 | assert ds.documents.shape[0] == 2 36 | assert deep_list_cmp( 37 | ds.documents.columns, ["id", "values", "sparse_values", "metadata"] 38 | ) 39 | -------------------------------------------------------------------------------- /tests/unit/test_schema_validation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | import pandas as pd 5 | from pydantic import ValidationError 6 | 7 | from pinecone_datasets import Dataset, DatasetMetadata, DenseModelMetadata 8 | 9 | 10 | def test_datasets_schema_name_happy(tmpdir): 11 | documents_data = [ 12 | { 13 | "id": "1", 14 | "values": [0.1, 0.2, 0.3], 15 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 16 | "metadata": {"title": "title1", "url": "url1"}, 17 | "blob": None, 18 | }, 19 | { 20 | "id": "2", 21 | "values": [0.4, 0.5, 0.6], 22 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 23 | "metadata": {"title": "title2", "url": "url2"}, 24 | "blob": None, 25 | }, 26 
| ] 27 | 28 | dataset_name = "test_dataset" 29 | dataset_path = tmpdir.mkdir(dataset_name) 30 | documents_path = dataset_path.mkdir("documents") 31 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 32 | 33 | queries_data = [ 34 | { 35 | "vector": [0.1, 0.2, 0.3], 36 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 37 | "filter": {"filter1": {"$eq": "filter1"}}, 38 | "top_k": 1, 39 | "blob": None, 40 | }, 41 | { 42 | "vector": [0.4, 0.5, 0.6], 43 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 44 | "filter": {"filter2": {"$eq": "filter2"}}, 45 | "top_k": 2, 46 | "blob": None, 47 | }, 48 | ] 49 | 50 | queries_path = dataset_path.mkdir("queries") 51 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 52 | 53 | metadata: DatasetMetadata = DatasetMetadata( 54 | name=dataset_name, 55 | created_at="2021-01-01 00:00:00.000000", 56 | documents=2, 57 | queries=2, 58 | dense_model=DenseModelMetadata( 59 | name="ada2", 60 | dimension=2, 61 | ), 62 | ) 63 | 64 | with open(dataset_path.join("metadata.json"), "w") as f: 65 | json.dump(metadata.model_dump(), f) 66 | 67 | ds = Dataset.from_path(str(dataset_path)) 68 | assert isinstance(ds, Dataset) 69 | assert ds.queries.shape[0] == 2 70 | assert ds.documents.shape[0] == 2 71 | 72 | 73 | def test_datasets_schema_name_documents_missing_property(tmpdir): 74 | documents_data = [ 75 | { 76 | "id": "1", 77 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 78 | "metadata": {"title": "title1", "url": "url1"}, 79 | "blob": None, 80 | }, 81 | { 82 | "id": "2", 83 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 84 | "metadata": {"title": "title2", "url": "url2"}, 85 | "blob": None, 86 | }, 87 | ] 88 | 89 | dataset_name = "test_dataset" 90 | dataset_path = tmpdir.mkdir(dataset_name) 91 | documents_path = dataset_path.mkdir("documents") 92 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 93 | 94 | queries_data = [ 95 | { 96 | "vector": [0.1, 0.2, 0.3], 97 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 98 | "filter": {"filter1": {"$eq": "filter1"}}, 99 | "top_k": 1, 100 | "blob": None, 101 | }, 102 | { 103 | "vector": [0.4, 0.5, 0.6], 104 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 105 | "filter": {"filter2": {"$eq": "filter2"}}, 106 | "top_k": 2, 107 | "blob": None, 108 | }, 109 | ] 110 | 111 | queries_path = dataset_path.mkdir("queries") 112 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 113 | 114 | metadata: DatasetMetadata = DatasetMetadata( 115 | name=dataset_name, 116 | created_at="2021-01-01 00:00:00.000000", 117 | documents=2, 118 | queries=2, 119 | dense_model=DenseModelMetadata( 120 | name="ada2", 121 | dimension=2, 122 | ), 123 | ) 124 | 125 | with open(dataset_path.join("metadata.json"), "w") as f: 126 | json.dump(metadata.model_dump(), f) 127 | 128 | with pytest.raises(ValueError): 129 | ds = Dataset.from_path(str(dataset_path)) 130 | assert isinstance(ds, Dataset) 131 | assert ds.queries.shape[0] == 2 132 | assert ds.documents.shape[0] == 2 133 | 134 | 135 | def test_datasets_schema_name_queries_missing_property(tmpdir): 136 | documents_data = [ 137 | { 138 | "id": "1", 139 | "values": [0.1, 0.2, 0.3], 140 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 141 | "metadata": {"title": "title1", "url": "url1"}, 142 | "blob": None, 143 | }, 144 | { 145 | "id": "2", 146 | "values": [0.4, 0.5,
0.6], 147 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 148 | "metadata": {"title": "title2", "url": "url2"}, 149 | "blob": None, 150 | }, 151 | ] 152 | 153 | dataset_name = "test_dataset" 154 | dataset_path = tmpdir.mkdir(dataset_name) 155 | documents_path = dataset_path.mkdir("documents") 156 | pd.DataFrame(documents_data).to_parquet(documents_path.join("part-0.parquet")) 157 | 158 | queries_data = [ 159 | { 160 | "sparse_vector": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 161 | "filter": {"filter1": {"$eq": "filter1"}}, 162 | "top_k": 1, 163 | }, 164 | { 165 | "sparse_vector": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 166 | "filter": {"filter2": {"$eq": "filter2"}}, 167 | "top_k": 2, 168 | }, 169 | ] 170 | 171 | queries_path = dataset_path.mkdir("queries") 172 | pd.DataFrame(queries_data).to_parquet(queries_path.join("part-0.parquet")) 173 | 174 | metadata: DatasetMetadata = DatasetMetadata( 175 | name=dataset_name, 176 | created_at="2021-01-01 00:00:00.000000", 177 | documents=2, 178 | queries=2, 179 | dense_model=DenseModelMetadata( 180 | name="ada2", 181 | dimension=2, 182 | ), 183 | ) 184 | 185 | with open(dataset_path.join("metadata.json"), "w") as f: 186 | json.dump(metadata.model_dump(), f) 187 | 188 | with pytest.raises(ValueError): 189 | ds = Dataset.from_path(str(dataset_path)) 190 | assert isinstance(ds, Dataset) 191 | assert ds.queries.shape[0] == 2 192 | assert ds.documents.shape[0] == 2 193 | 194 | 195 | def test_datasets_schema_metadata_wrong(tmpdir): 196 | with pytest.raises(ValidationError): 197 | metadata: DatasetMetadata = DatasetMetadata( 198 | created_at="2021-01-01 00:00:00.000000", 199 | documents=2, 200 | queries=2, 201 | dense_model=DenseModelMetadata( 202 | name="ada2", 203 | dimension=2, 204 | ), 205 | ) 206 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pytest 4 | from pinecone_datasets.dataset_fsreader import DatasetFSReader 5 | from pinecone_datasets.dataset_fswriter import DatasetFSWriter 6 | from pinecone_datasets.dataset import Dataset 7 | 8 | 9 | def test_read_pandas_dataframe(tmpdir): 10 | d = [ 11 | { 12 | "id": "1", 13 | "values": [0.1, 0.2, 0.3], 14 | "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, 15 | "metadata": {"title": "title1", "url": "url1"}, 16 | "blob": None, 17 | }, 18 | { 19 | "id": "2", 20 | "values": [0.4, 0.5, 0.6], 21 | "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, 22 | "metadata": {"title": "title2", "url": "url2"}, 23 | "blob": None, 24 | }, 25 | ] 26 | df = pd.DataFrame(d) 27 | 28 | schema_documents = [ 29 | ("id", False, None), 30 | ("values", False, None), 31 | ("sparse_values", True, None), 32 | ("metadata", True, None), 33 | ("blob", True, None), 34 | ] 35 | 36 | # create tempdir 37 | dataset_name = "test_read_pandas_dataframe" 38 | dataset_path = tmpdir.mkdir(dataset_name) 39 | 40 | read_df = Dataset._read_pandas_dataframe( 41 | df, column_mapping=None, schema=schema_documents 42 | ) 43 | assert isinstance(read_df, pd.DataFrame) 44 | 45 | # check if the dataframe is the same 46 | pd.testing.assert_frame_equal(df, read_df) 47 | 48 | # test None case 49 | none_df = Dataset._read_pandas_dataframe( 50 | None, column_mapping=None, schema=schema_documents 51 | ) 52 | assert none_df.empty 53 | 54 | for k, _, _ in schema_documents: 55 | assert k in 
read_df.columns 56 | assert k in none_df.columns 57 | 58 | 59 | def test_convert_metadata_from_dict_to_json(): 60 | d1 = {"a": 1, "b": 2} 61 | s1 = '{"a": 1, "b": 2}' 62 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d1) == s1 63 | assert ( 64 | DatasetFSReader._convert_metadata_from_json_to_dict( 65 | DatasetFSWriter._convert_metadata_from_dict_to_json(d1) 66 | ) 67 | == d1 68 | ) 69 | 70 | d2 = {"a": 1, "b": None} 71 | s2 = '{"a": 1, "b": null}' 72 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d2) == s2 73 | assert ( 74 | DatasetFSReader._convert_metadata_from_json_to_dict( 75 | DatasetFSWriter._convert_metadata_from_dict_to_json(d2) 76 | ) 77 | == d2 78 | ) 79 | 80 | d3 = None 81 | s3 = None 82 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d3) == s3 83 | assert ( 84 | DatasetFSReader._convert_metadata_from_json_to_dict( 85 | DatasetFSWriter._convert_metadata_from_dict_to_json(d3) 86 | ) 87 | == d3 88 | ) 89 | 90 | d4 = {"a": 1, "b": np.nan} 91 | s4 = '{"a": 1, "b": NaN}' 92 | assert DatasetFSWriter._convert_metadata_from_dict_to_json(d4) == s4 93 | 94 | # TODO: np.nan is serialized as the non-standard JSON literal NaN and parses back to float("nan"); since NaN != NaN, the round-trip equality below cannot pass as written (compare with np.isnan instead). 95 | # print({"a": 1, "b": np.nan}) 96 | # print(DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4))) 97 | # print(type(DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4))['b'])) 98 | # print(type(np.nan)) 99 | # assert DatasetFSReader._convert_metadata_from_json_to_dict(DatasetFSWriter._convert_metadata_from_dict_to_json(d4)) == d4 100 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/pinecone-datasets/247837d84163450f15ce7d3a9919b13a9325ac43/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_public_datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def is_dicts_equal(d1, d2): 5 | return d1.keys() == d2.keys() and recursive_dict_compare(d1, d2) 6 | 7 | 8 | def deep_list_cmp(l1, l2): 9 | same = True 10 | for l, r in zip(l1, l2): 11 | same = same and l == r 12 | return same 13 | 14 | 15 | def approx_deep_list_cmp(l1, l2): 16 | same = True 17 | for l, r in zip(l1, l2): 18 | same = same and np.isclose(l, r) 19 | return same 20 | 21 | 22 | def recursive_dict_compare(d1, d2): 23 | # Compare every key; nested dicts and list-like values are checked recursively. 24 | same = True 25 | for k, v in d1.items(): 26 | if isinstance(v, dict): 27 | same = same and recursive_dict_compare(v, d2[k]) 28 | elif isinstance(v, (list, np.ndarray)): 29 | same = same and deep_list_cmp(v, d2[k]) 30 | else: 31 | same = same and v == d2[k] 32 | return same 33 | --------------------------------------------------------------------------------
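Taken together, the IO and schema tests above describe one round trip: build a Dataset in memory with Dataset.from_pandas, persist it through a Catalog rooted at a base path, and load it back by name. Below is a minimal end-to-end sketch of that flow; it is an illustrative example rather than repository code, the dataset name "demo-dataset" is a placeholder, and per the cloud-storage test a gs:// or s3:// base path is expected to behave the same way as the local directory used here.

    import tempfile

    import pandas as pd

    from pinecone_datasets import Catalog, Dataset, DatasetMetadata, DenseModelMetadata

    # Two documents shaped like the canonical schema used throughout the tests.
    docs = pd.DataFrame(
        [
            {"id": "1", "values": [0.1, 0.2, 0.3], "sparse_values": {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}, "metadata": {"title": "title1", "url": "url1"}, "blob": None},
            {"id": "2", "values": [0.4, 0.5, 0.6], "sparse_values": {"indices": [4, 5, 6], "values": [0.4, 0.5, 0.6]}, "metadata": {"title": "title2", "url": "url2"}, "blob": None},
        ]
    )
    meta = DatasetMetadata(
        name="demo-dataset",
        created_at="2021-01-01 00:00:00.000000",
        documents=2,
        queries=0,
        dense_model=DenseModelMetadata(name="ada2", dimension=2),
    )

    ds = Dataset.from_pandas(documents=docs, queries=None, metadata=meta)

    catalog = Catalog(base_path=tempfile.mkdtemp())  # pre-created local directory, as in the tmpdir-based tests
    catalog.save_dataset(ds)  # persists the dataset under the catalog's base path
    loaded = catalog.load_dataset("demo-dataset")
    assert loaded.documents.shape[0] == 2 and loaded.queries.empty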