├── .gitignore ├── txt2dataset ├── txt2dataset │ ├── __init__.py │ └── dataset_builder.py └── setup.py ├── .github └── workflows │ └── build_wheels.yml ├── LICENSE ├── examples ├── quickstart.csv └── quickstart.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | __pycache__/ -------------------------------------------------------------------------------- /txt2dataset/txt2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_builder import DatasetBuilder -------------------------------------------------------------------------------- /txt2dataset/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | # Read the contents of your README file 5 | with open(os.path.join(os.path.dirname(__file__), '../readme.md'), encoding='utf-8') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name="txt2dataset", 10 | version="0.4.1", 11 | packages=find_packages(), 12 | install_requires=[ 13 | "google-genai", 14 | "tqdm", 15 | ], 16 | description="Convert text to datasets", 17 | long_description=long_description, 18 | long_description_content_type='text/markdown', 19 | ) -------------------------------------------------------------------------------- /.github/workflows/build_wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and Upload to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install dependencies 20 | working-directory: ./txt2dataset # Added this 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 
| - name: Build package 26 | working-directory: ./txt2dataset # Added this 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Upload to PyPI 31 | env: 32 | TWINE_USERNAME: __token__ 33 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 34 | working-directory: ./txt2dataset # Added this 35 | run: | 36 | twine upload dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 John Friedman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /examples/quickstart.csv: -------------------------------------------------------------------------------- 1 | "_id","dividend_per_share","payment_date","record_date","stock_type_specified" 2 | "0","0.18","2021-05-24 00:00:00+00:00","2021-05-10 00:00:00+00:00","" 3 | "0","0.15","2020-07-12 00:00:00+00:00","2020-07-01 00:00:00+00:00","" 4 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 5 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 6 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 7 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 8 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 9 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 10 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 11 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 12 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 13 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 14 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 15 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 16 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 17 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 18 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 19 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 20 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 21 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 22 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 23 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 24 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 25 | 
"1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 26 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 27 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 28 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 29 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 30 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 31 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 32 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 33 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 34 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 35 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 36 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 37 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # txt2dataset 2 | A package for building, standardizing and validating datasets using language models. Supports the [Structured Output](https://github.com/Structured-Output) project. 
3 | 4 | * [Get a Gemini API Key](https://ai.google.dev/gemini-api/docs/api-key) 5 | 6 | ## Models Supported 7 | * Gemini 8 | 9 | ## Installation 10 | 11 | ```bash 12 | pip install txt2dataset 13 | ``` 14 | 15 | ## Usage 16 | 17 | ### Schema 18 | 19 | ```python 20 | from pydantic import BaseModel 21 | from typing import Optional, List 22 | from datetime import datetime 23 | 24 | class SingleDividend(BaseModel): 25 | dividend_per_share: float 26 | payment_date: Optional[datetime] = None 27 | record_date: Optional[datetime] = None 28 | stock_type_specified: Optional[str] = None 29 | 30 | class DividendExtraction(BaseModel): 31 | info_found: bool 32 | data: List[SingleDividend] = [] 33 | ``` 34 | 35 | ### Entries 36 | Entries consist of an identifier and the text to be structured. 37 | ```python 38 | entries = [ 39 | (0, 40 | """First Business Financial Services, Inc. (the "Company") issued a press release today 41 | announcing that the Company's Board of Directors declared a quarterly dividend of $0.18 42 | per share on April 30, 2021, unchanged compared to the last quarterly dividend per share. 43 | The dividend is payable on May 24, 2021 to shareholders of record on May 10, 2021. 44 | Also on July 12, 2020 there was a payable dividend of $0.15 per share to shareholders 45 | of record on July 1st, 2020."""), 46 | 47 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 48 | to shareholders of record as of June 1, 2021.""") 49 | ] 50 | ``` 51 | 52 | ### Prompt 53 | Choose a prompt such as: 54 | ```python 55 | prompt = "Extract ALL dividend information from this text" 56 | ``` 57 | 58 | ### Dataset Builder Initialization 59 | 60 | Choose the requests per minute that work for your api key and model. 
61 | 62 | ```python 63 | builder = DatasetBuilder( 64 | prompt=prompt, 65 | schema=DividendExtraction, 66 | model="gemini-2.5-flash-lite", 67 | entries=entries, 68 | rpm=4000 69 | ) 70 | ``` 71 | 72 | ### Build 73 | ```python 74 | builder.build() 75 | ``` 76 | 77 | ### Save 78 | ```python 79 | builder.save('test.csv') 80 | ``` 81 | 82 | Result: 83 | 84 | | _id | dividend_per_share | payment_date | record_date | stock_type_specified | 85 | |-----|---------------------|-------------------------------|-------------------------------|-----------------------| 86 | | 0 | 0.18 | 2021-05-24 00:00:00+00:00 | 2021-05-10 00:00:00+00:00 | | 87 | | 0 | 0.15 | 2020-07-12 00:00:00+00:00 | 2020-07-01 00:00:00+00:00 | | 88 | | 1 | 0.25 | 2021-06-15 00:00:00+00:00 | 2021-06-01 00:00:00+00:00 | | 89 | 90 | 91 | ## Future Features 92 | * validate() - checks that data types are expected. Needed less, thanks to the development of pydantic. 93 | * standardize() - standardizes data. 94 | 95 | 96 | -------------------------------------------------------------------------------- /examples/quickstart.py: -------------------------------------------------------------------------------- 1 | from txt2dataset import DatasetBuilder 2 | from pydantic import BaseModel 3 | from typing import Optional, List 4 | from datetime import datetime 5 | 6 | class SingleDividend(BaseModel): 7 | dividend_per_share: float 8 | payment_date: Optional[datetime] = None 9 | record_date: Optional[datetime] = None 10 | stock_type_specified: Optional[str] = None 11 | 12 | class DividendExtraction(BaseModel): 13 | info_found: bool 14 | data: List[SingleDividend] = [] 15 | 16 | # Sample texts 17 | entries = [(0, 18 | """First Business Financial Services, Inc. (the "Company") issued a press release today 19 | announcing that the Company's Board of Directors declared a quarterly dividend of $0.18 20 | per share on April 30, 2021, unchanged compared to the last quarterly dividend per share. 
21 | The dividend is payable on May 24, 2021 to shareholders of record on May 10, 2021. 22 | Also on July 12, 2020 there was a payable dividend of $0.15 per share to shareholders 23 | of record on July 1st, 2020."""), 24 | 25 | 26 | 27 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 28 | to shareholders of record as of June 1, 2021."""), 29 | 30 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 31 | to shareholders of record as of June 1, 2021."""), 32 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 33 | to shareholders of record as of June 1, 2021."""), 34 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 35 | to shareholders of record as of June 1, 2021."""), 36 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 37 | to shareholders of record as of June 1, 2021."""), 38 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 39 | to shareholders of record as of June 1, 2021."""), 40 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 41 | to shareholders of record as of June 1, 2021."""), 42 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 43 | to shareholders of record as of June 1, 2021."""), 44 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 45 | to shareholders of record as of June 1, 2021."""), 46 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 47 | to shareholders of record as of June 1, 2021."""), 48 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 49 | to shareholders of record as of June 1, 2021."""), 50 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 51 | to shareholders of record as of June 1, 2021."""), 52 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 53 | to shareholders of 
record as of June 1, 2021."""), 54 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 55 | to shareholders of record as of June 1, 2021."""), 56 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 57 | to shareholders of record as of June 1, 2021."""), 58 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 59 | to shareholders of record as of June 1, 2021."""), 60 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 61 | to shareholders of record as of June 1, 2021."""), 62 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 63 | to shareholders of record as of June 1, 2021."""), 64 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 65 | to shareholders of record as of June 1, 2021."""), 66 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 67 | to shareholders of record as of June 1, 2021."""), 68 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 69 | to shareholders of record as of June 1, 2021."""), 70 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 71 | to shareholders of record as of June 1, 2021."""), 72 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 73 | to shareholders of record as of June 1, 2021."""), 74 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 75 | to shareholders of record as of June 1, 2021."""), 76 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 77 | to shareholders of record as of June 1, 2021."""), 78 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 79 | to shareholders of record as of June 1, 2021."""), 80 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 81 | to shareholders of record as of June 1, 2021."""), 82 | (1,"""XYZ Corp declared a 
dividend of $0.25 per share, payable June 15, 2021 83 | to shareholders of record as of June 1, 2021."""), 84 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 85 | to shareholders of record as of June 1, 2021."""), 86 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 87 | to shareholders of record as of June 1, 2021."""), 88 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 89 | to shareholders of record as of June 1, 2021."""), 90 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 91 | to shareholders of record as of June 1, 2021."""), 92 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 93 | to shareholders of record as of June 1, 2021."""), 94 | 95 | 96 | ] 97 | 98 | # Create builder 99 | builder = DatasetBuilder( 100 | prompt="Extract ALL dividend information from this text", 101 | schema=DividendExtraction, 102 | model="gemini-2.5-flash-lite", 103 | entries=entries, 104 | rpm=4000 105 | ) 106 | 107 | # Build dataset 108 | builder.build() 109 | 110 | # Save to csv 111 | builder.save('quickstart.csv') -------------------------------------------------------------------------------- /txt2dataset/txt2dataset/dataset_builder.py: -------------------------------------------------------------------------------- 1 | from google import genai 2 | 3 | import os 4 | import asyncio 5 | import time 6 | import csv 7 | from collections import deque 8 | from tqdm import tqdm 9 | 10 | class AsyncRateLimiter: 11 | def __init__(self, rpm=60): 12 | self.rpm = rpm 13 | self.request_times = deque() 14 | self.lock = asyncio.Lock() 15 | 16 | async def acquire(self): 17 | """Wait until it's safe to make the next request using sliding window""" 18 | async with self.lock: 19 | now = time.time() 20 | 21 | # Remove requests older than 60 seconds 22 | while self.request_times and now - self.request_times[0] >= 60: 23 | self.request_times.popleft() 24 
| 25 | # If we're at the rate limit, wait until the oldest request is 60 seconds old 26 | if len(self.request_times) >= self.rpm: 27 | sleep_time = 60 - (now - self.request_times[0]) + 0.1 # Small buffer 28 | if sleep_time > 0: 29 | print(f"Rate limiting: waiting {sleep_time:.2f}s") 30 | await asyncio.sleep(sleep_time) 31 | # Clean up again after sleeping 32 | now = time.time() 33 | while self.request_times and now - self.request_times[0] >= 60: 34 | self.request_times.popleft() 35 | 36 | # Record this request 37 | self.request_times.append(now) 38 | 39 | class DatasetBuilder: 40 | def __init__(self, prompt, schema, model, entries, rpm=60, api_key=None, max_concurrent=10): 41 | self.prompt = prompt 42 | self.schema = schema 43 | self.model = model 44 | self.rpm = rpm 45 | self.max_concurrent = max_concurrent 46 | self.entries = entries # Will be modified in place 47 | self.info_found_bool = "info_found" 48 | 49 | # Check API key 50 | if not api_key and not os.getenv("GEMINI_API_KEY"): 51 | raise ValueError("API key must be provided either as an argument or through the GEMINI_API_KEY environment variable.") 52 | 53 | self.api_key = api_key 54 | self.client = genai.Client(api_key=self.api_key)  # pass explicit key; falls back to env var when None 55 | self.rate_limiter = AsyncRateLimiter(rpm) 56 | self.semaphore = asyncio.Semaphore(max_concurrent) 57 | 58 | # Progress tracking 59 | self.pbar = None 60 | self.success_count = 0 61 | self.error_count = 0 62 | self.progress_lock = asyncio.Lock() 63 | 64 | def _calculate_input_tokens_single(self, prompt, text): 65 | """Calculate estimated input tokens for a single text""" 66 | full_text = f"{prompt}: {text}" 67 | return len(full_text) // 4 68 | 69 | def _get_entry_state(self, entry): 70 | """Determine the state of an entry tuple""" 71 | if len(entry) == 2: 72 | return "unprocessed" 73 | elif len(entry) == 3: 74 | return "error" 75 | elif len(entry) == 4: 76 | return "success" 77 | else: 78 | return "unknown" 79 | 80 | def _get_entries_to_process(self): 81 | """Get list of indices for entries 
that need processing""" 82 | to_process = [] 83 | for i, entry in enumerate(self.entries): 84 | state = self._get_entry_state(entry) 85 | if state in ["unprocessed", "error"]: 86 | to_process.append(i) 87 | return to_process 88 | 89 | async def _update_progress(self, success=False, error=False): 90 | """Thread-safe progress bar update""" 91 | async with self.progress_lock: 92 | if success: 93 | self.success_count += 1 94 | if error: 95 | self.error_count += 1 96 | 97 | if self.pbar: 98 | self.pbar.set_description(f"✓{self.success_count} ✗{self.error_count}") 99 | self.pbar.update(1) 100 | 101 | async def _make_api_call(self, text): 102 | """Make rate-limited API call""" 103 | await self.rate_limiter.acquire() 104 | 105 | response = await self.client.aio.models.generate_content( 106 | model=self.model, 107 | contents=f"{self.prompt}: {text}", 108 | config={ 109 | "response_mime_type": "application/json", 110 | "response_schema": self.schema, 111 | }, 112 | ) 113 | return response 114 | 115 | def _process_response(self, response, entry_id): 116 | """Process a single API response and return (results, tokens_used)""" 117 | if not response: 118 | return [], 0 119 | 120 | try: 121 | response_data = response.parsed 122 | 123 | # Calculate output tokens (rough estimate) 124 | tokens_used = len(response.text) // 4 125 | 126 | # Check if info was found using the configurable boolean field 127 | if hasattr(response_data, self.info_found_bool) and getattr(response_data, self.info_found_bool): 128 | if hasattr(response_data, 'data') and response_data.data: 129 | results = [] 130 | if isinstance(response_data.data, list): 131 | for item in response_data.data: 132 | result_with_id = {"_id": entry_id, **item.dict()} 133 | results.append(result_with_id) 134 | else: 135 | result_with_id = {"_id": entry_id, **response_data.data.dict()} 136 | results.append(result_with_id) 137 | return results, tokens_used 138 | 139 | return [], tokens_used 140 | 141 | except Exception as e: 142 | 
raise Exception(f"Error processing response: {e}") 143 | 144 | async def _process_single_entry(self, entry_index): 145 | """Process a single entry with semaphore control""" 146 | async with self.semaphore: 147 | entry = self.entries[entry_index] 148 | entry_id = entry[0] 149 | text = entry[1] 150 | 151 | try: 152 | # Make API call 153 | response = await self._make_api_call(text) 154 | 155 | # Process response 156 | results, tokens_used = self._process_response(response, entry_id) 157 | 158 | # Update entry in place - success state 159 | self.entries[entry_index] = (entry_id, text, results, tokens_used) 160 | 161 | # Update progress 162 | await self._update_progress(success=True) 163 | 164 | except Exception as e: 165 | # Update entry in place - error state 166 | error_message = str(e) 167 | self.entries[entry_index] = (entry_id, text, error_message) 168 | print(f"✗ Error processing entry {entry_id}: {error_message}") 169 | 170 | # Update progress 171 | await self._update_progress(error=True) 172 | 173 | async def _build(self): 174 | """Build the dataset with concurrent processing""" 175 | # Find entries that need processing 176 | to_process = self._get_entries_to_process() 177 | 178 | if not to_process: 179 | print("No entries need processing") 180 | return 181 | 182 | print(f"Starting concurrent dataset building:") 183 | print(f"- {len(to_process)} entries to process") 184 | print(f"- Rate limit: {self.rpm} RPM") 185 | print(f"- Max concurrent: {self.max_concurrent}") 186 | 187 | # Calculate estimated input tokens for entries being processed 188 | total_input_tokens = sum( 189 | self._calculate_input_tokens_single(self.prompt, self.entries[i][1]) 190 | for i in to_process 191 | ) 192 | print(f"- Estimated input tokens: {total_input_tokens:,}") 193 | 194 | # Initialize progress tracking 195 | self.success_count = 0 196 | self.error_count = 0 197 | self.pbar = tqdm(total=len(to_process), desc="✓0 ✗0", unit="entries") 198 | 199 | try: 200 | # Process all entries 
concurrently 201 | tasks = [self._process_single_entry(i) for i in to_process] 202 | await asyncio.gather(*tasks, return_exceptions=True) 203 | finally: 204 | # Clean up progress bar 205 | if self.pbar: 206 | self.pbar.close() 207 | self.pbar = None 208 | 209 | # Print summary 210 | self._print_summary() 211 | 212 | def _print_summary(self): 213 | """Print processing summary""" 214 | success_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "success") 215 | error_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "error") 216 | unprocessed_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "unprocessed") 217 | 218 | # Count results and tokens 219 | total_results = 0 220 | total_tokens = 0 221 | for entry in self.entries: 222 | if self._get_entry_state(entry) == "success": 223 | total_results += len(entry[2]) # results list 224 | total_tokens += entry[3] # tokens used 225 | 226 | print(f"\nDataset building complete!") 227 | print(f"Successful: {success_count}") 228 | print(f"Errors: {error_count}") 229 | print(f"Unprocessed: {unprocessed_count}") 230 | print(f"Total results: {total_results}") 231 | print(f"Total tokens used: {total_tokens:,}") 232 | 233 | def build(self): 234 | """Public interface to build the dataset""" 235 | asyncio.run(self._build()) 236 | 237 | def get_results(self): 238 | """Extract all successful results from entries""" 239 | results = [] 240 | for entry in self.entries: 241 | if self._get_entry_state(entry) == "success": 242 | results.extend(entry[2]) # Add all results from this entry 243 | return results 244 | 245 | def save(self, filename): 246 | """Save results to CSV with all fields quoted""" 247 | results = self.get_results() 248 | 249 | if not results: 250 | print("No results to save") 251 | return 252 | 253 | # Get all unique fieldnames from all results 254 | fieldnames = set() 255 | for result in results: 256 | fieldnames.update(result.keys()) 257 | fieldnames = 
sorted(list(fieldnames)) 258 | 259 | with open(filename, 'w', newline='', encoding='utf-8') as csvfile: 260 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) 261 | writer.writeheader() 262 | writer.writerows(results) 263 | 264 | print(f"Saved {len(results)} records to {filename}") 265 | 266 | def get_errors(self): 267 | """Get all entries that have errors""" 268 | errors = [] 269 | for entry in self.entries: 270 | if self._get_entry_state(entry) == "error": 271 | errors.append({ 272 | "id": entry[0], 273 | "text": entry[1][:100] + "..." if len(entry[1]) > 100 else entry[1], 274 | "error": entry[2] 275 | }) 276 | return errors 277 | 278 | def print_errors(self): 279 | """Print all error entries for debugging""" 280 | errors = self.get_errors() 281 | if not errors: 282 | print("No errors found") 283 | return 284 | 285 | print(f"\nFound {len(errors)} errors:") 286 | for error in errors: 287 | print(f"ID {error['id']}: {error['error']}") 288 | print(f" Text: {error['text']}") 289 | print() --------------------------------------------------------------------------------