├── .gitignore ├── txt2dataset ├── txt2dataset │ ├── __init__.py │ └── dataset_builder.py └── setup.py ├── .github └── workflows │ └── build_wheels.yml ├── LICENSE ├── examples ├── quickstart.csv └── quickstart.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | __pycache__/ -------------------------------------------------------------------------------- /txt2dataset/txt2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_builder import DatasetBuilder -------------------------------------------------------------------------------- /txt2dataset/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | # Read the contents of your README file 5 | with open(os.path.join(os.path.dirname(__file__), '../readme.md'), encoding='utf-8') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name="txt2dataset", 10 | version="0.4.1", 11 | packages=find_packages(), 12 | install_requires=[ 13 | "google-genai", 14 | "tqdm", 15 | ], 16 | description="Convert text to datasets", 17 | long_description=long_description, 18 | long_description_content_type='text/markdown', 19 | ) -------------------------------------------------------------------------------- /.github/workflows/build_wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and Upload to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install dependencies 20 | working-directory: ./txt2dataset # Added this 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 
| - name: Build package 26 | working-directory: ./txt2dataset # Added this 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Upload to PyPI 31 | env: 32 | TWINE_USERNAME: __token__ 33 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 34 | working-directory: ./txt2dataset # Added this 35 | run: | 36 | twine upload dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 John Friedman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /examples/quickstart.csv: -------------------------------------------------------------------------------- 1 | "_id","dividend_per_share","payment_date","record_date","stock_type_specified" 2 | "0","0.18","2021-05-24 00:00:00+00:00","2021-05-10 00:00:00+00:00","" 3 | "0","0.15","2020-07-12 00:00:00+00:00","2020-07-01 00:00:00+00:00","" 4 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 5 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 6 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 7 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 8 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 9 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 10 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 11 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 12 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 13 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 14 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 15 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 16 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 17 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 18 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 19 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 20 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 21 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 22 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 23 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 24 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 25 | 
"1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 26 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 27 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 28 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 29 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 30 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 31 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 32 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 33 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 34 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 35 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 36 | "1","0.25","2021-06-15 00:00:00+00:00","2021-06-01 00:00:00+00:00","" 37 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # txt2dataset 2 | A package for building, standardizing and validating datasets using language models. Supports the [Structured Output](https://github.com/Structured-Output) project. 
3 | 4 | * [Get a Gemini API Key](https://ai.google.dev/gemini-api/docs/api-key) 5 | 6 | ## Models Supported 7 | * Gemini 8 | 9 | ## Installation 10 | 11 | ```bash 12 | pip install txt2dataset 13 | ``` 14 | 15 | ## Usage 16 | 17 | ### Schema 18 | 19 | ```python 20 | from pydantic import BaseModel 21 | from typing import Optional, List 22 | from datetime import datetime 23 | 24 | class SingleDividend(BaseModel): 25 | dividend_per_share: float 26 | payment_date: Optional[datetime] = None 27 | record_date: Optional[datetime] = None 28 | stock_type_specified: Optional[str] = None 29 | 30 | class DividendExtraction(BaseModel): 31 | info_found: bool 32 | data: List[SingleDividend] = [] 33 | ``` 34 | 35 | ### Entries 36 | Entries consist of an identifier and the text to be structured. 37 | ```python 38 | entries = [ 39 | (0, 40 | """First Business Financial Services, Inc. (the "Company") issued a press release today 41 | announcing that the Company's Board of Directors declared a quarterly dividend of $0.18 42 | per share on April 30, 2021, unchanged compared to the last quarterly dividend per share. 43 | The dividend is payable on May 24, 2021 to shareholders of record on May 10, 2021. 44 | Also on July 12, 2020 there was a payable dividend of $0.15 per share to shareholders 45 | of record on July 1st, 2020."""), 46 | 47 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 48 | to shareholders of record as of June 1, 2021.""") 49 | ] 50 | ``` 51 | 52 | ### Prompt 53 | Choose a prompt such as: 54 | ```python 55 | prompt = "Extract ALL dividend information from this text" 56 | ``` 57 | 58 | ### Dataset Builder Initialization 59 | 60 | Choose the requests per minute that work for your api key and model. 
61 | 62 | ```python 63 | builder = DatasetBuilder( 64 | prompt=prompt, 65 | schema=DividendExtraction, 66 | model="gemini-2.5-flash-lite", 67 | entries=entries, 68 | rpm=4000 69 | ) 70 | ``` 71 | 72 | ### Build 73 | ```python 74 | builder.build() 75 | ``` 76 | 77 | ### Save 78 | ```python 79 | builder.save('test.csv') 80 | ``` 81 | 82 | Result: 83 | 84 | | _id | dividend_per_share | payment_date | record_date | stock_type_specified | 85 | |-----|---------------------|-------------------------------|-------------------------------|-----------------------| 86 | | 0 | 0.18 | 2021-05-24 00:00:00+00:00 | 2021-05-10 00:00:00+00:00 | | 87 | | 0 | 0.15 | 2020-07-12 00:00:00+00:00 | 2020-07-01 00:00:00+00:00 | | 88 | | 1 | 0.25 | 2021-06-15 00:00:00+00:00 | 2021-06-01 00:00:00+00:00 | | 89 | 90 | 91 | ## Future Features 92 | * validate() - checks that data types are expected. Needed less, thanks to the development of pydantic. 93 | * standardize() - standardizes data. 94 | 95 | 96 | -------------------------------------------------------------------------------- /examples/quickstart.py: -------------------------------------------------------------------------------- 1 | from txt2dataset import DatasetBuilder 2 | from pydantic import BaseModel 3 | from typing import Optional, List 4 | from datetime import datetime 5 | 6 | class SingleDividend(BaseModel): 7 | dividend_per_share: float 8 | payment_date: Optional[datetime] = None 9 | record_date: Optional[datetime] = None 10 | stock_type_specified: Optional[str] = None 11 | 12 | class DividendExtraction(BaseModel): 13 | info_found: bool 14 | data: List[SingleDividend] = [] 15 | 16 | # Sample texts 17 | entries = [(0, 18 | """First Business Financial Services, Inc. (the "Company") issued a press release today 19 | announcing that the Company's Board of Directors declared a quarterly dividend of $0.18 20 | per share on April 30, 2021, unchanged compared to the last quarterly dividend per share. 
21 | The dividend is payable on May 24, 2021 to shareholders of record on May 10, 2021. 22 | Also on July 12, 2020 there was a payable dividend of $0.15 per share to shareholders 23 | of record on July 1st, 2020."""), 24 | 25 | 26 | 27 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 28 | to shareholders of record as of June 1, 2021."""), 29 | 30 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 31 | to shareholders of record as of June 1, 2021."""), 32 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 33 | to shareholders of record as of June 1, 2021."""), 34 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 35 | to shareholders of record as of June 1, 2021."""), 36 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 37 | to shareholders of record as of June 1, 2021."""), 38 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 39 | to shareholders of record as of June 1, 2021."""), 40 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 41 | to shareholders of record as of June 1, 2021."""), 42 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 43 | to shareholders of record as of June 1, 2021."""), 44 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 45 | to shareholders of record as of June 1, 2021."""), 46 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 47 | to shareholders of record as of June 1, 2021."""), 48 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 49 | to shareholders of record as of June 1, 2021."""), 50 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 51 | to shareholders of record as of June 1, 2021."""), 52 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 53 | to shareholders of 
record as of June 1, 2021."""), 54 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 55 | to shareholders of record as of June 1, 2021."""), 56 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 57 | to shareholders of record as of June 1, 2021."""), 58 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 59 | to shareholders of record as of June 1, 2021."""), 60 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 61 | to shareholders of record as of June 1, 2021."""), 62 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 63 | to shareholders of record as of June 1, 2021."""), 64 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 65 | to shareholders of record as of June 1, 2021."""), 66 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 67 | to shareholders of record as of June 1, 2021."""), 68 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 69 | to shareholders of record as of June 1, 2021."""), 70 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 71 | to shareholders of record as of June 1, 2021."""), 72 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 73 | to shareholders of record as of June 1, 2021."""), 74 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 75 | to shareholders of record as of June 1, 2021."""), 76 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 77 | to shareholders of record as of June 1, 2021."""), 78 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 79 | to shareholders of record as of June 1, 2021."""), 80 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 81 | to shareholders of record as of June 1, 2021."""), 82 | (1,"""XYZ Corp declared a 
dividend of $0.25 per share, payable June 15, 2021 83 | to shareholders of record as of June 1, 2021."""), 84 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 85 | to shareholders of record as of June 1, 2021."""), 86 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 87 | to shareholders of record as of June 1, 2021."""), 88 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 89 | to shareholders of record as of June 1, 2021."""), 90 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 91 | to shareholders of record as of June 1, 2021."""), 92 | (1,"""XYZ Corp declared a dividend of $0.25 per share, payable June 15, 2021 93 | to shareholders of record as of June 1, 2021."""), 94 | 95 | 96 | ] 97 | 98 | # Create builder 99 | builder = DatasetBuilder( 100 | prompt="Extract ALL dividend information from this text", 101 | schema=DividendExtraction, 102 | model="gemini-2.5-flash-lite", 103 | entries=entries, 104 | rpm=4000 105 | ) 106 | 107 | # Build dataset 108 | builder.build() 109 | 110 | # Save to csv 111 | builder.save('quickstart.csv') -------------------------------------------------------------------------------- /txt2dataset/txt2dataset/dataset_builder.py: -------------------------------------------------------------------------------- 1 | from google import genai 2 | 3 | import os 4 | import asyncio 5 | import time 6 | import csv 7 | from collections import deque 8 | from tqdm import tqdm 9 | 10 | class AsyncRateLimiter: 11 | def __init__(self, rpm=60): 12 | self.rpm = rpm 13 | self.request_times = deque() 14 | self.lock = asyncio.Lock() 15 | 16 | async def acquire(self): 17 | """Wait until it's safe to make the next request using sliding window""" 18 | async with self.lock: 19 | now = time.time() 20 | 21 | # Remove requests older than 60 seconds 22 | while self.request_times and now - self.request_times[0] >= 60: 23 | self.request_times.popleft() 24 
| 25 | # If we're at the rate limit, wait until the oldest request is 60 seconds old 26 | if len(self.request_times) >= self.rpm: 27 | sleep_time = 60 - (now - self.request_times[0]) + 0.1 # Small buffer 28 | if sleep_time > 0: 29 | print(f"Rate limiting: waiting {sleep_time:.2f}s") 30 | await asyncio.sleep(sleep_time) 31 | # Clean up again after sleeping 32 | now = time.time() 33 | while self.request_times and now - self.request_times[0] >= 60: 34 | self.request_times.popleft() 35 | 36 | # Record this request 37 | self.request_times.append(now) 38 | 39 | class DatasetBuilder: 40 | def __init__(self, prompt, schema, model, entries, rpm=60, api_key=None, max_concurrent=10): 41 | self.prompt = prompt 42 | self.schema = schema 43 | self.model = model 44 | self.rpm = rpm 45 | self.max_concurrent = max_concurrent 46 | self.entries = entries # Will be modified in place 47 | self.info_found_bool = "info_found" 48 | 49 | # Check API key 50 | if not api_key and not os.getenv("GEMINI_API_KEY"): 51 | raise ValueError("API key must be provided either as an argument or through the GEMINI_API_KEY environment variable.") 52 | 53 | self.api_key = api_key 54 | self.client = genai.Client(api_key=self.api_key)  # pass explicit key; falls back to env var when None 55 | self.rate_limiter = AsyncRateLimiter(rpm) 56 | self.semaphore = asyncio.Semaphore(max_concurrent) 57 | 58 | # Progress tracking 59 | self.pbar = None 60 | self.success_count = 0 61 | self.error_count = 0 62 | self.progress_lock = asyncio.Lock() 63 | 64 | def _calculate_input_tokens_single(self, prompt, text): 65 | """Calculate estimated input tokens for a single text""" 66 | full_text = f"{prompt}: {text}" 67 | return len(full_text) // 4 68 | 69 | def _get_entry_state(self, entry): 70 | """Determine the state of an entry tuple""" 71 | if len(entry) == 2: 72 | return "unprocessed" 73 | elif len(entry) == 3: 74 | return "error" 75 | elif len(entry) == 4: 76 | return "success" 77 | else: 78 | return "unknown" 79 | 80 | def _get_entries_to_process(self): 81 | """Get list of indices for entries 
that need processing""" 82 | to_process = [] 83 | for i, entry in enumerate(self.entries): 84 | state = self._get_entry_state(entry) 85 | if state in ["unprocessed", "error"]: 86 | to_process.append(i) 87 | return to_process 88 | 89 | async def _update_progress(self, success=False, error=False): 90 | """Thread-safe progress bar update""" 91 | async with self.progress_lock: 92 | if success: 93 | self.success_count += 1 94 | if error: 95 | self.error_count += 1 96 | 97 | if self.pbar: 98 | self.pbar.set_description(f"✓{self.success_count} ✗{self.error_count}") 99 | self.pbar.update(1) 100 | 101 | async def _make_api_call(self, text): 102 | """Make rate-limited API call""" 103 | await self.rate_limiter.acquire() 104 | 105 | response = await self.client.aio.models.generate_content( 106 | model=self.model, 107 | contents=f"{self.prompt}: {text}", 108 | config={ 109 | "response_mime_type": "application/json", 110 | "response_schema": self.schema, 111 | }, 112 | ) 113 | return response 114 | 115 | def _process_response(self, response, entry_id): 116 | """Process a single API response and return (results, tokens_used)""" 117 | if not response: 118 | return [], 0 119 | 120 | try: 121 | response_data = response.parsed 122 | 123 | # Calculate output tokens (rough estimate) 124 | tokens_used = len(response.text) // 4 125 | 126 | # Check if info was found using the configurable boolean field 127 | if hasattr(response_data, self.info_found_bool) and getattr(response_data, self.info_found_bool): 128 | if hasattr(response_data, 'data') and response_data.data: 129 | results = [] 130 | if isinstance(response_data.data, list): 131 | for item in response_data.data: 132 | result_with_id = {"_id": entry_id, **item.dict()} 133 | results.append(result_with_id) 134 | else: 135 | result_with_id = {"_id": entry_id, **response_data.data.dict()} 136 | results.append(result_with_id) 137 | return results, tokens_used 138 | 139 | return [], tokens_used 140 | 141 | except Exception as e: 142 | 
raise Exception(f"Error processing response: {e}") 143 | 144 | async def _process_single_entry(self, entry_index): 145 | """Process a single entry with semaphore control""" 146 | async with self.semaphore: 147 | entry = self.entries[entry_index] 148 | entry_id = entry[0] 149 | text = entry[1] 150 | 151 | try: 152 | # Make API call 153 | response = await self._make_api_call(text) 154 | 155 | # Process response 156 | results, tokens_used = self._process_response(response, entry_id) 157 | 158 | # Update entry in place - success state 159 | self.entries[entry_index] = (entry_id, text, results, tokens_used) 160 | 161 | # Update progress 162 | await self._update_progress(success=True) 163 | 164 | except Exception as e: 165 | # Update entry in place - error state 166 | error_message = str(e) 167 | self.entries[entry_index] = (entry_id, text, error_message) 168 | print(f"✗ Error processing entry {entry_id}: {error_message}") 169 | 170 | # Update progress 171 | await self._update_progress(error=True) 172 | 173 | async def _build(self): 174 | """Build the dataset with concurrent processing""" 175 | # Find entries that need processing 176 | to_process = self._get_entries_to_process() 177 | 178 | if not to_process: 179 | print("No entries need processing") 180 | return 181 | 182 | print(f"Starting concurrent dataset building:") 183 | print(f"- {len(to_process)} entries to process") 184 | print(f"- Rate limit: {self.rpm} RPM") 185 | print(f"- Max concurrent: {self.max_concurrent}") 186 | 187 | # Calculate estimated input tokens for entries being processed 188 | total_input_tokens = sum( 189 | self._calculate_input_tokens_single(self.prompt, self.entries[i][1]) 190 | for i in to_process 191 | ) 192 | print(f"- Estimated input tokens: {total_input_tokens:,}") 193 | 194 | # Initialize progress tracking 195 | self.success_count = 0 196 | self.error_count = 0 197 | self.pbar = tqdm(total=len(to_process), desc="✓0 ✗0", unit="entries") 198 | 199 | try: 200 | # Process all entries 
concurrently 201 | tasks = [self._process_single_entry(i) for i in to_process] 202 | await asyncio.gather(*tasks, return_exceptions=True) 203 | finally: 204 | # Clean up progress bar 205 | if self.pbar: 206 | self.pbar.close() 207 | self.pbar = None 208 | 209 | # Print summary 210 | self._print_summary() 211 | 212 | def _print_summary(self): 213 | """Print processing summary""" 214 | success_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "success") 215 | error_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "error") 216 | unprocessed_count = sum(1 for entry in self.entries if self._get_entry_state(entry) == "unprocessed") 217 | 218 | # Count results and tokens 219 | total_results = 0 220 | total_tokens = 0 221 | for entry in self.entries: 222 | if self._get_entry_state(entry) == "success": 223 | total_results += len(entry[2]) # results list 224 | total_tokens += entry[3] # tokens used 225 | 226 | print(f"\nDataset building complete!") 227 | print(f"Successful: {success_count}") 228 | print(f"Errors: {error_count}") 229 | print(f"Unprocessed: {unprocessed_count}") 230 | print(f"Total results: {total_results}") 231 | print(f"Total tokens used: {total_tokens:,}") 232 | 233 | def build(self): 234 | """Public interface to build the dataset""" 235 | asyncio.run(self._build()) 236 | 237 | def get_results(self): 238 | """Extract all successful results from entries""" 239 | results = [] 240 | for entry in self.entries: 241 | if self._get_entry_state(entry) == "success": 242 | results.extend(entry[2]) # Add all results from this entry 243 | return results 244 | 245 | def save(self, filename): 246 | """Save results to CSV with all fields quoted""" 247 | results = self.get_results() 248 | 249 | if not results: 250 | print("No results to save") 251 | return 252 | 253 | # Get all unique fieldnames from all results 254 | fieldnames = set() 255 | for result in results: 256 | fieldnames.update(result.keys()) 257 | fieldnames = 
sorted(list(fieldnames)) 258 | 259 | with open(filename, 'w', newline='', encoding='utf-8') as csvfile: 260 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) 261 | writer.writeheader() 262 | writer.writerows(results) 263 | 264 | print(f"Saved {len(results)} records to {filename}") 265 | 266 | def get_errors(self): 267 | """Get all entries that have errors""" 268 | errors = [] 269 | for entry in self.entries: 270 | if self._get_entry_state(entry) == "error": 271 | errors.append({ 272 | "id": entry[0], 273 | "text": entry[1][:100] + "..." if len(entry[1]) > 100 else entry[1], 274 | "error": entry[2] 275 | }) 276 | return errors 277 | 278 | def print_errors(self): 279 | """Print all error entries for debugging""" 280 | errors = self.get_errors() 281 | if not errors: 282 | print("No errors found") 283 | return 284 | 285 | print(f"\nFound {len(errors)} errors:") 286 | for error in errors: 287 | print(f"ID {error['id']}: {error['error']}") 288 | print(f" Text: {error['text']}") 289 | print() --------------------------------------------------------------------------------