├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── generate_data.py └── lookup.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, Coiled 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # One Trillion Row Challenge 2 | 3 | Inspired by Gunnar Morling's [one billion row challenge](https://github.com/gunnarmorling/1brc), we thought we'd take things one step further and start the one trillion row challenge (1TRC). 4 | 5 | We describe the 1TRC, dataset, and running the challenge with Dask on Coiled in [this blog post](https://docs.coiled.io/blog/1trc.html). 6 | 7 | ## The Challenge 8 | 9 | Your task is to use any tool(s) you’d like to calculate the min, mean, and max temperature per weather station, sorted alphabetically. The data is stored in Parquet on S3 in the `s3://coiled-datasets-rp/1trc` requester-pays bucket in AWS region `us-east-1`. Each file is 10 million rows and there are 100,000 files. For an extra challenge, you could also [generate the data yourself](#Data-Generation). 10 | 11 | ### How To Participate 12 | 13 | Open an issue in this repository with your submission and enough details for someone else to be able to run your implementation. This includes things like: 14 | 15 | - Hardware 16 | - Runtime 17 | - Reproducible code snippet 18 | 19 | There is no prize and everyone is a winner. Really, the idea is to solicit ideas and generate discussion. 20 | 21 | ## Data Generation 22 | 23 | You can generate the dataset yourself using the [data generation script](generate_data.py), adapted from [Jacob Tomlinson's 1BRC data generation script](https://github.com/gunnarmorling/1brc/discussions/487). We've also hosted the dataset in a requester pays S3 bucket `s3://coiled-datasets-rp/1trc` in `us-east-1`. 24 | 25 | It draws a random sample of weather stations and normally distributed temperatures drawn from the mean for each station based on the values in [lookup.csv](lookup.csv). 
# This script was adapted from Jacob Tomlinson's 1BRC submission
# https://github.com/gunnarmorling/1brc/discussions/487

import os
import tempfile

import coiled
import fsspec
import numpy as np
import pandas as pd
from dask.distributed import progress

n = 1_000_000_000_000  # Total number of rows of data to generate
chunksize = 10_000_000  # Number of rows of data per file
std = 10.0  # Assume normally distributed temperatures with a standard deviation of 10
lookup_df = pd.read_csv("lookup.csv")  # Lookup table of stations and their mean temperatures
bucket = "s3://coiled-datasets-rp/1trc"


def generate_chunk(partition_idx, bucket, chunksize, std, lookup_df):
    """Generate one Parquet file of sample measurements and upload it to S3.

    Each row pairs a station drawn uniformly at random from ``lookup_df``
    with a temperature drawn from a normal distribution centered on that
    station's mean and rounded to one decimal place.

    Parameters
    ----------
    partition_idx : int
        Index of the chunk. It seeds the random generator (so a given index
        always yields the same data) and names the output file
        ``measurements-{partition_idx}.parquet``.
    bucket : str
        Destination prefix; the file is uploaded to ``{bucket}/{filename}``
        through the fsspec ``"s3"`` filesystem.
    chunksize : int
        Number of rows to generate (coerced with ``int``).
    std : float
        Standard deviation shared by every station's temperature distribution.
    lookup_df : pandas.DataFrame
        Table with a ``station`` column (names) and a ``mean_temp`` column.

    Returns
    -------
    None
        The function is called for its side effect (the uploaded file).
    """
    n_rows = int(chunksize)
    rng = np.random.default_rng(partition_idx)  # Deterministic data generation

    # Choose a random station (as a row position in the lookup table) for each
    # output row. ``Generator.integers`` excludes the upper bound, so it must
    # be ``len(lookup_df)``: using ``len(lookup_df) - 1`` would silently never
    # pick the last station in the table.
    # NOTE: fixing the bound changes the random stream, so files generated
    # with the previous bound will not be reproduced bit-for-bit.
    station_idx = rng.integers(0, len(lookup_df), n_rows)

    # Generate a normal distribution around zero for each output row. Because
    # the std is the same for every station, the per-station mean can simply
    # be added afterwards.
    noise = rng.normal(0, std, n_rows)

    station_names = lookup_df["station"].to_numpy()
    station_means = lookup_df["mean_temp"].to_numpy()

    df = pd.DataFrame(
        {
            # Convert the station index to the station name
            "station": station_names[station_idx],
            # Offset each measurement by the station's mean value and round
            # the temperature to one decimal place
            "measure": (noise + station_means[station_idx]).round(1),
        }
    )

    # Write the chunk to a temporary local file, then upload it; the temporary
    # directory (and the local copy) is removed when the block exits.
    filename = f"measurements-{partition_idx}.parquet"
    with tempfile.TemporaryDirectory() as tmpdir:
        local = os.path.join(tmpdir, filename)
        df.to_parquet(local, engine="pyarrow")
        fs = fsspec.filesystem("s3")
        fs.put(local, f"{bucket}/{filename}")


if __name__ == "__main__":

    with coiled.Cluster(
        n_workers=500,
        worker_cpu=1,
        arm=True,
        region="us-east-1",
        spot_policy="spot_with_fallback",
    ) as cluster:
        with cluster.get_client() as client:
            # Generate partitioned dataset: one task (and one output file)
            # per chunk. Integer division keeps the task count an exact int
            # instead of round-tripping through a float.
            results = client.map(
                generate_chunk,
                range(n // chunksize),
                bucket=bucket,
                chunksize=chunksize,
                std=std,
                lookup_df=lookup_df,
            )
            progress(results)
Bergen,7.7 49 | Berlin,10.3 50 | Bilbao,14.7 51 | Birao,26.5 52 | Bishkek,11.3 53 | Bissau,27.0 54 | Blantyre,22.2 55 | Bloemfontein,15.6 56 | Boise,11.4 57 | Bordeaux,14.2 58 | Bosaso,30.0 59 | Boston,10.9 60 | Bouaké,26.0 61 | Bratislava,10.5 62 | Brazzaville,25.0 63 | Bridgetown,27.0 64 | Brisbane,21.4 65 | Brussels,10.5 66 | Bucharest,10.8 67 | Budapest,11.3 68 | Bujumbura,23.8 69 | Bulawayo,18.9 70 | Burnie,13.1 71 | Busan,15.0 72 | Cabo San Lucas,23.9 73 | Cairns,25.0 74 | Cairo,21.4 75 | Calgary,4.4 76 | Canberra,13.1 77 | Cape Town,16.2 78 | Changsha,17.4 79 | Charlotte,16.1 80 | Chiang Mai,25.8 81 | Chicago,9.8 82 | Chihuahua,18.6 83 | Chișinău,10.2 84 | Chittagong,25.9 85 | Chongqing,18.6 86 | Christchurch,12.2 87 | City of San Marino,11.8 88 | Colombo,27.4 89 | Columbus,11.7 90 | Conakry,26.4 91 | Copenhagen,9.1 92 | Cotonou,27.2 93 | Cracow,9.3 94 | Da Lat,17.9 95 | Da Nang,25.8 96 | Dakar,24.0 97 | Dallas,19.0 98 | Damascus,17.0 99 | Dampier,26.4 100 | Dar es Salaam,25.8 101 | Darwin,27.6 102 | Denpasar,23.7 103 | Denver,10.4 104 | Detroit,10.0 105 | Dhaka,25.9 106 | Dikson,-11.1 107 | Dili,26.6 108 | Djibouti,29.9 109 | Dodoma,22.7 110 | Dolisie,24.0 111 | Douala,26.7 112 | Dubai,26.9 113 | Dublin,9.8 114 | Dunedin,11.1 115 | Durban,20.6 116 | Dushanbe,14.7 117 | Edinburgh,9.3 118 | Edmonton,4.2 119 | El Paso,18.1 120 | Entebbe,21.0 121 | Erbil,19.5 122 | Erzurum,5.1 123 | Fairbanks,-2.3 124 | Fianarantsoa,17.9 125 | "Flores, Petén",26.4 126 | Frankfurt,10.6 127 | Fresno,17.9 128 | Fukuoka,17.0 129 | Gabès,19.5 130 | Gaborone,21.0 131 | Gagnoa,26.0 132 | Gangtok,15.2 133 | Garissa,29.3 134 | Garoua,28.3 135 | George Town,27.9 136 | Ghanzi,21.4 137 | Gjoa Haven,-14.4 138 | Guadalajara,20.9 139 | Guangzhou,22.4 140 | Guatemala City,20.4 141 | Halifax,7.5 142 | Hamburg,9.7 143 | Hamilton,13.8 144 | Hanga Roa,20.5 145 | Hanoi,23.6 146 | Harare,18.4 147 | Harbin,5.0 148 | Hargeisa,21.7 149 | Hat Yai,27.0 150 | Havana,25.2 151 | Helsinki,5.9 152 | 
Heraklion,18.9 153 | Hiroshima,16.3 154 | Ho Chi Minh City,27.4 155 | Hobart,12.7 156 | Hong Kong,23.3 157 | Honiara,26.5 158 | Honolulu,25.4 159 | Houston,20.8 160 | Ifrane,11.4 161 | Indianapolis,11.8 162 | Iqaluit,-9.3 163 | Irkutsk,1.0 164 | Istanbul,13.9 165 | İzmir,17.9 166 | Jacksonville,20.3 167 | Jakarta,26.7 168 | Jayapura,27.0 169 | Jerusalem,18.3 170 | Johannesburg,15.5 171 | Jos,22.8 172 | Juba,27.8 173 | Kabul,12.1 174 | Kampala,20.0 175 | Kandi,27.7 176 | Kankan,26.5 177 | Kano,26.4 178 | Kansas City,12.5 179 | Karachi,26.0 180 | Karonga,24.4 181 | Kathmandu,18.3 182 | Khartoum,29.9 183 | Kingston,27.4 184 | Kinshasa,25.3 185 | Kolkata,26.7 186 | Kuala Lumpur,27.3 187 | Kumasi,26.0 188 | Kunming,15.7 189 | Kuopio,3.4 190 | Kuwait City,25.7 191 | Kyiv,8.4 192 | Kyoto,15.8 193 | La Ceiba,26.2 194 | La Paz,23.7 195 | Lagos,26.8 196 | Lahore,24.3 197 | Lake Havasu City,23.7 198 | Lake Tekapo,8.7 199 | Las Palmas de Gran Canaria,21.2 200 | Las Vegas,20.3 201 | Launceston,13.1 202 | Lhasa,7.6 203 | Libreville,25.9 204 | Lisbon,17.5 205 | Livingstone,21.8 206 | Ljubljana,10.9 207 | Lodwar,29.3 208 | Lomé,26.9 209 | London,11.3 210 | Los Angeles,18.6 211 | Louisville,13.9 212 | Luanda,25.8 213 | Lubumbashi,20.8 214 | Lusaka,19.9 215 | Luxembourg City,9.3 216 | Lviv,7.8 217 | Lyon,12.5 218 | Madrid,15.0 219 | Mahajanga,26.3 220 | Makassar,26.7 221 | Makurdi,26.0 222 | Malabo,26.3 223 | Malé,28.0 224 | Managua,27.3 225 | Manama,26.5 226 | Mandalay,28.0 227 | Mango,28.1 228 | Manila,28.4 229 | Maputo,22.8 230 | Marrakesh,19.6 231 | Marseille,15.8 232 | Maun,22.4 233 | Medan,26.5 234 | Mek'ele,22.7 235 | Melbourne,15.1 236 | Memphis,17.2 237 | Mexicali,23.1 238 | Mexico City,17.5 239 | Miami,24.9 240 | Milan,13.0 241 | Milwaukee,8.9 242 | Minneapolis,7.8 243 | Minsk,6.7 244 | Mogadishu,27.1 245 | Mombasa,26.3 246 | Monaco,16.4 247 | Moncton,6.1 248 | Monterrey,22.3 249 | Montreal,6.8 250 | Moscow,5.8 251 | Mumbai,27.1 252 | Murmansk,0.6 253 | Muscat,28.0 254 | 
Mzuzu,17.7 255 | N'Djamena,28.3 256 | Naha,23.1 257 | Nairobi,17.8 258 | Nakhon Ratchasima,27.3 259 | Napier,14.6 260 | Napoli,15.9 261 | Nashville,15.4 262 | Nassau,24.6 263 | Ndola,20.3 264 | New Delhi,25.0 265 | New Orleans,20.7 266 | New York City,12.9 267 | Ngaoundéré,22.0 268 | Niamey,29.3 269 | Nicosia,19.7 270 | Niigata,13.9 271 | Nouadhibou,21.3 272 | Nouakchott,25.7 273 | Novosibirsk,1.7 274 | Nuuk,-1.4 275 | Odesa,10.7 276 | Odienné,26.0 277 | Oklahoma City,15.9 278 | Omaha,10.6 279 | Oranjestad,28.1 280 | Oslo,5.7 281 | Ottawa,6.6 282 | Ouagadougou,28.3 283 | Ouahigouya,28.6 284 | Ouarzazate,18.9 285 | Oulu,2.7 286 | Palembang,27.3 287 | Palermo,18.5 288 | Palm Springs,24.5 289 | Palmerston North,13.2 290 | Panama City,28.0 291 | Parakou,26.8 292 | Paris,12.3 293 | Perth,18.7 294 | Petropavlovsk-Kamchatsky,1.9 295 | Philadelphia,13.2 296 | Phnom Penh,28.3 297 | Phoenix,23.9 298 | Pittsburgh,10.8 299 | Podgorica,15.3 300 | Pointe-Noire,26.1 301 | Pontianak,27.7 302 | Port Moresby,26.9 303 | Port Sudan,28.4 304 | Port Vila,24.3 305 | Port-Gentil,26.0 306 | Portland (OR),12.4 307 | Porto,15.7 308 | Prague,8.4 309 | Praia,24.4 310 | Pretoria,18.2 311 | Pyongyang,10.8 312 | Rabat,17.2 313 | Rangpur,24.4 314 | Reggane,28.3 315 | Reykjavík,4.3 316 | Riga,6.2 317 | Riyadh,26.0 318 | Rome,15.2 319 | Roseau,26.2 320 | Rostov-on-Don,9.9 321 | Sacramento,16.3 322 | Saint Petersburg,5.8 323 | Saint-Pierre,5.7 324 | Salt Lake City,11.6 325 | San Antonio,20.8 326 | San Diego,17.8 327 | San Francisco,14.6 328 | San Jose,16.4 329 | San José,22.6 330 | San Juan,27.2 331 | San Salvador,23.1 332 | Sana'a,20.0 333 | Santo Domingo,25.9 334 | Sapporo,8.9 335 | Sarajevo,10.1 336 | Saskatoon,3.3 337 | Seattle,11.3 338 | Ségou,28.0 339 | Seoul,12.5 340 | Seville,19.2 341 | Shanghai,16.7 342 | Singapore,27.0 343 | Skopje,12.4 344 | Sochi,14.2 345 | Sofia,10.6 346 | Sokoto,28.0 347 | Split,16.1 348 | St. John's,5.0 349 | St. 
Louis,13.9 350 | Stockholm,6.6 351 | Surabaya,27.1 352 | Suva,25.6 353 | Suwałki,7.2 354 | Sydney,17.7 355 | Tabora,23.0 356 | Tabriz,12.6 357 | Taipei,23.0 358 | Tallinn,6.4 359 | Tamale,27.9 360 | Tamanrasset,21.7 361 | Tampa,22.9 362 | Tashkent,14.8 363 | Tauranga,14.8 364 | Tbilisi,12.9 365 | Tegucigalpa,21.7 366 | Tehran,17.0 367 | Tel Aviv,20.0 368 | Thessaloniki,16.0 369 | Thiès,24.0 370 | Tijuana,17.8 371 | Timbuktu,28.0 372 | Tirana,15.2 373 | Toamasina,23.4 374 | Tokyo,15.4 375 | Toliara,24.1 376 | Toluca,12.4 377 | Toronto,9.4 378 | Tripoli,20.0 379 | Tromsø,2.9 380 | Tucson,20.9 381 | Tunis,18.4 382 | Ulaanbaatar,-0.4 383 | Upington,20.4 384 | Ürümqi,7.4 385 | Vaduz,10.1 386 | Valencia,18.3 387 | Valletta,18.8 388 | Vancouver,10.4 389 | Veracruz,25.4 390 | Vienna,10.4 391 | Vientiane,25.9 392 | Villahermosa,27.1 393 | Vilnius,6.0 394 | Virginia Beach,15.8 395 | Vladivostok,4.9 396 | Warsaw,8.5 397 | "Washington, D.C.",14.6 398 | Wau,27.8 399 | Wellington,12.9 400 | Whitehorse,-0.1 401 | Wichita,13.9 402 | Willemstad,28.0 403 | Winnipeg,3.0 404 | Wrocław,9.6 405 | Xi'an,14.1 406 | Yakutsk,-8.8 407 | Yangon,27.5 408 | Yaoundé,23.8 409 | Yellowknife,-4.3 410 | Yerevan,12.4 411 | Yinchuan,9.0 412 | Zagreb,10.7 413 | Zanzibar City,26.0 414 | Zürich,9.3 415 | --------------------------------------------------------------------------------