├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE │ └── default.md └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── dev-requirements.txt ├── fruits.csv ├── hello_datastream.py ├── misc ├── hello_pyfca.py └── proof-of-correctness.md ├── nvidia_scraper.py ├── pytest.ini ├── scramjet ├── __init__.py ├── ansi_color_codes.py ├── pyfca.py ├── streams.py └── utils.py ├── setup.py └── test ├── __init__.py ├── conftest.py ├── large_test_files.py ├── sample_multibyte_text.txt ├── sample_numbers_1.txt ├── sample_text_0.txt ├── sample_text_1.txt ├── sample_text_2.txt ├── sample_text_3.txt ├── some-old-test-cases.py ├── test_batch.py ├── test_consumable.py ├── test_datastream_buffering.py ├── test_datastream_creation.py ├── test_datastream_read_write.py ├── test_datastream_transformations.py ├── test_flatmap.py ├── test_miscellaneous.py ├── test_pipe.py ├── test_processing_order.py ├── test_pyfca_spec.py ├── test_reading_files.py ├── test_reading_network.py ├── test_reduce.py ├── test_sequence.py ├── test_stringstream.py └── test_write_to.py /.coveragerc: -------------------------------------------------------------------------------- 1 | # settings for coverage.py 2 | 3 | [run] 4 | branch = True 5 | command_line = -m pytest 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Steps to Reproduce** 14 | 15 | 1. Go to '...' 16 | 2. ... 17 | 18 | ***Reproduction repo/branch*** 19 | 20 | 1. `git clone ` // optional 21 | 2. `git checkout -b test/somebranch user:branch` // so we can check out the problem 22 | 23 | **Expected behavior** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Version (please complete the following information):** 27 | 28 | - STH version: 29 | - python version: 30 | - os: 31 | 32 | **Additional information like logs, screenshots etc.** 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: "Scramjet's Discord" 3 | url: https://discord.gg/7ABeYbKDMb 4 | about: "Get support directly from the community and Scramjet's development team on our Discord server" -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Feature description** 11 | A clear and concise description of the feature -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/default.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change. 4 | 5 | # Fixes/Implements 6 | 7 | Please provide here links to issues/features. 
8 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | run: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions/setup-python@v2 12 | with: 13 | python-version: '3.8' 14 | - run: pip install -r dev-requirements.txt 15 | - run: pytest 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.pyc 3 | test_output 4 | large_text_* 5 | # output from coverage-py 6 | .coverage 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Scramjet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scramjet in Python 2 | ================== 3 | 4 |

5 | GitHub license 6 | version 7 | downloads 8 | GitHub stars 9 | 10 | Donate 11 | 12 |

13 |

⭐ Star us on GitHub — it motivates us a lot! 🚀

14 |

15 | Scramjet Framework 16 |

17 | 
 18 | Scramjet is a simple reactive stream programming framework. The code is written
 19 | by chaining functions that transform the streamed data, including the well-known
 20 | map, filter and reduce.
 21 | 
 22 | The main advantage of Scramjet is running asynchronous operations on your data
 23 | streams concurrently. It allows you to perform the transformations both
 24 | synchronously and asynchronously using the same API - so you can "map"
 25 | your stream from whatever source and call any number of APIs consecutively.
 26 | 
 27 | [Originally written](https://github.com/scramjetorg/scramjet) on top of Node.js
 28 | object streams, Scramjet is now being ported to Python. This is what is
 29 | happening in this repository.
 30 | 
 31 | >_Tested with Python 3.8.10 and Ubuntu 20.04._
 32 | ## Table of contents
 33 | 
 34 | - [Installation](#installation)
 35 | - [Quick start](#quick-start)
 36 | - [Usage](#usage)
 37 | - [Requesting features](#requesting-features)
 38 | - [Reporting bugs](#reporting-bugs)
 39 | - [Contributing](#contributing)
 40 | - [Development Setup](#development-setup)
 41 | 
 42 | ## Installation
 43 | 
 44 | Scramjet Framework is available on PyPI. You can install it with a simple pip command:
 45 | 
 46 | ```bash
 47 | pip install scramjet-framework-py
 48 | ```
 49 | ## Quick start
 50 | 
 51 | Let's say we have a `fruits.csv` file like this:
 52 | 
 53 | ```csv
 54 | orange,sweet,1
 55 | lemon,sour,2
 56 | pigface,salty,5
 57 | banana,sweet,3
 58 | cranberries,bitter,6
 59 | ```
 60 | 
 61 | and we want to write the names of the sweet fruits to a separate file.
 62 | To do this, write an async function like this:
 63 | 
 64 | 
 65 | ```python
 66 | 
 67 | from scramjet import streams
 68 | import asyncio
 69 | 
 70 | 
 71 | async def sweet_stream():
 72 |     with open("fruits.csv") as file_in, open("sweet.txt", "w") as file_out:
 73 |         await (
 74 |             streams.Stream
 75 |             .read_from(file_in)
 76 |             .map(lambda line: line.split(','))
 77 |             .filter(lambda record: record[1] == "sweet")
 78 |             .map(lambda record: f"{record[0]}\n")
 79 |             .write_to(file_out)
 80 |         )
 81 | 
 82 | asyncio.run(sweet_stream())
 83 | ```
 84 | 
 85 | output saved in `sweet.txt`:
 86 | 
 87 | ```
 88 | orange
 89 | banana
 90 | ```
 91 | 
 92 | and that's it!
 93 | 
 94 | ## Usage
 95 | 
 96 | The basic building block of Scramjet is the `Stream` class. It reads input in
 97 | chunks, performs operations on these chunks and produces an iterable output
 98 | that can be collected and written somewhere.
 99 | 
 100 | **Creating a stream** is done using the `read_from` class method. It accepts
 101 | any iterable or an object implementing a .read() method as the input, and returns
 102 | a `Stream` instance.
 103 | 
 104 | **Transforming a stream:**
 105 | 
 106 | * `map` - transform each chunk in a stream using the specified function.
 107 | * `filter` - keep only chunks for which the specified function evaluates to `True`.
 108 | * `flatmap` - run the specified function on each chunk, and return all of its results as separate chunks.
 109 | * `batch` - convert a stream of chunks into a stream of lists of chunks.
 110 | 
 111 | Each of these methods returns the modified stream, so they can be chained like
 112 | this: `some_stream.map(...).filter(...).batch(...)`
 113 | 
 114 | **Collecting data** from the stream (asynchronous):
 115 | 
 116 | * `write_to` - write all resulting stream chunks into a target.
 117 | * `to_list` - return a list with all stream chunks.
 118 | * `reduce` - combine all chunks using the specified function.
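
For instance, a short pipeline combining a few of these methods could look like this (a minimal sketch, analogous to the Quick start example above):

```python
import asyncio
from scramjet import streams

async def main():
    result = await (
        streams.Stream
        .read_from(range(1, 11))          # any iterable works as a source
        .map(lambda x: x * x)             # 1, 4, 9, ..., 100
        .filter(lambda x: x % 2 == 0)     # keep only the even squares
        .reduce(lambda a, b: a + b)       # sum them up
    )
    print(result)  # 220

asyncio.run(main())
```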
119 | 
 120 | 
 121 | Examples :books:
 122 | --------
 123 | 
 124 | You can find more examples in the [`hello_datastream.py`](./hello_datastream.py)
 125 | file. They don't require any additional dependencies, just the standard library,
 126 | so you can run them simply with:
 127 | 
 128 | ```bash
 129 | python hello_datastream.py
 130 | ```
 131 | 
 132 | ## Requesting Features
 133 | 
 134 | Anything missing? Or maybe there is something that would make using Scramjet Framework much easier or more efficient? Don't hesitate to file a [new feature request](https://github.com/scramjetorg/framework-python/issues/new)! We really appreciate all feedback.
 135 | 
 136 | ## Reporting bugs
 137 | 
 138 | If you have found a bug, or inconsistent or confusing behavior, please file a [new bug report](https://github.com/scramjetorg/framework-python/issues/new).
 139 | 
 140 | ## Contributing
 141 | 
 142 | You can contribute to this project by giving us feedback ([reporting bugs](#reporting-bugs) and [requesting features](#requesting-features)) and also by writing code yourself!
 143 | 
 144 | The easiest way is to [create a fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) of this repository and then [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) with all your changes. In most cases, you should branch from and target the `main` branch.
 145 | 
 146 | Please refer to the [Development Setup](#development-setup) section for how to set up this project.
 147 | 
 148 | ## Development Setup
 149 | 
 150 | 1. Install a Python 3 interpreter on your computer. Refer to the [official docs](https://wiki.python.org/moin/BeginnersGuide/Download).
 151 | 
 152 | 2. Install the `git` version control system. Refer to the [official docs](https://git-scm.com/downloads).
 153 | 
 154 | 3. Clone this repository:
 155 | 
 156 | ```bash
 157 | git clone git@github.com:scramjetorg/framework-python.git
 158 | ```
 159 | 4. Create and activate a virtualenv:
 160 | 
 161 | ```bash
 162 | sudo apt install python3-virtualenv
 163 | virtualenv -p python3 venv
 164 | source venv/bin/activate
 165 | ```
 166 | 
 167 | 5. Check the Python version:
 168 | 
 169 | ```bash
 170 | $ python --version
 171 | Python 3.8.10
 172 | ```
 173 | 
 174 | 6. Install dependencies:
 175 | 
 176 | ```bash
 177 | pip install -r dev-requirements.txt
 178 | ```
 179 | 
 180 | 7. Run the test cases (with the virtualenv activated):
 181 | 
 182 | ```bash
 183 | pytest
 184 | ```
 185 | 
 186 | > :bulb: **HINT:** add a filename if you want to limit which tests are run
 187 | 
 188 | 
 189 | 8. 
If you want to enable detailed debug logging, set one of the following env variables: 190 | 191 | ```bash 192 | PYFCA_DEBUG=1 # debug pyfca 193 | DATASTREAM_DEBUG=1 # debug datastream 194 | SCRAMJET_DEBUG=1 # debug both 195 | ``` -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | pytest 3 | pytest-asyncio 4 | aiofiles 5 | pyppeteer==1.0.2 6 | requests==2.27.1 -------------------------------------------------------------------------------- /fruits.csv: -------------------------------------------------------------------------------- 1 | orange,sweet,1 2 | lemon,sour,2 3 | strawberry,sweet,9 4 | pigface,salty,5 5 | banana,sweet,3 6 | cranberries,bitter,6 -------------------------------------------------------------------------------- /hello_datastream.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StringStream 2 | import asyncio 3 | 4 | # color codes for pretty output 5 | grey="\033[37m" 6 | strong="\033[97;1m" 7 | reset="\033[0m" 8 | 9 | 10 | 11 | # Simple stream transforming a list of dollar amounts 12 | async def simple_stream_example(): 13 | data = ['$8', '$25', '$3', '$14', '$20', '$9', '$13', '$16'] 14 | print("Input:", data, '\n') 15 | result = await ( 16 | Stream 17 | .read_from(data) 18 | .each(lambda x: print("Echo (in):", repr(x))) 19 | .map(lambda s: int(s[1:])) 20 | .filter(lambda x: x % 2 == 0) 21 | .map(lambda x: x/2) 22 | .map(lambda x: "$" + str(x)) 23 | .each(lambda x: print("Echo (out):", repr(x))) 24 | .to_list() 25 | ) 26 | print("\nOutput:", result) # ['$4.0', '$7.0', '$10.0', '$8.0'] 27 | 28 | print(f"\n{strong}Running simple_stream_example:{reset}") 29 | asyncio.run(simple_stream_example()) 30 | 31 | 32 | 33 | # Asynchronous transformations are performed concurrently on multiple chunks. 34 | import random 35 | random.seed() 36 | 37 | async def delayed_square(x): 38 | delay = round(random.uniform(0.1, 0.5), 2) 39 | print(f"Start processing {x} {grey}({delay}s){reset}") 40 | await asyncio.sleep(delay) 41 | print(f"Result: {x} -> {x**2} {grey}({delay}s){reset}") 42 | return x**2 43 | 44 | async def async_stream_example(): 45 | result = await ( 46 | Stream 47 | .read_from(range(12), max_parallel=4) 48 | .map(delayed_square) 49 | .to_list() 50 | ) 51 | print("\nOutput:", result) 52 | 53 | print(f"\n{strong}Running async_stream_example:{reset}") 54 | asyncio.run(async_stream_example()) 55 | 56 | 57 | 58 | # Chunk size can be specified. Notice how words that were split across 59 | # the chunks are later glued together. 60 | async def stream_from_file_example(): 61 | path = 'test/sample_text_3.txt' 62 | with open(path) as file: 63 | print("Input:", file.read(), '\n') 64 | 65 | with open(path) as file: 66 | result = await ( 67 | StringStream 68 | .read_from(file, chunk_size=32) 69 | .each(lambda x: print(f"Read: {repr(x)}")) 70 | .split() 71 | .to_list() 72 | ) 73 | print("\nOutput:", result) 74 | 75 | print(f"\n{strong}Running stream_from_file_example:{reset}") 76 | asyncio.run(stream_from_file_example()) 77 | -------------------------------------------------------------------------------- /misc/hello_pyfca.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | # this is a demonstration of how pyfca, the internal 4 | # stream transformation scheduler, is working. 
5 | 6 | import asyncio 7 | import sys 8 | from pprint import pprint 9 | import random 10 | 11 | from scramjet import pyfca 12 | import scramjet.utils as utils 13 | from scramjet.ansi_color_codes import * 14 | 15 | log = utils.LogWithTimer.log 16 | random.seed('Pyfca') 17 | 18 | # Input data 19 | 20 | NUMBER_SEQUENCE = [1, 3, 2, 6, 4, 5] 21 | TEST_DATA = [ 22 | {'id': count, 'delay': value*0.5} 23 | for count, value 24 | in enumerate(NUMBER_SEQUENCE) 25 | ] 26 | 27 | # Transformation functions 28 | 29 | async def mock_delay(data): 30 | delay = data['delay'] 31 | await asyncio.sleep(delay) 32 | 33 | async def async_identity(x): 34 | log(f'{yellow}identity start:{reset} {x}') 35 | await mock_delay(x) 36 | log(f'{yellow}identity end:{reset} -> {x}') 37 | return x 38 | 39 | # Processing samples 40 | 41 | async def simple_pyfca_example(): 42 | print('Input:'); pprint(TEST_DATA) 43 | 44 | p = pyfca.Pyfca(4, async_identity) 45 | reads = [p.read() for _ in TEST_DATA] 46 | for x in TEST_DATA: 47 | await p.write(x) 48 | results = await asyncio.gather(*reads) 49 | 50 | print('Results:'); pprint(results) 51 | 52 | print(f"\n{strong}Running simple_pyfca_example:{reset}") 53 | asyncio.run(simple_pyfca_example()) 54 | -------------------------------------------------------------------------------- /misc/proof-of-correctness.md: -------------------------------------------------------------------------------- 1 | Proof of correctness 2 | -------------------- 3 | 4 | `test/test_processing_order.py` demonstrates the correctness of the algorithm. 5 | 6 | The input sequence for the test is a list of dictionaries, each containing an 7 | `id` key which matches the order of the item in the sequence. 8 | 9 | The transformation consists of two functions: 10 | - the first one returns immediately for inputs with even `id` but has a delay 11 | for inputs with odd `id`, 12 | - the second one is synchronous and returns immediately. 13 | 14 | The soft limit for number of items processed in parallel is 4. 15 | 16 | To perform the test, run: 17 | 18 | PYFCA_DEBUG=1 pytest -vs test/test_processing_order.py 19 | 20 | The following happens: 21 | 22 | 1. Six items are written. 23 | 1. First 3 writes resolve immediately, because processing queue is initially 24 | empty. 25 | 1. Next 3 writes return a pending Future object, which get resolved as previous 26 | items are processed. 27 | 1. First transformation is performed on the items in the order matching their 28 | `id`s. 29 | 1. Second transformation is performed as soon as items become available. This 30 | means that it's performed on the elements with even `id`s first, because 31 | they are ready for processing earlier than the elements with odd `id`s. 32 | 1. However, the ordering of the results matches the input order. 33 | 34 | Final results look as follows: 35 | 36 | {'id': 0, 'n': 0, 'x': 0, 'y': 0} 37 | {'id': 1, 'n': 1, 'x': 1, 'y': 3} 38 | {'id': 2, 'n': 0, 'x': 2, 'y': 1} 39 | {'id': 3, 'n': 1, 'x': 3, 'y': 4} 40 | {'id': 4, 'n': 0, 'x': 4, 'y': 2} 41 | {'id': 5, 'n': 1, 'x': 5, 'y': 5} 42 | 43 | The added keys indicate the order of execution of specific operations within 44 | the `IFCA` transform chain. The meaning is as follows: 45 | 46 | - `id` is exactly the same as in input, 47 | - `n` denotes item parity (even items have `n=0`) - a visual helper to see 48 | which data points were delayed, 49 | - `x` is the order of the execution of the first function, 50 | - `y` is the order of the execution of the second function. 
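
For reference, here is a minimal sketch of wiring up such a two-step chain (this is not the actual test code; it assumes the `Pyfca` API from `scramjet/pyfca.py`, i.e. `Pyfca(max_parallel)` plus `add_transform`, `write`, `read` and `end`):

    import asyncio
    from scramjet.pyfca import Pyfca

    async def first(item):
        # returns immediately for items with even `id`, is delayed for odd ones
        if item['id'] % 2:
            await asyncio.sleep(0.1)
        return item

    def second(item):
        # synchronous, returns immediately
        item['done'] = True
        return item

    async def main():
        ifca = Pyfca(4)                   # soft limit of 4 items processed in parallel
        ifca.add_transform(first)
        ifca.add_transform(second)
        reads = [ifca.read() for _ in range(6)]
        for i in range(6):
            ifca.write({'id': i})
        ifca.end()
        for result in await asyncio.gather(*reads):
            print(result)                 # results arrive in write order

    asyncio.run(main())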
51 | 52 | The test indicates that the chained functions are executed immediately after 53 | each other (as soon as the item is processed by one function it starts being 54 | procesed by the next function), while the read order exactly follows the write 55 | order. 56 | -------------------------------------------------------------------------------- /nvidia_scraper.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | from pyppeteer import launch 3 | 4 | import asyncio 5 | import requests 6 | 7 | # color codes for pretty output 8 | grey="\033[37m" 9 | strong="\033[97;1m" 10 | reset="\033[0m" 11 | 12 | params = { 13 | 'skus': 'DE', 14 | 'locale': 'DE' 15 | } 16 | 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:93.0) Gecko/20100101 Firefox/93.0', 19 | 'Accept': 'application/json, text/plain, */*', 20 | 'Accept-Language': 'en-GB', 21 | 'Origin': 'https://shop.nvidia.com', 22 | 'DNT': '1', 23 | 'Connection': 'keep-alive', 24 | 'Referer': 'https://shop.nvidia.com/', 25 | 'Sec-Fetch-Dest': 'empty', 26 | 'Sec-Fetch-Mode': 'cors', 27 | 'Sec-Fetch-Site': 'same-site', 28 | 'TE': 'trailers', 29 | } 30 | 31 | # Simple Nvidia page scraper 32 | async def simple_nvidia_scraper_example() -> None: 33 | data = requests.get("https://api.store.nvidia.com/partner/v1/feinventory", params=params, headers=headers).json() 34 | urls = await ( 35 | Stream 36 | .read_from(data.get('listMap')) 37 | .filter(lambda x: x.get('is_active') == 'true') 38 | .map(lambda x: x.get('product_url')) 39 | .to_list() 40 | ) 41 | browser = await launch( 42 | headless=False, 43 | autoClose=False 44 | ) 45 | pages = await browser.pages() 46 | 47 | for idx, url in enumerate(urls): 48 | await pages[idx].goto(url) 49 | if len(pages) < len(urls): 50 | pages.append(await browser.newPage()) 51 | 52 | 53 | print(f"\n{strong}Running simple_stream_example:{reset}") 54 | asyncio.run(simple_nvidia_scraper_example()) 55 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | asyncio_mode = auto 4 | -------------------------------------------------------------------------------- /scramjet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scramjetorg/framework-python/7e84db2ef65af3f944957f1745de0045ee844060/scramjet/__init__.py -------------------------------------------------------------------------------- /scramjet/ansi_color_codes.py: -------------------------------------------------------------------------------- 1 | red="\033[31m" 2 | green="\033[32m" 3 | yellow="\033[33m" 4 | blue="\033[34m" 5 | pink="\033[35m" 6 | cyan="\033[36m" 7 | grey="\033[37m" 8 | 9 | bold="\033[1m" 10 | strong="\033[97;1m" 11 | reset="\033[0m" 12 | -------------------------------------------------------------------------------- /scramjet/pyfca.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | import asyncio 4 | from os import environ 5 | from scramjet.ansi_color_codes import * 6 | import scramjet.utils as utils 7 | 8 | DEBUG = 'PYFCA_DEBUG' in environ or 'SCRAMJET_DEBUG' in environ 9 | fmt = utils.print_formatted 10 | 11 | def log(pyfca, *args): 12 | if DEBUG: # pragma: no cover 13 | utils.LogWithTimer.log(f"{grey}{pyfca.name}{reset}", *args) 14 | 15 | 16 | # Use this 
class to tell pyfca to drop a chunk. 17 | class DropChunk: 18 | pass 19 | 20 | 21 | class WriteAfterEnd(Exception): 22 | pass 23 | 24 | 25 | class MultipleEnd(Exception): 26 | pass 27 | 28 | 29 | class Pyfca: 30 | def __init__(self, max_parallel, name="pyfca"): 31 | self.max_parallel = max_parallel 32 | self.name = name 33 | self._transform_chain = [] 34 | 35 | self._processing = asyncio.Queue() 36 | self._ready = asyncio.Queue() 37 | self._waiting_for_read = asyncio.Queue() 38 | self.ended = False 39 | # increment on write, decrement on read 40 | self.read_write_balance = 0 41 | 42 | # sentinels for start/end conditions 43 | self.last_chunk_status = asyncio.Future() 44 | self.last_chunk_status.set_result(True) 45 | self.last_chunk_status.chunk = {'id': 'start-sentinel'} 46 | for _ in range(max_parallel - 1): 47 | self._processing.put_nowait(self.last_chunk_status) 48 | 49 | self._no_more_items = asyncio.Future() 50 | self._no_more_items.set_result(None) 51 | 52 | log(self, 'INIT finished') 53 | 54 | 55 | def write(self, chunk): 56 | if self.ended: 57 | raise WriteAfterEnd 58 | self.read_write_balance += 1 59 | 60 | chunk_status = asyncio.Future() 61 | self._processing.put_nowait(chunk_status) 62 | task = asyncio.create_task(self._process(chunk, chunk_status)) 63 | if self.read_write_balance < self.max_parallel: 64 | # if we always returned gather, we would loose sync 65 | drain = self._processing.get_nowait() 66 | else: 67 | waiting = asyncio.Future() 68 | self._waiting_for_read.put_nowait(waiting) 69 | drain = asyncio.gather(self._processing.get_nowait(), waiting) 70 | 71 | if DEBUG: # pragma: no cover 72 | chunk_status.chunk = chunk 73 | task.set_name(f'process {utils.pprint_chunk(chunk)}') 74 | log(self, f"WRITE {fmt(chunk)} r/w balance: {self.read_write_balance}") 75 | log(self, f" - {fmt(chunk)} scheduled: {task}") 76 | log(self, f" - {fmt(chunk)} return: {fmt(drain)}") 77 | 78 | return drain 79 | 80 | 81 | def read(self): 82 | if self.ended and self.read_write_balance <= 0: 83 | log(self, 'READ processing ended, return None') 84 | return self._no_more_items 85 | 86 | self.read_write_balance -= 1 87 | log(self, f'READ r/w balance: {self.read_write_balance}') 88 | 89 | try: 90 | waiting = self._waiting_for_read.get_nowait() 91 | waiting.set_result(True) 92 | except asyncio.queues.QueueEmpty: 93 | pass 94 | 95 | awaitable = self._ready.get() 96 | log(self, f' - return: {awaitable}') 97 | return awaitable 98 | 99 | 100 | def end(self): 101 | if self.ended: 102 | raise MultipleEnd 103 | log(self, f'{red}END{reset} stop accepting input.') 104 | log(self, f' - r/w balance: {self.read_write_balance}') 105 | self.ended = True 106 | # schedule as a task to make sure it will run after any pending 107 | # _process updates last_chunk_status 108 | asyncio.create_task(self._resolve_overflow_readers()) 109 | 110 | 111 | def add_transform(self, transformation): 112 | self._transform_chain.append(transformation) 113 | log(self, f'ADD_TRANSFORM current chain: {self._transform_chain}') 114 | 115 | 116 | async def _resolve_overflow_readers(self): 117 | log(self, f'END waiting for last item: {self.last_chunk_status}') 118 | await self.last_chunk_status 119 | 120 | log(self, f'END final r/w balance: {self.read_write_balance}') 121 | for _ in range(-self.read_write_balance): 122 | self._ready.put_nowait(None) 123 | log(self, f' - appended None') 124 | 125 | 126 | async def _process(self, chunk, chunk_status): 127 | previous = self.last_chunk_status 128 | 129 | if DEBUG: # pragma: no cover 130 | log(self, 
f'PROCESS {fmt(chunk)} previous item: {fmt(previous)}') 131 | log(self, f' - {fmt(chunk)} status: {fmt(chunk_status)}') 132 | 133 | self.last_chunk_status = chunk_status 134 | result = chunk 135 | for func in self._transform_chain: 136 | result = func(result) 137 | log(self, f' - {fmt(chunk)} function: {func}') 138 | log(self, f' - {fmt(chunk)} yielded: {repr(result)}') 139 | if asyncio.iscoroutine(result): 140 | result = await result 141 | log(self, f'PROCESS {fmt(chunk)} resolved: {repr(result)}') 142 | if result is DropChunk: 143 | break 144 | 145 | log(self, f' - {fmt(chunk)} processing {pink}finished{reset}') 146 | log(self, f' - {fmt(chunk)} awaiting for previous chunk: {fmt(previous)}') 147 | await previous 148 | chunk_status.set_result(True) 149 | log(self, f'PROCESS {fmt(chunk)} status: {fmt(chunk_status)}') 150 | 151 | if result is not DropChunk: 152 | log(self, f' - {fmt(chunk)} {green}return{reset}: ' 153 | f' {utils.print_trimmed(result, color=False)}') 154 | await self._ready.put(result) 155 | else: 156 | log(self, f' - {fmt(chunk)} {cyan}drop chunk{reset}') 157 | self.read_write_balance -= 1 158 | if self.read_write_balance == self.max_parallel - 1: 159 | waiting = self._waiting_for_read.get_nowait() 160 | waiting.set_result(True) 161 | 162 | -------------------------------------------------------------------------------- /scramjet/streams.py: -------------------------------------------------------------------------------- 1 | from scramjet.pyfca import Pyfca, DropChunk 2 | import asyncio 3 | from scramjet.ansi_color_codes import * 4 | from os import environ 5 | import scramjet.utils as utils 6 | from collections.abc import Iterable, AsyncIterable 7 | import re 8 | import time 9 | import random 10 | 11 | DEBUG = 'DATASTREAM_DEBUG' in environ or 'SCRAMJET_DEBUG' in environ 12 | tr = utils.print_trimmed 13 | 14 | def log(stream, *args): 15 | if DEBUG: # pragma: no cover 16 | utils.LogWithTimer.log(f"{grey}{stream.name}{reset}", *args) 17 | 18 | 19 | class UnsupportedOperation(Exception): 20 | pass 21 | 22 | class StreamAlreadyConsumed(Exception): 23 | pass 24 | 25 | 26 | class Stream(): 27 | def __init__(self, max_parallel=64, upstream=None, origin=None, name="datastream"): 28 | self._upstream = upstream 29 | self._origin = origin if origin else self 30 | self.name = name 31 | # whether we can write to the stream instance 32 | self._writable = True 33 | # whether the stream was already "used" (transformed/read from) 34 | self._consumed = False 35 | self._pyfca = upstream._pyfca if upstream else Pyfca(max_parallel) 36 | self._ready_to_start = asyncio.Future() 37 | self._sinks = [] 38 | self._uid = str(time.time()) + str(random.randint(1, 1000000)) 39 | log(self, f'INIT stream created with pyfca {self._pyfca}') 40 | 41 | def __await__(self): 42 | raise TypeError( 43 | "Stream objects cannot be awaited on. To get data from a stream, " 44 | "use a sink method (such as .to_list()) and await on that." 
45 | ) 46 | 47 | async def __aiter__(self): 48 | self._uncork() 49 | while True: 50 | chunk = await self._pyfca.read() 51 | if chunk is None: 52 | break 53 | yield chunk 54 | 55 | def _uncork(self): 56 | if not self._ready_to_start.done(): 57 | self._ready_to_start.set_result(True) 58 | log(self, f'{green}uncorked{reset}') 59 | if self._upstream: 60 | log(self, f'uncorking upstream: {self._upstream.name}') 61 | self._upstream._uncork() 62 | 63 | def _mark_consumed(self): 64 | if self._consumed: # cannot consume the same stream twice 65 | raise StreamAlreadyConsumed 66 | else: 67 | self._consumed = True 68 | 69 | def _as(self, target_class): 70 | """Create a stream of type target_class from current one.""" 71 | return target_class( 72 | upstream=self, 73 | max_parallel=self._pyfca.max_parallel, 74 | name=f'{self.name}+_' 75 | ) 76 | 77 | def use(self, func): 78 | """Perform a function on the whole stream and return the result.""" 79 | return func(self) 80 | 81 | def write(self, chunk): 82 | """Write a single item to the datastream.""" 83 | return self._origin._pyfca.write(chunk) 84 | 85 | def end(self): 86 | """Mark the end of input to the datastream.""" 87 | self._pyfca.end() 88 | 89 | async def read(self): 90 | """Read a single item from the datastream.""" 91 | # cannot read from stream consumed by something else 92 | if self._consumed: 93 | raise StreamAlreadyConsumed 94 | self._uncork() 95 | return await self._pyfca.read() 96 | 97 | 98 | @classmethod 99 | def read_from(cls, source, max_parallel=64, chunk_size=None): 100 | """ 101 | Create a new stream from specified source, which must be either 102 | an Iterable or implement .read() method. 103 | """ 104 | if chunk_size: 105 | if hasattr(source, 'read'): 106 | return cls.from_callback( 107 | max_parallel, source.read, chunk_size) 108 | else: 109 | msg = (f"chunk_size was specified, but source {source} " 110 | "does not implement read() method.") 111 | raise UnsupportedOperation(msg) 112 | else: 113 | if isinstance(source, (Iterable, AsyncIterable)): 114 | return cls.from_iterable( 115 | source, max_parallel=max_parallel) 116 | else: 117 | msg = (f"Source {source} is not iterable. 
It cannot be used " 118 | "unless it exposes read() method and chunk_size " 119 | "is specified.") 120 | raise UnsupportedOperation(msg) 121 | 122 | 123 | @classmethod 124 | def from_iterable(cls, iterable, max_parallel=64): 125 | """Create a new stream from an iterable object.""" 126 | stream = cls(max_parallel) 127 | async def consume(): 128 | await stream._ready_to_start 129 | if isinstance(iterable, Iterable): 130 | for item in iterable: 131 | await stream._pyfca.write(item) 132 | if isinstance(iterable, AsyncIterable): 133 | [await stream._pyfca.write(item) async for item in iterable] 134 | stream._pyfca.end() 135 | 136 | asyncio.create_task(consume()) 137 | stream._writable = False 138 | return stream 139 | 140 | 141 | @classmethod 142 | def from_callback(cls, max_parallel, callback, *args): 143 | """Create a new stream using callback to get chunks.""" 144 | stream = cls(max_parallel) 145 | 146 | async def consume(): 147 | await stream._ready_to_start 148 | while True: 149 | chunk = callback(*args) 150 | if asyncio.iscoroutine(chunk): 151 | chunk = await chunk 152 | if chunk == '' or chunk == b'': 153 | break 154 | await stream._pyfca.write(chunk) 155 | stream._pyfca.end() 156 | 157 | asyncio.create_task(consume()) 158 | stream._writable = False 159 | return stream 160 | 161 | 162 | def map(self, func, *args): 163 | """Transform each chunk using a function.""" 164 | self._mark_consumed() 165 | new_stream = self.__class__(upstream=self, origin=self._origin, name=f'{self.name}+m') 166 | async def run_mapper(chunk): 167 | if args: 168 | log(new_stream, f'calling mapper {func} with args: {chunk, *args}') 169 | result = func(chunk, *args) 170 | if asyncio.iscoroutine(result): 171 | result = await result 172 | log(new_stream, f'mapper result: {tr(chunk)} -> {tr(result)}') 173 | return result 174 | log(new_stream, f'adding mapper: {func}') 175 | new_stream._pyfca.add_transform(run_mapper) 176 | return new_stream 177 | 178 | 179 | def each(self, func, *args): 180 | """Perform an operation on each chunk and return it unchanged.""" 181 | async def mapper(chunk): 182 | result = func(chunk, *args) 183 | if asyncio.iscoroutine(result): 184 | await result 185 | return chunk 186 | return self.map(mapper) 187 | 188 | 189 | def decode(self, encoding): 190 | """Convert chunks of bytes into strings using specified encoding.""" 191 | import codecs 192 | # Incremental decoders handle characters split across inputs. 193 | # Input with only partial data yields empty string - drop these. 
194 | decoder = codecs.getincrementaldecoder(encoding)() 195 | return self._as(StringStream).map( 196 | lambda chunk: decoder.decode(chunk) or DropChunk 197 | ) 198 | 199 | 200 | def filter(self, func, *args): 201 | """Keep only chunks for which func evaluates to True.""" 202 | self._mark_consumed() 203 | new_stream = self.__class__(upstream=self, origin=self._origin, name=f'{self.name}+f') 204 | async def run_filter(chunk): 205 | if args: 206 | log(new_stream, f'calling filter {func} with args: {chunk, *args}') 207 | decision = func(chunk, *args) 208 | if asyncio.iscoroutine(decision): 209 | decision = await decision 210 | log(new_stream, f'filter result: {tr(chunk)} -> {cyan}{decision}{reset}') 211 | return chunk if decision else DropChunk 212 | log(new_stream, f'adding filter: {func}') 213 | new_stream._pyfca.add_transform(run_filter) 214 | return new_stream 215 | 216 | 217 | def flatmap(self, func, *args): 218 | """Run func on each chunk and return all results as separate chunks.""" 219 | self._mark_consumed() 220 | new_stream = self.__class__( 221 | max_parallel=self._pyfca.max_parallel, origin=self._origin, name=f'{self.name}+fm' 222 | ) 223 | async def consume(): 224 | self._uncork() 225 | while True: 226 | chunk = await self._pyfca.read() 227 | log(self, f'got: {tr(chunk)}') 228 | if chunk is None: 229 | break 230 | results = func(chunk, *args) 231 | if asyncio.iscoroutine(results): 232 | results = await results 233 | log(self, f'{cyan}split:{reset} -> {repr(results)}') 234 | for item in results: 235 | log(new_stream, f'put: {tr(item)}') 236 | await new_stream._pyfca.write(item) 237 | log(new_stream, f'{blue}drained{reset}') 238 | log(new_stream, f'ending pyfca {new_stream._pyfca}') 239 | new_stream._pyfca.end() 240 | asyncio.create_task(consume(), name='flatmap-consumer') 241 | return new_stream 242 | 243 | 244 | def batch(self, func, *args): 245 | """ 246 | Convert a stream of chunks into a stream of lists of chunks. 247 | 248 | func: called on each chunk to determine when the batch will end. 249 | """ 250 | self._mark_consumed() 251 | new_stream = self.__class__( 252 | max_parallel=self._pyfca.max_parallel, origin=self._origin, name=f'{self.name}+b' 253 | ) 254 | async def consume(): 255 | self._uncork() 256 | batch = [] 257 | 258 | while True: 259 | chunk = await self._pyfca.read() 260 | log(self, f'got: {tr(chunk)}') 261 | if chunk is None: 262 | break 263 | batch.append(chunk) 264 | if args: 265 | log(new_stream, f'calling {func} with args: {chunk, *args}') 266 | if func(chunk, *args): 267 | log(new_stream, f'{pink}put batch:{reset} {tr(batch)}') 268 | await new_stream._pyfca.write(batch) 269 | batch = [] 270 | 271 | if len(batch): 272 | log(new_stream, f'{pink}put batch:{reset} {tr(batch)}') 273 | await new_stream._pyfca.write(batch) 274 | 275 | log(new_stream, f'ending pyfca {new_stream._pyfca}') 276 | new_stream._pyfca.end() 277 | asyncio.create_task(consume()) 278 | return new_stream 279 | 280 | 281 | def sequence(self, sequencer, initialPartial=None): 282 | """ 283 | Change how the data is chopped into chunks. 284 | 285 | sequencer: two-argument function taking partial result from previous 286 | operation and current chunk. It should return an iterable; all items 287 | from the iterable except the last one will become new chunks, and the 288 | last one will be fed to the next call of the sequencer. 
289 | """ 290 | self._mark_consumed() 291 | new_stream = self.__class__( 292 | max_parallel=self._pyfca.max_parallel, origin=self._origin, name=f'{self.name}+s' 293 | ) 294 | async def consume(): 295 | self._uncork() 296 | partial = initialPartial 297 | 298 | while True: 299 | chunk = await self._pyfca.read() 300 | log(self, f'got: {tr(chunk)}') 301 | if chunk is None: 302 | break 303 | chunks = sequencer(partial, chunk) 304 | if asyncio.iscoroutine(chunks): 305 | chunks = await chunks 306 | log(new_stream, f'{blue}{len(chunks)} chunks:{reset} {chunks}') 307 | for chunk in chunks[:-1]: 308 | log(new_stream, f'put: {tr(chunk)}') 309 | await new_stream._pyfca.write(chunk) 310 | log(new_stream, f'carrying over partial result: {tr(chunks[-1])}') 311 | partial = chunks[-1] 312 | 313 | log(new_stream, f'leftover: {tr(partial)}') 314 | # pytest claims that line #315 is not reacheable, cause of if statement is always True. 315 | # TODO: refactor code here or find exact reason for pytest problem 316 | if partial: # pragma: no cover 317 | log(new_stream, f'put: {tr(partial)}') 318 | await new_stream._pyfca.write(partial) 319 | log(new_stream, f'ending pyfca {new_stream._pyfca}') 320 | new_stream._pyfca.end() 321 | asyncio.create_task(consume()) 322 | return new_stream 323 | 324 | 325 | def unpipe(self, target=None): 326 | """Remove a target from the current stream.""" 327 | if target in self._sinks: 328 | self._sinks.remove(target) 329 | if len(self._sinks) == 0: 330 | task, = [task for task in asyncio.all_tasks() if task.get_name() == f'{self._uid}-pipe-consumer'] 331 | if task: 332 | task.cancel() 333 | return self 334 | 335 | 336 | def pipe(self, target, end=True): 337 | """Forward all chunks from current stream into target.""" 338 | self._consumed = True 339 | self._sinks.append(target) 340 | async def consume(): 341 | self._uncork() 342 | while True: 343 | chunk = await self._pyfca.read() 344 | if chunk is None: 345 | break 346 | drains = [target._pyfca.write(chunk) for target in self._sinks] 347 | await asyncio.gather(*drains) 348 | if end is True: 349 | for target in self._sinks: 350 | target._pyfca.end() 351 | if len(self._sinks) == 1: 352 | asyncio.create_task(consume(), name=f'{self._uid}-pipe-consumer') 353 | return target 354 | 355 | 356 | async def to_list(self): 357 | """Create a list with all resulting stream chunks.""" 358 | self._mark_consumed() 359 | self._uncork() 360 | result = [] 361 | log(self, f'sink: {repr(result)}') 362 | chunk = await self._pyfca.read() 363 | while chunk is not None: 364 | log(self, f'got: {tr(chunk)}') 365 | result.append(chunk) 366 | chunk = await self._pyfca.read() 367 | return result 368 | 369 | 370 | async def write_to(self, target): 371 | """ 372 | Write all resulting stream chunks into target. 373 | 374 | target: object implementing .write() method 375 | """ 376 | self._mark_consumed() 377 | self._uncork() 378 | log(self, f'sink: {repr(target)}') 379 | chunk = await self._pyfca.read() 380 | while chunk is not None: 381 | log(self, f'got: {tr(chunk)}') 382 | write = target.write(chunk) 383 | if asyncio.iscoroutine(write): 384 | await write 385 | chunk = await self._pyfca.read() 386 | return target 387 | 388 | 389 | async def reduce(self, func, initial=None): 390 | """ 391 | Apply two-argument func to elements from the stream cumulatively, 392 | producing an awaitable that will resolve to a single value when the 393 | stream ends. For a stream of [1,2,3,4] the result will be 394 | func(func(func(1,2),3),4). 
395 | """ 396 | self._mark_consumed() 397 | self._uncork() 398 | if initial is None: 399 | accumulator = await self._pyfca.read() 400 | log(self, f'got: {tr(accumulator)}') 401 | else: 402 | accumulator = initial 403 | log(self, f'reducer: initialized accumulator with {initial}') 404 | while True: 405 | chunk = await self._pyfca.read() 406 | log(self, f'got: {tr(chunk)}') 407 | if chunk is None: 408 | break 409 | accumulator = func(accumulator, chunk) 410 | if asyncio.iscoroutine(accumulator): 411 | accumulator = await accumulator 412 | log(self, f'reduce - intermediate result: {accumulator}') 413 | return accumulator 414 | 415 | 416 | 417 | class StringStream(Stream): 418 | def __init__(self, max_parallel=64, upstream=None, origin=None, name="stringstream"): 419 | super().__init__(max_parallel=max_parallel, upstream=upstream, origin=origin, name=name) 420 | 421 | def parse(self, func, *args): 422 | """Transform StringStream into Stream.""" 423 | return self._as(Stream).map(func, *args) 424 | 425 | def match(self, pattern): 426 | """Extract matching parts of chunk as new chunks.""" 427 | regex = re.compile(pattern) 428 | def mapper(chunk): 429 | matches = regex.findall(chunk) 430 | if regex.groups <= 1: 431 | return matches 432 | else: 433 | flattened = [] 434 | for tuple in matches: 435 | flattened.extend(tuple) 436 | return flattened 437 | 438 | return self.flatmap(mapper) 439 | 440 | def split(self, separator=None): 441 | """Split each chunk into multiple new chunks.""" 442 | def splitter(part, chunk): 443 | words = (part+chunk).split(sep=separator) 444 | # .split() without delimiter ignores trailing whitespace, e.g. 445 | # "foo bar ".split() -> ["foo", "bar"] and not ["foo", "bar", ""]. 446 | # This would incorrectly treat last word as partial result, so we 447 | # add an empty string as a sentinel. 448 | if not separator and chunk[-1].isspace(): 449 | words.append("") 450 | return words 451 | return self.sequence(splitter, "") 452 | -------------------------------------------------------------------------------- /scramjet/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import asyncio 3 | from scramjet.ansi_color_codes import * 4 | import random 5 | 6 | # Debugging helpers, testing and logging utilities. 7 | 8 | random.seed('Pyfca') 9 | MAX_DELAY = 0.3 10 | 11 | 12 | def print_trimmed(item, color=grey): 13 | """For logging data that may be very long.""" 14 | if (type(item) is str or type(item) is bytes) and len(item) > 32: 15 | result = f'{repr(item[:16])}..(length: {len(item)})' 16 | else: 17 | result = repr(item) 18 | return f'{color}{result}{reset}' if color else result 19 | 20 | 21 | def pprint_chunk(item): 22 | """Print only the essential part of the chunk. 
For debugging.""" 23 | if type(item) is dict and 'id' in item: 24 | return f'chunk_id={item["id"]}' 25 | else: 26 | return f'' 27 | 28 | 29 | def print_formatted(item): 30 | """Pretty-print for debugging various object types.""" 31 | if isinstance(item, asyncio.Future): 32 | if hasattr(item, 'chunk'): 33 | default_info = item.__str__()[1:-1] # trim < > 34 | return f'<{default_info} {pprint_chunk(item.chunk)}>' 35 | else: 36 | return item.__str__() 37 | else: # most probably chunk 38 | return f'{grey}{pprint_chunk(item)}{reset}' 39 | 40 | 41 | async def mock_delay(data): 42 | """Pretend that we run some async operations that take some time.""" 43 | delay = 0 44 | if type(data) is dict and 'delay' in data: 45 | delay = data['delay'] 46 | else: 47 | delay = random.uniform(0, MAX_DELAY) 48 | if delay: 49 | await asyncio.sleep(delay) 50 | 51 | 52 | class _LogWithTimer: 53 | """Simple logger with time counted from initialization - 54 | makes it easier to follow timing relationships than absolute time.""" 55 | def __init__(self, epoch=time.perf_counter()): 56 | self.epoch = epoch 57 | 58 | def log(self, *args): 59 | time_delta = time.perf_counter() - self.epoch 60 | print(f'{time_delta:10.6f}', *args) 61 | 62 | def reset(self): 63 | self.epoch = time.perf_counter() 64 | 65 | # expose singleton for synchronization across modules using it 66 | LogWithTimer = _LogWithTimer() 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="scramjet-framework-py", 8 | version='0.1.1', 9 | author="Scramjet.org", 10 | author_email="", 11 | description='Scramjet is a simple reactive stream programming framework.', 12 | long_description_content_type="text/markdown", 13 | long_description=long_description, 14 | packages=find_packages(exclude=["test"]), 15 | install_requires=[], 16 | keywords=['python', 'streams'], 17 | classifiers=[ 18 | ] 19 | ) 20 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scramjetorg/framework-python/7e84db2ef65af3f944957f1745de0045ee844060/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import scramjet.utils as utils 4 | 5 | @pytest.fixture(autouse=True) 6 | def setup(): 7 | utils.LogWithTimer.reset() 8 | # add a newline before test output, so that it doesn't start on the same 9 | # line as pytest info (when pytest is ran with -vs) 10 | print() 11 | 12 | @pytest.fixture() 13 | def named_pipe(tmp_path): 14 | path = tmp_path / 'test_pipe' 15 | os.mkfifo(path) 16 | yield path 17 | os.remove(path) 18 | -------------------------------------------------------------------------------- /test/large_test_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from string import ascii_uppercase, ascii_lowercase, digits 4 | 5 | # make space 10 times likely than other characters 6 | charset = ascii_uppercase + digits + ascii_lowercase + ' '*10 7 | # Should be noticeably larger than unix pipe buffer 
(64k) and TCP buffer (128k) 8 | filesize = 1000000 9 | 10 | def ensure_exists(path, charset): 11 | if not os.path.isfile(path) or os.path.getsize(path) != filesize: 12 | with open(path, 'w') as f: 13 | f.write(''.join(random.choice(charset) for _ in range(filesize))) 14 | return os.path.abspath(path), filesize 15 | 16 | file_without_newlines = ensure_exists('./large_text_1', charset) 17 | file_with_newlines = ensure_exists('./large_text_2', charset + '\n') 18 | -------------------------------------------------------------------------------- /test/sample_multibyte_text.txt: -------------------------------------------------------------------------------- 1 | żółć -------------------------------------------------------------------------------- /test/sample_numbers_1.txt: -------------------------------------------------------------------------------- 1 | 8 2 | 25 3 | 3 4 | 14 5 | 20 6 | 9 7 | 13 8 | 16 9 | -------------------------------------------------------------------------------- /test/sample_text_0.txt: -------------------------------------------------------------------------------- 1 | foo 2 | -------------------------------------------------------------------------------- /test/sample_text_1.txt: -------------------------------------------------------------------------------- 1 | foo 2 | bar baz 3 | qux -------------------------------------------------------------------------------- /test/sample_text_2.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam laoreet dolor sed magna rutrum, eget fringilla augue ultrices. Vivamus interdum aliquet lobortis. Donec pretium sapien ut leo aliquet, sed sollicitudin massa vehicula. Vivamus vestibulum finibus dui ac pellentesque. Morbi consequat eu dui ac tincidunt. Duis velit ligula, congue at mi aliquet, fringilla blandit diam. Quisque et libero vitae justo interdum malesuada non ut massa. Nullam erat dolor, pellentesque in sodales at, consectetur at quam. Duis vitae commodo lorem, in sagittis tortor. Phasellus sollicitudin nulla nec felis varius, a suscipit eros commodo. Curabitur sed tellus sagittis, fermentum urna sit amet, gravida lectus. Maecenas ut risus ex. Phasellus sit amet volutpat arcu, quis egestas metus. Pellentesque a ligula lectus. Donec sit amet magna eget dolor volutpat vestibulum. Cras dictum, purus et viverra rutrum, mauris nulla sagittis erat, non dictum lorem quam vel odio. Interdum et malesuada fames ac ante ipsum primis in faucibus. Phasellus lobortis odio eget justo efficitur ultricies. Aliquam at luctus metus. Nunc maximus ipsum risus, id malesuada nunc mattis sed. Donec pharetra mauris in metus suscipit, ut consectetur dolor ultrices. Integer placerat enim a quam elementum pharetra. Quisque pulvinar ligula at risus volutpat, et mollis massa dictum. Nunc euismod ornare purus, vitae pretium odio aliquet ac. Cras ornare varius augue, vel cursus enim malesuada pellentesque. Nulla turpis diam, accumsan at eros et, maximus laoreet metus. Proin ac est molestie, fermentum leo id, tincidunt nulla. Cras egestas magna id fermentum consequat. Nullam eget ligula sed ipsum vestibulum rutrum. Duis dignissim nec dui id pretium. Sed rhoncus, risus non consectetur tincidunt, nisi ex tempor quam, quis convallis elit justo nec risus. In pellentesque tempor ultricies. Nullam sit amet metus condimentum, iaculis erat eu, volutpat felis. Proin augue mauris, vestibulum quis ex eu, tincidunt imperdiet magna. 
Vivamus nisi orci, varius porttitor laoreet ut, volutpat vestibulum urna. Phasellus vestibulum dui ex, ac accumsan est iaculis et. Etiam eu malesuada leo. Mauris lobortis ex porttitor eros scelerisque, vitae feugiat urna eleifend. Suspendisse eget mi non dolor molestie commodo. Maecenas in nunc id ex iaculis blandit id molestie lacus. Maecenas cursus purus nec est sagittis bibendum. Aliquam ac ligula arcu. Suspendisse tristique risus ac enim pulvinar malesuada. Suspendisse potenti. Nulla facilisi. Ut vel rhoncus urna. Vivamus tellus felis, aliquet quis augue pretium, imperdiet consequat augue. Integer ultrices, eros a fringilla imperdiet, ante metus condimentum eros, quis tempus erat magna ac sem. Duis ultricies erat id tortor blandit ornare. Vestibulum laoreet turpis eget augue volutpat, vitae facilisis quam pulvinar. Vestibulum pharetra tellus quis orci luctus, ut aliquet leo efficitur. Ut finibus tortor maximus bibendum lacinia. Ut nec massa ut dolor hendrerit aliquet condimentum quis diam. Quisque velit tortor, convallis et tellus eget, euismod mollis ante. Quisque lacinia tortor a pulvinar ultricies. Nulla dignissim eu metus vel aliquet. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Aenean in lectus vitae ex fermentum malesuada. Proin vestibulum tellus at felis vestibulum placerat. Vestibulum vel euismod tellus. Nullam auctor neque eros, fermentum semper ligula lobortis id. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam laoreet dolor sed magna rutrum, eget fringilla augue ultrices. Vivamus interdum aliquet lobortis. Donec pretium sapien ut leo aliquet, sed sollicitudin massa vehicula. Vivamus vestibulum finibus dui ac pellentesque. Morbi consequat eu dui ac tincidunt. Duis velit ligula, congue at mi aliquet, fringilla blandit diam. Quisque et libero vitae justo interdum malesuada non ut massa. Nullam erat dolor, pellentesque in sodales at, consectetur at quam. Duis vitae commodo lorem, in sagittis tortor. Phasellus sollicitudin nulla nec felis varius, a suscipit eros commodo. Curabitur sed tellus sagittis, fermentum urna sit amet, gravida lectus. Maecenas ut risus ex. Phasellus sit amet volutpat arcu, quis egestas metus. Pellentesque a ligula lectus. Donec sit amet magna eget dolor volutpat vestibulum. Cras dictum, purus et viverra rutrum, mauris nulla sagittis erat, non dictum lorem quam vel odio. Interdum et malesuada fames ac ante ipsum primis in faucibus. Phasellus lobortis odio eget justo efficitur ultricies. Aliquam at luctus metus. Nunc maximus ipsum risus, id malesuada nunc mattis sed. Donec pharetra mauris in metus suscipit, ut consectetur dolor ultrices. Integer placerat enim a quam elementum pharetra. Quisque pulvinar ligula at risus volutpat, et mollis massa dictum. Nunc euismod ornare purus, vitae pretium odio aliquet ac. Cras ornare varius augue, vel cursus enim malesuada pellentesque. Nulla turpis diam, accumsan at eros et, maximus laoreet metus. Proin ac est molestie, fermentum leo id, tincidunt nulla. Cras egestas magna id fermentum consequat. Nullam eget ligula sed ipsum vestibulum rutrum. Duis dignissim nec dui id pretium. Sed rhoncus, risus non consectetur tincidunt, nisi ex tempor quam, quis convallis elit justo nec risus. In pellentesque tempor ultricies. Nullam sit amet metus condimentum, iaculis erat eu, volutpat felis. Proin augue mauris, vestibulum quis ex eu, tincidunt imperdiet magna. Vivamus nisi orci, varius porttitor laoreet ut, volutpat vestibulum urna. 
Phasellus vestibulum dui ex, ac accumsan est iaculis et. Etiam eu malesuada leo. Mauris lobortis ex porttitor eros scelerisque, vitae feugiat urna eleifend. Suspendisse eget mi non dolor molestie commodo. Maecenas in nunc id ex iaculis blandit id molestie lacus. Maecenas cursus purus nec est sagittis bibendum. Aliquam ac ligula arcu. Suspendisse tristique risus ac enim pulvinar malesuada. Suspendisse potenti. Nulla facilisi. Ut vel rhoncus urna. Vivamus tellus felis, aliquet quis augue pretium, imperdiet consequat augue. Integer ultrices, eros a fringilla imperdiet, ante metus condimentum eros, quis tempus erat magna ac sem. Duis ultricies erat id tortor blandit ornare. Vestibulum laoreet turpis eget augue volutpat, vitae facilisis quam pulvinar. Vestibulum pharetra tellus quis orci luctus, ut aliquet leo efficitur. Ut finibus tortor maximus bibendum lacinia. Ut nec massa ut dolor hendrerit aliquet condimentum quis diam. Quisque velit tortor, convallis et tellus eget, euismod mollis ante. Quisque lacinia tortor a pulvinar ultricies. Nulla dignissim eu metus vel aliquet. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Aenean in lectus vitae ex fermentum malesuada. Proin vestibulum tellus at felis vestibulum placerat. Vestibulum vel euismod tellus. Nullam auctor neque eros, fermentum semper ligula lobortis id. -------------------------------------------------------------------------------- /test/sample_text_3.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
-------------------------------------------------------------------------------- /test/some-old-test-cases.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | import asyncio 4 | import sys 5 | from pprint import pprint 6 | import random 7 | 8 | import pyfca 9 | import scramjet.utils as utils 10 | from scramjet.ansi_color_codes import * 11 | 12 | log = utils.LogWithTimer.log 13 | fmt = utils.print_formatted 14 | random.seed('Pyfca') 15 | 16 | # Use to change delays mocking async function execution 17 | SLOMO_FACTOR = float(sys.argv[1]) if len(sys.argv) > 1 else 0.01 18 | MAX_DELAY = 0.3 19 | 20 | 21 | # Transformation functions and utilities 22 | 23 | def log_results(results): 24 | log('Results:') 25 | pprint(results) 26 | 27 | def log_drain_status(drain, item): 28 | log(f'Drain status: {blue}{drain.done()}{reset} ' 29 | f'{grey}(last write: {utils.pprint_chunk(item)}){reset}') 30 | 31 | async def mock_delay(data): 32 | """Pretend that we run some async operations that take some time.""" 33 | delay = 0 34 | if hasattr(data, 'delay'): 35 | delay = data.delay 36 | elif type(data) is dict: 37 | if 'delay' in data: 38 | delay = data['delay'] 39 | elif type(data) is int: 40 | delay = data 41 | if not delay: 42 | delay = random.uniform(0, MAX_DELAY) 43 | await asyncio.sleep(delay * SLOMO_FACTOR) 44 | 45 | def identity(x): 46 | log(f'{yellow}identity:{reset} {x}') 47 | return x 48 | 49 | async def async_identity(x): 50 | log(f'{yellow}identity start:{reset} {x}') 51 | await mock_delay(x) 52 | log(f'{yellow}identity end:{reset} -> {x}') 53 | return x 54 | 55 | def transform_dict_or_num(description, data, function): 56 | if type(data) is dict and 'value' in data: 57 | data['value'] = function(data['value']) 58 | # dropping value means dropping the whole chunk 59 | result = data if data['value'] is not pyfca.DropChunk else pyfca.DropChunk 60 | else: 61 | result = function(data) 62 | log(f'{yellow}{description}:{reset} -> {result}') 63 | return result 64 | 65 | def increment(x): 66 | return transform_dict_or_num('increment', x, lambda x: x+1) 67 | 68 | async def async_increment(x): 69 | await mock_delay(x) 70 | return increment(x) 71 | 72 | def double(x): 73 | return transform_dict_or_num('double', x, lambda x: x*2) 74 | 75 | async def async_double(x): 76 | await mock_delay(x) 77 | return double(x) 78 | 79 | def keep_even(x): 80 | func = lambda x: x if x % 2 == 0 else pyfca.DropChunk 81 | return transform_dict_or_num('keep_even', x, func) 82 | 83 | async def async_keep_even(x): 84 | await mock_delay(x) 85 | return keep_even(x) 86 | 87 | # Test cases 88 | 89 | TEST_SEQUENCE = [1,2,1,3,2,4] 90 | objects_with_delays = [ 91 | {'id': count, 'delay': 0.1 * value} 92 | for count, value 93 | in enumerate(TEST_SEQUENCE) 94 | ] 95 | objects_with_values = [ 96 | {'id': count, 'value': value} 97 | for count, value 98 | in enumerate(TEST_SEQUENCE) 99 | ] 100 | MAX_PARALLEL = 4 101 | def monotonic_sequence(n): 102 | return [{'id': i} for i in range(n)] 103 | 104 | async def test_write_then_read_concurrently(input_data): 105 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 106 | for x in input_data: 107 | p.write(x) 108 | reads = [p.read() for _ in input_data] 109 | results = await asyncio.gather(*reads) 110 | log_results(results) 111 | # items should appear in the output unchanged and in the same order 112 | assert results == input_data 113 | 114 | async def test_write_then_read_sequentially(input_data): 115 | p = pyfca.Pyfca(MAX_PARALLEL, 
async_identity) 116 | for x in input_data: 117 | p.write(x) 118 | results = [await p.read() for _ in input_data] 119 | log_results(results) 120 | # items should appear in the output unchanged and in the same order 121 | assert results == input_data 122 | 123 | async def test_write_and_read_in_turn(input_data): 124 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 125 | reads = [] 126 | for x in input_data: 127 | p.write(x) 128 | reads.append(p.read()) 129 | results = await asyncio.gather(*reads) 130 | log_results(results) 131 | # items should appear in the output unchanged and in the same order 132 | assert results == input_data 133 | 134 | async def test_reads_before_write(input_data): 135 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 136 | reads = [p.read() for _ in input_data] 137 | for x in input_data: 138 | p.write(x) 139 | results = await asyncio.gather(*reads) 140 | log_results(results) 141 | # items should appear in the output unchanged and in the same order 142 | assert results == input_data 143 | 144 | async def test_reads_exceeding_writes(input_data): 145 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 146 | for x in input_data: 147 | p.write(x) 148 | reads = [p.read() for _ in range(len(input_data) + 4)] 149 | p.end() 150 | results = await asyncio.gather(*reads) 151 | log_results(results) 152 | # Reads exceeding writes should return None (if accepting input stops). 153 | assert results == input_data + [None]*4 154 | 155 | async def test_reads_after_end(input_data): 156 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 157 | for x in input_data: 158 | p.write(x) 159 | p.end() 160 | reads = [p.read() for _ in range(len(input_data) + 4)] 161 | results = await asyncio.gather(*reads) 162 | log_results(results) 163 | # It should be possible to read after pyfca stopped accepting input. 164 | # Reads exceeding writes should return None. 165 | assert results == input_data + [None]*4 166 | 167 | # If the number of items being processed is below limit, write() should return 168 | # a future that resolves immediately (and therefore code that awaits it should 169 | # actually run synchronously). 170 | async def test_synchronous_draining(input_data): 171 | p = pyfca.Pyfca(MAX_PARALLEL, identity) 172 | event_loop_flag = None 173 | 174 | def start_sync_check(): 175 | log('The following sequence of instructions should be synchronous.') 176 | nonlocal event_loop_flag 177 | event_loop_flag = False 178 | def update_flag(): 179 | log('Next event loop iteration.') 180 | nonlocal event_loop_flag 181 | event_loop_flag = True 182 | # schedule for next event loop iteration 183 | asyncio.get_event_loop().call_soon(update_flag) 184 | 185 | def check_async(expected): 186 | log(f'Did next iteration of event loop start already? ' 187 | f'{cyan}{event_loop_flag}{reset}') 188 | assert event_loop_flag == expected 189 | 190 | async def write_below_limit(): 191 | for _ in range(MAX_PARALLEL - 1): 192 | item = input_data.pop(0) 193 | drain = p.write(item) 194 | log_drain_status(drain, item) 195 | # Writes up till MAX_PARALLEL-1 should report below limit 196 | assert drain.done() == True 197 | # This should resolve synchronously 198 | await drain 199 | 200 | # Note that we run the test twice because the results may differ for the 201 | # first MAX_PARALLEL items (e.g. the algorithm may return placeholders etc.) 
202 | for i in range(2): 203 | log(f'Start batch #{i+1}') 204 | start_sync_check() 205 | 206 | # Writes up till MAX_PARALLEL-1 should resolve immediately 207 | await write_below_limit() 208 | check_async(False) 209 | 210 | # Create readers so the queue won't get stuck. This is still synchronous. 211 | reads = [p.read() for _ in range(MAX_PARALLEL)] 212 | check_async(False) 213 | 214 | # MAX_PARALLEL-th write should reach the limit and awaiting on it 215 | # should trigger entering event loop and processing previous items 216 | await p.write(input_data.pop(0)) 217 | check_async(True) 218 | 219 | # clean up the queue. 220 | await asyncio.gather(*reads) 221 | 222 | async def read_with_debug(pyfca, live_results=None): 223 | """Log received result and update result list immediately.""" 224 | result = await pyfca.read() 225 | log(f'{green}Got result:{reset} {result}') 226 | if live_results is not None: 227 | live_results.append(result) 228 | return result 229 | 230 | async def test_limit_waiting_until_items_are_processed(input_data): 231 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 232 | 233 | results = [] 234 | reads = [read_with_debug(p, results) for _ in input_data] 235 | read_futures = asyncio.gather(*reads) 236 | 237 | def check(written_count, expected_len): 238 | log(f'Drain after {written_count} items written, ' 239 | f'at least {expected_len} results should be ready') 240 | assert len(results) >= expected_len 241 | 242 | for items_written, x in enumerate(input_data, start=1): 243 | drain = p.write(x) 244 | await drain 245 | log_drain_status(drain, x) 246 | expected = items_written - MAX_PARALLEL + 1 247 | # wait one event loop iteration so that appropriate read is evaluated 248 | asyncio.get_event_loop().call_soon(check, items_written, expected) 249 | 250 | await read_futures 251 | log_results(results) 252 | # items should appear in the output unchanged and in the same order 253 | assert results == input_data 254 | 255 | async def test_limit_waiting_for_reads(input_data): 256 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 257 | 258 | for x in input_data[:MAX_PARALLEL-1]: 259 | drain = p.write(x) 260 | await drain 261 | log_drain_status(drain, x) 262 | 263 | def check_drain(expected): 264 | log_drain_status(drain, next_item) 265 | assert drain.done() == expected 266 | 267 | next_item = input_data[MAX_PARALLEL-1] 268 | drain = p.write(next_item) 269 | # Pyfca should report that the limit was reached. 270 | check_drain(False) 271 | 272 | # Wait until all items are processed (we need to first ensure that 273 | # last_chunk_status is up-to-date). 274 | await asyncio.sleep(0) 275 | await p.last_chunk_status 276 | 277 | # We should still not be drained because there were no reads yet. 278 | check_drain(False) 279 | 280 | first_result = await read_with_debug(p) 281 | # Drain status should update after next run of event loop 282 | await asyncio.sleep(0) 283 | check_drain(True) 284 | 285 | async def test_writing_above_limit(input_data): 286 | p = pyfca.Pyfca(MAX_PARALLEL, identity) 287 | 288 | # Writing shouldn't block if we exceed the limit. 
289 | writes = [p.write(x) for x in input_data] 290 | assert len(writes) > MAX_PARALLEL 291 | 292 | # First writes should report that they were below the limit 293 | for drain in writes[:MAX_PARALLEL-1]: 294 | assert drain.done() == True 295 | # After reaching the limit write() should return an unresolved future 296 | for drain in writes[MAX_PARALLEL-1:]: 297 | assert drain.done() == False 298 | 299 | # collect results to avoid CancelledError and "coroutine was never awaited" 300 | reads = [p.read() for _ in input_data] 301 | await asyncio.gather(*reads) 302 | 303 | 304 | async def test_empty_transformation_chain(input_data): 305 | p = pyfca.Pyfca(MAX_PARALLEL) 306 | for x in input_data: 307 | p.write(x) 308 | results = [await p.read() for _ in input_data] 309 | log_results(results) 310 | # items should appear in the output unchanged and in the same order 311 | assert results == input_data 312 | 313 | async def test_multitransform(input_data): 314 | p = pyfca.Pyfca(MAX_PARALLEL, async_identity) 315 | p.add_transform(async_double) 316 | p.add_transform(async_increment) 317 | for x in input_data: 318 | p.write(x) 319 | reads = [p.read() for _ in input_data] 320 | results = await asyncio.gather(*reads) 321 | log_results(results) 322 | # multiple transformations should be applied to each element, and they 323 | # should arrive in the same order they were written in. 324 | assert results == [ 325 | {'id': 0, 'value': 3}, 326 | {'id': 1, 'value': 5}, 327 | {'id': 2, 'value': 3}, 328 | {'id': 3, 'value': 7}, 329 | {'id': 4, 'value': 5}, 330 | {'id': 5, 'value': 9}, 331 | ] 332 | 333 | async def test_sync_chain(input_data): 334 | p = pyfca.Pyfca(MAX_PARALLEL, increment) 335 | p.add_transform(double) 336 | for x in input_data: 337 | p.write(x) 338 | reads = [p.read() for _ in input_data] 339 | results = await asyncio.gather(*reads) 340 | log_results(results) 341 | # Using synchronous functions as transformations should work. 
342 | assert results == [ 343 | {'id': 0, 'value': 4}, 344 | {'id': 1, 'value': 6}, 345 | {'id': 2, 'value': 4}, 346 | {'id': 3, 'value': 8}, 347 | {'id': 4, 'value': 6}, 348 | {'id': 5, 'value': 10}, 349 | ] 350 | 351 | async def test_filtering_should_drop_items(input_data): 352 | p = pyfca.Pyfca(MAX_PARALLEL, async_keep_even) 353 | for x in input_data: 354 | p.write(x) 355 | p.end() 356 | results = [await p.read() for _ in input_data] 357 | log_results(results) 358 | assert results == [ 359 | {'id': 1, 'value': 2}, 360 | {'id': 4, 'value': 2}, 361 | {'id': 5, 'value': 4}, 362 | None, 363 | None, 364 | None, 365 | ] 366 | 367 | 368 | async def test_filtering_reads_before_end(input_data): 369 | p = pyfca.Pyfca(MAX_PARALLEL, async_keep_even) 370 | for x in input_data: 371 | p.write(x) 372 | reads = [p.read() for _ in input_data] 373 | p.end() 374 | results = await asyncio.gather(*reads) 375 | log_results(results) 376 | # even though the reads were performed before .end(), they should return 377 | # Nones for filtered out items, and with correct ordering 378 | assert results == [ 379 | {'id': 1, 'value': 2}, 380 | {'id': 4, 'value': 2}, 381 | {'id': 5, 'value': 4}, 382 | None, 383 | None, 384 | None, 385 | ] 386 | 387 | async def test_filtering_drops_everything(input_data): 388 | p = pyfca.Pyfca(MAX_PARALLEL, async_keep_even) 389 | for x in input_data: 390 | p.write(x) 391 | reads = [p.read() for _ in input_data] 392 | p.end() 393 | results = await asyncio.gather(*reads) 394 | log_results(results) 395 | # even though the reads were performed before .end(), they should return 396 | # Nones for filtered out items 397 | assert results == [ 398 | None, 399 | None, 400 | None, 401 | None, 402 | None, 403 | None, 404 | ] 405 | 406 | # Main test execution loop 407 | 408 | tests_to_run = [ 409 | (test_write_then_read_concurrently, objects_with_values), 410 | (test_write_then_read_sequentially, objects_with_values), 411 | (test_write_and_read_in_turn, objects_with_values), 412 | (test_reads_before_write, objects_with_values), 413 | (test_reads_exceeding_writes, objects_with_values), 414 | (test_reads_after_end, objects_with_values), 415 | (test_synchronous_draining, monotonic_sequence(2*MAX_PARALLEL)), 416 | (test_limit_waiting_until_items_are_processed, objects_with_delays), 417 | (test_limit_waiting_for_reads, objects_with_values), 418 | (test_writing_above_limit, monotonic_sequence(2*MAX_PARALLEL)), 419 | (test_empty_transformation_chain, objects_with_values), 420 | (test_multitransform, objects_with_values), 421 | (test_sync_chain, objects_with_values), 422 | (test_filtering_should_drop_items, objects_with_values), 423 | (test_filtering_reads_before_end, objects_with_values), 424 | (test_filtering_drops_everything, [1,3,5,7,9,11]), 425 | ] 426 | 427 | import time 428 | import copy 429 | for test, data in tests_to_run: 430 | print(f"\n\nRunning {strong}{test.__name__}{reset}:\n") 431 | # make sure we use fresh copy of data for each test 432 | input_data = copy.deepcopy(data) 433 | asyncio.run(test(input_data)) 434 | time.sleep(0.1 * SLOMO_FACTOR) 435 | utils.LogWithTimer.reset() 436 | -------------------------------------------------------------------------------- /test/test_batch.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import scramjet.utils as utils 3 | from scramjet.ansi_color_codes import * 4 | import pytest 5 | 6 | log = utils.LogWithTimer.log 7 | fmt = utils.print_formatted 8 | 9 | @pytest.mark.asyncio 10 
| async def test_batching_conditionally(): 11 | data = ["foo", "bar.", "baz", "qux", "plox."] 12 | stream = Stream.from_iterable(data, max_parallel=4) 13 | results = await stream.batch(lambda s: s[-1] == '.').to_list() 14 | assert results == [['foo', 'bar.'], ['baz', 'qux', 'plox.']] 15 | 16 | @pytest.mark.asyncio 17 | async def test_batching_with_partial_batch_on_end(): 18 | data = ["foo", "bar.", "baz", "qux", ".", "plox"] 19 | stream = Stream.from_iterable(data, max_parallel=4) 20 | results = await stream.batch(lambda s: s[-1] == '.').to_list() 21 | assert results == [['foo', 'bar.'], ['baz', 'qux', '.'], ['plox']] 22 | 23 | @pytest.mark.asyncio 24 | async def test_batching_by_amount(): 25 | data = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'] 26 | context = {'count': 0} 27 | def is_nth(chunk, ctx, N): 28 | ctx['count'] = ctx['count'] + 1 29 | return ctx['count']% N == 0 30 | stream = Stream.from_iterable(data, max_parallel=4) 31 | results = await stream.batch(is_nth, context, 3).to_list() 32 | assert results == [['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h']] 33 | -------------------------------------------------------------------------------- /test/test_consumable.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StreamAlreadyConsumed 2 | import pytest 3 | 4 | @pytest.mark.asyncio 5 | async def test_reading_from_one_stream_twice(): 6 | s = Stream.read_from([1, 2, 3, 4]) 7 | await s.to_list() 8 | with pytest.raises(StreamAlreadyConsumed): 9 | await s.to_list() 10 | 11 | @pytest.mark.asyncio 12 | async def test_transforming_one_stream_twice(): 13 | s1 = Stream.read_from([1, 2, 3, 4]) 14 | s1.map(lambda x: x+1) 15 | with pytest.raises(StreamAlreadyConsumed): 16 | s1.map(lambda x: x*2) 17 | -------------------------------------------------------------------------------- /test/test_datastream_buffering.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | from scramjet.ansi_color_codes import * 3 | import scramjet.utils as utils 4 | import pytest 5 | 6 | log = utils.LogWithTimer.log 7 | 8 | async def echo(x): 9 | log(f"{yellow}Processing:{reset} {repr(x)}") 10 | return x 11 | 12 | # test cases 13 | 14 | @pytest.mark.asyncio 15 | async def test_reading_and_writing_to_file(): 16 | with open('test/sample_text_1.txt') as file_in, \ 17 | open('test_output', 'w') as file_out: 18 | await Stream.read_from(file_in).write_to(file_out) 19 | with open('test/sample_text_1.txt') as source, open('test_output') as dest: 20 | assert source.read() == dest.read() 21 | 22 | 23 | @pytest.mark.asyncio 24 | async def test_reading_and_writing_to_file_with_coroutine(): 25 | import aiofiles 26 | async with aiofiles.open('test/sample_text_1.txt', mode='r') as file_in, \ 27 | aiofiles.open('test_output', mode='w') as file_out: 28 | await Stream.read_from(file_in, chunk_size=2).write_to(file_out) 29 | with open('test/sample_text_1.txt') as source, open('test_output') as dest: 30 | assert source.read() == dest.read() 31 | -------------------------------------------------------------------------------- /test/test_datastream_creation.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, UnsupportedOperation 2 | import asyncio 3 | from scramjet.ansi_color_codes import * 4 | import pytest 5 | 6 | # test cases 7 | 8 | @pytest.mark.asyncio 9 | async def test_creating_stream_using_constructor(): 10 | stream = 
Stream() 11 | assert isinstance(stream, Stream) 12 | 13 | @pytest.mark.asyncio 14 | async def test_creating_stream_from_list(): 15 | stream = Stream.from_iterable([1, 2, 3, 4]) 16 | assert [1, 2, 3, 4] == await stream.to_list() 17 | 18 | @pytest.mark.asyncio 19 | async def test_creating_stream_from_empty_list(): 20 | stream = Stream.from_iterable([]) 21 | assert [] == await stream.to_list() 22 | 23 | @pytest.mark.asyncio 24 | async def test_creating_stream_from_set(): 25 | stream = Stream.from_iterable({1, 2, 3, 4}) 26 | assert [1, 2, 3, 4] == await stream.to_list() 27 | 28 | @pytest.mark.asyncio 29 | async def test_creating_stream_from_string(): 30 | stream = Stream.from_iterable('abcd') 31 | assert ['a', 'b', 'c', 'd'] == await stream.to_list() 32 | 33 | @pytest.mark.asyncio 34 | async def test_creating_stream_from_dict_keys(): 35 | test_input = {'a': 1, 'b': 2, 'c': 3, 'd': 4} 36 | stream = Stream.from_iterable(test_input) 37 | assert ['a', 'b', 'c', 'd'] == await stream.to_list() 38 | 39 | @pytest.mark.asyncio 40 | async def test_creating_stream_from_dict_items(): 41 | test_input = {'a': 1, 'b': 2, 'c': 3, 'd': 4} 42 | stream = Stream.from_iterable(test_input.items()) 43 | assert test_input == dict(await stream.to_list()) 44 | 45 | @pytest.mark.asyncio 46 | async def test_creating_stream_from_generator(): 47 | stream = Stream.from_iterable(range(4)) 48 | assert [0, 1, 2, 3] == await stream.to_list() 49 | 50 | @pytest.mark.asyncio 51 | async def test_creating_stream_from_file_object(): 52 | with open("test/sample_text_1.txt") as f: 53 | stream = Stream.from_iterable(f) 54 | assert ['foo\n', 'bar baz\n', 'qux'] == await stream.to_list() 55 | 56 | @pytest.mark.asyncio 57 | async def test_specifying_chunk_size_on_plain_iterable(): 58 | with pytest.raises(UnsupportedOperation): 59 | result = Stream.read_from([1, 2, 3, 4], chunk_size=2) 60 | 61 | @pytest.mark.asyncio 62 | async def test_non_iterable_source_without_chunk_size(): 63 | class Foo(): 64 | def read(self, how_many): 65 | return "" + "foo"*how_many 66 | 67 | with pytest.raises(UnsupportedOperation): 68 | Stream.read_from(Foo()) 69 | 70 | class AsyncCountUntil(): 71 | def __init__(self, max) -> None: 72 | self.limit = max 73 | 74 | async def __aiter__(self): 75 | for i in range(self.limit): 76 | await asyncio.sleep(0.01) 77 | yield i+1 78 | 79 | @pytest.mark.asyncio 80 | async def test_creating_stream_from_async_iterable(): 81 | stream = Stream.read_from(AsyncCountUntil(8)) 82 | assert [1, 2, 3, 4, 5, 6, 7, 8] == await stream.to_list() 83 | 84 | @pytest.mark.asyncio 85 | async def test_creating_stream_from_another_stream(): 86 | s1 = Stream.read_from(range(8)) 87 | s2 = Stream.read_from(s1).map(lambda x: x*2) 88 | s3 = Stream.read_from(s2) 89 | assert [0, 2, 4, 6, 8, 10, 12, 14] == await s3.to_list() 90 | 91 | @pytest.mark.asyncio 92 | async def test_iterating_over_a_stream(): 93 | stream = Stream.read_from(range(8)) 94 | result = [chunk async for chunk in stream] 95 | assert [0, 1, 2, 3, 4, 5, 6, 7] == result 96 | -------------------------------------------------------------------------------- /test/test_datastream_read_write.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StreamAlreadyConsumed 2 | import asyncio 3 | from scramjet.ansi_color_codes import * 4 | import pytest 5 | 6 | # test cases 7 | 8 | @pytest.mark.asyncio 9 | async def test_writing_chunks_to_stream(): 10 | stream = Stream() 11 | for x in [1, 2, 3, 4]: 12 | stream.write(x) 13 | 
stream.end() 14 | assert [1, 2, 3, 4] == await stream.to_list() 15 | 16 | @pytest.mark.asyncio 17 | async def test_reading_chunks_from_stream(): 18 | stream = Stream.from_iterable('abcd') 19 | assert await stream.read() == 'a' 20 | assert await stream.read() == 'b' 21 | assert await stream.read() == 'c' 22 | assert await stream.read() == 'd' 23 | 24 | @pytest.mark.asyncio 25 | async def test_reading_from_consumed_stream(): 26 | s1 = Stream.from_iterable('abcd') 27 | s2 = s1.map(lambda x: x*2) 28 | with pytest.raises(StreamAlreadyConsumed): 29 | await s1.read() 30 | 31 | @pytest.mark.asyncio 32 | async def test_writing_to_immediate_stream(): 33 | s1 = Stream().map(lambda x: x*2) 34 | s2 = s1.map(lambda x: "foo-" + x) 35 | s2.write("a") 36 | assert await s2.read() == "foo-aa" 37 | 38 | @pytest.mark.asyncio 39 | async def test_writing_to_immediate_stream_with_pyfca_break(): 40 | s1 = Stream() 41 | s2 = s1.batch(lambda chunk: chunk > "d") 42 | s3 = s2.map(lambda x: len(x)) 43 | s3.write("a") 44 | s3.write("c") 45 | s3.write("e") 46 | s3.write("d") 47 | s3.write("b") 48 | s1.end() 49 | assert await s3.to_list() == [3, 2] 50 | 51 | @pytest.mark.asyncio 52 | async def test_reading_some_chunks_from_stream(): 53 | stream = Stream.from_iterable('abcd') 54 | assert await stream.read() == 'a' 55 | assert ['b', 'c', 'd'] == await stream.to_list() 56 | 57 | @pytest.mark.asyncio 58 | async def test_reading_and_writing_in_turn(): 59 | stream = Stream() 60 | for x in [1, 2, 3, 4]: 61 | await stream.write(x) 62 | assert await stream.read() == x 63 | stream.end() 64 | 65 | @pytest.mark.asyncio 66 | async def test_stream_write_returns_drain_status(): 67 | stream = Stream(max_parallel=4) 68 | data = [1, 2, 3, 4, 5, 6, 7, 8] 69 | writes = [stream.write(x) for x in data] 70 | # initially only writes below max_parallel should resolve drain 71 | for i, drain in enumerate(writes, 1): 72 | assert drain.done() == (i<4) 73 | stream.end() 74 | result = await stream.to_list() 75 | assert result == data 76 | # wait one event loop iteration for drain updates 77 | await asyncio.sleep(0) 78 | for drain in writes: 79 | assert drain.done() 80 | -------------------------------------------------------------------------------- /test/test_datastream_transformations.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | from scramjet.pyfca import DropChunk 3 | import asyncio 4 | from scramjet.ansi_color_codes import * 5 | import scramjet.utils as utils 6 | import pytest 7 | 8 | log = utils.LogWithTimer.log 9 | 10 | # transformations 11 | 12 | async def async_is_even(x): 13 | await asyncio.sleep((x%5)/100) 14 | return x % 2 == 0 15 | 16 | async def async_square(x): 17 | await asyncio.sleep((x%5)/100) 18 | return x**2 19 | 20 | async def echo(x): 21 | log(f"{yellow}Processing:{reset} {repr(x)}") 22 | return x 23 | 24 | 25 | # test cases 26 | 27 | @pytest.mark.asyncio 28 | async def test_simple_filtering(): 29 | stream = Stream.from_iterable(range(12)) 30 | result = await stream.filter(lambda x: x % 2 == 0).to_list() 31 | assert result == [0, 2, 4, 6, 8, 10] 32 | 33 | @pytest.mark.asyncio 34 | async def test_simple_filtering_with_args(): 35 | stream = Stream.from_iterable(range(12)) 36 | def foo(x, *args): 37 | return x < sum(args) 38 | result = await stream.filter(foo, 3, 5).to_list() 39 | assert result == [0, 1, 2, 3, 4, 5, 6, 7] 40 | 41 | @pytest.mark.asyncio 42 | async def test_simple_mapping(): 43 | stream = Stream.from_iterable(range(8)) 44 | result = await 
stream.map(lambda x: x**2).to_list() 45 | assert result == [0, 1, 4, 9, 16, 25, 36, 49] 46 | 47 | @pytest.mark.asyncio 48 | async def test_sync_transformations(): 49 | result = await ( 50 | Stream 51 | .from_iterable(range(12), max_parallel=4) 52 | .filter(lambda x: x % 2 == 0) 53 | .map(lambda x: x**2) 54 | .to_list() 55 | ) 56 | assert result == [0, 4, 16, 36, 64, 100] 57 | 58 | @pytest.mark.asyncio 59 | async def test_async_transformations(): 60 | result = await ( 61 | Stream 62 | .from_iterable(range(12), max_parallel=4) 63 | .filter(async_is_even) 64 | .map(async_square) 65 | .to_list() 66 | ) 67 | assert result == [0, 4, 16, 36, 64, 100] 68 | 69 | # Stream should not start consuming input until a "sink" method (e.g. to_list) 70 | # is used (to avoid processing items before all transformations are added). 71 | @pytest.mark.asyncio 72 | async def test_adding_transformations_after_a_pause(): 73 | result = Stream.from_iterable(range(12), max_parallel=4) 74 | # Let the event loop run several times, to ensure that any writes to pyfca 75 | # (if they were scheduled) had a chance to run. 76 | for _ in range(8): 77 | await asyncio.sleep(0) 78 | # Stream should not be consumed before transformations are added. 79 | result = await result.filter(async_is_even).map(async_square).to_list() 80 | assert result == [0, 4, 16, 36, 64, 100] 81 | 82 | @pytest.mark.asyncio 83 | async def test_filter_creates_new_stream_instance(): 84 | stream = Stream.from_iterable(range(12), max_parallel=4) 85 | filtered = stream.filter(lambda x: x % 2 == 0) 86 | assert filtered != stream 87 | assert await filtered.to_list() == [0, 2, 4, 6, 8, 10] 88 | 89 | @pytest.mark.asyncio 90 | async def test_map_creates_new_stream_instance(): 91 | stream = Stream.from_iterable(range(8), max_parallel=4) 92 | mapped = stream.map(lambda x: x**2) 93 | assert mapped != stream 94 | assert await mapped.to_list() == [0, 1, 4, 9, 16, 25, 36, 49] 95 | 96 | @pytest.mark.asyncio 97 | async def test_filtering_in_map_transformation(): 98 | stream = Stream.from_iterable(range(8), max_parallel=4) 99 | # It should be possible to do filtering and mapping in one step. 
100 | def filtering_map(x): 101 | # map and filter elements in one step 102 | return DropChunk if x % 3 == 0 else x*2 103 | result = await stream.map(filtering_map).to_list() 104 | assert result == [2, 4, 8, 10, 14] 105 | 106 | @pytest.mark.asyncio 107 | async def test_variadic_args(): 108 | stream = Stream.from_iterable(range(8)) 109 | # pow requires 2 arguments - base (chunk) and exponent (set to 2) 110 | result = await stream.map(pow, 2).to_list() 111 | assert result == [0, 1, 4, 9, 16, 25, 36, 49] 112 | 113 | @pytest.mark.asyncio 114 | async def test_transformations_on_data_from_file_object(): 115 | with open("test/sample_numbers_1.txt") as f: 116 | stream = Stream.from_iterable(f, max_parallel=4) 117 | result = await ( 118 | stream 119 | .map(echo) 120 | .map(lambda s: int(s.strip())) 121 | .filter(lambda x: x % 2 == 0) 122 | .map(lambda x: x**2) 123 | .map(lambda x: str(x)) 124 | .to_list() 125 | ) 126 | assert result == ['64', '196', '400', '256'] 127 | -------------------------------------------------------------------------------- /test/test_flatmap.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import asyncio 3 | import scramjet.utils as utils 4 | from scramjet.ansi_color_codes import * 5 | import pytest 6 | 7 | log = utils.LogWithTimer.log 8 | fmt = utils.print_formatted 9 | 10 | @pytest.mark.asyncio 11 | async def test_flattening_lists(): 12 | data = ["foo\nbar", "cork", "qux\nbarf ploxx\n", "baz"] 13 | stream = Stream.from_iterable(data, max_parallel=4) 14 | result = await stream.flatmap(lambda s: s.split()).to_list() 15 | print('result:', result) 16 | assert result == ['foo', 'bar', 'cork', 'qux', 'barf', 'ploxx', 'baz'] 17 | 18 | @pytest.mark.asyncio 19 | async def test_flattening_strings(): 20 | data = ["a", "flatmap"] 21 | stream = Stream.from_iterable(data, max_parallel=4) 22 | result = await stream.flatmap(lambda s: s).to_list() 23 | print('result:', result) 24 | assert result == ['a', 'f', 'l', 'a', 't', 'm', 'a', 'p'] 25 | 26 | @pytest.mark.asyncio 27 | async def test_empty_iterables(): 28 | data = [1, 2, 3, 4] 29 | stream = Stream.from_iterable(data, max_parallel=4) 30 | result = await stream.flatmap(lambda x: []).to_list() 31 | print('result:', result) 32 | assert result == [] 33 | 34 | @pytest.mark.asyncio 35 | async def test_flattening_non_iterables_errors(): 36 | data = [1, 2, 3, 4] 37 | Stream.from_iterable(data).flatmap(lambda x: x) 38 | # find flatmap task and see if it errored as expected 39 | for task in asyncio.all_tasks(): 40 | if task.get_name() == 'flatmap-consumer': 41 | with pytest.raises(TypeError): 42 | await task 43 | 44 | @pytest.mark.asyncio 45 | async def test_flattening_lists_with_coroutine(): 46 | async def split(string: str): 47 | return string.split() 48 | data = ["foo\nbar", "cork", "qux\nbarf ploxx\n", "baz"] 49 | stream = Stream.from_iterable(data, max_parallel=1) 50 | result = await stream.flatmap(split).to_list() 51 | assert result == ['foo', 'bar', 'cork', 'qux', 'barf', 'ploxx', 'baz'] 52 | -------------------------------------------------------------------------------- /test/test_miscellaneous.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StringStream 2 | import pytest 3 | import asyncio 4 | 5 | async def read_as_binary_and_decode(size, expected): 6 | with open('test/sample_multibyte_text.txt', 'rb') as file: 7 | bytes = await Stream.read_from(file, chunk_size=size).to_list() 
8 | 9 | # ensure that we really have characters split across chunks 10 | with pytest.raises(UnicodeDecodeError): 11 | for chunk in bytes: 12 | chunk.decode("UTF-8") 13 | 14 | result = await Stream.read_from(bytes).decode("UTF-8").to_list() 15 | assert result == expected 16 | 17 | @pytest.mark.asyncio 18 | async def test_decoding_characters_split_across_chunks(): 19 | await read_as_binary_and_decode(3, ['ż', 'ół', 'ć']) 20 | 21 | @pytest.mark.asyncio 22 | async def test_decoding_chunks_with_(): 23 | # with chunk_size == 1 some incoming chunks will contain only partial 24 | # data, yielding empty strings. Ensure these are dropped. 25 | await read_as_binary_and_decode(1, ['ż', 'ó', 'ł', 'ć']) 26 | 27 | @pytest.mark.asyncio 28 | async def test_read_from_respects_stream_class(): 29 | stream = StringStream.read_from(['a', 'b', 'c', 'd']) 30 | assert type(stream) == StringStream 31 | await stream.to_list() 32 | 33 | @pytest.mark.asyncio 34 | async def test_changing_datastream_to_stringstream(): 35 | s1 = Stream.read_from(['a', 'b', 'c', 'd']) 36 | s2 = s1._as(StringStream) 37 | assert type(s2) == StringStream 38 | assert type(s1) == Stream 39 | assert await s2.to_list() == ['a', 'b', 'c', 'd'] 40 | 41 | @pytest.mark.asyncio 42 | async def test_mapping_stringstream_produces_stringstream(): 43 | s1 = StringStream.read_from(['a', 'b', 'c', 'd']) 44 | s2 = s1.map(lambda s: s*2) 45 | assert type(s1) == type(s2) == StringStream 46 | assert await s2.to_list() == ['aa', 'bb', 'cc', 'dd'] 47 | 48 | @pytest.mark.asyncio 49 | async def test_decoding_datastream_produces_stringstream(): 50 | s1 = Stream.read_from([b'foo\n', b'bar baz\n', b'qux']) 51 | s2 = s1.decode("UTF-8") 52 | assert type(s2) == StringStream 53 | assert await s2.to_list() == ['foo\n', 'bar baz\n', 'qux'] 54 | 55 | @pytest.mark.asyncio 56 | async def test_converting_streams_does_not_break_pyfca(): 57 | s1 = Stream.read_from(['a', 'b', 'c', 'd']).map(lambda x: x*2) 58 | s2 = s1._as(StringStream).map(lambda x: 'foo '+x) 59 | assert s2._pyfca == s1._pyfca 60 | await s2.to_list() 61 | 62 | @pytest.mark.asyncio 63 | async def test_each_method(): 64 | result = [] 65 | stream = StringStream.read_from(['a', 'b', 'c', 'd']) 66 | await stream.each(lambda x: result.append(x)).to_list() 67 | assert result == ['a', 'b', 'c', 'd'] 68 | 69 | @pytest.mark.asyncio 70 | async def test_each_method_async(): 71 | sleep_finished = False 72 | async def wait(chunk): 73 | nonlocal sleep_finished 74 | await asyncio.sleep(0.01) 75 | sleep_finished = True 76 | stream = StringStream.read_from(['a', 'b', 'c', 'd']) 77 | await stream.each(wait).to_list() 78 | assert sleep_finished 79 | 80 | def parse_and_square_even_dollars(stream): 81 | return ( 82 | stream 83 | .map(lambda s: int(s[1:])) 84 | .filter(lambda x: x % 2 == 0) 85 | .map(lambda x: x**2) 86 | .map(lambda x: "$" + str(x)) 87 | ) 88 | 89 | @pytest.mark.asyncio 90 | async def test_use_method(): 91 | data = ['$8', '$25', '$3', '$14', '$20', '$9', '$13', '$16'] 92 | stream = Stream.from_iterable(data, max_parallel=4) 93 | result = await stream.use(parse_and_square_even_dollars).to_list() 94 | assert result == ['$64', '$196', '$400', '$256'] 95 | 96 | 97 | async def test_await_on_stream(): 98 | data = ['$8', '$25', '$3', '$14', '$20', '$9', '$13', '$16'] 99 | stream = Stream.from_iterable(data, max_parallel=4) 100 | with pytest.raises(TypeError): 101 | await stream 102 | -------------------------------------------------------------------------------- /test/test_pipe.py: 
-------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StreamAlreadyConsumed 2 | import asyncio 3 | import pytest 4 | 5 | @pytest.mark.asyncio 6 | async def test_simple_stream_piping(): 7 | s1 = Stream.read_from(range(8)).map(lambda x: 2*x) 8 | s2 = Stream().filter(lambda x: x > 5) 9 | s1.pipe(s2) 10 | assert await s2.to_list() == [6, 8, 10, 12, 14] 11 | 12 | @pytest.mark.asyncio 13 | async def test_piping_to_multiple_targets(): 14 | source = Stream.read_from(range(8), max_parallel=4).map(lambda x: x+1) 15 | s1 = Stream(max_parallel=4).map(lambda x: x/10) 16 | s2 = Stream(max_parallel=4).map(lambda x: x*10) 17 | source.pipe(s1) 18 | source.pipe(s2) 19 | result1, result2 = await asyncio.gather(s1.to_list(), s2.to_list()) 20 | assert result1 == [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] 21 | assert result2 == [10, 20, 30, 40, 50, 60, 70, 80] 22 | 23 | @pytest.mark.asyncio 24 | async def test_piped_stream_cannot_be_transformed(): 25 | s1 = Stream.read_from(range(8)).map(lambda x: 2*x) 26 | s2 = Stream() 27 | s1.pipe(s2) 28 | with pytest.raises(StreamAlreadyConsumed): 29 | s1.map(lambda x: x+1) 30 | await s2.to_list() 31 | -------------------------------------------------------------------------------- /test/test_processing_order.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | import asyncio 4 | from pprint import pprint 5 | import pytest 6 | 7 | from scramjet import pyfca 8 | import scramjet.utils as utils 9 | from scramjet.ansi_color_codes import * 10 | 11 | log = utils.LogWithTimer.log 12 | fmt = utils.print_formatted 13 | 14 | # Use to change delays mocking async function execution 15 | SLOMO_FACTOR = 0.2 16 | 17 | MAX_PARALLEL = 4 18 | 19 | # every second item gets a 'delay' value to mimic long async operations 20 | def make_sequence(count): 21 | result = [{'id': n, 'value': n} for n in range(count)] 22 | for item in result: 23 | if item['id'] % 2: 24 | item['delay'] = 0.1 25 | return result 26 | 27 | 28 | # for checking the order in which transformation functions will be called 29 | function_calls = [] 30 | 31 | async def increment(chunk): 32 | function_calls.append(('increment', chunk['id'])) 33 | if 'delay' in chunk: 34 | await asyncio.sleep(chunk['delay'] * SLOMO_FACTOR) 35 | chunk['value'] = chunk['value'] + 1 36 | return chunk 37 | 38 | def double(chunk): 39 | function_calls.append(('double', chunk['id'])) 40 | chunk['value'] = chunk['value'] * 2 41 | return chunk 42 | 43 | def square(chunk): 44 | function_calls.append(('square', chunk['id'])) 45 | chunk['value'] = chunk['value'] ** 2 46 | return chunk 47 | 48 | 49 | @pytest.mark.asyncio 50 | async def test_processing_order_without_waiting(): 51 | input_data = make_sequence(6) 52 | function_calls.clear() 53 | utils.LogWithTimer.reset() 54 | p = pyfca.Pyfca(MAX_PARALLEL) 55 | p.add_transform(increment) 56 | p.add_transform(square) 57 | p.add_transform(double) 58 | 59 | for i, x in enumerate(input_data, start=1): 60 | drain = p.write(x) 61 | assert drain.done() == (True if i < MAX_PARALLEL else False) 62 | 63 | reads = [p.read() for _ in input_data] 64 | results = await asyncio.gather(*reads) 65 | pprint(results) 66 | check_order(results) 67 | incr_order, dbl_order, sqr_order = extract_ordering(function_calls) 68 | 69 | # first transformation function should be called in the same order as input 70 | assert incr_order == [0, 1, 2, 3, 4, 5] 71 | # 2nd and 3rd functions should be called first on items with 
even id (as 72 | # they are processed by first function immediately) 73 | assert dbl_order == sqr_order == [0, 2, 4, 1, 3, 5] 74 | assert function_calls == [ 75 | ('increment', 0), 76 | ('square', 0), 77 | ('double', 0), 78 | ('increment', 1), 79 | ('increment', 2), 80 | ('square', 2), 81 | ('double', 2), 82 | ('increment', 3), 83 | ('increment', 4), 84 | ('square', 4), 85 | ('double', 4), 86 | ('increment', 5), 87 | ('square', 1), 88 | ('double', 1), 89 | ('square', 3), 90 | ('double', 3), 91 | ('square', 5), 92 | ('double', 5), 93 | ] 94 | 95 | 96 | @pytest.mark.asyncio 97 | async def test_processing_order_with_waiting(): 98 | input_data = make_sequence(9) 99 | function_calls.clear() 100 | utils.LogWithTimer.reset() 101 | p = pyfca.Pyfca(MAX_PARALLEL) 102 | p.add_transform(increment) 103 | p.add_transform(square) 104 | p.add_transform(double) 105 | 106 | reads = [p.read() for _ in input_data] 107 | for x in input_data: 108 | await p.write(x) 109 | 110 | results = await asyncio.gather(*reads) 111 | pprint(results) 112 | check_order(results) 113 | incr_order, dbl_order, sqr_order = extract_ordering(function_calls) 114 | 115 | # first transformation function should be called in the same order as input 116 | assert incr_order == [0, 1, 2, 3, 4, 5, 6, 7, 8] 117 | # 2nd and 3rd functions should be called first on items with even id (as 118 | # they are processed by first function immediately) and then with odd id, 119 | # in batches of 4. Note that 0th element returns immediately so the first 120 | # "batch" starts at 1. 121 | assert dbl_order == sqr_order == [0, 2, 4, 1, 3, 6, 8, 5, 7] 122 | assert function_calls == [ 123 | ('increment', 0), 124 | ('square', 0), 125 | ('double', 0), 126 | ('increment', 1), 127 | ('increment', 2), 128 | ('square', 2), 129 | ('double', 2), 130 | ('increment', 3), 131 | ('increment', 4), 132 | ('square', 4), 133 | ('double', 4), 134 | ('square', 1), 135 | ('double', 1), 136 | ('square', 3), 137 | ('double', 3), 138 | ('increment', 5), 139 | ('increment', 6), 140 | ('square', 6), 141 | ('double', 6), 142 | ('increment', 7), 143 | ('increment', 8), 144 | ('square', 8), 145 | ('double', 8), 146 | ('square', 5), 147 | ('double', 5), 148 | ('square', 7), 149 | ('double', 7), 150 | ] 151 | 152 | 153 | def check_order(results): 154 | output_order = [item['id'] for item in results] 155 | print('Order of output items:', output_order) 156 | assert output_order == list(range(len(results))) 157 | 158 | def extract_ordering(fcalls): 159 | incrementing_order = [id for fname, id in fcalls if fname == 'increment'] 160 | doubling_order = [id for fname, id in fcalls if fname == 'double'] 161 | squaring_order = [id for fname, id in fcalls if fname == 'square'] 162 | print('Order of "increment" calls:', incrementing_order) 163 | print('Order of "double" calls:', doubling_order) 164 | print('Order of "square" calls:', squaring_order) 165 | return incrementing_order, doubling_order, squaring_order 166 | -------------------------------------------------------------------------------- /test/test_pyfca_spec.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | """This file contains tests written according to specification from 4 | https://github.com/scramjetorg/scramjet-framework-shared/blob/main/tests/spec/ifca.md""" 5 | 6 | import asyncio 7 | import time 8 | import copy 9 | import pytest 10 | 11 | from scramjet import pyfca 12 | import scramjet.utils as utils 13 | from scramjet.ansi_color_codes import * 14 | 15 | 16 | 
MAX_PARALLEL = 4 17 | # Use to change delays mocking async function execution 18 | SPEED = 200 19 | log = utils.LogWithTimer.log 20 | 21 | # Input data 22 | 23 | TEST_DATA_1 = [ 24 | {'id': x, 'delay': (x % 2)/SPEED} for x in range(6) 25 | ] 26 | 27 | TEST_SEQUENCE = [1, 3, 2, 6, 4, 5] 28 | 29 | TEST_DATA_2 = [ 30 | {'id': count, 'value': value, 'delay': value/SPEED} 31 | for count, value 32 | in enumerate(TEST_SEQUENCE) 33 | ] 34 | 35 | 36 | # Transformation functions and utilities 37 | 38 | async def async_identity(x): 39 | log(f'{yellow}identity start:{reset} {x}') 40 | await utils.mock_delay(x) 41 | log(f'{yellow}identity end:{reset} -> {x}') 42 | return x 43 | 44 | async def async_keep_even(x): 45 | await utils.mock_delay(x) 46 | if x % 2 == 0: 47 | log(f'{yellow}keep even:{reset} {x} -> {x}') 48 | return x 49 | else: 50 | log(f'{yellow}keep even:{reset} {x} -> drop') 51 | return pyfca.DropChunk 52 | 53 | 54 | 55 | # Basic tests 56 | # ----------- 57 | 58 | @pytest.mark.asyncio 59 | async def test_passthrough_by_default(): 60 | input_data = copy.deepcopy(TEST_DATA_2) 61 | p = pyfca.Pyfca(MAX_PARALLEL) 62 | for x in input_data: 63 | p.write(x) 64 | results = [await p.read() for _ in input_data] 65 | for x in results: 66 | log(f"Got: {x}") 67 | # Output should match the input exactly (both values and ordering). 68 | assert results == input_data 69 | 70 | @pytest.mark.asyncio 71 | async def test_simple_transformation(): 72 | input_data = ['a', 'b', 'c', 'd', 'e', 'f'] 73 | p = pyfca.Pyfca(MAX_PARALLEL) 74 | p.add_transform(lambda s: 'foo-' + s) 75 | for x in input_data: 76 | p.write(x) 77 | results = [await p.read() for _ in input_data] 78 | for x in results: 79 | log(f"Got: {x}") 80 | assert results == ['foo-a', 'foo-b', 'foo-c', 'foo-d', 'foo-e', 'foo-f'] 81 | 82 | @pytest.mark.asyncio 83 | async def test_concurrent_processing(): 84 | input_data = copy.deepcopy(TEST_DATA_2) 85 | processing_times = [] 86 | processing_items = [] 87 | 88 | async def transform(x): 89 | start = time.perf_counter() 90 | processing_items.append(x) 91 | log(f'{yellow}processing start:{reset} {x}') 92 | await utils.mock_delay(x) 93 | log(f'{yellow}processing end:{reset} {x}') 94 | processing_times.append(time.perf_counter() - start) 95 | return x 96 | 97 | p = pyfca.Pyfca(MAX_PARALLEL) 98 | p.add_transform(transform) 99 | start = time.perf_counter() 100 | for x in input_data: 101 | p.write(x) 102 | 103 | # let one event loop iteration run 104 | await asyncio.sleep(0) 105 | for item in input_data: 106 | assert item in processing_items 107 | 108 | for _ in input_data: 109 | await p.read() 110 | 111 | total_processing_time = time.perf_counter() - start 112 | assert total_processing_time < sum(processing_times) 113 | 114 | longest_item = max(processing_times) 115 | absolute_overhead = 0.02 # at most 20ms overhead 116 | relative_overhead = 1.20 # at most 20% overhead 117 | assert total_processing_time < longest_item + absolute_overhead 118 | assert total_processing_time < longest_item * relative_overhead 119 | 120 | 121 | # Ordering tests 122 | # -------------- 123 | 124 | @pytest.mark.asyncio 125 | async def test_result_order_with_odd_chunks_delayed(): 126 | input_data = copy.deepcopy(TEST_DATA_1) 127 | p = pyfca.Pyfca(MAX_PARALLEL) 128 | p.add_transform(async_identity) 129 | for x in input_data: 130 | p.write(x) 131 | results = [await p.read() for _ in input_data] 132 | # items should appear in the output unchanged and in the same order 133 | assert results == input_data 134 | 135 | @pytest.mark.asyncio 136 | 
async def test_result_order_with_varying_processing_time(): 137 | input_data = copy.deepcopy(TEST_DATA_2) 138 | p = pyfca.Pyfca(MAX_PARALLEL) 139 | p.add_transform(async_identity) 140 | for x in input_data: 141 | p.write(x) 142 | results = [await p.read() for _ in input_data] 143 | # items should appear in the output unchanged and in the same order 144 | assert results == input_data 145 | 146 | @pytest.mark.asyncio 147 | async def test_write_and_read_in_turn(): 148 | input_data = copy.deepcopy(TEST_DATA_2) 149 | p = pyfca.Pyfca(MAX_PARALLEL) 150 | p.add_transform(async_identity) 151 | reads = [] 152 | for x in input_data: 153 | p.write(x) 154 | reads.append(p.read()) 155 | results = await asyncio.gather(*reads) 156 | # items should appear in the output unchanged and in the same order 157 | assert results == input_data 158 | 159 | @pytest.mark.asyncio 160 | async def test_multiple_concurrent_reads(): 161 | input_data = copy.deepcopy(TEST_DATA_2) 162 | p = pyfca.Pyfca(MAX_PARALLEL) 163 | p.add_transform(async_identity) 164 | for x in input_data: 165 | p.write(x) 166 | reads = [p.read() for _ in input_data] 167 | results = await asyncio.gather(*reads) 168 | # items should appear in the output unchanged and in the same order 169 | assert results == input_data 170 | 171 | @pytest.mark.asyncio 172 | async def test_reads_before_write(): 173 | input_data = copy.deepcopy(TEST_DATA_2) 174 | p = pyfca.Pyfca(MAX_PARALLEL) 175 | p.add_transform(async_identity) 176 | reads = [p.read() for _ in input_data] 177 | for x in input_data: 178 | p.write(x) 179 | results = await asyncio.gather(*reads) 180 | # items should appear in the output unchanged and in the same order 181 | assert results == input_data 182 | 183 | 184 | # Filtering tests 185 | # --------------- 186 | 187 | @pytest.mark.asyncio 188 | async def test_support_for_dropping_chunks(): 189 | input_data = list(range(8)) 190 | p = pyfca.Pyfca(MAX_PARALLEL) 191 | p.add_transform(async_keep_even) 192 | for x in input_data: 193 | p.write(x) 194 | results = [await p.read() for _ in range(4)] 195 | assert results == [0, 2, 4, 6] 196 | 197 | @pytest.mark.asyncio 198 | async def test_reads_before_filtering(): 199 | input_data = list(range(8)) 200 | p = pyfca.Pyfca(MAX_PARALLEL) 201 | p.add_transform(async_keep_even) 202 | reads = [p.read() for _ in range(4)] 203 | for x in input_data: 204 | p.write(x) 205 | results = await asyncio.gather(*reads) 206 | assert results == [0, 2, 4, 6] 207 | 208 | @pytest.mark.asyncio 209 | async def test_dropping_chunks_in_the_middle_of_chain(): 210 | first_func_called = False 211 | def first(x): 212 | nonlocal first_func_called 213 | first_func_called = True 214 | log(f'{yellow}drop all:{reset} {x} -> drop') 215 | return pyfca.DropChunk 216 | 217 | second_func_called = False 218 | def second(x): 219 | nonlocal second_func_called 220 | second_func_called = True 221 | log(f'{yellow}never called:{reset} {x}') 222 | return x 223 | 224 | input_data = list(range(8)) 225 | p = pyfca.Pyfca(MAX_PARALLEL) 226 | p.add_transform(first) 227 | p.add_transform(second) 228 | for x in input_data: 229 | p.write(x) 230 | 231 | # Ensure all items were processed 232 | p.end() 233 | await p.read() 234 | 235 | assert first_func_called == True 236 | assert second_func_called == False 237 | 238 | 239 | # Limits tests 240 | # ------------ 241 | 242 | @pytest.mark.asyncio 243 | async def test_unrestricted_writing_below_limit(): 244 | input_data = copy.deepcopy(TEST_DATA_2)[:MAX_PARALLEL-1] 245 | p = pyfca.Pyfca(MAX_PARALLEL) 246 | 
p.add_transform(async_identity) 247 | for x in input_data: 248 | drain = p.write(x) 249 | assert drain.done() == True 250 | [await p.read() for _ in input_data] 251 | 252 | @pytest.mark.asyncio 253 | async def test_drain_pending_when_limit_reached(): 254 | input_data = copy.deepcopy(TEST_DATA_2)[:MAX_PARALLEL] 255 | p = pyfca.Pyfca(MAX_PARALLEL) 256 | p.add_transform(async_identity) 257 | writes = [p.write(x) for x in input_data] 258 | assert writes[-1].done() == False 259 | [await p.read() for _ in input_data] 260 | 261 | @pytest.mark.asyncio 262 | async def test_drain_resolved_when_drops_below_limit(): 263 | input_data = copy.deepcopy(TEST_DATA_2)[:MAX_PARALLEL+2] 264 | p = pyfca.Pyfca(MAX_PARALLEL) 265 | p.add_transform(async_identity) 266 | writes = [p.write(x) for x in input_data] 267 | for drain in writes[-3:]: 268 | assert drain.done() == False 269 | for _ in range(3): 270 | await p.read() 271 | await asyncio.sleep(0) 272 | for drain in writes[-3:]: 273 | assert drain.done() == True 274 | for _ in range(3): 275 | await p.read() 276 | 277 | 278 | # Ending tests 279 | # ------------ 280 | 281 | @pytest.mark.asyncio 282 | async def test_reading_from_empty_ifca(): 283 | p = pyfca.Pyfca(MAX_PARALLEL) 284 | p.end() 285 | result = await p.read() 286 | log(f"Got: {result}") 287 | assert result == None 288 | 289 | @pytest.mark.asyncio 290 | async def test_end_with_pending_reads(): 291 | N = MAX_PARALLEL*2 292 | p = pyfca.Pyfca(MAX_PARALLEL) 293 | reads = [p.read() for _ in range(N)] 294 | p.end() 295 | results = await asyncio.gather(*reads) 296 | log(f"Got: {results}") 297 | assert results == [None] * N 298 | 299 | @pytest.mark.asyncio 300 | async def test_write_after_end_errors(): 301 | p = pyfca.Pyfca(MAX_PARALLEL) 302 | p.end() 303 | with pytest.raises(pyfca.WriteAfterEnd): 304 | p.write('foo') 305 | 306 | @pytest.mark.asyncio 307 | async def test_multiple_ends_error(): 308 | p = pyfca.Pyfca(MAX_PARALLEL) 309 | p.end() 310 | with pytest.raises(pyfca.MultipleEnd): 311 | p.end() 312 | -------------------------------------------------------------------------------- /test/test_reading_files.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from scramjet.streams import Stream 3 | import pytest 4 | from multiprocessing import Process, Value 5 | import math 6 | import test.large_test_files 7 | import time 8 | import aiofiles 9 | 10 | @pytest.mark.asyncio 11 | async def test_stream_from_file_opened_as_text_carries_strings(): 12 | with open("test/sample_text_0.txt") as file: 13 | result = await Stream.read_from(file).to_list() 14 | assert result == ['foo\n'] 15 | 16 | @pytest.mark.asyncio 17 | async def test_stream_from_file_opened_as_binary_carries_bytes(): 18 | with open("test/sample_text_0.txt", 'rb') as file: 19 | result = await Stream.read_from(file).to_list() 20 | assert result == [b'foo\n'] 21 | 22 | @pytest.mark.asyncio 23 | async def test_no_chunk_size_in_text_mode(): 24 | with open("test/sample_text_1.txt") as file: 25 | result = await Stream.read_from(file).to_list() 26 | assert result == ['foo\n', 'bar baz\n', 'qux'] 27 | 28 | @pytest.mark.asyncio 29 | async def test_no_chunk_size_in_binary_mode(): 30 | with open("test/sample_text_1.txt", 'rb') as file: 31 | result = await Stream.read_from(file).to_list() 32 | assert result == [b'foo\n', b'bar baz\n', b'qux'] 33 | 34 | @pytest.mark.asyncio 35 | async def test_specifying_chunk_size_in_text_mode(): 36 | SIZE = 32 37 | with open("test/sample_text_3.txt") as 
file: 38 | result = await Stream.read_from(file, chunk_size=SIZE).to_list() 39 | for chunk in result[:-1]: # last one may be smaller 40 | assert len(chunk) == SIZE 41 | assert len(result[-1]) <= SIZE 42 | 43 | @pytest.mark.asyncio 44 | async def test_specifying_chunk_size_in_binary_mode(): 45 | SIZE = 32 46 | with open("test/sample_text_3.txt", 'rb') as file: 47 | result = await Stream.read_from(file, chunk_size=SIZE).to_list() 48 | for chunk in result[:-1]: # last one may be smaller 49 | assert len(chunk) == SIZE 50 | assert len(result[-1]) <= SIZE 51 | 52 | @pytest.mark.asyncio 53 | async def test_chunk_size_with_multibyte_chars_in_text_mode(): 54 | with open('test/sample_multibyte_text.txt') as file: 55 | individual_letters = [c for c in file.read()] 56 | with open('test/sample_multibyte_text.txt') as file: 57 | # each chunk should be a complete unicode character 58 | result = await Stream.read_from(file, chunk_size=1).to_list() 59 | assert result == individual_letters 60 | 61 | @pytest.mark.asyncio 62 | async def test_chunk_size_with_multibyte_chars_in_binary_mode(): 63 | with open('test/sample_multibyte_text.txt') as file: 64 | individual_letters = [c for c in file.read()] 65 | with open('test/sample_multibyte_text.txt', 'rb') as file: 66 | # with chunk_size=1 each byte should become separate chunk, 67 | # yielding chunks that are not valid UTF. 68 | result = await Stream.read_from(file, chunk_size=1).to_list() 69 | assert len(result) > len(individual_letters) 70 | with pytest.raises(UnicodeDecodeError): 71 | for chunk in result: 72 | letter = chunk.decode("UTF-8") 73 | 74 | @pytest.mark.asyncio 75 | async def test_reading_large_file_in_chunks(): 76 | path, fsize = test.large_test_files.file_with_newlines 77 | with open(path) as file: 78 | result = await Stream.read_from(file, chunk_size=16384).to_list() 79 | # chunks should be unrelated to lines in input file 80 | assert len(result) == math.ceil(fsize/16384) 81 | 82 | @pytest.mark.asyncio 83 | async def test_reading_large_file_without_newlines(): 84 | path, fsize = test.large_test_files.file_without_newlines 85 | with open(path) as file: 86 | result = await Stream.read_from(file).to_list() 87 | assert len(result) == 1 88 | assert len(result[0]) == fsize 89 | 90 | 91 | # Run in a separate process to avoid influence on tested code 92 | class WriteInIntervals(): 93 | def __init__(self, path, data, counter=None, interval=0.01): 94 | self.path, self.data = path, data 95 | self.counter, self.interval = counter, interval 96 | self.writer = Process(target=self.write) 97 | 98 | def __enter__(self): 99 | self.writer.start() 100 | return self 101 | 102 | def __exit__(self, exc_type, exc_value, exc_tb): 103 | self.writer.join() 104 | 105 | def write(self): 106 | with open(self.path, 'w') as pipe: 107 | for chunk in self.data: 108 | time.sleep(self.interval) 109 | print(f'Write into {repr(self.path)}: {repr(chunk)}') 110 | if self.counter: 111 | self.counter.value += 1 112 | pipe.write(chunk) 113 | pipe.flush() 114 | 115 | @pytest.mark.asyncio 116 | async def test_waiting_for_complete_chunk(named_pipe): 117 | data = ['foo', '\n', 'bar baz', ' ', 'bax\nqux'] 118 | with WriteInIntervals(named_pipe, data): 119 | with open(named_pipe) as file: 120 | result = await Stream.read_from(file, chunk_size=8).to_list() 121 | # all except last chunk should have specified size, 122 | # even though some data will be available for reading earlier. 
123 | assert result == ['foo\nbar ', 'baz bax\n', 'qux'] 124 | 125 | @pytest.mark.asyncio 126 | async def test_processing_start_with_sync_source(named_pipe): 127 | data = ['foo\n', 'bar\n', 'baz\n', 'bax\n', 'qux\n'] 128 | chunks_written = Value('i', 0) 129 | write_counts = [] 130 | 131 | def log_how_many_written(chunk): 132 | write_counts.append(chunks_written.value) 133 | return chunk 134 | 135 | with WriteInIntervals(named_pipe, data, chunks_written): 136 | with open(named_pipe) as file: 137 | max_parallel = 3 138 | s = Stream.read_from(file, max_parallel=max_parallel) 139 | result = await s.map(log_how_many_written).to_list() 140 | # Since input is sync, processing of the first chunk should start 141 | # only after max_parallel chunks are read from (and written to) 142 | # the pipe (but it should not wait until all data is read). 143 | assert write_counts[0] == max_parallel < len(data) 144 | assert result == data 145 | 146 | @pytest.mark.asyncio 147 | async def test_processing_start_with_async_source(named_pipe): 148 | data = ['foo\n', 'bar\n', 'baz\n', 'bax\n', 'qux\n'] 149 | chunks_written = Value('i', 0) 150 | chunks_read = Value('i', 0) 151 | read_vs_written = [] 152 | record = namedtuple('record', ['read', 'written']) 153 | 154 | def log_read_vs_written(chunk): 155 | chunks_read.value += 1 156 | read_vs_written.append( 157 | record(chunks_read.value, chunks_written.value) 158 | ) 159 | return chunk 160 | 161 | with WriteInIntervals(named_pipe, data, chunks_written): 162 | async with aiofiles.open(named_pipe) as file: 163 | s = Stream.read_from(file) 164 | result = await s.map(log_read_vs_written).to_list() 165 | # Since input is async, processing of each chunk should start 166 | # immediately after it is written to the pipe. 167 | for record in read_vs_written: 168 | assert record.read == record.written 169 | assert result == data 170 | -------------------------------------------------------------------------------- /test/test_reading_network.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import asyncio 3 | import pytest 4 | from multiprocessing import Process 5 | import os 6 | import math 7 | import test.large_test_files 8 | 9 | # Run in a separate process to avoid influence on tested code 10 | class ServeOverTCP(): 11 | def __init__(self, path, port): 12 | self.path = path 13 | self.port = port 14 | self.writer = Process(target=self.write) 15 | 16 | def __enter__(self): 17 | self.writer.start() 18 | return self 19 | 20 | def __exit__(self, exc_type, exc_value, exc_tb): 21 | self.writer.terminate() 22 | self.writer.join() 23 | 24 | def write(self): 25 | os.system(f'nc -lN localhost {self.port} < {self.path}') 26 | 27 | @pytest.mark.skip(reason="flaky test, sometimes throws ConnectionRefusedError") 28 | @pytest.mark.asyncio 29 | async def test_reading_from_tcp_connection(): 30 | path, fsize = test.large_test_files.file_with_newlines 31 | with open(path, 'rb') as file: 32 | data = file.read() 33 | with ServeOverTCP(path, 8888): 34 | reader, writer = await asyncio.open_connection('localhost', 8888) 35 | result = await Stream.read_from(reader, chunk_size=16384).to_list() 36 | assert len(result) == math.ceil(fsize/16384) 37 | assert b''.join(result) == data 38 | writer.close() 39 | 40 | @pytest.mark.skip(reason="flaky test, sometimes throws ConnectionRefusedError") 41 | @pytest.mark.asyncio 42 | async def test_reading_from_tcp_connection_without_chunk_size(): 43 | path = "test/sample_text_1.txt" 44 | 
with ServeOverTCP(path, 9999): 45 | reader, writer = await asyncio.open_connection('localhost', 9999) 46 | result = await Stream.read_from(reader).to_list() 47 | assert result == [b'foo\n', b'bar baz\n', b'qux'] 48 | -------------------------------------------------------------------------------- /test/test_reduce.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import scramjet.utils as utils 3 | from scramjet.ansi_color_codes import * 4 | import pytest 5 | 6 | log = utils.LogWithTimer.log 7 | fmt = utils.print_formatted 8 | 9 | @pytest.mark.asyncio 10 | async def test_reduce_adding_numbers(): 11 | data = [1, 2, 3, 4, 5, 6] 12 | stream = Stream.from_iterable(data) 13 | result = await stream.reduce(lambda acc, item: acc+item, 0) 14 | print('sum:', result) 15 | assert result == 21 16 | 17 | @pytest.mark.asyncio 18 | async def test_reducing_with_no_initial_value(): 19 | data = [1, 2, 3, 4, 5, 6] 20 | stream = Stream.from_iterable(data, max_parallel=4) 21 | result = await stream.reduce(lambda acc, item: acc+item) 22 | print('sum:', result) 23 | assert result == 21 24 | 25 | @pytest.mark.asyncio 26 | async def test_reducing_numbers_to_string(): 27 | data = [1, 2, 3, 4, 5, 6] 28 | stream = Stream.from_iterable(data, max_parallel=4) 29 | result = await stream.reduce(lambda acc, item: acc+str(item), "") 30 | print('concatenated:', repr(result)) 31 | assert result == "123456" 32 | 33 | @pytest.mark.asyncio 34 | async def test_counting_items_with_reduce(): 35 | data = ['a', 'b', 'c', 'd', 'e', 'f'] 36 | stream = Stream.from_iterable(data, max_parallel=4) 37 | result = await stream.reduce(lambda count, item: count + 1, 0) 38 | print('count:', result) 39 | assert result == 6 40 | 41 | @pytest.mark.asyncio 42 | async def test_calculating_average_with_reduce(): 43 | data = [1, 3, 5, 7] 44 | stream = Stream.from_iterable(data) 45 | def rolling_avg(accumulator, item): 46 | partial_sum, count = accumulator 47 | return (partial_sum + item, count + 1) 48 | sum, count = await stream.reduce(rolling_avg, (0, 0)) 49 | result = sum/count 50 | print('average:', result) 51 | assert result == 4 52 | 53 | @pytest.mark.asyncio 54 | async def test_counting_items_with_reduce_with_coroutine_func(): 55 | data = ['a', 'b', 'c', 'd', 'e', 'f'] 56 | stream = Stream.from_iterable(data, max_parallel=4) 57 | async def count(count, item): 58 | return count + 1 59 | result = await stream.reduce(count, 0) 60 | print('count:', result) 61 | assert result == 6 62 | -------------------------------------------------------------------------------- /test/test_sequence.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import scramjet.utils as utils 3 | from scramjet.ansi_color_codes import * 4 | import pytest 5 | 6 | log = utils.LogWithTimer.log 7 | fmt = utils.print_formatted 8 | 9 | @pytest.mark.asyncio 10 | async def test_sequencing_text_into_lines(): 11 | data = ["foo\nbar", " ", "b", "az", "\nqux\n", "plox"] 12 | result = await ( 13 | Stream 14 | .from_iterable(data, max_parallel=2) 15 | .sequence(lambda part, chunk: (part+chunk).split('\n'), "") 16 | .to_list() 17 | ) 18 | print(result) 19 | assert result == ['foo', 'bar baz', 'qux', 'plox'] 20 | 21 | # I know, this could be done using flatmap+batch 22 | @pytest.mark.asyncio 23 | async def test_sequencing_lists_into_batches(): 24 | data = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]] 25 | def split_into_pairs(part, li): 26 | new_list 
= part + li 27 | every_2nd_index = range(0, len(new_list), 2) 28 | return [new_list[i:i+2] for i in every_2nd_index] 29 | result = await ( 30 | Stream 31 | .from_iterable(data, max_parallel=2) 32 | .sequence(split_into_pairs, []) 33 | .to_list() 34 | ) 35 | print(result) 36 | assert result == [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] 37 | 38 | 39 | @pytest.mark.asyncio 40 | async def test_sequencing_text_into_lines_with_coroutine_sequencer(): 41 | data = ["foo\nbar", " ", "b", "az", "\nqux\n", "plox"] 42 | async def sequencer(part, chunk): 43 | return (part+chunk).split('\n') 44 | result = await ( 45 | Stream 46 | .from_iterable(data, max_parallel=2) 47 | .sequence(sequencer, "") 48 | .to_list() 49 | ) 50 | print(result) 51 | assert result == ['foo', 'bar baz', 'qux', 'plox'] 52 | -------------------------------------------------------------------------------- /test/test_stringstream.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream, StringStream 2 | import asyncio 3 | import pytest 4 | 5 | @pytest.mark.asyncio 6 | async def test_cutting_text_with_custom_sequencer(): 7 | def split(part, chunk): 8 | words = (part+chunk).split() 9 | # handle case where last element shouldn't be treated as partial 10 | if chunk[-1].isspace(): 11 | words.append("") 12 | return words 13 | 14 | data = ["foo\nbar", " ", "b", "az", "\nqux\n", "fork plox"] 15 | result = await ( 16 | Stream 17 | .read_from(data, max_parallel=2) 18 | .sequence(split, "") 19 | .to_list() 20 | ) 21 | assert result == ['foo', 'bar', 'baz', 'qux', 'fork', 'plox'] 22 | 23 | # this should achieve the same as above, but with helper method 24 | @pytest.mark.asyncio 25 | async def test_splitting_text_into_words(): 26 | data = ["foo\nbar", " ", "b", "az", "\nqux\n", "fork plox"] 27 | result = await ( 28 | StringStream 29 | .read_from(data, max_parallel=2) 30 | .split() 31 | .to_list() 32 | ) 33 | assert result == ['foo', 'bar', 'baz', 'qux', 'fork', 'plox'] 34 | 35 | @pytest.mark.asyncio 36 | async def test_splitting_with_custom_delimiter(): 37 | data = ["foo,bar", " ", "b", "az", ",qux,", "fork plox"] 38 | result = await ( 39 | StringStream 40 | .read_from(data, max_parallel=2) 41 | .split(',') 42 | .to_list() 43 | ) 44 | assert result == ['foo', 'bar baz', 'qux', 'fork plox'] 45 | 46 | @pytest.mark.asyncio 47 | async def test_parsing_stringstream_into_datastream(): 48 | data = [ 49 | "AAL\tAmerican Airlines Group Inc\t46.26\t \t0.43\t0.94%", "AAPL\tApple Inc\t110.06\t \t0.11\t0.10%", "ADBE\tAdobe Systems Inc\t105.02\t \t-0.79\t-0.75%", "ADI\tAnalog Devices Inc\t68.47\t \t0.26\t0.38%", "ADP\tAutomatic Data Processing Inc\t94.39\t \t0.01\t0.01%", 50 | "ADSK\tAutodesk Inc\t76.90\t \t-1.56\t-1.99%", "AKAM\tAkamai Technologies Inc\t66.44\t \t-0.16\t-0.24%", "ALXN\tAlexion Pharmaceuticals Inc\t119.85\t \t-3.12\t-2.54%", "AMAT\tApplied Materials Inc\t30.74\t \t0.01\t0.03%", "AMGN\tAmgen Inc\t145.23\t \t-2.13\t-1.45%", 51 | "AMZN\tAmazon.com Inc\t760.16\t \t3.76\t0.50%", "ATVI\tActivision Blizzard Inc\t38.39\t \t-1.55\t-3.88%", "AVGO\tBroadcom Ltd\t168.16\t \t1.12\t0.67%", "BBBY\tBed Bath & Beyond Inc\t44.42\t \t-0.50\t-1.11%", "BIDU\tBaidu Inc\t164.38\t \t-1.83\t-1.10%", 52 | "BIIB\tBiogen Inc\t317.00\t \t-2.30\t-0.72%", "BMRN\tBiomarin Pharmaceutical Inc\t89.00\t \t-1.74\t-1.92%", "CA\tCA Inc\t31.01\t \t-0.47\t-1.49%", "CELG\tCelgene Corp\t121.97\t \t-0.11\t-0.09%", "CERN\tCerner Corp\t49.53\t \t-0.06\t-0.12%", 53 | "CHKP\tCheck Point Software Technologies Ltd\t83.41\t 
\t-0.39\t-0.47%", "CHTR\tCharter Communications Inc\t262.70\t \t-2.78\t-1.05%", "CMCSA\tComcast Corp\t68.34\t \t-0.15\t-0.22%", "COST\tCostco Wholesale Corp\t150.36\t \t-0.84\t-0.56%", 54 | "CSCO\tCisco Systems Inc\t30.18\t \t0.13\t0.43%", "CSX\tCSX Corp\t34.00\t \t0.04\t0.12%", "CTRP\tCtrip.Com International Ltd\t42.02\t \t-0.29\t-0.69%", "CTSH\tCognizant Technology Solutions Corp\t55.57\t \t-0.81\t-1.44%", "CTXS\tCitrix Systems Inc\t86.82\t \t-1.16\t-1.32%", 55 | "DISCA\tDiscovery Communications Inc\t27.50\t \t-0.53\t-1.89%", "DISCK\tDiscovery Communications Inc\t26.75\t \t-0.34\t-1.26%", "DISH\tDISH Network Corp\t55.85\t \t0.19\t0.34%", "DLTR\tDollar Tree Inc\t81.91\t \t0.26\t0.32%", "EA\tElectronic Arts\t78.99\t \t-0.63\t-0.79%", 56 | "EBAY\teBay Inc\t28.69\t \t-0.18\t-0.62%", "ESRX\tExpress Scripts Holding Co\t75.77\t \t-0.67\t-0.88%", "EXPE\tExpedia Inc\t125.67\t \t-0.91\t-0.72%", "FAST\tFastenal Co\t44.80\t \t-0.16\t-0.36%", "FB\tFacebook\t117.02\t \t-0.77\t-0.65%", 57 | "FISV\tFiserv Inc\t104.29\t \t-0.77\t-0.73%", "FOX\t21st Century Fox Class B\t27.69\t \t-0.12\t-0.43%", "FOXA\t21st Century Fox Class A\t27.82\t \t-0.10\t-0.36%", "GILD\tGilead Sciences Inc\t74.62\t \t-0.96\t-1.27%", "GOOG\tAlphabet Class C\t760.54\t \t-10.69\t-1.39%", 58 | "GOOGL\tAlphabet Class A\t775.97\t \t-10.19\t-1.30%", "HSIC\tHenry Schein Inc\t156.96\t \t-1.93\t-1.21%", "ILMN\tIllumina Inc\t131.87\t \t-2.22\t-1.66%", "INCY\tIncyte Corp\t103.63\t \t-1.92\t-1.82%", "INTC\tIntel Corp\t34.95\t \t-0.07\t-0.20%", 59 | "INTU\tIntuit Inc\t115.98\t \t2.18\t1.92%", "ISRG\tIntuitive Surgical Inc\t654.89\t \t0.29\t0.04%", "JD\tJD.com Inc\t26.45\t \t-0.30\t-1.12%", "KHC\tKraft Heinz Co\t82.53\t \t-0.31\t-0.37%", "LBTYA\tLiberty Global PLC\t32.77\t \t-0.14\t-0.43%", 60 | "LBTYK\tLiberty Global PLC\t31.76\t \t-0.24\t-0.75%", "LLTC\tLinear Technology Corp\t61.00\t \t0.19\t0.31%", "LRCX\tLam Research Corp\t104.71\t \t0.91\t0.88%", "LVNTA\tLiberty Interactive Corp\t39.76\t \t0.04\t0.10%", "MAR\tMarriott International Inc\t77.14\t \t-0.33\t-0.43%", 61 | "MAT\tMattel Inc\t30.52\t \t-0.91\t-2.90%", "MCHP\tMicrochip Technology Inc\t64.57\t \t-0.88\t-1.34%", "MDLZ\tMondelez International Inc\t42.92\t \t-0.07\t-0.16%", "MNST\tMonster Beverage Corp\t41.68\t \t-0.29\t-0.69%", "MSFT\tMicrosoft Corp\t60.35\t \t-0.29\t-0.48%", 62 | "MU\tMicron Technology Inc\t19.21\t \t0.03\t0.16%", "MXIM\tMaxim Integrated Products Inc\t40.09\t \t0.35\t0.88%", "MYL\tMylan NV\t36.47\t \t-1.09\t-2.90%", "NCLH\tNorwegian Cruise Line Holdings Ltd\t39.68\t \t-0.15\t-0.38%", "NFLX\tNetflix Inc\t115.21\t \t0.18\t0.16%", 63 | "NTAP\tNetApp Inc\t37.00\t \t0.10\t0.27%", "NTES\tNetEase Inc\t230.81\t \t-6.02\t-2.54%", "NVDA\tNVIDIA Corp\t93.36\t \t0.97\t1.05%", "NXPI\tNXP Semiconductors NV\t98.88\t \t0.82\t0.84%", "ORLY\tO Reilly Automotive Inc\t265.74\t \t-5.25\t-1.94%", 64 | "PAYX\tPaychex Inc\t55.93\t \t0.01\t0.02%", "PCAR\tPACCAR Inc\t59.78\t \t-0.24\t-0.40%", "PCLN\tThe Priceline Group\t1507.35\t \t-5.55\t-0.37%", "PYPL\tPayPal Holdings Inc\t40.08\t \t0.20\t0.50%", "QCOM\tQualcomm Inc\t67.31\t \t0.64\t0.96%", 65 | "QVCA\tLiberty Interactive Corp\t21.07\t \t-0.11\t-0.52%", "REGN\tRegeneron Pharmaceuticals Inc\t397.48\t \t-7.08\t-1.75%", "ROST\tRoss Stores Inc\t68.00\t \t2.47\t3.77%", "SBAC\tSBA Communications Corp\t100.75\t \t0.12\t0.12%", "SBUX\tStarbucks Corp\t55.77\t \t-0.08\t-0.14%", 66 | "SIRI\tSirius XM Holdings Inc\t4.56\t \t-0.02\t-0.44%", "SRCL\tStericycle Inc\t76.09\t \t0.82\t1.09%", "STX\tSeagate Technology PLC\t39.29\t \t0.06\t0.15%", 
"SWKS\tSkyworks Solutions Inc\t78.21\t \t0.26\t0.33%", "SYMC\tSymantec Corp\t23.75\t \t-0.08\t-0.34%", 67 | "TMUS\tT-Mobile US Inc\t53.59\t \t0.20\t0.37%", "TRIP\tTripAdvisor Inc\t50.79\t \t-0.18\t-0.35%", "TSCO\tTractor Supply Co\t72.91\t \t-0.25\t-0.34%", "TSLA\tTesla Motors Inc\t185.02\t \t-3.64\t-1.93%", "TXN\tTexas Instruments Inc\t72.60\t \t0.52\t0.72%", 68 | "ULTA\tUlta Salon Cosmetics and Fragrance Inc\t250.09\t \t-1.04\t-0.41%", "VIAB\tViacom Inc\t37.77\t \t-0.84\t-2.18%", "VOD\tVodafone Group PLC\t25.69\t \t-0.36\t-1.38%", "VRSK\tVerisk Analytics Inc\t83.16\t \t0.16\t0.19%", "VRTX\tVertex Pharmaceuticals Inc\t89.44\t \t-1.76\t-1.93%", 69 | "WBA\tWalgreens Boots Alliance Inc\t83.27\t \t-0.72\t-0.86%", "WDC\tWestern Digital Corp\t60.93\t \t2.13\t3.62%", "WFM\tWhole Foods Market Inc\t30.96\t \t0.02\t0.06%", "XLNX\tXilinx Inc\t52.98\t \t-0.26\t-0.49%", "YHOO\tYahoo Inc\t41.19\t \t-0.26\t-0.63%", "XRAY\tDentsply Sirona Inc\t60.16\t \t-0.79\t-1.30%" 70 | ] 71 | stream = StringStream.read_from(data).map(lambda tsv: tsv.split('\t')).parse( 72 | lambda parts: { 73 | 'symbol': parts[0], 74 | 'name': parts[1], 75 | 'price': float(parts[2]), 76 | 'change': float(parts[4]) 77 | } 78 | ) 79 | assert isinstance (stream, Stream) 80 | results = await stream.to_list() 81 | assert results[0] == { 'symbol': "AAL", 'name': "American Airlines Group Inc", 'price': 46.26, 'change': 0.43 } 82 | assert results[6] == { 'symbol': "AKAM", 'name': "Akamai Technologies Inc", 'price': 66.44, 'change': -0.16 } 83 | assert results[104] == { 'symbol': "XRAY", 'name': "Dentsply Sirona Inc", 'price': 60.16, 'change': -0.79 } 84 | 85 | text = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et\n" + 86 | "dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea\n" + 87 | "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla\n" + 88 | "pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est\n" + 89 | "laborum."] 90 | 91 | @pytest.mark.asyncio 92 | async def test_match_without_regex_groups(): 93 | res = await StringStream.read_from(text).match(r'\b\w{4}[^\w]').to_list() 94 | assert res == ["amet,", "elit,", "enim ", "quis ", "nisi ", "Duis ", "aute ", "esse ", "sint ", "sunt ", "anim "] 95 | 96 | @pytest.mark.asyncio 97 | async def test_match_with_one_regex_group(): 98 | res = await StringStream.read_from(text).match(r'\b(\w{4})[^\w]').to_list() 99 | assert res == ["amet", "elit", "enim", "quis", "nisi", "Duis", "aute", "esse", "sint", "sunt", "anim"] 100 | 101 | @pytest.mark.asyncio 102 | async def test_match_with_multiple_regex_groups(): 103 | res = await StringStream.read_from(text).match(r'\b(\w{2})(\w{2})[^\w]').to_list() 104 | assert res == ["am", "et", "el", "it", "en", "im", "qu", "is", "ni", "si", "Du", "is", "au", "te", "es", "se", "si", "nt", "su", "nt", "an", "im"] 105 | -------------------------------------------------------------------------------- /test/test_write_to.py: -------------------------------------------------------------------------------- 1 | from scramjet.streams import Stream 2 | import pytest 3 | 4 | @pytest.mark.asyncio 5 | async def test_write_to_file(): 6 | with open("test_output", 'w') as file: 7 | s = Stream.read_from("abcdef") 8 | await s.write_to(file) 9 | with open("test_output") as file: 10 | assert file.read() == "abcdef" 11 | 12 | @pytest.mark.asyncio 13 | async def test_write_to_another_stream(): 14 | s1 = Stream.read_from(range(8), max_parallel=4).map(lambda x: x+1) 15 | s2 = Stream(name="s2", max_parallel=4) 16 | await s1.write_to(s2) 17 | s2.end() 18 | assert await s2.to_list() == [1, 2, 3, 4, 5, 6, 7, 8] 19 | --------------------------------------------------------------------------------
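A minimal standalone sketch, not a file from this repository: it combines the read_from, map and reduce calls that the tests above exercise separately, assuming (as test_write_to.py and test_reduce.py suggest) that map() returns a stream on which reduce() can then be called.

import asyncio
from scramjet.streams import Stream

async def sum_of_squares():
    # read_from() accepts any iterable (as in test_write_to.py),
    # map() transforms each chunk, and reduce() folds the stream
    # into a single value (as in test_reduce.py).
    return await (
        Stream
        .read_from(range(5), max_parallel=4)
        .map(lambda x: x * x)
        .reduce(lambda acc, x: acc + x, 0)
    )

assert asyncio.run(sum_of_squares()) == 30  # 0 + 1 + 4 + 9 + 16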