├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── NOTICE.txt ├── README.md ├── VERSION.txt ├── benchmark ├── benchmark.py ├── results │ ├── read_1field_rec_per_sec.png │ ├── read_500field_rec_per_sec.png │ ├── read_datum_per_sec.png │ ├── write_1field_rec_per_sec.png │ ├── write_500field_rec_per_sec.png │ └── write_datum_per_sec.png ├── sample_schema.avsc ├── synthetic_avro.py └── synthetic_records.py ├── interop └── py.avro ├── ipc ├── HandshakeRequest.avsc └── HandshakeResponse.avsc ├── lib ├── pyAntTasks-1.3-LICENSE.txt ├── pyAntTasks-1.3.jar └── simplejson │ ├── LICENSE.txt │ ├── __init__.py │ ├── _speedups.c │ ├── decoder.py │ ├── encoder.py │ ├── scanner.py │ └── tool.py ├── scripts └── avro ├── setup.py ├── src └── spavro │ ├── __init__.py │ ├── binary.py │ ├── datafile.py │ ├── exceptions.py │ ├── fast_binary.c │ ├── fast_binary.pyx │ ├── io.py │ ├── ipc.py │ ├── new_schema.py │ ├── protocol.py │ ├── schema.py │ ├── schema_resolve.py │ ├── tether │ ├── InputProtocol.avpr │ ├── OutputProtocol.avpr │ ├── __init__.py │ ├── tether_task.py │ ├── tether_task_runner.py │ └── util.py │ ├── tool.py │ └── txipc.py ├── test ├── av_bench.py ├── gen_interop_data.py ├── sample_http_client.py ├── sample_http_server.py ├── set_avro_test_path.py ├── test_datafile.py ├── test_datafile_interop.py ├── test_io.py ├── test_ipc.py ├── test_more_schemas.py ├── test_old_vs_new.py ├── test_protocol.py ├── test_schema.py ├── test_schema_validation.py ├── test_script.py ├── test_union.py ├── test_write_read_schema_resolver.py ├── txsample_http_client.py ├── txsample_http_server.py └── word_count_task.py └── testdata ├── data ├── schema-tests.txt ├── syncInMeta.avro ├── test.avro12 ├── weather-snappy.avro ├── weather-sorted.avro ├── weather-xz.avro ├── weather.avro └── weather.json ├── interop ├── bin │ └── test_rpc_interop.sh └── rpc │ ├── add │ └── onePlusOne │ │ ├── request.avro │ │ └── response.avro │ ├── echo │ └── foo │ │ ├── request.avro │ │ └── response.avro │ └── hello │ └── world │ ├── request.avro │ └── response.avro └── schemas ├── BulkData.avpr ├── FooBarSpecificRecord.avsc ├── contexts.avdl ├── echo.avdl ├── http.avdl ├── interop.avsc ├── mail.avpr ├── namespace.avpr ├── nestedNullable.avdl ├── reserved.avsc ├── schemaevolution.avdl ├── simple.avpr ├── social.avdl ├── specialtypes.avdl ├── stringables.avdl └── weather.avsc /.gitignore: -------------------------------------------------------------------------------- 1 | # Standard Python files 2 | *.template.py 3 | *.pyc 4 | #*# 5 | ._* 6 | *.sqlite 7 | *.pyc 8 | *.pid 9 | *.log 10 | tmp/ 11 | build/ 12 | logs/* 13 | 14 | # OSX file metadata 15 | .DS_Store 16 | 17 | environment.py 18 | environments/* 19 | 20 | # Unit test artifacts 21 | .coverage_output.xml 22 | .nose_output.xml 23 | .coverage 24 | 25 | # other stuff 26 | samples 27 | output 28 | 29 | #build artifacts 30 | *.egg-info 31 | *.so 32 | dist/* 33 | 34 | #project 35 | perf_test 36 | benchmark/output 37 | profile 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | matrix: 8 | include: 9 | - python: 3.3 10 | dist: trusty 11 | sudo: true 12 | - python: 3.7 13 | dist: xenial 14 | sudo: true 15 | # command to install dependencies 16 | install: "python setup.py install" 17 | # command to run tests 18 | script: python -m pytest 19 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Spavro Changelog 2 | ----------------- 3 | 4 | 1.1.22 - Apr 9, 2019 5 | ==================== 6 | 7 | - Added support for the xz codec via a [pull request](https://github.com/pluralsight/spavro/pull/7) 8 | - Improved the error reporting / validation for integers, calling out overflows as a separate case from non-integers. 9 | - Add union test for the integer/float case 10 | 11 | 1.1.21 - Jan 19, 2019 12 | ===================== 13 | 14 | - Add fix for the case when int values are presented to a union containing a 'float' schema. Allow the int to be stored using the float schema. 15 | 16 | 1.1.20 - Oct 6, 2018 17 | ===================== 18 | 19 | - Add better handling for appending avro records when the file object isn't opened using the right mode. 20 | 21 | 1.1.18, 1.1.19 - Aug 20, 2018 22 | ===================== 23 | 24 | - Fixed bug with the schema resolver where named schemas were not being handled properly 25 | 26 | 1.1.17 - May 4, 2018 27 | ===================== 28 | 29 | - Version bump to try and fix the docs again 30 | 31 | 1.1.16 - May 4, 2018 32 | ===================== 33 | 34 | - Deploy bug: a bug with the new PyPI deploy led to the extension code not being in the package. This release has the extension code in it. 35 | 36 | 1.1.12, 1.1.13, 1.1.14, 1.1.15 - May 3, 2018 37 | ============================================ 38 | 39 | - PyPI requires a new full version to update the description; these releases attempt to add the 40 | markdown version of the description to PyPI 41 | 42 | 1.1.11 - Apr 30, 2018 43 | ===================== 44 | 45 | - Fix bug with namespace handling where names with 'dots' in them were still 46 | being concatenated with the namespace, leading to bogus names 47 | - The array data 'check' function also had a bug where it was not verifying that 48 | the datum was a list before attempting to check that all items conformed to the schema 49 | 50 | 1.1.8, 1.1.9, 1.1.10 - Mar 19, 2018 51 | =================================== 52 | 53 | - Fix bug with the C implementation of the zig-zag decoder. An additional unnecessary cast was clipping during the bit shifting for larger numbers. 54 | - Skipping 1.1.8 and 1.1.9, which were missing the cythonized C code and created incompatibilities with Python 2.7 55 | 56 | 1.1.7 - Mar 6, 2018 57 | =================== 58 | 59 | - Fix bug with the 'bytes' type in union schemas failing to parse 60 | 61 | 1.1.6 - Jan 17, 2018 62 | ==================== 63 | 64 | - Fix bug with reference types (named references) inside unions 65 | 66 | 1.1.5 - Jan 4, 2018 67 | =================== 68 | 69 | - Remove accidental debug loglevel logging directive 70 | 71 | 1.1.4 - Dec 22, 2017 72 | ==================== 73 | 74 | - Add more helpful exception messages (mainly for Python 3 with chained exceptions) that describe which field in a record datum failed and, when ints and strings mismatch, show the datum and the schema. 75 | - Fix some old py3-incompatible utility code to be py2/py3 compatible 76 | 77 | 1.1.3 - Dec 4, 2017 78 | =================== 79 | 80 | - Fix source distribution Cython file inclusion ([pull request](https://github.com/pluralsight/spavro/pull/2)) 81 | 82 | 1.1.2 - Nov 14, 2017 83 | ==================== 84 | 85 | - Add more type checking in the serializer. Some fast data types were leading to spavro not rejecting bad data.
86 | - Add tests to verify that invalid (non-schema-conforming) data is rejected 87 | 88 | 1.1.1 - Oct 31, 2017 89 | ==================== 90 | 91 | - Fix bug with Enum by adding it to the named types that can be namespaced. 92 | - Fix bug with 32-bit systems that could potentially truncate long data at 2^31 bits 93 | 94 | 1.1.0 - June 20, 2017 95 | ===================== 96 | 97 | - Add code to support pickling spavro records. This allows the use of spavro in contexts like Spark that need to serialize the data to be shipped around. 98 | 99 | 1.0.0 - June 7, 2017 100 | ==================== 101 | 102 | - First release of spavro, speedier avro for python! 103 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md LICENSE* NOTICE* 2 | recursive-include src *.pyx 3 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Spavro 2 | Copyright 2017 Pluralsight LLC 3 | 4 | This product includes software developed at Pluralsight 5 | (http://pluralsight.com) and licensed using the Apache 2.0 license. 6 | 7 | Spavro is a derivative work of Apache AVRO; below is the original NOTICE.txt 8 | from Apache Avro (https://github.com/apache/avro/tree/ddfcabcee064d137fb0b9082e8c522ab900433c2) 9 | 10 | Copyright 2010-2015 The Apache Software Foundation 11 | 12 | This product includes software developed at 13 | The Apache Software Foundation (http://www.apache.org/). 14 | 15 | NUnit license acknowledgement: 16 | 17 | | Portions Copyright © 2002-2012 Charlie Poole or Copyright © 2002-2004 James 18 | | W. Newkirk, Michael C. Two, Alexei A. Vorontsov or Copyright © 2000-2002 19 | | Philip A. Craig 20 | 21 | Based upon the representations of upstream licensors, it is understood that 22 | portions of the mapreduce API included in the Java implementation are licensed 23 | from various contributors under one or more contributor license agreements to 24 | Odiago, Inc. and were then contributed by Odiago to Apache Avro, which has now 25 | made them available under the Apache 2.0 license. The original file header text 26 | is: 27 | 28 | | Licensed to Odiago, Inc. under one or more contributor license 29 | | agreements. See the NOTICE file distributed with this work for 30 | | additional information regarding copyright ownership. Odiago, Inc. 31 | | licenses this file to you under the Apache License, Version 2.0 32 | | (the "License"); you may not use this file except in compliance 33 | | with the License. You may obtain a copy of the License at 34 | | 35 | | http://www.apache.org/licenses/LICENSE-2.0 36 | | 37 | | Unless required by applicable law or agreed to in writing, software 38 | | distributed under the License is distributed on an "AS IS" BASIS, 39 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 40 | | implied. See the License for the specific language governing 41 | | permissions and limitations under the License. 42 | 43 | The Odiago NOTICE at the time of the contribution: 44 | 45 | | This product includes software developed by Odiago, Inc. 46 | | (http://www.wibidata.com). 47 | 48 | Apache Ivy includes the following in its NOTICE file: 49 | 50 | | Apache Ivy 51 | | Copyright 2007-2010 The Apache Software Foundation 52 | | 53 | | This product includes software developed by 54 | | The Apache Software Foundation (http://www.apache.org/).
55 | | 56 | | Portions of Ivy were originally developed by 57 | | Jayasoft SARL (http://www.jayasoft.fr/) 58 | | and are licensed to the Apache Software Foundation under the 59 | | "Software Grant License Agreement" 60 | | 61 | | SSH and SFTP support is provided by the JCraft JSch package, 62 | | which is open source software, available under 63 | | the terms of a BSD style license. 64 | | The original software and related information is available 65 | | at http://www.jcraft.com/jsch/. 66 | 67 | Apache Log4Net includes the following in its NOTICE file: 68 | 69 | | Apache log4net 70 | | Copyright 2004-2015 The Apache Software Foundation 71 | | 72 | | This product includes software developed at 73 | | The Apache Software Foundation (http://www.apache.org/). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Spavro Build](https://travis-ci.org/pluralsight/spavro.svg?branch=master)](https://travis-ci.org/pluralsight/spavro) 2 | 3 | # (Sp)eedier Avro - Spavro 4 | 5 | Spavro is a fork of the [official Apache AVRO python 2 implementation](https://github.com/apache/avro) with the goal of greatly improving data read deserialization and write serialization performance. 6 | 7 | Spavro is also python 2/3 compatible (instead of a separate project / implementation). [Currently tested](https://travis-ci.org/pluralsight/spavro) using python 2.7, 3.3, 3.4, 3.5, 3.6 and 3.7. Python versions before 3.3 are not supported due to the use of unicode literals and other compatibility features. 8 | 9 | ## Implementation Details 10 | 11 | There are three primary differences between the official implementation and Spavro. First, Spavro uses a C extension, created with Cython, to accelerate some of the low level binary serialization logic. Additionally, Spavro uses a different model for handling schemas. Spavro attempts to parse the write and read schemas _once_ and only _once_ and creates recursive reader/writer functions from the schema definition. These reader/writer functions encode the type structure of the schema so no additional lookups are necessary while processing data. The last difference is that Spavro has been updated to be both Python 2 and Python 3 compatible using the `six` library. The official apache AVRO implementation has two separate codebases for Python 2 and Python 3; spavro has only one. 12 | 13 | This has the net effect of greatly improving the throughput of reading and writing individual datums, since the schema isn't interrogated for every datum. This can be especially beneficial for "compatible" schema reading where both a read and write schema are needed to be able to read a complete data set. 14 | 15 | ## Performance / Benchmarks 16 | 17 | 18 | ### Results 19 | 20 | These tests were run using an AWS `m4.large` instance running CentOS 7. They were run with the following versions: `avro-python3==1.8.2`, `fastavro==0.17.9`, `spavro==1.1.10`. Python `3.6.4` was used for the python 3 tests. 21 | 22 | The TLDR is that spavro has *14-23x* the throughput of the default Apache avro implementation and *2-4x* the throughput of the fastavro library (depending on the shape of the records).
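For reference, the API being measured is the stock Apache one. The sketch below mirrors the reader/writer setup used in `benchmark.py`; it is a minimal, illustrative example, and the one-field schema here is a stand-in for the larger generated benchmark schemas.

```python
import io
import json
import spavro.io
import spavro.schema

# Parse the schema once; spavro builds its recursive reader/writer
# functions from this parsed schema.
schema = spavro.schema.parse(json.dumps({
    "type": "record", "name": "benchmark",
    "fields": [{"name": "field0", "type": ["null", "long"], "default": None}],
}))

# Serialize one datum; the writer and encoder are created once and reused.
buffer = io.BytesIO()
writer = spavro.io.DatumWriter(schema)
writer.write({"field0": 42}, spavro.io.BinaryEncoder(buffer))

# Deserialize it again using a reader built from the same parsed schema.
buffer.seek(0)
reader = spavro.io.DatumReader(schema)
print(reader.read(spavro.io.BinaryDecoder(buffer)))  # {'field0': 42}
```

Because the schema is interrogated only at parse time, the per-datum cost is just the generated reader/writer functions; that is the effect the charts that follow measure.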
23 | 24 | ### Deserialize avro records (read) 25 | 26 | 27 | Records per second read: 28 | 29 | ![Read, 1 field, records per sec](https://github.com/pluralsight/spavro/raw/master/benchmark/results/read_1field_rec_per_sec.png?raw=true "Read, 1 field, records per sec") 30 | ![Read, 500 fields, records per sec](https://github.com/pluralsight/spavro/raw/master/benchmark/results/read_500field_rec_per_sec.png?raw=true "Read, 500 fields, records per sec") 31 | 32 | Datums per second (individual fields) read: 33 | 34 | ![Read, fields per second](https://github.com/pluralsight/spavro/raw/master/benchmark/results/read_datum_per_sec.png?raw=true "Read, fields per second") 35 | 36 | ### Serialize avro records (write) 37 | 38 | 39 | Records per second write: 40 | 41 | ![Write, 1 field, records per sec](https://github.com/pluralsight/spavro/raw/master/benchmark/results/write_1field_rec_per_sec.png?raw=true "Write, 1 field, records per sec") 42 | ![Write, 500 fields, records per sec](https://github.com/pluralsight/spavro/raw/master/benchmark/results/write_500field_rec_per_sec.png?raw=true "Write, 500 fields, records per sec") 43 | 44 | Datums per second (individual fields) write: 45 | 46 | ![Write, fields per second](https://github.com/pluralsight/spavro/raw/master/benchmark/results/write_datum_per_sec.png?raw=true "Write, fields per second") 47 | 48 | 49 | ### Methodology 50 | 51 | Benchmarks were performed with the `benchmark.py` script in the `/benchmark` path in the repository (if you'd like to run your own tests). 52 | 53 | Many of the records that led to the creation of spavro were of the form `{"type": "record", "name": "somerecord", "fields": [1 ... n fields usually with a type of the form of a union of ['null' and a primitive type]]}` so the benchmarks were created to simulate that type of record structure. I believe this is a _very_ common use case for avro, so the benchmarks were designed around this pattern. 54 | 55 | The benchmark creates a random schema of a record with a mix of string, double, long and boolean types and a random record generator to test that schema. The pseudo-random generator is seeded with the same string to make the results deterministic (but with varied records). The number of fields in the record was varied from one to 500 and the performance of the avro implementations was tested for each of the cases. 56 | 57 | The serializer and deserializer benchmarks create an array of simulated records in memory and then attempt to process them using the three different implementations as quickly as possible. This means the max working size is limited to memory (a combination of the number of records and the number of fields in the simulated record). For these benchmarks, 5M datums were processed for each run (divided by the number of fields in each record). 58 | 59 | Each run of the schema/record/implementation was repeated ten times and the time to complete was averaged. 60 | 61 | 62 | ## API 63 | 64 | Spavro keeps the default Apache library's API. This allows spavro to be a drop-in replacement for code using the existing Apache implementation. 65 | 66 | ## Tests 67 | 68 | Since the API matches the existing library, the majority of the existing Apache test suite is used to verify the correct operation of Spavro. Spavro adds some additional correctness tests to compare new vs old behaviors as well as some additional logic tests above and beyond the original library.
Some of the java-based "map reduce" tests (specifically the tether tests) were removed because Spavro does not include the java code to implement that logic. 69 | 70 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.9.0-SNAPSHOT -------------------------------------------------------------------------------- /benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import sys 6 | import random 7 | from collections import defaultdict 8 | import avro.schema 9 | import avro.io 10 | import spavro.schema 11 | import spavro.io 12 | import io 13 | from timeit import default_timer as timer 14 | from fastavro import schemaless_writer, schemaless_reader 15 | from synthetic_records import generate_sample_records, generate_random_records 16 | from synthetic_avro import create_avro 17 | 18 | 19 | class ByteStream(io.BytesIO): 20 | '''Create a context managed bytesIO object''' 21 | def __enter__(self): 22 | return self 23 | 24 | def __exit__(self, *args): 25 | self.close() 26 | return False 27 | 28 | 29 | def make_spavro_reader(schema): 30 | parsed_schema = spavro.schema.parse(json.dumps(schema)) 31 | reader = spavro.io.DatumReader(parsed_schema) 32 | def read_func(data): 33 | bytes_reader = io.BytesIO(data) 34 | decoder = spavro.io.BinaryDecoder(bytes_reader) 35 | return reader.read(decoder) 36 | return read_func 37 | 38 | 39 | def make_avro_reader(schema): 40 | if sys.version_info >= (3, 0): 41 | # why did they change it from parse to Parse in py3? huh? 42 | parsed_schema = avro.schema.Parse(json.dumps(schema)) 43 | else: 44 | parsed_schema = avro.schema.parse(json.dumps(schema)) 45 | reader = avro.io.DatumReader(parsed_schema) 46 | def read_func(data): 47 | bytes_reader = io.BytesIO(data) 48 | decoder = avro.io.BinaryDecoder(bytes_reader) 49 | return reader.read(decoder) 50 | return read_func 51 | 52 | 53 | def make_fastavro_reader(schema): 54 | def read_func(data): 55 | buffer = io.BytesIO(data) 56 | return schemaless_reader(buffer, schema) 57 | return read_func 58 | 59 | 60 | def make_avro_writer(schema, output): 61 | if sys.version_info >= (3, 0): 62 | # why did they change it from parse to Parse in py3? huh? 
63 | parsed_schema = avro.schema.Parse(json.dumps(schema)) 64 | else: 65 | parsed_schema = avro.schema.parse(json.dumps(schema)) 66 | writer = avro.io.DatumWriter(parsed_schema) 67 | encoder = avro.io.BinaryEncoder(output) 68 | def write_func(datum): 69 | writer.write(datum, encoder) 70 | return write_func 71 | 72 | 73 | def make_spavro_writer(schema, output): 74 | parsed_schema = spavro.schema.parse(json.dumps(schema)) 75 | writer = spavro.io.DatumWriter(parsed_schema) 76 | encoder = spavro.io.BinaryEncoder(output) 77 | def write_func(datum): 78 | writer.write(datum, encoder) 79 | return write_func 80 | 81 | 82 | def make_fastavro_writer(schema, output): 83 | def write_func(datum): 84 | schemaless_writer(output, schema, datum) 85 | return write_func 86 | 87 | 88 | def time_serdes(name, test_func, test_array): 89 | start_time = timer() 90 | record_count = len(test_array) 91 | for idx, record in enumerate(test_array): 92 | test_func(record) 93 | total_time = timer() - start_time 94 | return record_count, total_time 95 | 96 | 97 | def create_write_records(field_count, record_count): 98 | print("Generating sample records to serialize") 99 | # schema, test_array_generator = generate_sample_records(record_count) 100 | schema, test_array_generator = generate_random_records(field_count, record_count) 101 | test_array = list(test_array_generator) 102 | return schema, test_array 103 | 104 | 105 | def create_read_records(field_count, record_count): 106 | print("Generating sample avro to deserialize") 107 | # schema, test_array_generator = generate_sample_records(record_count) 108 | schema, test_array_generator = generate_random_records(field_count, record_count) 109 | test_array = create_avro(schema, test_array_generator) 110 | return schema, test_array 111 | 112 | 113 | def run_benchmarks(number_of_iterations=5): 114 | random.seed("ALWAYSTHESAME") 115 | results = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) 116 | # field count, record count tuples 117 | test_set = ((1, 5000000), (5, 1000000), (10, 500000), (50, 100000), (100, 50000), (500, 10000)) 118 | 119 | for field_count, record_count in test_set: 120 | schema, test_data = create_read_records(field_count, record_count) 121 | # read 122 | read_functions = [("Avro", make_avro_reader(schema)), 123 | ("Fastavro", make_fastavro_reader(schema)), 124 | ("Spavro", make_spavro_reader(schema))] 125 | 126 | for name, reader in read_functions: 127 | for i in range(number_of_iterations): 128 | print("Run #{}".format(i+1)) 129 | record_count, total_time = time_serdes(name, reader, test_data) 130 | results["read"][(field_count, record_count)][name].append(total_time) 131 | print("{}: {:.2f} records/sec".format(name, record_count / total_time)) 132 | 133 | schema, test_data = create_write_records(field_count, record_count) 134 | # write 135 | write_functions = [("Avro", make_avro_writer(schema, io.BytesIO())), 136 | ("Fastavro", make_fastavro_writer(schema, io.BytesIO())), 137 | ("Spavro", make_spavro_writer(schema, io.BytesIO()))] 138 | for name, writer in write_functions: 139 | for i in range(number_of_iterations): 140 | print("Run #{}".format(i+1)) 141 | record_count, total_time = time_serdes(name, writer, test_data) 142 | results["write"][(field_count, record_count)][name].append(total_time) 143 | print("{}: {:.2f} records/sec".format(name, record_count / total_time)) 144 | return results 145 | 146 | 147 | if __name__ == "__main__": 148 | benchmark_results = run_benchmarks() 149 | with open("benchmark_results.json", "w") as bmark_file: 150 | 
bmark_file.write(json.dumps({mode: {str(counts): times for counts, times in data.items()} for mode, data in benchmark_results.items()}))  # JSON object keys must be strings, so stringify the (field_count, record_count) tuple keys 151 | -------------------------------------------------------------------------------- /benchmark/results/read_1field_rec_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/read_1field_rec_per_sec.png -------------------------------------------------------------------------------- /benchmark/results/read_500field_rec_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/read_500field_rec_per_sec.png -------------------------------------------------------------------------------- /benchmark/results/read_datum_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/read_datum_per_sec.png -------------------------------------------------------------------------------- /benchmark/results/write_1field_rec_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/write_1field_rec_per_sec.png -------------------------------------------------------------------------------- /benchmark/results/write_500field_rec_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/write_500field_rec_per_sec.png -------------------------------------------------------------------------------- /benchmark/results/write_datum_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/benchmark/results/write_datum_per_sec.png -------------------------------------------------------------------------------- /benchmark/synthetic_avro.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import io 3 | import spavro.schema 4 | import spavro.io 5 | import json 6 | 7 | 8 | class ByteStream(io.BytesIO): 9 | '''Create a context managed bytesIO object''' 10 | def __enter__(self): 11 | return self 12 | 13 | def __exit__(self, *args): 14 | self.close() 15 | return False 16 | 17 | 18 | def make_record_serializer(schema): 19 | parsed_schema = spavro.schema.parse(json.dumps(schema)) 20 | writer = spavro.io.DatumWriter(parsed_schema) 21 | def write_func(datum): 22 | with ByteStream() as output: 23 | encoder = spavro.io.BinaryEncoder(output) 24 | writer.write(datum, encoder) 25 | return output.getvalue() 26 | return write_func 27 | 28 | def create_avro(schema, records): 29 | avro_encode = make_record_serializer(schema) 30 | return [avro_encode(record) for record in records] 31 | -------------------------------------------------------------------------------- /benchmark/synthetic_records.py: -------------------------------------------------------------------------------- 1 | import random 2 | from string import ascii_letters 3 | import json 4 | 5 | 6 | min_long = -(1 << 63) 7 | max_long = (1 << 63) - 1 8 | 9 | def random_boolean(): 10 | return True if random.random() > 0.5
else False 11 | 12 | def random_double(): 13 | return random.random() + random.randint(-64000, 64000) 14 | 15 | def random_long(): 16 | return random.randint(min_long, max_long) 17 | 18 | def random_string(): 19 | return ''.join(random.choice(ascii_letters) for i in range(random.randint(1, 30))) 20 | 21 | def null_generator(): 22 | return None 23 | 24 | 25 | generators = { 26 | 'boolean': random_boolean, 27 | 'double': random_double, 28 | 'long': random_long, 29 | 'string': random_string, 30 | } 31 | 32 | sample_unions = (["null", "long"], 33 | ["null", "string"], 34 | ["null", "boolean"], 35 | ["null", "double"]) 36 | 37 | 38 | 39 | def generate_random_schema(field_count): 40 | return {"type": "record", "name": "benchmark", "fields": [{"name": "field{}".format(idx), "type": random.choice(sample_unions), "default": "null"} for idx in range(field_count)]} 41 | 42 | 43 | def generate_records(schema, record_count): 44 | synthetic_fields = [(field['type'], [generators[field['type'][1]], null_generator]) for field in schema['fields']] 45 | for i in range(record_count): 46 | yield {"field{}".format(idx): random.choice(field[1])() for idx, field in enumerate(synthetic_fields)} 47 | 48 | 49 | def generate_sample_records(record_count): 50 | with open('sample_schema.avsc', 'r') as avro_schema: 51 | schema = json.loads(avro_schema.read()) 52 | return schema, generate_records(schema, record_count) 53 | 54 | 55 | def generate_random_records(field_count, record_count): 56 | schema = generate_random_schema(field_count) 57 | return schema, generate_records(schema, record_count) 58 | 59 | -------------------------------------------------------------------------------- /interop/py.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/interop/py.avro -------------------------------------------------------------------------------- /ipc/HandshakeRequest.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", 4 | "fields": [ 5 | {"name": "clientHash", 6 | "type": {"type": "fixed", "name": "MD5", "size": 16}}, 7 | {"name": "clientProtocol", "type": ["null", "string"]}, 8 | {"name": "serverHash", "type": "MD5"}, 9 | {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /ipc/HandshakeResponse.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "HandshakeResponse", "namespace": "org.apache.avro.ipc", 4 | "fields": [ 5 | {"name": "match", 6 | "type": {"type": "enum", "name": "HandshakeMatch", 7 | "symbols": ["BOTH", "CLIENT", "NONE"]}}, 8 | {"name": "serverProtocol", 9 | "type": ["null", "string"]}, 10 | {"name": "serverHash", 11 | "type": ["null", {"type": "fixed", "name": "MD5", "size": 16}]}, 12 | {"name": "meta", 13 | "type": ["null", {"type": "map", "values": "bytes"}]} 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /lib/pyAntTasks-1.3-LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /lib/pyAntTasks-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/lib/pyAntTasks-1.3.jar -------------------------------------------------------------------------------- /lib/simplejson/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006 Bob Ippolito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /lib/simplejson/scanner.py: -------------------------------------------------------------------------------- 1 | """JSON token scanner 2 | """ 3 | import re 4 | try: 5 | from simplejson._speedups import make_scanner as c_make_scanner 6 | except ImportError: 7 | c_make_scanner = None 8 | 9 | __all__ = ['make_scanner'] 10 | 11 | NUMBER_RE = re.compile( 12 | r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?', 13 | (re.VERBOSE | re.MULTILINE | re.DOTALL)) 14 | 15 | def py_make_scanner(context): 16 | parse_object = context.parse_object 17 | parse_array = context.parse_array 18 | parse_string = context.parse_string 19 | match_number = NUMBER_RE.match 20 | encoding = context.encoding 21 | strict = context.strict 22 | parse_float = context.parse_float 23 | parse_int = context.parse_int 24 | parse_constant = context.parse_constant 25 | object_hook = context.object_hook 26 | 27 | def _scan_once(string, idx): 28 | try: 29 | nextchar = string[idx] 30 | except IndexError: 31 | raise StopIteration 32 | 33 | if nextchar == '"': 34 | return parse_string(string, idx + 1, encoding, strict) 35 | elif nextchar == '{': 36 | return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook) 37 | elif nextchar == '[': 38 | return parse_array((string, idx + 1), _scan_once) 39 | elif nextchar == 'n' and string[idx:idx + 4] == 'null': 40 | return None, idx + 4 41 | elif nextchar == 't' and string[idx:idx + 4] == 'true': 42 | return True, idx + 4 43 | elif nextchar == 'f' and string[idx:idx + 5] == 'false': 44 | return False, idx + 5 45 | 46 | m = match_number(string, idx) 47 | if m is not None: 48 | integer, frac, exp = m.groups() 49 | if frac or exp: 50 | res = parse_float(integer + (frac or '') + (exp or '')) 51 | else: 52 | res = parse_int(integer) 53 | return res, m.end() 54 | elif nextchar == 'N' and string[idx:idx + 3] == 'NaN': 55 | return parse_constant('NaN'), idx + 3 56 | elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity': 57 | return parse_constant('Infinity'), idx + 8 58 | elif nextchar == '-' and string[idx:idx + 9] == '-Infinity': 59 | return parse_constant('-Infinity'), idx + 9 60 | else: 61 | raise StopIteration 62 | 63 | return _scan_once 64 | 65 | make_scanner = c_make_scanner or py_make_scanner 66 | -------------------------------------------------------------------------------- /lib/simplejson/tool.py: -------------------------------------------------------------------------------- 1 | r"""Command-line tool to validate and pretty-print JSON 2 | 3 | Usage:: 4 | 5 | $ echo '{"json":"obj"}' | python -m simplejson.tool 6 | { 7 | "json": "obj" 8 | } 9 | $ echo '{ 1.2:3.4}' | python -m simplejson.tool 10 | Expecting property name: line 1 column 2 (char 2) 11 | 12 | """ 13 | import sys 14 | import simplejson 15 | 16 | def main(): 17 | if len(sys.argv) == 1: 18 | infile = sys.stdin 19 | outfile = sys.stdout 20 | elif len(sys.argv) == 2: 21 | infile = open(sys.argv[1], 'rb') 22 | outfile = sys.stdout 23 | elif len(sys.argv) == 3: 24 | infile = open(sys.argv[1], 'rb') 25 | outfile = open(sys.argv[2], 'wb') 26 | else: 27 | raise SystemExit(sys.argv[0] + " [infile [outfile]]") 28 | try: 29 | obj = simplejson.load(infile) 30 | except ValueError, e: 31 | raise SystemExit(e) 32 | simplejson.dump(obj, outfile, sort_keys=True, indent=4) 33 | outfile.write('\n') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- 
/scripts/avro: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """Command line utility for reading and writing Avro files.""" 19 | 20 | from spavro.io import DatumReader, DatumWriter 21 | from spavro.datafile import DataFileReader, DataFileWriter 22 | import spavro.schema 23 | 24 | try: 25 | import json 26 | except ImportError: 27 | import simplejson as json 28 | import csv 29 | from sys import stdout, stdin 30 | import six 31 | from six.moves import filter as ifilter 32 | from six.moves import map as imap 33 | from six.moves import range as xrange 34 | from six import string_types as basestring 35 | # from itertools import ifilter, imap 36 | from functools import partial 37 | from os.path import splitext 38 | import sys 39 | 40 | # python3 support 41 | try: 42 | type(long) 43 | except NameError: 44 | long = int 45 | 46 | binstdin = getattr(sys.stdin, "buffer", sys.stdin) 47 | binstdout = getattr(sys.stdout, "buffer", sys.stdout) 48 | 49 | 50 | class AvroError(Exception): 51 | pass 52 | 53 | 54 | def print_json(row): 55 | print(json.dumps(row)) 56 | 57 | 58 | def print_json_pretty(row): 59 | print(json.dumps(row, indent=4, sort_keys=True)) 60 | 61 | 62 | _write_row = csv.writer(stdout).writerow 63 | _encoding = stdout.encoding or "UTF-8" 64 | 65 | 66 | def _encode(v, encoding=_encoding): 67 | if not isinstance(v, basestring): 68 | return v 69 | return v.encode(encoding) 70 | 71 | 72 | def print_csv(row): 73 | # We sort the keys so the fields will be in the same place 74 | # FIXME: Do we want to do it in schema order?
75 | _write_row([_encode(row[key]) for key in sorted(row)]) 76 | 77 | 78 | def select_printer(format): 79 | return { 80 | "json": print_json, 81 | "json-pretty": print_json_pretty, 82 | "csv": print_csv 83 | }[format] 84 | 85 | def record_match(expr, record): 86 | return eval(expr, None, {"r" : record}) 87 | 88 | def parse_fields(fields): 89 | fields = fields or '' 90 | if not fields.strip(): 91 | return None 92 | 93 | return [field.strip() for field in fields.split(',') if field.strip()] 94 | 95 | def field_selector(fields): 96 | fields = set(fields) 97 | def keys_filter(obj): 98 | return dict((k, obj[k]) for k in (set(obj) & fields)) 99 | return keys_filter 100 | 101 | def print_avro(avro, opts): 102 | if opts.header and (opts.format != "csv"): 103 | raise AvroError("--header applies only to CSV format") 104 | 105 | # Apply filter first 106 | if opts.filter: 107 | avro = ifilter(partial(record_match, opts.filter), avro) 108 | 109 | for i in xrange(opts.skip): 110 | try: 111 | next(avro) 112 | except StopIteration: 113 | return 114 | 115 | fields = parse_fields(opts.fields) 116 | if fields: 117 | avro = imap(field_selector(fields), avro) 118 | 119 | printer = select_printer(opts.format) 120 | for i, record in enumerate(avro): 121 | if i == 0 and opts.header: 122 | _write_row(sorted(record.keys())) 123 | if i >= opts.count: 124 | break 125 | printer(record) 126 | 127 | 128 | def print_schema(avro): 129 | schema = avro.meta["avro.schema"].decode('utf-8') 130 | # Pretty print 131 | print(json.dumps(json.loads(schema), indent=4, sort_keys=True)) 132 | 133 | 134 | def cat(opts, args): 135 | if not args: 136 | raise AvroError("No files to show") 137 | 138 | for filename in args: 139 | try: 140 | fo = open(filename, "rb") 141 | except (OSError, IOError) as e: 142 | raise AvroError("Can't open %s - %s" % (filename, e)) 143 | 144 | avro = DataFileReader(fo, DatumReader()) 145 | 146 | if opts.print_schema: 147 | print_schema(avro) 148 | continue 149 | 150 | print_avro(avro, opts) 151 | 152 | 153 | def _open(filename, mode): 154 | if filename == "-": 155 | return { 156 | "rb": binstdin, 157 | "wb": binstdout 158 | }[mode] 159 | 160 | return open(filename, mode) 161 | 162 | 163 | def iter_json(info, _): 164 | return imap(lambda x: json.loads(x.decode('utf-8')), info) 165 | 166 | 167 | def convert(value, field): 168 | type = field.type.type 169 | if type == "union": 170 | return convert_union(value, field) 171 | 172 | return { 173 | "int": int, 174 | "long": long, 175 | "float": float, 176 | "double": float, 177 | "string": str, 178 | "bytes": str, 179 | "boolean": bool, 180 | "null": lambda _: None, 181 | "union": lambda v: convert_union(v, field), 182 | }[type](value) 183 | 184 | 185 | def convert_union(value, field): 186 | for branch in field.type.schemas:  # try each union branch until one converts cleanly 187 | try: 188 | return {"int": int, "long": long, "float": float, "double": float, "string": str, "bytes": str, "boolean": bool, "null": lambda _: None}[branch.type](value) 189 | except (KeyError, ValueError): 190 | continue 191 | 192 | 193 | def iter_csv(info, schema): 194 | header = [field.name for field in schema.fields] 195 | for row in csv.reader((x.decode('utf-8') for x in info)): 196 | values = [convert(v, f) for v, f in zip(row, schema.fields)] 197 | yield dict(zip(header, values)) 198 | 199 | 200 | def guess_input_type(files): 201 | if not files: 202 | return None 203 | 204 | ext = splitext(files[0])[1].lower() 205 | if ext in (".json", ".js"): 206 | return "json" 207 | elif ext in (".csv",): 208 | return "csv" 209 | 210 | return None 211 | 212 | 213 | def write(opts, files): 214 | if not opts.schema: 215 | raise AvroError("No schema specified") 216 | 217 |
input_type = opts.input_type or guess_input_type(files) 218 | if not input_type: 219 | raise AvroError("Can't guess input file type (not .json or .csv)") 220 | 221 | try: 222 | schema = spavro.schema.parse(open(opts.schema, "rb").read().decode('utf-8')) 223 | out = _open(opts.output, "wb") 224 | except (IOError, OSError) as e: 225 | raise AvroError("Can't open file - %s" % e) 226 | 227 | writer = DataFileWriter(out, DatumWriter(), schema) 228 | iter_records = {"json": iter_json, "csv": iter_csv}[input_type] 229 | for filename in (files or ["-"]): 230 | info = _open(filename, "rb") 231 | for record in iter_records(info, schema): 232 | writer.append(record) 233 | 234 | writer.close() 235 | 236 | 237 | def main(argv=None): 238 | import sys 239 | from optparse import OptionParser, OptionGroup 240 | 241 | argv = argv or sys.argv 242 | 243 | parser = OptionParser(description="Display/write for Avro files", 244 | version="1.8.1", 245 | usage="usage: %prog cat|write [options] FILE [FILE...]") 246 | # cat options 247 | 248 | cat_options = OptionGroup(parser, "cat options") 249 | cat_options.add_option("-n", "--count", default=float("Infinity"), 250 | help="number of records to print", type=int) 251 | cat_options.add_option("-s", "--skip", help="number of records to skip", 252 | type=int, default=0) 253 | cat_options.add_option("-f", "--format", help="record format", 254 | default="json", 255 | choices=["json", "csv", "json-pretty"]) 256 | cat_options.add_option("--header", help="print CSV header", default=False, 257 | action="store_true") 258 | cat_options.add_option("--filter", help="filter records (e.g. r['age']>1)", 259 | default=None) 260 | cat_options.add_option("--print-schema", help="print schema", 261 | action="store_true", default=False) 262 | cat_options.add_option('--fields', default=None, 263 | help='fields to show, comma separated (show all by default)') 264 | parser.add_option_group(cat_options) 265 | 266 | # write options 267 | write_options = OptionGroup(parser, "write options") 268 | write_options.add_option("--schema", help="schema file (required)") 269 | write_options.add_option("--input-type", 270 | help="input file(s) type (json or csv)", 271 | choices=["json", "csv"], default=None) 272 | write_options.add_option("-o", "--output", help="output file", default="-") 273 | parser.add_option_group(write_options) 274 | 275 | opts, args = parser.parse_args(argv[1:]) 276 | if len(args) < 1: 277 | parser.error("You must specify `cat` or `write`") # Will exit 278 | 279 | command = args.pop(0) 280 | try: 281 | if command == "cat": 282 | cat(opts, args) 283 | elif command == "write": 284 | write(opts, args) 285 | else: 286 | raise AvroError("Unknown command - %s" % command) 287 | except AvroError as e: 288 | parser.error("%s" % e) # Will exit 289 | except Exception as e: 290 | raise SystemExit("panic: %s" % e) 291 | 292 | if __name__ == "__main__": 293 | main() 294 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | try: 4 | from setuptools import setup 5 | except ImportError: 6 | from distutils.core import setup 7 | from sys import version_info 8 | from distutils.extension import Extension 9 | 10 | try: 11 | from Cython.Build import cythonize 12 | USE_CYTHON = True 13 | except ImportError: 14 | USE_CYTHON = False 15 | 16 | install_requires = ['six>=1.10.0'] 17 | if version_info[:2] <= (2, 5): 18 | install_requires.append('simplejson >= 2.0.9') 19 | 20 | ext = '.pyx' if USE_CYTHON else '.c' 21 | extensions = [Extension("spavro.fast_binary", sources=["src/spavro/fast_binary" + ext])] 22 | 23 | if USE_CYTHON: 24 | extensions = cythonize(extensions) 25 | 26 | try: 27 | with open("README.md", "r") as readmefile: 28 | readme_data = readmefile.read() 29 | except IOError: 30 | readme_data = '' 31 | 32 | setup( 33 | name='spavro', 34 | version='1.1.22', 35 | packages=['spavro'], 36 | package_dir={'': 'src'}, 37 | # scripts=["./scripts/avro"], 38 | # Project uses simplejson, so ensure that it gets installed or upgraded 39 | # on the target machine 40 | install_requires=install_requires, 41 | # spavro C extensions 42 | ext_modules=extensions, 43 | # metadata for upload to PyPI 44 | long_description_content_type='text/markdown', 45 | long_description=readme_data, 46 | author='Michael Kowalchik', 47 | author_email='mikepk@pluralsight.com', 48 | description='Spavro is a (sp)eedier Avro library for Python built with Cython. It is a fork of the official Apache Avro Python 2 implementation, with the goal of greatly improving data read deserialization and write serialization performance.', 49 | license='Apache License 2.0', 50 | keywords='avro serialization rpc data', 51 | url='http://github.com/pluralsight/spavro', 52 | extras_require={ 53 | 'snappy': ['python-snappy'], 54 | 'test': ['pytest>=3.1.1'], 55 | }, 56 | classifiers=[ 57 | "Development Status :: 5 - Production/Stable", 58 | "Intended Audience :: Developers", 59 | "License :: OSI Approved :: Apache Software License", 60 | "Programming Language :: Python", 61 | "Programming Language :: Python :: 2.7", 62 | "Programming Language :: Python :: 3.3", 63 | "Programming Language :: Python :: 3.4", 64 | "Programming Language :: Python :: 3.5", 65 | "Programming Language :: Python :: 3.6", 66 | "Topic :: Software Development :: Libraries", 67 | "Topic :: System :: Networking", 68 | "Operating System :: OS Independent", 69 | ] 70 | ) 71 | -------------------------------------------------------------------------------- /src/spavro/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | __all__ = ['schema', 'io', 'datafile', 'protocol', 'ipc'] 18 | 19 | -------------------------------------------------------------------------------- /src/spavro/binary.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import sys 20 | import struct 21 | from binascii import crc32 22 | from spavro import schema 23 | import six 24 | 25 | # TODO(hammer): shouldn't ! be < for little-endian (according to spec?) 26 | if sys.version_info >= (2, 5, 0): 27 | struct_class = struct.Struct 28 | else: 29 | class SimpleStruct(object): 30 | def __init__(self, format): 31 | self.format = format 32 | 33 | def pack(self, *args): 34 | return struct.pack(self.format, *args) 35 | 36 | def unpack(self, *args): 37 | return struct.unpack(self.format, *args) 38 | struct_class = SimpleStruct 39 | 40 | STRUCT_INT = struct_class('!I') # big-endian unsigned int 41 | STRUCT_LONG = struct_class('!Q') # big-endian unsigned long long 42 | STRUCT_FLOAT = struct_class('!f') # big-endian float 43 | STRUCT_DOUBLE = struct_class('!d') # big-endian double 44 | STRUCT_CRC32 = struct_class('>I') # big-endian unsigned int 45 | 46 | 47 | class BinaryDecoder(object): 48 | """Read leaf values.""" 49 | def __init__(self, reader): 50 | """ 51 | reader is a Python object on which we can call read, seek, and tell. 52 | """ 53 | self._reader = reader 54 | 55 | # read-only properties 56 | reader = property(lambda self: self._reader) 57 | 58 | def read(self, n): 59 | """ 60 | Read n bytes. 61 | """ 62 | return self.reader.read(n) 63 | 64 | def read_null(self): 65 | """ 66 | null is written as zero bytes 67 | """ 68 | return None 69 | 70 | def read_boolean(self): 71 | """ 72 | a boolean is written as a single byte 73 | whose value is either 0 (false) or 1 (true). 74 | """ 75 | return ord(self.read(1)) == 1 76 | 77 | def read_int(self): 78 | """ 79 | int and long values are written using variable-length, zig-zag coding. 80 | """ 81 | return self.read_long() 82 | 83 | def read_long(self): 84 | """ 85 | int and long values are written using variable-length, zig-zag coding. 86 | """ 87 | b = ord(self.read(1)) 88 | n = b & 0x7F 89 | shift = 7 90 | while (b & 0x80) != 0: 91 | b = ord(self.read(1)) 92 | n |= (b & 0x7F) << shift 93 | shift += 7 94 | datum = (n >> 1) ^ -(n & 1) 95 | return datum 96 | 97 | def read_float(self): 98 | """ 99 | A float is written as 4 bytes. 100 | The float is converted into a 32-bit integer using a method equivalent to 101 | Java's floatToIntBits and then encoded in little-endian format. 
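        For example, the encoded bytes 00 00 80 3F decode to the value 1.0
        (0x3F800000 reassembled from little-endian byte order).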
102 | """ 103 | bits = (((ord(self.read(1)) & 0xff)) | 104 | ((ord(self.read(1)) & 0xff) << 8) | 105 | ((ord(self.read(1)) & 0xff) << 16) | 106 | ((ord(self.read(1)) & 0xff) << 24)) 107 | return STRUCT_FLOAT.unpack(STRUCT_INT.pack(bits))[0] 108 | 109 | def read_double(self): 110 | """ 111 | A double is written as 8 bytes. 112 | The double is converted into a 64-bit integer using a method equivalent to 113 | Java's doubleToLongBits and then encoded in little-endian format. 114 | """ 115 | bits = (((ord(self.read(1)) & 0xff)) | 116 | ((ord(self.read(1)) & 0xff) << 8) | 117 | ((ord(self.read(1)) & 0xff) << 16) | 118 | ((ord(self.read(1)) & 0xff) << 24) | 119 | ((ord(self.read(1)) & 0xff) << 32) | 120 | ((ord(self.read(1)) & 0xff) << 40) | 121 | ((ord(self.read(1)) & 0xff) << 48) | 122 | ((ord(self.read(1)) & 0xff) << 56)) 123 | return STRUCT_DOUBLE.unpack(STRUCT_LONG.pack(bits))[0] 124 | 125 | def read_bytes(self): 126 | """ 127 | Bytes are encoded as a long followed by that many bytes of data. 128 | """ 129 | return self.read(self.read_long()) 130 | 131 | def read_utf8(self): 132 | """ 133 | A string is encoded as a long followed by 134 | that many bytes of UTF-8 encoded character data. 135 | """ 136 | return six.text_type(self.read_bytes(), "utf-8") 137 | 138 | def check_crc32(self, bytes): 139 | checksum = STRUCT_CRC32.unpack(self.read(4))[0]; 140 | if crc32(bytes) & 0xffffffff != checksum: 141 | raise schema.AvroException("Checksum failure") 142 | 143 | def skip_null(self): 144 | pass 145 | 146 | def skip_boolean(self): 147 | self.skip(1) 148 | 149 | def skip_int(self): 150 | self.skip_long() 151 | 152 | def skip_long(self): 153 | b = ord(self.read(1)) 154 | while (b & 0x80) != 0: 155 | b = ord(self.read(1)) 156 | 157 | def skip_float(self): 158 | self.skip(4) 159 | 160 | def skip_double(self): 161 | self.skip(8) 162 | 163 | def skip_bytes(self): 164 | self.skip(self.read_long()) 165 | 166 | def skip_utf8(self): 167 | self.skip_bytes() 168 | 169 | def skip(self, n): 170 | self.reader.seek(self.reader.tell() + n) 171 | 172 | 173 | class BinaryEncoder(object): 174 | """Write leaf values.""" 175 | def __init__(self, writer): 176 | """ 177 | writer is a Python object on which we can call write. 178 | """ 179 | self._writer = writer 180 | 181 | # read-only properties 182 | writer = property(lambda self: self._writer) 183 | 184 | def write(self, datum): 185 | """Write an abritrary datum.""" 186 | self.writer.write(datum) 187 | 188 | def write_null(self, datum): 189 | """ 190 | null is written as zero bytes 191 | """ 192 | pass 193 | 194 | def write_boolean(self, datum): 195 | """ 196 | a boolean is written as a single byte 197 | whose value is either 0 (false) or 1 (true). 198 | """ 199 | if datum: 200 | self.write(six.int2byte(1)) 201 | else: 202 | self.write(six.int2byte(0)) 203 | 204 | def write_int(self, datum): 205 | """ 206 | int and long values are written using variable-length, zig-zag coding. 207 | """ 208 | self.write_long(datum); 209 | 210 | def write_long(self, datum): 211 | """ 212 | int and long values are written using variable-length, zig-zag coding. 213 | """ 214 | datum = (datum << 1) ^ (datum >> 63) 215 | while (datum & ~0x7F) != 0: 216 | self.write(six.int2byte((datum & 0x7f) | 0x80)) 217 | datum >>= 7 218 | self.write(six.int2byte(datum)) 219 | 220 | def write_float(self, datum): 221 | """ 222 | A float is written as 4 bytes. 223 | The float is converted into a 32-bit integer using a method equivalent to 224 | Java's floatToIntBits and then encoded in little-endian format. 
225 | """ 226 | bits = STRUCT_INT.unpack(STRUCT_FLOAT.pack(datum))[0] 227 | self.write(six.int2byte((bits) & 0xFF)) 228 | self.write(six.int2byte((bits >> 8) & 0xFF)) 229 | self.write(six.int2byte((bits >> 16) & 0xFF)) 230 | self.write(six.int2byte((bits >> 24) & 0xFF)) 231 | 232 | def write_double(self, datum): 233 | """ 234 | A double is written as 8 bytes. 235 | The double is converted into a 64-bit integer using a method equivalent to 236 | Java's doubleToLongBits and then encoded in little-endian format. 237 | """ 238 | bits = STRUCT_LONG.unpack(STRUCT_DOUBLE.pack(datum))[0] 239 | self.write(six.int2byte((bits) & 0xFF)) 240 | self.write(six.int2byte((bits >> 8) & 0xFF)) 241 | self.write(six.int2byte((bits >> 16) & 0xFF)) 242 | self.write(six.int2byte((bits >> 24) & 0xFF)) 243 | self.write(six.int2byte((bits >> 32) & 0xFF)) 244 | self.write(six.int2byte((bits >> 40) & 0xFF)) 245 | self.write(six.int2byte((bits >> 48) & 0xFF)) 246 | self.write(six.int2byte((bits >> 56) & 0xFF)) 247 | 248 | def write_bytes(self, datum): 249 | """ 250 | Bytes are encoded as a long followed by that many bytes of data. 251 | """ 252 | self.write_long(len(datum)) 253 | self.write(struct.pack('%ds' % len(datum), datum)) 254 | 255 | def write_utf8(self, datum): 256 | """ 257 | A string is encoded as a long followed by 258 | that many bytes of UTF-8 encoded character data. 259 | """ 260 | datum = datum.encode("utf-8") 261 | self.write_bytes(datum) 262 | 263 | def write_crc32(self, bytes): 264 | """ 265 | A 4-byte, big-endian CRC32 checksum 266 | """ 267 | self.write(STRUCT_CRC32.pack(crc32(bytes) & 0xffffffff)) 268 | -------------------------------------------------------------------------------- /src/spavro/exceptions.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from spavro import schema 20 | import json 21 | 22 | 23 | class AvroTypeException(schema.AvroException): 24 | """Raised when datum is not an example of schema.""" 25 | def __init__(self, expected_schema, datum): 26 | pretty_expected = json.dumps(json.loads(str(expected_schema)), indent=2) 27 | fail_msg = "The datum %s is not an example of the schema %s"\ 28 | % (datum, pretty_expected) 29 | schema.AvroException.__init__(self, fail_msg) 30 | 31 | 32 | class SchemaResolutionException(schema.AvroException): 33 | def __init__(self, fail_msg, writers_schema=None, readers_schema=None): 34 | if writers_schema: 35 | pretty_writers = json.dumps(json.loads(str(writers_schema)), indent=2) 36 | fail_msg += "\nWriter's Schema: %s" % pretty_writers 37 | if readers_schema: 38 | pretty_readers = json.dumps(json.loads(str(readers_schema)), indent=2) 39 | fail_msg += "\nReader's Schema: %s" % pretty_readers 40 | schema.AvroException.__init__(self, fail_msg) 41 | -------------------------------------------------------------------------------- /src/spavro/new_schema.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | PRIMITIVE = ( 4 | u'null', 5 | u'boolean', 6 | u'string', 7 | u'bytes', 8 | u'int', 9 | u'long', 10 | u'float', 11 | u'double', 12 | ) 13 | 14 | 15 | class Schema(object): 16 | def to_json(self): 17 | raise NotImplementedError() 18 | 19 | def __str__(self): 20 | return str(self.type) 21 | 22 | def __repr__(self): 23 | return "<{} type='{}'>".format(self.__class__.__name__, self) 24 | 25 | 26 | class PrimitiveSchema(Schema): 27 | def __init__(self, schema_name): 28 | self.type = schema_name 29 | 30 | 31 | class RecordField(object): 32 | def __init__(self, fielddef): 33 | self.name = fielddef['name'] 34 | self.type = parse_schema(fielddef['type']) 35 | 36 | def __str__(self): 37 | return str(self.type) 38 | 39 | def __repr__(self): 40 | return "<{} type='{}'>".format(self.__class__.__name__, self) 41 | 42 | 43 | class RecordSchema(Schema): 44 | def __init__(self, schema): 45 | self.name = schema['name'] 46 | self.type = schema['type'] 47 | self.fields = [RecordField(field) for field in schema['fields']] 48 | 49 | 50 | class UnionSchema(Schema): 51 | def __init__(self, schemas, names=None): 52 | self.type = 'union' 53 | self.schemas = [parse_schema(schema, names) for schema in schemas] 54 | 55 | 56 | class EnumSchema(Schema): 57 | def __init__(self, schema): 58 | self.type = 'enum' 59 | self.symbols = schema['symbols'] 60 | self.name = schema.get('name', None) 61 | 62 | 63 | class ArraySchema(Schema): 64 | def __init__(self, schema): 65 | raise NotImplementedError() 66 | 67 | 68 | class MapSchema(Schema): 69 | def __init__(self, schema): 70 | raise NotImplementedError() 71 | 72 | 73 | class FixedSchema(Schema): 74 | def __init__(self, schema): 75 | raise NotImplementedError() 76 | 77 | 78 | # all complex types are represented by dictionaries 79 | complex_types = { 80 | 'record': RecordSchema, 81 | 'enum': EnumSchema, 82 | 'array': ArraySchema, 83 | 'map': MapSchema, 84 | 'fixed': FixedSchema 85 | } 86 | 87 | 88 | def parse_schema(schema, names=None): 89 | if type(schema) is list: 90 | return UnionSchema(schema) 91 | elif type(schema) is dict: 92 | if schema['type'] in complex_types: 93 | return complex_types[schema['type']](schema) 94 | elif schema['type'] in PRIMITIVE: 95 | # could add if 'logicalType' in schema as a double guard 96 | # this handles annotated schemas and logical types 97 | # ignores everything else in the dictionary 98 | return
parse_schema(schema['type']) 99 | elif schema in PRIMITIVE: 100 | return PrimitiveSchema(schema) 101 | 102 | raise Exception("Invalid schema: {}".format(schema)) 103 | -------------------------------------------------------------------------------- /src/spavro/protocol.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """ 19 | Protocol implementation. 20 | """ 21 | import six 22 | try: 23 | from hashlib import md5 24 | except ImportError: 25 | from md5 import md5 26 | try: 27 | import json 28 | except ImportError: 29 | import simplejson as json 30 | from spavro import schema 31 | 32 | # 33 | # Constants 34 | # 35 | 36 | # TODO(hammer): confirmed 'fixed' with Doug 37 | VALID_TYPE_SCHEMA_TYPES = ('enum', 'record', 'error', 'fixed') 38 | 39 | # 40 | # Exceptions 41 | # 42 | 43 | 44 | class ProtocolParseException(schema.AvroException): 45 | pass 46 | 47 | # 48 | # Base Classes 49 | # 50 | 51 | 52 | class Protocol(object): 53 | """An application protocol.""" 54 | def _parse_types(self, types, type_names): 55 | type_objects = [] 56 | for type in types: 57 | type_object = schema.make_avsc_object(type, type_names) 58 | if type_object.type not in VALID_TYPE_SCHEMA_TYPES: 59 | fail_msg = 'Type %s not an enum, fixed, record, or error.' % type 60 | raise ProtocolParseException(fail_msg) 61 | type_objects.append(type_object) 62 | return type_objects 63 | 64 | def _parse_messages(self, messages, names): 65 | message_objects = {} 66 | for name, body in six.iteritems(messages): 67 | if name in message_objects: 68 | fail_msg = 'Message name "%s" repeated.' % name 69 | raise ProtocolParseException(fail_msg) 70 | elif not(hasattr(body, 'get') and callable(body.get)): 71 | fail_msg = 'Message name "%s" has non-object body %s.' % (name, body) 72 | raise ProtocolParseException(fail_msg) 73 | request = body.get('request') 74 | response = body.get('response') 75 | errors = body.get('errors') 76 | message_objects[name] = Message(name, request, response, errors, names) 77 | return message_objects 78 | 79 | def __init__(self, name, namespace=None, types=None, messages=None): 80 | # Ensure valid ctor args 81 | if not name: 82 | fail_msg = 'Protocols must have a non-empty name.' 83 | raise ProtocolParseException(fail_msg) 84 | elif not isinstance(name, six.string_types): 85 | fail_msg = 'The name property must be a string.' 86 | raise ProtocolParseException(fail_msg) 87 | elif namespace is not None and not isinstance(namespace, six.string_types): 88 | fail_msg = 'The namespace property must be a string.' 
89 | raise ProtocolParseException(fail_msg) 90 | elif types is not None and not isinstance(types, list): 91 | fail_msg = 'The types property must be a list.' 92 | raise ProtocolParseException(fail_msg) 93 | elif (messages is not None and 94 | not(hasattr(messages, 'get') and callable(messages.get))): 95 | fail_msg = 'The messages property must be a JSON object.' 96 | raise ProtocolParseException(fail_msg) 97 | 98 | self._props = {} 99 | self.set_prop('name', name) 100 | type_names = schema.Names() 101 | if namespace is not None: 102 | self.set_prop('namespace', namespace) 103 | type_names.default_namespace = namespace 104 | if types is not None: 105 | self.set_prop('types', self._parse_types(types, type_names)) 106 | if messages is not None: 107 | self.set_prop('messages', self._parse_messages(messages, type_names)) 108 | self._md5 = md5(str(self).encode('utf-8')).digest() 109 | 110 | # read-only properties 111 | name = property(lambda self: self.get_prop('name')) 112 | namespace = property(lambda self: self.get_prop('namespace')) 113 | fullname = property(lambda self: 114 | schema.Name(self.name, self.namespace).fullname) 115 | types = property(lambda self: self.get_prop('types')) 116 | types_dict = property(lambda self: dict([(type.name, type) 117 | for type in self.types])) 118 | messages = property(lambda self: self.get_prop('messages')) 119 | md5 = property(lambda self: self._md5) 120 | props = property(lambda self: self._props) 121 | 122 | # utility functions to manipulate properties dict 123 | def get_prop(self, key): 124 | return self.props.get(key) 125 | def set_prop(self, key, value): 126 | self.props[key] = value 127 | 128 | def to_json(self): 129 | to_dump = {} 130 | to_dump['protocol'] = self.name 131 | names = schema.Names(default_namespace=self.namespace) 132 | if self.namespace: 133 | to_dump['namespace'] = self.namespace 134 | if self.types: 135 | to_dump['types'] = [ t.to_json(names) for t in self.types ] 136 | if self.messages: 137 | messages_dict = {} 138 | for name, body in six.iteritems(self.messages): 139 | messages_dict[name] = body.to_json(names) 140 | to_dump['messages'] = messages_dict 141 | return to_dump 142 | 143 | def __str__(self): 144 | return json.dumps(self.to_json()) 145 | 146 | def __eq__(self, that): 147 | to_cmp = json.loads(str(self)) 148 | return to_cmp == json.loads(str(that)) 149 | 150 | 151 | class Message(object): 152 | """A Protocol message.""" 153 | def _parse_request(self, request, names): 154 | if not isinstance(request, list): 155 | fail_msg = 'Request property not a list: %s' % request 156 | raise ProtocolParseException(fail_msg) 157 | return schema.RecordSchema(None, None, request, names, 'request') 158 | 159 | def _parse_response(self, response, names): 160 | if isinstance(response, six.string_types) and names.has_name(response, None): 161 | return names.get_name(response, None) 162 | else: 163 | return schema.make_avsc_object(response, names) 164 | 165 | def _parse_errors(self, errors, names): 166 | if not isinstance(errors, list): 167 | fail_msg = 'Errors property not a list: %s' % errors 168 | raise ProtocolParseException(fail_msg) 169 | errors_for_parsing = {'type': 'error_union', 'declared_errors': errors} 170 | return schema.make_avsc_object(errors_for_parsing, names) 171 | 172 | def __init__(self, name, request, response, errors=None, names=None): 173 | self._name = name 174 | 175 | self._props = {} 176 | self.set_prop('request', self._parse_request(request, names)) 177 | self.set_prop('response', self._parse_response(response, 
names)) 178 | if errors is not None: 179 | self.set_prop('errors', self._parse_errors(errors, names)) 180 | 181 | # read-only properties 182 | name = property(lambda self: self._name) 183 | request = property(lambda self: self.get_prop('request')) 184 | response = property(lambda self: self.get_prop('response')) 185 | errors = property(lambda self: self.get_prop('errors')) 186 | props = property(lambda self: self._props) 187 | 188 | # utility functions to manipulate properties dict 189 | def get_prop(self, key): 190 | return self.props.get(key) 191 | def set_prop(self, key, value): 192 | self.props[key] = value 193 | 194 | def __str__(self): 195 | return json.dumps(self.to_json()) 196 | 197 | def to_json(self, names=None): 198 | if names is None: 199 | names = schema.Names() 200 | to_dump = {} 201 | to_dump['request'] = self.request.to_json(names) 202 | to_dump['response'] = self.response.to_json(names) 203 | if self.errors: 204 | to_dump['errors'] = self.errors.to_json(names) 205 | return to_dump 206 | 207 | def __eq__(self, that): 208 | return self.name == that.name and self.props == that.props 209 | 210 | 211 | def make_avpr_object(json_data): 212 | """Build Avro Protocol from data parsed out of JSON string.""" 213 | if hasattr(json_data, 'get') and callable(json_data.get): 214 | name = json_data.get('protocol') 215 | namespace = json_data.get('namespace') 216 | types = json_data.get('types') 217 | messages = json_data.get('messages') 218 | return Protocol(name, namespace, types, messages) 219 | else: 220 | raise ProtocolParseException('Not a JSON object: %s' % json_data) 221 | 222 | 223 | def parse(json_string): 224 | """Constructs the Protocol from the JSON text.""" 225 | try: 226 | json_data = json.loads(json_string) 227 | except: 228 | raise ProtocolParseException('Error parsing JSON: %s' % json_string) 229 | 230 | # construct the Avro Protocol object 231 | return make_avpr_object(json_data) 232 | 233 | -------------------------------------------------------------------------------- /src/spavro/schema_resolve.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Pluralsight LLC 2 | 3 | # resolve schemas 4 | from spavro.fast_binary import get_type 5 | from spavro.exceptions import SchemaResolutionException 6 | 7 | 8 | def get_field_by_name(fields, name): 9 | '''Take a list of avro fields, scan the fields using the field name and 10 | return that field.''' 11 | field_names = [field['name'] for field in fields] 12 | return fields[field_names.index(name)] 13 | 14 | 15 | def resolve_record(writer, reader): 16 | '''Take a writer and reader schema and return a 'meta' schema that allows 17 | transforming a previously written record into a new read structure.''' 18 | fields = [] 19 | if writer['name'] != reader['name']: 20 | raise SchemaResolutionException("Schemas not compatible, record names don't match") 21 | record_name = reader['name'] 22 | optional = {} 23 | if "namespace" in writer and "namespace" in reader: 24 | optional["namespace"] = reader["namespace"] 25 | 26 | writer_fields = [field['name'] for field in writer['fields']] 27 | reader_fields = [field['name'] for field in reader['fields']] 28 | # check for defaults for records that are in reader 29 | # but not in writer and vice versa 30 | reader_but_not_writer = (set(reader_fields) - set(writer_fields)) 31 | writer_but_not_reader = (set(writer_fields) - set(reader_fields)) 32 | both_reader_and_writer = (set(writer_fields) & set(reader_fields)) 33 | # run through the fields in writer order 34 | for field in writer['fields']: 35 | if field['name'] in both_reader_and_writer: 36 | fields.append({"name": field['name'], "type": resolve(field['type'], 37 | get_field_by_name(reader['fields'], field['name'])['type'])}) 38 | elif field['name'] in writer_but_not_reader: 39 | ### special skip type record 40 | fields.append({"name": field['name'], "type": {"type": "skip", "value": field['type']}}) 41 | 42 | for field in reader['fields']: 43 | if field['name'] in reader_but_not_writer: 44 | try: 45 | fields.append({"name": field['name'], "type": {"type": "default", "value": field['default']}}) 46 | except KeyError: 47 | raise SchemaResolutionException("Schemas not compatible, no default value for field in reader's record that's not present in writer's record") 48 | schema = {"type": "record", "fields": fields, "name": record_name} 49 | schema.update(optional) 50 | return schema 51 | 52 | 53 | primitive_types = ('null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string') 54 | 55 | 56 | def resolve_array(writer, reader): 57 | '''Resolve a writer and reader array schema and recursively resolve the 58 | type for each array item schema''' 59 | return {'type': 'array', 'items': resolve(writer['items'], reader['items'])} 60 | 61 | 62 | def resolve_map(writer, reader): 63 | '''Resolve a writer and reader map schema and resolve the type for the 64 | map's value schema''' 65 | return {'type': 'map', 'values': resolve(writer['values'], reader['values'])} 66 | 67 | 68 | def resolve_enum(writer, reader): 69 | '''Compare a writer and reader enum and return a compatible enum''' 70 | if writer['name'] != reader['name']: 71 | raise SchemaResolutionException("Schemas not compatible, enum names don't match") 72 | if set(writer['symbols']) - set(reader['symbols']): 73 | raise SchemaResolutionException("Schemas not compatible, symbol in writer's enum not present in reader's enum") 74 | return {'type': 'enum', 'name': reader['name'], 'symbols': [symbol for symbol in writer['symbols']]} 75 | 76 | 77 | def resolve_fixed(writer, reader): 78 | '''Take a fixed writer and reader schema, verify that the names and sizes 79 | match, and return a copy of the writer's schema.''' 80 | if writer['name'] != reader['name'] or writer['size'] != reader['size']: 81 | raise SchemaResolutionException("Schemas not compatible, fixed names or sizes don't match") 82 | return {key: value for key, value in writer.items()} 83 | 84 | 85 | def resolve_union(writer, reader): 86 | '''Take a writer union and a reader union, compare their types and return 87 | a read/write compatible union. 88 | 89 | A compatible read/write union has all of the writer's union schemas in the 90 | reader's schema. 91 | ''' 92 | union = [] 93 | for w_type in writer: 94 | for r_type in reader: 95 | try: 96 | merged = resolve(w_type, r_type) 97 | union.append(merged) 98 | break 99 | except SchemaResolutionException: 100 | # keep trying until we iterate through all read types 101 | continue 102 | else: 103 | # none of the read types matched the write type, this is an error 104 | raise SchemaResolutionException("Schema in writer's union not present in reader's union.") 105 | return union 106 | 107 | 108 | promotable = ['int', 'long', 'float', 'double'] 109 | 110 | 111 | def resolve(writer, reader): 112 | '''Take a writer and a reader schema and return a meta schema that 113 | translates the writer's schema to the reader's schema. 114 | 115 | This handles skipping missing fields and default fills by creating 116 | non-standard 'types' for reader creation (for example, a writer-only field resolves to a 'skip' entry, and a reader-only field with a default resolves to a 'default' entry). These non-standard types are 117 | never surfaced, since they're not standard avro types; they're just used 118 | as an implementation detail for generating a write-compatible reader.''' 119 | writer_type = get_type(writer) 120 | reader_type = get_type(reader) 121 | 122 | if writer_type == reader_type: 123 | if reader_type in primitive_types: 124 | return reader 125 | if reader_type == 'array': 126 | return resolve_array(writer, reader) 127 | if reader_type == 'map': 128 | return resolve_map(writer, reader) 129 | if reader_type == 'enum': 130 | return resolve_enum(writer, reader) 131 | if reader_type == 'union': 132 | return resolve_union(writer, reader) 133 | if reader_type == "record": 134 | return resolve_record(writer, reader) 135 | if reader_type == "fixed": 136 | return resolve_fixed(writer, reader) 137 | # for named types or other types that don't match 138 | # just return the reader 139 | return reader 140 | else: 141 | # see if we've 'upgraded' to a union 142 | if reader_type == 'union': 143 | # if the writer type is in the reader's union 144 | # then just return the writer's schema 145 | if writer_type in [get_type(r) for r in reader]: 146 | type_index = [get_type(r) for r in reader].index(writer_type) 147 | return resolve(writer, reader[type_index]) 148 | else: 149 | raise SchemaResolutionException("Writer schema not present in reader union") 150 | if writer_type in promotable and reader_type in promotable and promotable.index(writer_type) < promotable.index(reader_type): 151 | return writer 152 | raise SchemaResolutionException("Reader and Writer schemas are incompatible") 153 | -------------------------------------------------------------------------------- /src/spavro/tether/InputProtocol.avpr: -------------------------------------------------------------------------------- 1 | {"namespace":"org.apache.avro.mapred.tether", 2 | "protocol": "InputProtocol", 3 | "doc": "Transmit inputs to a map or reduce task sub-process.", 4 | 5 | "types": [ 6 | {"name": "TaskType", "type": "enum", "symbols": ["MAP","REDUCE"]} 7 | ], 8 | 9 | "messages": { 10 | 11 | "configure": { 12 | "doc": "Configure the task.
Sent before any other message.", 13 | "request": [ 14 | {"name": "taskType", "type": "TaskType", 15 | "doc": "Whether this is a map or reduce task."}, 16 | {"name": "inSchema", "type": "string", 17 | "doc": "The Avro schema for task input data."}, 18 | {"name": "outSchema", "type": "string", 19 | "doc": "The Avro schema for task output data."} 20 | ], 21 | "response": "null", 22 | "one-way": true 23 | }, 24 | 25 | "partitions": { 26 | "doc": "Set the number of map output partitions.", 27 | "request": [ 28 | {"name": "partitions", "type": "int", 29 | "doc": "The number of map output partitions."} 30 | ], 31 | "response": "null", 32 | "one-way": true 33 | }, 34 | 35 | "input": { 36 | "doc": "Send a block of input data to a task.", 37 | "request": [ 38 | {"name": "data", "type": "bytes", 39 | "doc": "A sequence of instances of the declared schema."}, 40 | {"name": "count", "type": "long", 41 | "default": 1, 42 | "doc": "The number of instances in this block."} 43 | ], 44 | "response": "null", 45 | "one-way": true 46 | }, 47 | 48 | "abort": { 49 | "doc": "Called to abort the task.", 50 | "request": [], 51 | "response": "null", 52 | "one-way": true 53 | }, 54 | 55 | "complete": { 56 | "doc": "Called when a task's input is complete.", 57 | "request": [], 58 | "response": "null", 59 | "one-way": true 60 | } 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/spavro/tether/OutputProtocol.avpr: -------------------------------------------------------------------------------- 1 | {"namespace":"org.apache.avro.mapred.tether", 2 | "protocol": "OutputProtocol", 3 | "doc": "Transmit outputs from a map or reduce task to parent.", 4 | 5 | "messages": { 6 | 7 | "configure": { 8 | "doc": "Configure task. Sent before any other message.", 9 | "request": [ 10 | {"name": "port", "type": "int", 11 | "doc": "The port to transmit inputs to this task on."} 12 | ], 13 | "response": "null", 14 | "one-way": true 15 | }, 16 | 17 | "output": { 18 | "doc": "Send an output datum.", 19 | "request": [ 20 | {"name": "datum", "type": "bytes", 21 | "doc": "A binary-encoded instance of the declared schema."} 22 | ], 23 | "response": "null", 24 | "one-way": true 25 | }, 26 | 27 | "outputPartitioned": { 28 | "doc": "Send map output datum explicitly naming its partition.", 29 | "request": [ 30 | {"name": "partition", "type": "int", 31 | "doc": "The map output partition for this datum."}, 32 | {"name": "datum", "type": "bytes", 33 | "doc": "A binary-encoded instance of the declared schema."} 34 | ], 35 | "response": "null", 36 | "one-way": true 37 | }, 38 | 39 | "status": { 40 | "doc": "Update the task's status message. 
Also acts as keepalive.", 41 | "request": [ 42 | {"name": "message", "type": "string", 43 | "doc": "The new status message for the task."} 44 | ], 45 | "response": "null", 46 | "one-way": true 47 | }, 48 | 49 | "count": { 50 | "doc": "Increment a task/job counter.", 51 | "request": [ 52 | {"name": "group", "type": "string", 53 | "doc": "The name of the counter group."}, 54 | {"name": "name", "type": "string", 55 | "doc": "The name of the counter to increment."}, 56 | {"name": "amount", "type": "long", 57 | "doc": "The amount to increment the counter."} 58 | ], 59 | "response": "null", 60 | "one-way": true 61 | }, 62 | 63 | "fail": { 64 | "doc": "Called by a failing task to abort.", 65 | "request": [ 66 | {"name": "message", "type": "string", 67 | "doc": "The reason for failure."} 68 | ], 69 | "response": "null", 70 | "one-way": true 71 | }, 72 | 73 | "complete": { 74 | "doc": "Called when a task's output has completed without error.", 75 | "request": [], 76 | "response": "null", 77 | "one-way": true 78 | } 79 | 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /src/spavro/tether/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | from .util import * 21 | from .tether_task import * 22 | from .tether_task_runner import * 23 | 24 | __all__=util.__all__ 25 | __all__+=tether_task.__all__ 26 | __all__+=tether_task_runner.__all__ 27 | -------------------------------------------------------------------------------- /src/spavro/tether/tether_task_runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
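Runs a user-supplied TetherTask behind a local HTTP server that exchanges framed Avro IPC messages with the parent process.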
17 | """ 18 | 19 | __all__=["TaskRunner"] 20 | 21 | if __name__ == "__main__": 22 | # Relative imports don't work when being run directly 23 | from spavro import tether 24 | from spavro.tether import TetherTask, find_port, inputProtocol 25 | 26 | else: 27 | from . import TetherTask, find_port, inputProtocol 28 | 29 | from spavro import ipc 30 | import six 31 | from six.moves.BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer 32 | import logging 33 | import weakref 34 | import threading 35 | import sys 36 | import traceback 37 | 38 | class TaskRunnerResponder(ipc.Responder): 39 | """ 40 | The responder for the thethered process 41 | """ 42 | def __init__(self,runner): 43 | """ 44 | Param 45 | ---------------------------------------------------------- 46 | runner - Instance of TaskRunner 47 | """ 48 | ipc.Responder.__init__(self, inputProtocol) 49 | 50 | self.log=logging.getLogger("TaskRunnerResponder") 51 | 52 | # should we use weak references to avoid circular references? 53 | # We use weak references b\c self.runner owns this instance of TaskRunnerResponder 54 | if isinstance(runner,weakref.ProxyType): 55 | self.runner=runner 56 | else: 57 | self.runner=weakref.proxy(runner) 58 | 59 | self.task=weakref.proxy(runner.task) 60 | 61 | def invoke(self, message, request): 62 | try: 63 | if message.name=='configure': 64 | self.log.info("TetherTaskRunner: Recieved configure") 65 | self.task.configure(request["taskType"],request["inSchema"],request["outSchema"]) 66 | elif message.name=='partitions': 67 | self.log.info("TetherTaskRunner: Recieved partitions") 68 | try: 69 | self.task.set_partitions(request["partitions"]) 70 | except Exception as e: 71 | self.log.error("Exception occured while processing the partitions message: Message:\n"+traceback.format_exc()) 72 | raise 73 | elif message.name=='input': 74 | self.log.info("TetherTaskRunner: Recieved input") 75 | self.task.input(request["data"],request["count"]) 76 | elif message.name=='abort': 77 | self.log.info("TetherTaskRunner: Recieved abort") 78 | self.runner.close() 79 | elif message.name=='complete': 80 | self.log.info("TetherTaskRunner: Recieved complete") 81 | self.task.complete() 82 | self.task.close() 83 | self.runner.close() 84 | else: 85 | self.log.warning("TetherTaskRunner: recieved unknown message {0}".format(message.name)) 86 | 87 | except Exception as e: 88 | self.log.error("Error occured while processing message: {0}".format(message.name)) 89 | emsg=traceback.format_exc() 90 | self.task.fail(emsg) 91 | 92 | return None 93 | 94 | 95 | def HTTPHandlerGen(runner): 96 | """ 97 | This is a class factory for the HTTPHandler. We need 98 | a factory b\c we need a reference to the runner 99 | 100 | Parameters 101 | ----------------------------------------------------------------- 102 | runner - instance of the task runner 103 | """ 104 | 105 | if not(isinstance(runner,weakref.ProxyType)): 106 | runnerref=weakref.proxy(runner) 107 | else: 108 | runnerref=runner 109 | 110 | class TaskRunnerHTTPHandler(BaseHTTPRequestHandler): 111 | """Create a handler for the parent. 
112 | """ 113 | 114 | runner=runnerref 115 | def __init__(self,*args,**param): 116 | """ 117 | """ 118 | BaseHTTPRequestHandler.__init__(self,*args,**param) 119 | 120 | def do_POST(self): 121 | self.responder =TaskRunnerResponder(self.runner) 122 | call_request_reader = ipc.FramedReader(self.rfile) 123 | call_request = call_request_reader.read_framed_message() 124 | resp_body = self.responder.respond(call_request) 125 | self.send_response(200) 126 | self.send_header('Content-Type', 'avro/binary') 127 | self.end_headers() 128 | resp_writer = ipc.FramedWriter(self.wfile) 129 | resp_writer.write_framed_message(resp_body) 130 | 131 | return TaskRunnerHTTPHandler 132 | 133 | class TaskRunner(object): 134 | """This class ties together the server handling the requests from 135 | the parent process and the instance of TetherTask which actually 136 | implements the logic for the mapper and reducer phases 137 | """ 138 | 139 | def __init__(self,task): 140 | """ 141 | Construct the runner 142 | 143 | Parameters 144 | --------------------------------------------------------------- 145 | task - An instance of tether task 146 | """ 147 | 148 | self.log=logging.getLogger("TaskRunner:") 149 | 150 | if not(isinstance(task,TetherTask)): 151 | raise ValueError("task must be an instance of tether task") 152 | self.task=task 153 | 154 | self.server=None 155 | self.sthread=None 156 | 157 | def start(self,outputport=None,join=True): 158 | """ 159 | Start the server 160 | 161 | Parameters 162 | ------------------------------------------------------------------- 163 | outputport - (optional) The port on which the parent process is listening 164 | for requests from the task. 165 | - This will typically be supplied by an environment variable 166 | we allow it to be supplied as an argument mainly for debugging 167 | join - (optional) If set to fault then we don't issue a join to block 168 | until the thread excecuting the server terminates. 169 | This is mainly for debugging. 
By setting it to false, 170 | we can resume execution in this thread so that we can do additional 171 | testing 172 | """ 173 | 174 | port=find_port() 175 | address=("localhost",port) 176 | 177 | 178 | def thread_run(task_runner=None): 179 | task_runner.server = HTTPServer(address, HTTPHandlerGen(task_runner)) 180 | task_runner.server.allow_reuse_address = True 181 | task_runner.server.serve_forever() 182 | 183 | # create a separate thread for the http server 184 | sthread=threading.Thread(target=thread_run,kwargs={"task_runner":self}) 185 | sthread.start() 186 | 187 | self.sthread=sthread 188 | # This needs to run in a separat thread b\c serve_forever() blocks 189 | self.task.open(port,clientPort=outputport) 190 | 191 | # wait for the other thread to finish 192 | if (join): 193 | self.task.ready_for_shutdown.wait() 194 | self.server.shutdown() 195 | 196 | # should we do some kind of check to make sure it exits 197 | self.log.info("Shutdown the logger") 198 | # shutdown the logging 199 | logging.shutdown() 200 | 201 | def close(self): 202 | """ 203 | Handler for the close message 204 | """ 205 | 206 | self.task.close() 207 | 208 | if __name__ == '__main__': 209 | # TODO::Make the logging level a parameter we can set 210 | # logging.basicConfig(level=logging.INFO,filename='/tmp/log',filemode='w') 211 | logging.basicConfig(level=logging.INFO) 212 | 213 | if (len(sys.argv)<=1): 214 | print("Error: tether_task_runner.__main__: Usage: tether_task_runner task_package.task_module.TaskClass") 215 | raise ValueError("Usage: tether_task_runner task_package.task_module.TaskClass") 216 | 217 | fullcls=sys.argv[1] 218 | mod,cname=fullcls.rsplit(".",1) 219 | 220 | logging.info("tether_task_runner.__main__: Task: {0}".format(fullcls)) 221 | 222 | modobj=__import__(mod,fromlist=cname) 223 | 224 | taskcls=getattr(modobj,cname) 225 | task=taskcls() 226 | 227 | runner=TaskRunner(task=task) 228 | runner.start() 229 | -------------------------------------------------------------------------------- /src/spavro/tether/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | """ 18 | 19 | __all__=["find_port"] 20 | 21 | import socket 22 | 23 | 24 | def find_port(): 25 | """ 26 | Return an unbound port 27 | """ 28 | s=socket.socket() 29 | s.bind(("127.0.0.1",0)) 30 | 31 | port=s.getsockname()[1] 32 | s.close() 33 | 34 | return port -------------------------------------------------------------------------------- /src/spavro/tool.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | # Modifications copyright (C) 2017 Pluralsight LLC 4 | # 5 | # Licensed to the Apache Software Foundation (ASF) under one 6 | # or more contributor license agreements. See the NOTICE file 7 | # distributed with this work for additional information 8 | # regarding copyright ownership. The ASF licenses this file 9 | # to you under the Apache License, Version 2.0 (the 10 | # "License"); you may not use this file except in compliance 11 | # with the License. You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ 21 | Command-line tool 22 | 23 | NOTE: The API for the command-line tool is experimental. 24 | """ 25 | import sys 26 | from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler 27 | import urlparse 28 | from spavro import io 29 | from spavro import datafile 30 | from spavro import protocol 31 | from spavro import ipc 32 | 33 | class GenericResponder(ipc.Responder): 34 | def __init__(self, proto, msg, datum): 35 | proto_json = file(proto, 'r').read() 36 | ipc.Responder.__init__(self, protocol.parse(proto_json)) 37 | self.msg = msg 38 | self.datum = datum 39 | 40 | def invoke(self, message, request): 41 | if message.name == self.msg: 42 | sys.stderr.write("Message: %s Datum: %s \n" % (message.name, self.datum)) 43 | # server will shut down after processing a single Avro request 44 | global server_should_shutdown 45 | server_should_shutdown = True 46 | return self.datum 47 | 48 | class GenericHandler(BaseHTTPRequestHandler): 49 | def do_POST(self): 50 | self.responder = responder 51 | call_request_reader = ipc.FramedReader(self.rfile) 52 | call_request = call_request_reader.read_framed_message() 53 | resp_body = self.responder.respond(call_request) 54 | self.send_response(200) 55 | self.send_header('Content-Type', 'avro/binary') 56 | self.end_headers() 57 | resp_writer = ipc.FramedWriter(self.wfile) 58 | resp_writer.write_framed_message(resp_body) 59 | if server_should_shutdown: 60 | sys.stderr.write("Shutting down server.\n") 61 | self.server.force_stop() 62 | 63 | class StoppableHTTPServer(HTTPServer): 64 | """HTTPServer.shutdown added in Python 2.6. 
FML.""" 65 | stopped = False 66 | allow_reuse_address = True 67 | def __init__(self, *args, **kw): 68 | HTTPServer.__init__(self, *args, **kw) 69 | self.allow_reuse_address = True 70 | 71 | def serve_forever(self): 72 | while not self.stopped: 73 | self.handle_request() 74 | 75 | def force_stop(self): 76 | self.server_close() 77 | self.stopped = True 78 | self.serve_forever() 79 | 80 | def run_server(uri, proto, msg, datum): 81 | url_obj = urlparse.urlparse(uri) 82 | server_addr = (url_obj.hostname, url_obj.port) 83 | global responder 84 | global server_should_shutdown 85 | server_should_shutdown = False 86 | responder = GenericResponder(proto, msg, datum) 87 | server = StoppableHTTPServer(server_addr, GenericHandler) 88 | print("Port: %s" % server.server_port) 89 | sys.stdout.flush() 90 | server.allow_reuse_address = True 91 | sys.stderr.write("Starting server.\n") 92 | server.serve_forever() 93 | 94 | def send_message(uri, proto, msg, datum): 95 | url_obj = urlparse.urlparse(uri) 96 | client = ipc.HTTPTransceiver(url_obj.hostname, url_obj.port) 97 | proto_json = file(proto, 'r').read() 98 | requestor = ipc.Requestor(protocol.parse(proto_json), client) 99 | print(requestor.request(msg, datum)) 100 | 101 | def file_or_stdin(f): 102 | if f == "-": 103 | return sys.stdin 104 | else: 105 | return file(f) 106 | 107 | def main(args=sys.argv): 108 | if len(args) == 1: 109 | print("Usage: %s [dump|rpcreceive|rpcsend]" % args[0]) 110 | return 1 111 | 112 | if args[1] == "dump": 113 | if len(args) != 3: 114 | print("Usage: %s dump input_file" % args[0]) 115 | return 1 116 | for d in datafile.DataFileReader(file_or_stdin(args[2]), io.DatumReader()): 117 | print(repr(d)) 118 | elif args[1] == "rpcreceive": 119 | usage_str = "Usage: %s rpcreceive uri protocol_file " % args[0] 120 | usage_str += "message_name (-data d | -file f)" 121 | if len(args) not in [5, 7]: 122 | print(usage_str) 123 | return 1 124 | uri, proto, msg = args[2:5] 125 | datum = None 126 | if len(args) > 5: 127 | if args[5] == "-file": 128 | reader = open(args[6], 'rb') 129 | datum_reader = io.DatumReader() 130 | dfr = datafile.DataFileReader(reader, datum_reader) 131 | datum = dfr.next() 132 | elif args[5] == "-data": 133 | print("JSON Decoder not yet implemented.") 134 | return 1 135 | else: 136 | print(usage_str) 137 | return 1 138 | run_server(uri, proto, msg, datum) 139 | elif args[1] == "rpcsend": 140 | usage_str = "Usage: %s rpcsend uri protocol_file " % args[0] 141 | usage_str += "message_name (-data d | -file f)" 142 | if len(args) not in [5, 7]: 143 | print(usage_str) 144 | return 1 145 | uri, proto, msg = args[2:5] 146 | datum = None 147 | if len(args) > 5: 148 | if args[5] == "-file": 149 | reader = open(args[6], 'rb') 150 | datum_reader = io.DatumReader() 151 | dfr = datafile.DataFileReader(reader, datum_reader) 152 | datum = dfr.next() 153 | elif args[5] == "-data": 154 | print("JSON Decoder not yet implemented.") 155 | return 1 156 | else: 157 | print(usage_str) 158 | return 1 159 | send_message(uri, proto, msg, datum) 160 | return 0 161 | 162 | if __name__ == "__main__": 163 | sys.exit(main(sys.argv)) 164 | -------------------------------------------------------------------------------- /src/spavro/txipc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Modifications copyright (C) 2017 Pluralsight LLC 4 | # 5 | # Licensed to the Apache Software Foundation (ASF) under one 6 | # or more contributor license agreements. 
See the NOTICE file 7 | # distributed with this work for additional information 8 | # regarding copyright ownership. The ASF licenses this file 9 | # to you under the Apache License, Version 2.0 (the 10 | # "License"); you may not use this file except in compliance 11 | # with the License. You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | try: 21 | from cStringIO import StringIO 22 | except ImportError: 23 | from StringIO import StringIO 24 | from spavro import ipc 25 | from spavro import io 26 | 27 | from zope.interface import implements 28 | 29 | from twisted.web.client import Agent 30 | from twisted.web.http_headers import Headers 31 | from twisted.internet.defer import maybeDeferred, Deferred 32 | from twisted.web.iweb import IBodyProducer 33 | from twisted.web import resource, server 34 | from twisted.internet.protocol import Protocol 35 | 36 | class TwistedRequestor(ipc.BaseRequestor): 37 | """A Twisted-compatible requestor. Returns a Deferred that will fire with the 38 | returning value, instead of blocking until the request completes.""" 39 | def _process_handshake(self, call_response, message_name, request_datum): 40 | # process the handshake and call response 41 | buffer_decoder = io.BinaryDecoder(StringIO(call_response)) 42 | call_response_exists = self.read_handshake_response(buffer_decoder) 43 | if call_response_exists: 44 | return self.read_call_response(message_name, buffer_decoder) 45 | else: 46 | return self.request(message_name, request_datum) 47 | 48 | def issue_request(self, call_request, message_name, request_datum): 49 | d = self.transceiver.transceive(call_request) 50 | d.addCallback(self._process_handshake, message_name, request_datum) 51 | return d 52 | 53 | class RequestStreamingProducer(object): 54 | """A streaming producer for issuing requests with the Twisted.web Agent.""" 55 | implements(IBodyProducer) 56 | 57 | paused = False 58 | stopped = False 59 | started = False 60 | 61 | def __init__(self, message): 62 | self._message = message 63 | self._length = len(message) 64 | # We need a buffer length header for every buffer and an additional 65 | # zero-length buffer as the message terminator 66 | self._length += (self._length / ipc.BUFFER_SIZE + 2) \ 67 | * ipc.BUFFER_HEADER_LENGTH 68 | self._total_bytes_sent = 0 69 | self._deferred = Deferred() 70 | 71 | # read-only properties 72 | message = property(lambda self: self._message) 73 | length = property(lambda self: self._length) 74 | consumer = property(lambda self: self._consumer) 75 | deferred = property(lambda self: self._deferred) 76 | 77 | def _get_total_bytes_sent(self): 78 | return self._total_bytes_sent 79 | 80 | def _set_total_bytes_sent(self, bytes_sent): 81 | self._total_bytes_sent = bytes_sent 82 | 83 | total_bytes_sent = property(_get_total_bytes_sent, _set_total_bytes_sent) 84 | 85 | def startProducing(self, consumer): 86 | if self.started: 87 | return 88 | 89 | self.started = True 90 | self._consumer = consumer 91 | # Keep writing data to the consumer until we're finished, 92 | # paused (pauseProducing()) or stopped (stopProducing()) 93 | while self.length - self.total_bytes_sent > 0 and \ 94 | not 
self.paused and not self.stopped: 95 | self.write() 96 | # self.write will fire this deferred once it has written 97 | # the entire message to the consumer 98 | return self.deferred 99 | 100 | def resumeProducing(self): 101 | self.paused = False 102 | self.write() 103 | 104 | def pauseProducing(self): 105 | self.paused = True 106 | 107 | def stopProducing(self): 108 | self.stopped = True 109 | 110 | def write(self): 111 | if self.length - self.total_bytes_sent > ipc.BUFFER_SIZE: 112 | buffer_length = ipc.BUFFER_SIZE 113 | else: 114 | buffer_length = self.length - self.total_bytes_sent 115 | self.write_buffer(self.message[self.total_bytes_sent: 116 | (self.total_bytes_sent + buffer_length)]) 117 | self.total_bytes_sent += buffer_length 118 | # Make sure we wrote the entire message 119 | if self.total_bytes_sent == self.length and not self.stopped: 120 | self.stopProducing() 121 | # A message is always terminated by a zero-length buffer. 122 | self.write_buffer_length(0) 123 | self.deferred.callback(None) 124 | 125 | def write_buffer(self, chunk): 126 | buffer_length = len(chunk) 127 | self.write_buffer_length(buffer_length) 128 | self.consumer.write(chunk) 129 | 130 | def write_buffer_length(self, n): 131 | self.consumer.write(ipc.BIG_ENDIAN_INT_STRUCT.pack(n)) 132 | 133 | class AvroProtocol(Protocol): 134 | 135 | recvd = '' 136 | done = False 137 | 138 | def __init__(self, finished): 139 | self.finished = finished 140 | self.message = [] 141 | 142 | def dataReceived(self, data): 143 | self.recvd = self.recvd + data 144 | while len(self.recvd) >= ipc.BUFFER_HEADER_LENGTH: 145 | buffer_length ,= ipc.BIG_ENDIAN_INT_STRUCT.unpack( 146 | self.recvd[:ipc.BUFFER_HEADER_LENGTH]) 147 | if buffer_length == 0: 148 | response = ''.join(self.message) 149 | self.done = True 150 | self.finished.callback(response) 151 | break 152 | if len(self.recvd) < buffer_length + ipc.BUFFER_HEADER_LENGTH: 153 | break 154 | buffer = self.recvd[ipc.BUFFER_HEADER_LENGTH:buffer_length + ipc.BUFFER_HEADER_LENGTH] 155 | self.recvd = self.recvd[buffer_length + ipc.BUFFER_HEADER_LENGTH:] 156 | self.message.append(buffer) 157 | 158 | def connectionLost(self, reason): 159 | if not self.done: 160 | self.finished.errback(ipc.ConnectionClosedException("Reader read 0 bytes.")) 161 | 162 | class TwistedHTTPTransceiver(object): 163 | """This transceiver uses the Agent class present in Twisted.web >= 9.0 164 | for issuing requests to the remote endpoint.""" 165 | def __init__(self, host, port, remote_name=None, reactor=None): 166 | self.url = "http://%s:%d/" % (host, port) 167 | 168 | if remote_name is None: 169 | # There's no easy way to get this peer's remote address 170 | # in Twisted so I use a random UUID to identify ourselves 171 | import uuid 172 | self.remote_name = uuid.uuid4() 173 | 174 | if reactor is None: 175 | from twisted.internet import reactor 176 | self.agent = Agent(reactor) 177 | 178 | def read_framed_message(self, response): 179 | finished = Deferred() 180 | response.deliverBody(AvroProtocol(finished)) 181 | return finished 182 | 183 | def transceive(self, request): 184 | req_method = 'POST' 185 | req_headers = { 186 | 'Content-Type': ['avro/binary'], 187 | 'Accept-Encoding': ['identity'], 188 | } 189 | 190 | body_producer = RequestStreamingProducer(request) 191 | d = self.agent.request( 192 | req_method, 193 | self.url, 194 | headers=Headers(req_headers), 195 | bodyProducer=body_producer) 196 | return d.addCallback(self.read_framed_message) 197 | 198 | class AvroResponderResource(resource.Resource): 199 |
"""This Twisted.web resource can be placed anywhere in a URL hierarchy 200 | to provide an Avro endpoint. Different Avro protocols can be served 201 | by the same web server as long as they are in different resources in 202 | a URL hierarchy.""" 203 | isLeaf = True 204 | 205 | def __init__(self, responder): 206 | resource.Resource.__init__(self) 207 | self.responder = responder 208 | 209 | def cb_render_POST(self, resp_body, request): 210 | request.setResponseCode(200) 211 | request.setHeader('Content-Type', 'avro/binary') 212 | resp_writer = ipc.FramedWriter(request) 213 | resp_writer.write_framed_message(resp_body) 214 | request.finish() 215 | 216 | def render_POST(self, request): 217 | # Unfortunately, Twisted.web doesn't support incoming 218 | # streamed input yet, the whole payload must be kept in-memory 219 | request.content.seek(0, 0) 220 | call_request_reader = ipc.FramedReader(request.content) 221 | call_request = call_request_reader.read_framed_message() 222 | d = maybeDeferred(self.responder.respond, call_request) 223 | d.addCallback(self.cb_render_POST, request) 224 | return server.NOT_DONE_YET 225 | -------------------------------------------------------------------------------- /test/av_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Modifications copyright (C) 2017 Pluralsight LLC 4 | # 5 | # Licensed to the Apache Software Foundation (ASF) under one 6 | # or more contributor license agreements. See the NOTICE file 7 | # distributed with this work for additional information 8 | # regarding copyright ownership. The ASF licenses this file 9 | # to you under the Apache License, Version 2.0 (the 10 | # "License"); you may not use this file except in compliance 11 | # with the License. You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | 
21 | import sys
22 | import time
23 | from random import sample, choice, randint
24 | from string import lowercase
25 | 
26 | import spavro.datafile
27 | import spavro.schema
28 | import spavro.io
29 | 
30 | 
31 | types = ["A", "CNAME"]
32 | 
33 | def rand_name():
34 |     return ''.join(sample(lowercase, 15))
35 | 
36 | def rand_ip():
37 |     return "%s.%s.%s.%s" % (randint(0, 255), randint(0, 255), randint(0, 255), randint(0, 255))
38 | 
39 | def write(n):
40 |     schema_s = """
41 |     { "type": "record",
42 |       "name": "Query",
43 |       "fields" : [
44 |         {"name": "query", "type": "string"},
45 |         {"name": "response", "type": "string"},
46 |         {"name": "type", "type": "string", "default": "A"}
47 |     ]}"""
48 |     out = open("datafile.avr", 'wb')  # Avro containers are binary; open in 'wb'
49 | 
50 |     schema = spavro.schema.parse(schema_s)
51 |     writer = spavro.io.DatumWriter(schema)
52 |     dw = spavro.datafile.DataFileWriter(out, writer, schema)  # , codec='deflate')
53 |     for _ in xrange(n):
54 |         response = rand_ip()
55 |         query = rand_name()
56 |         type = choice(types)
57 |         dw.append({'query': query, 'response': response, 'type': type})
58 | 
59 |     dw.close()
60 | 
61 | def read():
62 |     f = open("datafile.avr", 'rb')  # read back in binary mode
63 |     reader = spavro.io.DatumReader()
64 |     af = spavro.datafile.DataFileReader(f, reader)
65 | 
66 |     for _ in af:
67 |         pass
68 | 
69 | 
70 | def t(f, *args):
71 |     s = time.time()
72 |     f(*args)
73 |     e = time.time()
74 |     return e - s
75 | 
76 | if __name__ == "__main__":
77 |     n = int(sys.argv[1])
78 |     print "Write %0.4f" % t(write, n)
79 |     print "Read %0.4f" % t(read)
-------------------------------------------------------------------------------- /test/gen_interop_data.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Modifications copyright (C) 2017 Pluralsight LLC
4 | #
5 | # Licensed to the Apache Software Foundation (ASF) under one
6 | # or more contributor license agreements.  See the NOTICE file
7 | # distributed with this work for additional information
8 | # regarding copyright ownership.  The ASF licenses this file
9 | # to you under the Apache License, Version 2.0 (the
10 | # "License"); you may not use this file except in compliance
11 | # with the License.  You may obtain a copy of the License at
12 | #
13 | #     http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | import sys 21 | from spavro import schema 22 | from spavro import io 23 | from spavro import datafile 24 | 25 | DATUM = { 26 | 'intField': 12, 27 | 'longField': 15234324L, 28 | 'stringField': unicode('hey'), 29 | 'boolField': True, 30 | 'floatField': 1234.0, 31 | 'doubleField': -1234.0, 32 | 'bytesField': '12312adf', 33 | 'nullField': None, 34 | 'arrayField': [5.0, 0.0, 12.0], 35 | 'mapField': {'a': {'label': 'a'}, 'bee': {'label': 'cee'}}, 36 | 'unionField': 12.0, 37 | 'enumField': 'C', 38 | 'fixedField': '1019181716151413', 39 | 'recordField': {'label': 'blah', 'children': [{'label': 'inner', 'children': []}]}, 40 | } 41 | 42 | if __name__ == "__main__": 43 | interop_schema = schema.parse(open(sys.argv[1], 'r').read()) 44 | writer = open(sys.argv[2], 'wb') 45 | datum_writer = io.DatumWriter() 46 | # NB: not using compression 47 | dfw = datafile.DataFileWriter(writer, datum_writer, interop_schema) 48 | dfw.append(DATUM) 49 | dfw.close() 50 | -------------------------------------------------------------------------------- /test/sample_http_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Modifications copyright (C) 2017 Pluralsight LLC 4 | # 5 | # Licensed to the Apache Software Foundation (ASF) under one 6 | # or more contributor license agreements. See the NOTICE file 7 | # distributed with this work for additional information 8 | # regarding copyright ownership. The ASF licenses this file 9 | # to you under the Apache License, Version 2.0 (the 10 | # "License"); you may not use this file except in compliance 11 | # with the License. You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
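
A file produced by the generator above can be sanity-checked by reading it back with the same spavro primitives. A quick sketch, assuming the container was written to interop/py.avro (the interop file shipped with this repo):

from spavro import datafile, io

reader = open('interop/py.avro', 'rb')
dfr = datafile.DataFileReader(reader, io.DatumReader())
for record in dfr:
    assert record['intField'] == 12  # matches the DATUM written above
dfr.close()
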
20 | import sys
21 | 
22 | from spavro import ipc
23 | from spavro import protocol
24 | 
25 | MAIL_PROTOCOL_JSON = """\
26 | {"namespace": "example.proto",
27 |  "protocol": "Mail",
28 | 
29 |  "types": [
30 |      {"name": "Message", "type": "record",
31 |       "fields": [
32 |           {"name": "to", "type": "string"},
33 |           {"name": "from", "type": "string"},
34 |           {"name": "body", "type": "string"}
35 |       ]
36 |      }
37 |  ],
38 | 
39 |  "messages": {
40 |      "send": {
41 |          "request": [{"name": "message", "type": "Message"}],
42 |          "response": "string"
43 |      },
44 |      "replay": {
45 |          "request": [],
46 |          "response": "string"
47 |      }
48 |  }
49 | }
50 | """
51 | MAIL_PROTOCOL = protocol.parse(MAIL_PROTOCOL_JSON)
52 | SERVER_HOST = 'localhost'
53 | SERVER_PORT = 9090
54 | 
55 | class UsageError(Exception):
56 |     def __init__(self, value):
57 |         self.value = value
58 |     def __str__(self):
59 |         return repr(self.value)
60 | 
61 | def make_requestor(server_host, server_port, protocol):
62 |     client = ipc.HTTPTransceiver(server_host, server_port)
63 |     return ipc.Requestor(protocol, client)
64 | 
65 | if __name__ == '__main__':
66 |     if len(sys.argv) not in [4, 5]:
67 |         raise UsageError("Usage: <to> <from> <body> [<num_messages>]")
68 | 
69 |     # client code - attach to the server and send a message
70 |     # fill in the Message record
71 |     message = dict()
72 |     message['to'] = sys.argv[1]
73 |     message['from'] = sys.argv[2]
74 |     message['body'] = sys.argv[3]
75 | 
76 |     try:
77 |         num_messages = int(sys.argv[4])
78 |     except (IndexError, ValueError):
79 |         num_messages = 1
80 | 
81 |     # build the parameters for the request
82 |     params = {}
83 |     params['message'] = message
84 | 
85 |     # send the requests and print the result
86 |     for msg_count in range(num_messages):
87 |         requestor = make_requestor(SERVER_HOST, SERVER_PORT, MAIL_PROTOCOL)
88 |         result = requestor.request('send', params)
89 |         print("Result: " + result)
90 | 
91 |     # try out a replay message
92 |     requestor = make_requestor(SERVER_HOST, SERVER_PORT, MAIL_PROTOCOL)
93 |     result = requestor.request('replay', dict())
94 |     print("Replay Result: " + result)
-------------------------------------------------------------------------------- /test/sample_http_server.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Modifications copyright (C) 2017 Pluralsight LLC
4 | #
5 | # Licensed to the Apache Software Foundation (ASF) under one
6 | # or more contributor license agreements.  See the NOTICE file
7 | # distributed with this work for additional information
8 | # regarding copyright ownership.  The ASF licenses this file
9 | # to you under the Apache License, Version 2.0 (the
10 | # "License"); you may not use this file except in compliance
11 | # with the License.  You may obtain a copy of the License at
12 | #
13 | #     http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
21 | from spavro import ipc
22 | from spavro import protocol
23 | 
24 | MAIL_PROTOCOL_JSON = """\
25 | {"namespace": "example.proto",
26 |  "protocol": "Mail",
27 | 
28 |  "types": [
29 |      {"name": "Message", "type": "record",
30 |       "fields": [
31 |           {"name": "to", "type": "string"},
32 |           {"name": "from", "type": "string"},
33 |           {"name": "body", "type": "string"}
34 |       ]
35 |      }
36 |  ],
37 | 
38 |  "messages": {
39 |      "send": {
40 |          "request": [{"name": "message", "type": "Message"}],
41 |          "response": "string"
42 |      },
43 |      "replay": {
44 |          "request": [],
45 |          "response": "string"
46 |      }
47 |  }
48 | }
49 | """
50 | MAIL_PROTOCOL = protocol.parse(MAIL_PROTOCOL_JSON)
51 | SERVER_ADDRESS = ('localhost', 9090)
52 | 
53 | class MailResponder(ipc.Responder):
54 |     def __init__(self):
55 |         ipc.Responder.__init__(self, MAIL_PROTOCOL)
56 | 
57 |     def invoke(self, message, request):
58 |         if message.name == 'send':
59 |             request_content = request['message']
60 |             response = "Sent message to %(to)s from %(from)s with body %(body)s" % \
61 |                 request_content
62 |             return response
63 |         elif message.name == 'replay':
64 |             return 'replay'
65 | 
66 | class MailHandler(BaseHTTPRequestHandler):
67 |     def do_POST(self):
68 |         self.responder = MailResponder()
69 |         call_request_reader = ipc.FramedReader(self.rfile)
70 |         call_request = call_request_reader.read_framed_message()
71 |         resp_body = self.responder.respond(call_request)
72 |         self.send_response(200)
73 |         self.send_header('Content-Type', 'avro/binary')
74 |         self.end_headers()
75 |         resp_writer = ipc.FramedWriter(self.wfile)
76 |         resp_writer.write_framed_message(resp_body)
77 | 
78 | if __name__ == '__main__':
79 |     HTTPServer.allow_reuse_address = True  # must be set before the socket binds
80 |     mail_server = HTTPServer(SERVER_ADDRESS, MailHandler)
81 |     mail_server.serve_forever()
-------------------------------------------------------------------------------- /test/set_avro_test_path.py: --------------------------------------------------------------------------------
1 | # Modifications copyright (C) 2017 Pluralsight LLC
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements.  See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership.  The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | """
19 | Module adjusts PYTHONPATH so the unittests
20 | will work even if an egg for AVRO is already installed.
21 | By default eggs always appear higher on Python's path than
22 | directories set via the environment variable PYTHONPATH.
23 | 
24 | For reference see:
25 | http://www.velocityreviews.com/forums/t716589-pythonpath-and-eggs.html
26 | http://stackoverflow.com/questions/897792/pythons-sys-path-value.
27 | 
28 | Unittests would therefore use the installed AVRO and not the AVRO
29 | being built. To work around this the unittests import this module before
30 | importing AVRO.
This module in turn adjusts the python path so that the test 31 | build of AVRO is higher on the path then any installed eggs. 32 | """ 33 | import sys 34 | import os 35 | 36 | # determine the build directory and then make sure all paths that start with the 37 | # build directory are at the top of the path 38 | builddir=os.path.split(os.path.split(__file__)[0])[0] 39 | bpaths=filter(lambda s:s.startswith(builddir), sys.path) 40 | 41 | for p in bpaths: 42 | sys.path.insert(0,p) -------------------------------------------------------------------------------- /test/test_datafile.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | import os 19 | import unittest 20 | 21 | from spavro import schema 22 | from spavro import io 23 | from spavro import datafile 24 | 25 | try: 26 | type(unicode) 27 | except NameError: 28 | unicode = str 29 | 30 | SCHEMAS_TO_VALIDATE = ( 31 | ('"null"', None), 32 | ('"boolean"', True), 33 | ('"string"', unicode('adsfasdf09809dsf-=adsf')), 34 | ('"bytes"', b'12345abcd'), 35 | ('"int"', 1234), 36 | ('"long"', 1234), 37 | ('"float"', 1234.0), 38 | ('"double"', 1234.0), 39 | ('{"type": "fixed", "name": "Test", "size": 1}', b'B'), 40 | ('{"type": "enum", "name": "Test", "symbols": ["A", "B"]}', u'B'), 41 | ('{"type": "array", "items": "long"}', [1, 3, 2]), 42 | ('{"type": "map", "values": "long"}', {'a': 1, 'b': 3, 'c': 2}), 43 | ('["string", "null", "long"]', None), 44 | ("""\ 45 | {"type": "record", 46 | "name": "Test", 47 | "fields": [{"name": "f", "type": "long"}]} 48 | """, {'f': 5}), 49 | ("""\ 50 | {"type": "record", 51 | "name": "Lisp", 52 | "fields": [{"name": "value", 53 | "type": ["null", "string", 54 | {"type": "record", 55 | "name": "Cons", 56 | "fields": [{"name": "car", "type": "Lisp"}, 57 | {"name": "cdr", "type": "Lisp"}]}]}]} 58 | """, {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}), 59 | ) 60 | 61 | FILENAME = 'test_datafile.out' 62 | CODECS_TO_VALIDATE = ('null', 'deflate') 63 | try: 64 | import snappy 65 | CODECS_TO_VALIDATE += ('snappy',) 66 | except ImportError: 67 | print('Snappy not present, will skip testing it.') 68 | 69 | try: 70 | import lzma 71 | CODECS_TO_VALIDATE += ('xz',) 72 | except ImportError: 73 | print('lzma not present, will skip testing xz codec.') 74 | 75 | # TODO(hammer): clean up written files with ant, not os.remove 76 | class TestDataFile(unittest.TestCase): 77 | def test_round_trip(self): 78 | print('') 79 | print('TEST ROUND TRIP') 80 | print('===============') 81 | print('') 82 | correct = 0 83 | print(SCHEMAS_TO_VALIDATE) 84 | for i, (example_schema, datum) in 
enumerate(SCHEMAS_TO_VALIDATE):
85 |             for codec in CODECS_TO_VALIDATE:
86 |                 print('')
87 |                 print('SCHEMA NUMBER %d' % (i + 1))
88 |                 print('================')
89 |                 print('')
90 |                 print('Schema: %s' % example_schema)
91 |                 print('Datum: %s' % datum)
92 |                 print('Codec: %s' % codec)
93 | 
94 |                 # write data in binary to file 10 times
95 |                 writer = open(FILENAME, 'wb')
96 |                 datum_writer = io.DatumWriter()
97 |                 schema_object = schema.parse(example_schema)
98 |                 dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
99 |                 for datum_counter in range(10):
100 |                     dfw.append(datum)
101 |                 dfw.close()
102 | 
103 |                 # read data in binary from file
104 |                 reader = open(FILENAME, 'rb')
105 |                 datum_reader = io.DatumReader()
106 |                 dfr = datafile.DataFileReader(reader, datum_reader)
107 |                 round_trip_data = []
108 |                 for read_datum in dfr:
109 |                     round_trip_data.append(read_datum)
110 | 
111 |                 print('Round Trip Data: %s' % round_trip_data)
112 |                 print('Round Trip Data Length: %d' % len(round_trip_data))
113 |                 is_correct = [datum] * 10 == round_trip_data
114 |                 if is_correct:
115 |                     correct += 1
116 |                 print('Correct Round Trip: %s' % is_correct)
117 |                 print('')
118 |         os.remove(FILENAME)
119 |         self.assertEqual(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
120 | 
121 |     def test_append(self):
122 |         print('')
123 |         print('TEST APPEND')
124 |         print('===========')
125 |         print('')
126 |         correct = 0
127 |         for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
128 |             for codec in CODECS_TO_VALIDATE:
129 |                 print('')
130 |                 print('SCHEMA NUMBER %d' % (i + 1))
131 |                 print('================')
132 |                 print('')
133 |                 print('Schema: %s' % example_schema)
134 |                 print('Datum: %s' % datum)
135 |                 print('Codec: %s' % codec)
136 | 
137 |                 # write data in binary to file once
138 |                 writer = open(FILENAME, 'wb')
139 |                 datum_writer = io.DatumWriter()
140 |                 schema_object = schema.parse(example_schema)
141 |                 dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
142 |                 dfw.append(datum)
143 |                 dfw.close()
144 | 
145 |                 # open file, write, and close nine times
146 |                 for _ in range(9):
147 |                     writer = open(FILENAME, 'ab+')
148 |                     dfw = datafile.DataFileWriter(writer, io.DatumWriter())
149 |                     dfw.append(datum)
150 |                     dfw.close()
151 | 
152 |                 # read data in binary from file; use a distinct name so the
153 |                 # loop variable does not shadow the datum we wrote
154 |                 reader = open(FILENAME, 'rb')
155 |                 dfr = datafile.DataFileReader(reader, io.DatumReader())
156 |                 appended_data = []
157 |                 for read_datum in dfr:
158 |                     appended_data.append(read_datum)
159 | 
160 |                 print('Appended Data: %s' % appended_data)
161 |                 print('Appended Data Length: %d' % len(appended_data))
162 |                 is_correct = [datum] * 10 == appended_data
163 |                 if is_correct:
164 |                     correct += 1
165 |                 print('Correct Appended: %s' % is_correct)
166 |                 print('')
167 |         os.remove(FILENAME)
168 |         self.assertEqual(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
169 | 
170 |     def test_context_manager(self):
171 |         # Context manager was introduced as a first class
172 |         # member only in Python 2.6 and above.
173 |         import sys
174 |         if sys.version_info < (2,6):
175 |             print('Skipping context manager tests on this Python version.')
176 |             return
177 |         # Test the writer with a 'with' statement.
178 | writer = open(FILENAME, 'wb') 179 | datum_writer = io.DatumWriter() 180 | sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1] 181 | schema_object = schema.parse(sample_schema) 182 | with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw: 183 | dfw.append(sample_datum) 184 | self.assertTrue(writer.closed) 185 | 186 | # Test the reader with a 'with' statement. 187 | datums = [] 188 | reader = open(FILENAME, 'rb') 189 | datum_reader = io.DatumReader() 190 | with datafile.DataFileReader(reader, datum_reader) as dfr: 191 | for datum in dfr: 192 | datums.append(datum) 193 | self.assertTrue(reader.closed) 194 | 195 | def test_metadata(self): 196 | # Test the writer with a 'with' statement. 197 | writer = open(FILENAME, 'wb') 198 | datum_writer = io.DatumWriter() 199 | sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1] 200 | schema_object = schema.parse(sample_schema) 201 | with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw: 202 | dfw.set_meta('test.string', 'foo') 203 | dfw.set_meta('test.number', '1') 204 | dfw.append(sample_datum) 205 | self.assertTrue(writer.closed) 206 | 207 | # Test the reader with a 'with' statement. 208 | datums = [] 209 | reader = open(FILENAME, 'rb') 210 | datum_reader = io.DatumReader() 211 | with datafile.DataFileReader(reader, datum_reader) as dfr: 212 | self.assertEqual('foo', dfr.get_meta('test.string')) 213 | self.assertEqual('1', dfr.get_meta('test.number')) 214 | for datum in dfr: 215 | datums.append(datum) 216 | self.assertTrue(reader.closed) 217 | 218 | def test_writer_incorrect_mode_handling(self): 219 | '''When an output file is passed to the DataFileWriter to append records and 220 | it's not readable, throw an exception.''' 221 | writer = open(FILENAME, 'wb') 222 | with self.assertRaises(datafile.DataFileException) as context: 223 | datafile.DataFileWriter(writer, io.DatumWriter()) 224 | 225 | 226 | if __name__ == '__main__': 227 | unittest.main() 228 | -------------------------------------------------------------------------------- /test/test_datafile_interop.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
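
test_append above exercises spavro's append mode: reopening an existing container with a mode that can also read ('ab+') and constructing DataFileWriter without a schema, so the writer picks up the schema already stored in the file. A condensed sketch of that pattern, assuming test_datafile.out already holds records written with the '"string"' schema:

from spavro import datafile, io

# 'ab+' is required: the writer must be able to read the existing header;
# a write-only handle raises DataFileException (see
# test_writer_incorrect_mode_handling above).
fo = open('test_datafile.out', 'ab+')
dfw = datafile.DataFileWriter(fo, io.DatumWriter())  # no schema: append mode
dfw.append(u'one more datum')
dfw.close()
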
18 | import os 19 | import unittest 20 | 21 | from spavro import io 22 | from spavro import datafile 23 | 24 | class TestDataFileInterop(unittest.TestCase): 25 | def test_interop(self): 26 | print('') 27 | print('TEST INTEROP') 28 | print('============') 29 | print('') 30 | for f in os.listdir('./interop'): 31 | print('READING %s' % f) 32 | print('') 33 | 34 | # read data in binary from file 35 | reader = open(os.path.join('./interop', f), 'rb') 36 | datum_reader = io.DatumReader() 37 | dfr = datafile.DataFileReader(reader, datum_reader) 38 | for datum in dfr: 39 | assert datum is not None 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /test/test_ipc.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """ 19 | There are currently no IPC tests within python, in part because there are no 20 | servers yet available. 21 | """ 22 | import unittest 23 | 24 | # This test does import this code, to make sure it at least passes 25 | # compilation. 
26 | from spavro import ipc 27 | 28 | class TestIPC(unittest.TestCase): 29 | def test_placeholder(self): 30 | pass 31 | 32 | def test_server_with_path(self): 33 | client_with_custom_path = ipc.HTTPTransceiver('apache.org', 80, '/service/article') 34 | self.assertEqual('/service/article', client_with_custom_path.req_resource) 35 | 36 | client_with_default_path = ipc.HTTPTransceiver('apache.org', 80) 37 | self.assertEqual('/', client_with_default_path.req_resource) 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /test/test_more_schemas.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Pluralsight LLC 2 | 3 | import unittest 4 | from six import BytesIO 5 | 6 | import spavro.io 7 | from spavro.io import FastDatumWriter 8 | # from spavro.io import SlowDatumWriter as FastDatumWriter 9 | # from spavro.exceptions import AvroTypeException, SchemaParseException 10 | from spavro.schema import SchemaParseException, AvroException, UnionSchema 11 | 12 | 13 | from spavro import schema 14 | 15 | invalid_schemas = ( 16 | ("true_is_not_a_schema", '"True"'), 17 | ("unquoted_true_is_not_a_schema", 'True'), 18 | ("missing_type", '{"no_type": "test"}'), 19 | ("invalid_type", '{"type": "panther"}'), 20 | ("missing_required_size_in_fixed", '''{"type": "fixed", 21 | "name": "Missing size"}'''), 22 | ("missing_required_name_in_fixed", '''{"type": "fixed", 23 | "size": 314}'''), 24 | ("enum_symbols_not_an_array", '''{"type": "enum", 25 | "name": "Status", 26 | "symbols": "Normal Caution Critical"}'''), 27 | ("missing_required_name_in_enum", '''{"type": "enum", 28 | "symbols": ["Normal", "Caution", "Critical"]}''') 29 | ) 30 | 31 | valid_schemas = ( 32 | ("bytes_in_a_union", 33 | '["null", "string", "bytes"]', 34 | UnionSchema), 35 | ("bytes_and_fixed_in_a_union", 36 | '["bytes", {"name": "sixteen-bytes", "type": "fixed", "size": 16}]', 37 | UnionSchema), 38 | ) 39 | 40 | 41 | class TestSchemaParsing(unittest.TestCase): 42 | pass 43 | 44 | 45 | def create_good_case(local_schema, expected_schema_type): 46 | def test_write_good_data(self): 47 | write_schema = spavro.schema.parse(local_schema) 48 | self.assertIs(type(write_schema), expected_schema_type) 49 | return test_write_good_data 50 | 51 | 52 | def create_exception_case(local_schema): 53 | # print(schema) 54 | def test_parse_invalid_schema(self): 55 | with self.assertRaises((SchemaParseException, AvroException)) as context: 56 | spavro.schema.parse(local_schema) 57 | return test_parse_invalid_schema 58 | 59 | 60 | def make_good_cases(cases): 61 | for name, local_schema, expected_schema_type in cases: 62 | test_method = create_good_case(local_schema, expected_schema_type) 63 | test_method.__name__ = 'test_good_schema_{}'.format(name) 64 | setattr(TestSchemaParsing, test_method.__name__, test_method) 65 | 66 | 67 | def make_exception_cases(cases): 68 | for name, local_schema in cases: 69 | test_method = create_exception_case(local_schema) 70 | test_method.__name__ = 'test_invalid_schema_{}'.format(name) 71 | setattr(TestSchemaParsing, test_method.__name__, test_method) 72 | 73 | 74 | make_good_cases(valid_schemas) 75 | make_exception_cases(invalid_schemas) 76 | -------------------------------------------------------------------------------- /test/test_old_vs_new.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Pluralsight LLC 2 | 3 | import unittest 4 | 
import six 5 | from six import BytesIO as StringIO 6 | # try: 7 | # from cStringIO import StringIO 8 | # except ImportError: 9 | # from StringIO import StringIO 10 | try: 11 | type(unicode) 12 | except NameError: 13 | unicode = str 14 | 15 | import json 16 | import spavro.schema 17 | import spavro.io 18 | # from spavro.fast_binary import make_union_writer, get_writer 19 | from spavro.io import FastDatumWriter, SlowDatumWriter 20 | 21 | 22 | class TestOldVsNew(unittest.TestCase): 23 | def setUp(self): 24 | self.fdw = FastDatumWriter() 25 | self.sdw = SlowDatumWriter() 26 | 27 | cases = ( 28 | ("null", "null", None), 29 | ("int", "int", 1234), 30 | ("long", "long", 1234567890), 31 | ("float", "float", 1234.123), 32 | ("double", "double", 1234.12345), 33 | ("boolean", "boolean", True), 34 | ("string", "string", unicode('adsfasdf09809dsf-=adsf')), 35 | ("a_fixed_record", {"type": "fixed", "size": 4, "name": "fixeddata"}, b'Obj\x01'), 36 | ("an_enum_record", {"type": "enum", "symbols": ["A", "B", "C"], "name": "enumdata"}, "A"), 37 | ("an_array", {"type": "array", "items": "int"}, [1, 2, 3, 1234, 4321]), 38 | ("a_map", {"type": "map", "values": "int"}, {"L1": 1, "L2": 2, "L3": 3, "L4": 4}), 39 | ("union_test_string", ["null", "int", "string"], unicode("whassssuuuup")), 40 | ("union_test_int", ["null", "int", "string"], 1234), 41 | ("union_test_null", ["null", "int", "string"], None), 42 | ("a_record", 43 | {"type": "record", "name": "recorddata", "fields": [{"name": "field1", "type": "long"}, 44 | {"name": "field2", "type": "string"}]}, 45 | {"field1": 1234, "field2": unicode("whassssuuuuup")}), 46 | ("recursive_defined_record", 47 | {u'fields': [{u'type': [u'null', u'string', {u'fields': [{u'type': u'Lisp', u'name': u'car'}, 48 | {u'type': u'Lisp', u'name': u'cdr'}], 49 | u'type': u'record', u'name': u'Cons'}], 50 | u'name': u'value'}], 51 | u'type': u'record', 52 | u'name': u'Lisp'}, 53 | {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}), 54 | ("union_of_two_records_recordA", 55 | [{"type": "record", "name": "recorddata", "fields": [{"name": "fieldA1", "type": "long"}, 56 | {"name": "fieldA2", "type": "string"}]}, 57 | {"type": "record", "name": "recorddata2", "fields": [{"name": "fieldB1", "type": {"type": "fixed", "name": "fixedbytes", "size": 4}}, 58 | {"name": "fieldB2", "type": "string"}]}], 59 | {"fieldA1": 1234, "fieldA2": unicode("whassssuuuuup")}), 60 | ("union_of_two_records_recordB", 61 | [{"type": "record", "name": "recorddata", "fields": [{"name": "fieldA1", "type": "long"}, 62 | {"name": "fieldA2", "type": "string"}]}, 63 | {"type": "record", "name": "recorddata2", "fields": [{"name": "fieldB1", "type": {"type": "fixed", "name": "fixedbytes", "size": 4}}, 64 | {"name": "fieldB2", "type": "string"}]}], 65 | {"fieldB1": b'\x01\x02\x03\x04', "fieldB2": unicode("Nother Record")}) 66 | ) 67 | 68 | 69 | def create_case(schema, datum): 70 | def compare_old_and_new(self): 71 | fastbuff = StringIO() 72 | slowbuff = StringIO() 73 | fastencoder = spavro.io.FastBinaryEncoder(fastbuff) 74 | slowencoder = spavro.io.SlowBinaryEncoder(slowbuff) 75 | write_schema = spavro.schema.parse(json.dumps(schema)) 76 | for i in range(10): 77 | self.fdw.write_data(write_schema, datum, fastencoder) 78 | self.sdw.write_data(write_schema, datum, slowencoder) 79 | self.assertEqual(fastbuff.getvalue(), slowbuff.getvalue()) 80 | return compare_old_and_new 81 | 82 | 83 | def make_cases(): 84 | for name, schema, datum in cases: 85 | test_method = create_case(schema, datum) 86 | test_method.__name__ = 
'test_write_old_vs_new_{}'.format(name) 87 | setattr(TestOldVsNew, test_method.__name__, test_method) 88 | 89 | make_cases() 90 | -------------------------------------------------------------------------------- /test/test_schema_validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Pluralsight LLC 2 | 3 | import unittest 4 | from six import BytesIO 5 | 6 | import spavro.io 7 | from spavro.io import FastDatumWriter 8 | # from spavro.io import SlowDatumWriter as FastDatumWriter 9 | from spavro.exceptions import AvroTypeException 10 | 11 | 12 | valid_data = ( 13 | ("int_at_the_upper_boundary", 2147483647, '"int"'), 14 | ("int_at_the_lower_boundary", -2147483648, '"int"'), 15 | ("long_at_the_upper_boundary", 9223372036854775807, '"long"'), 16 | ("long_at_the_lower_boundary", -9223372036854775808, '"long"'), 17 | ("interger_data_float_schema", 123, '"float"'), 18 | # booleans are considered an integer type? fascinating 19 | ("boolean_data_float_schema", True, '"float"'), 20 | ("boolean_data_integer_schema", True, '"int"'), 21 | ("optional_field", {"value": 100}, '{"fields": [{"type": ["null", "string"], "name": "id"}, {"type": "int", "name": "value"}], "type": "record", "name": "test_schema"}'), 22 | ("fixed", b'\x01\x01\x01\x01\x01\x01\x01\x01', '{"name": "testfix", "type": "fixed", "size": 8}'), 23 | ("make_sure_null_term_doesnt_break", b'\x01\x01\x00\x01\x01\x01\x01\x01', '{"name": "testfix", "type": "fixed", "size": 8}'), 24 | ) 25 | 26 | invalid_data = ( 27 | ("missing_required_field_1", {"value": 100}, '{"fields": [{"type": "string", "name": "id"}, {"type": "int", "name": "value"}], "type": "record", "name": "test_schema"}'), 28 | ("missing_required_field_2", {"id": "bork"}, '{"fields": [{"type": "string", "name": "id"}, {"type": "int", "name": "value"}], "type": "record", "name": "test_schema"}'), 29 | ("string_data_long_schema", u'boom!', '"long"'), 30 | ("string_data_boolean_schema", u"boom!", '"boolean"'), 31 | ("int_data_boolean_schema", 123, '"boolean"'), 32 | ("float_data_int_schema", 123.456, '"long"'), 33 | ("null_data_string_schema", None, '"string"'), 34 | ("null_data_int_schema", None, '"int"'), 35 | ("null_data_boolean_schema", None, '"boolean"'), 36 | ("mismatch_fixed_data_fixed_schema", b'\x97', '{"name": "testfix", "type": "fixed", "size": 8}'), 37 | ("int_too_big", 2147483648, '"int"'), 38 | ("int_too_small", -2147483649, '"int"'), 39 | ("long_too_big", 9223372036854775808, '"long"'), 40 | ("long_too_small", -9223372036854775809, '"long"'), 41 | ("wrong_data_in_array", [1, u'B'], '{"type": "array", "items": "int"}'), 42 | ) 43 | 44 | 45 | class TestValidData(unittest.TestCase): 46 | pass 47 | 48 | 49 | def create_good_case(schema, datum): 50 | write_schema = spavro.schema.parse(schema) 51 | def test_write_good_data(self): 52 | fastbuff = BytesIO() 53 | fastencoder = spavro.io.FastBinaryEncoder(fastbuff) 54 | fdw = FastDatumWriter(write_schema) 55 | fdw.write(datum, fastencoder) 56 | return test_write_good_data 57 | 58 | 59 | def create_exception_case(schema, datum): 60 | print(schema) 61 | write_schema = spavro.schema.parse(schema) 62 | def test_write_invalid_data(self): 63 | with self.assertRaises(AvroTypeException) as context: 64 | fastbuff = BytesIO() 65 | fastencoder = spavro.io.FastBinaryEncoder(fastbuff) 66 | fdw = FastDatumWriter(write_schema) 67 | fdw.write(datum, fastencoder) 68 | print(context.exception) 69 | return test_write_invalid_data 70 | 71 | 72 | def make_good_cases(cases): 73 | for name, 
datum, schema in cases: 74 | test_method = create_good_case(schema, datum) 75 | test_method.__name__ = 'test_good_data_{}'.format(name) 76 | setattr(TestValidData, test_method.__name__, test_method) 77 | 78 | 79 | def make_exception_cases(cases): 80 | for name, datum, schema in cases: 81 | test_method = create_exception_case(schema, datum) 82 | test_method.__name__ = 'test_invalid_data_{}'.format(name) 83 | setattr(TestValidData, test_method.__name__, test_method) 84 | 85 | 86 | make_good_cases(valid_data) 87 | make_exception_cases(invalid_data) 88 | -------------------------------------------------------------------------------- /test/test_script.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | import unittest 19 | import csv 20 | # import six 21 | from six import BytesIO, StringIO 22 | import re 23 | try: 24 | import json 25 | except ImportError: 26 | import simplejson as json 27 | from tempfile import NamedTemporaryFile 28 | import spavro.schema 29 | from spavro.io import DatumWriter 30 | from spavro.datafile import DataFileWriter 31 | from os.path import dirname, join, isfile 32 | from os import remove 33 | from operator import itemgetter 34 | 35 | SPACE_CHARS = re.compile(r'\s+') 36 | 37 | NUM_RECORDS = 7 38 | 39 | try: 40 | from subprocess import check_output 41 | except ImportError: 42 | from subprocess import Popen, PIPE 43 | 44 | def check_output(args): 45 | pipe = Popen(args, stdout=PIPE) 46 | if pipe.wait() != 0: 47 | raise ValueError 48 | return pipe.stdout.read() 49 | 50 | try: 51 | from subprocess import check_call 52 | except ImportError: 53 | def check_call(args, **kw): 54 | pipe = Popen(args, **kw) 55 | assert pipe.wait() == 0 56 | 57 | SCHEMA = ''' 58 | { 59 | "namespace": "test.avro", 60 | "name": "LooneyTunes", 61 | "type": "record", 62 | "fields": [ 63 | {"name": "first", "type": "string"}, 64 | {"name": "last", "type": "string"}, 65 | {"name": "type", "type": "string"} 66 | ] 67 | } 68 | ''' 69 | 70 | LOONIES = ( 71 | (u"daffy", u"duck", u"duck"), 72 | (u"bugs", u"bunny", u"bunny"), 73 | (u"tweety", u"", u"bird"), 74 | (u"road", u"runner", u"bird"), 75 | (u"wile", u"e", u"coyote"), 76 | (u"pepe", u"le pew", u"skunk"), 77 | (u"foghorn", u"leghorn", u"rooster"), 78 | ) 79 | 80 | def looney_records(): 81 | for f, l, t in LOONIES: 82 | yield {"first": f, "last": l, "type": t} 83 | 84 | SCRIPT = join(dirname(__file__), "..", "scripts", "avro") 85 | 86 | _JSON_PRETTY = '''{ 87 | "first": "daffy", 88 | "last": "duck", 89 | "type": "duck" 90 | }''' 91 | 92 | def gen_avro(filename): 93 | schema = spavro.schema.parse(SCHEMA) 94 | fo = open(filename, 
"wb") 95 | writer = DataFileWriter(fo, DatumWriter(), schema) 96 | for record in looney_records(): 97 | writer.append(record) 98 | writer.close() 99 | fo.close() 100 | 101 | 102 | def tempfile(): 103 | return NamedTemporaryFile(delete=False).name 104 | 105 | 106 | class TestCat(unittest.TestCase): 107 | def setUp(self): 108 | self.avro_file = tempfile() 109 | gen_avro(self.avro_file) 110 | 111 | def tearDown(self): 112 | if isfile(self.avro_file): 113 | remove(self.avro_file) 114 | 115 | def _run(self, *args, **kw): 116 | out = check_output([SCRIPT, "cat", self.avro_file] + list(args)).decode('utf-8') 117 | # subprocess returns bytes, decode into utf8 strings 118 | # Note: should this be ASCII instead? 119 | if kw.get("raw"): 120 | return out 121 | else: 122 | return out.splitlines() 123 | 124 | def test_print(self): 125 | return len(self._run()) == NUM_RECORDS 126 | 127 | def test_filter(self): 128 | return len(self._run("--filter", "r['type']=='bird'")) == 2 129 | 130 | def test_skip(self): 131 | skip = 3 132 | return len(self._run("--skip", str(skip))) == NUM_RECORDS - skip 133 | 134 | def test_csv(self): 135 | raw = self._run("-f", "csv", raw=True) 136 | reader = csv.reader(StringIO(raw)) 137 | assert len(list(reader)) == NUM_RECORDS 138 | 139 | def test_csv_header(self): 140 | '''Test the CSV header is processed correctly''' 141 | io = StringIO(self._run("-f", "csv", "--header", raw=True)) 142 | reader = csv.DictReader(io) 143 | self.assertEqual(reader.fieldnames, ['first', 'last', 'type']) 144 | 145 | def test_print_schema(self): 146 | out = self._run("--print-schema", raw=True) 147 | assert json.loads(out)["namespace"] == "test.avro" 148 | 149 | def test_help(self): 150 | # Just see we have these 151 | self._run("-h") 152 | self._run("--help") 153 | 154 | def test_json_pretty(self): 155 | out = self._run("--format", "json-pretty", "-n", "1", raw=1) 156 | clean_out = SPACE_CHARS.sub('', out) 157 | example = SPACE_CHARS.sub('', _JSON_PRETTY) 158 | self.assertEqual(example, clean_out) 159 | 160 | def test_version(self): 161 | check_output([SCRIPT, "cat", "--version"]).decode('utf-8') 162 | 163 | def test_files(self): 164 | out = self._run(self.avro_file) 165 | assert len(out) == 2 * NUM_RECORDS 166 | 167 | def test_fields(self): 168 | # One field selection (no comma) 169 | out = self._run('--fields', 'last') 170 | assert json.loads(out[0]) == {'last': 'duck'} 171 | 172 | # Field selection (with comma and space) 173 | out = self._run('--fields', 'first, last') 174 | assert json.loads(out[0]) == {'first': 'daffy', 'last': 'duck'} 175 | 176 | # Empty fields should get all 177 | out = self._run('--fields', '') 178 | assert json.loads(out[0]) == \ 179 | {'first': 'daffy', 'last': 'duck', 'type': 'duck'} 180 | 181 | # Non existing fields are ignored 182 | out = self._run('--fields', 'first,last,age') 183 | assert json.loads(out[0]) == {'first': 'daffy', 'last': 'duck'} 184 | 185 | 186 | class TestWrite(unittest.TestCase): 187 | def setUp(self): 188 | self.json_file = tempfile() + ".json" 189 | fo = open(self.json_file, "w") 190 | for record in looney_records(): 191 | json.dump(record, fo) 192 | fo.write("\n") 193 | fo.close() 194 | 195 | self.csv_file = tempfile() + ".csv" 196 | fo = open(self.csv_file, "w") 197 | write = csv.writer(fo).writerow 198 | get = itemgetter("first", "last", "type") 199 | for record in looney_records(): 200 | write(get(record)) 201 | fo.close() 202 | 203 | self.schema_file = tempfile() 204 | fo = open(self.schema_file, "w") 205 | fo.write(SCHEMA) 206 | fo.close() 
207 | 
208 |     def tearDown(self):
209 |         for filename in (self.csv_file, self.json_file, self.schema_file):
210 |             try:
211 |                 remove(filename)
212 |             except OSError:
213 |                 continue
214 | 
215 |     def _run(self, *args, **kw):
216 |         args = [SCRIPT, "write", "--schema", self.schema_file] + list(args)
217 |         check_call(args, **kw)
218 | 
219 |     def load_avro(self, filename):
220 |         out = check_output([SCRIPT, "cat", filename]).decode('utf-8')
221 |         return map(json.loads, out.splitlines())
222 | 
223 |     def test_version(self):
224 |         check_call([SCRIPT, "write", "--version"])
225 | 
226 |     def format_check(self, format, filename):
227 |         tmp = tempfile()
228 |         fo = open(tmp, "wb")
229 |         self._run(filename, "-f", format, stdout=fo)
230 |         fo.close()
231 | 
232 |         records = list(self.load_avro(tmp))
233 |         assert len(records) == NUM_RECORDS
234 |         assert records[0]["first"] == "daffy"
235 | 
236 |         remove(tmp)
237 | 
238 |     def test_write_json(self):
239 |         self.format_check("json", self.json_file)
240 | 
241 |     def test_write_csv(self):
242 |         self.format_check("csv", self.csv_file)
243 | 
244 |     def test_outfile(self):
245 |         tmp = tempfile()
246 |         remove(tmp)
247 |         self._run(self.json_file, "-o", tmp)
248 | 
249 |         assert len(list(self.load_avro(tmp))) == NUM_RECORDS
250 |         remove(tmp)
251 | 
252 |     def test_multi_file(self):
253 |         tmp = tempfile()
254 |         fo = open(tmp, "wb")
255 |         self._run(self.json_file, self.json_file, stdout=fo)
256 |         fo.close()
257 |         assert len(list(self.load_avro(tmp))) == 2 * NUM_RECORDS
258 |         remove(tmp)
259 | 
260 |     def test_stdin(self):
261 |         tmp = tempfile()
262 | 
263 |         info = open(self.json_file, "rb")
264 |         fo = open(tmp, "wb")
265 |         self._run("--input-type", "json", stdin=info, stdout=fo)
266 |         fo.close()
267 | 
268 |         assert len(list(self.load_avro(tmp))) == NUM_RECORDS
269 |         remove(tmp)
-------------------------------------------------------------------------------- /test/test_union.py: --------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Pluralsight LLC
2 | 
3 | import unittest
4 | from six import BytesIO
5 | import spavro.schema
6 | import spavro.io
7 | from spavro.io import FastDatumWriter
8 | # from spavro.io import SlowDatumWriter as FastDatumWriter
9 | # from spavro.exceptions import AvroTypeException
10 | 
11 | 
12 | simple_write_cases = (
13 |     ("float_and_null", 3.14159, '["null", "float"]', b"\x02\xd0\x0fI@"),
14 |     ("float_and_double", 3.14159, '["null", "double"]', b"\x02n\x86\x1b\xf0\xf9!\t@"),
15 |     ("float_and_null_with_int", 150, '["null", "float"]', b'\x02\x00\x00\x16C'),
16 |     ("promote_int_to_float", 314159, '["null", "float"]', b"\x02\xe0e\x99H"),
17 |     ("dont_promote_int_to_float", 314159, '["null", "float", "int"]', b"\x04\xde\xac&"),
18 |     ("promote_int_to_double", 314159, '["null", "double"]', b"\x02\x00\x00\x00\x00\xbc,\x13A"),
19 |     ("dont_promote_int_to_double", 314159, '["null", "double", "int"]', b"\x04\xde\xac&"),
20 |     ("promote_string_to_bytes", u"testing123", '["null", "bytes"]', b"\x02\x14testing123"),
21 |     ("dont_promote_string_to_bytes", u"testing123", '["null", "bytes", "string"]', b"\x04\x14testing123")
22 | )
23 | 
24 | 
25 | class TestUnionWriter(unittest.TestCase):
26 |     pass
27 | 
28 | 
29 | def create_write_case(schema, datum, expected):
30 |     write_schema = spavro.schema.parse(schema)
31 | 
32 |     def test_write_good_data(self):
33 |         fastbuff = BytesIO()
34 |         fastencoder = spavro.io.FastBinaryEncoder(fastbuff)
35 |         fdw = FastDatumWriter(write_schema)
36 |         fdw.write(datum, fastencoder)
37 |         self.assertEqual(fastbuff.getvalue(), expected)
38 | 
return test_write_good_data 39 | 40 | 41 | def make_write_cases(cases): 42 | for name, datum, schema, expected in cases: 43 | test_method = create_write_case(schema, datum, expected) 44 | test_method.__name__ = 'test_simple_union_write_{}'.format(name) 45 | setattr(TestUnionWriter, test_method.__name__, test_method) 46 | 47 | 48 | make_write_cases(simple_write_cases) 49 | -------------------------------------------------------------------------------- /test/test_write_read_schema_resolver.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Pluralsight LLC 2 | 3 | import unittest 4 | from spavro.schema_resolve import resolve 5 | from spavro.exceptions import SchemaResolutionException 6 | 7 | 8 | class TestResolver(unittest.TestCase): 9 | pass 10 | 11 | 12 | pass_cases = ( 13 | ("simple_null", "null", "null", "null"), 14 | ("simple_int", "int", "int", "int"), 15 | ("simple_long", "long", "long", "long"), 16 | ("simple_float", "float", "float", "float"), 17 | ("promote_int_to_long", "int", "long", "int"), 18 | ("promote_int_to_double", "int", "double", "int"), 19 | ("promote_float_to_double", "float", "double", "float"), 20 | ("promote_long_to_double", "long", "double", "long"), 21 | ("record_upgrade_to_union_and_default_field", {"fields": [{"default": "FOO", 22 | "type": {"symbols": ["FOO", "BAR"], 23 | "namespace": "", 24 | "type": "enum", 25 | "name": "F"}, 26 | "name": "H"} 27 | ], 28 | "type": "record", 29 | "name": "Test"}, 30 | ["int", 31 | {"fields": [{"default": "FOO", 32 | "type": {"symbols": ["FOO", "BAR"], 33 | "namespace": "", 34 | "type": "enum", 35 | "name": "F"}, 36 | "name": "H"}, 37 | {"name": "spork", 38 | "type": "int", 39 | "default": 1234} 40 | ], 41 | "type": "record", 42 | "name": "Test"}], 43 | {'fields': [{"type": {"symbols": ["FOO", "BAR"], 44 | "type": "enum", 45 | "name": "F"}, 46 | "name": "H"}, 47 | {'type': {'type': 'default', 'value': 1234}, 48 | 'name': 'spork'}], 49 | 'type': 'record', 50 | 'name': 'Test'}), 51 | ("symbol_added_to_reader_enum", 52 | {"type": "enum", "name": "bigby", "symbols": ["A", "C"]}, 53 | {"type": "enum", "name": "bigby", "symbols": ["A", "B", "C"]}, 54 | {'symbols': ['A', 'C'], 'type': 'enum', 'name': 'bigby'}), 55 | ("array_items_upgraded_to_union", 56 | {"type": "array", "items": "string"}, 57 | {"type": "array", "items": ["int", "string"]}, 58 | {'items': 'string', 'type': 'array'}) 59 | ) 60 | 61 | exception_cases = ( 62 | ("null_vs_int", "null", "int", SchemaResolutionException), 63 | ("boolean_vs_int", "boolean", "int", SchemaResolutionException), 64 | ("lower_precision_promote_long_int", "long", "int", SchemaResolutionException), 65 | ("lower_precision_promote_double_float", "double", "float", SchemaResolutionException), 66 | ("missing_symbol_in_read", 67 | {"type": "enum", "name": "bigby", "symbols": ["A", "C"]}, 68 | {"type": "enum", "name": "bigby", "symbols": ["A", "B"]}, SchemaResolutionException), 69 | ("union_missing_write_schema", 70 | "int", ["string", "boolean"], SchemaResolutionException), 71 | ("record_names_dont_match", 72 | {"type": "record", "name": "my_name", "fields": [{"type": "int", "name": "A"}]}, 73 | {"type": "record", "name": "not_my_name", "fields": [{"type": "int", "name": "A"}]}, 74 | SchemaResolutionException), 75 | ("record_field_types_dont_match", 76 | {"type": "record", "name": "my_name", "fields": [{"type": "string", "name": "A"}]}, 77 | {"type": "record", "name": "my_name", "fields": [{"type": "int", "name": "A"}]}, 78 | 
SchemaResolutionException), 79 | ("record_new_field_no_default", 80 | {"type": "record", "name": "my_name", "fields": [{"type": "string", "name": "A"}]}, 81 | {"type": "record", "name": "my_name", "fields": [{"type": "int", "name": "A"}, 82 | {"type": "int", "name": "B"}]}, 83 | SchemaResolutionException) 84 | ) 85 | 86 | 87 | def create_pass_case(writer, reader, expected): 88 | def resolve_write_reader(self): 89 | resolved = resolve(writer, reader) 90 | self.assertEqual(resolved, expected) 91 | return resolve_write_reader 92 | 93 | 94 | def create_exception_case(writer, reader, exception): 95 | def resolve_write_reader(self): 96 | with self.assertRaises(exception) as context: 97 | resolved = resolve(writer, reader) 98 | return resolve_write_reader 99 | 100 | 101 | def make_cases(cases): 102 | for name, writer, reader, expected in cases: 103 | test_method = create_pass_case(writer, reader, expected) 104 | test_method.__name__ = 'test_schema_resolution_{}'.format(name) 105 | setattr(TestResolver, test_method.__name__, test_method) 106 | 107 | 108 | def make_exception_cases(cases): 109 | for name, writer, reader, expected in cases: 110 | test_method = create_exception_case(writer, reader, expected) 111 | test_method.__name__ = 'test_incompatible_schema_{}'.format(name) 112 | setattr(TestResolver, test_method.__name__, test_method) 113 | 114 | 115 | make_cases(pass_cases) 116 | make_exception_cases(exception_cases) 117 | -------------------------------------------------------------------------------- /test/txsample_http_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Modifications copyright (C) 2017 Pluralsight LLC 4 | # 5 | # Licensed to the Apache Software Foundation (ASF) under one 6 | # or more contributor license agreements. See the NOTICE file 7 | # distributed with this work for additional information 8 | # regarding copyright ownership. The ASF licenses this file 9 | # to you under the Apache License, Version 2.0 (the 10 | # "License"); you may not use this file except in compliance 11 | # with the License. You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
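
The pass and exception tables above drive a single entry point, and calling it directly shows the promotion rules at work. A small sketch grounded in the promote_int_to_long case:

from spavro.schema_resolve import resolve
from spavro.exceptions import SchemaResolutionException

# A writer's "int" is readable by a "long" reader; the resolved schema keeps
# the writer's encoding, so the reader decodes int-encoded bytes.
assert resolve("int", "long") == "int"

# Incompatible pairs raise, e.g. a "null" writer against an "int" reader:
try:
    resolve("null", "int")
except SchemaResolutionException:
    pass
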
20 | import sys
21 | 
22 | from twisted.internet import reactor, defer
23 | from twisted.python.util import println
24 | 
25 | from spavro import protocol
26 | from spavro import txipc
27 | 
28 | MAIL_PROTOCOL_JSON = """\
29 | {"namespace": "example.proto",
30 |  "protocol": "Mail",
31 | 
32 |  "types": [
33 |      {"name": "Message", "type": "record",
34 |       "fields": [
35 |           {"name": "to", "type": "string"},
36 |           {"name": "from", "type": "string"},
37 |           {"name": "body", "type": "string"}
38 |       ]
39 |      }
40 |  ],
41 | 
42 |  "messages": {
43 |      "send": {
44 |          "request": [{"name": "message", "type": "Message"}],
45 |          "response": "string"
46 |      },
47 |      "replay": {
48 |          "request": [],
49 |          "response": "string"
50 |      }
51 |  }
52 | }
53 | """
54 | MAIL_PROTOCOL = protocol.parse(MAIL_PROTOCOL_JSON)
55 | SERVER_HOST = 'localhost'
56 | SERVER_PORT = 9090
57 | 
58 | class UsageError(Exception):
59 |     def __init__(self, value):
60 |         self.value = value
61 |     def __str__(self):
62 |         return repr(self.value)
63 | 
64 | def make_requestor(server_host, server_port, protocol):
65 |     client = txipc.TwistedHTTPTransceiver(server_host, server_port)
66 |     return txipc.TwistedRequestor(protocol, client)
67 | 
68 | if __name__ == '__main__':
69 |     if len(sys.argv) not in [4, 5]:
70 |         raise UsageError("Usage: <to> <from> <body> [<num_messages>]")
71 | 
72 |     # client code - attach to the server and send a message
73 |     # fill in the Message record
74 |     message = dict()
75 |     message['to'] = sys.argv[1]
76 |     message['from'] = sys.argv[2]
77 |     message['body'] = sys.argv[3]
78 | 
79 |     try:
80 |         num_messages = int(sys.argv[4])
81 |     except (IndexError, ValueError):
82 |         num_messages = 1
83 | 
84 |     # build the parameters for the request
85 |     params = {}
86 |     params['message'] = message
87 | 
88 |     requests = []
89 |     # send the requests and print the result
90 |     for msg_count in range(num_messages):
91 |         requestor = make_requestor(SERVER_HOST, SERVER_PORT, MAIL_PROTOCOL)
92 |         d = requestor.request('send', params)
93 |         d.addCallback(lambda result: println("Result: " + result))
94 |         requests.append(d)
95 |     results = defer.gatherResults(requests)
96 | 
97 |     def replay_cb(result):
98 |         print("Replay Result: " + result)
99 |         reactor.stop()
100 | 
101 |     def replay(_):
102 |         # try out a replay message
103 |         requestor = make_requestor(SERVER_HOST, SERVER_PORT, MAIL_PROTOCOL)
104 |         d = requestor.request('replay', dict())
105 |         d.addCallback(replay_cb)
106 | 
107 |     results.addCallback(replay)
108 |     reactor.run()
-------------------------------------------------------------------------------- /test/txsample_http_server.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Modifications copyright (C) 2017 Pluralsight LLC
4 | #
5 | # Licensed to the Apache Software Foundation (ASF) under one
6 | # or more contributor license agreements.  See the NOTICE file
7 | # distributed with this work for additional information
8 | # regarding copyright ownership.  The ASF licenses this file
9 | # to you under the Apache License, Version 2.0 (the
10 | # "License"); you may not use this file except in compliance
11 | # with the License.  You may obtain a copy of the License at
12 | #
13 | #     http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | from twisted.web import server 21 | from twisted.internet import reactor 22 | 23 | from spavro import ipc 24 | from spavro import protocol 25 | from spavro import txipc 26 | 27 | MAIL_PROTOCOL_JSON = """\ 28 | {"namespace": "example.proto", 29 | "protocol": "Mail", 30 | 31 | "types": [ 32 | {"name": "Message", "type": "record", 33 | "fields": [ 34 | {"name": "to", "type": "string"}, 35 | {"name": "from", "type": "string"}, 36 | {"name": "body", "type": "string"} 37 | ] 38 | } 39 | ], 40 | 41 | "messages": { 42 | "send": { 43 | "request": [{"name": "message", "type": "Message"}], 44 | "response": "string" 45 | }, 46 | "replay": { 47 | "request": [], 48 | "response": "string" 49 | } 50 | } 51 | } 52 | """ 53 | MAIL_PROTOCOL = protocol.parse(MAIL_PROTOCOL_JSON) 54 | SERVER_ADDRESS = ('localhost', 9090) 55 | 56 | class MailResponder(ipc.Responder): 57 | def __init__(self): 58 | ipc.Responder.__init__(self, MAIL_PROTOCOL) 59 | 60 | def invoke(self, message, request): 61 | if message.name == 'send': 62 | request_content = request['message'] 63 | response = "Sent message to %(to)s from %(from)s with body %(body)s" % \ 64 | request_content 65 | return response 66 | elif message.name == 'replay': 67 | return 'replay' 68 | 69 | if __name__ == '__main__': 70 | root = server.Site(txipc.AvroResponderResource(MailResponder())) 71 | reactor.listenTCP(9090, root) 72 | reactor.run() 73 | -------------------------------------------------------------------------------- /test/word_count_task.py: -------------------------------------------------------------------------------- 1 | # Modifications copyright (C) 2017 Pluralsight LLC 2 | 3 | """ 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | """ 20 | 21 | __all__=["WordCountTask"] 22 | 23 | from spavro.tether import TetherTask 24 | 25 | import logging 26 | 27 | #TODO::Make the logging level a parameter we can set 28 | #logging.basicConfig(level=logging.INFO) 29 | class WordCountTask(TetherTask): 30 | """ 31 | Implements the mappper and reducer for the word count example 32 | """ 33 | 34 | def __init__(self): 35 | """ 36 | """ 37 | 38 | inschema="""{"type":"string"}""" 39 | midschema="""{"type":"record", "name":"Pair","namespace":"org.apache.avro.mapred","fields":[ 40 | {"name":"key","type":"string"}, 41 | {"name":"value","type":"long","order":"ignore"}] 42 | }""" 43 | outschema=midschema 44 | TetherTask.__init__(self,inschema,midschema,outschema) 45 | 46 | 47 | #keep track of the partial sums of the counts 48 | self.psum=0 49 | 50 | 51 | def map(self,record,collector): 52 | """Implement the mapper for the word count example 53 | 54 | Parameters 55 | ---------------------------------------------------------------------------- 56 | record - The input record 57 | collector - The collector to collect the output 58 | """ 59 | 60 | words=record.split() 61 | 62 | for w in words: 63 | logging.info("WordCountTask.Map: word={0}".format(w)) 64 | collector.collect({"key":w,"value":1}) 65 | 66 | def reduce(self,record, collector): 67 | """Called with input values to generate reducer output. Inputs are sorted by the mapper 68 | key. 69 | 70 | The reduce function is invoked once for each value belonging to a given key outputted 71 | by the mapper. 72 | 73 | Parameters 74 | ---------------------------------------------------------------------------- 75 | record - The mapper output 76 | collector - The collector to collect the output 77 | """ 78 | 79 | self.psum+=record["value"] 80 | 81 | def reduceFlush(self,record, collector): 82 | """ 83 | Called with the last intermediate value in each equivalence run. 84 | In other words, reduceFlush is invoked once for each key produced in the reduce 85 | phase. It is called after reduce has been invoked on each value for the given key. 86 | 87 | Parameters 88 | ------------------------------------------------------------------ 89 | record - the last record on which reduce was invoked. 90 | """ 91 | 92 | #collect the current record 93 | logging.info("WordCountTask.reduceFlush key={0} value={1}".format(record["key"],self.psum)) 94 | 95 | collector.collect({"key":record["key"],"value":self.psum}) 96 | 97 | #reset the sum 98 | self.psum=0 99 | -------------------------------------------------------------------------------- /testdata/data/schema-tests.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
--------------------------------------------------------------------------------
/testdata/data/schema-tests.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | // NOTE: the Java implementation provides a slow-but-direct implementation
18 | // of the fingerprinting algorithm which is used to cross-check the
19 | // "fingerprint" values below. Thus, the Java unit-tests provide validation
20 | // for these values, so other languages can just assume they are correct.
21 | 
22 | 
23 | // 000
24 | 
--------------------------------------------------------------------------------
/testdata/interop/bin/test_rpc_interop.sh:
--------------------------------------------------------------------------------
< $portfile &
64 | count=0
65 | while [ ! -s $portfile ]
66 | do
67 |   sleep 1
68 |   if [ $count -ge 10 ]
69 |   then
70 |     echo $server did not start.
71 |     exit 1
72 |   fi
73 |   count=`expr $count + 1`
74 | done
75 | read ignore port < $portfile
76 | $client http://127.0.0.1:$port $proto $msg -file $c/request.avro
77 | wait
78 | done
79 | done
80 | done
81 | done
82 | 
83 | echo RPC INTEROP TESTS PASS
84 | 
--------------------------------------------------------------------------------
/testdata/interop/rpc/add/onePlusOne/request.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/add/onePlusOne/request.avro
--------------------------------------------------------------------------------
/testdata/interop/rpc/add/onePlusOne/response.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/add/onePlusOne/response.avro
--------------------------------------------------------------------------------
/testdata/interop/rpc/echo/foo/request.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/echo/foo/request.avro
--------------------------------------------------------------------------------
/testdata/interop/rpc/echo/foo/response.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/echo/foo/response.avro
--------------------------------------------------------------------------------
/testdata/interop/rpc/hello/world/request.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/hello/world/request.avro
--------------------------------------------------------------------------------
/testdata/interop/rpc/hello/world/response.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pluralsight/spavro/74ef7d8aa241f148dead0ffc2e402c815432a542/testdata/interop/rpc/hello/world/response.avro
--------------------------------------------------------------------------------
/testdata/schemas/BulkData.avpr:
--------------------------------------------------------------------------------
1 | 
2 | {"namespace": "org.apache.avro.test",
3 |  "protocol": "BulkData",
4 | 
5 |  "types": [],
6 | 
7 |  "messages": {
8 | 
9 |      "read": {
10 |          "request": [],
11 |          "response": "bytes"
12 |      },
13 | 
14 |      "write": {
15 |          "request": [ {"name": "data", "type": "bytes"} ],
16 |          "response": "null"
17 |      }
18 | 
19 |  }
20 | 
21 | }
22 | 
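A .avpr file like BulkData above loads the same way as the inline protocol
JSON in the samples earlier; a minimal sketch (file path relative to the
repository root, and the message-name listing assumes avro's dict-style
`messages` attribute on the parsed protocol):

    from spavro import protocol

    bulk = protocol.parse(open('testdata/schemas/BulkData.avpr').read())
    print(sorted(bulk.messages))  # expect ['read', 'write']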
--------------------------------------------------------------------------------
/testdata/schemas/FooBarSpecificRecord.avsc:
--------------------------------------------------------------------------------
1 | {
2 |     "type": "record",
3 |     "name": "FooBarSpecificRecord",
4 |     "namespace": "org.apache.avro",
5 |     "fields": [
6 |         {"name": "id", "type": "int"},
7 |         {"name": "name", "type": "string"},
8 |         {"name": "nicknames", "type":
9 |             {"type": "array", "items": "string"}},
10 |         {"name": "relatedids", "type":
11 |             {"type": "array", "items": "int"}},
12 |         {"name": "typeEnum", "type":
13 |             ["null", {
14 |                 "type": "enum",
15 |                 "name": "TypeEnum",
16 |                 "namespace": "org.apache.avro",
17 |                 "symbols" : ["a","b", "c"]
18 |             }],
19 |             "default": null
20 |         }
21 |     ]
22 | }
23 | 
--------------------------------------------------------------------------------
/testdata/schemas/contexts.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | @version("1.0.5")
20 | @namespace("org.apache.avro.ipc.specific")
21 | protocol Contexts {
22 |   record HomePage {
23 |   }
24 | 
25 |   record ProductPage {
26 |     string product;
27 |   }
28 | 
29 |   record CartPage {
30 |     array<string> productsInCart;
31 |   }
32 | 
33 |   record UnknownPage {
34 |   }
35 | 
36 |   record PageView {
37 |     long datetime;
38 |     union {UnknownPage, HomePage, ProductPage, CartPage} pageContext;
39 |   }
40 | 
41 | }
42 | 
--------------------------------------------------------------------------------
/testdata/schemas/echo.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 | */ 18 | 19 | @namespace("org.apache.avro.echo") 20 | protocol Echo { 21 | record Ping { 22 | long timestamp = -1; 23 | string text = ""; 24 | } 25 | 26 | record Pong { 27 | long timestamp = -1; 28 | Ping ping; 29 | } 30 | 31 | Pong ping(Ping ping); 32 | } 33 | -------------------------------------------------------------------------------- /testdata/schemas/http.avdl: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** NOTE: This structure was inspired by HTTP and deliberately skewed to get the effects that needed testing */ 20 | 21 | @namespace("org.apache.avro.test.http") 22 | protocol Http { 23 | 24 | enum NetworkType { 25 | IPv4, 26 | IPv6 27 | } 28 | 29 | record NetworkConnection { 30 | NetworkType networkType; 31 | string networkAddress; 32 | } 33 | 34 | record UserAgent { 35 | union { null, string } id = null; 36 | string useragent; 37 | } 38 | 39 | enum HttpMethod { 40 | GET, 41 | POST 42 | } 43 | 44 | record QueryParameter { 45 | string name; 46 | union { null, string } value; // Sometimes there is no value. 
47 |   }
48 | 
49 |   record HttpURI {
50 |     HttpMethod method;
51 |     string path;
52 |     array<QueryParameter> parameters = [];
53 |   }
54 | 
55 |   record HttpRequest {
56 |     UserAgent userAgent;
57 |     HttpURI URI;
58 |   }
59 | 
60 |   record Request {
61 |     long timestamp;
62 |     NetworkConnection connection;
63 |     HttpRequest httpRequest;
64 |   }
65 | 
66 | }
67 | 
--------------------------------------------------------------------------------
/testdata/schemas/interop.avsc:
--------------------------------------------------------------------------------
1 | {"type": "record", "name":"Interop", "namespace": "org.apache.avro",
2 |   "fields": [
3 |       {"name": "intField", "type": "int"},
4 |       {"name": "longField", "type": "long"},
5 |       {"name": "stringField", "type": "string"},
6 |       {"name": "boolField", "type": "boolean"},
7 |       {"name": "floatField", "type": "float"},
8 |       {"name": "doubleField", "type": "double"},
9 |       {"name": "bytesField", "type": "bytes"},
10 |       {"name": "nullField", "type": "null"},
11 |       {"name": "arrayField", "type": {"type": "array", "items": "double"}},
12 |       {"name": "mapField", "type":
13 |        {"type": "map", "values":
14 |         {"type": "record", "name": "Foo",
15 |          "fields": [{"name": "label", "type": "string"}]}}},
16 |       {"name": "unionField", "type":
17 |        ["boolean", "double", {"type": "array", "items": "bytes"}]},
18 |       {"name": "enumField", "type":
19 |        {"type": "enum", "name": "Kind", "symbols": ["A","B","C"]}},
20 |       {"name": "fixedField", "type":
21 |        {"type": "fixed", "name": "MD5", "size": 16}},
22 |       {"name": "recordField", "type":
23 |        {"type": "record", "name": "Node",
24 |         "fields": [
25 |             {"name": "label", "type": "string"},
26 |             {"name": "children", "type": {"type": "array", "items": "Node"}}]}}
27 |   ]
28 | }
29 | 
--------------------------------------------------------------------------------
/testdata/schemas/mail.avpr:
--------------------------------------------------------------------------------
1 | {"namespace": "org.apache.avro.test",
2 |  "protocol": "Mail",
3 | 
4 |  "types": [
5 |      {"name": "Message", "type": "record",
6 |       "fields": [
7 |           {"name": "to", "type": "string"},
8 |           {"name": "from", "type": "string"},
9 |           {"name": "body", "type": "string"}
10 |       ]
11 |      }
12 |  ],
13 | 
14 |  "messages": {
15 |      "send": {
16 |          "request": [{"name": "message", "type": "Message"}],
17 |          "response": "string"
18 |      },
19 |      "fireandforget": {
20 |          "request": [{"name": "message", "type": "Message"}],
21 |          "response": "null",
22 |          "one-way": true
23 |      }
24 | 
25 |  }
26 | }
27 | 
--------------------------------------------------------------------------------
/testdata/schemas/namespace.avpr:
--------------------------------------------------------------------------------
1 | {"namespace": "org.apache.avro.test.namespace",
2 |  "protocol": "TestNamespace",
3 | 
4 |  "types": [
5 |      {"name": "org.apache.avro.test.util.MD5", "type": "fixed", "size": 16},
6 |      {"name": "TestRecord", "type": "record",
7 |       "fields": [ {"name": "hash", "type": "org.apache.avro.test.util.MD5"} ]
8 |      },
9 |      {"name": "TestError", "namespace": "org.apache.avro.test.errors",
10 |       "type": "error", "fields": [ {"name": "message", "type": "string"} ]
11 |      }
12 |  ],
13 | 
14 |  "messages": {
15 |      "echo": {
16 |          "request": [{"name": "record", "type": "TestRecord"}],
17 |          "response": "TestRecord"
18 |      },
19 | 
20 |      "error": {
21 |          "request": [],
22 |          "response": "null",
23 |          "errors": ["org.apache.avro.test.errors.TestError"]
24 |      }
25 | 
26 |  }
27 | 
28 | }
29 | 
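The fully qualified reference "org.apache.avro.test.util.MD5" above is what
lets the fixed type live in a different namespace than the enclosing
protocol; a quick check, assuming the parsed protocol exposes avro-style
`types` objects with a `fullname` attribute:

    from spavro import protocol

    ns = protocol.parse(open('testdata/schemas/namespace.avpr').read())
    print([t.fullname for t in ns.types])
    # the MD5 fixed type should keep its org.apache.avro.test.util namespace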
--------------------------------------------------------------------------------
/testdata/schemas/nestedNullable.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | @namespace("org.apache.avro.test.nullable")
20 | protocol Nullable {
21 | 
22 |   enum MyEnum {
23 |     One,
24 |     Two
25 |   }
26 | 
27 |   record SubRecord {
28 |     string value;
29 |   }
30 | 
31 |   record RecordWithNullables {
32 |     union { null, string } nullableString = null;
33 |     union { null, long } nullableLong = null;
34 |     union { null, int } nullableInt = null;
35 |     union { null, map<long> } nullableMap = null;
36 |     union { null, array<long> } nullableArray = null;
37 |     union { null, SubRecord } nullableRecord = null;
38 |     union { null, MyEnum } nullableEnum = null;
39 |   }
40 | 
41 | }
42 | 
--------------------------------------------------------------------------------
/testdata/schemas/reserved.avsc:
--------------------------------------------------------------------------------
1 | {"name": "org.apache.avro.test.Reserved", "type": "enum",
2 |  "symbols": ["default","class","int"]}
3 | 
--------------------------------------------------------------------------------
/testdata/schemas/schemaevolution.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * A few simple test schemas for testing schema evolution of the IDL-generated classes
21 |  */
22 | @namespace("org.apache.avro.compiler.schema.evolve")
23 | protocol SchemaEvolveTesting {
24 |   record TestRecord1 {
25 |     string name;
26 |     long value;
27 |   }
28 | 
29 |   record TestRecord2 {
30 |     string name;
31 |     long value;
32 |     string data;
33 |   }
34 | 
35 |   record TestRecord3 {
36 |     string name;
37 |     string data;
38 |   }
39 | 
40 |   record NestedEvolve1 {
41 |     string rootName;
42 |     TestRecord1 nested;
43 |   }
44 | 
45 |   record NestedEvolve2 {
46 |     string rootName;
47 |     TestRecord2 nested;
48 |   }
49 | 
50 |   record NestedEvolve3 {
51 |     string rootName;
52 |     TestRecord3 nested;
53 |   }
54 | 
55 | }
56 | 
--------------------------------------------------------------------------------
/testdata/schemas/simple.avpr:
--------------------------------------------------------------------------------
1 | {"namespace": "org.apache.avro.test",
2 |  "protocol": "Simple",
3 |  "doc": "Protocol used for testing.",
4 |  "version" : "1.6.2",
5 |  "javaAnnotation": ["javax.annotation.Generated(\"avro\")",
6 |                     "org.apache.avro.TestAnnotation"],
7 | 
8 |  "types": [
9 |      {"name": "Kind", "type": "enum", "symbols": ["FOO","BAR","BAZ"],
10 |       "javaAnnotation": "org.apache.avro.TestAnnotation"},
11 | 
12 |      {"name": "MD5", "type": "fixed", "size": 16,
13 |       "javaAnnotation": "org.apache.avro.TestAnnotation"},
14 | 
15 |      {"name": "TestRecord", "type": "record",
16 |       "javaAnnotation": "org.apache.avro.TestAnnotation",
17 |       "fields": [
18 |           {"name": "name", "type": "string", "order": "ignore",
19 |            "javaAnnotation": "org.apache.avro.TestAnnotation"},
20 |           {"name": "kind", "type": "Kind", "order": "descending"},
21 |           {"name": "hash", "type": "MD5"}
22 |       ]
23 |      },
24 | 
25 |      {"name": "TestError", "type": "error", "fields": [
26 |          {"name": "message", "type": "string"}
27 |      ]
28 |      },
29 | 
30 |      {"name": "TestRecordWithUnion", "type": "record",
31 |       "fields": [
32 |           {"name": "kind", "type": ["null", "Kind"]},
33 |           {"name": "value", "type": ["null", "string"]}
34 |       ]
35 |      }
36 | 
37 |  ],
38 | 
39 |  "messages": {
40 | 
41 |      "hello": {
42 |          "doc": "Send a greeting",
43 |          "request": [{"name": "greeting", "type": "string", "aliases" : [ "salute" ], "customProp" : "customValue"}],
44 |          "response": "string"
45 |      },
46 | 
47 |      "echo": {
48 |          "doc": "Pretend you're in a cave!",
49 |          "request": [{"name": "record", "type": "TestRecord"}],
50 |          "response": "TestRecord"
51 |      },
52 | 
53 |      "add": {
54 |          "specialProp" : "test",
55 |          "request": [{"name": "arg1", "type": "int"}, {"name": "arg2", "type": "int"}],
56 |          "response": "int"
57 |      },
58 | 
59 |      "echoBytes": {
60 |          "request": [{"name": "data", "type": "bytes"}],
61 |          "response": "bytes"
62 |      },
63 | 
64 |      "error": {
65 |          "doc": "Always throws an error.",
66 |          "request": [],
67 |          "response": "null",
68 |          "errors": ["TestError"]
69 |      },
70 | 
71 |      "ack": {
72 |          "doc": "Send a one way message",
73 |          "request": [],
74 |          "response": "null",
75 |          "one-way": true,
76 |          "javaAnnotation": "org.apache.avro.TestAnnotation"
77 |      }
78 |  }
79 | 
80 | }
81 | 
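Against a server that implements the Simple protocol, the generic requestor
pattern from the samples earlier applies unchanged; a hedged sketch for the
"add" message (the `requestor` construction is assumed, built as in
txsample_http_client.py):

    result = requestor.request('add', {'arg1': 1, 'arg2': 2})
    # the declared response type is "int", so result would be 3 here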
--------------------------------------------------------------------------------
/testdata/schemas/social.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | @version("1.0.5")
20 | @namespace("org.apache.avro.ipc.specific")
21 | protocol Social {
22 |   enum PrivacyType { FRIENDS, FRIENDS_OF_FRIENDS, PUBLIC, CUSTOM }
23 | 
24 |   record Person {
25 |     string name;
26 |     int year_of_birth;
27 |     string country = "US";
28 |     string state;
29 |     array<Person> friends = [];
30 |     array<string> languages = [ "English" , "Java" ];
31 |     PrivacyType defaultPrivacy = "FRIENDS";
32 |   }
33 | }
34 | 
--------------------------------------------------------------------------------
/testdata/schemas/specialtypes.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /** NOTE: This structure is intended to contain names that are likely to cause collisions with the generated code. */
20 | 
21 | @namespace("org.apache.avro.test.specialtypes")
22 | protocol LetsBreakIt {
23 | 
24 |   enum Enum {
25 |     builder,
26 |     Builder,
27 |     builderBuider,
28 |     value,
29 |     this
30 |   }
31 | 
32 |   record One {
33 |     Enum this;
34 |   }
35 | 
36 |   record Two {
37 |     union { null, string } this = null;
38 |     string String;
39 |   }
40 | 
41 |   record Variables {
42 |     One this;
43 | 
44 |     One Boolean;
45 |     One Integer;
46 |     One Long;
47 |     One Float;
48 |     One String;
49 |   }
50 | 
51 |   enum Boolean {
52 |     Yes,
53 |     No
54 |   }
55 | 
56 |   record String {
57 |     string value;
58 |   }
59 | 
60 |   record builder {
61 |     One this;
62 |     Two builder;
63 |   }
64 | 
65 |   record builderBuilder {
66 |     One this;
67 |     Two that;
68 |   }
69 | 
70 |   record Builder {
71 |     One this;
72 |     Two that;
73 |   }
74 | 
75 |   record value {
76 |     One this;
77 |     Two that;
78 |   }
79 | 
80 |   record Types {
81 |     Boolean one;
82 |     builder two;
83 |     Builder three;
84 |     builderBuilder four;
85 |     String five;
86 |     value six;
87 |   }
88 | 
89 |   record Names {
90 |     string Boolean;
91 |     string builder;
92 |     string Builder;
93 |     string builderBuilder;
94 |     string String;
95 |     string value;
96 |   }
97 | 
98 |   record TopLevelDomainNames {
99 |     string org;
100 |     string avro;
101 |     string com;
102 |     string net;
103 |     string nl;
104 |   }
105 | 
106 |   record Exception {
107 |     string whatever;
108 |   }
109 | }
110 | 
--------------------------------------------------------------------------------
/testdata/schemas/stringables.avdl:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * A test case to exercise the stringable feature on @java-class, @java-key-class and
21 |  * @java-element-class
22 |  */
23 | @namespace("test")
24 | protocol AnnotatedStringableTypes {
25 | 
26 |   record StringablesRecord {
27 |     /** Each field exercises one of the java-class, key-class or element-class. */
28 |     @java-class("java.math.BigDecimal") string value;
29 |     @java-key-class("java.math.BigInteger") map<string> mapWithBigIntKeys;
30 |     map<@java-class("java.math.BigDecimal") string> mapWithBigDecimalElements;
31 |   }
32 | }
33 | 
--------------------------------------------------------------------------------
/testdata/schemas/weather.avsc:
--------------------------------------------------------------------------------
1 | {"type": "record", "name": "test.Weather",
2 |  "doc": "A weather reading.",
3 |  "fields": [
4 |      {"name": "station", "type": "string", "order": "ignore"},
5 |      {"name": "time", "type": "long"},
6 |      {"name": "temp", "type": "int"}
7 |  ]
8 | }
9 | 
--------------------------------------------------------------------------------
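As a closing example, a minimal sketch of round-tripping the Weather schema
above with spavro, assuming it keeps the stock avro-compatible file API
(schema.parse, DataFileWriter/DataFileReader and DatumWriter/DatumReader):

    from spavro import schema
    from spavro.datafile import DataFileReader, DataFileWriter
    from spavro.io import DatumReader, DatumWriter

    weather = schema.parse(open('testdata/schemas/weather.avsc').read())

    # write a single reading against the parsed writer's schema
    writer = DataFileWriter(open('weather_out.avro', 'wb'), DatumWriter(), weather)
    writer.append({"station": "011990-99999", "time": -619524000000, "temp": 22})
    writer.close()

    # read it back; the reader recovers the schema from the file header
    reader = DataFileReader(open('weather_out.avro', 'rb'), DatumReader())
    for reading in reader:
        print(reading)
    reader.close()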