├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── config └── kafka-connect-fs.properties ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── config_options.rst │ ├── connector.rst │ ├── faq.rst │ ├── filereaders.rst │ ├── index.rst │ └── policies.rst ├── pom.xml └── src ├── main ├── assembly │ ├── development.xml │ ├── package.xml │ └── standalone.xml ├── java │ └── com │ │ └── github │ │ └── mmolimar │ │ └── kafka │ │ └── connect │ │ └── fs │ │ ├── FsSourceConnector.java │ │ ├── FsSourceConnectorConfig.java │ │ ├── FsSourceTask.java │ │ ├── FsSourceTaskConfig.java │ │ ├── file │ │ ├── FileMetadata.java │ │ └── reader │ │ │ ├── AbstractFileReader.java │ │ │ ├── AgnosticFileReader.java │ │ │ ├── AvroFileReader.java │ │ │ ├── BinaryFileReader.java │ │ │ ├── CobolFileReader.java │ │ │ ├── CompressionType.java │ │ │ ├── CsvFileReader.java │ │ │ ├── FileReader.java │ │ │ ├── FixedWidthFileReader.java │ │ │ ├── JacksonFileReader.java │ │ │ ├── JsonFileReader.java │ │ │ ├── OrcFileReader.java │ │ │ ├── ParquetFileReader.java │ │ │ ├── SequenceFileReader.java │ │ │ ├── TextFileReader.java │ │ │ ├── TsvFileReader.java │ │ │ ├── UnivocityFileReader.java │ │ │ ├── XmlFileReader.java │ │ │ └── YamlFileReader.java │ │ ├── policy │ │ ├── AbstractPolicy.java │ │ ├── CronPolicy.java │ │ ├── HdfsFileWatcherPolicy.java │ │ ├── Policy.java │ │ ├── S3EventNotificationsPolicy.java │ │ ├── SimplePolicy.java │ │ └── SleepyPolicy.java │ │ └── util │ │ ├── Iterators.java │ │ ├── ReflectionUtils.java │ │ ├── TailCall.java │ │ └── Version.java ├── resources │ ├── META-INF │ │ └── services │ │ │ └── org.apache.hadoop.fs.FileSystem │ └── kafka-connect-fs-version.properties └── scala │ └── com │ └── github │ └── mmolimar │ └── kafka │ └── connect │ └── fs │ └── file │ └── reader │ └── CobrixReader.scala └── test ├── java └── com │ └── github │ └── mmolimar │ └── kafka │ └── connect │ └── fs │ ├── AbstractHdfsFsConfig.java │ ├── AbstractLocalFsConfig.java │ ├── FsTestConfig.java │ ├── connector │ ├── FsSourceConnectorConfigTest.java │ └── FsSourceConnectorTest.java │ ├── file │ └── reader │ │ ├── AgnosticFileReaderTest.java │ │ ├── AvroFileReaderTest.java │ │ ├── BinaryFileReaderTest.java │ │ ├── CobolFileReaderTest.java │ │ ├── CsvFileReaderTest.java │ │ ├── FileReaderTestBase.java │ │ ├── FixedWidthFileReaderTest.java │ │ ├── JacksonFileReaderTest.java │ │ ├── JsonFileReaderTest.java │ │ ├── OrcFileReaderTest.java │ │ ├── ParquetFileReaderTest.java │ │ ├── ReaderFsTestConfig.java │ │ ├── SequenceFileReaderTest.java │ │ ├── TextFileReaderTest.java │ │ ├── TsvFileReaderTest.java │ │ ├── UnivocityFileReaderTest.java │ │ ├── XmlFileReaderTest.java │ │ └── YamlFileReaderTest.java │ ├── policy │ ├── CronPolicyTest.java │ ├── HdfsFileWatcherPolicyTest.java │ ├── PolicyFsTestConfig.java │ ├── PolicyTestBase.java │ ├── S3EventNotificationsPolicyTest.java │ ├── SimplePolicyTest.java │ └── SleepyPolicyTest.java │ └── task │ ├── FsSourceTaskConfigTest.java │ ├── FsSourceTaskTest.java │ └── TaskFsTestConfig.java └── resources ├── file └── reader │ ├── data │ └── cobol │ │ ├── code-pages.cpy │ │ ├── code-pages.dt │ │ ├── companies.cpy │ │ ├── companies.dt │ │ ├── type-variety.cpy │ │ └── type-variety.dt │ └── schemas │ ├── people.avsc │ └── people_projection.avsc └── log4j.properties /.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 
2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | 16 | # idea 17 | .idea 18 | *.iml 19 | 20 | # building 21 | target 22 | build 23 | null 24 | tmp 25 | temp 26 | test-output 27 | build.log 28 | 29 | # other scm 30 | .svn 31 | .CVS 32 | .hg* 33 | 34 | # switch to regexp syntax. 35 | # syntax: regexp 36 | # ^\.pc/ 37 | 38 | # Documentation autogenerated 39 | javadoc 40 | apidocs 41 | 42 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: java 3 | jdk: 4 | - oraclejdk8 5 | install: 6 | - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V 7 | script: 8 | - mvn test jacoco:report 9 | after_success: 10 | - mvn coveralls:report 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM confluentinc/cp-kafka-connect-base:6.1.0 2 | 3 | ARG PROJECT_VERSION 4 | ENV CONNECT_PLUGIN_PATH="/usr/share/java,/usr/share/confluent-hub-components" 5 | 6 | COPY ./target/components/packages/mmolimar-kafka-connect-fs-${PROJECT_VERSION}.zip /tmp/kafka-connect-fs.zip 7 | RUN confluent-hub install --no-prompt /tmp/kafka-connect-fs.zip && rm -rf /tmp/kafka-connect-fs.zip 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kafka Connect FileSystem Connector [![Build Status](https://travis-ci.org/mmolimar/kafka-connect-fs.svg?branch=master)](https://travis-ci.org/mmolimar/kafka-connect-fs)[![Coverage Status](https://coveralls.io/repos/github/mmolimar/kafka-connect-fs/badge.svg?branch=master)](https://coveralls.io/github/mmolimar/kafka-connect-fs?branch=master) 2 | 3 | **kafka-connect-fs** is a [Kafka Connector](https://kafka.apache.org/documentation.html#connect) 4 | for reading records from files in the file systems specified and load them into Kafka. 5 | 6 | Documentation for this connector can be found [here](https://kafka-connect-fs.readthedocs.io/). 7 | 8 | ## Development 9 | 10 | To build a development version you'll need a recent version of Kafka. You can build 11 | kafka-connect-fs with Maven using the standard lifecycle phases. 12 | 13 | ## FAQ 14 | 15 | Some frequently asked questions on Kafka Connect FileSystem Connector can be found here - 16 | https://kafka-connect-fs.readthedocs.io/en/latest/faq.html 17 | 18 | ## Contribute 19 | 20 | - Source Code: https://github.com/mmolimar/kafka-connect-fs 21 | - Issue Tracker: https://github.com/mmolimar/kafka-connect-fs/issues 22 | 23 | ## License 24 | 25 | Released under the Apache License, version 2.0. 
26 | -------------------------------------------------------------------------------- /config/kafka-connect-fs.properties: -------------------------------------------------------------------------------- 1 | name=FsSourceConnector 2 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector 3 | tasks.max=1 4 | fs.uris=file:///data,hdfs://localhost:8020/data 5 | topic=mytopic 6 | policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy 7 | policy.recursive=true 8 | policy.regexp=^.*\.txt$ 9 | policy.batch_size=0 10 | policy.cleanup=none 11 | file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader 12 | file_reader.batch_size=0 13 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | cp-zookeeper: 4 | image: confluentinc/cp-zookeeper:6.1.0 5 | hostname: zookeeper 6 | container_name: zookeeper 7 | ports: 8 | - "2181:2181" 9 | environment: 10 | ZOOKEEPER_CLIENT_PORT: 2181 11 | ZOOKEEPER_TICK_TIME: 2000 12 | 13 | cp-kafka: 14 | image: confluentinc/cp-kafka:6.1.0 15 | hostname: kafka 16 | container_name: kafka 17 | depends_on: 18 | - cp-zookeeper 19 | ports: 20 | - "29092:29092" 21 | - "9092:9092" 22 | environment: 23 | KAFKA_BROKER_ID: 1 24 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 26 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 27 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 28 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 29 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092 30 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 31 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 32 | CONFLUENT_METRICS_ENABLE: 'false' 33 | 34 | cp-schema-registry: 35 | image: confluentinc/cp-schema-registry:6.1.0 36 | hostname: schema-registry 37 | container_name: schema-registry 38 | depends_on: 39 | - cp-zookeeper 40 | - cp-kafka 41 | ports: 42 | - "8081:8081" 43 | environment: 44 | SCHEMA_REGISTRY_HOST_NAME: schema-registry 45 | SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' 46 | 47 | connect-fs: 48 | image: mmolimar/kafka-connect-fs:1.3.0 49 | container_name: connect 50 | depends_on: 51 | - cp-kafka 52 | - cp-schema-registry 53 | ports: 54 | - "8083:8083" 55 | - "8000:8000" 56 | environment: 57 | CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092' 58 | CONNECT_REST_ADVERTISED_HOST_NAME: connect 59 | CONNECT_REST_PORT: 8083 60 | CONNECT_GROUP_ID: compose-connect-group 61 | CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs 62 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 63 | CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 64 | CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets 65 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 66 | CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status 67 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 68 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter 69 | CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter 70 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081 71 | CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" 72 | CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" 73 | CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181' 74 | CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/" 75 | 
CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO" 76 | CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR 77 | KAFKA_OPTS: "-agentlib:jdwp=transport=dt_socket,server=y,address=8000,suspend=n" 78 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = kafka-connect-fs 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=kafka-connect-fs 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # kafka-connect-fs documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Mar 23 20:59:04 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. 
They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | #templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'Kafka Connect FileSystem Connector' 50 | copyright = '2017, Mario Molina' 51 | author = 'Mario Molina' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.3' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.3' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | # language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['build'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | # -- Options for HTML output ---------------------------------------------- 81 | import sphinx_rtd_theme 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 98 | #html_static_path = ['_static'] 99 | 100 | # -- Options for HTMLHelp output ------------------------------------------ 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = 'KafkaConnectFileSystemConnectorDoc' 104 | 105 | # -- Options for LaTeX output --------------------------------------------- 106 | 107 | latex_elements = { 108 | # The paper size ('letterpaper' or 'a4paper'). 109 | # 110 | # 'papersize': 'letterpaper', 111 | 112 | # The font size ('10pt', '11pt' or '12pt'). 113 | # 114 | # 'pointsize': '10pt', 115 | 116 | # Additional stuff for the LaTeX preamble. 117 | # 118 | # 'preamble': '', 119 | 120 | # Latex figure (float) alignment 121 | # 122 | # 'figure_align': 'htbp', 123 | } 124 | 125 | # Grouping the document tree into LaTeX files. List of tuples 126 | # (source start file, target name, title, 127 | # author, documentclass [howto, manual, or own class]). 
128 | latex_documents = [ 129 | (master_doc, 'KafkaConnectFileSystemConnector.tex', 'Kafka Connect FileSystem Connector Documentation', 130 | 'Mario Molina', 'manual'), 131 | ] 132 | 133 | # -- Options for manual page output --------------------------------------- 134 | 135 | # One entry per manual page. List of tuples 136 | # (source start file, name, description, authors, manual section). 137 | man_pages = [ 138 | (master_doc, 'kafkaconnectfs', 'Kafka Connect FileSystem Connector Documentation', 139 | [author], 1) 140 | ] 141 | 142 | # -- Options for Texinfo output ------------------------------------------- 143 | 144 | # Grouping the document tree into Texinfo files. List of tuples 145 | # (source start file, target name, title, author, 146 | # dir menu entry, description, category) 147 | texinfo_documents = [ 148 | (master_doc, 'KafkaConnectFs', 'Kafka Connect FileSystem Connector Documentation', 149 | author, 'KafkaConnectFileSystemConnector', 'Kafka Connector for FileSystem', 150 | 'Miscellaneous'), 151 | ] 152 | -------------------------------------------------------------------------------- /docs/source/connector.rst: -------------------------------------------------------------------------------- 1 | .. _connector: 2 | 3 | ******************************************** 4 | Connector 5 | ******************************************** 6 | 7 | The connector takes advantage of the abstraction provided by `Hadoop Common `__ 8 | using the implementation of the ``org.apache.hadoop.fs.FileSystem`` class. So, it's possible to use a 9 | wide variety of FSs; if your FS is not included in the Hadoop Common API, you can implement an extension 10 | of this abstraction and use it transparently. 11 | 12 | Among others, these are some file systems it supports: 13 | 14 | * HDFS. 15 | * S3. 16 | * Google Cloud Storage. 17 | * Azure Blob Storage & Azure Data Lake Store. 18 | * FTP & SFTP. 19 | * WebHDFS. 20 | * Local File System. 21 | * Hadoop Archive File System. 22 | 23 | Getting started 24 | ============================================ 25 | 26 | Prerequisites 27 | -------------------------------------------- 28 | 29 | - Apache Kafka 2.6.0. 30 | - Java 8. 31 | - Confluent Schema Registry (recommended). 32 | 33 | Building from source 34 | -------------------------------------------- 35 | 36 | .. sourcecode:: bash 37 | 38 | mvn clean package 39 | 40 | General config 41 | -------------------------------------------- 42 | 43 | The ``kafka-connect-fs.properties`` file defines the following properties as required: 44 | 45 | .. sourcecode:: bash 46 | 47 | name=FsSourceConnector 48 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector 49 | tasks.max=1 50 | fs.uris=file:///data,hdfs://localhost:8020/data 51 | topic=mytopic 52 | policy.class= 53 | policy.recursive=true 54 | policy.regexp=.* 55 | policy.batch_size=0 56 | policy.cleanup=none 57 | file_reader.class= 58 | file_reader.batch_size=0 59 | 60 | #. The connector name. 61 | #. Class indicating the connector. 62 | #. Number of tasks the connector is allowed to start. 63 | #. Comma-separated URIs of the FS(s). They can be URIs pointing directly to a file 64 | or a directory in the FS. These URIs can also be dynamic by using expressions for 65 | modifying them at runtime. 66 | #. Topic to copy data from the FS into. 67 | #. Policy class to apply (must implement 68 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface). 69 | #. Flag to enable recursive traversal of subdirectories when listing files. 70 | #. 
Regular expression to filter files from the FS. 71 | #. Number of files that should be handled at a time. Non-positive values disable batching. 72 | #. Cleanup strategy to manage processed files. 73 | #. File reader class to read files from the FS 74 | (must implement ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader`` interface). 75 | #. Number of records to process at a time. Non-positive values disable batching. 76 | 77 | More detailed information about these properties can be found :ref:`here`. 78 | 79 | Running locally 80 | -------------------------------------------- 81 | 82 | .. sourcecode:: bash 83 | 84 | export KAFKA_HOME=/path/to/kafka/install/dir 85 | 86 | .. sourcecode:: bash 87 | 88 | mvn clean package 89 | export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')" 90 | $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties config/kafka-connect-fs.properties 91 | 92 | Running in Docker 93 | -------------------------------------------- 94 | 95 | .. sourcecode:: bash 96 | 97 | mvn clean package 98 | 99 | .. sourcecode:: bash 100 | 101 | docker build --build-arg PROJECT_VERSION= . 102 | docker-compose build 103 | docker-compose up -d 104 | docker logs --tail="all" -f connect 105 | 106 | .. sourcecode:: bash 107 | 108 | curl -sX GET http://localhost:8083/connector-plugins | grep FsSourceConnector 109 | 110 | Components 111 | ============================================ 112 | 113 | There are two main concepts to decouple concerns within the connector. 114 | They are **policies** and **file readers**, described below. 115 | 116 | Policies 117 | -------------------------------------------- 118 | 119 | In order to ingest data from the FS(s), the connector needs a **policy** to define the rules to do it. 120 | 121 | Basically, the policy tries to connect to each FS included in the ``fs.uris`` connector property, lists files 122 | (and filters them using the regular expression provided in the ``policy.regexp`` property) and enables 123 | a file reader to read records. 124 | 125 | The policy to be used by the connector is defined in the ``policy.class`` connector property. 126 | 127 | .. important:: When delivering records from the connector to Kafka, they contain their own file offset 128 | so, if in the next eventual policy execution this file is processed again, 129 | the policy will seek the file to this offset and process the next records 130 | if any (**if the offset was committed**). 131 | 132 | .. note:: If the URIs included in the ``fs.uris`` connector property contain any expression of the 133 | form ``${XXX}``, this dynamic URI is built at the moment of the policy execution. 134 | 135 | Currently, there are a few policies to support some use cases but, of course, you can develop your own 136 | if the existing policies don't fit your needs. 137 | The only restriction is that you must implement the interface 138 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy``. 139 | 140 | .. include:: policies.rst 141 | 142 | File readers 143 | -------------------------------------------- 144 | 145 | File readers read files from the FS and process each record. The **file reader** is needed by the policy to enable 146 | the connector to process each record, and its implementation defines how to seek and iterate over the 147 | records within the file. 148 | 149 | The file reader to be used when processing files is defined in the ``file_reader.class`` connector property.
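For instance, to process plain text files you could point this property to the text reader (a hypothetical snippet; any class implementing the ``FileReader`` interface is valid here):

.. sourcecode:: bash

    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
    file_reader.batch_size=0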
150 | 151 | In the same way as policies, the connector provides several sorts of readers to parse and read records 152 | for different file formats. If you don't have a file reader that fits your needs, just implement one 153 | with the only restriction that it must implement the interface 154 | ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader``. 155 | 156 | There are several file readers included which can read the following file formats: 157 | 158 | * Parquet. 159 | * Avro. 160 | * ORC. 161 | * SequenceFile. 162 | * Cobol / EBCDIC. 163 | * Other binary files. 164 | * CSV. 165 | * TSV. 166 | * Fixed-width. 167 | * JSON. 168 | * XML. 169 | * YAML. 170 | * Text. 171 | 172 | .. include:: filereaders.rst 173 | -------------------------------------------------------------------------------- /docs/source/faq.rst: -------------------------------------------------------------------------------- 1 | .. faq: 2 | 3 | ******************************************** 4 | FAQs 5 | ******************************************** 6 | 7 | **My file was already processed and the connector, when it's executed again, 8 | processes the same records again.** 9 | 10 | If during the previous executions the records were sent successfully to Kafka, 11 | their offsets were sent too. Then, when executing the policy again, it 12 | retrieves the offset and seeks the file. If this didn't happen, it's possible 13 | that the offset was not committed yet and, consequently, the offset retrieved 14 | is non-existent or too old. 15 | 16 | Have a look at when the offsets are committed in Kafka and/or try to execute the 17 | policy when you are sure the offsets have been committed. 18 | 19 | **The connector started but does not process any kind of file.** 20 | 21 | This can happen for several reasons: 22 | 23 | * Check if the files contained in the FS match the regexp provided. 24 | * Check if there is any kind of problem with the FS. The connector tolerates 25 | FS connection exceptions so it can process the files later, but you'll find 26 | these possible errors in the log files. 27 | * The file reader is reading files with an invalid format so it cannot 28 | process the file and continues with the next one. You can see 29 | this as an error in the log. 30 | 31 | **I have directories in the FS created day by day and I have to modify 32 | the connector every day.** 33 | 34 | Don't do this! Take advantage of the dynamic URIs using expressions. 35 | 36 | For instance, if you have this URI ``hdfs://host:9000/data/2020``, you can 37 | use this URI ``hdfs://host:9000/data/${yyyy}`` instead. 38 | 39 | **The connector is too slow to process all URIs I have.** 40 | 41 | Obviously, this depends on the files in the FS(s), but if you have several URIs in 42 | the connector it might be a good idea to adjust the number of tasks 43 | to process those URIs in parallel (``tasks.max`` connector property). 44 | 45 | Also, using the properties ``policy.batch_size`` and/or ``file_reader.batch_size`` 46 | might help in case you have tons of files or very large files. 47 | 48 | **I removed a file from the FS but the connector is still sending messages 49 | with the contents of that file.** 50 | 51 | This is a tricky issue. The file reader is an iterator and processes 52 | record by record but part of the file is buffered and, even though the 53 | file was removed from the FS, the file reader continues producing records 54 | until it throws an exception. It's a matter of time.
55 | 56 | But the main thing is that you don't have to worry about removing files 57 | from the FS when they are being processed. The connector tolerates errors 58 | when reading files and continues with the next file. 59 | -------------------------------------------------------------------------------- /docs/source/filereaders.rst: -------------------------------------------------------------------------------- 1 | Parquet 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Reads files with `Parquet `__ format. 5 | 6 | The reader takes advantage of the Parquet-Avro API and uses the Parquet file 7 | as if it were an Avro file, so the message sent to Kafka is built in the same 8 | way as the Avro file reader does. 9 | 10 | More information about properties of this file reader :ref:`here`. 11 | 12 | Avro 13 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 14 | 15 | Files with `Avro `__ format can be read with this reader. 16 | 17 | The Avro schema is not needed because it is read from the file. The message sent 18 | to Kafka is created by transforming the record by means of the 19 | `Confluent avro-converter `__ 20 | API. 21 | 22 | More information about properties of this file reader :ref:`here`. 23 | 24 | ORC 25 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 26 | 27 | `ORC files `__ are a self-describing, type-aware 28 | columnar file format designed for Hadoop workloads. 29 | 30 | This reader can process this file format, translating its schema and building 31 | a Kafka message with the content. 32 | 33 | .. warning:: If you have ORC files with ``union`` data types, these 34 | data types will be transformed into a ``map`` object in the Kafka message. 35 | The value of each key will be ``fieldN``, where ``N`` represents 36 | the index within the data type. 37 | 38 | More information about properties of this file reader :ref:`here`. 39 | 40 | SequenceFile 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | `Sequence files `__ are one kind of 44 | the Hadoop file formats, which are serialized in key-value pairs. 45 | 46 | This reader can process this file format and build a Kafka message with the 47 | key-value pair. These two values are named ``key`` and ``value`` in the message 48 | by default but you can customize these field names. 49 | 50 | More information about properties of this file reader :ref:`here`. 51 | 52 | Cobol 53 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 54 | 55 | Mainframe files (Cobol / EBCDIC binary files) can be processed with this reader, which uses the 56 | `Cobrix `__ parser. 57 | 58 | By means of the corresponding copybook (representing its schema), it parses each record and 59 | translates it into a Kafka message with that schema. 60 | 61 | More information about properties of this file reader :ref:`here`. 62 | 63 | Binary 64 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 65 | 66 | All other kinds of binary files can be ingested using this reader. 67 | 68 | It just extracts the content plus some metadata such as path, file owner, file group, length, access time, 69 | and modification time. 70 | 71 | Each message will contain the following schema (see the sketch after this list): 72 | 73 | * ``path``: File path (string). 74 | * ``owner``: Owner of the file (string). 75 | * ``group``: Group associated with the file (string). 76 | * ``length``: Length of this file, in bytes (long). 77 | * ``access_time``: Access time of the file (long). 78 | * ``modification_time``: Modification time of the file (long). 79 | * ``content``: Content of the file (bytes).
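Purely as an illustration, the message schema above corresponds to a Connect struct like the following sketch (field names and types are taken from the list; the builder calls are an assumption, not the reader's actual source):

.. sourcecode:: java

    import org.apache.kafka.connect.data.Schema;
    import org.apache.kafka.connect.data.SchemaBuilder;

    public class BinaryReaderSchemaSketch {
        // Illustrative only: mirrors the field list documented above.
        public static Schema binarySchema() {
            return SchemaBuilder.struct()
                    .field("path", Schema.STRING_SCHEMA)
                    .field("owner", Schema.STRING_SCHEMA)
                    .field("group", Schema.STRING_SCHEMA)
                    .field("length", Schema.INT64_SCHEMA)
                    .field("access_time", Schema.INT64_SCHEMA)
                    .field("modification_time", Schema.INT64_SCHEMA)
                    .field("content", Schema.BYTES_SCHEMA)
                    .build();
        }
    }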
80 | 81 | More information about properties of this file reader :ref:`here`. 82 | 83 | CSV 84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 85 | 86 | CSV file reader using a custom token to distinguish different columns in each line. 87 | 88 | It allows distinguishing a header in the files and setting the names of their columns 89 | in the message sent to Kafka. If there is no header, the value of each column will be in 90 | the field named ``column_N`` (**N** represents the column index) in the message. 91 | Also, the token delimiter for columns is configurable. 92 | 93 | This reader is based on the `Univocity CSV parser `__. 94 | 95 | More information about properties of this file reader :ref:`here`. 96 | 97 | TSV 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | 100 | TSV file reader using a tab ``\t`` to distinguish different columns in each line. 101 | 102 | Its behaviour is the same as the CSV file reader's regarding the header and the column names. 103 | 104 | This reader is based on the `Univocity TSV parser `__. 105 | 106 | More information about properties of this file reader :ref:`here`. 107 | 108 | FixedWidth 109 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 110 | 111 | FixedWidth is a plain text file reader which distinguishes each column based on the length of each field. 112 | 113 | Its behaviour is the same as the CSV / TSV file readers' regarding the header and the column names. 114 | 115 | This reader is based on the `Univocity Fixed-Width parser `__. 116 | 117 | More information about properties of this file reader :ref:`here`. 118 | 119 | JSON 120 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 121 | 122 | Reads JSON files which might contain multiple fields with their specified 123 | data types. The schema for this sort of record is inferred by reading the first record, 124 | and all the fields contained are marked as optional in the schema. 125 | 126 | More information about properties of this file reader :ref:`here`. 127 | 128 | XML 129 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 130 | 131 | Reads XML files which might contain multiple fields with their specified 132 | data types. The schema for this sort of record is inferred by reading the first record, 133 | and all the fields contained are marked as optional in the schema. 134 | 135 | .. warning:: Take into account the current 136 | `limitations `__. 137 | 138 | More information about properties of this file reader :ref:`here`. 139 | 140 | YAML 141 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 142 | 143 | Reads YAML files which might contain multiple fields with their specified 144 | data types. The schema for this sort of record is inferred by reading the first record, 145 | and all the fields contained are marked as optional in the schema. 146 | 147 | More information about properties of this file reader :ref:`here`. 148 | 149 | Text 150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 151 | 152 | Reads plain text files. 153 | 154 | Each line represents one record (by default), which will be placed in a field 155 | named ``value`` in the message sent to Kafka by default, but you can 156 | customize this field name. 157 | 158 | More information about properties of this file reader :ref:`here`. 159 | 160 | Agnostic 161 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 162 | 163 | Actually, this reader is a wrapper of the readers listed above.
164 | 165 | It tries to read any kind of file format using an internal reader based on the file extension, 166 | applying the proper one (Parquet, Avro, ORC, SequenceFile, Cobol / EBCDIC, CSV, TSV, FixedWidth, JSON, XML, 167 | YAML, or Text). If no extension is matched, the Text file reader will be applied. 168 | 169 | Default extensions for each format (configurable): 170 | 171 | * Parquet: ``.parquet`` 172 | * Avro: ``.avro`` 173 | * ORC: ``.orc`` 174 | * SequenceFile: ``.seq`` 175 | * Cobol / EBCDIC: ``.dat`` 176 | * Other binary files: ``.bin`` 177 | * CSV: ``.csv`` 178 | * TSV: ``.tsv`` 179 | * FixedWidth: ``.fixed`` 180 | * JSON: ``.json`` 181 | * XML: ``.xml`` 182 | * YAML: ``.yaml`` 183 | * Text: any other sort of file extension. 184 | 185 | More information about properties of this file reader :ref:`here`. 186 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. kafka-connect-fs documentation master file, created by Mario Molina 2 | 3 | ******************************************** 4 | Kafka Connect FileSystem Connector 5 | ******************************************** 6 | 7 | Kafka Connect FileSystem Connector is a source connector for reading records from 8 | files in the file systems specified and loading them into Kafka. 9 | 10 | The connector supports: 11 | 12 | * Several sorts of File Systems (FS) to use. 13 | * Dynamic and static URIs to ingest data from. 14 | * Policies to define rules about how to look for files and clean them up after processing. 15 | * File readers to parse and read different kinds of file formats. 16 | 17 | To learn more about the connector you can read :ref:`this section` and for more detailed 18 | configuration options you can read :ref:`this other one`. 19 | 20 | Also, you can download the source code from `here. `__ 21 | 22 | Contents 23 | ============================================ 24 | 25 | .. toctree:: 26 | :maxdepth: 2 27 | 28 | connector 29 | config_options 30 | faq 31 | -------------------------------------------------------------------------------- /docs/source/policies.rst: -------------------------------------------------------------------------------- 1 | Simple 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | It's a policy which just filters and processes files included in the corresponding URIs once. 5 | 6 | .. attention:: This policy is mostly intended for testing purposes. 7 | 8 | Sleepy 9 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 10 | 11 | The behaviour of this policy is similar to the Simple policy but on each execution it sleeps 12 | and waits for the next one. Additionally, its custom properties allow ending it. 13 | 14 | You can learn more about the properties of this policy :ref:`here`. 15 | 16 | Cron 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | 19 | This policy is scheduled based on cron expressions, and their format in the configuration 20 | is based on the library `Quartz Scheduler `__. 21 | 22 | After finishing each execution, the policy sleeps until the next one is scheduled, if applicable. 23 | 24 | You can learn more about the properties of this policy :ref:`here`. 25 | 26 | HDFS file watcher 27 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 28 | 29 | It uses Hadoop notification events, and all create/append/rename/close events will be reported 30 | as files to be ingested. 31 | 32 | Use it only when you have HDFS URIs.
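For instance, a minimal configuration sketch for this policy (hypothetical values; only the ``policy.class`` and the ``hdfs://`` URI are the relevant parts):

.. sourcecode:: bash

    fs.uris=hdfs://localhost:8020/data
    topic=mytopic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader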
33 | 34 | You can learn more about the properties of this policy :ref:`here`. 35 | 36 | .. attention:: The URIs included in the general property ``fs.uris`` will be filtered and only those 37 | ones which start with the prefix ``hdfs://`` will be watched. Also, this policy 38 | will only work for Hadoop versions 2.6.0 or higher. 39 | 40 | S3 event notifications 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | It uses S3 event notifications sent from S3 to process files which have been created or modified in S3. 44 | These notifications will be read from a AWS-SQS queue and they can be sent to SQS directly from S3 or via 45 | AWS-SNS, either as a SNS notification or a raw message in the subscription. 46 | 47 | Just use it when you have S3 URIs and the event notifications in the S3 bucket must be enabled to a SNS 48 | topic or a SQS queue. 49 | 50 | You can learn more about the properties of this policy :ref:`here`. 51 | -------------------------------------------------------------------------------- /src/main/assembly/development.xml: -------------------------------------------------------------------------------- 1 | 5 | 7 | development 8 | 9 | dir 10 | 11 | false 12 | 13 | 14 | share/java/kafka-connect-fs/ 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/main/assembly/package.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | package 7 | 8 | dir 9 | 10 | false 11 | 12 | 13 | ${project.basedir} 14 | share/doc/${project.name}/ 15 | 16 | README* 17 | LICENSE* 18 | NOTICE* 19 | licenses/ 20 | 21 | 22 | 23 | ${project.basedir}/config 24 | etc/${project.name} 25 | 26 | * 27 | 28 | 29 | 30 | 31 | 32 | share/java/${project.name} 33 | true 34 | true 35 | 36 | org.apache.kafka:connect-api 37 | org.mortbay.jetty:* 38 | com.sun.jersey:* 39 | org.eclipse.jetty:jetty-util 40 | com.sun.jersey.contribs:jersey-guice 41 | org.apache.zookeeper:zookeeper 42 | log4j:log4j 43 | org.slf4j:slf4j-api 44 | org.slf4j:slf4j-log4j12 45 | javax.servlet:servlet-api 46 | javax.servlet.jsp:jsp-api 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/main/assembly/standalone.xml: -------------------------------------------------------------------------------- 1 | 5 | 7 | standalone 8 | 9 | jar 10 | 11 | false 12 | 13 | 14 | ${project.basedir} 15 | / 16 | 17 | README* 18 | LICENSE* 19 | NOTICE* 20 | licenses.html 21 | licenses/ 22 | notices/ 23 | 24 | 25 | 26 | 27 | 28 | / 29 | true 30 | true 31 | runtime 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.util.Version; 4 | import org.apache.kafka.common.config.ConfigDef; 5 | import org.apache.kafka.common.config.ConfigException; 6 | import org.apache.kafka.connect.connector.Task; 7 | import org.apache.kafka.connect.errors.ConnectException; 8 | import org.apache.kafka.connect.source.SourceConnector; 9 | import org.apache.kafka.connect.util.ConnectorUtils; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import java.util.ArrayList; 14 | import java.util.HashMap; 15 | import java.util.List; 16 | import java.util.Map; 17 | 18 | public class FsSourceConnector extends SourceConnector { 19 
| 20 | private static Logger log = LoggerFactory.getLogger(FsSourceConnector.class); 21 | 22 | private FsSourceConnectorConfig config; 23 | 24 | @Override 25 | public String version() { 26 | return Version.getVersion(); 27 | } 28 | 29 | @Override 30 | public void start(Map properties) { 31 | log.info("{} Starting connector...", this); 32 | try { 33 | config = new FsSourceConnectorConfig(properties); 34 | } catch (ConfigException ce) { 35 | throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce); 36 | } catch (Exception ce) { 37 | throw new ConnectException("An error has occurred when starting FsSourceConnector." + ce); 38 | } 39 | } 40 | 41 | @Override 42 | public Class taskClass() { 43 | return FsSourceTask.class; 44 | } 45 | 46 | @Override 47 | public List> taskConfigs(int maxTasks) { 48 | if (config == null) { 49 | throw new ConnectException("Connector config has not been initialized."); 50 | } 51 | final List> taskConfigs = new ArrayList<>(); 52 | 53 | List fsUris = config.getFsUris(); 54 | int groups = Math.min(fsUris.size(), maxTasks); 55 | ConnectorUtils.groupPartitions(fsUris, groups) 56 | .forEach(dirs -> { 57 | Map taskProps = new HashMap<>(config.originalsStrings()); 58 | taskProps.put(FsSourceConnectorConfig.FS_URIS, String.join(",", dirs)); 59 | taskConfigs.add(taskProps); 60 | }); 61 | 62 | log.debug("{} Partitions grouped as: {}", this, taskConfigs); 63 | 64 | return taskConfigs; 65 | } 66 | 67 | @Override 68 | public void stop() { 69 | log.info("{} Stopping FsSourceConnector.", this); 70 | // Nothing to do 71 | } 72 | 73 | @Override 74 | public ConfigDef config() { 75 | return FsSourceConnectorConfig.conf(); 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return this.getClass().getSimpleName(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import org.apache.kafka.common.config.AbstractConfig; 4 | import org.apache.kafka.common.config.ConfigDef; 5 | import org.apache.kafka.common.config.ConfigDef.Importance; 6 | import org.apache.kafka.common.config.ConfigDef.Type; 7 | 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | 12 | public class FsSourceConnectorConfig extends AbstractConfig { 13 | 14 | public static final String FS_URIS = "fs.uris"; 15 | private static final String FS_URIS_DOC = "Comma-separated URIs of the FS(s)."; 16 | private static final String FS_URIS_DISPLAY = "File system URIs"; 17 | 18 | public static final String TOPIC = "topic"; 19 | private static final String TOPIC_DOC = "Topic to copy data to."; 20 | private static final String TOPIC_DISPLAY = "Topic"; 21 | 22 | private static final String CONNECTOR_GROUP = "Connector"; 23 | 24 | public FsSourceConnectorConfig(ConfigDef config, Map parsedConfig) { 25 | super(config, parsedConfig); 26 | } 27 | 28 | public FsSourceConnectorConfig(Map parsedConfig) { 29 | this(conf(), parsedConfig); 30 | } 31 | 32 | public static ConfigDef conf() { 33 | int order = 0; 34 | return new ConfigDef() 35 | .define( 36 | FS_URIS, 37 | Type.LIST, 38 | ConfigDef.NO_DEFAULT_VALUE, 39 | Importance.HIGH, 40 | FS_URIS_DOC, 41 | CONNECTOR_GROUP, 42 | ++order, 43 | ConfigDef.Width.LONG, 44 | FS_URIS_DISPLAY 45 | ).define( 46 | TOPIC, 47 | Type.STRING, 48 | ConfigDef.NO_DEFAULT_VALUE, 49 | 
Importance.HIGH, 50 | TOPIC_DOC, 51 | CONNECTOR_GROUP, 52 | ++order, 53 | ConfigDef.Width.LONG, 54 | TOPIC_DISPLAY 55 | ); 56 | } 57 | 58 | public List getFsUris() { 59 | return this.getList(FS_URIS); 60 | } 61 | 62 | public String getTopic() { 63 | return this.getString(TOPIC); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.AbstractFileReader; 5 | import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; 6 | import com.github.mmolimar.kafka.connect.fs.policy.Policy; 7 | import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; 8 | import com.github.mmolimar.kafka.connect.fs.util.Version; 9 | import org.apache.kafka.common.config.ConfigException; 10 | import org.apache.kafka.common.utils.SystemTime; 11 | import org.apache.kafka.common.utils.Time; 12 | import org.apache.kafka.connect.data.Struct; 13 | import org.apache.kafka.connect.errors.ConnectException; 14 | import org.apache.kafka.connect.source.SourceRecord; 15 | import org.apache.kafka.connect.source.SourceTask; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.io.IOException; 20 | import java.util.*; 21 | import java.util.concurrent.atomic.AtomicBoolean; 22 | import java.util.function.Function; 23 | import java.util.stream.Collectors; 24 | import java.util.stream.Stream; 25 | import java.util.stream.StreamSupport; 26 | 27 | public class FsSourceTask extends SourceTask { 28 | 29 | private static final Logger log = LoggerFactory.getLogger(FsSourceTask.class); 30 | 31 | private final AtomicBoolean stop; 32 | private final Time time; 33 | 34 | private FsSourceTaskConfig config; 35 | private Policy policy; 36 | private int pollInterval; 37 | 38 | public FsSourceTask() { 39 | this.stop = new AtomicBoolean(false); 40 | this.time = new SystemTime(); 41 | } 42 | 43 | @Override 44 | public String version() { 45 | return Version.getVersion(); 46 | } 47 | 48 | @Override 49 | @SuppressWarnings("unchecked") 50 | public void start(Map properties) { 51 | log.info("{} Starting FS source task...", this); 52 | try { 53 | config = new FsSourceTaskConfig(properties); 54 | if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) { 55 | throw new ConfigException("Policy class " + 56 | config.getClass(FsSourceTaskConfig.POLICY_CLASS) + " is not a subclass of " + Policy.class); 57 | } 58 | if (config.getClass(FsSourceTaskConfig.FILE_READER_CLASS).isAssignableFrom(FileReader.class)) { 59 | throw new ConfigException("FileReader class " + 60 | config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + " is not a subclass of " + FileReader.class); 61 | } 62 | 63 | Class policyClass = (Class) Class.forName(properties.get(FsSourceTaskConfig.POLICY_CLASS)); 64 | policy = ReflectionUtils.makePolicy(policyClass, config); 65 | pollInterval = config.getInt(FsSourceTaskConfig.POLL_INTERVAL_MS); 66 | } catch (ConfigException ce) { 67 | log.error("{} Couldn't start FS source task: {}", this, ce.getMessage(), ce); 68 | throw new ConnectException("Couldn't start FS source task due to configuration error: " + ce.getMessage(), ce); 69 | } catch (Exception e) { 70 | log.error("{} Couldn't start FS source task: {}", 
this, e.getMessage(), e); 71 | throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage(), e); 72 | } 73 | log.info("{} FS source task started with policy [{}].", this, policy.getClass().getName()); 74 | } 75 | 76 | @Override 77 | public List poll() { 78 | while (!stop.get() && policy != null && !policy.hasEnded()) { 79 | log.trace("{} Polling for new data...", this); 80 | Function> makePartitionKey = (FileMetadata metadata) -> 81 | Collections.singletonMap("path", metadata.getPath()); 82 | 83 | // Fetch all the offsets upfront to avoid fetching offsets once per file 84 | List filesToProcess = filesToProcess().collect(Collectors.toList()); 85 | List> partitions = filesToProcess.stream().map(makePartitionKey).collect(Collectors.toList()); 86 | Map, Map> offsets = context.offsetStorageReader().offsets(partitions); 87 | 88 | List totalRecords = filesToProcess.stream().map(metadata -> { 89 | List records = new ArrayList<>(); 90 | Map partitionKey = makePartitionKey.apply(metadata); 91 | Map offset = Optional.ofNullable(offsets.get(partitionKey)).orElse(new HashMap<>()); 92 | try (FileReader reader = policy.offer(metadata, offset)) { 93 | if (reader.hasNext()) log.info("{} Processing records for file {}...", this, metadata); 94 | while (reader.hasNext()) { 95 | Struct record = reader.next(); 96 | // TODO change FileReader interface in the next major version 97 | boolean hasNext = (reader instanceof AbstractFileReader) ? 98 | ((AbstractFileReader) reader).hasNextBatch() || reader.hasNext() : reader.hasNext(); 99 | records.add(convert(metadata, reader.currentOffset(), !hasNext, record)); 100 | } 101 | } catch (IOException | ConnectException e) { 102 | // when an exception happens reading a file, the connector continues 103 | log.warn("{} Error reading file [{}]: {}. Keep going...", 104 | this, metadata.getPath(), e.getMessage(), e); 105 | } 106 | log.debug("{} Read [{}] records from file [{}].", this, records.size(), metadata.getPath()); 107 | 108 | return records; 109 | }).flatMap(Collection::stream).collect(Collectors.toList()); 110 | 111 | log.debug("{} Returning [{}] records in execution number [{}] for policy [{}].", 112 | this, totalRecords.size(), policy.getExecutions(), policy.getClass().getName()); 113 | 114 | return totalRecords; 115 | } 116 | if (pollInterval > 0) { 117 | log.trace("{} Waiting [{}] ms for next poll.", this, pollInterval); 118 | time.sleep(pollInterval); 119 | } 120 | return null; 121 | } 122 | 123 | private Stream filesToProcess() { 124 | try { 125 | return asStream(policy.execute()) 126 | .filter(metadata -> metadata.getLen() > 0); 127 | } catch (IOException | ConnectException e) { 128 | // when an exception happens executing the policy, the connector continues 129 | log.error("{} Cannot retrieve files to process from the FS: [{}]. 
" + 130 | "There was an error executing the policy but the task tolerates this and continues: {}", 131 | this, policy.getURIs(), e.getMessage(), e); 132 | return Stream.empty(); 133 | } 134 | } 135 | 136 | private Stream asStream(Iterator src) { 137 | Iterable iterable = () -> src; 138 | return StreamSupport.stream(iterable.spliterator(), false); 139 | } 140 | 141 | private SourceRecord convert(FileMetadata metadata, long offset, boolean eof, Struct struct) { 142 | return new SourceRecord( 143 | Collections.singletonMap("path", metadata.getPath()), 144 | new HashMap() {{ 145 | put("offset", offset); 146 | put("file-size", metadata.getLen()); 147 | put("eof", eof); 148 | }}, 149 | config.getTopic(), 150 | struct.schema(), 151 | struct 152 | ); 153 | } 154 | 155 | @Override 156 | public void stop() { 157 | log.info("{} Stopping FS source task...", this); 158 | stop.set(true); 159 | synchronized (this) { 160 | if (policy != null) { 161 | try { 162 | policy.close(); 163 | } catch (IOException ioe) { 164 | log.warn("{} Error closing policy: {}", this, ioe.getMessage(), ioe); 165 | } 166 | } 167 | } 168 | } 169 | 170 | @Override 171 | public String toString() { 172 | return this.getClass().getSimpleName(); 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file; 2 | 3 | import java.util.List; 4 | import java.util.Optional; 5 | 6 | public class FileMetadata { 7 | private String path; 8 | private long length; 9 | private List blocks; 10 | 11 | public FileMetadata(String path, long length, List blocks) { 12 | this.path = Optional.ofNullable(path).orElse(""); 13 | this.length = length; 14 | this.blocks = blocks; 15 | } 16 | 17 | public String getPath() { 18 | return path; 19 | } 20 | 21 | public long getLen() { 22 | return length; 23 | } 24 | 25 | public List getBlocks() { 26 | return blocks; 27 | } 28 | 29 | @Override 30 | public String toString() { 31 | return String.format("[path = %s, length = %s, blocks = %s]", path, length, blocks); 32 | } 33 | 34 | @Override 35 | public boolean equals(Object object) { 36 | if (this == object) return true; 37 | if (!(object instanceof FileMetadata)) return false; 38 | 39 | FileMetadata metadata = (FileMetadata) object; 40 | return this.path.equals(metadata.getPath()) && 41 | this.length == metadata.length && 42 | this.blocks.equals(metadata.getBlocks()); 43 | } 44 | 45 | public int hashCode() { 46 | return path.hashCode(); 47 | } 48 | 49 | 50 | public static class BlockInfo { 51 | private long offset; 52 | private long length; 53 | private boolean corrupt; 54 | 55 | public BlockInfo(long offset, long length, boolean corrupt) { 56 | this.offset = offset; 57 | this.length = length; 58 | this.corrupt = corrupt; 59 | } 60 | 61 | @Override 62 | public boolean equals(Object object) { 63 | if (this == object) return true; 64 | if (!(object instanceof BlockInfo)) return false; 65 | 66 | BlockInfo blockInfo = (BlockInfo) object; 67 | return this.offset == blockInfo.offset && 68 | this.length == blockInfo.length && 69 | this.corrupt == blockInfo.corrupt; 70 | } 71 | 72 | @Override 73 | public String toString() { 74 | return String.format("[offset = %s, length = %s, corrupt = %s]", offset, length, corrupt); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- 
/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.kafka.connect.data.Struct; 7 | import org.apache.kafka.connect.errors.ConnectException; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.io.IOException; 12 | import java.util.Map; 13 | import java.util.NoSuchElementException; 14 | import java.util.stream.Collectors; 15 | 16 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 17 | 18 | public abstract class AbstractFileReader implements FileReader { 19 | 20 | protected final Logger log = LoggerFactory.getLogger(getClass()); 21 | 22 | private final FileSystem fs; 23 | private final Path filePath; 24 | private final ReaderAdapter adapter; 25 | private final int batchSize; 26 | private boolean seeked; 27 | private long offset; 28 | 29 | public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { 30 | if (fs == null || filePath == null) { 31 | throw new IllegalArgumentException("File system and file path are required."); 32 | } 33 | this.fs = fs; 34 | this.filePath = filePath; 35 | this.adapter = adapter; 36 | this.batchSize = Integer.parseInt(config.getOrDefault(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, "0").toString()); 37 | this.seeked = false; 38 | this.offset = 0; 39 | 40 | configure(readerConfig(config)); 41 | log.trace("{} Initialized file reader with batch size [{}] for file [{}].", this, this.batchSize, this.filePath); 42 | } 43 | 44 | protected final Map readerConfig(Map config) { 45 | return config.entrySet().stream() 46 | .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) 47 | .filter(entry -> entry.getValue() != null) 48 | .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString())); 49 | } 50 | 51 | protected abstract void configure(Map config); 52 | 53 | protected FileSystem getFs() { 54 | return fs; 55 | } 56 | 57 | @Override 58 | public Path getFilePath() { 59 | return filePath; 60 | } 61 | 62 | @Override 63 | public long currentOffset() { 64 | return offset; 65 | } 66 | 67 | protected void incrementOffset() { 68 | offset++; 69 | } 70 | 71 | protected void setOffset(long offset) { 72 | this.offset = offset; 73 | } 74 | 75 | @Override 76 | public final boolean hasNext() { 77 | checkClosed(); 78 | try { 79 | return (batchSize <= 0 || offset == 0 || offset % batchSize != 0 || (offset % batchSize == 0 && seeked)) && 80 | hasNextRecord(); 81 | } catch (ConnectException ce) { 82 | throw ce; 83 | } catch (Exception e) { 84 | throw new ConnectException("Error when checking if the reader has more records.", e); 85 | } 86 | } 87 | 88 | @Override 89 | public final Struct next() { 90 | if (!hasNext()) { 91 | throw new NoSuchElementException("There are no more records in file: " + getFilePath()); 92 | } 93 | try { 94 | Struct struct = adapter.apply(nextRecord()); 95 | seeked = false; 96 | return struct; 97 | } catch (ConnectException ce) { 98 | throw ce; 99 | } catch (Exception e) { 100 | throw new ConnectException("Error processing next record in file: " + getFilePath(), e); 101 | } 102 | } 103 | 104 | public final boolean hasNextBatch() { 105 | checkClosed(); 106 | try { 107 | return batchSize 
> 0 && hasNextRecord(); 108 | } catch (ConnectException ce) { 109 | throw ce; 110 | } catch (Exception e) { 111 | throw new ConnectException("Error when checking if the reader has more batches.", e); 112 | } 113 | } 114 | 115 | public final void nextBatch() { 116 | if (!hasNextBatch()) { 117 | throw new NoSuchElementException("There are no more batches in file: " + getFilePath()); 118 | } 119 | long batchOffset = offset + (offset % batchSize); 120 | seek(batchOffset); 121 | } 122 | 123 | @Override 124 | public final void seek(long offset) { 125 | if (offset < 0) { 126 | throw new IllegalArgumentException("Record offset must be greater than 0."); 127 | } 128 | checkClosed(); 129 | try { 130 | seekFile(offset); 131 | seeked = true; 132 | } catch (IOException ioe) { 133 | throw new ConnectException("Error seeking file: " + getFilePath(), ioe); 134 | } 135 | } 136 | 137 | @Override 138 | public String toString() { 139 | return this.getClass().getSimpleName(); 140 | } 141 | 142 | protected ReaderAdapter getAdapter() { 143 | return adapter; 144 | } 145 | 146 | private void checkClosed() { 147 | if (isClosed()) { 148 | throw new ConnectException("File stream is closed!"); 149 | } 150 | } 151 | 152 | protected abstract T nextRecord() throws IOException; 153 | 154 | protected abstract boolean hasNextRecord() throws IOException; 155 | 156 | protected abstract void seekFile(long offset) throws IOException; 157 | 158 | protected abstract boolean isClosed(); 159 | 160 | } 161 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import io.confluent.connect.avro.AvroData; 4 | import org.apache.avro.Schema; 5 | import org.apache.avro.file.DataFileReader; 6 | import org.apache.avro.generic.GenericRecord; 7 | import org.apache.avro.specific.SpecificDatumReader; 8 | import org.apache.hadoop.fs.AvroFSInput; 9 | import org.apache.hadoop.fs.FileContext; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.kafka.connect.data.Struct; 13 | 14 | import java.io.IOException; 15 | import java.util.Map; 16 | import java.util.Optional; 17 | 18 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 19 | 20 | public class AvroFileReader extends AbstractFileReader { 21 | 22 | private static final String FILE_READER_AVRO = FILE_READER_PREFIX + "avro."; 23 | 24 | public static final String FILE_READER_AVRO_SCHEMA = FILE_READER_AVRO + "schema"; 25 | 26 | private final DataFileReader reader; 27 | private Schema schema; 28 | private boolean closed; 29 | 30 | public AvroFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 31 | super(fs, filePath, new GenericRecordToStruct(), config); 32 | 33 | AvroFSInput input = new AvroFSInput(FileContext.getFileContext(filePath.toUri()), filePath); 34 | if (this.schema == null) { 35 | this.reader = new DataFileReader<>(input, new SpecificDatumReader<>()); 36 | } else { 37 | this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); 38 | } 39 | this.closed = false; 40 | } 41 | 42 | @Override 43 | protected void configure(Map config) { 44 | this.schema = Optional.ofNullable(config.get(FILE_READER_AVRO_SCHEMA)) 45 | .map(c -> new Schema.Parser().parse(c)) 46 | .orElse(null); 47 | } 48 | 49 | @Override 
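// The iteration methods below stream Avro GenericRecords; AbstractFileReader then converts
// each one to a Kafka Connect Struct through the GenericRecordToStruct adapter (AvroData).
// FILE_READER_AVRO_SCHEMA is optional: when absent, the writer schema embedded in the Avro
// container file is used. A minimal usage sketch (paths and schema are hypothetical):
//
//   Map<String, Object> cfg = new HashMap<>();
//   // cfg.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "{ ...reader schema as JSON... }");
//   FileReader reader = new AvroFileReader(
//           FileSystem.get(new Configuration()), new Path("file:///tmp/people.avro"), cfg);
//   while (reader.hasNext()) {
//       Struct record = reader.next();
//   }
//   reader.close();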
50 | public boolean hasNextRecord() { 51 | return reader.hasNext(); 52 | } 53 | 54 | @Override 55 | protected GenericRecord nextRecord() { 56 | GenericRecord record = reader.next(); 57 | incrementOffset(); 58 | 59 | return record; 60 | } 61 | 62 | @Override 63 | public void seekFile(long offset) throws IOException { 64 | if (offset == currentOffset()) { 65 | return; 66 | } else if (offset < currentOffset()) { 67 | reader.sync(0L); 68 | } 69 | while (super.hasNext() && offset > currentOffset()) { 70 | super.next(); 71 | } 72 | setOffset(offset); 73 | } 74 | 75 | @Override 76 | public void close() throws IOException { 77 | closed = true; 78 | reader.sync(0); 79 | reader.close(); 80 | } 81 | 82 | @Override 83 | public boolean isClosed() { 84 | return closed; 85 | } 86 | 87 | static class GenericRecordToStruct implements ReaderAdapter { 88 | 89 | private static final int CACHE_SIZE = 100; 90 | private final AvroData avroData; 91 | 92 | GenericRecordToStruct() { 93 | this.avroData = new AvroData(CACHE_SIZE); 94 | } 95 | 96 | @Override 97 | public Struct apply(GenericRecord record) { 98 | return (Struct) avroData.toConnectData(record.getSchema(), record).value(); 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/BinaryFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.FSDataInputStream; 4 | import org.apache.hadoop.fs.FileStatus; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.kafka.connect.data.Schema; 8 | import org.apache.kafka.connect.data.SchemaBuilder; 9 | import org.apache.kafka.connect.data.Struct; 10 | 11 | import java.io.ByteArrayOutputStream; 12 | import java.io.EOFException; 13 | import java.io.IOException; 14 | import java.util.Map; 15 | 16 | public class BinaryFileReader extends AbstractFileReader { 17 | 18 | private static final String FIELD_PATH = "path"; 19 | private static final String FIELD_OWNER = "owner"; 20 | private static final String FIELD_GROUP = "group"; 21 | private static final String FIELD_LENGTH = "length"; 22 | private static final String FIELD_ACCESS_TIME = "access_time"; 23 | private static final String FIELD_MODIFICATION_TIME = "modification_time"; 24 | private static final String FIELD_CONTENT = "content"; 25 | 26 | protected static final int NUM_RECORDS = 1; 27 | 28 | private final FileStatus fileStatus; 29 | private final Schema schema; 30 | 31 | private FSDataInputStream is; 32 | private boolean closed; 33 | 34 | public BinaryFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 35 | super(fs, filePath, new BinaryToStruct(), config); 36 | 37 | this.is = getFs().open(getFilePath()); 38 | this.fileStatus = getFs().getFileStatus(getFilePath()); 39 | this.schema = buildSchema(); 40 | this.closed = false; 41 | } 42 | 43 | @Override 44 | protected void configure(Map config) { 45 | } 46 | 47 | @Override 48 | protected BinaryRecord nextRecord() throws IOException { 49 | return new BinaryRecord(schema, fileStatus, readFully(is)); 50 | } 51 | 52 | @Override 53 | protected boolean hasNextRecord() throws IOException { 54 | return is.available() > 0; 55 | } 56 | 57 | @Override 58 | protected void seekFile(long offset) throws IOException { 59 | if (offset == 0 && !isClosed()) { 60 | is = getFs().open(getFilePath()); 61 | } else if 
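// This reader exposes the whole file as a single record, so seeking to any
// non-zero offset simply drains the stream and leaves the reader exhausted: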
(!isClosed()){ 62 | readFully(is); 63 | } 64 | } 65 | 66 | @Override 67 | public void close() throws IOException { 68 | closed = true; 69 | is.close(); 70 | } 71 | 72 | @Override 73 | public boolean isClosed() { 74 | return closed; 75 | } 76 | 77 | private Schema buildSchema() { 78 | return SchemaBuilder.struct() 79 | .field(FIELD_PATH, Schema.STRING_SCHEMA) 80 | .field(FIELD_OWNER, Schema.STRING_SCHEMA) 81 | .field(FIELD_GROUP, Schema.STRING_SCHEMA) 82 | .field(FIELD_LENGTH, Schema.INT64_SCHEMA) 83 | .field(FIELD_ACCESS_TIME, Schema.INT64_SCHEMA) 84 | .field(FIELD_MODIFICATION_TIME, Schema.INT64_SCHEMA) 85 | .field(FIELD_CONTENT, Schema.BYTES_SCHEMA) 86 | .build(); 87 | } 88 | 89 | private byte[] readFully(FSDataInputStream in) throws IOException { 90 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 91 | try { 92 | while (true) { 93 | baos.write(in.readByte()); 94 | } 95 | } catch (EOFException ignored) { 96 | } 97 | return baos.toByteArray(); 98 | } 99 | 100 | static class BinaryToStruct implements ReaderAdapter { 101 | 102 | @Override 103 | public Struct apply(BinaryRecord record) { 104 | Struct struct = new Struct(record.schema); 105 | record.schema.fields().forEach(field -> { 106 | Object value = null; 107 | switch (field.name()) { 108 | case FIELD_PATH: 109 | value = record.fileStatus.getPath().toString(); 110 | break; 111 | case FIELD_OWNER: 112 | value = record.fileStatus.getOwner(); 113 | break; 114 | case FIELD_GROUP: 115 | value = record.fileStatus.getGroup(); 116 | break; 117 | case FIELD_LENGTH: 118 | value = record.fileStatus.getLen(); 119 | break; 120 | case FIELD_ACCESS_TIME: 121 | value = record.fileStatus.getAccessTime(); 122 | break; 123 | case FIELD_MODIFICATION_TIME: 124 | value = record.fileStatus.getModificationTime(); 125 | break; 126 | case FIELD_CONTENT: 127 | value = record.content; 128 | break; 129 | } 130 | struct.put(field, value); 131 | }); 132 | return struct; 133 | } 134 | } 135 | 136 | static class BinaryRecord { 137 | 138 | private final Schema schema; 139 | private final FileStatus fileStatus; 140 | private final byte[] content; 141 | 142 | BinaryRecord(Schema schema, FileStatus fileStatus, byte[] content) { 143 | this.schema = schema; 144 | this.fileStatus = fileStatus; 145 | this.content = content; 146 | } 147 | 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | public enum CompressionType { 4 | BZIP2, 5 | GZIP, 6 | NONE; 7 | 8 | private boolean concatenated; 9 | 10 | CompressionType() { 11 | this.concatenated = true; 12 | } 13 | 14 | public boolean isConcatenated() { 15 | return concatenated; 16 | } 17 | 18 | public static CompressionType fromName(String compression, boolean concatenated) { 19 | CompressionType ct = CompressionType.valueOf(compression.trim().toUpperCase()); 20 | ct.concatenated = concatenated; 21 | return ct; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.univocity.parsers.common.AbstractParser; 4 | import com.univocity.parsers.csv.CsvParser; 5 | 
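// Configuration note with a hypothetical example (the key constants are defined below and
// in UnivocityFileReader): the delimited-settings keys are plain connector properties, e.g.
//
//   Map<String, Object> cfg = new HashMap<>();
//   cfg.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ";");
//   cfg.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, "false");
//   // parserSettings(cfg) then yields a CsvParserSettings that uses ';' as the delimiter
//   // with automatic delimiter detection disabled.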
import com.univocity.parsers.csv.CsvParserSettings; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | 9 | import java.io.IOException; 10 | import java.util.Map; 11 | 12 | public class CsvFileReader extends UnivocityFileReader { 13 | 14 | public static final String FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE = FILE_READER_DELIMITED_SETTINGS + "empty_value"; 15 | public static final String FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION = FILE_READER_DELIMITED_SETTINGS + "delimiter_detection"; 16 | public static final String FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED = FILE_READER_DELIMITED_SETTINGS + "escape_unquoted"; 17 | 18 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER = FILE_READER_DELIMITED_SETTINGS_FORMAT + "delimiter"; 19 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote"; 20 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote_escape"; 21 | 22 | public CsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 23 | super(fs, filePath, config); 24 | } 25 | 26 | @Override 27 | protected CsvParserSettings parserSettings(Map config) { 28 | CsvParserSettings settings = new CsvParserSettings(); 29 | settings.setEmptyValue(config.get(FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE)); 30 | settings.setDelimiterDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, false)); 31 | settings.setEscapeUnquotedValues(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED, false)); 32 | settings.getFormat().setDelimiter(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ",")); 33 | settings.getFormat().setQuote(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE, "\"").charAt(0)); 34 | settings.getFormat().setQuoteEscape(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE, "\"").charAt(0)); 35 | 36 | return settings; 37 | } 38 | 39 | @Override 40 | protected AbstractParser createParser(CsvParserSettings settings) { 41 | return new CsvParser(settings); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.kafka.connect.data.Struct; 5 | 6 | import java.io.Closeable; 7 | import java.util.Iterator; 8 | import java.util.function.Function; 9 | 10 | public interface FileReader extends Iterator, Closeable { 11 | 12 | Path getFilePath(); 13 | 14 | void seek(long offset); 15 | 16 | long currentOffset(); 17 | } 18 | 19 | @FunctionalInterface 20 | interface ReaderAdapter extends Function { 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.univocity.parsers.common.AbstractParser; 4 | import com.univocity.parsers.fixed.FixedWidthFields; 5 | import com.univocity.parsers.fixed.FixedWidthParser; 6 | import com.univocity.parsers.fixed.FixedWidthParserSettings; 7 | import 
org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | import java.io.IOException; 11 | import java.util.Arrays; 12 | import java.util.Map; 13 | import java.util.Optional; 14 | 15 | public class FixedWidthFileReader extends UnivocityFileReader { 16 | 17 | public static final String FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS = FILE_READER_DELIMITED_SETTINGS + "field_lengths"; 18 | public static final String FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING = FILE_READER_DELIMITED_SETTINGS + "keep_padding"; 19 | public static final String FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS = FILE_READER_DELIMITED_SETTINGS + "padding_for_headers"; 20 | public static final String FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE = FILE_READER_DELIMITED_SETTINGS + "ends_on_new_line"; 21 | public static final String FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS = FILE_READER_DELIMITED_SETTINGS + "skip_trailing_chars"; 22 | 23 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING = FILE_READER_DELIMITED_SETTINGS_FORMAT + "padding"; 24 | 25 | public FixedWidthFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 26 | super(fs, filePath, config); 27 | } 28 | 29 | @Override 30 | protected FixedWidthParserSettings parserSettings(Map config) { 31 | FixedWidthFields fieldLengths = new FixedWidthFields(); 32 | Optional.ofNullable(config.get(FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS)) 33 | .map(fl -> Arrays.stream(fl.split(","))) 34 | .ifPresent(fl -> fl.forEach(field -> fieldLengths.addField(Integer.parseInt(field)))); 35 | 36 | FixedWidthParserSettings settings = new FixedWidthParserSettings(fieldLengths); 37 | settings.setKeepPadding(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING, false)); 38 | settings.setUseDefaultPaddingForHeaders(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS, true)); 39 | settings.setRecordEndsOnNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE, true)); 40 | settings.setSkipTrailingCharsUntilNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS, false)); 41 | settings.getFormat().setPadding(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING, " ").charAt(0)); 42 | 43 | return settings; 44 | } 45 | 46 | @Override 47 | protected AbstractParser createParser(FixedWidthParserSettings settings) { 48 | return new FixedWidthParser(settings); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | 7 | import java.io.IOException; 8 | import java.util.Map; 9 | 10 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 11 | 12 | public class JsonFileReader extends JacksonFileReader { 13 | 14 | private static final String FILE_READER_JSON = FILE_READER_PREFIX + "json."; 15 | private static final String FILE_READER_JSON_COMPRESSION = FILE_READER_JSON + "compression."; 16 | 17 | static final String FILE_READER_JSON_DESERIALIZATION_CONFIGS = FILE_READER_JSON + "deserialization."; 18 | 19 | public static final String FILE_READER_JSON_RECORD_PER_LINE = FILE_READER_JSON + 
"record_per_line"; 20 | public static final String FILE_READER_JSON_COMPRESSION_TYPE = FILE_READER_JSON_COMPRESSION + "type"; 21 | public static final String FILE_READER_JSON_COMPRESSION_CONCATENATED = FILE_READER_JSON_COMPRESSION + "concatenated"; 22 | public static final String FILE_READER_JSON_ENCODING = FILE_READER_JSON + "encoding"; 23 | 24 | public JsonFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 25 | super(fs, filePath, config); 26 | } 27 | 28 | @Override 29 | protected Object readerEncodingConfig(Map config) { 30 | return config.get(FILE_READER_JSON_ENCODING); 31 | } 32 | 33 | @Override 34 | protected Object recordPerLineConfig(Map config) { 35 | return config.get(FILE_READER_JSON_RECORD_PER_LINE); 36 | } 37 | 38 | @Override 39 | protected Object compressionTypeConfig(Map config) { 40 | return config.get(FILE_READER_JSON_COMPRESSION_TYPE); 41 | } 42 | 43 | @Override 44 | protected Object compressionConcatenatedConfig(Map config) { 45 | return config.get(FILE_READER_JSON_COMPRESSION_CONCATENATED); 46 | } 47 | 48 | @Override 49 | protected String deserializationConfigPrefix() { 50 | return FILE_READER_JSON_DESERIALIZATION_CONFIGS; 51 | } 52 | 53 | @Override 54 | protected ObjectMapper getObjectMapper() { 55 | return new ObjectMapper(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import io.confluent.connect.avro.AvroData; 4 | import org.apache.avro.Schema; 5 | import org.apache.avro.generic.GenericData; 6 | import org.apache.avro.generic.GenericRecord; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.kafka.connect.data.Struct; 11 | import org.apache.parquet.avro.AvroParquetReader; 12 | import org.apache.parquet.avro.AvroReadSupport; 13 | import org.apache.parquet.hadoop.ParquetReader; 14 | import org.apache.parquet.hadoop.util.HadoopInputFile; 15 | 16 | import java.io.IOException; 17 | import java.util.Map; 18 | import java.util.Optional; 19 | 20 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 21 | 22 | public class ParquetFileReader extends AbstractFileReader { 23 | 24 | private static final String FILE_READER_PARQUET = FILE_READER_PREFIX + "parquet."; 25 | 26 | public static final String FILE_READER_PARQUET_SCHEMA = FILE_READER_PARQUET + "schema"; 27 | public static final String FILE_READER_PARQUET_PROJECTION = FILE_READER_PARQUET + "projection"; 28 | 29 | private ParquetReader reader; 30 | private GenericRecord currentRecord; 31 | private Schema schema; 32 | private Schema projection; 33 | private boolean closed; 34 | 35 | public ParquetFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 36 | super(fs, filePath, new GenericRecordToStruct(), config); 37 | 38 | this.reader = initReader(); 39 | this.closed = false; 40 | } 41 | 42 | private ParquetReader initReader() throws IOException { 43 | Configuration configuration = getFs().getConf(); 44 | if (this.schema != null) { 45 | AvroReadSupport.setAvroReadSchema(configuration, this.schema); 46 | } 47 | if (this.projection != null) { 48 | AvroReadSupport.setRequestedProjection(configuration, this.projection); 49 | } 50 | return AvroParquetReader 51 | 
.builder(HadoopInputFile.fromPath(getFilePath(), configuration)) 52 | .build(); 53 | } 54 | 55 | protected void configure(Map config) { 56 | this.schema = Optional.ofNullable(config.get(FILE_READER_PARQUET_SCHEMA)) 57 | .map(c -> new Schema.Parser().parse(c)) 58 | .orElse(null); 59 | this.projection = Optional.ofNullable(config.get(FILE_READER_PARQUET_PROJECTION)) 60 | .map(c -> new Schema.Parser().parse(c)) 61 | .orElse(null); 62 | } 63 | 64 | @Override 65 | public boolean hasNextRecord() throws IOException { 66 | if (currentRecord == null) { 67 | currentRecord = reader.read(); 68 | } 69 | return currentRecord != null; 70 | } 71 | 72 | @Override 73 | protected GenericRecord nextRecord() { 74 | GenericRecord record; 75 | if (this.projection != null) { 76 | record = new GenericData.Record(this.projection); 77 | this.projection.getFields().forEach(field -> record.put(field.name(), currentRecord.get(field.name()))); 78 | } else { 79 | record = currentRecord; 80 | } 81 | currentRecord = null; 82 | incrementOffset(); 83 | return record; 84 | } 85 | 86 | @Override 87 | public void seekFile(long offset) throws IOException { 88 | if (currentOffset() > offset) { 89 | this.reader = initReader(); 90 | this.closed = false; 91 | setOffset(0); 92 | } 93 | while (hasNext() && currentOffset() < offset) { 94 | nextRecord(); 95 | } 96 | } 97 | 98 | @Override 99 | public void close() throws IOException { 100 | closed = true; 101 | reader.close(); 102 | } 103 | 104 | @Override 105 | public boolean isClosed() { 106 | return closed; 107 | } 108 | 109 | static class GenericRecordToStruct implements ReaderAdapter { 110 | private static final int CACHE_SIZE = 100; 111 | private final AvroData avroData; 112 | 113 | GenericRecordToStruct() { 114 | this.avroData = new AvroData(CACHE_SIZE); 115 | } 116 | 117 | @Override 118 | public Struct apply(GenericRecord record) { 119 | return (Struct) avroData.toConnectData(record.getSchema(), record).value(); 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.util.ReflectionUtils; 7 | import org.apache.kafka.connect.data.Schema; 8 | import org.apache.kafka.connect.data.SchemaBuilder; 9 | import org.apache.kafka.connect.data.Struct; 10 | 11 | import java.io.EOFException; 12 | import java.io.IOException; 13 | import java.util.Map; 14 | 15 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 16 | 17 | public class SequenceFileReader extends AbstractFileReader> { 18 | 19 | public static final String FIELD_NAME_KEY_DEFAULT = "key"; 20 | public static final String FIELD_NAME_VALUE_DEFAULT = "value"; 21 | 22 | private static final int DEFAULT_BUFFER_SIZE = 4096; 23 | private static final String FILE_READER_SEQUENCE = FILE_READER_PREFIX + "sequence."; 24 | private static final String FILE_READER_SEQUENCE_FIELD_NAME_PREFIX = FILE_READER_SEQUENCE + "field_name."; 25 | 26 | public static final String FILE_READER_BUFFER_SIZE = FILE_READER_SEQUENCE + "buffer_size"; 27 | public static final String FILE_READER_SEQUENCE_FIELD_NAME_KEY = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "key"; 28 | public static final String 
FILE_READER_SEQUENCE_FIELD_NAME_VALUE = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "value"; 29 | 30 | private final SequenceFile.Reader reader; 31 | private final Writable key, value; 32 | private final Schema schema; 33 | private String keyFieldName, valueFieldName; 34 | private boolean hasNext; 35 | private boolean closed; 36 | 37 | public SequenceFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 38 | super(fs, filePath, new SeqToStruct(), config); 39 | 40 | this.reader = new SequenceFile.Reader(fs.getConf(), 41 | SequenceFile.Reader.file(filePath), 42 | SequenceFile.Reader.bufferSize(fs.getConf().getInt(FILE_READER_BUFFER_SIZE, DEFAULT_BUFFER_SIZE))); 43 | this.key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); 44 | this.value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); 45 | this.schema = SchemaBuilder.struct() 46 | .field(keyFieldName, getSchema(this.key)) 47 | .field(valueFieldName, getSchema(this.value)) 48 | .build(); 49 | this.hasNext = false; 50 | this.closed = false; 51 | } 52 | 53 | @Override 54 | protected void configure(Map config) { 55 | this.keyFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY_DEFAULT); 56 | this.valueFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT); 57 | } 58 | 59 | Schema getSchema(Writable writable) { 60 | if (writable instanceof ByteWritable) { 61 | return SchemaBuilder.INT8_SCHEMA; 62 | } else if (writable instanceof ShortWritable) { 63 | return SchemaBuilder.INT16_SCHEMA; 64 | } else if (writable instanceof IntWritable) { 65 | return SchemaBuilder.INT32_SCHEMA; 66 | } else if (writable instanceof LongWritable) { 67 | return SchemaBuilder.INT64_SCHEMA; 68 | } else if (writable instanceof FloatWritable) { 69 | return SchemaBuilder.FLOAT32_SCHEMA; 70 | } else if (writable instanceof DoubleWritable) { 71 | return SchemaBuilder.FLOAT64_SCHEMA; 72 | } else if (writable instanceof BytesWritable) { 73 | return SchemaBuilder.BYTES_SCHEMA; 74 | } else if (writable instanceof BooleanWritable) { 75 | return SchemaBuilder.BOOLEAN_SCHEMA; 76 | } 77 | return SchemaBuilder.STRING_SCHEMA; 78 | } 79 | 80 | @Override 81 | public boolean hasNextRecord() throws IOException { 82 | try { 83 | if (!hasNext) { 84 | hasNext = reader.next(key, value); 85 | } 86 | return hasNext; 87 | } catch (EOFException eofe) { 88 | return false; 89 | } 90 | } 91 | 92 | @Override 93 | protected SequenceRecord nextRecord() { 94 | incrementOffset(); 95 | hasNext = false; 96 | return new SequenceRecord<>(schema, keyFieldName, key, valueFieldName, value); 97 | } 98 | 99 | @Override 100 | public void seekFile(long offset) throws IOException { 101 | if (offset == currentOffset()) { 102 | return; 103 | } else if (offset < currentOffset()) { 104 | reader.sync(0L); 105 | hasNext = false; 106 | } 107 | while (super.hasNext() && offset > currentOffset()) { 108 | super.next(); 109 | hasNext = false; 110 | } 111 | setOffset(offset); 112 | } 113 | 114 | @Override 115 | public void close() throws IOException { 116 | closed = true; 117 | reader.close(); 118 | } 119 | 120 | @Override 121 | public boolean isClosed() { 122 | return closed; 123 | } 124 | 125 | static class SeqToStruct implements ReaderAdapter> { 126 | 127 | @Override 128 | public Struct apply(SequenceRecord record) { 129 | return new Struct(record.schema) 130 | .put(record.keyFieldName, toSchemaValue(record.key)) 131 | .put(record.valueFieldName, toSchemaValue(record.value)); 
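// Each sequence-file entry becomes a two-field Struct (field names default to "key" and
// "value" unless overridden via FILE_READER_SEQUENCE_FIELD_NAME_KEY/VALUE); toSchemaValue
// below unwraps the Hadoop Writable into the matching Java value, falling back to
// toString() for unrecognized Writable types.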
132 | } 133 | 134 | Object toSchemaValue(Writable writable) { 135 | if (writable instanceof ByteWritable) { 136 | return ((ByteWritable) writable).get(); 137 | } else if (writable instanceof ShortWritable) { 138 | return ((ShortWritable) writable).get(); 139 | } else if (writable instanceof IntWritable) { 140 | return ((IntWritable) writable).get(); 141 | } else if (writable instanceof LongWritable) { 142 | return ((LongWritable) writable).get(); 143 | } else if (writable instanceof FloatWritable) { 144 | return ((FloatWritable) writable).get(); 145 | } else if (writable instanceof DoubleWritable) { 146 | return ((DoubleWritable) writable).get(); 147 | } else if (writable instanceof BytesWritable) { 148 | return ((BytesWritable) writable).getBytes(); 149 | } else if (writable instanceof BooleanWritable) { 150 | return ((BooleanWritable) writable).get(); 151 | } 152 | return writable.toString(); 153 | } 154 | } 155 | 156 | static class SequenceRecord { 157 | 158 | private final Schema schema; 159 | private final String keyFieldName; 160 | private final T key; 161 | private final String valueFieldName; 162 | private final U value; 163 | 164 | SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) { 165 | this.schema = schema; 166 | this.keyFieldName = keyFieldName; 167 | this.key = key; 168 | this.valueFieldName = valueFieldName; 169 | this.value = value; 170 | } 171 | 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; 4 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.kafka.connect.data.Schema; 8 | import org.apache.kafka.connect.data.SchemaBuilder; 9 | import org.apache.kafka.connect.data.Struct; 10 | 11 | import java.io.*; 12 | import java.nio.charset.Charset; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.stream.Collectors; 16 | 17 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 18 | 19 | public class TextFileReader extends AbstractFileReader { 20 | 21 | private static final String FILE_READER_TEXT = FILE_READER_PREFIX + "text."; 22 | private static final String FILE_READER_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name."; 23 | private static final String FILE_READER_TEXT_COMPRESSION = FILE_READER_TEXT + "compression."; 24 | 25 | public static final String FIELD_NAME_VALUE_DEFAULT = "value"; 26 | 27 | public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_FIELD_NAME_PREFIX + "value"; 28 | public static final String FILE_READER_TEXT_RECORD_PER_LINE = FILE_READER_TEXT + "record_per_line"; 29 | public static final String FILE_READER_TEXT_COMPRESSION_TYPE = FILE_READER_TEXT_COMPRESSION + "type"; 30 | public static final String FILE_READER_TEXT_COMPRESSION_CONCATENATED = FILE_READER_TEXT_COMPRESSION + "concatenated"; 31 | public static final String FILE_READER_TEXT_ENCODING = FILE_READER_TEXT + "encoding"; 32 | 33 | private String current; 34 | private boolean finished = false; 35 | private LineNumberReader reader; 36 | private Schema schema; 37 | private Charset 
charset; 38 | private CompressionType compression; 39 | private boolean recordPerLine; 40 | private boolean closed; 41 | 42 | public TextFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 43 | super(fs, filePath, new TxtToStruct(), config); 44 | this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); 45 | this.closed = false; 46 | } 47 | 48 | @Override 49 | protected void configure(Map config) { 50 | this.schema = SchemaBuilder.struct() 51 | .field(config.getOrDefault(FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT), 52 | Schema.STRING_SCHEMA) 53 | .build(); 54 | this.recordPerLine = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_RECORD_PER_LINE, "true")); 55 | String cType = config.getOrDefault(FILE_READER_TEXT_COMPRESSION_TYPE, CompressionType.NONE.toString()); 56 | boolean concatenated = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_COMPRESSION_CONCATENATED, 57 | "true")); 58 | this.compression = CompressionType.fromName(cType, concatenated); 59 | this.charset = Charset.forName(config.getOrDefault(FILE_READER_TEXT_ENCODING, Charset.defaultCharset().name())); 60 | } 61 | 62 | private Reader getFileReader(InputStream inputStream) throws IOException { 63 | final InputStreamReader isr; 64 | switch (this.compression) { 65 | case BZIP2: 66 | isr = new InputStreamReader(new BZip2CompressorInputStream(inputStream, 67 | this.compression.isConcatenated()), this.charset); 68 | break; 69 | case GZIP: 70 | isr = new InputStreamReader(new GzipCompressorInputStream(inputStream, 71 | this.compression.isConcatenated()), this.charset); 72 | break; 73 | default: 74 | isr = new InputStreamReader(inputStream, this.charset); 75 | break; 76 | } 77 | return isr; 78 | } 79 | 80 | @Override 81 | public boolean hasNextRecord() throws IOException { 82 | if (current != null) { 83 | return true; 84 | } else if (finished) { 85 | return false; 86 | } else { 87 | if (!recordPerLine) { 88 | List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); 89 | current = String.join("\n", lines); 90 | finished = true; 91 | return true; 92 | } 93 | for (; ; ) { 94 | String line = reader.readLine(); 95 | if (line == null) { 96 | finished = true; 97 | return false; 98 | } 99 | current = line; 100 | return true; 101 | } 102 | } 103 | } 104 | 105 | @Override 106 | protected TextRecord nextRecord() { 107 | String aux = current; 108 | current = null; 109 | incrementOffset(); 110 | return new TextRecord(schema, aux); 111 | } 112 | 113 | @Override 114 | public void seekFile(long offset) throws IOException { 115 | current = null; 116 | if (offset < reader.getLineNumber()) { 117 | finished = false; 118 | reader.close(); 119 | reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); 120 | } 121 | while (reader.getLineNumber() < offset && reader.readLine() != null) { 122 | } 123 | setOffset(reader.getLineNumber()); 124 | } 125 | 126 | @Override 127 | public void close() throws IOException { 128 | closed = true; 129 | reader.close(); 130 | } 131 | 132 | @Override 133 | public boolean isClosed() { 134 | return closed; 135 | } 136 | 137 | static class TxtToStruct implements ReaderAdapter { 138 | 139 | @Override 140 | public Struct apply(TextRecord record) { 141 | return new Struct(record.schema) 142 | .put(record.schema.fields().get(0), record.value); 143 | } 144 | } 145 | 146 | static class TextRecord { 147 | private final Schema schema; 148 | private final String value; 149 | 150 | TextRecord(Schema schema, String value) { 151 | 
this.schema = schema; 152 | this.value = value; 153 | } 154 | 155 | public String getValue() { 156 | return value; 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.univocity.parsers.common.AbstractParser; 4 | import com.univocity.parsers.tsv.TsvParser; 5 | import com.univocity.parsers.tsv.TsvParserSettings; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | 9 | import java.io.IOException; 10 | import java.util.Map; 11 | 12 | public class TsvFileReader extends UnivocityFileReader { 13 | 14 | public static final String FILE_READER_DELIMITED_SETTINGS_LINE_JOINING = FILE_READER_DELIMITED_SETTINGS + "line_joining"; 15 | 16 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escape"; 17 | public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escaped_char"; 18 | 19 | public TsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 20 | super(fs, filePath, config); 21 | } 22 | 23 | @Override 24 | protected TsvParserSettings parserSettings(Map config) { 25 | TsvParserSettings settings = new TsvParserSettings(); 26 | settings.setLineJoiningEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_JOINING, false)); 27 | settings.getFormat().setEscapeChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE, "\"").charAt(0)); 28 | settings.getFormat().setEscapedTabChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR, "\"").charAt(0)); 29 | 30 | return settings; 31 | } 32 | 33 | @Override 34 | protected AbstractParser createParser(TsvParserSettings settings) { 35 | return new TsvParser(settings); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/XmlFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.fasterxml.jackson.dataformat.xml.XmlMapper; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | 8 | import java.io.IOException; 9 | import java.util.Map; 10 | 11 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 12 | 13 | public class XmlFileReader extends JacksonFileReader { 14 | 15 | private static final String FILE_READER_XML = FILE_READER_PREFIX + "xml."; 16 | private static final String FILE_READER_XML_COMPRESSION = FILE_READER_XML + "compression."; 17 | 18 | static final String FILE_READER_XML_DESERIALIZATION_CONFIGS = FILE_READER_XML + "deserialization."; 19 | 20 | public static final String FILE_READER_XML_RECORD_PER_LINE = FILE_READER_XML + "record_per_line"; 21 | public static final String FILE_READER_XML_COMPRESSION_TYPE = FILE_READER_XML_COMPRESSION + "type"; 22 | public static final String FILE_READER_XML_COMPRESSION_CONCATENATED = FILE_READER_XML_COMPRESSION + "concatenated"; 23 | public static final String FILE_READER_XML_ENCODING = FILE_READER_XML + "encoding"; 24 | 25 | public XmlFileReader(FileSystem 
fs, Path filePath, Map config) throws IOException { 26 | super(fs, filePath, config); 27 | } 28 | 29 | @Override 30 | protected Object readerEncodingConfig(Map config) { 31 | return config.get(FILE_READER_XML_ENCODING); 32 | } 33 | 34 | @Override 35 | protected Object recordPerLineConfig(Map config) { 36 | return config.get(FILE_READER_XML_RECORD_PER_LINE); 37 | } 38 | 39 | @Override 40 | protected Object compressionTypeConfig(Map config) { 41 | return config.get(FILE_READER_XML_COMPRESSION_TYPE); 42 | } 43 | 44 | @Override 45 | protected Object compressionConcatenatedConfig(Map config) { 46 | return config.get(FILE_READER_XML_COMPRESSION_CONCATENATED); 47 | } 48 | 49 | @Override 50 | protected String deserializationConfigPrefix() { 51 | return FILE_READER_XML_DESERIALIZATION_CONFIGS; 52 | } 53 | 54 | @Override 55 | protected ObjectMapper getObjectMapper() { 56 | return new XmlMapper(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/YamlFileReader.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | 8 | import java.io.IOException; 9 | import java.util.Map; 10 | 11 | import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; 12 | 13 | public class YamlFileReader extends JacksonFileReader { 14 | 15 | private static final String FILE_READER_YAML = FILE_READER_PREFIX + "yaml."; 16 | private static final String FILE_READER_YAML_COMPRESSION = FILE_READER_YAML + "compression."; 17 | 18 | static final String FILE_READER_YAML_DESERIALIZATION_CONFIGS = FILE_READER_YAML + "deserialization."; 19 | 20 | public static final String FILE_READER_YAML_COMPRESSION_TYPE = FILE_READER_YAML_COMPRESSION + "type"; 21 | public static final String FILE_READER_YAML_COMPRESSION_CONCATENATED = FILE_READER_YAML_COMPRESSION + "concatenated"; 22 | public static final String FILE_READER_YAML_ENCODING = FILE_READER_YAML + "encoding"; 23 | 24 | public YamlFileReader(FileSystem fs, Path filePath, Map config) throws IOException { 25 | super(fs, filePath, config); 26 | } 27 | 28 | @Override 29 | protected Object readerEncodingConfig(Map config) { 30 | return config.get(FILE_READER_YAML_ENCODING); 31 | } 32 | 33 | @Override 34 | protected Object recordPerLineConfig(Map config) { 35 | return false; 36 | } 37 | 38 | @Override 39 | protected Object compressionTypeConfig(Map config) { 40 | return config.get(FILE_READER_YAML_COMPRESSION_TYPE); 41 | } 42 | 43 | @Override 44 | protected Object compressionConcatenatedConfig(Map config) { 45 | return config.get(FILE_READER_YAML_COMPRESSION_CONCATENATED); 46 | } 47 | 48 | @Override 49 | protected String deserializationConfigPrefix() { 50 | return FILE_READER_YAML_DESERIALIZATION_CONFIGS; 51 | } 52 | 53 | @Override 54 | protected ObjectMapper getObjectMapper() { 55 | return new YAMLMapper(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.cronutils.model.CronType; 4 
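// Policy configuration sketch with hypothetical values ("props" stands in for the connector
// properties): the expression is parsed with the Quartz cron definition and the optional end
// date with LocalDateTime.parse, so an ISO-8601 local date-time is expected, e.g.
//
//   props.put(CronPolicy.CRON_POLICY_EXPRESSION, "0 0/5 * * * ?");  // every five minutes
//   props.put(CronPolicy.CRON_POLICY_END_DATE, "2024-12-31T23:59"); // optional cut-off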
| import com.cronutils.model.definition.CronDefinitionBuilder; 5 | import com.cronutils.model.time.ExecutionTime; 6 | import com.cronutils.parser.CronParser; 7 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 8 | import org.apache.kafka.common.config.ConfigException; 9 | import org.apache.kafka.common.utils.SystemTime; 10 | import org.apache.kafka.common.utils.Time; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.io.IOException; 15 | import java.time.DateTimeException; 16 | import java.time.LocalDateTime; 17 | import java.time.ZoneId; 18 | import java.time.ZonedDateTime; 19 | import java.util.Date; 20 | import java.util.Map; 21 | 22 | public class CronPolicy extends AbstractPolicy { 23 | 24 | private static final Logger log = LoggerFactory.getLogger(CronPolicy.class); 25 | 26 | private static final String CRON_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "cron."; 27 | 28 | public static final String CRON_POLICY_EXPRESSION = CRON_POLICY_PREFIX + "expression"; 29 | public static final String CRON_POLICY_END_DATE = CRON_POLICY_PREFIX + "end_date"; 30 | 31 | private final Time time; 32 | private ExecutionTime executionTime; 33 | private Date endDate; 34 | 35 | public CronPolicy(FsSourceTaskConfig conf) throws IOException { 36 | super(conf); 37 | this.time = new SystemTime(); 38 | } 39 | 40 | @Override 41 | protected void configPolicy(Map customConfigs) { 42 | try { 43 | if (customConfigs.get(CRON_POLICY_END_DATE) != null && 44 | !customConfigs.get(CRON_POLICY_END_DATE).toString().equals("")) { 45 | endDate = Date.from(LocalDateTime.parse(customConfigs.get(CRON_POLICY_END_DATE).toString().trim()) 46 | .atZone(ZoneId.systemDefault()).toInstant()); 47 | } 48 | executionTime = ExecutionTime.forCron( 49 | new CronParser(CronDefinitionBuilder.instanceDefinitionFor(CronType.QUARTZ)) 50 | .parse(customConfigs.get(CRON_POLICY_EXPRESSION).toString()) 51 | ); 52 | } catch (DateTimeException dte) { 53 | throw new ConfigException(CRON_POLICY_END_DATE + " property must have a proper value. Got: '" + 54 | customConfigs.get(CRON_POLICY_END_DATE) + "'."); 55 | } catch (IllegalArgumentException iae) { 56 | throw new ConfigException(CRON_POLICY_EXPRESSION + " property must have a proper value. 
Got: '" + 57 | customConfigs.get(CRON_POLICY_EXPRESSION) + "'."); 58 | } 59 | } 60 | 61 | @Override 62 | protected void preCheck() { 63 | executionTime.timeToNextExecution(ZonedDateTime.now()) 64 | .ifPresent(next -> time.sleep(next.toMillis())); 65 | } 66 | 67 | @Override 68 | protected boolean isPolicyCompleted() { 69 | return (endDate != null && 70 | endDate.before(Date.from(LocalDateTime.now().atZone(ZoneId.systemDefault()).toInstant()))) || 71 | !executionTime.timeToNextExecution(ZonedDateTime.now()).isPresent(); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; 5 | import org.apache.kafka.connect.storage.OffsetStorageReader; 6 | 7 | import java.io.Closeable; 8 | import java.io.IOException; 9 | import java.util.Iterator; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | public interface Policy extends Closeable { 14 | 15 | Iterator execute() throws IOException; 16 | 17 | FileReader offer(FileMetadata metadata, Map offset) throws IOException; 18 | 19 | boolean hasEnded(); 20 | 21 | List getURIs(); 22 | 23 | long getExecutions(); 24 | 25 | void interrupt(); 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicy.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | 8 | public class SimplePolicy extends AbstractPolicy { 9 | 10 | public SimplePolicy(FsSourceTaskConfig conf) throws IOException { 11 | super(conf); 12 | } 13 | 14 | @Override 15 | protected void configPolicy(Map customConfigs) { 16 | 17 | } 18 | 19 | @Override 20 | protected boolean isPolicyCompleted() { 21 | return getExecutions() > 0; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import org.apache.kafka.common.config.ConfigException; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | import java.util.Map; 10 | 11 | public class SleepyPolicy extends AbstractPolicy { 12 | 13 | private static final Logger log = LoggerFactory.getLogger(SleepyPolicy.class); 14 | 15 | private static final int DEFAULT_SLEEP_FRACTION = 10; 16 | private static final int DEFAULT_MAX_EXECS = -1; 17 | private static final String SLEEPY_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "sleepy."; 18 | 19 | public static final String SLEEPY_POLICY_SLEEP_MS = SLEEPY_POLICY_PREFIX + "sleep"; 20 | public static final String SLEEPY_POLICY_SLEEP_FRACTION = SLEEPY_POLICY_PREFIX + "fraction"; 21 | public static final String SLEEPY_POLICY_MAX_EXECS = SLEEPY_POLICY_PREFIX + "max_execs"; 22 | 23 | private long sleep; 24 | 
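// The configured sleep is split into "fraction" slices so hasEnded() is re-checked between
// naps. Hypothetical example values ("props" stands in for the connector properties):
//
//   props.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "10000");    // required, in milliseconds
//   props.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "10"); // optional, default 10
//   props.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "-1");      // optional, -1 = unlimited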
private long sleepFraction; 25 | private long maxExecs; 26 | 27 | public SleepyPolicy(FsSourceTaskConfig conf) throws IOException { 28 | super(conf); 29 | } 30 | 31 | @Override 32 | protected void configPolicy(Map customConfigs) { 33 | try { 34 | this.sleep = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); 35 | } catch (NumberFormatException nfe) { 36 | throw new ConfigException(SLEEPY_POLICY_SLEEP_MS + " property is required and must be a number (long). Got: " + 37 | customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); 38 | } 39 | try { 40 | this.maxExecs = Long.parseLong((String) customConfigs.getOrDefault(SLEEPY_POLICY_MAX_EXECS, 41 | String.valueOf(DEFAULT_MAX_EXECS))); 42 | } catch (NumberFormatException nfe) { 43 | throw new ConfigException(SLEEPY_POLICY_MAX_EXECS + " property must be a number (long). Got: " + 44 | customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); 45 | } 46 | try { 47 | this.sleepFraction = Long.parseLong((String) customConfigs.getOrDefault(SLEEPY_POLICY_SLEEP_FRACTION, 48 | String.valueOf(DEFAULT_SLEEP_FRACTION))); 49 | } catch (NumberFormatException nfe) { 50 | throw new ConfigException(SLEEPY_POLICY_SLEEP_FRACTION + " property must be a number (long). Got: " + 51 | customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); 52 | } 53 | } 54 | 55 | @Override 56 | protected void preCheck() { 57 | sleepIfApply(); 58 | } 59 | 60 | private void sleepIfApply() { 61 | if (getExecutions() > 0) { 62 | int counter = 0; 63 | while (!hasEnded() && counter < sleepFraction) { 64 | try { 65 | Thread.sleep(sleep / sleepFraction); 66 | counter++; 67 | } catch (InterruptedException ie) { 68 | log.warn("{} An interrupted exception has occurred when sleeping: {}", this, ie.getMessage(), ie); 69 | } 70 | } 71 | } 72 | } 73 | 74 | @Override 75 | protected boolean isPolicyCompleted() { 76 | return maxExecs >= 0 && getExecutions() >= maxExecs; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/util/Iterators.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.util; 2 | 3 | import java.util.*; 4 | 5 | public class Iterators { 6 | 7 | public static Iterator> partition(Iterator it, int size) { 8 | if (size <= 0) { 9 | return Collections.singletonList(it).iterator(); 10 | } 11 | 12 | return new Iterator>() { 13 | @Override 14 | public boolean hasNext() { 15 | return it.hasNext(); 16 | } 17 | 18 | @Override 19 | public Iterator next() { 20 | if (!hasNext()) { 21 | throw new NoSuchElementException(); 22 | } 23 | List elements = new ArrayList<>(size); 24 | while (it.hasNext() && elements.size() < size) { 25 | elements.add(it.next()); 26 | } 27 | return elements.iterator(); 28 | } 29 | }; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.util; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; 5 | import com.github.mmolimar.kafka.connect.fs.policy.Policy; 6 | import org.apache.commons.lang3.reflect.ConstructorUtils; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.kafka.connect.errors.ConnectException; 10 | 11 | import 
java.lang.reflect.Constructor; 12 | import java.lang.reflect.InvocationTargetException; 13 | import java.util.Arrays; 14 | import java.util.Map; 15 | 16 | public class ReflectionUtils { 17 | 18 | public static FileReader makeReader(Class clazz, FileSystem fs, 19 | Path path, Map config) { 20 | return make(clazz, fs, path, config); 21 | } 22 | 23 | public static Policy makePolicy(Class clazz, FsSourceTaskConfig conf) { 24 | return make(clazz, conf); 25 | } 26 | 27 | private static T make(Class clazz, Object... args) { 28 | try { 29 | Class[] constClasses = Arrays.stream(args).map(Object::getClass).toArray(Class[]::new); 30 | Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses); 31 | 32 | return constructor.newInstance(args); 33 | } catch (IllegalAccessException | 34 | InstantiationException | 35 | InvocationTargetException e) { 36 | throw new ConnectException(e.getCause()); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.util; 2 | 3 | import java.util.stream.Stream; 4 | 5 | @FunctionalInterface 6 | public interface TailCall { 7 | 8 | TailCall apply(); 9 | 10 | default boolean completed() { 11 | return false; 12 | } 13 | 14 | default T result() { 15 | throw new IllegalStateException("Call does not have a value."); 16 | } 17 | 18 | default T invoke() { 19 | return Stream.iterate(this, TailCall::apply) 20 | .filter(TailCall::completed) 21 | .findFirst() 22 | .get() 23 | .result(); 24 | } 25 | 26 | static TailCall done(final T value) { 27 | return new TailCall() { 28 | @Override 29 | public boolean completed() { 30 | return true; 31 | } 32 | 33 | @Override 34 | public T result() { 35 | return value; 36 | } 37 | 38 | @Override 39 | public TailCall apply() { 40 | throw new IllegalStateException("Done cannot be applied."); 41 | } 42 | }; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.util.Properties; 7 | 8 | public class Version { 9 | private static final Logger log = LoggerFactory.getLogger(Version.class); 10 | private static String version = "unknown"; 11 | 12 | static { 13 | try { 14 | Properties props = new Properties(); 15 | props.load(Version.class.getResourceAsStream("/kafka-connect-fs-version.properties")); 16 | version = props.getProperty("version", version).trim(); 17 | } catch (Exception e) { 18 | log.warn("Error while loading version: {}", e.getMessage(), e); 19 | } 20 | } 21 | 22 | public static String getVersion() { 23 | return version; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem: -------------------------------------------------------------------------------- 1 | org.apache.hadoop.fs.LocalFileSystem 2 | org.apache.hadoop.fs.viewfs.ViewFileSystem 3 | org.apache.hadoop.fs.HarFileSystem 4 | org.apache.hadoop.fs.http.HttpFileSystem 5 | org.apache.hadoop.fs.http.HttpsFileSystem 6 | org.apache.hadoop.fs.ftp.FTPFileSystem 7 
| org.apache.hadoop.hdfs.DistributedFileSystem 8 | org.apache.hadoop.fs.s3a.S3AFileSystem 9 | org.apache.hadoop.fs.s3native.NativeS3FileSystem 10 | org.apache.hadoop.fs.adl.AdlFileSystem 11 | org.apache.hadoop.fs.azure.NativeAzureFileSystem 12 | org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure 13 | org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem 14 | org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem 15 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 16 | -------------------------------------------------------------------------------- /src/main/resources/kafka-connect-fs-version.properties: -------------------------------------------------------------------------------- 1 | version=${project.version} 2 | -------------------------------------------------------------------------------- /src/main/scala/com/github/mmolimar/kafka/connect/fs/file/reader/CobrixReader.scala: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader 2 | 3 | import com.github.mmolimar.kafka.connect.fs.file.reader.CobolFileReader.StructHandler 4 | import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters 5 | import za.co.absa.cobrix.cobol.reader.{VarLenNestedReader, VarLenReader} 6 | 7 | import scala.collection.Seq 8 | 9 | protected object CobrixReader { 10 | 11 | def varLenReader(copybookContent: String, params: ReaderParameters): VarLenReader = { 12 | new VarLenNestedReader[java.util.Map[String, AnyRef]](Seq(copybookContent), params, new StructHandler()) 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.hdfs.MiniDFSCluster; 6 | 7 | import java.io.IOException; 8 | import java.net.URI; 9 | import java.nio.file.Files; 10 | 11 | public abstract class AbstractHdfsFsConfig implements FsTestConfig { 12 | private MiniDFSCluster cluster; 13 | private FileSystem fs; 14 | private URI fsUri; 15 | 16 | @Override 17 | public final void initFs() throws IOException { 18 | Configuration clusterConfig = new Configuration(); 19 | java.nio.file.Path hdfsDir = Files.createTempDirectory("test-"); 20 | clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); 21 | cluster = new MiniDFSCluster.Builder(clusterConfig).build(); 22 | fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); 23 | fs = FileSystem.newInstance(fsUri, new Configuration()); 24 | init(); 25 | } 26 | 27 | protected abstract void init() throws IOException; 28 | 29 | @Override 30 | public FileSystem getFs() { 31 | return fs; 32 | } 33 | 34 | @Override 35 | public URI getFsUri() { 36 | return fsUri; 37 | } 38 | 39 | @Override 40 | public void close() throws IOException { 41 | fs.close(); 42 | cluster.shutdown(true, true); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import org.apache.commons.io.FileUtils; 4 | import org.apache.hadoop.conf.Configuration; 5 
| import org.apache.hadoop.fs.FileSystem; 6 | 7 | import java.io.IOException; 8 | import java.net.URI; 9 | import java.nio.file.Files; 10 | 11 | public abstract class AbstractLocalFsConfig implements FsTestConfig { 12 | private java.nio.file.Path localDir; 13 | private FileSystem fs; 14 | private URI fsUri; 15 | 16 | @Override 17 | public final void initFs() throws IOException { 18 | localDir = Files.createTempDirectory("test-"); 19 | fsUri = localDir.toUri(); 20 | fs = FileSystem.newInstance(fsUri, new Configuration()); 21 | init(); 22 | } 23 | 24 | protected abstract void init() throws IOException; 25 | 26 | @Override 27 | public FileSystem getFs() { 28 | return fs; 29 | } 30 | 31 | @Override 32 | public URI getFsUri() { 33 | return fsUri; 34 | } 35 | 36 | @Override 37 | public void close() throws IOException { 38 | fs.close(); 39 | FileUtils.deleteDirectory(localDir.toFile()); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | 5 | import java.io.Closeable; 6 | import java.io.IOException; 7 | import java.net.URI; 8 | 9 | public interface FsTestConfig extends Closeable { 10 | 11 | void initFs() throws IOException; 12 | 13 | FileSystem getFs(); 14 | 15 | URI getFsUri(); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.connector; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; 4 | import org.apache.kafka.common.config.ConfigDef; 5 | import org.junit.jupiter.api.Test; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertFalse; 8 | import static org.junit.jupiter.api.Assertions.assertNotNull; 9 | 10 | public class FsSourceConnectorConfigTest { 11 | 12 | @Test 13 | public void checkDocumentation() { 14 | ConfigDef config = FsSourceConnectorConfig.conf(); 15 | config.names().forEach(key -> { 16 | assertFalse(config.configKeys().get(key).documentation == null || 17 | "".equals(config.configKeys().get(key).documentation.trim()), 18 | () -> "Property " + key + " should be documented"); 19 | }); 20 | } 21 | 22 | @Test 23 | public void toRst() { 24 | assertNotNull(FsSourceConnectorConfig.conf().toRst()); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.connector; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceConnector; 4 | import com.github.mmolimar.kafka.connect.fs.FsSourceTask; 5 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 6 | import org.apache.kafka.connect.errors.ConnectException; 7 | import org.junit.jupiter.api.BeforeEach; 8 | import org.junit.jupiter.api.Test; 9 | import org.junit.jupiter.api.io.TempDir; 10 | 11 | import java.io.File; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.stream.IntStream; 16 | 17 | import static 
org.junit.jupiter.api.Assertions.*; 18 | 19 | public class FsSourceConnectorTest { 20 | @TempDir 21 | public static File temporaryFolder; 22 | 23 | private FsSourceConnector connector; 24 | private Map<String, String> connProps; 25 | 26 | @BeforeEach 27 | public void setup() { 28 | connector = new FsSourceConnector(); 29 | 30 | Map<String, String> cfg = new HashMap<String, String>() {{ 31 | put(FsSourceTaskConfig.FS_URIS, String.join(",", 32 | temporaryFolder.toURI() + File.separator + "dir1", 33 | temporaryFolder.toURI() + File.separator + "dir2", 34 | temporaryFolder.toURI() + File.separator + "dir3")); 35 | put(FsSourceTaskConfig.TOPIC, "topic_test"); 36 | }}; 37 | connProps = new HashMap<>(cfg); 38 | } 39 | 40 | @Test 41 | public void nullProperties() { 42 | assertThrows(ConnectException.class, () -> connector.start(null)); 43 | } 44 | 45 | @Test 46 | public void expectedFsUris() { 47 | Map<String, String> testProps = new HashMap<>(connProps); 48 | testProps.remove(FsSourceTaskConfig.FS_URIS); 49 | assertThrows(ConnectException.class, () -> connector.start(testProps)); 50 | } 51 | 52 | @Test 53 | public void minimumConfig() { 54 | connector.start(connProps); 55 | connector.stop(); 56 | } 57 | 58 | @Test 59 | public void checkTaskClass() { 60 | assertEquals(FsSourceTask.class, connector.taskClass()); 61 | } 62 | 63 | @Test 64 | public void configTasksWithoutStart() { 65 | assertThrows(ConnectException.class, () -> connector.taskConfigs(1)); 66 | } 67 | 68 | @Test 69 | public void invalidConfigTaskNumber() { 70 | connector.start(connProps); 71 | assertThrows(IllegalArgumentException.class, () -> connector.taskConfigs(0)); 72 | } 73 | 74 | @Test 75 | public void configTasks() { 76 | connector.start(connProps); 77 | int uris = connProps.get(FsSourceTaskConfig.FS_URIS).split(",").length; 78 | IntStream.range(1, connProps.get(FsSourceTaskConfig.FS_URIS).split(",").length + 1) 79 | .forEach(index -> { 80 | List<Map<String, String>> taskConfigs = connector.taskConfigs(index); 81 | assertEquals(taskConfigs.size(), Math.min(index, uris)); 82 | }); 83 | connector.stop(); 84 | } 85 | 86 | @Test 87 | public void checkVersion() { 88 | assertNotNull(connector.version()); 89 | assertFalse("unknown".equalsIgnoreCase(connector.version())); 90 | } 91 | 92 | @Test 93 | public void checkDefaultConf() { 94 | assertNotNull(connector.config()); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.avro.AvroTypeException; 4 | import org.apache.avro.Schema; 5 | import org.apache.avro.SchemaParseException; 6 | import org.apache.avro.file.DataFileWriter; 7 | import org.apache.avro.generic.GenericData; 8 | import org.apache.avro.generic.GenericDatumWriter; 9 | import org.apache.avro.generic.GenericRecord; 10 | import org.apache.avro.io.DatumWriter; 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.kafka.connect.data.Struct; 15 | import org.apache.kafka.connect.errors.ConnectException; 16 | import org.junit.jupiter.api.BeforeAll; 17 | import org.junit.jupiter.params.ParameterizedTest; 18 | import org.junit.jupiter.params.provider.MethodSource; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | import java.util.UUID; 25
| import java.util.stream.IntStream; 26 | 27 | import static org.junit.jupiter.api.Assertions.*; 28 | 29 | public class AvroFileReaderTest extends FileReaderTestBase { 30 | 31 | private static final String FIELD_INDEX = "index"; 32 | private static final String FIELD_NAME = "name"; 33 | private static final String FIELD_SURNAME = "surname"; 34 | private static final String FILE_EXTENSION = "avr"; 35 | 36 | private static Schema schema; 37 | 38 | @BeforeAll 39 | public static void setUp() throws IOException { 40 | schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); 41 | } 42 | 43 | @Override 44 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 45 | File avroFile = File.createTempFile("test-", "." + getFileExtension()); 46 | DatumWriter writer = new GenericDatumWriter<>(schema); 47 | try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { 48 | dataFileWriter.setFlushOnEveryBlock(true); 49 | dataFileWriter.setSyncInterval(32); 50 | dataFileWriter.create(schema, avroFile); 51 | 52 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 53 | GenericRecord datum = new GenericData.Record(schema); 54 | datum.put(FIELD_INDEX, index); 55 | datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); 56 | datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); 57 | try { 58 | fsConfig.offsetsByIndex().put(index, (long) index); 59 | dataFileWriter.append(datum); 60 | } catch (IOException ioe) { 61 | throw new RuntimeException(ioe); 62 | } 63 | }); 64 | } 65 | Path path = new Path(new Path(fsConfig.getFsUri()), avroFile.getName()); 66 | fsConfig.getFs().moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); 67 | return path; 68 | } 69 | 70 | @ParameterizedTest 71 | @MethodSource("fileSystemConfigProvider") 72 | public void readerWithSchema(ReaderFsTestConfig fsConfig) throws IOException { 73 | Map readerConfig = getReaderConfig(); 74 | readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); 75 | FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); 76 | fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); 77 | readAllData(fsConfig); 78 | } 79 | 80 | @ParameterizedTest 81 | @MethodSource("fileSystemConfigProvider") 82 | public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws IOException { 83 | Map readerConfig = getReaderConfig(); 84 | readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); 85 | FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); 86 | fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); 87 | assertThrows(ConnectException.class, () -> readAllData(fsConfig)); 88 | assertThrows(AvroTypeException.class, () -> { 89 | try { 90 | readAllData(fsConfig); 91 | } catch (Exception e) { 92 | throw e.getCause(); 93 | } 94 | }); 95 | } 96 | 97 | @ParameterizedTest 98 | @MethodSource("fileSystemConfigProvider") 99 | public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) throws IOException { 100 | Map readerConfig = getReaderConfig(); 101 | readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); 102 | FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); 103 | assertThrows(ConnectException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); 104 | 
assertThrows(SchemaParseException.class, () -> { 105 | try { 106 | getReader(testFs, fsConfig.getDataFile(), readerConfig); 107 | } catch (Exception e) { 108 | throw e.getCause(); 109 | } 110 | }); 111 | } 112 | 113 | @Override 114 | protected Class getReaderClass() { 115 | return AvroFileReader.class; 116 | } 117 | 118 | @Override 119 | protected Map getReaderConfig() { 120 | return new HashMap<>(); 121 | } 122 | 123 | @Override 124 | protected void checkData(Struct record, long index) { 125 | assertAll( 126 | () -> assertEquals(index, (int) record.get(FIELD_INDEX)), 127 | () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), 128 | () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) 129 | ); 130 | } 131 | 132 | @Override 133 | protected String getFileExtension() { 134 | return FILE_EXTENSION; 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/BinaryFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.kafka.connect.data.Struct; 6 | import org.junit.jupiter.api.Disabled; 7 | import org.junit.jupiter.params.ParameterizedTest; 8 | import org.junit.jupiter.params.provider.MethodSource; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | import java.nio.file.Files; 13 | import java.util.HashMap; 14 | import java.util.Map; 15 | import java.util.NoSuchElementException; 16 | import java.util.stream.IntStream; 17 | 18 | import static org.junit.jupiter.api.Assertions.*; 19 | 20 | public class BinaryFileReaderTest extends FileReaderTestBase { 21 | 22 | private static final String FILE_EXTENSION = "bin"; 23 | 24 | @Override 25 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 26 | File binaryFile = File.createTempFile("test-", "." + getFileExtension()); 27 | byte[] content = "test".getBytes(); 28 | Path path = new Path(new Path(fsConfig.getFsUri()), binaryFile.getName()); 29 | Files.write(binaryFile.toPath(), content); 30 | fsConfig.getFs().moveFromLocalFile(new Path(binaryFile.getAbsolutePath()), path); 31 | IntStream.range(0, NUM_RECORDS).forEach(index -> fsConfig.offsetsByIndex().put(index, (long) 0)); 32 | return path; 33 | } 34 | 35 | @ParameterizedTest 36 | @MethodSource("fileSystemConfigProvider") 37 | public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { 38 | File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); 39 | Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 40 | fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); 41 | FileReader reader = getReader(fsConfig.getFs(), path, getReaderConfig()); 42 | assertFalse(reader.hasNext()); 43 | } 44 | 45 | @ParameterizedTest 46 | @MethodSource("fileSystemConfigProvider") 47 | @Override 48 | public void readAllData(ReaderFsTestConfig fsConfig) { 49 | FileReader reader = fsConfig.getReader(); 50 | assertTrue(reader.hasNext()); 51 | 52 | int recordCount = 0; 53 | while (reader.hasNext()) { 54 | Struct record = reader.next(); 55 | checkData(record, recordCount); 56 | recordCount++; 57 | } 58 | assertEquals(1, recordCount, "The number of records in the file does not match"); 59 | } 60 | 61 | @ParameterizedTest 62 | @MethodSource("fileSystemConfigProvider") 63 | public void readAllDataInBatches(ReaderFsTestConfig fsConfig) { 64 | Map config = getReaderConfig(); 65 | int batchSize = 5; 66 | config.put(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, batchSize); 67 | AbstractFileReader reader = (AbstractFileReader) getReader(fsConfig.getFs(), fsConfig.getDataFile(), config); 68 | assertTrue(reader.hasNext()); 69 | 70 | int recordCount = 0; 71 | while (reader.hasNextBatch()) { 72 | reader.nextBatch(); 73 | while (reader.hasNext()) { 74 | Struct record = reader.next(); 75 | checkData(record, recordCount); 76 | recordCount++; 77 | } 78 | assertEquals(1, recordCount % batchSize); 79 | } 80 | assertThrows(NoSuchElementException.class, reader::nextBatch); 81 | assertEquals(1, recordCount, "The number of records in the file does not match"); 82 | } 83 | 84 | @ParameterizedTest 85 | @MethodSource("fileSystemConfigProvider") 86 | @Disabled 87 | public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { 88 | } 89 | 90 | @Override 91 | protected Class getReaderClass() { 92 | return BinaryFileReader.class; 93 | } 94 | 95 | @Override 96 | protected Map getReaderConfig() { 97 | return new HashMap<>(); 98 | } 99 | 100 | @Override 101 | protected void checkData(Struct record, long index) { 102 | assertAll( 103 | () -> assertFalse(record.get("path").toString().isEmpty()), 104 | () -> assertFalse(record.get("owner").toString().isEmpty()), 105 | () -> assertFalse(record.get("group").toString().isEmpty()), 106 | () -> assertEquals(record.getInt64("length"), 4L), 107 | () -> assertNotNull(record.get("access_time")), 108 | () -> assertNotNull(record.get("modification_time")), 109 | () -> assertEquals(new String(record.getBytes("content")), "test") 110 | ); 111 | } 112 | 113 | @Override 114 | protected String getFileExtension() { 115 | return FILE_EXTENSION; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.kafka.connect.data.Struct; 5 | import org.junit.jupiter.params.ParameterizedTest; 6 | import org.junit.jupiter.params.provider.MethodSource; 7 | 8 | import java.io.File; 9 | import java.io.FileWriter; 10 | import java.io.IOException; 11 | import java.io.PrintWriter; 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | import java.util.stream.IntStream; 15 | 16 | import static org.junit.jupiter.api.Assertions.*; 17 | 18 | public class CsvFileReaderTest 
extends UnivocityFileReaderTest { 19 | 20 | @Override 21 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 22 | boolean header = args.length < 1 || (boolean) args[0]; 23 | CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; 24 | File txtFile = File.createTempFile("test-", "." + getFileExtension()); 25 | try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { 26 | if (header) { 27 | String headerValue = String.join("#", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, 28 | FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); 29 | writer.append(headerValue + "\n"); 30 | } 31 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 32 | String value = String.format("%d#%d#%d#%d#%f#%f#%s#%s#%s\n", 33 | (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, 34 | true, "test bytes", "test string"); 35 | writer.append(value); 36 | fsConfig.offsetsByIndex().put(index, (long) index); 37 | }); 38 | } 39 | Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); 40 | fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); 41 | return path; 42 | } 43 | 44 | @ParameterizedTest 45 | @MethodSource("fileSystemConfigProvider") 46 | public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws IOException { 47 | File tmp = File.createTempFile("test-", "." + getFileExtension()); 48 | try (FileWriter writer = new FileWriter(tmp)) { 49 | String headerValue = String.join(",", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, 50 | FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); 51 | writer.append(headerValue + "\n"); 52 | writer.append(",\"\",,,,,true,test bytes,test string\n"); 53 | writer.append("#comment\n"); 54 | writer.append(",\"\",,,,,true,test bytes,test string\n"); 55 | } 56 | Map readerConfig = getReaderConfig(); 57 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ","); 58 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 59 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE, "10"); 60 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_NULL_VALUE, "100"); 61 | 62 | Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 63 | fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); 64 | FileReader reader = getReader(fsConfig.getFs(), path, readerConfig); 65 | 66 | assertTrue(reader.hasNext()); 67 | 68 | int recordCount = 0; 69 | while (reader.hasNext()) { 70 | Struct record = reader.next(); 71 | assertAll( 72 | () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 100), 73 | () -> assertEquals(record.get(FIELD_COLUMN2), (short) 10), 74 | () -> assertEquals(record.get(FIELD_COLUMN3), 100), 75 | () -> assertEquals(record.get(FIELD_COLUMN4), 100L), 76 | () -> assertEquals(record.get(FIELD_COLUMN5), 100.00f), 77 | () -> assertEquals(record.get(FIELD_COLUMN6), 100.00d), 78 | () -> assertEquals(record.get(FIELD_COLUMN7), true), 79 | () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), 80 | () -> assertEquals(record.get(FIELD_COLUMN9), "test string") 81 | ); 82 | recordCount++; 83 | } 84 | assertEquals(2, recordCount, () -> "The number of records in the file does not match"); 85 | } 86 | 87 | @ParameterizedTest 88 | @MethodSource("fileSystemConfigProvider") 89 | public void 
readAllDataWithEmptyAndNullValueWithAllowNullsAndWithoutSchemaProvided(ReaderFsTestConfig fsConfig) throws IOException { 90 | File tmp = File.createTempFile("test-", "." + getFileExtension()); 91 | try (FileWriter writer = new FileWriter(tmp)) { 92 | String headerValue = String.join(",", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3); 93 | writer.append(headerValue + "\n"); 94 | writer.append("yes,\"\",\n"); 95 | writer.append("yes,cool,test"); 96 | } 97 | 98 | Map defaultReadConfig = getReaderConfig(); 99 | defaultReadConfig.remove(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA); 100 | Map readerConfig = defaultReadConfig; 101 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ","); 102 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 103 | readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS, "true"); 104 | 105 | Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 106 | fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); 107 | FileReader reader = getReader(fsConfig.getFs(), path, readerConfig); 108 | 109 | assertTrue(reader.hasNext()); 110 | 111 | Struct record = reader.next(); 112 | assertAll( 113 | () -> assertEquals(record.get(FIELD_COLUMN1), "yes"), 114 | () -> assertNull(record.get(FIELD_COLUMN2)), 115 | () -> assertNull(record.get(FIELD_COLUMN3)) 116 | ); 117 | 118 | assertTrue(reader.hasNext()); 119 | Struct record2 = reader.next(); 120 | assertAll( 121 | () -> assertEquals(record2.get(FIELD_COLUMN1), "yes"), 122 | () -> assertEquals(record2.get(FIELD_COLUMN2), "cool"), 123 | () -> assertEquals(record2.get(FIELD_COLUMN3), "test") 124 | ); 125 | 126 | assertFalse(reader.hasNext()); 127 | } 128 | 129 | @Override 130 | protected Map getReaderConfig() { 131 | return new HashMap() {{ 132 | put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, "#"); 133 | put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 134 | put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); 135 | }}; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.io.PrintWriter; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.stream.Collectors; 12 | import java.util.stream.IntStream; 13 | 14 | public class FixedWidthFileReaderTest extends UnivocityFileReaderTest { 15 | 16 | private static final int[] fieldLengths = new int[]{45, 53, 71, 89, 14, 44, 67, 46, 75}; 17 | 18 | @Override 19 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 20 | boolean header = args.length < 1 || (boolean) args[0]; 21 | CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; 22 | File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); 23 | try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { 24 | if (header) { 25 | writer.append(String.format("%-" + fieldLengths[0] + "s", FIELD_COLUMN1) + 26 | String.format("%-" + fieldLengths[1] + "s", FIELD_COLUMN2) + 27 | String.format("%-" + fieldLengths[2] + "s", FIELD_COLUMN3) + 28 | String.format("%-" + fieldLengths[3] + "s", FIELD_COLUMN4) + 29 | String.format("%-" + fieldLengths[4] + "s", FIELD_COLUMN5) + 30 | String.format("%-" + fieldLengths[5] + "s", FIELD_COLUMN6) + 31 | String.format("%-" + fieldLengths[6] + "s", FIELD_COLUMN7) + 32 | String.format("%-" + fieldLengths[7] + "s", FIELD_COLUMN8) + 33 | String.format("%-" + fieldLengths[8] + "s", FIELD_COLUMN9) + "\n"); 34 | } 35 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 36 | writer.append(String.format("%-" + fieldLengths[0] + "s", String.format("%d", (byte) 2)) + 37 | String.format("%-" + fieldLengths[1] + "s", String.format("%d", (short) 4)) + 38 | String.format("%-" + fieldLengths[2] + "s", String.format("%d", 8)) + 39 | String.format("%-" + fieldLengths[3] + "s", String.format("%d", 16L)) + 40 | String.format("%-" + fieldLengths[4] + "s", String.format("%f", 32.32f)) + 41 | String.format("%-" + fieldLengths[5] + "s", String.format("%f", 64.64d)) + 42 | String.format("%-" + fieldLengths[6] + "s", String.format("%s", true)) + 43 | String.format("%-" + fieldLengths[7] + "s", String.format("%s", "test bytes")) + 44 | String.format("%-" + fieldLengths[8] + "s", String.format("%s", "test string")) + "\n" 45 | ); 46 | fsConfig.offsetsByIndex().put(index, (long) index); 47 | }); 48 | } 49 | Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); 50 | fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); 51 | return path; 52 | } 53 | 54 | @Override 55 | protected Map getReaderConfig() { 56 | return new HashMap() {{ 57 | put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 58 | put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS, 59 | Arrays.stream(fieldLengths).mapToObj(String::valueOf).collect(Collectors.joining(","))); 60 | put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); 61 | }}; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import org.apache.kafka.connect.data.Struct; 5 | 6 | import java.math.BigInteger; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | import static org.junit.jupiter.api.Assertions.*; 11 | 12 | public class JsonFileReaderTest extends JacksonFileReaderTest { 13 | 14 | private static final String FILE_EXTENSION = "jsn"; 15 | 16 | @Override 17 | protected void checkData(Struct record, long index) { 18 | List array = record.getArray(FIELD_ARRAY_COMPLEX); 19 | Struct subrecord = record.getStruct(FIELD_STRUCT); 20 | assertAll( 21 | () -> assertEquals(index, (int) record.get(FIELD_INTEGER)), 22 | () -> assertEquals(new BigInteger("9999999999999999999").longValue(), record.get(FIELD_BIG_INTEGER)), 23 | () -> assertEquals(Long.MAX_VALUE, (long) record.get(FIELD_LONG)), 24 | () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index 
+ "_")), 25 | () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), 26 | () -> assertEquals(Double.parseDouble(index + "." + index), (Double) record.get(FIELD_DECIMAL), 0), 27 | () -> assertNull(record.get(FIELD_NULL)), 28 | () -> assertNotNull(record.schema().field(FIELD_NULL)), 29 | () -> assertEquals("dGVzdA==", record.get(FIELD_BINARY)), 30 | () -> assertEquals(Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]"), record.get(FIELD_ARRAY_SIMPLE)), 31 | 32 | () -> assertEquals(index, (int) array.get(0).get(FIELD_INTEGER)), 33 | () -> assertEquals(Long.MAX_VALUE, (long) array.get(0).get(FIELD_LONG)), 34 | () -> assertTrue(array.get(0).get(FIELD_STRING).toString().startsWith(index + "_")), 35 | () -> assertTrue(Boolean.parseBoolean(array.get(0).get(FIELD_BOOLEAN).toString())), 36 | () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(0).get(FIELD_DECIMAL), 0), 37 | () -> assertNull(array.get(0).get(FIELD_NULL)), 38 | () -> assertNotNull(array.get(0).schema().field(FIELD_NULL)), 39 | () -> assertEquals(index + 1, (int) array.get(1).get(FIELD_INTEGER)), 40 | () -> assertEquals(Long.MAX_VALUE, (long) array.get(1).get(FIELD_LONG)), 41 | () -> assertTrue(array.get(1).get(FIELD_STRING).toString().startsWith(index + "_")), 42 | () -> assertTrue(Boolean.parseBoolean(array.get(1).get(FIELD_BOOLEAN).toString())), 43 | () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(1).get(FIELD_DECIMAL), 0), 44 | () -> assertNull(array.get(1).get(FIELD_NULL)), 45 | () -> assertNotNull(array.get(1).schema().field(FIELD_NULL)), 46 | 47 | () -> assertEquals(index, (int) subrecord.get(FIELD_INTEGER)), 48 | () -> assertEquals(Long.MAX_VALUE, (long) subrecord.get(FIELD_LONG)), 49 | () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), 50 | () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), 51 | () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) subrecord.get(FIELD_DECIMAL), 0), 52 | () -> assertNull(subrecord.get(FIELD_NULL)), 53 | () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) 54 | ); 55 | } 56 | 57 | @Override 58 | protected Class getReaderClass() { 59 | return JsonFileReader.class; 60 | } 61 | 62 | @Override 63 | protected String getFileExtension() { 64 | return FILE_EXTENSION; 65 | } 66 | 67 | @Override 68 | protected String readerEncodingConfig() { 69 | return JsonFileReader.FILE_READER_JSON_ENCODING; 70 | } 71 | 72 | @Override 73 | protected String recordPerLineConfig() { 74 | return JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE; 75 | } 76 | 77 | @Override 78 | protected String compressionTypeConfig() { 79 | return JsonFileReader.FILE_READER_JSON_COMPRESSION_TYPE; 80 | } 81 | 82 | @Override 83 | protected String compressionConcatenatedConfig() { 84 | return JsonFileReader.FILE_READER_JSON_COMPRESSION_CONCATENATED; 85 | } 86 | 87 | @Override 88 | protected String deserializationConfigPrefix() { 89 | return JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS; 90 | } 91 | 92 | @Override 93 | protected ObjectMapper getObjectMapper() { 94 | return new ObjectMapper(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; 4 | import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; 5 | import com.github.mmolimar.kafka.connect.fs.FsTestConfig; 6 | import org.apache.hadoop.fs.Path; 7 | 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | interface ReaderFsTestConfig extends FsTestConfig { 12 | 13 | void setDataFile(Path dataFile); 14 | 15 | Path getDataFile(); 16 | 17 | void setReader(FileReader reader); 18 | 19 | FileReader getReader(); 20 | 21 | Map offsetsByIndex(); 22 | 23 | } 24 | 25 | class LocalFsConfig extends AbstractLocalFsConfig implements ReaderFsTestConfig { 26 | private Path dataFile; 27 | private FileReader reader; 28 | private Map offsetsByIndex; 29 | 30 | @Override 31 | public void init() { 32 | offsetsByIndex = new HashMap<>(); 33 | } 34 | 35 | @Override 36 | public void setDataFile(Path dataFile) { 37 | this.dataFile = dataFile; 38 | } 39 | 40 | @Override 41 | public Path getDataFile() { 42 | return dataFile; 43 | } 44 | 45 | @Override 46 | public void setReader(FileReader reader) { 47 | this.reader = reader; 48 | } 49 | 50 | @Override 51 | public FileReader getReader() { 52 | return reader; 53 | } 54 | 55 | @Override 56 | public Map offsetsByIndex() { 57 | return offsetsByIndex; 58 | } 59 | 60 | } 61 | 62 | class HdfsFsConfig extends AbstractHdfsFsConfig implements ReaderFsTestConfig { 63 | private Path dataFile; 64 | private FileReader reader; 65 | private Map offsetsByIndex; 66 | 67 | @Override 68 | public void init() { 69 | offsetsByIndex = new HashMap<>(); 70 | } 71 | 72 | @Override 73 | public Path getDataFile() { 74 | return dataFile; 75 | } 76 | 77 | @Override 78 | public void setDataFile(Path dataFile) { 79 | this.dataFile = dataFile; 80 | } 81 | 82 | @Override 83 | public void setReader(FileReader reader) { 84 | this.reader = reader; 85 | } 86 | 87 | @Override 88 | public FileReader getReader() { 89 | return reader; 90 | } 91 | 92 | @Override 93 | public Map offsetsByIndex() { 94 | return 
offsetsByIndex; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.util.ReflectionUtils; 7 | import org.apache.kafka.connect.data.SchemaBuilder; 8 | import org.apache.kafka.connect.data.Struct; 9 | import org.junit.jupiter.params.ParameterizedTest; 10 | import org.junit.jupiter.params.provider.MethodSource; 11 | 12 | import java.io.DataInput; 13 | import java.io.DataOutput; 14 | import java.io.File; 15 | import java.io.IOException; 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | import java.util.UUID; 19 | import java.util.stream.IntStream; 20 | 21 | import static org.junit.jupiter.api.Assertions.*; 22 | 23 | public class SequenceFileReaderTest extends FileReaderTestBase { 24 | 25 | private static final String FIELD_NAME_KEY = "custom_field_key"; 26 | private static final String FIELD_NAME_VALUE = "custom_field_name"; 27 | private static final String FILE_EXTENSION = "sq"; 28 | 29 | @Override 30 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 31 | FileSystem fs = fsConfig.getFs(); 32 | File seqFile = File.createTempFile("test-", "." + getFileExtension()); 33 | try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), 34 | SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), 35 | SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { 36 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 37 | Writable key = new IntWritable(index); 38 | Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); 39 | try { 40 | writer.append(key, value); 41 | writer.sync(); 42 | fsConfig.offsetsByIndex().put(index, (long) index); 43 | } catch (IOException ioe) { 44 | throw new RuntimeException(ioe); 45 | } 46 | }); 47 | } 48 | Path path = new Path(new Path(fsConfig.getFsUri()), seqFile.getName()); 49 | fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); 50 | return path; 51 | } 52 | 53 | @ParameterizedTest 54 | @MethodSource("fileSystemConfigProvider") 55 | public void defaultFieldNames(ReaderFsTestConfig fsConfig) { 56 | Map readerConfig = getReaderConfig(); 57 | readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, null); 58 | readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, null); 59 | FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); 60 | assertEquals(reader.getFilePath(), fsConfig.getDataFile()); 61 | assertTrue(reader.hasNext()); 62 | 63 | int recordCount = 0; 64 | while (reader.hasNext()) { 65 | Struct record = reader.next(); 66 | checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, 67 | record, recordCount); 68 | recordCount++; 69 | } 70 | assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); 71 | } 72 | 73 | @ParameterizedTest 74 | @MethodSource("fileSystemConfigProvider") 75 | public void schemaMapper(ReaderFsTestConfig fsConfig) { 76 | SequenceFileReader reader = (SequenceFileReader) fsConfig.getReader(); 77 | 78 | ByteWritable byteWritable = 
new ByteWritable((byte) 1); 79 | ShortWritable shortWritable = new ShortWritable((short) 123); 80 | IntWritable intWritable = new IntWritable(123); 81 | LongWritable longWritable = new LongWritable(123L); 82 | FloatWritable floatWritable = new FloatWritable(0.123F); 83 | DoubleWritable doubleWritable = new DoubleWritable(0.123D); 84 | BytesWritable bytesWritable = new BytesWritable(new byte[]{1, 2, 3}); 85 | BooleanWritable booleanWritable = new BooleanWritable(true); 86 | Text textWritable = new Text("123"); 87 | 88 | assertEquals(SchemaBuilder.INT8_SCHEMA, reader.getSchema(byteWritable)); 89 | assertEquals(SchemaBuilder.INT16_SCHEMA, reader.getSchema(shortWritable)); 90 | assertEquals(SchemaBuilder.INT32_SCHEMA, reader.getSchema(intWritable)); 91 | assertEquals(SchemaBuilder.INT64_SCHEMA, reader.getSchema(longWritable)); 92 | assertEquals(SchemaBuilder.FLOAT32_SCHEMA, reader.getSchema(floatWritable)); 93 | assertEquals(SchemaBuilder.FLOAT64_SCHEMA, reader.getSchema(doubleWritable)); 94 | assertEquals(SchemaBuilder.BYTES_SCHEMA, reader.getSchema(bytesWritable)); 95 | assertEquals(SchemaBuilder.BOOLEAN_SCHEMA, reader.getSchema(booleanWritable)); 96 | assertEquals(SchemaBuilder.STRING_SCHEMA, reader.getSchema(textWritable)); 97 | assertEquals(SchemaBuilder.STRING_SCHEMA, reader.getSchema(new Writable() { 98 | 99 | @Override 100 | public void write(DataOutput out) { 101 | 102 | } 103 | 104 | @Override 105 | public void readFields(DataInput in) { 106 | 107 | } 108 | })); 109 | 110 | SequenceFileReader.SeqToStruct seqToStruct = new SequenceFileReader.SeqToStruct(); 111 | assertEquals(seqToStruct.toSchemaValue(byteWritable), byteWritable.get()); 112 | assertEquals(seqToStruct.toSchemaValue(shortWritable), shortWritable.get()); 113 | assertEquals(seqToStruct.toSchemaValue(intWritable), intWritable.get()); 114 | assertEquals(seqToStruct.toSchemaValue(longWritable), longWritable.get()); 115 | assertEquals(seqToStruct.toSchemaValue(floatWritable), floatWritable.get()); 116 | assertEquals(seqToStruct.toSchemaValue(doubleWritable), doubleWritable.get()); 117 | assertEquals(seqToStruct.toSchemaValue(bytesWritable), bytesWritable.getBytes()); 118 | assertEquals(seqToStruct.toSchemaValue(booleanWritable), booleanWritable.get()); 119 | assertEquals(seqToStruct.toSchemaValue(textWritable), textWritable.toString()); 120 | } 121 | 122 | @Override 123 | protected Class getReaderClass() { 124 | return SequenceFileReader.class; 125 | } 126 | 127 | @Override 128 | protected Map getReaderConfig() { 129 | return new HashMap() {{ 130 | put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); 131 | put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 132 | }}; 133 | } 134 | 135 | @Override 136 | protected void checkData(Struct record, long index) { 137 | checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); 138 | } 139 | 140 | private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { 141 | assertAll( 142 | () -> assertEquals(index, (int) record.get(keyFieldName)), 143 | () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) 144 | ); 145 | } 146 | 147 | @Override 148 | protected String getFileExtension() { 149 | return FILE_EXTENSION; 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java: 
-------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.kafka.connect.data.Struct; 5 | import org.apache.kafka.connect.errors.ConnectException; 6 | import org.junit.jupiter.params.ParameterizedTest; 7 | import org.junit.jupiter.params.provider.MethodSource; 8 | 9 | import java.io.File; 10 | import java.io.IOException; 11 | import java.io.PrintWriter; 12 | import java.nio.charset.UnsupportedCharsetException; 13 | import java.util.Arrays; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | import java.util.UUID; 17 | import java.util.stream.IntStream; 18 | 19 | import static org.junit.jupiter.api.Assertions.*; 20 | 21 | public class TextFileReaderTest extends FileReaderTestBase { 22 | 23 | private static final String FIELD_NAME_VALUE = "custom_field_name"; 24 | private static final String FILE_EXTENSION = "txt"; 25 | private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.GZIP; 26 | 27 | @Override 28 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 29 | CompressionType compression = args.length < 1 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[0]; 30 | File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); 31 | try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { 32 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 33 | String value = String.format("%d_%s", index, UUID.randomUUID()); 34 | writer.append(value + "\n"); 35 | fsConfig.offsetsByIndex().put(index, (long) index); 36 | }); 37 | } 38 | Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); 39 | fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); 40 | return path; 41 | } 42 | 43 | @ParameterizedTest 44 | @MethodSource("fileSystemConfigProvider") 45 | public void validFileEncoding(ReaderFsTestConfig fsConfig) { 46 | Map readerConfig = getReaderConfig(); 47 | readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 48 | readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); 49 | readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); 50 | FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); 51 | fsConfig.setReader(reader); 52 | readAllData(fsConfig); 53 | } 54 | 55 | @ParameterizedTest 56 | @MethodSource("fileSystemConfigProvider") 57 | public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { 58 | Map readerConfig = getReaderConfig(); 59 | readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 60 | readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); 61 | readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); 62 | assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); 63 | assertThrows(UnsupportedCharsetException.class, () -> { 64 | try { 65 | getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); 66 | } catch (Exception e) { 67 | throw e.getCause(); 68 | } 69 | }); 70 | } 71 | 72 | @ParameterizedTest 73 | @MethodSource("fileSystemConfigProvider") 74 | public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws IOException { 75 | Path file = createDataFile(fsConfig, COMPRESSION_TYPE_DEFAULT); 76 | Map 
readerConfig = getReaderConfig(); 77 | readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 78 | readerConfig.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); 79 | readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); 80 | FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); 81 | 82 | assertTrue(reader.hasNext()); 83 | 84 | int recordCount = 0; 85 | while (reader.hasNext()) { 86 | Struct record = reader.next(); 87 | checkData(record, recordCount); 88 | recordCount++; 89 | } 90 | reader.close(); 91 | assertEquals(1, recordCount, () -> "The number of records in the file does not match"); 92 | } 93 | 94 | @ParameterizedTest 95 | @MethodSource("fileSystemConfigProvider") 96 | public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { 97 | Arrays.stream(CompressionType.values()).forEach(compressionType -> { 98 | try { 99 | Path file = createDataFile(fsConfig, compressionType); 100 | Map readerConfig = getReaderConfig(); 101 | readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 102 | readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); 103 | FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); 104 | 105 | assertTrue(reader.hasNext()); 106 | 107 | int recordCount = 0; 108 | while (reader.hasNext()) { 109 | Struct record = reader.next(); 110 | checkData(record, recordCount); 111 | recordCount++; 112 | } 113 | reader.close(); 114 | assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); 115 | } catch (Exception e) { 116 | throw new RuntimeException(e); 117 | } 118 | }); 119 | } 120 | 121 | @Override 122 | protected Class getReaderClass() { 123 | return TextFileReader.class; 124 | } 125 | 126 | @Override 127 | protected Map getReaderConfig() { 128 | return new HashMap() {{ 129 | put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); 130 | put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); 131 | put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); 132 | }}; 133 | } 134 | 135 | @Override 136 | protected void checkData(Struct record, long index) { 137 | assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); 138 | } 139 | 140 | @Override 141 | protected String getFileExtension() { 142 | return FILE_EXTENSION; 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.kafka.connect.data.Struct; 5 | import org.junit.jupiter.params.ParameterizedTest; 6 | import org.junit.jupiter.params.provider.MethodSource; 7 | 8 | import java.io.File; 9 | import java.io.FileWriter; 10 | import java.io.IOException; 11 | import java.io.PrintWriter; 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | import java.util.stream.IntStream; 15 | 16 | import static org.junit.jupiter.api.Assertions.*; 17 | 18 | public class TsvFileReaderTest extends UnivocityFileReaderTest { 19 | 20 | @Override 21 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throws IOException { 22 | boolean header = args.length < 1 || (boolean) args[0]; 23 | CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; 24 | File txtFile = File.createTempFile("test-", "." + getFileExtension()); 25 | try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { 26 | if (header) { 27 | String headerValue = String.join("\t", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, 28 | FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); 29 | writer.append(headerValue + "\n"); 30 | } 31 | IntStream.range(0, NUM_RECORDS).forEach(index -> { 32 | String value = String.format("%d\t%d\t%d\t%d\t%f\t%f\t%s\t%s\t%s\n", 33 | (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, 34 | true, "test bytes", "test string"); 35 | writer.append(value); 36 | fsConfig.offsetsByIndex().put(index, (long) index); 37 | }); 38 | } 39 | Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); 40 | fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); 41 | return path; 42 | } 43 | 44 | @ParameterizedTest 45 | @MethodSource("fileSystemConfigProvider") 46 | public void readAllDataWithEmptyAndNullValueWithAllowNullsAndWithoutSchemaProvided(ReaderFsTestConfig fsConfig) throws IOException { 47 | File tmp = File.createTempFile("test-", "." + getFileExtension()); 48 | try (FileWriter writer = new FileWriter(tmp)) { 49 | String headerValue = String.join("\t", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3); 50 | writer.append(headerValue + "\n"); 51 | writer.append("yes\t\"\"\t\n"); 52 | writer.append("yes\tcool\ttest"); 53 | } 54 | 55 | Map defaultReadConfig = getReaderConfig(); 56 | defaultReadConfig.remove(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA); 57 | Map readerConfig = defaultReadConfig; 58 | readerConfig.put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 59 | readerConfig.put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS, "true"); 60 | 61 | Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 62 | fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); 63 | FileReader reader = getReader(fsConfig.getFs(), path, readerConfig); 64 | 65 | assertTrue(reader.hasNext()); 66 | 67 | Struct record = reader.next(); 68 | assertAll( 69 | () -> assertEquals(record.get(FIELD_COLUMN1), "yes"), 70 | () -> assertNull(record.get(FIELD_COLUMN2)), 71 | () -> assertNull(record.get(FIELD_COLUMN3)) 72 | ); 73 | 74 | assertTrue(reader.hasNext()); 75 | Struct record2 = reader.next(); 76 | assertAll( 77 | () -> assertEquals(record2.get(FIELD_COLUMN1), "yes"), 78 | () -> assertEquals(record2.get(FIELD_COLUMN2), "cool"), 79 | () -> assertEquals(record2.get(FIELD_COLUMN3), "test") 80 | ); 81 | 82 | assertFalse(reader.hasNext()); 83 | } 84 | 85 | 86 | @Override 87 | protected Map getReaderConfig() { 88 | return new HashMap() {{ 89 | put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); 90 | put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); 91 | }}; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/XmlFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import 
com.fasterxml.jackson.dataformat.xml.XmlMapper; 5 | import org.apache.kafka.connect.data.Struct; 6 | 7 | import static org.junit.jupiter.api.Assertions.*; 8 | 9 | public class XmlFileReaderTest extends JacksonFileReaderTest { 10 | 11 | private static final String FILE_EXTENSION = "xl"; 12 | 13 | @Override 14 | protected void checkData(Struct record, long index) { 15 | Struct array = record.getStruct(FIELD_ARRAY_COMPLEX); 16 | Struct subrecord = record.getStruct(FIELD_STRUCT); 17 | assertAll( 18 | () -> assertEquals(index, Integer.parseInt(record.getString(FIELD_INTEGER))), 19 | () -> assertEquals("9999999999999999999", record.get(FIELD_BIG_INTEGER)), 20 | () -> assertEquals(Long.MAX_VALUE, Long.parseLong(record.getString(FIELD_LONG))), 21 | () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), 22 | () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), 23 | () -> assertEquals(Double.parseDouble(index + "." + index), Double.parseDouble(record.getString(FIELD_DECIMAL))), 24 | () -> assertNull(record.get(FIELD_NULL)), 25 | () -> assertNotNull(record.schema().field(FIELD_NULL)), 26 | () -> assertEquals("dGVzdA==", record.get(FIELD_BINARY)), 27 | () -> assertEquals("elm[" + (index + 1) + "]", record.get(FIELD_ARRAY_SIMPLE)), 28 | 29 | () -> assertEquals(index + 1, Integer.parseInt(array.getString(FIELD_INTEGER))), 30 | () -> assertEquals(Long.MAX_VALUE, Long.parseLong(array.getString(FIELD_LONG))), 31 | () -> assertTrue(array.get(FIELD_STRING).toString().startsWith(index + "_")), 32 | () -> assertTrue(Boolean.parseBoolean(array.get(FIELD_BOOLEAN).toString())), 33 | () -> assertEquals(Double.parseDouble(index + "." + index), Double.parseDouble(array.getString(FIELD_DECIMAL))), 34 | () -> assertNull(array.get(FIELD_NULL)), 35 | () -> assertNotNull(array.schema().field(FIELD_NULL)), 36 | () -> assertEquals(index + 1, Integer.parseInt(array.getString(FIELD_INTEGER))), 37 | () -> assertEquals(Long.MAX_VALUE, Long.parseLong(array.getString(FIELD_LONG))), 38 | () -> assertTrue(array.get(FIELD_STRING).toString().startsWith(index + "_")), 39 | () -> assertTrue(Boolean.parseBoolean(array.get(FIELD_BOOLEAN).toString())), 40 | () -> assertEquals(Double.parseDouble(index + "." + index), Double.parseDouble(array.getString(FIELD_DECIMAL))), 41 | () -> assertNull(array.get(FIELD_NULL)), 42 | () -> assertNotNull(array.schema().field(FIELD_NULL)), 43 | 44 | () -> assertEquals(index, Integer.parseInt(subrecord.getString(FIELD_INTEGER))), 45 | () -> assertEquals(Long.MAX_VALUE, Long.parseLong(subrecord.getString(FIELD_LONG))), 46 | () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), 47 | () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), 48 | () -> assertEquals(Double.parseDouble(index + "." 
+ index), Double.parseDouble(subrecord.getString(FIELD_DECIMAL))), 49 | () -> assertNull(subrecord.get(FIELD_NULL)), 50 | () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) 51 | ); 52 | } 53 | 54 | @Override 55 | protected Class getReaderClass() { 56 | return XmlFileReader.class; 57 | } 58 | 59 | @Override 60 | protected String getFileExtension() { 61 | return FILE_EXTENSION; 62 | } 63 | 64 | @Override 65 | protected String readerEncodingConfig() { 66 | return XmlFileReader.FILE_READER_XML_ENCODING; 67 | } 68 | 69 | @Override 70 | protected String recordPerLineConfig() { 71 | return XmlFileReader.FILE_READER_XML_RECORD_PER_LINE; 72 | } 73 | 74 | @Override 75 | protected String compressionTypeConfig() { 76 | return XmlFileReader.FILE_READER_XML_COMPRESSION_TYPE; 77 | } 78 | 79 | @Override 80 | protected String compressionConcatenatedConfig() { 81 | return XmlFileReader.FILE_READER_XML_COMPRESSION_CONCATENATED; 82 | } 83 | 84 | @Override 85 | protected String deserializationConfigPrefix() { 86 | return XmlFileReader.FILE_READER_XML_DESERIALIZATION_CONFIGS; 87 | } 88 | 89 | @Override 90 | protected ObjectMapper getObjectMapper() { 91 | return new XmlMapper(); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/YamlFileReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.file.reader; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.kafka.connect.data.Struct; 7 | import org.junit.jupiter.api.Disabled; 8 | import org.junit.jupiter.params.ParameterizedTest; 9 | import org.junit.jupiter.params.provider.MethodSource; 10 | 11 | import java.io.BufferedWriter; 12 | import java.io.File; 13 | import java.io.FileWriter; 14 | import java.io.IOException; 15 | import java.math.BigInteger; 16 | import java.util.Arrays; 17 | import java.util.List; 18 | import java.util.Map; 19 | 20 | import static org.junit.jupiter.api.Assertions.*; 21 | 22 | public class YamlFileReaderTest extends JacksonFileReaderTest { 23 | 24 | private static final String FILE_EXTENSION = "yl"; 25 | 26 | protected static final int NUM_RECORDS = 1; 27 | 28 | @Override 29 | protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { 30 | CompressionType compression = args.length < 3 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[2]; 31 | return super.createDataFile(fsConfig, 1, false, compression); 32 | } 33 | 34 | @ParameterizedTest 35 | @MethodSource("fileSystemConfigProvider") 36 | public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { 37 | File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); 38 | try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { 39 | writer.write("test"); 40 | } 41 | Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 42 | fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); 43 | getReader(fsConfig.getFs(), path, getReaderConfig()); 44 | } 45 | 46 | @ParameterizedTest 47 | @MethodSource("fileSystemConfigProvider") 48 | public void readAllData(ReaderFsTestConfig fsConfig) { 49 | FileReader reader = fsConfig.getReader(); 50 | assertTrue(reader.hasNext()); 51 | 52 | int recordCount = 0; 53 | while (reader.hasNext()) { 54 | Struct record = reader.next(); 55 | checkData(record, recordCount); 56 | recordCount++; 57 | } 58 | assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); 59 | } 60 | 61 | @ParameterizedTest 62 | @MethodSource("fileSystemConfigProvider") 63 | @Disabled 64 | public void seekFile(ReaderFsTestConfig fsConfig) { 65 | } 66 | 67 | @ParameterizedTest 68 | @MethodSource("fileSystemConfigProvider") 69 | @Disabled 70 | public void exceededSeek(ReaderFsTestConfig fsConfig) { 71 | 72 | } 73 | 74 | @ParameterizedTest 75 | @MethodSource("fileSystemConfigProvider") 76 | @Disabled 77 | public void readAllDataInBatches(ReaderFsTestConfig fsConfig) { 78 | 79 | } 80 | 81 | @ParameterizedTest 82 | @MethodSource("fileSystemConfigProvider") 83 | public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { 84 | Arrays.stream(CompressionType.values()).forEach(compressionType -> { 85 | try { 86 | Path file = createDataFile(fsConfig, NUM_RECORDS, true, compressionType); 87 | Map readerConfig = getReaderConfig(); 88 | readerConfig.put(compressionTypeConfig(), compressionType.toString()); 89 | readerConfig.put(compressionConcatenatedConfig(), "true"); 90 | FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); 91 | 92 | assertTrue(reader.hasNext()); 93 | 94 | int recordCount = 0; 95 | while (reader.hasNext()) { 96 | Struct record = reader.next(); 97 | checkData(record, recordCount); 98 | recordCount++; 99 | } 100 | reader.close(); 101 | assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); 102 | } catch (Exception e) { 103 | throw new RuntimeException(e); 104 | } 105 | }); 106 | } 107 | 108 | @Override 109 | protected void checkData(Struct record, long index) { 110 | List array = record.getArray(FIELD_ARRAY_COMPLEX); 111 | Struct subrecord = record.getStruct(FIELD_STRUCT); 112 | assertAll( 113 | () -> assertEquals(index, (int) record.get(FIELD_INTEGER)), 114 | () -> assertEquals(new BigInteger("9999999999999999999").longValue(), record.get(FIELD_BIG_INTEGER)), 115 | () -> assertEquals(Long.MAX_VALUE, (long) record.get(FIELD_LONG)), 116 | () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), 117 | () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), 118 | () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) record.get(FIELD_DECIMAL), 0), 119 | () -> assertNull(record.get(FIELD_NULL)), 120 | () -> assertNotNull(record.schema().field(FIELD_NULL)), 121 | () -> assertEquals("test", new String((byte[]) record.get(FIELD_BINARY))), 122 | () -> assertEquals(Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]"), record.get(FIELD_ARRAY_SIMPLE)), 123 | 124 | () -> assertEquals(index, (int) array.get(0).get(FIELD_INTEGER)), 125 | () -> assertEquals(Long.MAX_VALUE, (long) array.get(0).get(FIELD_LONG)), 126 | () -> assertTrue(array.get(0).get(FIELD_STRING).toString().startsWith(index + "_")), 127 | () -> assertTrue(Boolean.parseBoolean(array.get(0).get(FIELD_BOOLEAN).toString())), 128 | () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(0).get(FIELD_DECIMAL), 0), 129 | () -> assertNull(array.get(0).get(FIELD_NULL)), 130 | () -> assertNotNull(array.get(0).schema().field(FIELD_NULL)), 131 | () -> assertEquals(index + 1, (int) array.get(1).get(FIELD_INTEGER)), 132 | () -> assertEquals(Long.MAX_VALUE, (long) array.get(1).get(FIELD_LONG)), 133 | () -> assertTrue(array.get(1).get(FIELD_STRING).toString().startsWith(index + "_")), 134 | () -> assertTrue(Boolean.parseBoolean(array.get(1).get(FIELD_BOOLEAN).toString())), 135 | () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(1).get(FIELD_DECIMAL), 0), 136 | () -> assertNull(array.get(1).get(FIELD_NULL)), 137 | () -> assertNotNull(array.get(1).schema().field(FIELD_NULL)), 138 | 139 | () -> assertEquals(index, (int) subrecord.get(FIELD_INTEGER)), 140 | () -> assertEquals(Long.MAX_VALUE, (long) subrecord.get(FIELD_LONG)), 141 | () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), 142 | () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), 143 | () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) subrecord.get(FIELD_DECIMAL), 0), 144 | () -> assertNull(subrecord.get(FIELD_NULL)), 145 | () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) 146 | ); 147 | } 148 | 149 | @Override 150 | protected Class<? extends FileReader> getReaderClass() { 151 | return YamlFileReader.class; 152 | } 153 | 154 | @Override 155 | protected String getFileExtension() { 156 | return FILE_EXTENSION; 157 | } 158 | 159 | @Override 160 | protected String readerEncodingConfig() { 161 | return YamlFileReader.FILE_READER_YAML_ENCODING; 162 | } 163 | 164 | @Override 165 | protected String recordPerLineConfig() { 166 | return "UNKNOWN"; 167 | } 168 | 169 | @Override 170 | protected String compressionTypeConfig() { 171 | return YamlFileReader.FILE_READER_YAML_COMPRESSION_TYPE; 172 | } 173 | 174 | @Override 175 | protected String compressionConcatenatedConfig() { 176 | return YamlFileReader.FILE_READER_YAML_COMPRESSION_CONCATENATED; 177 | } 178 | 179 | @Override 180 | protected String deserializationConfigPrefix() { 181 | return YamlFileReader.FILE_READER_YAML_DESERIALIZATION_CONFIGS; 182 | } 183 | 184 | @Override 185 | protected ObjectMapper getObjectMapper() { 186 | return new YAMLMapper(); 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; 5 | import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.kafka.common.config.ConfigException; 8 | import org.apache.kafka.connect.errors.ConnectException; 9 | import org.apache.kafka.connect.errors.IllegalWorkerStateException; 10 | import org.junit.jupiter.params.ParameterizedTest; 11 | import org.junit.jupiter.params.provider.MethodSource; 12 | 13 | import java.io.IOException; 14 | import java.time.LocalDateTime; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | 19 | import static org.junit.jupiter.api.Assertions.*; 20 | 21 | public class CronPolicyTest extends PolicyTestBase { 22 | 23 | @Override 24 | protected FsSourceTaskConfig buildSourceTaskConfig(List<Path> directories) { 25 | Map<String, String> cfg = new HashMap<String, String>() {{ 26 | String[] uris = directories.stream().map(Path::toString) 27 | .toArray(String[]::new); 28 | put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); 29 | put(FsSourceTaskConfig.TOPIC, "topic_test"); 30 | put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); 31 | put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); 32 | put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); 33 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); 34 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); 35 | put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); 36 | put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); 37 | }}; 38 | return new FsSourceTaskConfig(cfg); 39 | } 40 | 41 | @ParameterizedTest 42 | @MethodSource("fileSystemConfigProvider") 43 | @Override 44 | public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { 45 | fsConfig.getPolicy().execute(); 46 | fsConfig.getPolicy().interrupt(); 47 |
assertTrue(fsConfig.getPolicy().hasEnded()); 48 | assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); 49 | } 50 | 51 | @ParameterizedTest 52 | @MethodSource("fileSystemConfigProvider") 53 | @SuppressWarnings("unchecked") 54 | public void invalidCronExpression(PolicyFsTestConfig fsConfig) { 55 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 56 | originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); 57 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 58 | assertThrows(ConnectException.class, () -> 59 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 60 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 61 | assertThrows(ConfigException.class, () -> { 62 | try { 63 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 64 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 65 | } catch (Exception e) { 66 | throw e.getCause(); 67 | } 68 | }); 69 | } 70 | 71 | @ParameterizedTest 72 | @MethodSource("fileSystemConfigProvider") 73 | @SuppressWarnings("unchecked") 74 | public void invalidEndDate(PolicyFsTestConfig fsConfig) { 75 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 76 | originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); 77 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 78 | assertThrows(ConnectException.class, () -> 79 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 80 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 81 | assertThrows(ConfigException.class, () -> { 82 | try { 83 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 84 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 85 | } catch (Exception e) { 86 | throw e.getCause(); 87 | } 88 | }); 89 | } 90 | 91 | @ParameterizedTest 92 | @MethodSource("fileSystemConfigProvider") 93 | @SuppressWarnings("unchecked") 94 | public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws IOException { 95 | try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 96 | .getClass(FsSourceTaskConfig.POLICY_CLASS), 97 | fsConfig.getSourceTaskConfig())) { 98 | 99 | for (int i = 0; i < 5; i++) { 100 | assertFalse(policy.hasEnded()); 101 | policy.execute(); 102 | } 103 | policy.interrupt(); 104 | assertTrue(policy.hasEnded()); 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; 5 | import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.kafka.common.config.ConfigException; 8 | import org.apache.kafka.connect.errors.ConnectException; 9 | import org.apache.kafka.connect.errors.IllegalWorkerStateException; 10 | import org.junit.jupiter.api.BeforeAll; 11 | import org.junit.jupiter.params.ParameterizedTest; 12 | import org.junit.jupiter.params.provider.MethodSource; 13 | 14 | import java.io.IOException; 15 | import java.util.Collections; 16 | import java.util.HashMap; 17 | import java.util.List; 18 | import java.util.Map; 19 | 20 | import static org.junit.jupiter.api.Assertions.*; 21 | 22 | public class HdfsFileWatcherPolicyTest 
extends PolicyTestBase { 23 | 24 | static { 25 | TEST_FILE_SYSTEMS = Collections.singletonList( 26 | new HdfsFsConfig() 27 | ); 28 | } 29 | 30 | @BeforeAll 31 | public static void initFs() throws IOException { 32 | for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { 33 | fsConfig.initFs(); 34 | } 35 | } 36 | 37 | @Override 38 | protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { 39 | Map cfg = new HashMap() {{ 40 | String[] uris = directories.stream().map(Path::toString) 41 | .toArray(String[]::new); 42 | put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); 43 | put(FsSourceTaskConfig.TOPIC, "topic_test"); 44 | put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); 45 | put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); 46 | put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); 47 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); 48 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); 49 | }}; 50 | return new FsSourceTaskConfig(cfg); 51 | } 52 | 53 | // This policy does not throw any exception. Just stop watching those nonexistent dirs 54 | @ParameterizedTest 55 | @MethodSource("fileSystemConfigProvider") 56 | @Override 57 | public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { 58 | for (Path dir : fsConfig.getDirectories()) { 59 | fsConfig.getFs().delete(dir, true); 60 | } 61 | try { 62 | fsConfig.getPolicy().execute(); 63 | } finally { 64 | for (Path dir : fsConfig.getDirectories()) { 65 | fsConfig.getFs().mkdirs(dir); 66 | } 67 | } 68 | } 69 | 70 | // This policy never ends. We have to interrupt it 71 | @ParameterizedTest 72 | @MethodSource("fileSystemConfigProvider") 73 | @Override 74 | public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { 75 | fsConfig.getPolicy().execute(); 76 | assertFalse(fsConfig.getPolicy().hasEnded()); 77 | fsConfig.getPolicy().interrupt(); 78 | assertTrue(fsConfig.getPolicy().hasEnded()); 79 | assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); 80 | } 81 | 82 | @ParameterizedTest 83 | @MethodSource("fileSystemConfigProvider") 84 | @SuppressWarnings("unchecked") 85 | public void notReachableFileSystem(PolicyFsTestConfig fsConfig) throws InterruptedException, IOException { 86 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 87 | originals.put(FsSourceTaskConfig.FS_URIS, "hdfs://localhost:65432/data"); 88 | originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "0"); 89 | originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "0"); 90 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 91 | try(Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 92 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)) { 93 | int count = 0; 94 | while (!policy.hasEnded() && count < 10) { 95 | Thread.sleep(500); 96 | count++; 97 | } 98 | assertTrue(count < 10); 99 | assertTrue(policy.hasEnded()); 100 | } 101 | } 102 | 103 | @ParameterizedTest 104 | @MethodSource("fileSystemConfigProvider") 105 | @SuppressWarnings("unchecked") 106 | public void invalidPollTime(PolicyFsTestConfig fsConfig) { 107 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 108 | originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "invalid"); 109 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 110 | assertThrows(ConnectException.class, () -> 111 | 
ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 112 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 113 | assertThrows(ConfigException.class, () -> { 114 | try { 115 | ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 116 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 117 | } catch (Exception e) { 118 | throw e.getCause(); 119 | } 120 | }); 121 | } 122 | 123 | @ParameterizedTest 124 | @MethodSource("fileSystemConfigProvider") 125 | @SuppressWarnings("unchecked") 126 | public void invalidRetryTime(PolicyFsTestConfig fsConfig) { 127 | Map<String, String> originals = fsConfig.getSourceTaskConfig().originalsStrings(); 128 | originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "invalid"); 129 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 130 | assertThrows(ConnectException.class, () -> 131 | ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 132 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 133 | assertThrows(ConfigException.class, () -> { 134 | try { 135 | ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 136 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 137 | } catch (Exception e) { 138 | throw e.getCause(); 139 | } 140 | }); 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; 4 | import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; 5 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 6 | import com.github.mmolimar.kafka.connect.fs.FsTestConfig; 7 | import org.apache.hadoop.fs.Path; 8 | 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.UUID; 13 | 14 | interface PolicyFsTestConfig extends FsTestConfig { 15 | 16 | Policy getPolicy(); 17 | 18 | void setPolicy(Policy policy); 19 | 20 | FsSourceTaskConfig getSourceTaskConfig(); 21 | 22 | void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig); 23 | 24 | List<Path> getDirectories(); 25 | 26 | } 27 | 28 | class LocalFsConfig extends AbstractLocalFsConfig implements PolicyFsTestConfig { 29 | private Policy policy; 30 | private FsSourceTaskConfig sourceTaskConfig; 31 | private List<Path> directories; 32 | 33 | @Override 34 | public void init() throws IOException { 35 | directories = new ArrayList<Path>() {{ 36 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 37 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 38 | }}; 39 | for (Path dir : directories) { 40 | getFs().mkdirs(dir); 41 | } 42 | } 43 | 44 | @Override 45 | public Policy getPolicy() { 46 | return policy; 47 | } 48 | 49 | @Override 50 | public void setPolicy(Policy policy) { 51 | this.policy = policy; 52 | } 53 | 54 | @Override 55 | public FsSourceTaskConfig getSourceTaskConfig() { 56 | return sourceTaskConfig; 57 | } 58 | 59 | @Override 60 | public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { 61 | this.sourceTaskConfig = sourceTaskConfig; 62 | } 63 | 64 | @Override 65 | public List<Path> getDirectories() { 66 | return directories; 67 | } 68 | 69 | } 70 | 71 | class HdfsFsConfig extends AbstractHdfsFsConfig implements PolicyFsTestConfig { 72 | private Policy policy; 73 | private FsSourceTaskConfig
sourceTaskConfig; 74 | private List directories; 75 | 76 | @Override 77 | public void init() throws IOException { 78 | directories = new ArrayList() {{ 79 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 80 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 81 | }}; 82 | for (Path dir : directories) { 83 | getFs().mkdirs(dir); 84 | } 85 | } 86 | 87 | @Override 88 | public Policy getPolicy() { 89 | return policy; 90 | } 91 | 92 | @Override 93 | public void setPolicy(Policy policy) { 94 | this.policy = policy; 95 | } 96 | 97 | @Override 98 | public FsSourceTaskConfig getSourceTaskConfig() { 99 | return sourceTaskConfig; 100 | } 101 | 102 | @Override 103 | public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { 104 | this.sourceTaskConfig = sourceTaskConfig; 105 | } 106 | 107 | @Override 108 | public List getDirectories() { 109 | return directories; 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; 5 | import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; 6 | import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; 7 | 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.junit.jupiter.params.ParameterizedTest; 11 | import org.junit.jupiter.params.provider.MethodSource; 12 | 13 | import static org.junit.jupiter.api.Assertions.*; 14 | 15 | import java.io.IOException; 16 | import java.util.HashMap; 17 | import java.util.Iterator; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | public class SimplePolicyTest extends PolicyTestBase { 22 | 23 | @Override 24 | protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { 25 | Map cfg = new HashMap() {{ 26 | String[] uris = directories.stream().map(Path::toString) 27 | .toArray(String[]::new); 28 | put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); 29 | put(FsSourceTaskConfig.TOPIC, "topic_test"); 30 | put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); 31 | put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); 32 | put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); 33 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); 34 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test/"); 35 | }}; 36 | return new FsSourceTaskConfig(cfg); 37 | } 38 | 39 | @ParameterizedTest 40 | @MethodSource("fileSystemConfigProvider") 41 | @SuppressWarnings("unchecked") 42 | public void execPolicyEndsAfterBatching(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { 43 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 44 | originals.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); 45 | FsSourceTaskConfig sourceTaskConfig = new FsSourceTaskConfig(originals); 46 | 47 | try (Policy policy = ReflectionUtils.makePolicy( 48 | (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), 49 | sourceTaskConfig)) { 50 | 51 | FileSystem fs = fsConfig.getFs(); 52 | for (Path dir : fsConfig.getDirectories()) { 53 | fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); 54 | // 
this file does not match the regexp 55 | fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); 56 | 57 | // we wait till FS has registered the files 58 | Thread.sleep(3000); 59 | } 60 | 61 | Iterator<FileMetadata> it = policy.execute(); 62 | 63 | // First batch of files (1 file) 64 | assertFalse(policy.hasEnded()); 65 | assertTrue(it.hasNext()); 66 | String firstPath = it.next().getPath(); 67 | assertFalse(it.hasNext()); 68 | assertFalse(policy.hasEnded()); 69 | 70 | // Second batch of files (1 file) 71 | it = policy.execute(); 72 | assertTrue(it.hasNext()); 73 | assertNotEquals(firstPath, it.next().getPath()); 74 | assertFalse(it.hasNext()); 75 | assertTrue(policy.hasEnded()); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.policy; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 4 | import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; 5 | import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.kafka.common.config.ConfigException; 8 | import org.apache.kafka.connect.errors.ConnectException; 9 | import org.junit.jupiter.params.ParameterizedTest; 10 | import org.junit.jupiter.params.provider.MethodSource; 11 | 12 | import java.io.IOException; 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | 17 | import static org.junit.jupiter.api.Assertions.*; 18 | 19 | public class SleepyPolicyTest extends PolicyTestBase { 20 | 21 | @Override 22 | protected FsSourceTaskConfig buildSourceTaskConfig(List<Path> directories) { 23 | Map<String, String> cfg = new HashMap<String, String>() {{ 24 | String[] uris = directories.stream().map(Path::toString) 25 | .toArray(String[]::new); 26 | put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); 27 | put(FsSourceTaskConfig.TOPIC, "topic_test"); 28 | put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); 29 | put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); 30 | put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); 31 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); 32 | put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); 33 | put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); 34 | put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); 35 | }}; 36 | return new FsSourceTaskConfig(cfg); 37 | } 38 | 39 | @ParameterizedTest 40 | @MethodSource("fileSystemConfigProvider") 41 | @SuppressWarnings("unchecked") 42 | public void invalidSleepTime(PolicyFsTestConfig fsConfig) { 43 | Map<String, String> originals = fsConfig.getSourceTaskConfig().originalsStrings(); 44 | originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); 45 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 46 | assertThrows(ConnectException.class, () -> 47 | ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 48 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 49 | assertThrows(ConfigException.class, () -> { 50 | try { 51 | ReflectionUtils.makePolicy((Class<Policy>) fsConfig.getSourceTaskConfig() 52 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 53 | } catch (Exception e) { 54 | throw e.getCause(); 55 | } 56 | }); 57 | } 58 | 59 | @ParameterizedTest 60 | @MethodSource("fileSystemConfigProvider") 61 | @SuppressWarnings("unchecked") 62 |
public void invalidMaxExecs(PolicyFsTestConfig fsConfig) { 63 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 64 | originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); 65 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 66 | assertThrows(ConnectException.class, () -> 67 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 68 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 69 | assertThrows(ConfigException.class, () -> { 70 | try { 71 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 72 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 73 | } catch (Exception e) { 74 | throw e.getCause(); 75 | } 76 | }); 77 | } 78 | 79 | @ParameterizedTest 80 | @MethodSource("fileSystemConfigProvider") 81 | @SuppressWarnings("unchecked") 82 | public void invalidSleepFraction(PolicyFsTestConfig fsConfig) { 83 | Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); 84 | originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); 85 | FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); 86 | assertThrows(ConnectException.class, () -> 87 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 88 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); 89 | assertThrows(ConfigException.class, () -> { 90 | try { 91 | ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 92 | .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 93 | } catch (Exception e) { 94 | throw e.getCause(); 95 | } 96 | }); 97 | } 98 | 99 | @ParameterizedTest 100 | @MethodSource("fileSystemConfigProvider") 101 | @SuppressWarnings("unchecked") 102 | public void sleepExecution(PolicyFsTestConfig fsConfig) throws IOException { 103 | Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); 104 | tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); 105 | tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); 106 | FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); 107 | 108 | try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 109 | .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig)) { 110 | assertFalse(policy.hasEnded()); 111 | policy.execute(); 112 | assertFalse(policy.hasEnded()); 113 | policy.execute(); 114 | assertTrue(policy.hasEnded()); 115 | } 116 | } 117 | 118 | @ParameterizedTest 119 | @MethodSource("fileSystemConfigProvider") 120 | @SuppressWarnings("unchecked") 121 | public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { 122 | Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); 123 | tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); 124 | tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); 125 | FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); 126 | 127 | try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() 128 | .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig)) { 129 | // it never ends 130 | for (int i = 0; i < 100; i++) { 131 | assertFalse(policy.hasEnded()); 132 | policy.execute(); 133 | } 134 | policy.interrupt(); 135 | assertTrue(policy.hasEnded()); 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.task; 2 | 3 | import 
com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; 4 | import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; 5 | import org.apache.kafka.common.config.ConfigDef; 6 | import org.junit.jupiter.api.Test; 7 | 8 | import static org.junit.jupiter.api.Assertions.assertFalse; 9 | import static org.junit.jupiter.api.Assertions.assertNotNull; 10 | 11 | public class FsSourceTaskConfigTest { 12 | 13 | @Test 14 | public void checkDocumentation() { 15 | ConfigDef config = FsSourceTaskConfig.conf(); 16 | config.names().forEach(key -> { 17 | assertFalse(config.configKeys().get(key).documentation == null || 18 | "".equals(config.configKeys().get(key).documentation.trim()), 19 | () -> "Property " + key + " should be documented"); 20 | }); 21 | } 22 | 23 | @Test 24 | public void toRst() { 25 | assertNotNull(FsSourceConnectorConfig.conf().toRst()); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java: -------------------------------------------------------------------------------- 1 | package com.github.mmolimar.kafka.connect.fs.task; 2 | 3 | import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; 4 | import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; 5 | import com.github.mmolimar.kafka.connect.fs.FsSourceTask; 6 | import com.github.mmolimar.kafka.connect.fs.FsTestConfig; 7 | import org.apache.hadoop.fs.Path; 8 | 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.UUID; 14 | 15 | interface TaskFsTestConfig extends FsTestConfig { 16 | 17 | FsSourceTask getTask(); 18 | 19 | void setTask(FsSourceTask task); 20 | 21 | Map getTaskConfig(); 22 | 23 | void setTaskConfig(Map taskConfig); 24 | 25 | List getDirectories(); 26 | 27 | } 28 | 29 | class LocalFsConfig extends AbstractLocalFsConfig implements TaskFsTestConfig { 30 | private FsSourceTask task; 31 | private Map taskConfig; 32 | private List directories; 33 | 34 | @Override 35 | public void init() throws IOException { 36 | directories = new ArrayList() {{ 37 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 38 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 39 | }}; 40 | for (Path dir : directories) { 41 | getFs().mkdirs(dir); 42 | } 43 | } 44 | 45 | @Override 46 | public FsSourceTask getTask() { 47 | return task; 48 | } 49 | 50 | @Override 51 | public void setTask(FsSourceTask task) { 52 | this.task = task; 53 | } 54 | 55 | @Override 56 | public Map getTaskConfig() { 57 | return taskConfig; 58 | } 59 | 60 | @Override 61 | public void setTaskConfig(Map taskConfig) { 62 | this.taskConfig = taskConfig; 63 | } 64 | 65 | @Override 66 | public List getDirectories() { 67 | return directories; 68 | } 69 | 70 | } 71 | 72 | class HdfsFsConfig extends AbstractHdfsFsConfig implements TaskFsTestConfig { 73 | private FsSourceTask task; 74 | private Map taskConfig; 75 | private List directories; 76 | 77 | @Override 78 | public void init() throws IOException { 79 | directories = new ArrayList() {{ 80 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 81 | add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); 82 | }}; 83 | for (Path dir : directories) { 84 | getFs().mkdirs(dir); 85 | } 86 | } 87 | 88 | @Override 89 | public FsSourceTask getTask() { 90 | return task; 91 | } 92 | 93 | @Override 94 | public void setTask(FsSourceTask task) { 95 | 
this.task = task; 96 | } 97 | 98 | @Override 99 | public Map getTaskConfig() { 100 | return taskConfig; 101 | } 102 | 103 | @Override 104 | public void setTaskConfig(Map taskConfig) { 105 | this.taskConfig = taskConfig; 106 | } 107 | 108 | @Override 109 | public List getDirectories() { 110 | return directories; 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/test/resources/file/reader/data/cobol/code-pages.cpy: -------------------------------------------------------------------------------- 1 | 01 TRANSDATA. 2 | 05 CURRENCY PIC X(3). 3 | 05 SIGNATURE PIC X(8). 4 | 05 COMPANY-NAME-NP PIC X(15). 5 | 05 COMPANY-ID PIC X(10). 6 | 05 WEALTH-QFY PIC 9(1). 7 | 05 AMOUNT PIC S9(09)V99 BINARY. 8 | -------------------------------------------------------------------------------- /src/test/resources/file/reader/data/cobol/code-pages.dt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmolimar/kafka-connect-fs/7adc7574d04cd2ed20b70f4f1582669477b93f76/src/test/resources/file/reader/data/cobol/code-pages.dt -------------------------------------------------------------------------------- /src/test/resources/file/reader/data/cobol/companies.cpy: -------------------------------------------------------------------------------- 1 | 01 COMPANY-DETAILS. 2 | 05 SEGMENT-ID PIC X(5). 3 | 05 COMPANY-ID PIC X(10). 4 | 05 STATIC-DETAILS. 5 | 10 COMPANY-NAME PIC X(15). 6 | 10 ADDRESS PIC X(25). 7 | 10 TAXPAYER. 8 | 15 TAXPAYER-TYPE PIC X(1). 9 | 15 TAXPAYER-STR PIC X(8). 10 | 15 TAXPAYER-NUM REDEFINES TAXPAYER-STR 11 | PIC 9(8) COMP. 12 | 10 STRATEGY. 13 | 15 STRATEGY_DETAIL OCCURS 6. 14 | 25 NUM1 PIC 9(7) COMP. 15 | 25 NUM2 PIC 9(7) COMP-3. 16 | 17 | -------------------------------------------------------------------------------- /src/test/resources/file/reader/data/cobol/companies.dt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmolimar/kafka-connect-fs/7adc7574d04cd2ed20b70f4f1582669477b93f76/src/test/resources/file/reader/data/cobol/companies.dt -------------------------------------------------------------------------------- /src/test/resources/file/reader/data/cobol/type-variety.dt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmolimar/kafka-connect-fs/7adc7574d04cd2ed20b70f4f1582669477b93f76/src/test/resources/file/reader/data/cobol/type-variety.dt -------------------------------------------------------------------------------- /src/test/resources/file/reader/schemas/people.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "test.avro", 4 | "name": "People", 5 | "fields": [{ 6 | "name": "name", 7 | "type": "string" 8 | }, { 9 | "name": "surname", 10 | "type": "string" 11 | }, { 12 | "name": "index", 13 | "type": "int" 14 | }] 15 | } -------------------------------------------------------------------------------- /src/test/resources/file/reader/schemas/people_projection.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "test.avro", 4 | "name": "PeopleProjection", 5 | "fields": [{ 6 | "name": "name", 7 | "type": "string" 8 | }, { 9 | "name": "index", 10 | "type": "int" 11 | }] 12 | } -------------------------------------------------------------------------------- 
/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n 9 | 10 | log4j.logger.com.github.mmolimar.kafka.connect.fs=TRACE 11 | log4j.logger.org.apache.hadoop=ERROR 12 | log4j.logger.BlockStateChange=WARN 13 | log4j.logger.org.apache.parquet=WARN 14 | log4j.logger.org.apache.orc=WARN 15 | log4j.logger.org.eclipse.jetty=WARN 16 | log4j.logger.io.confluent.connect.avro=WARN 17 | --------------------------------------------------------------------------------