├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── config
└── kafka-connect-fs.properties
├── docker-compose.yml
├── docs
├── Makefile
├── make.bat
└── source
│ ├── conf.py
│ ├── config_options.rst
│ ├── connector.rst
│ ├── faq.rst
│ ├── filereaders.rst
│ ├── index.rst
│ └── policies.rst
├── pom.xml
└── src
├── main
├── assembly
│ ├── development.xml
│ ├── package.xml
│ └── standalone.xml
├── java
│ └── com
│ │ └── github
│ │ └── mmolimar
│ │ └── kafka
│ │ └── connect
│ │ └── fs
│ │ ├── FsSourceConnector.java
│ │ ├── FsSourceConnectorConfig.java
│ │ ├── FsSourceTask.java
│ │ ├── FsSourceTaskConfig.java
│ │ ├── file
│ │ ├── FileMetadata.java
│ │ └── reader
│ │ │ ├── AbstractFileReader.java
│ │ │ ├── AgnosticFileReader.java
│ │ │ ├── AvroFileReader.java
│ │ │ ├── BinaryFileReader.java
│ │ │ ├── CobolFileReader.java
│ │ │ ├── CompressionType.java
│ │ │ ├── CsvFileReader.java
│ │ │ ├── FileReader.java
│ │ │ ├── FixedWidthFileReader.java
│ │ │ ├── JacksonFileReader.java
│ │ │ ├── JsonFileReader.java
│ │ │ ├── OrcFileReader.java
│ │ │ ├── ParquetFileReader.java
│ │ │ ├── SequenceFileReader.java
│ │ │ ├── TextFileReader.java
│ │ │ ├── TsvFileReader.java
│ │ │ ├── UnivocityFileReader.java
│ │ │ ├── XmlFileReader.java
│ │ │ └── YamlFileReader.java
│ │ ├── policy
│ │ ├── AbstractPolicy.java
│ │ ├── CronPolicy.java
│ │ ├── HdfsFileWatcherPolicy.java
│ │ ├── Policy.java
│ │ ├── S3EventNotificationsPolicy.java
│ │ ├── SimplePolicy.java
│ │ └── SleepyPolicy.java
│ │ └── util
│ │ ├── Iterators.java
│ │ ├── ReflectionUtils.java
│ │ ├── TailCall.java
│ │ └── Version.java
├── resources
│ ├── META-INF
│ │ └── services
│ │ │ └── org.apache.hadoop.fs.FileSystem
│ └── kafka-connect-fs-version.properties
└── scala
│ └── com
│ └── github
│ └── mmolimar
│ └── kafka
│ └── connect
│ └── fs
│ └── file
│ └── reader
│ └── CobrixReader.scala
└── test
├── java
└── com
│ └── github
│ └── mmolimar
│ └── kafka
│ └── connect
│ └── fs
│ ├── AbstractHdfsFsConfig.java
│ ├── AbstractLocalFsConfig.java
│ ├── FsTestConfig.java
│ ├── connector
│ ├── FsSourceConnectorConfigTest.java
│ └── FsSourceConnectorTest.java
│ ├── file
│ └── reader
│ │ ├── AgnosticFileReaderTest.java
│ │ ├── AvroFileReaderTest.java
│ │ ├── BinaryFileReaderTest.java
│ │ ├── CobolFileReaderTest.java
│ │ ├── CsvFileReaderTest.java
│ │ ├── FileReaderTestBase.java
│ │ ├── FixedWidthFileReaderTest.java
│ │ ├── JacksonFileReaderTest.java
│ │ ├── JsonFileReaderTest.java
│ │ ├── OrcFileReaderTest.java
│ │ ├── ParquetFileReaderTest.java
│ │ ├── ReaderFsTestConfig.java
│ │ ├── SequenceFileReaderTest.java
│ │ ├── TextFileReaderTest.java
│ │ ├── TsvFileReaderTest.java
│ │ ├── UnivocityFileReaderTest.java
│ │ ├── XmlFileReaderTest.java
│ │ └── YamlFileReaderTest.java
│ ├── policy
│ ├── CronPolicyTest.java
│ ├── HdfsFileWatcherPolicyTest.java
│ ├── PolicyFsTestConfig.java
│ ├── PolicyTestBase.java
│ ├── S3EventNotificationsPolicyTest.java
│ ├── SimplePolicyTest.java
│ └── SleepyPolicyTest.java
│ └── task
│ ├── FsSourceTaskConfigTest.java
│ ├── FsSourceTaskTest.java
│ └── TaskFsTestConfig.java
└── resources
├── file
└── reader
│ ├── data
│ └── cobol
│ │ ├── code-pages.cpy
│ │ ├── code-pages.dt
│ │ ├── companies.cpy
│ │ ├── companies.dt
│ │ ├── type-variety.cpy
│ │ └── type-variety.dt
│ └── schemas
│ ├── people.avsc
│ └── people_projection.avsc
└── log4j.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 |
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 |
16 | # idea
17 | .idea
18 | *.iml
19 |
20 | # building
21 | target
22 | build
23 | null
24 | tmp
25 | temp
26 | test-output
27 | build.log
28 |
29 | # other scm
30 | .svn
31 | .CVS
32 | .hg*
33 |
34 | # switch to regexp syntax.
35 | # syntax: regexp
36 | # ^\.pc/
37 |
38 | # Documentation autogenerated
39 | javadoc
40 | apidocs
41 |
42 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: trusty
2 | language: java
3 | jdk:
4 | - oraclejdk8
5 | install:
6 | - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V
7 | script:
8 | - mvn test jacoco:report
9 | after_success:
10 | - mvn coveralls:report
11 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM confluentinc/cp-kafka-connect-base:6.1.0
2 |
3 | ARG PROJECT_VERSION
4 | ENV CONNECT_PLUGIN_PATH="/usr/share/java,/usr/share/confluent-hub-components"
5 |
6 | COPY ./target/components/packages/mmolimar-kafka-connect-fs-${PROJECT_VERSION}.zip /tmp/kafka-connect-fs.zip
7 | RUN confluent-hub install --no-prompt /tmp/kafka-connect-fs.zip && rm -rf /tmp/kafka-connect-fs.zip
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kafka Connect FileSystem Connector [](https://travis-ci.org/mmolimar/kafka-connect-fs)[](https://coveralls.io/github/mmolimar/kafka-connect-fs?branch=master)
2 |
3 | **kafka-connect-fs** is a [Kafka Connector](https://kafka.apache.org/documentation.html#connect)
4 | for reading records from files in the specified file systems and loading them into Kafka.
5 |
6 | Documentation for this connector can be found [here](https://kafka-connect-fs.readthedocs.io/).
7 |
8 | ## Development
9 |
10 | To build a development version you'll need a recent version of Kafka. You can build
11 | kafka-connect-fs with Maven using the standard lifecycle phases.
12 |
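For example, a standard build producing the connector package:

```bash
mvn clean package
```
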
13 | ## FAQ
14 |
15 | Some frequently asked questions on Kafka Connect FileSystem Connector can be found here -
16 | https://kafka-connect-fs.readthedocs.io/en/latest/faq.html
17 |
18 | ## Contribute
19 |
20 | - Source Code: https://github.com/mmolimar/kafka-connect-fs
21 | - Issue Tracker: https://github.com/mmolimar/kafka-connect-fs/issues
22 |
23 | ## License
24 |
25 | Released under the Apache License, version 2.0.
26 |
--------------------------------------------------------------------------------
/config/kafka-connect-fs.properties:
--------------------------------------------------------------------------------
1 | name=FsSourceConnector
2 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
3 | tasks.max=1
4 | fs.uris=file:///data,hdfs://localhost:8020/data
5 | topic=mytopic
6 | policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
7 | policy.recursive=true
8 | policy.regexp=^.*\.txt$
9 | policy.batch_size=0
10 | policy.cleanup=none
11 | file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
12 | file_reader.batch_size=0
13 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | cp-zookeeper:
4 | image: confluentinc/cp-zookeeper:6.1.0
5 | hostname: zookeeper
6 | container_name: zookeeper
7 | ports:
8 | - "2181:2181"
9 | environment:
10 | ZOOKEEPER_CLIENT_PORT: 2181
11 | ZOOKEEPER_TICK_TIME: 2000
12 |
13 | cp-kafka:
14 | image: confluentinc/cp-kafka:6.1.0
15 | hostname: kafka
16 | container_name: kafka
17 | depends_on:
18 | - cp-zookeeper
19 | ports:
20 | - "29092:29092"
21 | - "9092:9092"
22 | environment:
23 | KAFKA_BROKER_ID: 1
24 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
26 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
27 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
28 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
29 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092
30 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
31 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
32 | CONFLUENT_METRICS_ENABLE: 'false'
33 |
34 | cp-schema-registry:
35 | image: confluentinc/cp-schema-registry:6.1.0
36 | hostname: schema-registry
37 | container_name: schema-registry
38 | depends_on:
39 | - cp-zookeeper
40 | - cp-kafka
41 | ports:
42 | - "8081:8081"
43 | environment:
44 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
45 | SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'
46 |
47 | connect-fs:
48 | image: mmolimar/kafka-connect-fs:1.3.0
49 | container_name: connect
50 | depends_on:
51 | - cp-kafka
52 | - cp-schema-registry
53 | ports:
54 | - "8083:8083"
55 | - "8000:8000"
56 | environment:
57 | CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092'
58 | CONNECT_REST_ADVERTISED_HOST_NAME: connect
59 | CONNECT_REST_PORT: 8083
60 | CONNECT_GROUP_ID: compose-connect-group
61 | CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
62 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
63 | CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
64 | CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
65 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
66 | CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
67 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
68 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
69 | CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
70 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
71 | CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
72 | CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
73 | CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
74 | CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/"
75 | CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
76 | CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR
77 | KAFKA_OPTS: "-agentlib:jdwp=transport=dt_socket,server=y,address=8000,suspend=n"
78 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = kafka-connect-fs
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=kafka-connect-fs
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # kafka-connect-fs documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Mar 23 20:59:04 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = []
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | #templates_path = ['_templates']
38 |
39 | # The suffix(es) of source filenames.
40 | # You can specify multiple suffix as a list of string:
41 | #
42 | # source_suffix = ['.rst', '.md']
43 | source_suffix = '.rst'
44 |
45 | # The master toctree document.
46 | master_doc = 'index'
47 |
48 | # General information about the project.
49 | project = 'Kafka Connect FileSystem Connector'
50 | copyright = '2017, Mario Molina'
51 | author = 'Mario Molina'
52 |
53 | # The version info for the project you're documenting, acts as replacement for
54 | # |version| and |release|, also used in various other places throughout the
55 | # built documents.
56 | #
57 | # The short X.Y version.
58 | version = '1.3'
59 | # The full version, including alpha/beta/rc tags.
60 | release = '1.3'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | # language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # These patterns also affect html_static_path and html_extra_path
72 | exclude_patterns = ['build']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 | # -- Options for HTML output ----------------------------------------------
81 | import sphinx_rtd_theme
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = {}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
98 | #html_static_path = ['_static']
99 |
100 | # -- Options for HTMLHelp output ------------------------------------------
101 |
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'KafkaConnectFileSystemConnectorDoc'
104 |
105 | # -- Options for LaTeX output ---------------------------------------------
106 |
107 | latex_elements = {
108 | # The paper size ('letterpaper' or 'a4paper').
109 | #
110 | # 'papersize': 'letterpaper',
111 |
112 | # The font size ('10pt', '11pt' or '12pt').
113 | #
114 | # 'pointsize': '10pt',
115 |
116 | # Additional stuff for the LaTeX preamble.
117 | #
118 | # 'preamble': '',
119 |
120 | # Latex figure (float) alignment
121 | #
122 | # 'figure_align': 'htbp',
123 | }
124 |
125 | # Grouping the document tree into LaTeX files. List of tuples
126 | # (source start file, target name, title,
127 | # author, documentclass [howto, manual, or own class]).
128 | latex_documents = [
129 | (master_doc, 'KafkaConnectFileSystemConnector.tex', 'Kafka Connect FileSystem Connector Documentation',
130 | 'Mario Molina', 'manual'),
131 | ]
132 |
133 | # -- Options for manual page output ---------------------------------------
134 |
135 | # One entry per manual page. List of tuples
136 | # (source start file, name, description, authors, manual section).
137 | man_pages = [
138 | (master_doc, 'kafkaconnectfs', 'Kafka Connect FileSystem Connector Documentation',
139 | [author], 1)
140 | ]
141 |
142 | # -- Options for Texinfo output -------------------------------------------
143 |
144 | # Grouping the document tree into Texinfo files. List of tuples
145 | # (source start file, target name, title, author,
146 | # dir menu entry, description, category)
147 | texinfo_documents = [
148 | (master_doc, 'KafkaConnectFs', 'Kafka Connect FileSystem Connector Documentation',
149 | author, 'KafkaConnectFileSystemConnector', 'Kafka Connector for FileSystem',
150 | 'Miscellaneous'),
151 | ]
152 |
--------------------------------------------------------------------------------
/docs/source/connector.rst:
--------------------------------------------------------------------------------
1 | .. _connector:
2 |
3 | ********************************************
4 | Connector
5 | ********************************************
6 |
7 | The connector takes advantage of the abstraction provided by `Hadoop Common `__
8 | through the ``org.apache.hadoop.fs.FileSystem`` class, so it can work with a
9 | wide variety of file systems. If your FS is not included in the Hadoop Common API, you can implement
10 | an extension of this abstraction and use it transparently.
11 |
12 | Among others, these are some file systems it supports:
13 |
14 | * HDFS.
15 | * S3.
16 | * Google Cloud Storage.
17 | * Azure Blob Storage & Azure Data Lake Store.
18 | * FTP & SFTP.
19 | * WebHDFS.
20 | * Local File System.
21 | * Hadoop Archive File System.
22 |
23 | Getting started
24 | ============================================
25 |
26 | Prerequisites
27 | --------------------------------------------
28 |
29 | - Apache Kafka 2.6.0.
30 | - Java 8.
31 | - Confluent Schema Registry (recommended).
32 |
33 | Building from source
34 | --------------------------------------------
35 |
36 | .. sourcecode:: bash
37 |
38 | mvn clean package
39 |
40 | General config
41 | --------------------------------------------
42 |
43 | The ``kafka-connect-fs.properties`` file defines the following properties as required:
44 |
45 | .. sourcecode:: bash
46 |
47 | name=FsSourceConnector
48 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
49 | tasks.max=1
50 | fs.uris=file:///data,hdfs://localhost:8020/data
51 | topic=mytopic
52 | policy.class=
53 | policy.recursive=true
54 | policy.regexp=.*
55 | policy.batch_size=0
56 | policy.cleanup=none
57 | file_reader.class=
58 | file_reader.batch_size=0
59 |
60 | #. The connector name.
61 | #. Class indicating the connector.
62 | #. Number of tasks the connector is allowed to start.
63 | #. Comma-separated URIs of the FS(s). They can be URIs pointing directly to a file
64 | or a directory in the FS. These URIs can also be dynamic, using expressions that
65 | are resolved at runtime.
66 | #. Topic into which data from the FS is copied.
67 | #. Policy class to apply (must implement
68 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface).
69 | #. Flag to enable recursive traversal of subdirectories when listing files.
70 | #. Regular expression to filter files from the FS.
71 | #. Number of files that should be handled at a time. Non-positive values disable batching.
72 | #. Cleanup strategy to manage processed files.
73 | #. File reader class to read files from the FS
74 | (must implement ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader`` interface).
75 | #. Number of records to process at a time. Non-positive values disable batching.
76 |
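For example, a minimal working configuration using the bundled ``SimplePolicy`` and ``TextFileReader``
(based on the sample ``config/kafka-connect-fs.properties``, trimmed to a single local URI) could look like this:

.. sourcecode:: bash

    name=FsSourceConnector
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    fs.uris=file:///data
    topic=mytopic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.recursive=true
    policy.regexp=^.*\.txt$
    policy.batch_size=0
    policy.cleanup=none
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
    file_reader.batch_size=0
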
77 | More detailed information about these properties can be found :ref:`here`.
78 |
79 | Running locally
80 | --------------------------------------------
81 |
82 | .. sourcecode:: bash
83 |
84 | export KAFKA_HOME=/path/to/kafka/install/dir
85 |
86 | .. sourcecode:: bash
87 |
88 | mvn clean package
89 | export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')"
90 | $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties config/kafka-connect-fs.properties
91 |
92 | Running in Docker
93 | --------------------------------------------
94 |
95 | .. sourcecode:: bash
96 |
97 | mvn clean package
98 |
99 | .. sourcecode:: bash
100 |
101 | docker build --build-arg PROJECT_VERSION= .
102 | docker-compose build
103 | docker-compose up -d
104 | docker logs --tail="all" -f connect
105 |
106 | .. sourcecode:: bash
107 |
108 | curl -sX GET http://localhost:8083/connector-plugins | grep FsSourceConnector
109 |
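Once the Connect worker is up, a connector instance can also be registered through the Kafka Connect REST API.
The following is only a sketch; the connector name is arbitrary and the ``fs.uris``, ``topic``, policy and
file reader values should be adjusted to your setup:

.. sourcecode:: bash

    curl -sX POST -H "Content-Type: application/json" http://localhost:8083/connectors -d '{
      "name": "fs-source",
      "config": {
        "connector.class": "com.github.mmolimar.kafka.connect.fs.FsSourceConnector",
        "tasks.max": "1",
        "fs.uris": "file:///data",
        "topic": "mytopic",
        "policy.class": "com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy",
        "policy.regexp": "^.*\\.txt$",
        "file_reader.class": "com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader"
      }
    }'
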
110 | Components
111 | ============================================
112 |
113 | There are two main concepts to decouple concerns within the connector.
114 | They are **policies** and **file readers**, described below.
115 |
116 | Policies
117 | --------------------------------------------
118 |
119 | In order to ingest data from the FS(s), the connector needs a **policy** to define the rules to do it.
120 |
121 | Basically, the policy tries to connect to each FS included in the ``fs.uris`` connector property, lists files
122 | (filtering them with the regular expression provided in the ``policy.regexp`` property) and enables
123 | a file reader to read records.
124 |
125 | The policy to be used by the connector is defined in the ``policy.class`` connector property.
126 |
127 | .. important:: When delivering records from the connector to Kafka, each record contains its own file
128 | offset so, if the file is processed again in a later policy execution, the policy
129 | will seek the file to this offset and process only the remaining records,
130 | if any (**provided the offset was committed**).
131 |
132 | .. note:: If the URIs included in the ``fs.uris`` connector property contain any expression of the
133 | form ``${XXX}``, the dynamic URI is built at the moment of the policy execution.
134 |
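For instance, a dynamic URI resolved against the current year on every policy execution:

.. sourcecode:: bash

    fs.uris=hdfs://localhost:8020/data/${yyyy}
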
135 | Currently, there are a few policies covering common use cases but, of course, you can develop your own
136 | if the existing policies don't fit your needs.
137 | The only restriction is that you must implement the interface
138 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy``.
139 |
140 | .. include:: policies.rst
141 |
142 | File readers
143 | --------------------------------------------
144 |
145 | File readers read files from the FS and process each record. The **file reader** is needed by the policy to enable
146 | the connector to process each record, and its implementation defines how to seek and iterate over the
147 | records within the file.
148 |
149 | The file reader to be used when processing files is defined in the ``file_reader.class`` connector property.
150 |
151 | As with policies, the connector provides several sorts of readers to parse and read records
152 | in different file formats. If you don't have a file reader that fits your needs, just implement one;
153 | the only restriction is that it must implement the interface
154 | ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader``.
155 |
156 | There are several file readers included, which can read the following file formats:
157 |
158 | * Parquet.
159 | * Avro.
160 | * ORC.
161 | * SequenceFile.
162 | * Cobol / EBCDIC.
163 | * Other binary files.
164 | * CSV.
165 | * TSV.
166 | * Fixed-width.
167 | * JSON.
168 | * XML.
169 | * YAML.
170 | * Text.
171 |
172 | .. include:: filereaders.rst
173 |
--------------------------------------------------------------------------------
/docs/source/faq.rst:
--------------------------------------------------------------------------------
1 | .. faq:
2 |
3 | ********************************************
4 | FAQs
5 | ********************************************
6 |
7 | **My file was already processed and the connector, when it's executed again,
8 | processes the same records again.**
9 |
10 | If during the previous executions the records were sent successfully to Kafka,
11 | their offsets were sent too. Then, when executing the policy again, it
12 | retrieves the offset and seeks the file. If this didn't happen, it's possible
13 | that the offset was not committed yet and, consequently, the offset retrieved
14 | is non-existent or too old.
15 |
16 | Have a look when the offsets are committed in Kafka and/or try to execute the
17 | policy when you are sure the offsets have been committed.
18 |
19 | **The connector started but does not process any kind of file.**
20 |
21 | This can be for several reasons:
22 |
23 | * Check if the files contained in the FS match the regexp provided.
24 | * Check if there is any kind of problem with the FS. The connector tolerates
25 | FS connection exceptions so the files can be processed later, but you'll find
26 | these errors in the log files.
27 | * The file reader is reading files with an invalid format so it cannot
28 | process the file and continues with the next one. You can see
29 | this as an error in the log.
30 |
31 | **I have directories in the FS created day by day and I have to modify
32 | the connector everyday.**
33 |
34 | Don't do this! Take advantage of the dynamic URIs using expressions.
35 |
36 | For instance, if you have this URI ``hdfs://host:9000/data/2020``, you can
37 | use this URI ``hdfs://host:9000/data/${yyyy}`` instead.
38 |
39 | **The connector is too slow to process all URIs I have.**
40 |
41 | Obviously, this depends on the files in the FS(s), but if you have several URIs in
42 | the connector, it might be a good idea to adjust the number of tasks
43 | to process those URIs in parallel (``tasks.max`` connector property).
44 |
45 | Also, the properties ``policy.batch_size`` and/or ``file_reader.batch_size``
46 | might help if you have tons of files or very large files.
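
For example, a sketch of these settings (the values below are purely illustrative and depend on your workload):

.. sourcecode:: bash

    tasks.max=4
    policy.batch_size=200
    file_reader.batch_size=1000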
47 |
48 | **I removed a file from the FS but the connector is still sending messages
49 | with the contents of that file.**
50 |
51 | This is a tricky issue. The file reader is an iterator and processes
52 | record by record, but part of the file is buffered and, even though the
53 | file was removed from the FS, the file reader keeps producing records
54 | until it throws an exception. It's just a matter of time.
55 |
56 | But the main thing is that you don't have to worry about removing files
57 | from the FS when they are being processed. The connector tolerates errors
58 | when reading files and continues with the next file.
59 |
--------------------------------------------------------------------------------
/docs/source/filereaders.rst:
--------------------------------------------------------------------------------
1 | Parquet
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Reads files with `Parquet `__ format.
5 |
6 | The reader takes advantage of the Parquet-Avro API and uses the Parquet file
7 | as if it was an Avro file, so the message sent to Kafka is built in the same
8 | way as the Avro file reader does.
9 |
10 | More information about properties of this file reader :ref:`here`.
11 |
12 | Avro
13 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
14 |
15 | Files with `Avro `__ format can be read with this reader.
16 |
17 | The Avro schema is not needed because it is read from the file. The message sent
18 | to Kafka is created by transforming the record by means of the
19 | `Confluent avro-converter `__
20 | API.
21 |
22 | More information about properties of this file reader :ref:`here`.
23 |
24 | ORC
25 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
26 |
27 | `ORC files `__ are a self-describing type-aware
28 | columnar file format designed for Hadoop workloads.
29 |
30 | This reader can process this file format, translating its schema and building
31 | a Kafka message with the content.
32 |
33 | .. warning:: If you have ORC files with ``union`` data types, these data
34 | types will be transformed into a ``map`` object in the Kafka message.
35 | The value of each key will be ``fieldN``, where ``N`` represents
36 | the index within the data type.
37 |
38 | More information about properties of this file reader :ref:`here`.
39 |
40 | SequenceFile
41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42 |
43 | `Sequence files `__ are a Hadoop file format in which data is
44 | serialized in key-value pairs.
45 |
46 | This reader can process this file format and build a Kafka message with the
47 | key-value pair. These two values are named ``key`` and ``value`` in the message
48 | by default but you can customize these field names.
49 |
50 | More information about properties of this file reader :ref:`here`.
51 |
52 | Cobol
53 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
54 |
55 | Mainframe files (Cobol / EBCDIC binary files) can be processed with this reader which uses the
56 | `Cobrix `__ parser.
57 |
58 | By means of the corresponding copybook (representing its schema), it parses each record and
59 | translates it into a Kafka message with the schema.
60 |
61 | More information about properties of this file reader :ref:`here`.
62 |
63 | Binary
64 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
65 |
66 | All other kinds of binary files can be ingested using this reader.
67 |
68 | It just extracts the content plus some metadata such as: path, file owner, file group, length, access time,
69 | and modification time.
70 |
71 | Each message will contain the following schema:
72 |
73 | * ``path``: File path (string).
74 | * ``owner``: Owner of the file (string).
75 | * ``group``: Group associated with the file (string).
76 | * ``length``: Length of this file, in bytes (long).
77 | * ``access_time``: Access time of the file (long).
78 | * ``modification_time``: Modification time of the file (long).
79 | * ``content``: Content of the file (bytes).
80 |
81 | More information about properties of this file reader :ref:`here`.
82 |
83 | CSV
84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85 |
86 | CSV file reader using a custom token to distinguish different columns in each line.
87 |
88 | It allows distinguishing a header row in the files and setting the names of the columns
89 | in the message sent to Kafka. If there is no header, the value of each column will be in
90 | the field named ``column_N`` (**N** represents the column index) in the message.
91 | Also, the token delimiter for columns is configurable.
92 |
93 | This reader is based on the `Univocity CSV parser `__.
94 |
95 | More information about properties of this file reader :ref:`here`.
96 |
97 | TSV
98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
99 |
100 | TSV file reader using a tab ``\t`` to distinguish different columns in each line.
101 |
102 | Its behaviour is the same as the CSV file reader's regarding the header and the column names.
103 |
104 | This reader is based on the `Univocity TSV parser `__.
105 |
106 | More information about properties of this file reader :ref:`here`.
107 |
108 | FixedWidth
109 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
110 |
111 | FixedWidth is a plain text file reader which distinguishes each column based on the length of each field.
112 |
113 | Its behaviour is the same as the CSV / TSV file readers' regarding the header and the column names.
114 |
115 | This reader is based on the `Univocity Fixed-Width parser `__.
116 |
117 | More information about properties of this file reader :ref:`here`.
118 |
119 | JSON
120 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121 |
122 | Reads JSON files, which may contain any number of fields with their specified
123 | data types. The schema for these records is inferred from the first record,
124 | and all the fields it contains are marked as optional in the schema.
125 |
126 | More information about properties of this file reader :ref:`here`.
127 |
128 | XML
129 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
130 |
131 | Reads XML files, which may contain any number of fields with their specified
132 | data types. The schema for these records is inferred from the first record,
133 | and all the fields it contains are marked as optional in the schema.
134 |
135 | .. warning:: Take into account the current
136 | `limitations `__.
137 |
138 | More information about properties of this file reader :ref:`here`.
139 |
140 | YAML
141 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
142 |
143 | Reads YAML files, which may contain any number of fields with their specified
144 | data types. The schema for these records is inferred from the first record,
145 | and all the fields it contains are marked as optional in the schema.
146 |
147 | More information about properties of this file reader :ref:`here`.
148 |
149 | Text
150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
151 |
152 | Reads plain text files.
153 |
154 | Each line represents one record (by default), which will be placed in a field
155 | named ``value`` in the message sent to Kafka, though you can
156 | customize this field name.
157 |
158 | More information about properties of this file reader :ref:`here`.
159 |
160 | Agnostic
161 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
162 |
163 | Actually, this reader is a wrapper around the readers listed above.
164 |
165 | It tries to read any kind of file format using an internal reader chosen by the file extension,
166 | applying the proper one (Parquet, Avro, ORC, SequenceFile, Cobol / EBCDIC, CSV, TSV, FixedWidth, JSON, XML,
167 | YAML, or Text). If no extension matches, the Text file reader is applied (see the example after the extension list below).
168 |
169 | Default extensions for each format (configurable):
170 |
171 | * Parquet: ``.parquet``
172 | * Avro: ``.avro``
173 | * ORC: ``.orc``
174 | * SequenceFile: ``.seq``
175 | * Cobol / EBCDIC: ``.dat``
176 | * Other binary files: ``.bin``
177 | * CSV: ``.csv``
178 | * TSV: ``.tsv``
179 | * FixedWidth: ``.fixed``
180 | * JSON: ``.json``
181 | * XML: ``.xml``
182 | * YAML: ``.yaml``
183 | * Text: any other sort of file extension.
184 |
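For example, the following sketch (reusing properties shown earlier in this documentation) lets a single
connector ingest a directory with mixed formats, delegating the choice of the internal reader to the file extension:

.. sourcecode:: bash

    policy.regexp=.*
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader
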
185 | More information about properties of this file reader :ref:`here`.
186 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. kafka-connect-fs documentation master file, created by Mario Molina
2 |
3 | ********************************************
4 | Kafka Connect FileSystem Connector
5 | ********************************************
6 |
7 | Kafka Connect FileSystem Connector is a source connector for reading records from
8 | files in the specified file systems and loading them into Kafka.
9 |
10 | The connector supports:
11 |
12 | * Several sorts of file systems (FS) to use.
13 | * Dynamic and static URIs to ingest data from.
14 | * Policies to define rules about how to look for files and clean them up after processing.
15 | * File readers to parse and read different kinds of file formats.
16 |
17 | To learn more about the connector you can read :ref:`this section` and for more detailed
18 | configuration options you can read :ref:`this other one`.
19 |
20 | Also, you can download the source code from `here `__.
21 |
22 | Contents
23 | ============================================
24 |
25 | .. toctree::
26 | :maxdepth: 2
27 |
28 | connector
29 | config_options
30 | faq
31 |
--------------------------------------------------------------------------------
/docs/source/policies.rst:
--------------------------------------------------------------------------------
1 | Simple
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | This policy just filters and processes the files included in the corresponding URIs once.
5 |
6 | .. attention:: This policy is intended mainly for testing purposes.
7 |
8 | Sleepy
9 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
10 |
11 | The behaviour of this policy is similar to the Simple policy, but on each execution it sleeps
12 | and waits for the next one. Additionally, its custom properties allow ending it.
13 |
14 | You can learn more about the properties of this policy :ref:`here`.
15 |
16 | Cron
17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18 |
19 | This policy is scheduled based on cron expressions; the expression format used in the configuration
20 | is the one defined by the `Quartz Scheduler `__ library.
21 |
22 | After finishing each execution, the policy sleeps until the next scheduled one, if applicable.
23 |
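For illustration, a sketch of a cron-scheduled configuration that fires every five minutes; the property key
``policy.cron.expression`` is an assumption here, so check the configuration options section for the exact name:

.. sourcecode:: bash

    policy.class=com.github.mmolimar.kafka.connect.fs.policy.CronPolicy
    # Assumed key for the Quartz expression: fires at second 0 of every 5th minute
    policy.cron.expression=0 0/5 * * * ?
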
24 | You can learn more about the properties of this policy :ref:`here`.
25 |
26 | HDFS file watcher
27 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
28 |
29 | It uses Hadoop notification events: all create/append/rename/close events will be reported
30 | as files to be ingested.
31 |
32 | Just use it when you have HDFS URIs.
33 |
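A minimal sketch of the relevant properties for this policy (host and path taken from the examples in this
documentation; adjust them to your cluster):

.. sourcecode:: bash

    fs.uris=hdfs://localhost:8020/data
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy
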
34 | You can learn more about the properties of this policy :ref:`here`.
35 |
36 | .. attention:: The URIs included in the general property ``fs.uris`` will be filtered and only those
37 | that start with the prefix ``hdfs://`` will be watched. Also, this policy
38 | will only work for Hadoop versions 2.6.0 or higher.
39 |
40 | S3 event notifications
41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42 |
43 | It uses S3 event notifications to process files which have been created or modified in S3.
44 | These notifications will be read from an AWS SQS queue and they can be sent to SQS directly from S3 or via
45 | AWS SNS, either as an SNS notification or a raw message in the subscription.
46 |
47 | Use it only when you have S3 URIs, and make sure event notifications in the S3 bucket are enabled towards an SNS
48 | topic or an SQS queue.
49 |
50 | You can learn more about the properties of this policy :ref:`here`.
51 |
--------------------------------------------------------------------------------
/src/main/assembly/development.xml:
--------------------------------------------------------------------------------
1 |
5 |
7 | development
8 |
9 | dir
10 |
11 | false
12 |
13 |
14 | share/java/kafka-connect-fs/
15 |
16 |
17 |
--------------------------------------------------------------------------------
/src/main/assembly/package.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 | package
7 |
8 | dir
9 |
10 | false
11 |
12 |
13 | ${project.basedir}
14 | share/doc/${project.name}/
15 |
16 | README*
17 | LICENSE*
18 | NOTICE*
19 | licenses/
20 |
21 |
22 |
23 | ${project.basedir}/config
24 | etc/${project.name}
25 |
26 | *
27 |
28 |
29 |
30 |
31 |
32 | share/java/${project.name}
33 | true
34 | true
35 |
36 | org.apache.kafka:connect-api
37 | org.mortbay.jetty:*
38 | com.sun.jersey:*
39 | org.eclipse.jetty:jetty-util
40 | com.sun.jersey.contribs:jersey-guice
41 | org.apache.zookeeper:zookeeper
42 | log4j:log4j
43 | org.slf4j:slf4j-api
44 | org.slf4j:slf4j-log4j12
45 | javax.servlet:servlet-api
46 | javax.servlet.jsp:jsp-api
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/src/main/assembly/standalone.xml:
--------------------------------------------------------------------------------
1 |
5 |
7 | standalone
8 |
9 | jar
10 |
11 | false
12 |
13 |
14 | ${project.basedir}
15 | /
16 |
17 | README*
18 | LICENSE*
19 | NOTICE*
20 | licenses.html
21 | licenses/
22 | notices/
23 |
24 |
25 |
26 |
27 |
28 | /
29 | true
30 | true
31 | runtime
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java:
--------------------------------------------------------------------------------
1 | package com.github.mmolimar.kafka.connect.fs;
2 |
3 | import com.github.mmolimar.kafka.connect.fs.util.Version;
4 | import org.apache.kafka.common.config.ConfigDef;
5 | import org.apache.kafka.common.config.ConfigException;
6 | import org.apache.kafka.connect.connector.Task;
7 | import org.apache.kafka.connect.errors.ConnectException;
8 | import org.apache.kafka.connect.source.SourceConnector;
9 | import org.apache.kafka.connect.util.ConnectorUtils;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.util.ArrayList;
14 | import java.util.HashMap;
15 | import java.util.List;
16 | import java.util.Map;
17 |
18 | public class FsSourceConnector extends SourceConnector {
19 |
20 | private static Logger log = LoggerFactory.getLogger(FsSourceConnector.class);
21 |
22 | private FsSourceConnectorConfig config;
23 |
24 | @Override
25 | public String version() {
26 | return Version.getVersion();
27 | }
28 |
29 | @Override
30 | public void start(Map<String, String> properties) {
31 | log.info("{} Starting connector...", this);
32 | try {
33 | config = new FsSourceConnectorConfig(properties);
34 | } catch (ConfigException ce) {
35 | throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce);
36 | } catch (Exception ce) {
37 | throw new ConnectException("An error has occurred when starting FsSourceConnector." + ce);
38 | }
39 | }
40 |
41 | @Override
42 | public Class<? extends Task> taskClass() {
43 | return FsSourceTask.class;
44 | }
45 |
46 | @Override
47 | public List