├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── config
└── kafka-connect-fs.properties
├── docker-compose.yml
├── docs
├── Makefile
├── make.bat
└── source
│ ├── conf.py
│ ├── config_options.rst
│ ├── connector.rst
│ ├── faq.rst
│ ├── filereaders.rst
│ ├── index.rst
│ └── policies.rst
├── pom.xml
└── src
├── main
├── assembly
│ ├── development.xml
│ ├── package.xml
│ └── standalone.xml
├── java
│ └── com
│ │ └── github
│ │ └── mmolimar
│ │ └── kafka
│ │ └── connect
│ │ └── fs
│ │ ├── FsSourceConnector.java
│ │ ├── FsSourceConnectorConfig.java
│ │ ├── FsSourceTask.java
│ │ ├── FsSourceTaskConfig.java
│ │ ├── file
│ │ ├── FileMetadata.java
│ │ └── reader
│ │ │ ├── AbstractFileReader.java
│ │ │ ├── AgnosticFileReader.java
│ │ │ ├── AvroFileReader.java
│ │ │ ├── BinaryFileReader.java
│ │ │ ├── CobolFileReader.java
│ │ │ ├── CompressionType.java
│ │ │ ├── CsvFileReader.java
│ │ │ ├── FileReader.java
│ │ │ ├── FixedWidthFileReader.java
│ │ │ ├── JacksonFileReader.java
│ │ │ ├── JsonFileReader.java
│ │ │ ├── OrcFileReader.java
│ │ │ ├── ParquetFileReader.java
│ │ │ ├── SequenceFileReader.java
│ │ │ ├── TextFileReader.java
│ │ │ ├── TsvFileReader.java
│ │ │ ├── UnivocityFileReader.java
│ │ │ ├── XmlFileReader.java
│ │ │ └── YamlFileReader.java
│ │ ├── policy
│ │ ├── AbstractPolicy.java
│ │ ├── CronPolicy.java
│ │ ├── HdfsFileWatcherPolicy.java
│ │ ├── Policy.java
│ │ ├── S3EventNotificationsPolicy.java
│ │ ├── SimplePolicy.java
│ │ └── SleepyPolicy.java
│ │ └── util
│ │ ├── Iterators.java
│ │ ├── ReflectionUtils.java
│ │ ├── TailCall.java
│ │ └── Version.java
├── resources
│ ├── META-INF
│ │ └── services
│ │ │ └── org.apache.hadoop.fs.FileSystem
│ └── kafka-connect-fs-version.properties
└── scala
│ └── com
│ └── github
│ └── mmolimar
│ └── kafka
│ └── connect
│ └── fs
│ └── file
│ └── reader
│ └── CobrixReader.scala
└── test
├── java
└── com
│ └── github
│ └── mmolimar
│ └── kafka
│ └── connect
│ └── fs
│ ├── AbstractHdfsFsConfig.java
│ ├── AbstractLocalFsConfig.java
│ ├── FsTestConfig.java
│ ├── connector
│ ├── FsSourceConnectorConfigTest.java
│ └── FsSourceConnectorTest.java
│ ├── file
│ └── reader
│ │ ├── AgnosticFileReaderTest.java
│ │ ├── AvroFileReaderTest.java
│ │ ├── BinaryFileReaderTest.java
│ │ ├── CobolFileReaderTest.java
│ │ ├── CsvFileReaderTest.java
│ │ ├── FileReaderTestBase.java
│ │ ├── FixedWidthFileReaderTest.java
│ │ ├── JacksonFileReaderTest.java
│ │ ├── JsonFileReaderTest.java
│ │ ├── OrcFileReaderTest.java
│ │ ├── ParquetFileReaderTest.java
│ │ ├── ReaderFsTestConfig.java
│ │ ├── SequenceFileReaderTest.java
│ │ ├── TextFileReaderTest.java
│ │ ├── TsvFileReaderTest.java
│ │ ├── UnivocityFileReaderTest.java
│ │ ├── XmlFileReaderTest.java
│ │ └── YamlFileReaderTest.java
│ ├── policy
│ ├── CronPolicyTest.java
│ ├── HdfsFileWatcherPolicyTest.java
│ ├── PolicyFsTestConfig.java
│ ├── PolicyTestBase.java
│ ├── S3EventNotificationsPolicyTest.java
│ ├── SimplePolicyTest.java
│ └── SleepyPolicyTest.java
│ └── task
│ ├── FsSourceTaskConfigTest.java
│ ├── FsSourceTaskTest.java
│ └── TaskFsTestConfig.java
└── resources
├── file
└── reader
│ ├── data
│ └── cobol
│ │ ├── code-pages.cpy
│ │ ├── code-pages.dt
│ │ ├── companies.cpy
│ │ ├── companies.dt
│ │ ├── type-variety.cpy
│ │ └── type-variety.dt
│ └── schemas
│ ├── people.avsc
│ └── people_projection.avsc
└── log4j.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 |
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 |
16 | # idea
17 | .idea
18 | *.iml
19 |
20 | # building
21 | target
22 | build
23 | null
24 | tmp
25 | temp
26 | test-output
27 | build.log
28 |
29 | # other scm
30 | .svn
31 | .CVS
32 | .hg*
33 |
34 | # switch to regexp syntax.
35 | # syntax: regexp
36 | # ^\.pc/
37 |
38 | # Documentation autogenerated
39 | javadoc
40 | apidocs
41 |
42 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: trusty
2 | language: java
3 | jdk:
4 | - oraclejdk8
5 | install:
6 | - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V
7 | script:
8 | - mvn test jacoco:report
9 | after_success:
10 | - mvn coveralls:report
11 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM confluentinc/cp-kafka-connect-base:6.1.0
2 |
3 | ARG PROJECT_VERSION
4 | ENV CONNECT_PLUGIN_PATH="/usr/share/java,/usr/share/confluent-hub-components"
5 |
6 | COPY ./target/components/packages/mmolimar-kafka-connect-fs-${PROJECT_VERSION}.zip /tmp/kafka-connect-fs.zip
7 | RUN confluent-hub install --no-prompt /tmp/kafka-connect-fs.zip && rm -rf /tmp/kafka-connect-fs.zip
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kafka Connect FileSystem Connector [](https://travis-ci.org/mmolimar/kafka-connect-fs)[](https://coveralls.io/github/mmolimar/kafka-connect-fs?branch=master)
2 |
3 | **kafka-connect-fs** is a [Kafka Connector](https://kafka.apache.org/documentation.html#connect)
4 | for reading records from files in the specified file systems and loading them into Kafka.
5 |
6 | Documentation for this connector can be found [here](https://kafka-connect-fs.readthedocs.io/).
7 |
8 | ## Development
9 |
10 | To build a development version you'll need a recent version of Kafka. You can build
11 | kafka-connect-fs with Maven using the standard lifecycle phases.
12 |
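For example, a standard build producing the connector package:

```bash
mvn clean package
```
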
13 | ## FAQ
14 |
15 | Some frequently asked questions on Kafka Connect FileSystem Connector can be found here -
16 | https://kafka-connect-fs.readthedocs.io/en/latest/faq.html
17 |
18 | ## Contribute
19 |
20 | - Source Code: https://github.com/mmolimar/kafka-connect-fs
21 | - Issue Tracker: https://github.com/mmolimar/kafka-connect-fs/issues
22 |
23 | ## License
24 |
25 | Released under the Apache License, version 2.0.
26 |
--------------------------------------------------------------------------------
/config/kafka-connect-fs.properties:
--------------------------------------------------------------------------------
1 | name=FsSourceConnector
2 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
3 | tasks.max=1
4 | fs.uris=file:///data,hdfs://localhost:8020/data
5 | topic=mytopic
6 | policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
7 | policy.recursive=true
8 | policy.regexp=^.*\.txt$
9 | policy.batch_size=0
10 | policy.cleanup=none
11 | file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
12 | file_reader.batch_size=0
13 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | cp-zookeeper:
4 | image: confluentinc/cp-zookeeper:6.1.0
5 | hostname: zookeeper
6 | container_name: zookeeper
7 | ports:
8 | - "2181:2181"
9 | environment:
10 | ZOOKEEPER_CLIENT_PORT: 2181
11 | ZOOKEEPER_TICK_TIME: 2000
12 |
13 | cp-kafka:
14 | image: confluentinc/cp-kafka:6.1.0
15 | hostname: kafka
16 | container_name: kafka
17 | depends_on:
18 | - cp-zookeeper
19 | ports:
20 | - "29092:29092"
21 | - "9092:9092"
22 | environment:
23 | KAFKA_BROKER_ID: 1
24 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
26 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
27 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
28 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
29 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092
30 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
31 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
32 | CONFLUENT_METRICS_ENABLE: 'false'
33 |
34 | cp-schema-registry:
35 | image: confluentinc/cp-schema-registry:6.1.0
36 | hostname: schema-registry
37 | container_name: schema-registry
38 | depends_on:
39 | - cp-zookeeper
40 | - cp-kafka
41 | ports:
42 | - "8081:8081"
43 | environment:
44 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
45 | SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'
46 |
47 | connect-fs:
48 | image: mmolimar/kafka-connect-fs:1.3.0
49 | container_name: connect
50 | depends_on:
51 | - cp-kafka
52 | - cp-schema-registry
53 | ports:
54 | - "8083:8083"
55 | - "8000:8000"
56 | environment:
57 | CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092'
58 | CONNECT_REST_ADVERTISED_HOST_NAME: connect
59 | CONNECT_REST_PORT: 8083
60 | CONNECT_GROUP_ID: compose-connect-group
61 | CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
62 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
63 | CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
64 | CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
65 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
66 | CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
67 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
68 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
69 | CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
70 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
71 | CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
72 | CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
73 | CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
74 | CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/"
75 | CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
76 | CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR
77 | KAFKA_OPTS: "-agentlib:jdwp=transport=dt_socket,server=y,address=8000,suspend=n"
78 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = kafka-connect-fs
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=kafka-connect-fs
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # kafka-connect-fs documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Mar 23 20:59:04 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = []
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | #templates_path = ['_templates']
38 |
39 | # The suffix(es) of source filenames.
40 | # You can specify multiple suffix as a list of string:
41 | #
42 | # source_suffix = ['.rst', '.md']
43 | source_suffix = '.rst'
44 |
45 | # The master toctree document.
46 | master_doc = 'index'
47 |
48 | # General information about the project.
49 | project = 'Kafka Connect FileSystem Connector'
50 | copyright = '2017, Mario Molina'
51 | author = 'Mario Molina'
52 |
53 | # The version info for the project you're documenting, acts as replacement for
54 | # |version| and |release|, also used in various other places throughout the
55 | # built documents.
56 | #
57 | # The short X.Y version.
58 | version = '1.3'
59 | # The full version, including alpha/beta/rc tags.
60 | release = '1.3'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | # language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # These patterns also affect html_static_path and html_extra_path
72 | exclude_patterns = ['build']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 | # -- Options for HTML output ----------------------------------------------
81 | import sphinx_rtd_theme
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = {}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
98 | #html_static_path = ['_static']
99 |
100 | # -- Options for HTMLHelp output ------------------------------------------
101 |
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'KafkaConnectFileSystemConnectorDoc'
104 |
105 | # -- Options for LaTeX output ---------------------------------------------
106 |
107 | latex_elements = {
108 | # The paper size ('letterpaper' or 'a4paper').
109 | #
110 | # 'papersize': 'letterpaper',
111 |
112 | # The font size ('10pt', '11pt' or '12pt').
113 | #
114 | # 'pointsize': '10pt',
115 |
116 | # Additional stuff for the LaTeX preamble.
117 | #
118 | # 'preamble': '',
119 |
120 | # Latex figure (float) alignment
121 | #
122 | # 'figure_align': 'htbp',
123 | }
124 |
125 | # Grouping the document tree into LaTeX files. List of tuples
126 | # (source start file, target name, title,
127 | # author, documentclass [howto, manual, or own class]).
128 | latex_documents = [
129 | (master_doc, 'KafkaConnectFileSystemConnector.tex', 'Kafka Connect FileSystem Connector Documentation',
130 | 'Mario Molina', 'manual'),
131 | ]
132 |
133 | # -- Options for manual page output ---------------------------------------
134 |
135 | # One entry per manual page. List of tuples
136 | # (source start file, name, description, authors, manual section).
137 | man_pages = [
138 | (master_doc, 'kafkaconnectfs', 'Kafka Connect FileSystem Connector Documentation',
139 | [author], 1)
140 | ]
141 |
142 | # -- Options for Texinfo output -------------------------------------------
143 |
144 | # Grouping the document tree into Texinfo files. List of tuples
145 | # (source start file, target name, title, author,
146 | # dir menu entry, description, category)
147 | texinfo_documents = [
148 | (master_doc, 'KafkaConnectFs', 'Kafka Connect FileSystem Connector Documentation',
149 | author, 'KafkaConnectFileSystemConnector', 'Kafka Connector for FileSystem',
150 | 'Miscellaneous'),
151 | ]
152 |
--------------------------------------------------------------------------------
/docs/source/connector.rst:
--------------------------------------------------------------------------------
1 | .. _connector:
2 |
3 | ********************************************
4 | Connector
5 | ********************************************
6 |
7 | The connector takes advantage of the abstraction provided by `Hadoop Common `__
8 | through the ``org.apache.hadoop.fs.FileSystem`` class, so it can work with a
9 | wide variety of file systems. If your FS is not included in the Hadoop Common API, you can implement
10 | an extension of this abstraction and use it transparently.
11 |
12 | Among others, these are some file systems it supports:
13 |
14 | * HDFS.
15 | * S3.
16 | * Google Cloud Storage.
17 | * Azure Blob Storage & Azure Data Lake Store.
18 | * FTP & SFTP.
19 | * WebHDFS.
20 | * Local File System.
21 | * Hadoop Archive File System.
22 |
23 | Getting started
24 | ============================================
25 |
26 | Prerequisites
27 | --------------------------------------------
28 |
29 | - Apache Kafka 2.6.0.
30 | - Java 8.
31 | - Confluent Schema Registry (recommended).
32 |
33 | Building from source
34 | --------------------------------------------
35 |
36 | .. sourcecode:: bash
37 |
38 | mvn clean package
39 |
40 | General config
41 | --------------------------------------------
42 |
43 | The ``kafka-connect-fs.properties`` file defines the following properties as required:
44 |
45 | .. sourcecode:: bash
46 |
47 | name=FsSourceConnector
48 | connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
49 | tasks.max=1
50 | fs.uris=file:///data,hdfs://localhost:8020/data
51 | topic=mytopic
52 | policy.class=
53 | policy.recursive=true
54 | policy.regexp=.*
55 | policy.batch_size=0
56 | policy.cleanup=none
57 | file_reader.class=
58 | file_reader.batch_size=0
59 |
60 | #. The connector name.
61 | #. Class indicating the connector.
62 | #. Number of tasks the connector is allowed to start.
63 | #. Comma-separated URIs of the FS(s). They can be URIs pointing directly to a file
64 | or a directory in the FS. These URIs can also be dynamic, using expressions that
65 | are resolved at runtime.
66 | #. Topic into which data from the FS is copied.
67 | #. Policy class to apply (must implement
68 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface).
69 | #. Flag to enable recursive traversal of subdirectories when listing files.
70 | #. Regular expression to filter files from the FS.
71 | #. Number of files that should be handled at a time. Non-positive values disable batching.
72 | #. Cleanup strategy to manage processed files.
73 | #. File reader class to read files from the FS
74 | (must implement ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader`` interface).
75 | #. Number of records to process at a time. Non-positive values disable batching.
76 |
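For example, a minimal working configuration using the bundled ``SimplePolicy`` and ``TextFileReader``
(based on the sample ``config/kafka-connect-fs.properties``, trimmed to a single local URI) could look like this:

.. sourcecode:: bash

    name=FsSourceConnector
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    fs.uris=file:///data
    topic=mytopic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.recursive=true
    policy.regexp=^.*\.txt$
    policy.batch_size=0
    policy.cleanup=none
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
    file_reader.batch_size=0
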
77 | More detailed information about these properties can be found :ref:`here`.
78 |
79 | Running locally
80 | --------------------------------------------
81 |
82 | .. sourcecode:: bash
83 |
84 | export KAFKA_HOME=/path/to/kafka/install/dir
85 |
86 | .. sourcecode:: bash
87 |
88 | mvn clean package
89 | export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')"
90 | $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties config/kafka-connect-fs.properties
91 |
92 | Running in Docker
93 | --------------------------------------------
94 |
95 | .. sourcecode:: bash
96 |
97 | mvn clean package
98 |
99 | .. sourcecode:: bash
100 |
101 | docker build --build-arg PROJECT_VERSION= .
102 | docker-compose build
103 | docker-compose up -d
104 | docker logs --tail="all" -f connect
105 |
106 | .. sourcecode:: bash
107 |
108 | curl -sX GET http://localhost:8083/connector-plugins | grep FsSourceConnector
109 |
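Once the Connect worker is up, a connector instance can also be registered through the Kafka Connect REST API.
The following is only a sketch; the connector name is arbitrary and the ``fs.uris``, ``topic``, policy and
file reader values should be adjusted to your setup:

.. sourcecode:: bash

    curl -sX POST -H "Content-Type: application/json" http://localhost:8083/connectors -d '{
      "name": "fs-source",
      "config": {
        "connector.class": "com.github.mmolimar.kafka.connect.fs.FsSourceConnector",
        "tasks.max": "1",
        "fs.uris": "file:///data",
        "topic": "mytopic",
        "policy.class": "com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy",
        "policy.regexp": "^.*\\.txt$",
        "file_reader.class": "com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader"
      }
    }'
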
110 | Components
111 | ============================================
112 |
113 | There are two main concepts to decouple concerns within the connector.
114 | They are **policies** and **file readers**, described below.
115 |
116 | Policies
117 | --------------------------------------------
118 |
119 | In order to ingest data from the FS(s), the connector needs a **policy** to define the rules to do it.
120 |
121 | Basically, the policy tries to connect to each FS included in the ``fs.uris`` connector property, lists files
122 | (filtering them with the regular expression provided in the ``policy.regexp`` property) and enables
123 | a file reader to read records.
124 |
125 | The policy to be used by the connector is defined in the ``policy.class`` connector property.
126 |
127 | .. important:: When delivering records from the connector to Kafka, each record contains its own file
128 | offset so, if the file is processed again in a later policy execution, the policy
129 | will seek the file to this offset and process only the remaining records,
130 | if any (**provided the offset was committed**).
131 |
132 | .. note:: If the URIs included in the ``fs.uris`` connector property contain any expression of the
133 | form ``${XXX}``, the dynamic URI is built at the moment of the policy execution.
134 |
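For instance, a dynamic URI resolved against the current year on every policy execution:

.. sourcecode:: bash

    fs.uris=hdfs://localhost:8020/data/${yyyy}
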
135 | Currently, there are a few policies covering common use cases but, of course, you can develop your own
136 | if the existing policies don't fit your needs.
137 | The only restriction is that you must implement the interface
138 | ``com.github.mmolimar.kafka.connect.fs.policy.Policy``.
139 |
140 | .. include:: policies.rst
141 |
142 | File readers
143 | --------------------------------------------
144 |
145 | File readers read files from the FS and process each record. The **file reader** is needed by the policy to enable
146 | the connector to process each record, and its implementation defines how to seek and iterate over the
147 | records within the file.
148 |
149 | The file reader to be used when processing files is defined in the ``file_reader.class`` connector property.
150 |
151 | As with policies, the connector provides several sorts of readers to parse and read records
152 | in different file formats. If you don't have a file reader that fits your needs, just implement one;
153 | the only restriction is that it must implement the interface
154 | ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader``.
155 |
156 | There are several file readers included, which can read the following file formats:
157 |
158 | * Parquet.
159 | * Avro.
160 | * ORC.
161 | * SequenceFile.
162 | * Cobol / EBCDIC.
163 | * Other binary files.
164 | * CSV.
165 | * TSV.
166 | * Fixed-width.
167 | * JSON.
168 | * XML.
169 | * YAML.
170 | * Text.
171 |
172 | .. include:: filereaders.rst
173 |
--------------------------------------------------------------------------------
/docs/source/faq.rst:
--------------------------------------------------------------------------------
1 | .. faq:
2 |
3 | ********************************************
4 | FAQs
5 | ********************************************
6 |
7 | **My file was already processed and the connector, when it's executed again,
8 | processes the same records again.**
9 |
10 | If during the previous executions the records were sent successfully to Kafka,
11 | their offsets were sent too. Then, when executing the policy again, it
12 | retrieves the offset and seeks the file. If this didn't happen, it's possible
13 | that the offset was not committed yet and, consequently, the offset retrieved
14 | is non-existent or too old.
15 |
16 | Have a look when the offsets are committed in Kafka and/or try to execute the
17 | policy when you are sure the offsets have been committed.
18 |
19 | **The connector started but does not process any kind of file.**
20 |
21 | This can be for several reasons:
22 |
23 | * Check if the files contained in the FS match the regexp provided.
24 | * Check if there is any kind of problem with the FS. The connector tolerates
25 | FS connection exceptions so the files can be processed later, but you'll find
26 | these errors in the log files.
27 | * The file reader is reading files with an invalid format so it cannot
28 | process the file and continues with the next one. You can see
29 | this as an error in the log.
30 |
31 | **I have directories in the FS created day by day and I have to modify
32 | the connector everyday.**
33 |
34 | Don't do this! Take advantage of the dynamic URIs using expressions.
35 |
36 | For instance, if you have this URI ``hdfs://host:9000/data/2020``, you can
37 | use this URI ``hdfs://host:9000/data/${yyyy}`` instead.
38 |
39 | **The connector is too slow to process all URIs I have.**
40 |
41 | Obviously, this depends on the files in the FS(s), but if you have several URIs in
42 | the connector, it might be a good idea to adjust the number of tasks
43 | to process those URIs in parallel (``tasks.max`` connector property).
44 |
45 | Also, the properties ``policy.batch_size`` and/or ``file_reader.batch_size``
46 | might help if you have tons of files or very large files.
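
For example, a sketch of these settings (the values below are purely illustrative and depend on your workload):

.. sourcecode:: bash

    tasks.max=4
    policy.batch_size=200
    file_reader.batch_size=1000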
47 |
48 | **I removed a file from the FS but the connector is still sending messages
49 | with the contents of that file.**
50 |
51 | This is a tricky issue. The file reader is an iterator and processes
52 | record by record, but part of the file is buffered and, even though the
53 | file was removed from the FS, the file reader keeps producing records
54 | until it throws an exception. It's just a matter of time.
55 |
56 | But the main thing is that you don't have to worry about removing files
57 | from the FS when they are being processed. The connector tolerates errors
58 | when reading files and continues with the next file.
59 |
--------------------------------------------------------------------------------
/docs/source/filereaders.rst:
--------------------------------------------------------------------------------
1 | Parquet
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Reads files with `Parquet `__ format.
5 |
6 | The reader takes advantage of the Parquet-Avro API and uses the Parquet file
7 | as if it was an Avro file, so the message sent to Kafka is built in the same
8 | way as the Avro file reader does.
9 |
10 | More information about properties of this file reader :ref:`here`.
11 |
12 | Avro
13 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
14 |
15 | Files with `Avro `__ format can be read with this reader.
16 |
17 | The Avro schema is not needed because it is read from the file. The message sent
18 | to Kafka is created by transforming the record by means of the
19 | `Confluent avro-converter `__
20 | API.
21 |
22 | More information about properties of this file reader :ref:`here`.
23 |
24 | ORC
25 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
26 |
27 | `ORC files `__ are a self-describing type-aware
28 | columnar file format designed for Hadoop workloads.
29 |
30 | This reader can process this file format, translating its schema and building
31 | a Kafka message with the content.
32 |
33 | .. warning:: If you have ORC files with ``union`` data types, these data
34 | types will be transformed into a ``map`` object in the Kafka message.
35 | The value of each key will be ``fieldN``, where ``N`` represents
36 | the index within the data type.
37 |
38 | More information about properties of this file reader :ref:`here`.
39 |
40 | SequenceFile
41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42 |
43 | `Sequence files `__ are a Hadoop file format in which data is
44 | serialized in key-value pairs.
45 |
46 | This reader can process this file format and build a Kafka message with the
47 | key-value pair. These two values are named ``key`` and ``value`` in the message
48 | by default but you can customize these field names.
49 |
50 | More information about properties of this file reader :ref:`here`.
51 |
52 | Cobol
53 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
54 |
55 | Mainframe files (Cobol / EBCDIC binary files) can be processed with this reader which uses the
56 | `Cobrix `__ parser.
57 |
58 | By means of the corresponding copybook (representing its schema), it parses each record and
59 | translates it into a Kafka message with the schema.
60 |
61 | More information about properties of this file reader :ref:`here`.
62 |
63 | Binary
64 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
65 |
66 | All other kinds of binary files can be ingested using this reader.
67 |
68 | It just extracts the content plus some metadata such as: path, file owner, file group, length, access time,
69 | and modification time.
70 |
71 | Each message will contain the following schema:
72 |
73 | * ``path``: File path (string).
74 | * ``owner``: Owner of the file (string).
75 | * ``group``: Group associated with the file (string).
76 | * ``length``: Length of this file, in bytes (long).
77 | * ``access_time``: Access time of the file (long).
78 | * ``modification_time``: Modification time of the file (long).
79 | * ``content``: Content of the file (bytes).
80 |
81 | More information about properties of this file reader :ref:`here`.
82 |
83 | CSV
84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85 |
86 | CSV file reader using a custom token to distinguish different columns in each line.
87 |
88 | It allows distinguishing a header row in the files and setting the names of the columns
89 | in the message sent to Kafka. If there is no header, the value of each column will be in
90 | the field named ``column_N`` (**N** represents the column index) in the message.
91 | Also, the token delimiter for columns is configurable.
92 |
93 | This reader is based on the `Univocity CSV parser `__.
94 |
95 | More information about properties of this file reader :ref:`here`.
96 |
97 | TSV
98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
99 |
100 | TSV file reader using a tab ``\t`` to distinguish different columns in each line.
101 |
102 | Its behaviour is the same as the CSV file reader's regarding the header and the column names.
103 |
104 | This reader is based on the `Univocity TSV parser `__.
105 |
106 | More information about properties of this file reader :ref:`here`.
107 |
108 | FixedWidth
109 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
110 |
111 | FixedWidth is a plain text file reader which distinguishes each column based on the length of each field.
112 |
113 | Its behaviour is the same as the CSV / TSV file readers' regarding the header and the column names.
114 |
115 | This reader is based on the `Univocity Fixed-Width parser `__.
116 |
117 | More information about properties of this file reader :ref:`here`.
118 |
119 | JSON
120 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121 |
122 | Reads JSON files, which may contain any number of fields with their specified
123 | data types. The schema for these records is inferred from the first record,
124 | and all the fields it contains are marked as optional in the schema.
125 |
126 | More information about properties of this file reader :ref:`here`.
127 |
128 | XML
129 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
130 |
131 | Reads XML files, which may contain any number of fields with their specified
132 | data types. The schema for these records is inferred from the first record,
133 | and all the fields it contains are marked as optional in the schema.
134 |
135 | .. warning:: Take into account the current
136 | `limitations `__.
137 |
138 | More information about properties of this file reader :ref:`here`.
139 |
140 | YAML
141 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
142 |
143 | Reads YAML files, which may contain any number of fields with their specified
144 | data types. The schema for these records is inferred from the first record,
145 | and all the fields it contains are marked as optional in the schema.
146 |
147 | More information about properties of this file reader :ref:`here`.
148 |
149 | Text
150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
151 |
152 | Reads plain text files.
153 |
154 | Each line represents one record (by default), which will be placed in a field
155 | named ``value`` in the message sent to Kafka, though you can
156 | customize this field name.
157 |
158 | More information about properties of this file reader :ref:`here`.
159 |
160 | Agnostic
161 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
162 |
163 | Actually, this reader is a wrapper around the readers listed above.
164 |
165 | It tries to read any kind of file format using an internal reader chosen by the file extension,
166 | applying the proper one (Parquet, Avro, ORC, SequenceFile, Cobol / EBCDIC, CSV, TSV, FixedWidth, JSON, XML,
167 | YAML, or Text). If no extension matches, the Text file reader is applied (see the example after the extension list below).
168 |
169 | Default extensions for each format (configurable):
170 |
171 | * Parquet: ``.parquet``
172 | * Avro: ``.avro``
173 | * ORC: ``.orc``
174 | * SequenceFile: ``.seq``
175 | * Cobol / EBCDIC: ``.dat``
176 | * Other binary files: ``.bin``
177 | * CSV: ``.csv``
178 | * TSV: ``.tsv``
179 | * FixedWidth: ``.fixed``
180 | * JSON: ``.json``
181 | * XML: ``.xml``
182 | * YAML: ``.yaml``
183 | * Text: any other sort of file extension.
184 |
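For example, the following sketch (reusing properties shown earlier in this documentation) lets a single
connector ingest a directory with mixed formats, delegating the choice of the internal reader to the file extension:

.. sourcecode:: bash

    policy.regexp=.*
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader
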
185 | More information about properties of this file reader :ref:`here`.
186 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. kafka-connect-fs documentation master file, created by Mario Molina
2 |
3 | ********************************************
4 | Kafka Connect FileSystem Connector
5 | ********************************************
6 |
7 | Kafka Connect FileSystem Connector is a source connector for reading records from
8 | files in the specified file systems and loading them into Kafka.
9 |
10 | The connector supports:
11 |
12 | * Several sorts of file systems (FS) to use.
13 | * Dynamic and static URIs to ingest data from.
14 | * Policies to define rules about how to look for files and clean them up after processing.
15 | * File readers to parse and read different kinds of file formats.
16 |
17 | To learn more about the connector you can read :ref:`this section` and for more detailed
18 | configuration options you can read :ref:`this other one`.
19 |
20 | Also, you can download the source code from `here `__.
21 |
22 | Contents
23 | ============================================
24 |
25 | .. toctree::
26 | :maxdepth: 2
27 |
28 | connector
29 | config_options
30 | faq
31 |
--------------------------------------------------------------------------------
/docs/source/policies.rst:
--------------------------------------------------------------------------------
1 | Simple
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | This policy just filters and processes the files included in the corresponding URIs once.
5 |
6 | .. attention:: This policy is intended mainly for testing purposes.
7 |
8 | Sleepy
9 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
10 |
11 | The behaviour of this policy is similar to the Simple policy, but on each execution it sleeps
12 | and waits for the next one. Additionally, its custom properties allow ending it.
13 |
14 | You can learn more about the properties of this policy :ref:`here`.
15 |
16 | Cron
17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18 |
19 | This policy is scheduled based on cron expressions; the expression format used in the configuration
20 | is the one defined by the `Quartz Scheduler `__ library.
21 |
22 | After finishing each execution, the policy sleeps until the next scheduled one, if applicable.
23 |
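For illustration, a sketch of a cron-scheduled configuration that fires every five minutes; the property key
``policy.cron.expression`` is an assumption here, so check the configuration options section for the exact name:

.. sourcecode:: bash

    policy.class=com.github.mmolimar.kafka.connect.fs.policy.CronPolicy
    # Assumed key for the Quartz expression: fires at second 0 of every 5th minute
    policy.cron.expression=0 0/5 * * * ?
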
24 | You can learn more about the properties of this policy :ref:`here`.
25 |
26 | HDFS file watcher
27 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
28 |
29 | It uses Hadoop notification events: all create/append/rename/close events will be reported
30 | as files to be ingested.
31 |
32 | Just use it when you have HDFS URIs.
33 |
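A minimal sketch of the relevant properties for this policy (host and path taken from the examples in this
documentation; adjust them to your cluster):

.. sourcecode:: bash

    fs.uris=hdfs://localhost:8020/data
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy
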
34 | You can learn more about the properties of this policy :ref:`here`.
35 |
36 | .. attention:: The URIs included in the general property ``fs.uris`` will be filtered and only those
37 | that start with the prefix ``hdfs://`` will be watched. Also, this policy
38 | will only work for Hadoop versions 2.6.0 or higher.
39 |
40 | S3 event notifications
41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42 |
43 | It uses S3 event notifications to process files which have been created or modified in S3.
44 | These notifications will be read from an AWS SQS queue and they can be sent to SQS directly from S3 or via
45 | AWS SNS, either as an SNS notification or a raw message in the subscription.
46 |
47 | Use it only when you have S3 URIs, and make sure event notifications in the S3 bucket are enabled towards an SNS
48 | topic or an SQS queue.
49 |
50 | You can learn more about the properties of this policy :ref:`here`.
51 |
--------------------------------------------------------------------------------
/src/main/assembly/development.xml:
--------------------------------------------------------------------------------
1 |
5 |
7 | development
8 |
9 | dir
10 |
11 | false
12 |
13 |
14 | share/java/kafka-connect-fs/
15 |
16 |
17 |
--------------------------------------------------------------------------------
/src/main/assembly/package.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 | package
7 |
8 | dir
9 |
10 | false
11 |
12 |
13 | ${project.basedir}
14 | share/doc/${project.name}/
15 |
16 | README*
17 | LICENSE*
18 | NOTICE*
19 | licenses/
20 |
21 |
22 |
23 | ${project.basedir}/config
24 | etc/${project.name}
25 |
26 | *
27 |
28 |
29 |
30 |
31 |
32 | share/java/${project.name}
33 | true
34 | true
35 |
36 | org.apache.kafka:connect-api
37 | org.mortbay.jetty:*
38 | com.sun.jersey:*
39 | org.eclipse.jetty:jetty-util
40 | com.sun.jersey.contribs:jersey-guice
41 | org.apache.zookeeper:zookeeper
42 | log4j:log4j
43 | org.slf4j:slf4j-api
44 | org.slf4j:slf4j-log4j12
45 | javax.servlet:servlet-api
46 | javax.servlet.jsp:jsp-api
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/src/main/assembly/standalone.xml:
--------------------------------------------------------------------------------
1 |
5 |
7 | standalone
8 |
9 | jar
10 |
11 | false
12 |
13 |
14 | ${project.basedir}
15 | /
16 |
17 | README*
18 | LICENSE*
19 | NOTICE*
20 | licenses.html
21 | licenses/
22 | notices/
23 |
24 |
25 |
26 |
27 |
28 | /
29 | true
30 | true
31 | runtime
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java:
--------------------------------------------------------------------------------
1 | package com.github.mmolimar.kafka.connect.fs;
2 |
3 | import com.github.mmolimar.kafka.connect.fs.util.Version;
4 | import org.apache.kafka.common.config.ConfigDef;
5 | import org.apache.kafka.common.config.ConfigException;
6 | import org.apache.kafka.connect.connector.Task;
7 | import org.apache.kafka.connect.errors.ConnectException;
8 | import org.apache.kafka.connect.source.SourceConnector;
9 | import org.apache.kafka.connect.util.ConnectorUtils;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.util.ArrayList;
14 | import java.util.HashMap;
15 | import java.util.List;
16 | import java.util.Map;
17 |
18 | public class FsSourceConnector extends SourceConnector {
19 |
20 | private static Logger log = LoggerFactory.getLogger(FsSourceConnector.class);
21 |
22 | private FsSourceConnectorConfig config;
23 |
24 | @Override
25 | public String version() {
26 | return Version.getVersion();
27 | }
28 |
29 | @Override
30 | public void start(Map<String, String> properties) {
31 | log.info("{} Starting connector...", this);
32 | try {
33 | config = new FsSourceConnectorConfig(properties);
34 | } catch (ConfigException ce) {
35 | throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce);
36 | } catch (Exception ce) {
37 | throw new ConnectException("An error has occurred when starting FsSourceConnector." + ce);
38 | }
39 | }
40 |
41 | @Override
42 | public Class<? extends Task> taskClass() {
43 | return FsSourceTask.class;
44 | }
45 |
46 | @Override
47 | public List