├── README.md ├── pom.xml └── src ├── main └── java │ └── com │ └── urey │ └── flume │ ├── MultiLineExecSource.java │ └── MultiLineExecSourceConfigurationConstants.java └── test └── java └── com └── urey └── flume └── AppTest.java /README.md: -------------------------------------------------------------------------------- 1 | # Flume Plugin: MultiLineExecSource 2 | 3 | Flume-NG's ExecSource turns every line of a log file (e.g. xxx.log) into one Flume event. By default, each event ends at the end of a line ('$'). In some situations, however, a single log entry spans multiple lines; error logs, for instance, are mostly multiline because of stack traces. So I have developed MultiLineExecSource, which is based on ExecSource. 4 | 5 | **NOTE 1: The MultiLineExecSource plugin is built for Flume-NG and will not work on Flume-OG** 6 | 7 | **NOTE 2: It lacks comprehensive test coverage. Contributions that make it more stable and useful are of course welcome** 8 | 9 | ## Compilation 10 | 11 | The project is built with [Maven](http://maven.apache.org/). 12 | 13 | ## Installation instructions 14 | 15 | After compiling, copy the resulting jar `flume-source-plugin-1.0-SNAPSHOT.jar` to `$FLUME_HOME/flume-ng/lib/`. Then edit flume.conf to use the MultiLineExecSource instead of the default ExecSource. 16 | 17 | A brief overview of MultiLineExecSource with usage instructions follows. 18 | 19 | ## Sources 20 | 21 | ### MultiLineExecSource 22 | 23 | The MultiLineExecSource generates one Flume event from multiple lines of the log. It inspects every line to see whether it starts with a marker that signals a new log entry. The marker is defined by a regular expression. 24 | 25 | For instance, an entry in the HDFS datanode log usually starts with a timestamp such as '2016-03-18 17:53:40,278'. It can be expressed with the regex '\s?\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d,\d\d\d'. MultiLineExecSource checks every line against this regex. If a line starts with a match, it begins a new log entry. 
Otherwise, it belongs to the previous line. 26 | 27 | The MultiLineExecSource is based on the regular exec source and includes the same parameters. It also adds one additional one: 28 | 29 | * **lineStartRegex**: It is used to distinguish every line. 30 | 31 | 32 | Example config: 33 | 34 | ``` 35 | agent.sources.hdfs_namenode_src.type = com.urey.flume.MultiLineExecSource 36 | agent.sources.hdfs_namenode_src.lineStartRegex = \\s?\\d\\d\\d\\d-\\d\\d-\\d\\d\\s\\d\\d:\\d\\d:\\d\\d,\\d\\d\\d 37 | ``` 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.urey.flume 6 | flume-source-plugin 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | flume-source-plugin 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | org.apache.flume 27 | flume-ng-core 28 | 1.6.0 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/java/com/urey/flume/MultiLineExecSource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package com.urey.flume; 21 | 22 | import com.google.common.base.Preconditions; 23 | import com.google.common.util.concurrent.ThreadFactoryBuilder; 24 | import org.apache.flume.Context; 25 | import org.apache.flume.Event; 26 | import org.apache.flume.EventDrivenSource; 27 | import org.apache.flume.SystemClock; 28 | import org.apache.flume.channel.ChannelProcessor; 29 | import org.apache.flume.conf.Configurable; 30 | import org.apache.flume.event.EventBuilder; 31 | import org.apache.flume.instrumentation.SourceCounter; 32 | import org.apache.flume.source.AbstractSource; 33 | import org.apache.flume.source.ExecSourceConfigurationConstants; 34 | import org.slf4j.Logger; 35 | import org.slf4j.LoggerFactory; 36 | 37 | import java.io.BufferedReader; 38 | import java.io.IOException; 39 | import java.io.InputStreamReader; 40 | 41 | import java.nio.charset.Charset; 42 | import java.util.ArrayList; 43 | import java.util.List; 44 | import java.util.concurrent.*; 45 | import java.util.regex.Matcher; 46 | import java.util.regex.Pattern; 47 | 48 | /** 49 | *

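50 | * A {@link org.apache.flume.Source} implementation that executes a Unix process and groups its 51 | * output lines into events: a line that matches the configured lineStartRegex starts a new event, and the lines that follow it are appended to that event until the next match. 52 | *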

53 | *

54 | * This source runs a given Unix command on start up and expects that process to 55 | * continuously produce data on standard out (stderr ignored by default). Unless 56 | * told to restart, if the process exits for any reason, the source also exits and 57 | * will produce no further data. This means configurations such as cat [named pipe] 58 | * or tail -F [file] are going to produce the desired results where as 59 | * date will probably not - the former two commands produce streams of 60 | * data where as the latter produces a single event and exits. 61 | *

62 | *

63 | * The ExecSource is meant for situations where one must integrate with 64 | * existing systems without modifying code. It is a compatibility gateway built 65 | * to allow simple, stop-gap integration and doesn't necessarily offer all of 66 | * the benefits or guarantees of native integration with Flume. If one has the 67 | * option of using the AvroSource, for instance, that would be greatly 68 | * preferred to this source as it (and similarly implemented sources) can 69 | * maintain the transactional guarantees that exec can not. 70 | *

71 | *

72 | * Why doesn't ExecSource offer transactional guarantees? 73 | *

74 | *

75 | * The problem with ExecSource and other asynchronous sources is that 76 | * the source can not guarantee that if there is a failure to put the event into 77 | * the {@link org.apache.flume.Channel} the client knows about it. As a for instance, one of the 78 | * most commonly requested features is the tail -F [file]-like use case 79 | * where an application writes to a log file on disk and Flume tails the file, 80 | * sending each line as an event. While this is possible, there's an obvious 81 | * problem; what happens if the channel fills up and Flume can't send an event? 82 | * Flume has no way of indicating to the application writing the log file that 83 | * it needs to retain the log or that the event hasn't been sent, for some 84 | * reason. If this doesn't make sense, you need only know this: Your 85 | * application can never guarantee data has been received when using a 86 | * unidirectional asynchronous interface such as ExecSource! As an extension 87 | * of this warning - and to be completely clear - there is absolutely zero 88 | * guarantee of event delivery when using this source. You have been warned. 89 | *

90 | *

91 | * Configuration options 92 | *

93 | * 94 | * 95 | * 96 | * 97 | * 98 | * 99 | * 100 | * 101 | * 102 | * 103 | * 104 | * 105 | * 106 | * 107 | * 108 | * 109 | * 110 | * 111 | * 112 | * 113 | * 114 | * 115 | * 116 | * 117 | * 118 | * 119 | * 120 | * 121 | * 122 | * 123 | * 124 | * 125 | * 126 | * 127 | * 128 | * 129 | * 130 | * 131 | * 132 | * 133 | * 134 | * 135 | * 136 | *
Parameter | Description | Unit / Type | Default
command | The command to execute | String | none (required)
restart | Whether to restart the command when it exits | Boolean | false
restartThrottle | How long in milliseconds to wait before restarting the command | Long | 10000
logStderr | Whether to log or discard the standard error stream of the command | Boolean | false
batchSize | The number of events to commit to channel at a time. | integer | 20
batchTimeout | Amount of time (in milliseconds) to wait, if the buffer size was not reached, before data is pushed downstream. | long | 3000
lineStartRegex | Regular expression that marks the first line of a new event | String | \s?\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d,\d\d\d
137 | *

138 | * Metrics 139 | *

140 | *

141 | * TODO 142 | *

143 | */ 144 | /** 145 | * Created by ureyqiao on 2016/3/21. 146 | * contact me: qiaowei@pku.edu.cn 147 | */ 148 | public class MultiLineExecSource extends AbstractSource implements EventDrivenSource, Configurable { 149 | 150 | private static final Logger logger = LoggerFactory.getLogger(MultiLineExecSource.class); 151 | 152 | private String shell; 153 | private String command; 154 | private SourceCounter sourceCounter; 155 | private ExecutorService executor; 156 | private Future runnerFuture; 157 | private long restartThrottle; 158 | private boolean restart; 159 | private boolean logStderr; 160 | private Integer bufferCount; 161 | private long batchTimeout; 162 | private ExecRunnable runner; 163 | private Charset charset; 164 | 165 | private String regex; 166 | 167 | @Override 168 | public void start() { 169 | logger.info("Exec source starting with command:{}", command); 170 | 171 | executor = Executors.newSingleThreadExecutor(); 172 | 173 | runner = new ExecRunnable(shell, command, getChannelProcessor(), sourceCounter, 174 | restart, restartThrottle, logStderr, bufferCount, batchTimeout, charset, regex); 175 | 176 | // FIXME: Use a callback-like executor / future to signal us upon failure. 177 | runnerFuture = executor.submit(runner); 178 | 179 | /* 180 | * NB: This comes at the end rather than the beginning of the method because 181 | * it sets our state to running. We want to make sure the executor is alive 182 | * and well first. 
183 | */ 184 | sourceCounter.start(); 185 | super.start(); 186 | 187 | logger.debug("Exec source started"); 188 | } 189 | 190 | @Override 191 | public void stop() { 192 | logger.info("Stopping exec source with command:{}", command); 193 | if(runner != null) { 194 | runner.setRestart(false); 195 | runner.kill(); 196 | } 197 | 198 | if (runnerFuture != null) { 199 | logger.debug("Stopping exec runner"); 200 | runnerFuture.cancel(true); 201 | logger.debug("Exec runner stopped"); 202 | } 203 | executor.shutdown(); 204 | 205 | while (!executor.isTerminated()) { 206 | logger.debug("Waiting for exec executor service to stop"); 207 | try { 208 | executor.awaitTermination(500, TimeUnit.MILLISECONDS); 209 | } catch (InterruptedException e) { 210 | logger.debug("Interrupted while waiting for exec executor service " 211 | + "to stop. Just exiting."); 212 | Thread.currentThread().interrupt(); 213 | } 214 | } 215 | 216 | sourceCounter.stop(); 217 | super.stop(); 218 | 219 | logger.debug("Exec source with command:{} stopped. 
Metrics:{}", command, 220 | sourceCounter); 221 | } 222 | 223 | @Override 224 | public void configure(Context context) { 225 | command = context.getString("command"); 226 | 227 | Preconditions.checkState(command != null, 228 | "The parameter command must be specified"); 229 | 230 | restartThrottle = context.getLong(ExecSourceConfigurationConstants.CONFIG_RESTART_THROTTLE, 231 | ExecSourceConfigurationConstants.DEFAULT_RESTART_THROTTLE); 232 | 233 | restart = context.getBoolean(ExecSourceConfigurationConstants.CONFIG_RESTART, 234 | ExecSourceConfigurationConstants.DEFAULT_RESTART); 235 | 236 | logStderr = context.getBoolean(ExecSourceConfigurationConstants.CONFIG_LOG_STDERR, 237 | ExecSourceConfigurationConstants.DEFAULT_LOG_STDERR); 238 | 239 | bufferCount = context.getInteger(ExecSourceConfigurationConstants.CONFIG_BATCH_SIZE, 240 | ExecSourceConfigurationConstants.DEFAULT_BATCH_SIZE); 241 | 242 | batchTimeout = context.getLong(ExecSourceConfigurationConstants.CONFIG_BATCH_TIME_OUT, 243 | ExecSourceConfigurationConstants.DEFAULT_BATCH_TIME_OUT); 244 | 245 | charset = Charset.forName(context.getString(ExecSourceConfigurationConstants.CHARSET, 246 | ExecSourceConfigurationConstants.DEFAULT_CHARSET)); 247 | 248 | shell = context.getString(ExecSourceConfigurationConstants.CONFIG_SHELL, null); 249 | 250 | regex = context.getString(MultiLineExecSourceConfigurationConstants.REGEX, MultiLineExecSourceConfigurationConstants.DEFAULT_REGEX); 251 | 252 | if (sourceCounter == null) { 253 | sourceCounter = new SourceCounter(getName()); 254 | } 255 | } 256 | 257 | private static class ExecRunnable implements Runnable { 258 | 259 | public ExecRunnable(String shell, String command, ChannelProcessor channelProcessor, 260 | SourceCounter sourceCounter, boolean restart, long restartThrottle, 261 | boolean logStderr, int bufferCount, long batchTimeout, Charset charset, String regex) { 262 | this.command = command; 263 | this.channelProcessor = channelProcessor; 264 | 
this.sourceCounter = sourceCounter; 265 | this.restartThrottle = restartThrottle; 266 | this.bufferCount = bufferCount; 267 | this.batchTimeout = batchTimeout; 268 | this.restart = restart; 269 | this.logStderr = logStderr; 270 | this.charset = charset; 271 | this.shell = shell; 272 | this.regex = regex; 273 | this.pattern = Pattern.compile(regex); 274 | } 275 | 276 | private final String shell; 277 | private final String command; 278 | private final ChannelProcessor channelProcessor; 279 | private final SourceCounter sourceCounter; 280 | private volatile boolean restart; 281 | private final long restartThrottle; 282 | private final int bufferCount; 283 | private long batchTimeout; 284 | private final boolean logStderr; 285 | private final Charset charset; 286 | private Process process = null; 287 | private SystemClock systemClock = new SystemClock(); 288 | private Long lastPushToChannel = systemClock.currentTimeMillis(); 289 | ScheduledExecutorService timedFlushService; 290 | ScheduledFuture future; 291 | ///multiline setting start 292 | private String regex; 293 | private Pattern pattern; 294 | List buffer = new ArrayList(); 295 | //multiline setting end 296 | 297 | @Override 298 | public void run() { 299 | do { 300 | String exitCode = "unknown"; 301 | BufferedReader reader = null; 302 | String line = null; 303 | final List eventList = new ArrayList(); 304 | 305 | timedFlushService = Executors.newSingleThreadScheduledExecutor( 306 | new ThreadFactoryBuilder().setNameFormat( 307 | "timedFlushExecService" + 308 | Thread.currentThread().getId() + "-%d").build()); 309 | try { 310 | if(shell != null) { 311 | String[] commandArgs = formulateShellCommand(shell, command); 312 | process = Runtime.getRuntime().exec(commandArgs); 313 | } else { 314 | String[] commandArgs = command.split("\\s+"); 315 | process = new ProcessBuilder(commandArgs).start(); 316 | } 317 | reader = new BufferedReader( 318 | new InputStreamReader(process.getInputStream(), charset)); 319 | 320 | // 
StderrLogger dies as soon as the input stream is invalid 321 | StderrReader stderrReader = new StderrReader(new BufferedReader( 322 | new InputStreamReader(process.getErrorStream(), charset)), logStderr); 323 | stderrReader.setName("StderrReader-[" + command + "]"); 324 | stderrReader.setDaemon(true); 325 | stderrReader.start(); 326 | 327 | future = timedFlushService.scheduleWithFixedDelay(new Runnable() { 328 | @Override 329 | public void run() { 330 | try { 331 | synchronized (eventList) { 332 | if(!eventList.isEmpty() && timeout()) { 333 | flushEventBatch(eventList); 334 | } 335 | } 336 | } catch (Exception e) { 337 | logger.error("Exception occured when processing event batch", e); 338 | if(e instanceof InterruptedException) { 339 | Thread.currentThread().interrupt(); 340 | } 341 | } 342 | } 343 | }, 344 | batchTimeout, batchTimeout, TimeUnit.MILLISECONDS); 345 | 346 | while ((line = reader.readLine()) != null) { 347 | synchronized (eventList) { 348 | //multiline setting start 349 | Matcher m = pattern.matcher(line); 350 | if(m.find()) { 351 | if(buffer.size() != 0) { 352 | //write to body 353 | sourceCounter.incrementEventReceivedCount(); 354 | String total = ""; 355 | for(int i = 0; i < buffer.size(); ++i) { 356 | total += buffer.get(i); 357 | } 358 | eventList.add(EventBuilder.withBody(total.getBytes(charset))); 359 | if(eventList.size() >= bufferCount || timeout()) { 360 | flushEventBatch(eventList); 361 | } 362 | buffer.clear(); 363 | } 364 | buffer.add(line); 365 | }else { 366 | buffer.add(line); 367 | } 368 | //multiline setting end 369 | } 370 | } 371 | 372 | synchronized (eventList) { 373 | if(!buffer.isEmpty()) { 374 | sourceCounter.incrementEventReceivedCount(); 375 | //multiline setting start 376 | String total = ""; 377 | for(int i = 0; i < buffer.size(); ++i) { 378 | total += buffer.get(i); 379 | } 380 | buffer.clear(); 381 | eventList.add(EventBuilder.withBody(total.getBytes(charset))); 382 | //multiline setting end 383 | } 384 | 
if(!eventList.isEmpty()) { 385 | flushEventBatch(eventList); 386 | } 387 | } 388 | } catch (Exception e) { 389 | logger.error("Failed while running command: " + command, e); 390 | if(e instanceof InterruptedException) { 391 | Thread.currentThread().interrupt(); 392 | } 393 | } finally { 394 | if (reader != null) { 395 | try { 396 | reader.close(); 397 | } catch (IOException ex) { 398 | logger.error("Failed to close reader for exec source", ex); 399 | } 400 | } 401 | exitCode = String.valueOf(kill()); 402 | } 403 | if(restart) { 404 | logger.info("Restarting in {}ms, exit code {}", restartThrottle, 405 | exitCode); 406 | try { 407 | Thread.sleep(restartThrottle); 408 | } catch (InterruptedException e) { 409 | Thread.currentThread().interrupt(); 410 | } 411 | } else { 412 | logger.info("Command [" + command + "] exited with " + exitCode); 413 | } 414 | } while(restart); 415 | } 416 | 417 | private void flushEventBatch(List eventList){ 418 | channelProcessor.processEventBatch(eventList); 419 | sourceCounter.addToEventAcceptedCount(eventList.size()); 420 | eventList.clear(); 421 | lastPushToChannel = systemClock.currentTimeMillis(); 422 | } 423 | 424 | private boolean timeout(){ 425 | return (systemClock.currentTimeMillis() - lastPushToChannel) >= batchTimeout; 426 | } 427 | 428 | private static String[] formulateShellCommand(String shell, String command) { 429 | String[] shellArgs = shell.split("\\s+"); 430 | String[] result = new String[shellArgs.length + 1]; 431 | System.arraycopy(shellArgs, 0, result, 0, shellArgs.length); 432 | result[shellArgs.length] = command; 433 | return result; 434 | } 435 | 436 | public int kill() { 437 | if(process != null) { 438 | synchronized (process) { 439 | process.destroy(); 440 | 441 | try { 442 | int exitValue = process.waitFor(); 443 | 444 | // Stop the Thread that flushes periodically 445 | if (future != null) { 446 | future.cancel(true); 447 | } 448 | 449 | if (timedFlushService != null) { 450 | timedFlushService.shutdown(); 451 
| while (!timedFlushService.isTerminated()) { 452 | try { 453 | timedFlushService.awaitTermination(500, TimeUnit.MILLISECONDS); 454 | } catch (InterruptedException e) { 455 | logger.debug("Interrupted while waiting for exec executor service " 456 | + "to stop. Just exiting."); 457 | Thread.currentThread().interrupt(); 458 | } 459 | } 460 | } 461 | return exitValue; 462 | } catch (InterruptedException ex) { 463 | Thread.currentThread().interrupt(); 464 | } 465 | } 466 | return Integer.MIN_VALUE; 467 | } 468 | return Integer.MIN_VALUE / 2; 469 | } 470 | public void setRestart(boolean restart) { 471 | this.restart = restart; 472 | } 473 | } 474 | private static class StderrReader extends Thread { 475 | private BufferedReader input; 476 | private boolean logStderr; 477 | 478 | protected StderrReader(BufferedReader input, boolean logStderr) { 479 | this.input = input; 480 | this.logStderr = logStderr; 481 | } 482 | 483 | @Override 484 | public void run() { 485 | try { 486 | int i = 0; 487 | String line = null; 488 | while((line = input.readLine()) != null) { 489 | if(logStderr) { 490 | // There is no need to read 'line' with a charset 491 | // as we do not to propagate it. 492 | // It is in UTF-16 and would be printed in UTF-8 format. 
493 | logger.info("StderrLogger[{}] = '{}'", ++i, line); 494 | } 495 | } 496 | } catch (IOException e) { 497 | logger.info("StderrLogger exiting", e); 498 | } finally { 499 | try { 500 | if(input != null) { 501 | input.close(); 502 | } 503 | } catch (IOException ex) { 504 | logger.error("Failed to close stderr reader for exec source", ex); 505 | } 506 | } 507 | } 508 | } 509 | 510 | } 511 | -------------------------------------------------------------------------------- /src/main/java/com/urey/flume/MultiLineExecSourceConfigurationConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package com.urey.flume; 21 | 22 | /** 23 | * Created by ureyqiao on 2016/3/21. 
24 | * contact me: qiaowei@pku.edu.cn 25 | */
public class MultiLineExecSourceConfigurationConstants {

    /**
     * Configuration key of the regular expression that marks the first line of a log entry.
     * A line matching it starts a new Flume event; other lines are appended to the current one.
     */
    public static final String REGEX = "lineStartRegex";

    /**
     * Pattern used when {@link #REGEX} is not configured: an optional whitespace character
     * followed by a timestamp such as {@code 2016-03-18 17:53:40,278}.
     */
    public static final String DEFAULT_REGEX =
        "\\s?\\d\\d\\d\\d-\\d\\d-\\d\\d\\s\\d\\d:\\d\\d:\\d\\d,\\d\\d\\d";
}
--------------------------------------------------------------------------------
/src/test/java/com/urey/flume/AppTest.java:
--------------------------------------------------------------------------------
package com.urey.flume;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

/**
 * Placeholder JUnit 3 test case generated with the project skeleton; it does not exercise the
 * source yet.
 */
public class AppTest extends TestCase {

    /**
     * Creates the test case.
     *
     * @param testName name of the test case
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return a suite containing every test method of this class
     */
    public static Test suite() {
        return new TestSuite(AppTest.class);
    }

    /**
     * Trivial check that always passes; it only proves the test harness runs.
     */
    public void testApp() {
        assertTrue(true);
    }
}
--------------------------------------------------------------------------------