├── README.md ├── pom.xml └── src ├── main └── java │ └── com │ └── urey │ └── flume │ ├── MultiLineExecSource.java │ └── MultiLineExecSourceConfigurationConstants.java └── test └── java └── com └── urey └── flume └── AppTest.java /README.md: -------------------------------------------------------------------------------- 1 | # Flume Plugin: MultiLineExecSource 2 | 3 | Flume-NG's ExecSource is aimed at collecting every line in xxx.log as a Flume event. Each line is ended with '$' (end of line) by default. But in some situations a single log entry spans multiple lines; for instance, error logs are mostly multi-line because of stack traces. So I have developed MultiLineExecSource, which is based on ExecSource. 4 | 5 | **NOTE 1: MultiLineExecSource plugin is built for Flume-NG and will not work on Flume-OG** 6 | 7 | **NOTE 2: It lacks comprehensive test coverage. Of course contributions are welcome to make it more stable and useful** 8 | 9 | ## Compilation 10 | 11 | The project is built with [Maven](http://maven.apache.org/). 12 | 13 | ## Installation instructions 14 | 15 | After compilation, copy the target jar `flume-source-plugin-1.0-SNAPSHOT.jar` to `$FLUME_HOME/flume-ng/lib/`. Then you can edit flume.conf to use the MultiLineExecSource instead of the default ExecSource. 16 | 17 | Now follows a brief overview of MultiLineExecSource with usage instructions. 18 | 19 | ## Sources 20 | 21 | ### MultiLineExecSource 22 | 23 | The MultiLineExecSource is used for generating one Flume event which is composed of multiple lines in the log. It inspects every line to see whether it starts with a marker that signals a new line (i.e. a new log entry). The marker is described by a regex. 24 | 25 | For instance, entries in the HDFS datanode log usually start with a timestamp such as '2016-03-18 17:53:40,278'. It can be expressed with the regex '\s?\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d,\d\d\d'. So MultiLineExecSource will distinguish every line with this regex. If a line starts with it, it is a new line. 
Otherwise, it belongs to the previous line. 26 | 27 | The MultiLineExecSource is based on the regular exec source and includes the same parameters. It also adds one additional one: 28 | 29 | * **lineStartRegex**: It is used to distinguish every line. 30 | 31 | 32 | Example config: 33 | 34 | ``` 35 | agent.sources.hdfs_namenode_src.type = com.urey.flume.MultiLineExecSource 36 | agent.sources.hdfs_namenode_src.lineStartRegex = \\s?\\d\\d\\d\\d-\\d\\d-\\d\\d\\s\\d\\d:\\d\\d:\\d\\d,\\d\\d\\d 37 | ``` 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.urey.flume 6 | flume-source-plugin 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | flume-source-plugin 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | org.apache.flume 27 | flume-ng-core 28 | 1.6.0 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/java/com/urey/flume/MultiLineExecSource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package com.urey.flume; 21 | 22 | import com.google.common.base.Preconditions; 23 | import com.google.common.util.concurrent.ThreadFactoryBuilder; 24 | import org.apache.flume.Context; 25 | import org.apache.flume.Event; 26 | import org.apache.flume.EventDrivenSource; 27 | import org.apache.flume.SystemClock; 28 | import org.apache.flume.channel.ChannelProcessor; 29 | import org.apache.flume.conf.Configurable; 30 | import org.apache.flume.event.EventBuilder; 31 | import org.apache.flume.instrumentation.SourceCounter; 32 | import org.apache.flume.source.AbstractSource; 33 | import org.apache.flume.source.ExecSourceConfigurationConstants; 34 | import org.slf4j.Logger; 35 | import org.slf4j.LoggerFactory; 36 | 37 | import java.io.BufferedReader; 38 | import java.io.IOException; 39 | import java.io.InputStreamReader; 40 | 41 | import java.nio.charset.Charset; 42 | import java.util.ArrayList; 43 | import java.util.List; 44 | import java.util.concurrent.*; 45 | import java.util.regex.Matcher; 46 | import java.util.regex.Pattern; 47 | 48 | /** 49 | *

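50 | * A {@link org.apache.flume.Source} implementation that executes a Unix process and groups its 51 | * output lines into multi-line events: a line matching the configured lineStartRegex begins a new event, and every following line that does not match is appended to that event. 52 | *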

53 | *

54 | * This source runs a given Unix command on start up and expects that process to 55 | * continuously produce data on standard out (stderr ignored by default). Unless 56 | * told to restart, if the process exits for any reason, the source also exits and 57 | * will produce no further data. This means configurations such as cat [named pipe] 58 | * or tail -F [file] are going to produce the desired results whereas 59 | * date will probably not - the former two commands produce streams of 60 | * data whereas the latter produces a single event and exits. 61 | *

62 | *

63 | * The ExecSource is meant for situations where one must integrate with 64 | * existing systems without modifying code. It is a compatibility gateway built 65 | * to allow simple, stop-gap integration and doesn't necessarily offer all of 66 | * the benefits or guarantees of native integration with Flume. If one has the 67 | * option of using the AvroSource, for instance, that would be greatly 68 | * preferred to this source as it (and similarly implemented sources) can 69 | * maintain the transactional guarantees that exec can not. 70 | *

71 | *

72 | * Why doesn't ExecSource offer transactional guarantees? 73 | *

74 | *

75 | * The problem with ExecSource and other asynchronous sources is that 76 | * the source can not guarantee that if there is a failure to put the event into 77 | * the {@link org.apache.flume.Channel} the client knows about it. As a for instance, one of the 78 | * most commonly requested features is the tail -F [file]-like use case 79 | * where an application writes to a log file on disk and Flume tails the file, 80 | * sending each line as an event. While this is possible, there's an obvious 81 | * problem; what happens if the channel fills up and Flume can't send an event? 82 | * Flume has no way of indicating to the application writing the log file that 83 | * it needs to retain the log or that the event hasn't been sent, for some 84 | * reason. If this doesn't make sense, you need only know this: Your 85 | * application can never guarantee data has been received when using a 86 | * unidirectional asynchronous interface such as ExecSource! As an extension 87 | * of this warning - and to be completely clear - there is absolutely zero 88 | * guarantee of event delivery when using this source. You have been warned. 89 | *

90 | *

91 | * Configuration options 92 | *

93 | * 94 | * 95 | * 96 | * 97 | * 98 | * 99 | * 100 | * 101 | * 102 | * 103 | * 104 | * 105 | * 106 | * 107 | * 108 | * 109 | * 110 | * 111 | * 112 | * 113 | * 114 | * 115 | * 116 | * 117 | * 118 | * 119 | * 120 | * 121 | * 122 | * 123 | * 124 | * 125 | * 126 | * 127 | * 128 | * 129 | * 130 | * 131 | * 132 | * 133 | * 134 | * 135 | * 136 | *

Parameter	Description	Unit / Type	Default
`command`	The command to execute	String	none (required)
`restart`	Whether to restart the command when it exits	Boolean	false
`restartThrottle`	How long in milliseconds to wait before restarting the command	Long	10000
`logStderr`	Whether to log or discard the standard error stream of the command	Boolean	false
`batchSize`	The number of events to commit to channel at a time.	integer	20
`batchTimeout`	Amount of time (in milliseconds) to wait, if the buffer size was not reached, before data is pushed downstream.	long	3000

137 | *

138 | * Metrics 139 | *

140 | *

141 | * TODO 142 | *

143 | */ 144 | /** 145 | * Created by ureyqiao on 2016/3/21. 146 | * contact me: qiaowei@pku.edu.cn 147 | */ 148 | public class MultiLineExecSource extends AbstractSource implements EventDrivenSource, Configurable { 149 | 150 | private static final Logger logger = LoggerFactory.getLogger(MultiLineExecSource.class); 151 | 152 | private String shell; 153 | private String command; 154 | private SourceCounter sourceCounter; 155 | private ExecutorService executor; 156 | private Future runnerFuture; 157 | private long restartThrottle; 158 | private boolean restart; 159 | private boolean logStderr; 160 | private Integer bufferCount; 161 | private long batchTimeout; 162 | private ExecRunnable runner; 163 | private Charset charset; 164 | 165 | private String regex; 166 | 167 | @Override 168 | public void start() { 169 | logger.info("Exec source starting with command:{}", command); 170 | 171 | executor = Executors.newSingleThreadExecutor(); 172 | 173 | runner = new ExecRunnable(shell, command, getChannelProcessor(), sourceCounter, 174 | restart, restartThrottle, logStderr, bufferCount, batchTimeout, charset, regex); 175 | 176 | // FIXME: Use a callback-like executor / future to signal us upon failure. 177 | runnerFuture = executor.submit(runner); 178 | 179 | /* 180 | * NB: This comes at the end rather than the beginning of the method because 181 | * it sets our state to running. We want to make sure the executor is alive 182 | * and well first. 
183 | */ 184 | sourceCounter.start(); 185 | super.start(); 186 | 187 | logger.debug("Exec source started"); 188 | } 189 | 190 | @Override 191 | public void stop() { 192 | logger.info("Stopping exec source with command:{}", command); 193 | if(runner != null) { 194 | runner.setRestart(false); 195 | runner.kill(); 196 | } 197 | 198 | if (runnerFuture != null) { 199 | logger.debug("Stopping exec runner"); 200 | runnerFuture.cancel(true); 201 | logger.debug("Exec runner stopped"); 202 | } 203 | executor.shutdown(); 204 | 205 | while (!executor.isTerminated()) { 206 | logger.debug("Waiting for exec executor service to stop"); 207 | try { 208 | executor.awaitTermination(500, TimeUnit.MILLISECONDS); 209 | } catch (InterruptedException e) { 210 | logger.debug("Interrupted while waiting for exec executor service " 211 | + "to stop. Just exiting."); 212 | Thread.currentThread().interrupt(); 213 | } 214 | } 215 | 216 | sourceCounter.stop(); 217 | super.stop(); 218 | 219 | logger.debug("Exec source with command:{} stopped. 
Metrics:{}", command, 220 | sourceCounter); 221 | } 222 | 223 | @Override 224 | public void configure(Context context) { 225 | command = context.getString("command"); 226 | 227 | Preconditions.checkState(command != null, 228 | "The parameter command must be specified"); 229 | 230 | restartThrottle = context.getLong(ExecSourceConfigurationConstants.CONFIG_RESTART_THROTTLE, 231 | ExecSourceConfigurationConstants.DEFAULT_RESTART_THROTTLE); 232 | 233 | restart = context.getBoolean(ExecSourceConfigurationConstants.CONFIG_RESTART, 234 | ExecSourceConfigurationConstants.DEFAULT_RESTART); 235 | 236 | logStderr = context.getBoolean(ExecSourceConfigurationConstants.CONFIG_LOG_STDERR, 237 | ExecSourceConfigurationConstants.DEFAULT_LOG_STDERR); 238 | 239 | bufferCount = context.getInteger(ExecSourceConfigurationConstants.CONFIG_BATCH_SIZE, 240 | ExecSourceConfigurationConstants.DEFAULT_BATCH_SIZE); 241 | 242 | batchTimeout = context.getLong(ExecSourceConfigurationConstants.CONFIG_BATCH_TIME_OUT, 243 | ExecSourceConfigurationConstants.DEFAULT_BATCH_TIME_OUT); 244 | 245 | charset = Charset.forName(context.getString(ExecSourceConfigurationConstants.CHARSET, 246 | ExecSourceConfigurationConstants.DEFAULT_CHARSET)); 247 | 248 | shell = context.getString(ExecSourceConfigurationConstants.CONFIG_SHELL, null); 249 | 250 | regex = context.getString(MultiLineExecSourceConfigurationConstants.REGEX, MultiLineExecSourceConfigurationConstants.DEFAULT_REGEX); 251 | 252 | if (sourceCounter == null) { 253 | sourceCounter = new SourceCounter(getName()); 254 | } 255 | } 256 | 257 | private static class ExecRunnable implements Runnable { 258 | 259 | public ExecRunnable(String shell, String command, ChannelProcessor channelProcessor, 260 | SourceCounter sourceCounter, boolean restart, long restartThrottle, 261 | boolean logStderr, int bufferCount, long batchTimeout, Charset charset, String regex) { 262 | this.command = command; 263 | this.channelProcessor = channelProcessor; 264 | 
this.sourceCounter = sourceCounter; 265 | this.restartThrottle = restartThrottle; 266 | this.bufferCount = bufferCount; 267 | this.batchTimeout = batchTimeout; 268 | this.restart = restart; 269 | this.logStderr = logStderr; 270 | this.charset = charset; 271 | this.shell = shell; 272 | this.regex = regex; 273 | this.pattern = Pattern.compile(regex); 274 | } 275 | 276 | private final String shell; 277 | private final String command; 278 | private final ChannelProcessor channelProcessor; 279 | private final SourceCounter sourceCounter; 280 | private volatile boolean restart; 281 | private final long restartThrottle; 282 | private final int bufferCount; 283 | private long batchTimeout; 284 | private final boolean logStderr; 285 | private final Charset charset; 286 | private Process process = null; 287 | private SystemClock systemClock = new SystemClock(); 288 | private Long lastPushToChannel = systemClock.currentTimeMillis(); 289 | ScheduledExecutorService timedFlushService; 290 | ScheduledFuture future; 291 | ///multiline setting start 292 | private String regex; 293 | private Pattern pattern; 294 | List buffer = new ArrayList(); 295 | //multiline setting end 296 | 297 | @Override 298 | public void run() { 299 | do { 300 | String exitCode = "unknown"; 301 | BufferedReader reader = null; 302 | String line = null; 303 | final List eventList = new ArrayList(); 304 | 305 | timedFlushService = Executors.newSingleThreadScheduledExecutor( 306 | new ThreadFactoryBuilder().setNameFormat( 307 | "timedFlushExecService" + 308 | Thread.currentThread().getId() + "-%d").build()); 309 | try { 310 | if(shell != null) { 311 | String[] commandArgs = formulateShellCommand(shell, command); 312 | process = Runtime.getRuntime().exec(commandArgs); 313 | } else { 314 | String[] commandArgs = command.split("\\s+"); 315 | process = new ProcessBuilder(commandArgs).start(); 316 | } 317 | reader = new BufferedReader( 318 | new InputStreamReader(process.getInputStream(), charset)); 319 | 320 | // 
StderrLogger dies as soon as the input stream is invalid 321 | StderrReader stderrReader = new StderrReader(new BufferedReader( 322 | new InputStreamReader(process.getErrorStream(), charset)), logStderr); 323 | stderrReader.setName("StderrReader-[" + command + "]"); 324 | stderrReader.setDaemon(true); 325 | stderrReader.start(); 326 | 327 | future = timedFlushService.scheduleWithFixedDelay(new Runnable() { 328 | @Override 329 | public void run() { 330 | try { 331 | synchronized (eventList) { 332 | if(!eventList.isEmpty() && timeout()) { 333 | flushEventBatch(eventList); 334 | } 335 | } 336 | } catch (Exception e) { 337 | logger.error("Exception occured when processing event batch", e); 338 | if(e instanceof InterruptedException) { 339 | Thread.currentThread().interrupt(); 340 | } 341 | } 342 | } 343 | }, 344 | batchTimeout, batchTimeout, TimeUnit.MILLISECONDS); 345 | 346 | while ((line = reader.readLine()) != null) { 347 | synchronized (eventList) { 348 | //multiline setting start 349 | Matcher m = pattern.matcher(line); 350 | if(m.find()) { 351 | if(buffer.size() != 0) { 352 | //write to body 353 | sourceCounter.incrementEventReceivedCount(); 354 | String total = ""; 355 | for(int i = 0; i < buffer.size(); ++i) { 356 | total += buffer.get(i); 357 | } 358 | eventList.add(EventBuilder.withBody(total.getBytes(charset))); 359 | if(eventList.size() >= bufferCount || timeout()) { 360 | flushEventBatch(eventList); 361 | } 362 | buffer.clear(); 363 | } 364 | buffer.add(line); 365 | }else { 366 | buffer.add(line); 367 | } 368 | //multiline setting end 369 | } 370 | } 371 | 372 | synchronized (eventList) { 373 | if(!buffer.isEmpty()) { 374 | sourceCounter.incrementEventReceivedCount(); 375 | //multiline setting start 376 | String total = ""; 377 | for(int i = 0; i < buffer.size(); ++i) { 378 | total += buffer.get(i); 379 | } 380 | buffer.clear(); 381 | eventList.add(EventBuilder.withBody(total.getBytes(charset))); 382 | //multiline setting end 383 | } 384 | 
if(!eventList.isEmpty()) { 385 | flushEventBatch(eventList); 386 | } 387 | } 388 | } catch (Exception e) { 389 | logger.error("Failed while running command: " + command, e); 390 | if(e instanceof InterruptedException) { 391 | Thread.currentThread().interrupt(); 392 | } 393 | } finally { 394 | if (reader != null) { 395 | try { 396 | reader.close(); 397 | } catch (IOException ex) { 398 | logger.error("Failed to close reader for exec source", ex); 399 | } 400 | } 401 | exitCode = String.valueOf(kill()); 402 | } 403 | if(restart) { 404 | logger.info("Restarting in {}ms, exit code {}", restartThrottle, 405 | exitCode); 406 | try { 407 | Thread.sleep(restartThrottle); 408 | } catch (InterruptedException e) { 409 | Thread.currentThread().interrupt(); 410 | } 411 | } else { 412 | logger.info("Command [" + command + "] exited with " + exitCode); 413 | } 414 | } while(restart); 415 | } 416 | 417 | private void flushEventBatch(List eventList){ 418 | channelProcessor.processEventBatch(eventList); 419 | sourceCounter.addToEventAcceptedCount(eventList.size()); 420 | eventList.clear(); 421 | lastPushToChannel = systemClock.currentTimeMillis(); 422 | } 423 | 424 | private boolean timeout(){ 425 | return (systemClock.currentTimeMillis() - lastPushToChannel) >= batchTimeout; 426 | } 427 | 428 | private static String[] formulateShellCommand(String shell, String command) { 429 | String[] shellArgs = shell.split("\\s+"); 430 | String[] result = new String[shellArgs.length + 1]; 431 | System.arraycopy(shellArgs, 0, result, 0, shellArgs.length); 432 | result[shellArgs.length] = command; 433 | return result; 434 | } 435 | 436 | public int kill() { 437 | if(process != null) { 438 | synchronized (process) { 439 | process.destroy(); 440 | 441 | try { 442 | int exitValue = process.waitFor(); 443 | 444 | // Stop the Thread that flushes periodically 445 | if (future != null) { 446 | future.cancel(true); 447 | } 448 | 449 | if (timedFlushService != null) { 450 | timedFlushService.shutdown(); 451 
| while (!timedFlushService.isTerminated()) { 452 | try { 453 | timedFlushService.awaitTermination(500, TimeUnit.MILLISECONDS); 454 | } catch (InterruptedException e) { 455 | logger.debug("Interrupted while waiting for exec executor service " 456 | + "to stop. Just exiting."); 457 | Thread.currentThread().interrupt(); 458 | } 459 | } 460 | } 461 | return exitValue; 462 | } catch (InterruptedException ex) { 463 | Thread.currentThread().interrupt(); 464 | } 465 | } 466 | return Integer.MIN_VALUE; 467 | } 468 | return Integer.MIN_VALUE / 2; 469 | } 470 | public void setRestart(boolean restart) { 471 | this.restart = restart; 472 | } 473 | } 474 | private static class StderrReader extends Thread { 475 | private BufferedReader input; 476 | private boolean logStderr; 477 | 478 | protected StderrReader(BufferedReader input, boolean logStderr) { 479 | this.input = input; 480 | this.logStderr = logStderr; 481 | } 482 | 483 | @Override 484 | public void run() { 485 | try { 486 | int i = 0; 487 | String line = null; 488 | while((line = input.readLine()) != null) { 489 | if(logStderr) { 490 | // There is no need to read 'line' with a charset 491 | // as we do not to propagate it. 492 | // It is in UTF-16 and would be printed in UTF-8 format. 
493 | logger.info("StderrLogger[{}] = '{}'", ++i, line); 494 | } 495 | } 496 | } catch (IOException e) { 497 | logger.info("StderrLogger exiting", e); 498 | } finally { 499 | try { 500 | if(input != null) { 501 | input.close(); 502 | } 503 | } catch (IOException ex) { 504 | logger.error("Failed to close stderr reader for exec source", ex); 505 | } 506 | } 507 | } 508 | } 509 | 510 | } 511 | -------------------------------------------------------------------------------- /src/main/java/com/urey/flume/MultiLineExecSourceConfigurationConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package com.urey.flume; 21 | 22 | /** 23 | * Created by ureyqiao on 2016/3/21. 
24 | * contact me: qiaowei@pku.edu.cn 25 | */ 26 | public class MultiLineExecSourceConfigurationConstants { 27 | 28 | /** 29 | * Configuration key of the regex that marks the first line of a new event: a line matching it starts a new Flume event, and the non-matching lines that follow are appended to that event. 30 | */ 31 | public static final String REGEX = "lineStartRegex"; 32 | /** Default value for {@link #REGEX}: an optional whitespace character followed by a timestamp shaped like 2016-03-18 17:53:40,278. */ public static final String DEFAULT_REGEX = "\\s?\\d\\d\\d\\d-\\d\\d-\\d\\d\\s\\d\\d:\\d\\d:\\d\\d,\\d\\d\\d"; 33 | } 34 | -------------------------------------------------------------------------------- /src/test/java/com/urey/flume/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.urey.flume; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Placeholder JUnit 3 test case; it only asserts {@code true} and does not exercise the Flume source. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Trivial test that always passes. 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | --------------------------------------------------------------------------------