├── test ├── all-tests ├── log4j.properties └── com │ └── minsheng │ └── flume │ └── source │ ├── TestSimpleFileMonitor.java │ ├── TestDirFileRecorder.java │ └── TestMultiLineParser.java ├── lib ├── junit-4.8.2.jar ├── guava-11.0.2.jar ├── log4j-1.2.16.jar ├── slf4j-api-1.6.1.jar ├── slf4j-log4j12-1.6.1.jar ├── flume-tools-1.4.0-cdh5.0.0-beta-1.jar ├── flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar ├── flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar ├── flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar ├── flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar ├── flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar ├── flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar ├── flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar ├── flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar ├── flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar ├── flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar └── flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar ├── flume_monitor_source使用说明文档.pdf ├── src └── com │ └── minsheng │ ├── flume │ └── source │ │ ├── FileParser.java │ │ ├── FileMonitor.java │ │ ├── DirectoryMonitorSource.java │ │ ├── FlumeConstants.java │ │ ├── FileInfo.java │ │ ├── FileMapReaderWriter.java │ │ ├── DirFileRecorder.java │ │ ├── SimpleFileMonitor.java │ │ └── MultiLineParser.java │ └── util │ ├── Time.java │ ├── StringUtil.java │ └── Shell.java ├── README.md └── LICENSE.txt /test/all-tests: -------------------------------------------------------------------------------- 1 | **/Test*.java 2 | -------------------------------------------------------------------------------- /lib/junit-4.8.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/junit-4.8.2.jar -------------------------------------------------------------------------------- /lib/guava-11.0.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/guava-11.0.2.jar -------------------------------------------------------------------------------- /lib/log4j-1.2.16.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/log4j-1.2.16.jar -------------------------------------------------------------------------------- /lib/slf4j-api-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/slf4j-api-1.6.1.jar -------------------------------------------------------------------------------- /lib/slf4j-log4j12-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/slf4j-log4j12-1.6.1.jar -------------------------------------------------------------------------------- /flume_monitor_source使用说明文档.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/flume_monitor_source使用说明文档.pdf -------------------------------------------------------------------------------- /lib/flume-tools-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-tools-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar -------------------------------------------------------------------------------- /lib/flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar 
-------------------------------------------------------------------------------- /test/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG,console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.err 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=[%p] %d{dd MMM yyy} %m%n 6 | -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/FileParser.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | import java.util.List; 5 | import org.apache.flume.Context; 6 | 7 | public abstract class FileParser { 8 | public abstract void Configure(Context context); 9 | public abstract List GetNextBatchRecords(String file_name, 10 | Long offset); 11 | public abstract boolean ShouldDrop(String record); 12 | } -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/FileMonitor.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. 
All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | import java.util.Map; 5 | import java.util.concurrent.ConcurrentHashMap; 6 | 7 | import org.apache.flume.Context; 8 | 9 | public abstract class FileMonitor { 10 | public FileMonitor() { 11 | } 12 | 13 | public abstract void Configure(Context context); 14 | 15 | public abstract void Start(); 16 | 17 | public abstract void Stop(); 18 | 19 | public abstract Map GetLatestFileInfo( 20 | Map file_map_with_latest_offet); 21 | 22 | public abstract String GetMonitorDir(); 23 | } -------------------------------------------------------------------------------- /src/com/minsheng/util/Time.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.util; 4 | 5 | 6 | public final class Time { 7 | 8 | /** 9 | * Current system time. Do not use this to calculate a duration or interval 10 | * to sleep, because it will be broken by settimeofday. Instead, use 11 | * monotonicNow. 12 | * @return current time in msec. 13 | */ 14 | public static long now() { 15 | return System.currentTimeMillis(); 16 | } 17 | 18 | /** 19 | * Current time from some arbitrary time base in the past, counting in 20 | * milliseconds, and not affected by settimeofday or similar system clock 21 | * changes. This is appropriate to use when computing how much longer to 22 | * wait for an interval to expire. 23 | * @return a monotonic clock that counts in milliseconds. 
24 | */ 25 | public static long monotonicNow() { 26 | final long NANOSECONDS_PER_MILLISECOND = 1000000; 27 | 28 | return System.nanoTime() / NANOSECONDS_PER_MILLISECOND; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/com/minsheng/util/StringUtil.java: -------------------------------------------------------------------------------- 1 | package com.minsheng.util; 2 | import java.util.ArrayList; 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | public class StringUtil { 7 | public static String[] Split(String content, String sub_seq) { 8 | int start_index = 0; 9 | ArrayList ret = new ArrayList(); 10 | 11 | int pos = -1; 12 | while (start_index < content.length() && 13 | (pos = content.indexOf(sub_seq, start_index)) != -1) { 14 | ret.add(content.substring(start_index, pos+sub_seq.length())); 15 | start_index = pos + sub_seq.length(); 16 | } 17 | if (start_index < content.length()) { 18 | ret.add(content.substring(start_index)); 19 | } 20 | String[] result = new String[ret.size()]; 21 | return ret.toArray(result); 22 | } 23 | 24 | 25 | public static String[] SplitAndTrim(Pattern pat, CharSequence input) { 26 | int index = 0; 27 | ArrayList matchList = new ArrayList(); 28 | Matcher m = pat.matcher(input); 29 | 30 | while (m.find()) { 31 | String match = input.subSequence(index, m.start()).toString(); 32 | if (!match.trim().isEmpty()) 33 | matchList.add(match); 34 | index = m.end(); 35 | } 36 | 37 | // If no match was found, return this 38 | if (index == 0) 39 | return new String[] {input.toString()}; 40 | matchList.add(input.subSequence(index, input.length()).toString()); 41 | int resultSize = matchList.size(); 42 | String[] result = new String[resultSize]; 43 | return matchList.subList(0, resultSize).toArray(result); 44 | } 45 | } -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | flume_monitor_source 2 | ==================== 3 | # What is it 4 | flume_monitor_source is a flume source plug-in for monitoring files under a specified directory. It is different from the 'spooldir' source in flume in the following ways: 5 | 1. The flume_monitor_source can incrementally read data from the specified directory in real time, which means the files under the specified directory may still be written to (append operations only). This is not supported by the 'spooldir' source; 6 | 2. The flume_monitor_source can handle multiple lines, such as a Java call stack or exception, as ONE understandable complete record, while flume can only handle one line at a time; 7 | 3. The flume_monitor_source resumes processing each file from the point it had reached when it was last stopped. 8 | 9 | --- 10 | 11 | # How to use it 12 | 13 | * Build the jar 14 | 15 | ``` 16 | ant jar 17 | ``` 18 | 19 | * Copy jar to lib of flume 20 | 21 | ``` 22 | cp dist/flume-monitor-source-0.1.jar ${FLUME_HOME}/lib 23 | ``` 24 | 25 | # Configure the source 26 | 27 | ## Prerequisites 28 | [FlumeUserGuide]: http://flume.apache.org/FlumeUserGuide.html 29 | You are expected to know how to use flume. See [Flume Documentation.][FlumeUserGuide] 30 | 31 | ## Parameter List 32 | 33 | **Property Name** | **default** | **Description** 34 | :--------------- | :--------- | :--------------------------------- 35 | *type* | - | The component type name, needs to be com.minsheng.flume.source.DirectoryMonitorSource 36 | *monitor_dir* | - | Required. The directory under which all files will be monitored.
Files satisfying the condition will be parsed and send to the flume channel 37 | *meta_store_dir* | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/DirectoryMonitorSource.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | 5 | import java.util.Map; 6 | 7 | import org.apache.flume.Context; 8 | import org.apache.flume.EventDrivenSource; 9 | import org.apache.flume.conf.Configurable; 10 | import org.apache.flume.instrumentation.SourceCounter; 11 | import org.apache.flume.source.AbstractSource; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | public class DirectoryMonitorSource extends AbstractSource 16 | implements EventDrivenSource, Configurable { 17 | private static Logger LOG = 18 | LoggerFactory.getLogger(DirectoryMonitorSource.class); 19 | 20 | private SourceCounter sourceCounter_; 21 | 22 | DirFileRecorder dir_recorder_ = null; 23 | public DirectoryMonitorSource() { 24 | dir_recorder_ = new DirFileRecorder(this); 25 | if (sourceCounter_ == null) { 26 | sourceCounter_ = new SourceCounter(getName()); 27 | } 28 | } 29 | 30 | public void UpdateSourceCounter(long event_size) { 31 | sourceCounter_.addToEventAcceptedCount(event_size); 32 | } 33 | 34 | @Override 35 | public void configure(Context context) { 36 | // TODO Auto-generated method stub 37 | if (LOG.isDebugEnabled()) { 38 | for (Map.Entry entry : 39 | context.getParameters().entrySet()) { 40 | LOG.debug("*****key=" + entry.getKey() + " value=" + entry.getValue()); 41 | } 42 | } 43 | dir_recorder_.Configure(context); 44 | } 45 | 46 | @Override 47 | public void start() { 48 | dir_recorder_.Start(); 49 | } 50 | 51 | @Override 52 | public void stop() { 53 | dir_recorder_.Stop(); 54 | } 55 | } 
-------------------------------------------------------------------------------- /src/com/minsheng/flume/source/FlumeConstants.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | 5 | public class FlumeConstants { 6 | public static String DIR_SEP = "/"; 7 | public static String LINE_SEP = "\n"; 8 | // \s means any space character('\t','\n',' ', '\f', '\r') 9 | public static String AUTO_DELETE_LINE_DEILMITER = "delete_line_delimiter"; 10 | public static String SHELL_RESULT_REGEX = "\\s+"; 11 | public static String INTEGER_REGEX = "[0-9]+"; 12 | public static int SHELL_RESULT_FIELD_NUM = 6; 13 | // records > 10MB are cut into pieces 14 | public static int MAX_RECORD_LENGH = 1024 * 1024 * 10; 15 | public static int READ_BUFFER_SIZE = 1024 * 1024 * 1; // 1MB 16 | // we assume all records are smaller than 20MB; if we meet a record that is not, 17 | // the program just skips processing it 18 | public static int MAX_READ_BUFFER_SIZE = 1024 * 1024 * 20; 19 | public static String FILE_CHECK_INTERVAL = "file_check_interval_sec"; 20 | public static String FILE_SEND_INTERVAL = "file_send_interval_sec"; 21 | public static String FILE_NAME_INCLUDE = "file_name_include_pattern"; 22 | public static String FILE_NAME_EXCLUDE = "file_name_exclude_pattern"; 23 | public static String FIRST_LINE_PATTERN = "first_line_pattern"; 24 | public static String LAST_LINE_PATTERN = "last_line_pattern"; 25 | public static String FILE_CONTENT_INCLUDE = "file_content_include_pattern"; 26 | public static String FILE_CONTENT_EXCLUDE = "file_content_exclude_pattern"; 27 | public static String META_STORE_DIR = "meta_store_dir"; 28 | public static String MONITOR_DIR = "monitor_dir"; 29 | public static String SHELL_COMMAND[] = {"ls", "-il", "-o", "-g", 30 | "--time-style=+%m", "TARGET_DIR"}; 31 | static String[] GetShellCommand(String monitor_dir)
{ 32 | SHELL_COMMAND[SHELL_COMMAND.length - 1] = monitor_dir; 33 | return SHELL_COMMAND.clone(); 34 | } 35 | } -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/FileInfo.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | 5 | public class FileInfo { 6 | public static String FIELD_SEPERATOR = "\t"; 7 | public static int NUM_FIELDS = 4; 8 | public static int ID_INDEX = 0; 9 | public static int FILE_LENGTH_INDEX = 1; 10 | public static int OFFSET_INDEX = 2; 11 | public static int FILE_NAME_INDEX = 3; 12 | public static int LIFE_SPAN = 200; 13 | 14 | private String file_name_ = null; 15 | private Long file_length_ ; 16 | private Long offset_; 17 | // file_tag is the unique-identifier of a file,it may be the inode 18 | // in linux sysmte or the hash value the absolute file name; 19 | private Integer id_; 20 | 21 | // when file under monitor directory is deleted, keep delete_delay_round 22 | // then delete this meta 23 | private int life_span_ = LIFE_SPAN; 24 | 25 | public static Integer GetIdFromName(String file_name) { 26 | return new Integer(file_name.hashCode()); 27 | } 28 | 29 | public FileInfo() { 30 | file_name_ = ""; 31 | file_length_ = 0L; 32 | offset_ = 0L; 33 | id_ = 0; 34 | } 35 | 36 | public FileInfo(String name, Long length, Long offset) { 37 | this.file_name_ = name; 38 | this.file_length_ = length; 39 | this.offset_ = offset; 40 | this.id_ = GetIdFromName(file_name_); 41 | } 42 | 43 | public FileInfo(String name, Long length, Long offset, Integer id) { 44 | this.file_name_ = name; 45 | this.file_length_ = length; 46 | this.offset_ = offset; 47 | this.id_ = id; 48 | } 49 | 50 | public void DecLifeSpan() { 51 | this.life_span_ -= 1; 52 | } 53 | 54 | public int get_life_span() { 55 | return this.life_span_; 56 | } 57 | 58 | 
public String get_file_name() { 59 | return this.file_name_; 60 | } 61 | 62 | public Long get_file_length() { 63 | return this.file_length_; 64 | } 65 | 66 | public Long get_offset() { 67 | return offset_; 68 | } 69 | 70 | public Integer get_id() { 71 | return id_; 72 | } 73 | 74 | public void set_file_name(String name) { 75 | file_name_ = name; 76 | } 77 | 78 | public void set_file_length(Long len) { 79 | file_length_ = len; 80 | } 81 | 82 | public void set_offset(Long offset) { 83 | offset_ = offset; 84 | } 85 | 86 | public void set_id(Integer id) { 87 | id_ = id; 88 | } 89 | 90 | public String toString() { 91 | StringBuilder builder = new StringBuilder(); 92 | builder.append("inode="); 93 | builder.append(id_); 94 | builder.append(FIELD_SEPERATOR); 95 | builder.append("length="); 96 | builder.append(file_length_); 97 | builder.append(FIELD_SEPERATOR); 98 | builder.append("offset="); 99 | builder.append(offset_); 100 | builder.append(FIELD_SEPERATOR); 101 | builder.append("file_name="); 102 | builder.append(file_name_); 103 | return builder.toString(); 104 | } 105 | 106 | 107 | public String GetWriteString() { 108 | StringBuilder builder = new StringBuilder(); 109 | builder.append(id_); 110 | builder.append(FIELD_SEPERATOR); 111 | builder.append(file_length_); 112 | builder.append(FIELD_SEPERATOR); 113 | builder.append(offset_); 114 | builder.append(FIELD_SEPERATOR); 115 | builder.append(file_name_); 116 | return builder.toString(); 117 | } 118 | 119 | public int hashCode() { 120 | return id_; 121 | } 122 | } -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/FileMapReaderWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. 
All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | 5 | import java.util.Map; 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.FileNotFoundException; 10 | import java.io.IOException; 11 | import java.io.InputStreamReader; 12 | import java.io.PrintWriter; 13 | 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.google.common.base.Preconditions; 18 | 19 | public class FileMapReaderWriter { 20 | private static final Logger LOG = LoggerFactory 21 | .getLogger(FileMapReaderWriter.class); 22 | private static String format_string_ = "# ${inode}\t${length}\t{$offset}\t${file_name}"; 23 | private String file_name_; 24 | 25 | public FileMapReaderWriter() { 26 | file_name_ = null; 27 | } 28 | 29 | public void Configure(String name) { 30 | Preconditions.checkState(null != name && !name.isEmpty(), 31 | "Map meta record file must be specified"); 32 | file_name_ = name; 33 | LOG.info("map_record_meta_file=" + file_name_); 34 | } 35 | 36 | public boolean ParseLine(String line, FileInfo file_info) { 37 | Preconditions.checkState(file_name_ != null, 38 | "Plz call Configure to initialize before call other functions"); 39 | String eles[] = line.split(FileInfo.FIELD_SEPERATOR); 40 | if (eles.length != FileInfo.NUM_FIELDS) { 41 | LOG.warn("Invalid record line:" + line); 42 | return false; 43 | } 44 | try { 45 | file_info.set_file_name(eles[FileInfo.FILE_NAME_INDEX]); 46 | file_info.set_file_length((Long.valueOf(eles[FileInfo.FILE_LENGTH_INDEX]))); 47 | file_info.set_offset((Long.valueOf(eles[FileInfo.OFFSET_INDEX]))); 48 | file_info.set_id((Integer.valueOf(eles[FileInfo.ID_INDEX]))); 49 | } catch (NumberFormatException e) { 50 | LOG.warn("Invalid line:" + line); 51 | return false; 52 | } 53 | return true; 54 | } 55 | 56 | public static void PrintMap(Map file_info_map) { 57 | int cnter = 0; 58 | LOG.debug("Total num file in file_map:" + 
file_info_map.size()); 59 | for (FileInfo file_info : file_info_map.values()) { 60 | LOG.debug("idx = " + cnter + " info = " + file_info.toString()); 61 | } 62 | } 63 | 64 | public synchronized void LoadMap(Map file_info_map) { 65 | LOG.info("LoadFileMap from file: " + file_name_); 66 | BufferedReader file_reader = null; 67 | try { 68 | file_reader = new BufferedReader(new InputStreamReader( 69 | new FileInputStream(file_name_))); 70 | String line = null; 71 | while ((line = file_reader.readLine()) != null) { 72 | if (line.startsWith("#")) { 73 | continue; 74 | } 75 | FileInfo file_info = new FileInfo(); 76 | if (ParseLine(line, file_info)) { 77 | file_info_map.put(file_info.get_id(), file_info); 78 | } else { 79 | LOG.warn("LoadMap invalid line, parse error: " + line); 80 | } 81 | } 82 | PrintMap(file_info_map); 83 | } catch(FileNotFoundException e) { 84 | LOG.info("Map record file not exist, skip loading " + file_name_); 85 | } catch (Exception e) { 86 | LOG.warn("Map record file read error due to " + e.toString()); 87 | } finally { 88 | if (null != file_reader) { 89 | try { 90 | file_reader.close(); 91 | } catch (IOException e) { 92 | LOG.warn("close file exception, " + e.toString()); 93 | } 94 | } 95 | } 96 | } 97 | 98 | public synchronized void WriteMap(Map file_info_map) { 99 | PrintWriter file_writter = null; 100 | int cnter = 0; 101 | try { 102 | File file = new File(file_name_); 103 | // create parent directory if not exist 104 | file.getParentFile().mkdirs(); 105 | file_writter = new PrintWriter(file_name_); 106 | file_writter.println(format_string_); 107 | synchronized (file_info_map) { 108 | for (FileInfo file_info : file_info_map.values()) { 109 | file_writter.println(file_info.GetWriteString()); 110 | cnter++; 111 | } 112 | } 113 | } catch (FileNotFoundException e) { 114 | LOG.warn("File not found, you should never see this"); 115 | } catch (Exception e) { 116 | LOG.warn("Write map meta failed due to " + e.toString()); 117 | } finally { 118 | if (null 
!= file_writter) { 119 | file_writter.close(); 120 | } 121 | LOG.info("Write \'" + cnter + "\' records to map meta file" + file_name_ ); 122 | } 123 | } 124 | 125 | } -------------------------------------------------------------------------------- /test/com/minsheng/flume/source/TestSimpleFileMonitor.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.HashMap; 8 | import java.util.concurrent.*; 9 | import java.util.regex.Pattern; 10 | import java.lang.*; 11 | 12 | import junit.framework.Assert; 13 | import static org.junit.Assert.*; 14 | 15 | import org.junit.Before; 16 | import org.junit.After; 17 | import org.junit.Test; 18 | import org.apache.flume.Context; 19 | 20 | import com.minsheng.flume.source.FileInfo; 21 | import com.minsheng.flume.source.FlumeConstants; 22 | import com.minsheng.flume.source.SimpleFileMonitor; 23 | 24 | import org.slf4j.Logger; 25 | import org.slf4j.LoggerFactory; 26 | 27 | public class TestSimpleFileMonitor { 28 | private static final Logger LOG = LoggerFactory 29 | .getLogger(TestSimpleFileMonitor.class); 30 | 31 | private int file_num_ = 3; 32 | private String target_dir_; 33 | 34 | private String prefix_ = "file"; 35 | private String include_suffix_ = ".test"; 36 | private String include_file_regex_ = prefix_ + ".*" + include_suffix_; 37 | 38 | private String date_str_ = ".2013-01-17-12"; 39 | private String exclude_suffix_ = include_suffix_ + date_str_; 40 | private String exclude_file_regex_ = prefix_ + ".*" + exclude_suffix_; 41 | 42 | private Pattern default_include_pattern_ = 43 | Pattern.compile(".*"); 44 | 45 | private Pattern full_exclude_pattern_ = 46 | Pattern.compile(exclude_file_regex_); 47 | 48 | Context default_context_; 49 | Context full_context_; 
50 | 51 | SimpleFileMonitor default_monitor_; 52 | SimpleFileMonitor full_monitor_; 53 | 54 | 55 | @Before 56 | public void SetUp() { 57 | target_dir_ = "/tmp/ms_flume/monitor/"; 58 | File file_target_dir = new File(target_dir_); 59 | LOG.info("Create test target directory: " + target_dir_); 60 | Assert.assertTrue("Create target dir failed", file_target_dir.mkdirs()); 61 | 62 | LOG.info("Create child directory"); 63 | for (int i = 0; i < file_num_; i++) { 64 | File child_dir = new File(target_dir_ +"/" + i); 65 | LOG.info("\tCreate directory:" + child_dir.toString()); 66 | Assert.assertTrue("Create child dir failed", child_dir.mkdirs()); 67 | } 68 | 69 | try { 70 | LOG.info("Create include file in target directory"); 71 | for (int i = 0; i < file_num_; i++) { 72 | File test_file = File.createTempFile(prefix_ + i, 73 | include_suffix_, file_target_dir); 74 | LOG.info("\tCreate include file:" + test_file.toString()); 75 | Assert.assertTrue("Create failed for test file " + test_file.getName(), 76 | test_file.exists()); 77 | } 78 | 79 | System.out 80 | .println("Create exclude file(start with '.') in target directory"); 81 | for (int i = 0; i < file_num_; i++) { 82 | File test_file = File 83 | .createTempFile("." 
+ prefix_ + i, include_suffix_, file_target_dir); 84 | LOG.info("\tCreate hidden file:" + test_file.toString()); 85 | Assert.assertTrue("Create failed for test file " + test_file.getName(), 86 | test_file.exists()); 87 | } 88 | 89 | System.out 90 | .println("Create exclude file(with date suffix) in target directory"); 91 | for (int i = 0; i < file_num_; i++) { 92 | File test_file = File.createTempFile(prefix_ + i, 93 | exclude_suffix_, file_target_dir); 94 | LOG.info("\tCreate exclude file:" + test_file.toString()); 95 | Assert.assertTrue("Create failed for test file " + test_file.getName(), 96 | test_file.exists()); 97 | } 98 | } catch (IOException e) { 99 | LOG.info("IOException: " + e.getMessage()); 100 | } 101 | 102 | LOG.info("\n*****Create default flume context(only specify target dir)"); 103 | Map params = new HashMap(); 104 | params.put(FlumeConstants.MONITOR_DIR, target_dir_); 105 | default_context_ = new Context(params); 106 | 107 | for (Map.Entry s : params.entrySet()) { 108 | LOG.info("key=" + s.getKey() 109 | + " value=" + s.getValue()); 110 | } 111 | 112 | LOG.info("\n*****Create full flume context"); 113 | params.put(FlumeConstants.FILE_CHECK_INTERVAL, "3"); 114 | params.put(FlumeConstants.FILE_NAME_INCLUDE, include_file_regex_); 115 | params.put(FlumeConstants.FILE_NAME_EXCLUDE, exclude_file_regex_); 116 | 117 | for (Map.Entry s : params.entrySet()) { 118 | LOG.info("key=" + s.getKey() 119 | + " value=" + s.getValue()); 120 | } 121 | full_context_ = new Context(params); 122 | } 123 | 124 | @Test 125 | public void TestDefaultMonitor() { 126 | LOG.info("Start test default action at" + target_dir_); 127 | default_monitor_ = new SimpleFileMonitor(); 128 | default_monitor_.Configure(default_context_); 129 | default_monitor_.Start(); 130 | 131 | try { 132 | LOG.info("Slepp 5 sec for monitor to update"); 133 | Thread.sleep(5000L); 134 | } catch (InterruptedException e) { 135 | LOG.info("Sleep interrupted."); 136 | } 137 | 138 | Map my_map = new 
ConcurrentHashMap(); 139 | Map new_map = 140 | default_monitor_.GetLatestFileInfo(my_map); 141 | LOG.info("Total valid file num: " + new_map.size()); 142 | Assert.assertTrue(new_map.size() == (file_num_ * 2)); 143 | for (FileInfo file_info : new_map.values()) { 144 | LOG.info(file_info.toString()); 145 | Assert.assertTrue("include ilega files", 146 | default_include_pattern_.matcher(file_info.get_file_name()).matches()); 147 | } 148 | default_monitor_.Stop(); 149 | } 150 | 151 | @Test 152 | public void TestFullMonitor() { 153 | LOG.info("Start test default action at" + target_dir_); 154 | full_monitor_ = new SimpleFileMonitor(); 155 | full_monitor_.Configure(full_context_); 156 | full_monitor_.Start(); 157 | 158 | try { 159 | LOG.info("Slepp 5 sec for monitor to update"); 160 | Thread.sleep(5000L); 161 | } catch (InterruptedException e) { 162 | LOG.info("Sleep interrupted."); 163 | } 164 | 165 | Map my_map = new ConcurrentHashMap(); 166 | Map new_map = 167 | full_monitor_.GetLatestFileInfo(my_map); 168 | LOG.info("Total valid file num: " + new_map.size()); 169 | Assert.assertTrue(new_map.size() == (file_num_)); 170 | for (FileInfo file_info : new_map.values()) { 171 | LOG.info(file_info.toString()); 172 | Assert.assertTrue("include files not in include pattern", 173 | default_include_pattern_.matcher(file_info.get_file_name()) 174 | .matches()); 175 | Assert.assertFalse("some file should be excludesd", full_exclude_pattern_ 176 | .matcher(file_info.get_file_name()).matches()); 177 | } 178 | full_monitor_.Stop(); 179 | } 180 | 181 | public void RecursiveDelete(File file) { 182 | if (file.isDirectory()) { 183 | for (File f : file.listFiles()) { 184 | RecursiveDelete(f); 185 | } 186 | file.delete(); 187 | } else { 188 | file.delete(); 189 | } 190 | } 191 | 192 | @After 193 | public void CleanUp() { 194 | File file = new File(target_dir_); 195 | LOG.info("Clean target dir"); 196 | RecursiveDelete(file); 197 | Assert.assertFalse(file.exists()); 198 | } 199 | } 200 | 
-------------------------------------------------------------------------------- /src/com/minsheng/flume/source/DirFileRecorder.java: --------------------------------------------------------------------------------
// Copyright (c) 2014 Minsheng.Corp. All rights reserved
// Author: peng.he.ia@gmail.com
package com.minsheng.flume.source;


import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.concurrent.Executors;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.Context;
import org.apache.flume.event.EventBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * Periodically asks a FileMonitor for the files of one directory, reads the
 * not-yet-sent part of each file through a FileParser, hands every kept
 * record to the source's channel processor and persists the per-file read
 * offsets through a FileMapReaderWriter.
 */
public class DirFileRecorder {
  private static final Logger LOG = LoggerFactory
      .getLogger(DirFileRecorder.class);

  // id --> FileInfo, id may be either the inode in fs or the hash value of
  // file name(FileInfo.GetIdFromName)
  // NOTE(review): the Integer key type is inferred from SimpleFileMonitor,
  // which stores Integer.valueOf(...) ids -- confirm against FileInfo.
  private Map<Integer, FileInfo> file_info_map_ = null;
  private FileMonitor file_monitor_ = null;
  private FileMapReaderWriter reader_writer_ = null;
  private FileParser file_parser_ = null;
  private boolean auto_delete_line_delimiter_ = false;

  // check and send content every 3 second
  private Long send_interval_ = 3L;
  private String meta_store_file_ = "";

  DirectoryMonitorSource monitor_source_ = null;

  private ScheduledExecutorService executor_service_ = null;
  private Runnable sender_runnable_ = null;
  private ScheduledFuture<?> sender_future_ = null;


  /**
   * @param source the Flume source whose channel processor and counter
   *               receive the events produced by this recorder
   */
  public DirFileRecorder(DirectoryMonitorSource source) {
    LOG.info("Init DirFileRecorder");
    file_info_map_ = new ConcurrentHashMap<Integer, FileInfo>();
    file_monitor_ = new SimpleFileMonitor();
    reader_writer_ = new FileMapReaderWriter();
    file_parser_ = new MultiLineParser();
    monitor_source_ = source;
  }

  /**
   * Reads the recorder settings from the Flume context and configures the
   * monitor, the meta-store reader/writer and the parser. The meta-store
   * file is named after the hash code of the monitored directory.
   */
  public void Configure(Context context) {
    LOG.info("Configure DirFileRecorder.");
    file_monitor_.Configure(context);
    String meta_dir = context.getString(FlumeConstants.META_STORE_DIR,
        "./meta/");
    send_interval_ = context.getLong(FlumeConstants.FILE_SEND_INTERVAL, 3L);
    String tmp_meta_store_file = meta_dir + FlumeConstants.DIR_SEP +
        file_monitor_.GetMonitorDir().hashCode();
    File tmp_file = new File(tmp_meta_store_file);
    meta_store_file_ = tmp_file.getAbsolutePath();

    auto_delete_line_delimiter_ =
        context.getBoolean(FlumeConstants.AUTO_DELETE_LINE_DEILMITER, false);

    reader_writer_.Configure(meta_store_file_);

    file_parser_ = new MultiLineParser();
    file_parser_.Configure(context);

    executor_service_ = Executors.newScheduledThreadPool(1);
    sender_runnable_ = new SenderRunnable();
    file_info_map_ = new ConcurrentHashMap<Integer, FileInfo>();
  }

  /**
   * Loads the persisted offsets, starts the monitor and schedules the sender
   * every send_interval_ seconds.
   */
  public void Start() {
    LOG.info("Start DirFileRecorder.");
    reader_writer_.LoadMap(file_info_map_);
    FileMapReaderWriter.PrintMap(file_info_map_);
    file_monitor_.Start();
    sender_future_ = executor_service_.scheduleAtFixedRate(sender_runnable_,
        0L,
        send_interval_.longValue(),
        TimeUnit.SECONDS);
  }

  /**
   * Cancels the sender, stops the monitor, persists the current offsets and
   * shuts the scheduler down. Safe to call when Start() never ran.
   */
  public void Stop() {
    if (sender_future_ != null) {
      // Cancel the sender before the monitor goes away so that a pass does
      // not start against a monitor that was already stopped.
      sender_future_.cancel(true);
      file_monitor_.Stop();
      // Only a started recorder has loaded offsets; writing the map of a
      // never-started one would replace the stored offsets with an empty map.
      reader_writer_.WriteMap(file_info_map_);
    }
    if (executor_service_ != null) {
      executor_service_.shutdown();
    }
  }

  /**
   * Sends the unread records of every file in file_map to the channel.
   *
   * The read offset of a file is advanced record by record and written back
   * even when the pass is cut short by an exception, so records that already
   * reached the channel are not sent again on the next pass.
   *
   * @param file_map id --> FileInfo of the files to process
   * @return false when file_map is null or empty, true otherwise (the meta
   *         store is currently rewritten after every pass)
   */
  private boolean SendEvents(Map<Integer, FileInfo> file_map) {
    if (null == file_map || file_map.isEmpty()) {
      LOG.warn("file_map is null(wait for update) or file_map is empty");
      return false;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("SendEvents, with total file num = {} dir {}",
          file_map.size(), file_monitor_.GetMonitorDir());
    }
    // currently we update every time for debug
    boolean should_update_meta = true;
    long event_num = 0;
    try {
      for (FileInfo file_info : file_map.values()) {
        if (file_info.get_offset() >= file_info.get_file_length()) {
          // this file already processd
          if (LOG.isDebugEnabled()) {
            LOG.debug("File done, skip: " + file_info.get_file_name());
          }
          continue;
        }

        List<String> records = file_parser_.GetNextBatchRecords(
            file_info.get_file_name(), file_info.get_offset());
        long offset = file_info.get_offset();
        try {
          for (String record : records) {
            // NOTICE: every record is end in with a '\n',if the flume will
            // auto to add a '\n', we may handle it here, other otherwise,
            // switch off the auto-add-new-line.
            // NOTE(review): getBytes() uses the platform charset; the offset
            // is only right when that matches the file encoding -- confirm.
            byte[] record_bytes = record.getBytes();
            if (file_parser_.ShouldDrop(record)) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Drop record: " + record);
              }
            } else {
              Event event = EventBuilder.withBody(record_bytes);
              monitor_source_.getChannelProcessor().processEvent(event);
              event_num += 1;
            }
            // dropped or delivered, the record is consumed: move past it
            offset += record_bytes.length;
          }
        } finally {
          // commit what was consumed, also when processEvent() threw
          file_info.set_offset(offset);
        }
      }
    } catch (Exception e) {
      LOG.warn("Exception in SendEvents: " + e.getMessage(), e);
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Send Event Num this time: {} for dir {}", event_num,
          file_monitor_.GetMonitorDir());
    }
    // count what was really delivered, even after a partial failure
    monitor_source_.UpdateSourceCounter(event_num);
    return should_update_meta;
  }

  /**
   * One scheduled pass: refresh the file map from the monitor, send the new
   * content and persist the offsets. Never lets an exception escape, because
   * that would cancel all further scheduled runs.
   */
  class SenderRunnable implements Runnable {
    @Override
    public void run() {
      try {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Before Update, file_map_size: {} dir {}",
              file_info_map_.size(), file_monitor_.GetMonitorDir());
          FileMapReaderWriter.PrintMap(file_info_map_);
        }
        Map<Integer, FileInfo> new_map =
            file_monitor_.GetLatestFileInfo(file_info_map_);
        if (new_map == null) {
          // keep the offsets we have rather than replacing them with null
          LOG.warn("Monitor returned no file map, skip this round");
          return;
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("After Update, file_map_size: {} dir {}",
              new_map.size(), file_monitor_.GetMonitorDir());
          FileMapReaderWriter.PrintMap(new_map);
        }

        file_info_map_ = new_map;
        if (SendEvents(file_info_map_)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Write file map for dir {}", file_monitor_.GetMonitorDir());
          }
          reader_writer_.WriteMap(new_map);
        }

      } catch (Exception e) {
        LOG.warn("Exception in SenderRunable: " + e.getMessage(), e);
      }
    }
  }

}

--------------------------------------------------------------------------------
/test/com/minsheng/flume/source/TestDirFileRecorder.java: --------------------------------------------------------------------------------
// Copyright (c) 2014 Minsheng.Corp. All rights reserved
// Author: peng.he.ia@gmail.com
package com.minsheng.flume.source;
import java.io.File;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.concurrent.*;
import java.util.regex.Pattern;
import java.lang.*;
import java.io.PrintWriter;
import junit.framework.Assert;
import static org.junit.Assert.*;

import org.junit.Before;
import org.junit.After;
import org.junit.Test;
import org.apache.flume.Context;

import com.minsheng.flume.source.FileInfo;
import com.minsheng.flume.source.FlumeConstants;
import com.minsheng.flume.source.MultiLineParser;
import com.minsheng.flume.source.SimpleFileMonitor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Fixture builders for DirFileRecorder tests. The only @Test method is still
 * a stub; the Create*File helpers produce multi-line log files with different
 * combinations of record start and end lines.
 */
public class TestDirFileRecorder {
  private static final Logger LOG = LoggerFactory
      .getLogger(TestDirFileRecorder.class);

  // what PrintWriter.println() appends after the text
  private static final String LINE_SEP = System.getProperty("line.separator");
  // filler line shared by all fixture files
  private static final String MID_LINE =
      "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";

  private File target_dir_ = new File("/tmp/ms_flume/parser");

  private String first_regex_ = "\\[\\[.*";
  private String last_regex_ = ".*END\\]\\]";
  private Pattern start_line_pattern_ = Pattern.compile(first_regex_);
  private Pattern end_line_pattern_ = Pattern.compile(last_regex_);
  private Pattern record_include_pattern_ =
      Pattern.compile(".*INCLUDE_RECORD.*");
  private Pattern record_exclude_pattern_ =
      Pattern.compile(".*EXCLUDE_RECORD.*");

  private String record_include_str_ = "\tINCLUDE_RECORD";
  private String record_exclude_str_ = "\tEXCLUDE_RECORD";


  /** Returns text terminated by the platform line separator. */
  private static String Line(String text) {
    return text + LINE_SEP;
  }

  /**
   * Writes a file whose records have both a start line and an end line,
   * followed by one record without header and one without tailer.
   */
  public void CreateMIXFile(File file) {
    String start_line = Line("[[INFO] 2013-01-17 13:54:32 reord first line");
    String end_line = Line("\trecord end line END]]");
    String mid_line = Line(MID_LINE);

    WriteFile(file, Arrays.asList(
        // record 1 -- included if include_record = .*
        start_line, end_line,
        // record 1 -- include
        start_line, record_include_str_, end_line,
        // record 1 -- exclude
        start_line, record_exclude_str_, end_line,
        // record 2 -- include
        start_line, record_include_str_, mid_line, end_line,
        // record 2 -- exclude
        start_line, record_exclude_str_, mid_line, end_line,
        // random line and tailer without header:
        // we should see this as a new record
        Line("\tno header no header no header"), end_line,
        // header and random line without tailer:
        // we should never see this, because it will wait the end or start line
        start_line, Line("\t no tailer no tailer no tailer")));
  }

  /** Writes a file whose records are delimited by a start line only. */
  public void CreateFIRSTFile(File file) {
    String start_line = Line("[[INFO] 2013-01-17 13:54:32 reord first line");
    String mid_line = Line(MID_LINE);

    WriteFile(file, Arrays.asList(
        // record 1
        start_line, record_include_str_,
        // record 2
        start_line, record_exclude_str_, mid_line,
        // record 3
        start_line, mid_line, mid_line, mid_line, mid_line, mid_line,
        mid_line));
  }

  /** Writes a file whose records are delimited by an end line only. */
  public void CreateLASTFile(File file) {
    String end_line = Line("[[INFO] 2013-01-17 13:54:32 reord end line");
    String mid_line = Line(MID_LINE);

    WriteFile(file, Arrays.asList(
        // record 1
        record_include_str_, end_line,
        // record 2
        mid_line, record_include_str_, end_line,
        // record 3
        mid_line, mid_line, mid_line, end_line,
        // useless lines
        mid_line, mid_line, mid_line, record_exclude_str_, mid_line,
        mid_line, mid_line));
  }

  /** Writes a file with arbitrary lines and no record delimiter scheme. */
  public void CreateNONEFile(File file) {
    String end_line = Line("[[INFO] 2013-01-17 13:54:32 random arbitray");
    String mid_line = Line(MID_LINE);

    WriteFile(file, Arrays.asList(
        // record 1
        end_line, record_exclude_str_, record_include_str_,
        // record 2
        end_line, mid_line));
  }


  // NOTE(review): this method carries no @Before annotation, so JUnit never
  // calls it and no fixture file is created -- confirm whether that is
  // intended while TestDirFile is still a stub.
  /** Creates the target directory and the first/last/none fixture files. */
  public void SetUp() {
    LOG.info("Start test MultiLineParser");
    LOG.info("Create test dir: " + target_dir_);
    target_dir_.mkdirs();
    Assert.assertTrue(target_dir_.exists());

    File first_file = new File(target_dir_ + "/" + "first.file");
    CreateFIRSTFile(first_file);

    File last_file = new File(target_dir_ + "/" + "last.file");
    CreateLASTFile(last_file);

    File none_file = new File(target_dir_ + "/" + "none.file");
    CreateNONEFile(none_file);

  }

  /**
   * Writes the given strings to file back to back, adding nothing between
   * them. A file that cannot be opened is logged and otherwise ignored.
   */
  public void WriteFile(File file, List<String> records) {
    PrintWriter writer = null;
    try {
      writer = new PrintWriter(file);
      for (String line : records) {
        writer.print(line);
      }
    } catch (FileNotFoundException e) {
      // keep the cause: it says whether the path or the permission is wrong
      LOG.error("Write content failed at file: " + file, e);
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }


  @Test
  public void TestDirFile() {
    LOG.info("Start test dir file recorder");
  }
}
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /src/com/minsheng/flume/source/SimpleFileMonitor.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. 
All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | import java.util.concurrent.ConcurrentHashMap; 10 | import java.util.concurrent.Executors; 11 | import java.util.concurrent.ScheduledExecutorService; 12 | import java.util.concurrent.ScheduledFuture; 13 | import java.util.concurrent.TimeUnit; 14 | 15 | import org.apache.flume.Context; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import com.google.common.base.Preconditions; 20 | import com.minsheng.util.Shell; 21 | import com.minsheng.util.StringUtil; 22 | 23 | public class SimpleFileMonitor extends FileMonitor { 24 | private static final Logger LOG = LoggerFactory 25 | .getLogger(SimpleFileMonitor.class); 26 | private static int ID_SHELL_IDX = 0; 27 | private static int FILE_NAME_SHELL_IDX = 5; 28 | private static int FILE_LENGTH_SHELL_IDX = 3; 29 | private static int FILE_META_IDX = 1; 30 | 31 | private String default_file_name_include_str_ = ".*"; 32 | private String default_file_name_exclude_str = "^[.].*"; 33 | private String file_name_file_name_include_pattern_str_ = null; 34 | // file start with "." 
is excluded 35 | private String file_name_file_name_exclude_pattern_str_ = null; 36 | private Pattern file_name_include_pattern_ = null; 37 | private Pattern file_name_exclude_pattern_ = null; 38 | 39 | private String ls_output_total_ = "total"; 40 | private Pattern integer_pattern_ = null; 41 | 42 | private Pattern shell_result_pattern_ = null; 43 | private String monitor_dir_ = null; 44 | private Map file_info_map_ = new ConcurrentHashMap(); 45 | private String shell_command_[] = null; 46 | private Shell.ShellCommandExecutor shell_executor_ = null; 47 | private ScheduledExecutorService executor_service_ = null; 48 | private Runnable shell_runnable_ = null; 49 | private ScheduledFuture shell_future_ = null; 50 | private Long check_interval_sec_ = null; 51 | // 52 | private Long default_interval_sec = 5L; 53 | private String shell_output_ = null; 54 | 55 | private boolean can_fetch_file_map_ = false; 56 | 57 | public SimpleFileMonitor() { 58 | super(); 59 | } 60 | 61 | @Override 62 | public void Configure(Context context) { 63 | // TODO Auto-generated method stub 64 | monitor_dir_ = context.getString(FlumeConstants.MONITOR_DIR); 65 | Preconditions.checkState(monitor_dir_ != null, 66 | "you must specified \'monitor_dir\' in config file"); 67 | check_interval_sec_ = context.getLong(FlumeConstants.FILE_CHECK_INTERVAL, 68 | default_interval_sec); 69 | file_name_file_name_include_pattern_str_ = context.getString( 70 | FlumeConstants.FILE_NAME_INCLUDE, default_file_name_include_str_); 71 | file_name_file_name_exclude_pattern_str_ = context.getString( 72 | FlumeConstants.FILE_NAME_EXCLUDE, default_file_name_exclude_str); 73 | 74 | shell_result_pattern_ = Pattern.compile(FlumeConstants.SHELL_RESULT_REGEX); 75 | file_name_include_pattern_ = Pattern 76 | .compile(file_name_file_name_include_pattern_str_); 77 | file_name_exclude_pattern_ = Pattern 78 | .compile(file_name_file_name_exclude_pattern_str_); 79 | 80 | integer_pattern_ = 
Pattern.compile(FlumeConstants.INTEGER_REGEX); 81 | 82 | shell_command_ = FlumeConstants.GetShellCommand(monitor_dir_); 83 | shell_executor_ = new Shell.ShellCommandExecutor(shell_command_); 84 | executor_service_ = Executors.newScheduledThreadPool(1); 85 | } 86 | 87 | @Override 88 | public void Start() { 89 | // TODO Auto-generated method stub 90 | StringBuilder builder = new StringBuilder(); 91 | builder.append("Start SimpleFileMonitor with [dir="); 92 | builder.append(monitor_dir_); 93 | builder.append(", file_name_file_name_include_pattern_str_="); 94 | builder.append(file_name_file_name_include_pattern_str_); 95 | builder.append(",file_name_file_name_exclude_pattern_str_="); 96 | builder.append(file_name_file_name_exclude_pattern_str_); 97 | builder.append(",check_interval_sec_="); 98 | builder.append(check_interval_sec_); 99 | LOG.info(builder.toString()); 100 | builder = null; 101 | 102 | shell_runnable_ = new ShellRunnable(); 103 | shell_future_ = executor_service_.scheduleAtFixedRate(shell_runnable_, 0L, 104 | check_interval_sec_.longValue(), TimeUnit.SECONDS); 105 | } 106 | 107 | @Override 108 | public void Stop() { 109 | // TODO Auto-generated method stub 110 | LOG.info("Stop SimpleFileMonitor"); 111 | shell_future_.cancel(true); 112 | executor_service_.shutdown(); 113 | file_info_map_ = null; 114 | } 115 | 116 | // this function the latest_offset is synchronized at entry 117 | @Override 118 | public Map GetLatestFileInfo( 119 | Map latest_offset) { 120 | // TODO Auto-generated method stub 121 | if (!can_fetch_file_map_) { 122 | // wait for next round to update 123 | return latest_offset; 124 | } 125 | 126 | Map new_file_map = null; 127 | synchronized (this) { 128 | if (latest_offset != null) { 129 | for (FileInfo file_info : latest_offset.values()) { 130 | if (this.file_info_map_.containsKey(file_info.get_id())) { 131 | // update offset from latest_offset map to current map 132 | this.file_info_map_.get(file_info.get_id()).set_offset( 133 | 
file_info.get_offset()); 134 | } else { 135 | 136 | // file is delete from monitor dir 137 | // we run into a problem, inode will be reused when file deleted 138 | // so we need to delete file as soon as possible 139 | // if (file_info.get_life_span() > 0) { 140 | // file_info.DecLifeSpan(); 141 | // this.file_info_map_.put(file_info.get_id(), file_info); 142 | // } 143 | } 144 | } 145 | } 146 | new_file_map = file_info_map_; 147 | // file_info_map_ = null; 148 | } 149 | return new_file_map; 150 | } 151 | 152 | @Override 153 | public String GetMonitorDir() { 154 | // TODO Auto-generated method stub 155 | return monitor_dir_; 156 | } 157 | 158 | public void UpdateMapFromShellResult() { 159 | Map new_file_map = new ConcurrentHashMap(); 160 | shell_output_ = shell_executor_.getOutput(); 161 | if (LOG.isDebugEnabled()) { 162 | LOG.debug("Shelloutput****************\n" + shell_output_); 163 | } 164 | for (String line : shell_output_.split(FlumeConstants.LINE_SEP)) { 165 | // String eles[] = shell_result_pattern_.split(line); 166 | String eles[] = StringUtil.SplitAndTrim(shell_result_pattern_, line); 167 | 168 | /** 169 | * when use ls -il -o -g --time-style=+%Y the first line of the output is 170 | * a total infomation like: total 22 ID_SHLL_IDX ID_META_IDX xx 171 | * FILE_LENGTH_SHELL_IDX xx FILE_NAME_IDX ${inode} drwxrwx-- 1 ${filesize} 172 | * ${date} ${filename} 173 | * 174 | * (1) for first line, just skip (2) for othern line, if length !=6, print 175 | * warn information (3) for directory, skip (4) check ${inode} ${filesize) 176 | * match integer pattern 177 | * */ 178 | if (eles[ID_SHELL_IDX].toLowerCase().startsWith(ls_output_total_)) { 179 | LOG.debug("Skip first line -- {}", line); 180 | continue; 181 | } 182 | if (eles.length != FlumeConstants.SHELL_RESULT_FIELD_NUM) { 183 | LOG.warn("Check system env, Invalid shell result,fields = {} line={} ", 184 | eles.length, line); 185 | continue; 186 | } 187 | 188 | if (eles[FILE_META_IDX].startsWith("d")) { 189 | // 
this file is a directory, skip 190 | LOG.debug("Skip monitor directory: " + eles[FILE_NAME_SHELL_IDX]); 191 | continue; 192 | } 193 | 194 | if (!integer_pattern_.matcher(eles[ID_SHELL_IDX]).matches() 195 | || !integer_pattern_.matcher(eles[FILE_LENGTH_SHELL_IDX]).matches()) { 196 | if (LOG.isDebugEnabled()) { 197 | StringBuilder builder = new StringBuilder(); 198 | builder.append("Skip invalid integer regex line:"); 199 | builder.append(line); 200 | builder.append("id_shell_idx="); 201 | builder.append(eles[ID_SHELL_IDX]); 202 | builder.append(" file_length="); 203 | builder.append(eles[FILE_LENGTH_SHELL_IDX]); 204 | LOG.debug(builder.toString()); 205 | builder = null; 206 | } 207 | continue; 208 | } 209 | 210 | Matcher include_matcher = file_name_include_pattern_ 211 | .matcher(eles[FILE_NAME_SHELL_IDX]); 212 | Matcher exclude_matcher = file_name_exclude_pattern_ 213 | .matcher(eles[FILE_NAME_SHELL_IDX]); 214 | 215 | if (include_matcher.matches() && !exclude_matcher.matches()) { 216 | if (LOG.isDebugEnabled()) { 217 | LOG.debug("File accepted, " + eles[FILE_NAME_SHELL_IDX]); 218 | } 219 | // file in white list and not in black list is accepted 220 | FileInfo file_info = new FileInfo(); 221 | // we already use INTEGER_REGEX to check the ID and Length must be number 222 | // but just in case, we catch the exception 223 | try { 224 | file_info.set_id(Integer.valueOf(eles[ID_SHELL_IDX])); 225 | file_info.set_file_length(Long.valueOf(eles[FILE_LENGTH_SHELL_IDX])); 226 | } catch (NumberFormatException e) { 227 | LOG.warn("Invalid shell result, number format error," + line); 228 | continue; 229 | } 230 | StringBuilder abs_path_builder = new StringBuilder(); 231 | abs_path_builder.append(monitor_dir_); 232 | abs_path_builder.append(FlumeConstants.DIR_SEP); 233 | abs_path_builder.append(eles[FILE_NAME_SHELL_IDX]); 234 | file_info.set_file_name(abs_path_builder.toString()); 235 | file_info.set_offset(0L); 236 | new_file_map.put(file_info.get_id(), file_info); 237 | } else { 
238 | if (LOG.isDebugEnabled()) { 239 | LOG.debug("File rejected, " + eles[FILE_NAME_SHELL_IDX]); 240 | } 241 | } 242 | } 243 | synchronized (this) { 244 | file_info_map_ = new_file_map; 245 | can_fetch_file_map_ = true; 246 | } 247 | } 248 | 249 | class ShellRunnable implements Runnable { 250 | @Override 251 | public void run() { 252 | // TODO Auto-generated method stub 253 | try { 254 | shell_executor_.execute(); 255 | UpdateMapFromShellResult(); 256 | } catch (Exception e) { 257 | LOG.warn("Execute shell failed due to " + e.getMessage()); 258 | e.printStackTrace(); 259 | } 260 | } 261 | } 262 | } -------------------------------------------------------------------------------- /src/com/minsheng/util/Shell.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.util; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | 10 | import java.util.Map; 11 | import java.util.Timer; 12 | import java.util.TimerTask; 13 | import java.util.concurrent.atomic.AtomicBoolean; 14 | 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | /** 19 | * A base class for running a Unix command. 20 | * 21 | * Shell can be used to run unix commands like du or 22 | * df. It also offers facilities to gate commands by 23 | * time-intervals. 
24 | */ 25 | abstract public class Shell { 26 | 27 | public static final Logger LOG = LoggerFactory.getLogger(Shell.class); 28 | 29 | private static boolean IS_JAVA7_OR_ABOVE = 30 | System.getProperty("java.version").substring(0, 3).compareTo("1.7") >= 0; 31 | 32 | public static boolean isJava7OrAbove() { 33 | return IS_JAVA7_OR_ABOVE; 34 | } 35 | 36 | 37 | /** Windows CreateProcess synchronization object */ 38 | public static final Object WindowsProcessLaunchLock = new Object(); 39 | 40 | 41 | 42 | /** Return a regular expression string that match environment variables */ 43 | public static String getEnvironmentVariableRegex() { 44 | return (WINDOWS) ? "%([A-Za-z_][A-Za-z0-9_]*?)%" : 45 | "\\$([A-Za-z_][A-Za-z0-9_]*)"; 46 | } 47 | 48 | /**Time after which the executing script would be timedout*/ 49 | protected long timeOutInterval = 0L; 50 | /** If or not script timed out*/ 51 | private AtomicBoolean timedOut; 52 | 53 | 54 | /** Set to true on Windows platforms */ 55 | public static final boolean WINDOWS /* borrowed from Path.WINDOWS */ 56 | = System.getProperty("os.name").startsWith("Windows"); 57 | 58 | public static final boolean LINUX 59 | = System.getProperty("os.name").startsWith("Linux"); 60 | 61 | public static final boolean isSetsidAvailable = isSetsidSupported(); 62 | private static boolean isSetsidSupported() { 63 | if (Shell.WINDOWS) { 64 | return false; 65 | } 66 | ShellCommandExecutor shexec = null; 67 | boolean setsidSupported = true; 68 | try { 69 | String[] args = {"setsid", "bash", "-c", "echo $$"}; 70 | shexec = new ShellCommandExecutor(args); 71 | shexec.execute(); 72 | } catch (IOException ioe) { 73 | LOG.debug("setsid is not available on this machine. So not using it."); 74 | setsidSupported = false; 75 | } finally { // handle the exit code 76 | if (LOG.isDebugEnabled()) { 77 | LOG.debug("setsid exited with exit code " 78 | + (shexec != null ? 
shexec.getExitCode() : "(null executor)")); 79 | } 80 | } 81 | return setsidSupported; 82 | } 83 | 84 | /** Token separator regex used to parse Shell tool outputs */ 85 | public static final String TOKEN_SEPARATOR_REGEX 86 | = WINDOWS ? "[|\n\r]" : "[ \t\n\r\f]"; 87 | 88 | private long interval; // refresh interval in msec 89 | private long lastTime; // last time the command was performed 90 | private Map environment; // env for the command execution 91 | private File dir; 92 | private Process process; // sub process used to execute the command 93 | private int exitCode; 94 | 95 | /**If or not script finished executing*/ 96 | private volatile AtomicBoolean completed; 97 | 98 | public Shell() { 99 | this(0L); 100 | } 101 | 102 | /** 103 | * @param interval the minimum duration to wait before re-executing the 104 | * command. 105 | */ 106 | public Shell( long interval ) { 107 | this.interval = interval; 108 | this.lastTime = (interval<0) ? 0 : -interval; 109 | } 110 | 111 | /** set the environment for the command 112 | * @param env Mapping of environment variables 113 | */ 114 | protected void setEnvironment(Map env) { 115 | this.environment = env; 116 | } 117 | 118 | /** set the working directory 119 | * @param dir The directory where the command would be executed 120 | */ 121 | protected void setWorkingDirectory(File dir) { 122 | this.dir = dir; 123 | } 124 | 125 | /** check to see if a command needs to be executed and execute if needed */ 126 | protected void run() throws IOException { 127 | if (lastTime + interval > Time.now()) 128 | return; 129 | exitCode = 0; // reset for next run 130 | runCommand(); 131 | } 132 | 133 | /** Run a command */ 134 | private void runCommand() throws IOException { 135 | ProcessBuilder builder = new ProcessBuilder(getExecString()); 136 | Timer timeOutTimer = null; 137 | ShellTimeoutTimerTask timeoutTimerTask = null; 138 | timedOut = new AtomicBoolean(false); 139 | completed = new AtomicBoolean(false); 140 | 141 | if (environment != 
null) { 142 | builder.environment().putAll(this.environment); 143 | } 144 | if (dir != null) { 145 | builder.directory(this.dir); 146 | } 147 | 148 | if (Shell.WINDOWS) { 149 | synchronized (WindowsProcessLaunchLock) { 150 | // To workaround the race condition issue with child processes 151 | // inheriting unintended handles during process launch that can 152 | // lead to hangs on reading output and error streams, we 153 | // serialize process creation. More info available at: 154 | // http://support.microsoft.com/kb/315939 155 | process = builder.start(); 156 | } 157 | } else { 158 | process = builder.start(); 159 | } 160 | 161 | if (timeOutInterval > 0) { 162 | timeOutTimer = new Timer("Shell command timeout"); 163 | timeoutTimerTask = new ShellTimeoutTimerTask( 164 | this); 165 | //One time scheduling. 166 | timeOutTimer.schedule(timeoutTimerTask, timeOutInterval); 167 | } 168 | final BufferedReader errReader = 169 | new BufferedReader(new InputStreamReader(process 170 | .getErrorStream())); 171 | BufferedReader inReader = 172 | new BufferedReader(new InputStreamReader(process 173 | .getInputStream())); 174 | final StringBuffer errMsg = new StringBuffer(); 175 | 176 | // read error and input streams as this would free up the buffers 177 | // free the error stream buffer 178 | Thread errThread = new Thread() { 179 | @Override 180 | public void run() { 181 | try { 182 | String line = errReader.readLine(); 183 | while((line != null) && !isInterrupted()) { 184 | errMsg.append(line); 185 | errMsg.append(System.getProperty("line.separator")); 186 | line = errReader.readLine(); 187 | } 188 | } catch(IOException ioe) { 189 | LOG.warn("Error reading the error stream", ioe); 190 | } 191 | } 192 | }; 193 | try { 194 | errThread.start(); 195 | } catch (IllegalStateException ise) { } 196 | try { 197 | parseExecResult(inReader); // parse the output 198 | // clear the input stream buffer 199 | String line = inReader.readLine(); 200 | while(line != null) { 201 | line = 
inReader.readLine(); 202 | } 203 | // wait for the process to finish and check the exit code 204 | exitCode = process.waitFor(); 205 | try { 206 | // make sure that the error thread exits 207 | errThread.join(); 208 | } catch (InterruptedException ie) { 209 | LOG.warn("Interrupted while reading the error stream", ie); 210 | } 211 | completed.set(true); 212 | //the timeout thread handling 213 | //taken care in finally block 214 | if (exitCode != 0) { 215 | throw new ExitCodeException(exitCode, errMsg.toString()); 216 | } 217 | } catch (InterruptedException ie) { 218 | throw new IOException(ie.toString()); 219 | } finally { 220 | if (timeOutTimer != null) { 221 | timeOutTimer.cancel(); 222 | } 223 | // close the input stream 224 | try { 225 | inReader.close(); 226 | } catch (IOException ioe) { 227 | LOG.warn("Error while closing the input stream", ioe); 228 | } 229 | try { 230 | if (!completed.get()) { 231 | errThread.interrupt(); 232 | errThread.join(); 233 | } 234 | } catch (InterruptedException ie) { 235 | LOG.warn("Interrupted while joining errThread"); 236 | } 237 | try { 238 | errReader.close(); 239 | } catch (IOException ioe) { 240 | LOG.warn("Error while closing the error stream", ioe); 241 | } 242 | process.destroy(); 243 | lastTime = Time.now(); 244 | } 245 | } 246 | 247 | /** return an array containing the command name & its parameters */ 248 | protected abstract String[] getExecString(); 249 | 250 | /** Parse the execution result */ 251 | protected abstract void parseExecResult(BufferedReader lines) 252 | throws IOException; 253 | 254 | /** get the current sub-process executing the given command 255 | * @return process executing the command 256 | */ 257 | public Process getProcess() { 258 | return process; 259 | } 260 | 261 | /** get the exit code 262 | * @return the exit code of the process 263 | */ 264 | public int getExitCode() { 265 | return exitCode; 266 | } 267 | 268 | /** 269 | * This is an IOException with exit code added. 
270 | */ 271 | public static class ExitCodeException extends IOException { 272 | int exitCode; 273 | 274 | public ExitCodeException(int exitCode, String message) { 275 | super(message); 276 | this.exitCode = exitCode; 277 | } 278 | 279 | public int getExitCode() { 280 | return exitCode; 281 | } 282 | } 283 | 284 | /** 285 | * A simple shell command executor. 286 | * 287 | * ShellCommandExecutorshould be used in cases where the output 288 | * of the command needs no explicit parsing and where the command, working 289 | * directory and the environment remains unchanged. The output of the command 290 | * is stored as-is and is expected to be small. 291 | */ 292 | public static class ShellCommandExecutor extends Shell { 293 | 294 | private String[] command; 295 | private StringBuffer output; 296 | 297 | 298 | public ShellCommandExecutor(String[] execString) { 299 | this(execString, null); 300 | } 301 | 302 | public ShellCommandExecutor(String[] execString, File dir) { 303 | this(execString, dir, null); 304 | } 305 | 306 | public ShellCommandExecutor(String[] execString, File dir, 307 | Map env) { 308 | this(execString, dir, env , 0L); 309 | } 310 | 311 | /** 312 | * Create a new instance of the ShellCommandExecutor to execute a command. 313 | * 314 | * @param execString The command to execute with arguments 315 | * @param dir If not-null, specifies the directory which should be set 316 | * as the current working directory for the command. 317 | * If null, the current working directory is not modified. 318 | * @param env If not-null, environment of the command will include the 319 | * key-value pairs specified in the map. If null, the current 320 | * environment is not modified. 321 | * @param timeout Specifies the time in milliseconds, after which the 322 | * command will be killed and the status marked as timedout. 323 | * If 0, the command will not be timed out. 
324 | */ 325 | public ShellCommandExecutor(String[] execString, File dir, 326 | Map env, long timeout) { 327 | command = execString.clone(); 328 | if (dir != null) { 329 | setWorkingDirectory(dir); 330 | } 331 | if (env != null) { 332 | setEnvironment(env); 333 | } 334 | timeOutInterval = timeout; 335 | } 336 | 337 | 338 | /** Execute the shell command. */ 339 | public void execute() throws IOException { 340 | this.run(); 341 | } 342 | 343 | @Override 344 | public String[] getExecString() { 345 | return command; 346 | } 347 | 348 | @Override 349 | protected void parseExecResult(BufferedReader lines) throws IOException { 350 | output = new StringBuffer(1024); 351 | char[] buf = new char[512]; 352 | int nRead; 353 | while ( (nRead = lines.read(buf, 0, buf.length)) > 0 ) { 354 | output.append(buf, 0, nRead); 355 | } 356 | } 357 | 358 | /** Get the output of the shell command.*/ 359 | public String getOutput() { 360 | return (output == null) ? "" : output.toString(); 361 | } 362 | 363 | /** 364 | * Returns the commands of this instance. 365 | * Arguments with spaces in are presented with quotes round; other 366 | * arguments are presented raw 367 | * 368 | * @return a string representation of the object. 369 | */ 370 | @Override 371 | public String toString() { 372 | StringBuilder builder = new StringBuilder(); 373 | String[] args = getExecString(); 374 | for (String s : args) { 375 | if (s.indexOf(' ') >= 0) { 376 | builder.append('"').append(s).append('"'); 377 | } else { 378 | builder.append(s); 379 | } 380 | builder.append(' '); 381 | } 382 | return builder.toString(); 383 | } 384 | } 385 | 386 | /** 387 | * To check if the passed script to shell command executor timed out or 388 | * not. 389 | * 390 | * @return if the script timed out. 391 | */ 392 | public boolean isTimedOut() { 393 | return timedOut.get(); 394 | } 395 | 396 | /** 397 | * Set if the command has timed out. 
398 | * 399 | */ 400 | private void setTimedOut() { 401 | this.timedOut.set(true); 402 | } 403 | 404 | /** 405 | * Static method to execute a shell command. 406 | * Covers most of the simple cases without requiring the user to implement 407 | * the Shell interface. 408 | * @param cmd shell command to execute. 409 | * @return the output of the executed command. 410 | */ 411 | public static String execCommand(String ... cmd) throws IOException { 412 | return execCommand(null, cmd, 0L); 413 | } 414 | 415 | /** 416 | * Static method to execute a shell command. 417 | * Covers most of the simple cases without requiring the user to implement 418 | * the Shell interface. 419 | * @param env the map of environment key=value 420 | * @param cmd shell command to execute. 421 | * @param timeout time in milliseconds after which script should be marked timeout 422 | * @return the output of the executed command.o 423 | */ 424 | 425 | public static String execCommand(Map env, String[] cmd, 426 | long timeout) throws IOException { 427 | ShellCommandExecutor exec = new ShellCommandExecutor(cmd, null, env, 428 | timeout); 429 | exec.execute(); 430 | return exec.getOutput(); 431 | } 432 | 433 | /** 434 | * Static method to execute a shell command. 435 | * Covers most of the simple cases without requiring the user to implement 436 | * the Shell interface. 437 | * @param env the map of environment key=value 438 | * @param cmd shell command to execute. 439 | * @return the output of the executed command. 440 | */ 441 | public static String execCommand(Map env, String ... cmd) 442 | throws IOException { 443 | return execCommand(env, cmd, 0L); 444 | } 445 | 446 | /** 447 | * Timer which is used to timeout scripts spawned off by shell. 
448 | */ 449 | private static class ShellTimeoutTimerTask extends TimerTask { 450 | 451 | private Shell shell; 452 | 453 | public ShellTimeoutTimerTask(Shell shell) { 454 | this.shell = shell; 455 | } 456 | 457 | @Override 458 | public void run() { 459 | Process p = shell.getProcess(); 460 | try { 461 | p.exitValue(); 462 | } catch (Exception e) { 463 | //Process has not terminated. 464 | //So check if it has completed 465 | //if not just destroy it. 466 | if (p != null && !shell.completed.get()) { 467 | shell.setTimedOut(); 468 | p.destroy(); 469 | } 470 | } 471 | } 472 | } 473 | } 474 | -------------------------------------------------------------------------------- /test/com/minsheng/flume/source/TestMultiLineParser.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved 2 | // Author: peng.he.ia@gmail.com 3 | package com.minsheng.flume.source; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.FileNotFoundException; 7 | import java.util.List; 8 | import java.util.ArrayList; 9 | import java.util.Map; 10 | import java.util.HashMap; 11 | import java.util.concurrent.*; 12 | import java.util.regex.Pattern; 13 | import java.lang.*; 14 | import java.io.PrintWriter; 15 | import junit.framework.Assert; 16 | import static org.junit.Assert.*; 17 | 18 | import org.junit.Before; 19 | import org.junit.After; 20 | import org.junit.Test; 21 | import org.apache.flume.Context; 22 | 23 | import com.minsheng.flume.source.FileInfo; 24 | import com.minsheng.flume.source.FlumeConstants; 25 | import com.minsheng.flume.source.MultiLineParser; 26 | import com.minsheng.flume.source.SimpleFileMonitor; 27 | 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | 32 | public class TestMultiLineParser { 33 | private static final Logger LOG = LoggerFactory 34 | .getLogger(TestMultiLineParser.class); 35 | 36 | private File target_dir_ = new 
File("/tmp/ms_flume/parser"); 37 | 38 | private String first_regex_ = "\\[\\[.*"; 39 | private String last_regex_ = ".*END\\]\\]"; 40 | private Pattern start_line_pattern_ = Pattern.compile(first_regex_); 41 | private Pattern end_line_pattern_ = Pattern.compile(last_regex_); 42 | private Pattern record_include_pattern_ = 43 | Pattern.compile(".*INCLUDE_RECORD.*"); 44 | private Pattern record_exclude_pattern_ = 45 | Pattern.compile(".*EXCLUDE_RECORD.*"); 46 | 47 | private String record_include_str_ = "\tINCLUDE_RECORD"; 48 | private String record_exclude_str_ = "\tEXCLUDE_RECORD"; 49 | 50 | 51 | public void CreateMIXFile(File file) { 52 | PrintWriter writer = null; 53 | try { 54 | writer = new PrintWriter(file); 55 | String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line"; 56 | String end_line = "\trecord end line END]]"; 57 | String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"; 58 | 59 | // write record 1 ---include if include_record = .* 60 | writer.println(start_line); 61 | writer.println(end_line); 62 | 63 | // write record 1 ---include 64 | writer.println(start_line); 65 | writer.print(record_include_str_); 66 | writer.println(end_line); 67 | 68 | // write record 1 ---exclude 69 | writer.println(start_line); 70 | writer.print(record_exclude_str_); 71 | writer.println(end_line); 72 | 73 | // write record 2 -- include 74 | writer.println(start_line); 75 | writer.print(record_include_str_); 76 | writer.println(mid_line); 77 | writer.println(end_line); 78 | 79 | // write record 2 -- exclude 80 | writer.println(start_line); 81 | writer.print(record_exclude_str_); 82 | writer.println(mid_line); 83 | writer.println(end_line); 84 | 85 | // write random line and tailer witout header 86 | // we should see this as a new record 87 | // writer.println("\tno header no header no header"); 88 | // writer.println(end_line); 89 | 90 | 91 | // write random line and tailer witout header 92 | // we should never see this, because it will wait the end 
or start line; 93 | writer.println(start_line); 94 | writer.println("\t no tailer no tailer no tailer"); 95 | } catch (FileNotFoundException e) { 96 | LOG.error("Write content failed at file: " + file); 97 | } finally { 98 | if (writer != null) { 99 | writer.close(); 100 | writer = null; 101 | } 102 | } 103 | } 104 | 105 | public void CreateFIRSTFile(File file) { 106 | PrintWriter writer = null; 107 | try { 108 | writer = new PrintWriter(file); 109 | String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line"; 110 | String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"; 111 | // write record 1 112 | writer.println(start_line); 113 | writer.print(record_include_str_); 114 | // write record 2 115 | writer.println(start_line); 116 | writer.print(record_exclude_str_); 117 | writer.println(mid_line); 118 | 119 | // write record 3 120 | writer.println(start_line); 121 | writer.println(mid_line); 122 | writer.println(mid_line); 123 | writer.println(mid_line); 124 | writer.println(mid_line); 125 | writer.println(mid_line); 126 | writer.println(mid_line); 127 | 128 | } catch (FileNotFoundException e) { 129 | LOG.error("Write content failed at file: " + file); 130 | } finally { 131 | if (writer != null) { 132 | writer.close(); 133 | writer = null; 134 | } 135 | } 136 | } 137 | 138 | public void CreateLASTFile(File file) { 139 | PrintWriter writer = null; 140 | try { 141 | writer = new PrintWriter(file); 142 | String end_line = "record end line END]]"; 143 | String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"; 144 | // write record 1 145 | writer.print(record_include_str_); 146 | writer.println(); 147 | writer.println(end_line); 148 | // write record 2 149 | writer.println(mid_line); 150 | writer.print(record_include_str_); 151 | writer.println(end_line); 152 | 153 | // write record 3 154 | 155 | writer.println(mid_line); 156 | writer.println(mid_line); 157 | writer.println(mid_line); 158 | writer.println(); 159 | 
writer.println(end_line); 160 | 161 | // useless line 162 | writer.println(mid_line); 163 | writer.println(mid_line); 164 | writer.println(); 165 | writer.println(); 166 | } catch (FileNotFoundException e) { 167 | LOG.error("Write content failed at file: " + file); 168 | } finally { 169 | if (writer != null) { 170 | writer.close(); 171 | writer = null; 172 | } 173 | } 174 | } 175 | 176 | public void CreateNONEFile(File file) { 177 | PrintWriter writer = null; 178 | try { 179 | writer = new PrintWriter(file); 180 | String end_line = "[[INFO] 2013-01-17 13:54:32 random arbitray"; 181 | String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"; 182 | // write record 1 183 | writer.println(end_line); 184 | writer.print(record_exclude_str_); 185 | writer.println(); 186 | writer.print(record_include_str_); 187 | // write record 2 188 | writer.println(end_line); 189 | writer.println(mid_line); 190 | writer.println(); 191 | } catch (FileNotFoundException e) { 192 | LOG.error("Write content failed at file: " + file); 193 | } finally { 194 | if (writer != null) { 195 | writer.close(); 196 | writer = null; 197 | } 198 | } 199 | } 200 | 201 | public void CreateFIRSTBigRecordFile(File file) { 202 | PrintWriter writer = null; 203 | try { 204 | writer = new PrintWriter(file); 205 | String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line"; 206 | String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"; 207 | // write record 1 208 | writer.println(start_line); 209 | writer.println(); 210 | writer.println(mid_line); 211 | writer.println(); 212 | // write record 3 213 | writer.println(start_line); 214 | writer.println(mid_line); 215 | writer.println(mid_line); 216 | writer.println(mid_line); 217 | writer.println(mid_line); 218 | writer.println(mid_line); 219 | writer.println(mid_line); 220 | writer.println(mid_line); 221 | writer.println(mid_line); 222 | writer.println(mid_line); 223 | writer.println(mid_line); 224 | writer.println(); 225 | 
226 | } catch (FileNotFoundException e) { 227 | LOG.error("Write content failed at file: " + file); 228 | } finally { 229 | if (writer != null) { 230 | writer.close(); 231 | writer = null; 232 | } 233 | } 234 | } 235 | 236 | @Before 237 | public void SetUp() { 238 | LOG.info("Start test MultiLineParser"); 239 | LOG.info("Create test dir: " + target_dir_); 240 | target_dir_.mkdirs(); 241 | Assert.assertTrue(target_dir_.exists()); 242 | } 243 | 244 | public void WriteFile(File file, List records) { 245 | PrintWriter writer = null; 246 | try { 247 | writer = new PrintWriter(file); 248 | for (String line : records) { 249 | writer.print(line); 250 | } 251 | } catch (FileNotFoundException e) { 252 | LOG.error("Write content failed at file: " + file); 253 | } finally { 254 | if (writer != null) { 255 | writer.close(); 256 | writer = null; 257 | } 258 | } 259 | } 260 | 261 | @Test 262 | public void TestMixParse() { 263 | LOG.info("Start test mix file mode"); 264 | Map params = new HashMap(); 265 | params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD."); 266 | params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD."); 267 | params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_); 268 | params.put(FlumeConstants.LAST_LINE_PATTERN, last_regex_); 269 | Context context = new Context(params); 270 | 271 | File mix_file = new File(target_dir_ + "/" + "mix.file"); 272 | File mix_file_handled = new File(target_dir_ + "/" + "mix.file.hd"); 273 | CreateMIXFile(mix_file); 274 | 275 | MultiLineParser parser = new MultiLineParser(); 276 | parser.Configure(context); 277 | List records = parser.GetNextBatchRecords(mix_file.toString(), 0L); 278 | int cnter = 0; 279 | int byte_length = 0; 280 | for (String record : records) { 281 | cnter++; 282 | LOG.info("the \'" + cnter + "\' record=" + record); 283 | byte_length += record.getBytes().length; 284 | } 285 | WriteFile(mix_file_handled, records); 286 | LOG.info("***********Summary Info**************"); 287 | 
LOG.info("File: " + mix_file); 288 | LOG.info("Processed Record Num:" + cnter); 289 | LOG.info("File Length: " + mix_file.length()); 290 | LOG.info("Process Bytes:" + byte_length); 291 | LOG.info("*************************************"); 292 | } 293 | 294 | @Test 295 | public void TestFirstParse() { 296 | LOG.info("Start test first file mode"); 297 | Map params = new HashMap(); 298 | params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD."); 299 | params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD."); 300 | params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_); 301 | Context context = new Context(params); 302 | 303 | File first_file = new File(target_dir_ + "/" + "first.file"); 304 | File first_file_handled = new File(target_dir_ + "/" + "first.file.hd"); 305 | CreateFIRSTFile(first_file); 306 | 307 | MultiLineParser parser = new MultiLineParser(); 308 | parser.Configure(context); 309 | List records = parser.GetNextBatchRecords(first_file.toString(), 0L); 310 | int cnter = 0; 311 | int byte_length = 0; 312 | for (String record : records) { 313 | cnter++; 314 | LOG.info("the \'" + cnter + "\' record=" + record); 315 | byte_length += record.getBytes().length; 316 | } 317 | WriteFile(first_file_handled, records); 318 | LOG.info("***********Summary Info**************"); 319 | LOG.info("File: " + first_file); 320 | LOG.info("Processed Record Num:" + cnter); 321 | LOG.info("File Length: " + first_file.length()); 322 | LOG.info("Process Bytes:" + byte_length); 323 | LOG.info("*************************************"); 324 | } 325 | 326 | @Test 327 | public void TestLastParse() { 328 | LOG.info("Start test last file mode"); 329 | Map params = new HashMap(); 330 | params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD."); 331 | params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_); 332 | Context context = new Context(params); 333 | 334 | File last_file = new File(target_dir_ + "/" + "last.file"); 335 | File last_file_handled = 
new File(target_dir_ + "/" + "last.file.hd"); 336 | CreateLASTFile(last_file); 337 | 338 | MultiLineParser parser = new MultiLineParser(); 339 | parser.Configure(context); 340 | List records = parser.GetNextBatchRecords(last_file.toString(), 0L); 341 | int cnter = 0; 342 | int byte_length = 0; 343 | for (String record : records) { 344 | cnter++; 345 | LOG.info("the \'" + cnter + "\' record=" + record); 346 | byte_length += record.getBytes().length; 347 | } 348 | WriteFile(last_file_handled, records); 349 | LOG.info("***********Summary Info**************"); 350 | LOG.info("File: " + last_file); 351 | LOG.info("Processed Record Num:" + cnter); 352 | LOG.info("File Length: " + last_file.length()); 353 | LOG.info("Process Bytes:" + byte_length); 354 | LOG.info("*************************************"); 355 | } 356 | 357 | 358 | @Test 359 | public void TestNoneParse() { 360 | LOG.info("Start test none mode"); 361 | Map params = new HashMap(); 362 | params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD."); 363 | params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD."); 364 | params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_); 365 | Context context = new Context(params); 366 | 367 | File none_file = new File(target_dir_ + "/" + "none.file"); 368 | File none_file_handled = new File(target_dir_ + "/" + "none.file.hd"); 369 | CreateNONEFile(none_file); 370 | 371 | MultiLineParser parser = new MultiLineParser(); 372 | parser.Configure(context); 373 | List records = parser.GetNextBatchRecords(none_file.toString(), 0L); 374 | int cnter = 0; 375 | int byte_length = 0; 376 | for (String record : records) { 377 | cnter++; 378 | LOG.info("the \'" + cnter + "\' record=" + record); 379 | byte_length += record.getBytes().length; 380 | } 381 | WriteFile(none_file_handled, records); 382 | LOG.info("***********Summary Info**************"); 383 | LOG.info("File: " + none_file); 384 | LOG.info("Processed Record Num:" + cnter); 385 | LOG.info("File Length: " 
+ none_file.length()); 386 | LOG.info("Process Bytes:" + byte_length); 387 | LOG.info("*************************************"); 388 | } 389 | 390 | @Test 391 | public void TestFirstBigLine() { 392 | LOG.info("Start test first file mode with very small buffersize"); 393 | Map params = new HashMap(); 394 | params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD."); 395 | params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD."); 396 | params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_); 397 | params.put("read_buffer_size", "10"); 398 | params.put("max_read_buffer_size", "200"); 399 | params.put("max_record_size", "100"); 400 | Context context = new Context(params); 401 | 402 | File first_file = new File(target_dir_ + "/" + "first.bigrecord.file"); 403 | File first_file_handled = new File(target_dir_ + "/" + 404 | "first.bigrecord.file.hd"); 405 | CreateFIRSTBigRecordFile(first_file); 406 | 407 | MultiLineParser parser = new MultiLineParser(); 408 | parser.Configure(context); 409 | int cnter = 0; 410 | int max_round = 20; 411 | Long offset = 0L; 412 | 413 | List total_records = new ArrayList(); 414 | while (offset < first_file.length() && cnter++ < max_round) { 415 | List records = parser.GetNextBatchRecords(first_file.toString(), 416 | offset); 417 | int record_cnter = 0; 418 | int byte_length = 0; 419 | for (String record : records) { 420 | record_cnter++; 421 | LOG.info("the \'" + record_cnter + "\' record=" + record); 422 | byte_length += record.getBytes().length; 423 | } 424 | offset += byte_length; 425 | total_records.addAll(records); 426 | } 427 | WriteFile(first_file_handled, total_records); 428 | 429 | LOG.info("***********Summary Info**************"); 430 | LOG.info("File: " + first_file); 431 | LOG.info("Processed Record Num:" + cnter); 432 | LOG.info("File Length: " + first_file.length()); 433 | LOG.info("Process Bytes:" + offset); 434 | LOG.info("*************************************"); 435 | } 436 | } 
// File: src/com/minsheng/flume/source/MultiLineParser.java
// Copyright (c) 2014 Minsheng.Corp. All rights reserved
// Author: peng.he.ia@gmail.com
package com.minsheng.flume.source;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.flume.Context;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.minsheng.util.StringUtil;

/**
 * A {@link FileParser} that reads a chunk of a file starting at a byte offset
 * and groups the complete lines found in that chunk into (possibly
 * multi-line) records.
 *
 * <p>How lines are grouped depends on which patterns were configured:
 * <ul>
 * <li><b>MIX</b> (first and last line pattern): a line matching both patterns
 * is a record on its own; a line matching only the first pattern starts a new
 * record (and closes a pending one); a line matching only the last pattern
 * closes the pending record; any other line is appended to the pending
 * record.</li>
 * <li><b>FIRST</b> (first line pattern only): a matching line starts a new
 * record, which implies that the previous line ended the previous record.</li>
 * <li><b>LAST</b> (last line pattern only): a matching line ends the pending
 * record.</li>
 * <li><b>NONE</b> (no pattern, the default): every line is a record.</li>
 * </ul>
 *
 * <p>The read buffer starts at {@code read_buffer_size} bytes and is doubled,
 * up to {@code max_read_buffer_size}, whenever a chunk does not yield a
 * record.
 *
 * <p>NOTE(review): bytes are decoded with the platform default charset, as in
 * the original code; confirm this matches how callers advance the offset.
 */
public class MultiLineParser extends FileParser {
  private static final Logger LOG = LoggerFactory
      .getLogger(MultiLineParser.class);

  /** Initial capacity of the builder used to accumulate one record. */
  private static final int RECORD_INIT_SIZE = 1024;

  /** Include pattern used when none is configured: matches everything. */
  private static final String DEFAULT_CONTENT_INCLUDE = "[\\s\\S]*";

  private int max_record_size_ = FlumeConstants.MAX_RECORD_LENGH;
  private int max_buffer_size_ = FlumeConstants.MAX_READ_BUFFER_SIZE;
  /** Current read buffer size in bytes; doubled by ExpandReadBuffer(). */
  private int buffer_size_ = FlumeConstants.READ_BUFFER_SIZE;
  /** Allocated by Configure(), or lazily on the first read. */
  private byte[] read_buffer_ = null;

  /*
   * Set when the last read did not fill the buffer. In FIRST and MIX mode
   * this tells HandleSpecialSituation that the pending record cannot receive
   * more lines from this chunk and may be emitted.
   */
  private boolean is_end_of_file_ = false;

  /*
   * Record filters used by ShouldDrop: a record must match the include
   * pattern and must not match the exclude pattern (if there is one).
   */
  private String file_content_include_pattern_str_ = DEFAULT_CONTENT_INCLUDE;
  // default: no exclude pattern
  private String file_content_exclude_pattern_str_ = null;
  // pre-compiled so that ShouldDrop works even before Configure is called
  private Pattern record_include_pattern_ = Pattern
      .compile(DEFAULT_CONTENT_INCLUDE);
  private Pattern record_exclude_pattern_ = null;

  /** Line grouping strategy, derived from the configured patterns. */
  enum ParseType {
    MIX, FIRST, LAST, NONE
  }

  private ParseType parse_type_ = ParseType.NONE;
  private String first_line_pattern_str_ = null;
  private String last_line_pattern_str_ = null;
  private Pattern first_line_pattern_ = null;
  private Pattern last_line_pattern_ = null;

  public MultiLineParser() {
    super();
  }

  /**
   * Reads one buffer of data from {@code file_name} starting at
   * {@code offset} and returns the records parsed from it.
   *
   * @param file_name path of the file to read
   * @param offset byte position to start reading from
   * @return the parsed records; an empty list when the file is missing,
   *         shorter than {@code offset}, unreadable, exhausted, when no
   *         complete record is available yet, or when a record does not fit
   *         into the largest allowed buffer. Never {@code null}.
   */
  @Override
  public List<String> GetNextBatchRecords(String file_name, Long offset) {
    RandomAccessFile file_reader = null;
    try {
      file_reader = new RandomAccessFile(file_name, "r");
      long file_length = file_reader.length();
      if (file_length < offset) {
        LOG.warn("File length small than read offset, truncate(rename)? offset="
            + offset + " file_length=" + file_length);
        return new ArrayList<String>();
      }
      file_reader.seek(offset.longValue());
      if (read_buffer_ == null) {
        // Configure() has not been called; fall back to the current size
        read_buffer_ = new byte[buffer_size_];
      }
      if (LOG.isDebugEnabled()) {
        // Object[] overload: three placeholders need three arguments
        LOG.debug("Random Read: file {}, start {}, buffer_size {}",
            new Object[] { file_name, offset, buffer_size_ });
      }
      int read_bytes = FillReadBuffer(file_reader);
      // read_bytes == -1 means the stream is at end of file
      List<String> records = ParseRecord(read_bytes);
      if (records == null) {
        // the buffer cannot grow any further; treat this as an error
        LOG.error("SUPER-WARNING, we meet super-huge line, which is larger than "
            + buffer_size_ + " bytes (max_read_buffer_size=" + max_buffer_size_
            + "), the following process will skip this file:" + file_name
            + " read_bytes=" + read_bytes + " offset=" + offset);
        return new ArrayList<String>();
      }
      return records;
    } catch (FileNotFoundException e) {
      LOG.warn("target monitor file not exist, " + file_name);
    } catch (IOException e) {
      LOG.error("Read file error due to " + e.getMessage(), e);
    } finally {
      if (file_reader != null) {
        try {
          file_reader.close();
        } catch (IOException e) {
          // nothing to recover here, but do not hide the failure
          LOG.warn("Close file " + file_name + " failed due to "
              + e.getMessage());
        }
      }
    }
    return new ArrayList<String>();
  }

  /**
   * Fills {@code read_buffer_} from the current position of the reader.
   * Keeps reading until the buffer is full or the end of the file is
   * reached, so that a short read is not mistaken for end of file.
   *
   * @return number of bytes placed in the buffer, or -1 if nothing could be
   *         read
   */
  private int FillReadBuffer(RandomAccessFile file_reader) throws IOException {
    int total = 0;
    while (total < read_buffer_.length) {
      int count = file_reader.read(read_buffer_, total, read_buffer_.length
          - total);
      if (count < 0) {
        break;
      }
      total += count;
    }
    return total == 0 ? -1 : total;
  }

  /**
   * Groups lines using both the first line and the last line pattern (see
   * the class comment for the four match combinations).
   *
   * @param lines lines of the current chunk
   * @param end_idx number of leading entries of {@code lines} to process
   */
  private List<String> ParseMIX(String lines[], int end_idx) {
    StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
    List<String> records = new ArrayList<String>();
    for (int i = 0; i < end_idx; i++) {
      boolean match_first = first_line_pattern_.matcher(lines[i]).matches();
      boolean match_last = last_line_pattern_.matcher(lines[i]).matches();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Process Line: " + lines[i]);
      }
      if (match_first) {
        if (record.length() > 0) {
          // a new record starts, so the pending one ended on the previous
          // line without a matching end line
          records.add(record.toString());
          if (LOG.isDebugEnabled()) {
            LOG.debug("MATCH START, Get a new record(which miss its end line):"
                + record.toString());
          }
        }
        record = new StringBuilder(RECORD_INIT_SIZE);
        if (match_last) {
          // first and last both match: the line is a complete record
          records.add(lines[i]);
          if (LOG.isDebugEnabled()) {
            LOG.debug("MATCH START-LAST, Get a new record:" + lines[i]);
          }
        } else {
          record.append(lines[i]);
        }
      } else if (match_last) {
        record.append(lines[i]);
        records.add(record.toString());
        if (LOG.isDebugEnabled()) {
          LOG.debug("MATCH LAST, Get a new record:" + record.toString());
        }
        record = new StringBuilder(RECORD_INIT_SIZE);
      } else {
        // a middle line; no separator is appended here (the original author
        // notes that StringUtil.Split is used so that none is needed)
        record.append(lines[i]);
      }
    }

    HandleSpecialSituation(record, records);
    return records;
  }

  /**
   * Groups lines using only the first line pattern: a matching line closes
   * the pending record and starts a new one.
   *
   * @param lines lines of the current chunk
   * @param end_idx number of leading entries of {@code lines} to process
   */
  private List<String> ParseFIRST(String[] lines, int end_idx) {
    StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
    List<String> records = new ArrayList<String>();
    for (int i = 0; i < end_idx; i++) {
      boolean match_first = first_line_pattern_.matcher(lines[i]).matches();
      if (match_first && record.length() > 0) {
        // the previous line was the end line of the pending record
        records.add(record.toString());
        if (LOG.isDebugEnabled()) {
          LOG.debug("MATCH first, get a new record: " + record.toString());
        }
        record = new StringBuilder(RECORD_INIT_SIZE);
      }
      record.append(lines[i]);
    }

    HandleSpecialSituation(record, records);
    return records;
  }

  /**
   * Groups lines using only the last line pattern: a matching line closes
   * the pending record. Lines after the last match are an incomplete record
   * and are left for a later round.
   *
   * @param lines lines of the current chunk
   * @param end_idx number of leading entries of {@code lines} to process
   */
  private List<String> ParseLAST(String[] lines, int end_idx) {
    StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
    List<String> records = new ArrayList<String>();
    for (int i = 0; i < end_idx; i++) {
      record.append(lines[i]);
      if (last_line_pattern_.matcher(lines[i]).matches()) {
        records.add(record.toString());
        if (LOG.isDebugEnabled()) {
          LOG.debug("MATCH last, get a new record" + record.toString());
        }
        record = new StringBuilder(RECORD_INIT_SIZE);
      }
    }
    return records;
  }

  /**
   * Doubles the read buffer.
   *
   * @return false, leaving the buffer untouched, when the doubled size would
   *         exceed {@code max_buffer_size_}; true otherwise
   */
  private boolean ExpandReadBuffer() {
    if (LOG.isDebugEnabled()) {
      LOG.debug("ExpandReadBuffer called, current buffer size:" + buffer_size_);
    }
    // computed as long so that doubling a large size cannot overflow
    long doubled = ((long) buffer_size_) << 1;
    if (doubled > max_buffer_size_) {
      return false;
    }
    buffer_size_ = (int) doubled;
    read_buffer_ = new byte[buffer_size_];
    return true;
  }

  /**
   * Decides what to do with the lines left in {@code record} after all lines
   * of the chunk were processed (FIRST and MIX mode).
   *
   * @param record the pending, not yet closed record
   * @param records records found so far; may get {@code record} appended
   */
  private void HandleSpecialSituation(StringBuilder record,
      List<String> records) {
    if (is_end_of_file_ && record.length() > 0) {
      // end of file: no further line will arrive in this chunk
      records.add(record.toString());
      if (LOG.isDebugEnabled()) {
        LOG.debug("end of file in First or mix mode, get a new record"
            + record.toString());
      }
    }

    /*
     * All lines were processed without producing a record: this is a very
     * large record of which only a piece is available. Once the piece
     * exceeds max_record_size_ it is emitted as a record of its own.
     */
    if (records.size() == 0 && record.length() > max_record_size_) {
      records.add(record.toString());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Big record, get a new record(part of) " + record.toString());
      }
    }
  }

  /**
   * Default strategy: every line is a record.
   *
   * @param lines lines of the current chunk
   * @param end_idx number of leading entries of {@code lines} to process
   */
  private List<String> ParseNONE(String[] lines, int end_idx) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("ParseNONE mode, handle line num" + lines.length);
    }
    List<String> records = new ArrayList<String>();
    for (int i = 0; i < end_idx; i++) {
      records.add(lines[i]);
    }
    return records;
  }

  /**
   * Turns the first {@code read_bytes} bytes of {@code read_buffer_} into
   * records.
   *
   * @param read_bytes number of valid bytes in the buffer, or -1 at end of
   *        file
   * @return the records found (possibly none), or {@code null} when a record
   *         was not found and the buffer cannot be expanded any further
   */
  private List<String> ParseRecord(int read_bytes) {
    if (-1 == read_bytes) {
      // no more data can be read
      return new ArrayList<String>();
    }

    // a buffer that was not filled means the end of the file was reached
    is_end_of_file_ = read_bytes < read_buffer_.length;

    String content = new String(read_buffer_, 0, read_bytes);
    /*
     * Very big line: the buffer holds only a part of one line. Grow the
     * buffer and let the caller retry from the same offset.
     */
    if (!is_end_of_file_ && content.indexOf(FlumeConstants.LINE_SEP) == -1) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("cannot find a linesep in this content, content_size: {}, content ={}",
            content.length(), content);
      }
      if (!ExpandReadBuffer()) {
        return null;
      }
      return new ArrayList<String>();
    }

    int last_line_sep = content.lastIndexOf(FlumeConstants.LINE_SEP);

    /*
     * StringUtil.Split is used instead of String.split: per the original
     * author's note it splits "123\n\n" into "123\n" and "\n", whereas
     * String.split yields only "123".
     */
    String[] lines = StringUtil.Split(content, FlumeConstants.LINE_SEP);

    /*
     * Unless the file ended, a chunk that does not end with a separator ends
     * with a partial line; leave that line for the next round.
     */
    int end_idx = lines.length;
    if (!is_end_of_file_ && last_line_sep != (content.length() - 1)) {
      end_idx = end_idx - 1;
    }

    List<String> records = null;
    switch (parse_type_) {
    case MIX:
      records = ParseMIX(lines, end_idx);
      break;
    case FIRST:
      records = ParseFIRST(lines, end_idx);
      break;
    case LAST:
      records = ParseLAST(lines, end_idx);
      break;
    default:
      records = ParseNONE(lines, end_idx);
      break;
    }

    if (!is_end_of_file_ && records.isEmpty()) {
      /*
       * The chunk holds only a part of one record. Returning an empty list
       * without changing anything would make the caller read the very same
       * chunk again and again, so the buffer has to grow. This is done
       * whatever the size of the chunk: the former additional check against
       * max_record_size_ left exactly that endless loop open (for instance
       * in LAST mode, which never emits a partial record).
       */
      if (LOG.isDebugEnabled()) {
        LOG.debug("read lines {}, but no record, expand buffer", lines.length);
      }
      if (!ExpandReadBuffer()) {
        return null;
      }
    }
    return records;
  }

  /**
   * Applies the include and exclude patterns to a record.
   *
   * @return true if the record must be dropped (it does not match the
   *         include pattern, or it matches the exclude pattern); false if it
   *         must be kept
   */
  public boolean ShouldDrop(String record) {
    Matcher in_matcher = record_include_pattern_.matcher(record);
    if (!in_matcher.matches()) {
      // not in white list (the default pattern matches everything)
      return true;
    }

    if (record_exclude_pattern_ != null) {
      Matcher ex_matcher = record_exclude_pattern_.matcher(record);
      if (ex_matcher.matches()) {
        // in black list
        return true;
      }
    }
    // in white list and not in black list
    return false;
  }

  /**
   * Reads buffer sizes, record filters and line patterns from the context
   * and derives the parse type. May be called again: every setting,
   * including an absent pattern, replaces the previous one.
   */
  @Override
  public void Configure(Context context) {
    LOG.info("Config MultiLineParser");
    buffer_size_ = context.getInteger("read_buffer_size",
        FlumeConstants.READ_BUFFER_SIZE).intValue();
    if (buffer_size_ <= 0) {
      // a buffer without capacity can never be filled nor doubled
      LOG.warn("Invalid read_buffer_size " + buffer_size_
          + ", use default " + FlumeConstants.READ_BUFFER_SIZE);
      buffer_size_ = FlumeConstants.READ_BUFFER_SIZE;
    }
    read_buffer_ = new byte[buffer_size_];
    max_buffer_size_ = context.getInteger("max_read_buffer_size",
        FlumeConstants.MAX_READ_BUFFER_SIZE).intValue();
    max_record_size_ = context.getInteger("max_record_size",
        FlumeConstants.MAX_RECORD_LENGH).intValue();

    file_content_include_pattern_str_ = context.getString(
        FlumeConstants.FILE_CONTENT_INCLUDE, DEFAULT_CONTENT_INCLUDE);
    file_content_exclude_pattern_str_ = context
        .getString(FlumeConstants.FILE_CONTENT_EXCLUDE);

    record_include_pattern_ = Pattern
        .compile(file_content_include_pattern_str_);
    // assign in both cases so that a pattern of a former configuration does
    // not survive
    record_exclude_pattern_ = file_content_exclude_pattern_str_ == null ? null
        : Pattern.compile(file_content_exclude_pattern_str_);

    first_line_pattern_str_ = context
        .getString(FlumeConstants.FIRST_LINE_PATTERN);
    last_line_pattern_str_ = context
        .getString(FlumeConstants.LAST_LINE_PATTERN);
    first_line_pattern_ = first_line_pattern_str_ == null ? null
        : Pattern.compile(first_line_pattern_str_);
    last_line_pattern_ = last_line_pattern_str_ == null ? null
        : Pattern.compile(last_line_pattern_str_);

    if (first_line_pattern_ != null && last_line_pattern_ != null) {
      parse_type_ = ParseType.MIX;
    } else if (first_line_pattern_ != null) {
      parse_type_ = ParseType.FIRST;
    } else if (last_line_pattern_ != null) {
      parse_type_ = ParseType.LAST;
    } else {
      parse_type_ = ParseType.NONE;
    }

    StringBuilder builder = new StringBuilder();
    builder.append("Config MultiLineParser with [");
    builder.append("read_buffer_size(init)=");
    builder.append(buffer_size_);
    builder.append(",max_buffer_size=");
    builder.append(max_buffer_size_);
    builder.append(",max_record_size=");
    builder.append(max_record_size_);
    builder.append(",first_line_pattern_str_=");
    builder.append(first_line_pattern_str_);
    builder.append(",last_line_pattern_str_=");
    builder.append(last_line_pattern_str_);
    builder.append(",file_content_include_pattern_str_=");
    builder.append(file_content_include_pattern_str_);
    builder.append(",record_include_pattern_=");
    builder.append(record_include_pattern_ == null ? "null"
        : record_include_pattern_.toString());
    builder.append(",file_content_exclude_pattern_str_=");
    builder.append(file_content_exclude_pattern_str_);
    builder.append(",record_exclude_pattern_=");
    builder.append(record_exclude_pattern_ == null ? "null"
        : record_exclude_pattern_.toString());
    builder.append(", parse_type=");
    builder.append("" + parse_type_);
    builder.append("]");
    LOG.info(builder.toString());
  }
}