├── test
    ├── all-tests
    ├── log4j.properties
    └── com
    │   └── minsheng
    │       └── flume
    │           └── source
    │               ├── TestSimpleFileMonitor.java
    │               ├── TestDirFileRecorder.java
    │               └── TestMultiLineParser.java
├── lib
    ├── junit-4.8.2.jar
    ├── guava-11.0.2.jar
    ├── log4j-1.2.16.jar
    ├── slf4j-api-1.6.1.jar
    ├── slf4j-log4j12-1.6.1.jar
    ├── flume-tools-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar
    ├── flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar
    └── flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar
├── flume_monitor_source使用说明文档.pdf
├── src
    └── com
    │   └── minsheng
    │       ├── flume
    │           └── source
    │           │   ├── FileParser.java
    │           │   ├── FileMonitor.java
    │           │   ├── DirectoryMonitorSource.java
    │           │   ├── FlumeConstants.java
    │           │   ├── FileInfo.java
    │           │   ├── FileMapReaderWriter.java
    │           │   ├── DirFileRecorder.java
    │           │   ├── SimpleFileMonitor.java
    │           │   └── MultiLineParser.java
    │       └── util
    │           ├── Time.java
    │           ├── StringUtil.java
    │           └── Shell.java
├── README.md
└── LICENSE.txt


/test/all-tests:
--------------------------------------------------------------------------------
1 | **/Test*.java
2 | 


--------------------------------------------------------------------------------
/lib/junit-4.8.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/junit-4.8.2.jar


--------------------------------------------------------------------------------
/lib/guava-11.0.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/guava-11.0.2.jar


--------------------------------------------------------------------------------
/lib/log4j-1.2.16.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/log4j-1.2.16.jar


--------------------------------------------------------------------------------
/lib/slf4j-api-1.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/slf4j-api-1.6.1.jar


--------------------------------------------------------------------------------
/lib/slf4j-log4j12-1.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/slf4j-log4j12-1.6.1.jar


--------------------------------------------------------------------------------
/flume_monitor_source使用说明文档.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/flume_monitor_source使用说明文档.pdf


--------------------------------------------------------------------------------
/lib/flume-tools-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-tools-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-irc-sink-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-core-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-node-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-sdk-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-hdfs-sink-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-jms-source-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-avro-source-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-file-channel-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-jdbc-channel-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-hbase-sink-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-scribe-source-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-thrift-source-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-twitter-source-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-configuration-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-embedded-agent-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-log4jappender-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-elasticsearch-sink-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/lib/flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpttlook/flume_monitor_source/HEAD/lib/flume-ng-morphline-solr-sink-1.4.0-cdh5.0.0-beta-1.jar


--------------------------------------------------------------------------------
/test/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=DEBUG,console
2 | log4j.appender.console=org.apache.log4j.ConsoleAppender
3 | log4j.appender.console.target=System.err
4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
5 | log4j.appender.console.layout.ConversionPattern=[%p] %d{dd MMM yyy} %m%n
6 | 


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/FileParser.java:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
 2 | // Author: peng.he.ia@gmail.com <he peng>
 3 | package com.minsheng.flume.source;
 4 | import java.util.List;
 5 | import org.apache.flume.Context;
 6 | 
 7 | /**
 8 |  * Contract for parsers that read string records out of a file.
 9 |  */
10 | public abstract class FileParser {
11 |   /**
12 |    * Configures this parser from a Flume context.
13 |    *
14 |    * @param context Flume context handed to the parser
15 |    */
16 |   public abstract void Configure(Context context);
17 | 
18 |   /**
19 |    * Returns the next batch of records for a file.
20 |    *
21 |    * @param file_name name of the file to read from
22 |    * @param offset presumably the position in the file at which reading
23 |    *        starts -- TODO confirm against the implementations
24 |    * @return the records of the batch, one string per record
25 |    */
26 |   public abstract List<String> GetNextBatchRecords(String file_name, Long offset);
27 | 
28 |   /**
29 |    * Decides whether a record should be dropped.
30 |    *
31 |    * @param record the record to check
32 |    * @return true if the record should be dropped
33 |    */
34 |   public abstract boolean ShouldDrop(String record);
35 | }


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/FileMonitor.java:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
 2 | // Author: peng.he.ia@gmail.com <he peng>
 3 | package com.minsheng.flume.source;
 4 | import java.util.Map;
 5 | import java.util.concurrent.ConcurrentHashMap;
 6 | 
 7 | import org.apache.flume.Context;
 8 | 
 9 | public abstract class FileMonitor {
10 |   public FileMonitor() {
11 |   }
12 |   
13 |   public abstract void Configure(Context context);
14 |   
15 |   public abstract void Start();
16 |   
17 |   public abstract void Stop();
18 |   
19 |   public abstract Map<Integer, FileInfo> GetLatestFileInfo(
20 |       Map<Integer, FileInfo> file_map_with_latest_offet);
21 |   
22 |   public abstract String GetMonitorDir();
23 | }


--------------------------------------------------------------------------------
/src/com/minsheng/util/Time.java:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
 2 | // Author: peng.he.ia@gmail.com <he peng>
 3 | package com.minsheng.util;
 4 | 
 5 | 
 6 | /**
 7 |  * Static helpers for reading the current time in milliseconds.
 8 |  */
 9 | public final class Time {
10 | 
11 |   /** How many nanoseconds make up one millisecond. */
12 |   private static final long NANOS_PER_MILLI = 1000000L;
13 | 
14 |   /**
15 |    * Wall-clock time taken from System.currentTimeMillis(). Because the system
16 |    * clock can be reset (e.g. by settimeofday), do not derive durations or
17 |    * sleep intervals from it; monotonicNow is the right tool for that.
18 |    * @return current time in msec.
19 |    */
20 |   public static long now() {
21 |     final long wall_clock_msec = System.currentTimeMillis();
22 |     return wall_clock_msec;
23 |   }
24 | 
25 |   /**
26 |    * Milliseconds elapsed since an arbitrary, fixed point in the past, derived
27 |    * from System.nanoTime(). The value is unaffected by settimeofday or similar
28 |    * system clock changes, so it is suitable for computing how much longer to
29 |    * wait for an interval to expire.
30 |    * @return a monotonic clock that counts in milliseconds.
31 |    */
32 |   public static long monotonicNow() {
33 |     final long elapsed_nanos = System.nanoTime();
34 |     return elapsed_nanos / NANOS_PER_MILLI;
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/com/minsheng/util/StringUtil.java:
--------------------------------------------------------------------------------
 1 | package com.minsheng.util;
 2 | import java.util.ArrayList;
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | public class StringUtil {
 7 |   public static String[] Split(String content, String sub_seq) {
 8 |     int start_index = 0;
 9 |     ArrayList<String> ret = new ArrayList<String>();
10 | 
11 |     int pos = -1; 
12 |     while (start_index < content.length() && 
13 |         (pos = content.indexOf(sub_seq, start_index)) != -1) {
14 |       ret.add(content.substring(start_index, pos+sub_seq.length()));
15 |       start_index = pos + sub_seq.length();
16 |     }
17 |     if (start_index < content.length()) {
18 |       ret.add(content.substring(start_index));
19 |     } 
20 |     String[] result = new String[ret.size()];
21 |     return ret.toArray(result);
22 |   }
23 |   
24 |   
25 |   public static String[] SplitAndTrim(Pattern pat, CharSequence input) {
26 |     int index = 0;
27 |     ArrayList<String> matchList = new ArrayList<String>();
28 |     Matcher m = pat.matcher(input);
29 | 
30 |     while (m.find()) {
31 |       String match = input.subSequence(index, m.start()).toString();
32 |       if (!match.trim().isEmpty())
33 |         matchList.add(match);
34 |       index = m.end();
35 |     }
36 | 
37 |     // If no match was found, return this
38 |     if (index == 0)
39 |         return new String[] {input.toString()};
40 |     matchList.add(input.subSequence(index, input.length()).toString());
41 |     int resultSize = matchList.size();
42 |     String[] result = new String[resultSize];
43 |     return matchList.subList(0, resultSize).toArray(result);
44 | }
45 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | flume_monitor_source
 2 | ====================
 3 | # What is 
 4 | flume_monitor_source is a flume source plug-in for monitoring files under a specified directory. It is different from the 'spooldir' source in flume in the following ways:
 5 | 1. The flume_monitor_source can incrementally read data from the specified directory in real time, which means files under that directory may still be written to (append-only). This is not supported by the 'spooldir' source;
 6 | 2. The flume_monitor_source can handle multi-line content such as a Java call stack or exception as ONE complete, understandable record, while Flume can only handle one line at a time;
 7 | 3. The flume_monitor_source resumes processing each file from the point it had reached when it was last stopped.
 8 | 
 9 | ---
10 | 
11 | # How to use it
12 | 
13 | * Build the jar
14 | 
15 | ```
16 |    ant jar
17 | ```
18 | 
19 | * Copy jar to lib of flume
20 | 
21 | ```
22 |    cp dist/flume-monitor-source-0.1.jar ${FLUME_HOME}/lib
23 | ```
24 | 
25 | # Configure the source
26 | 
27 | ## Prerequisites
28 | [FlumeUserGuide]: http://flume.apache.org/FlumeUserGuide.html
29 |    You are expected to know how to use Flume. See the [Flume Documentation][FlumeUserGuide].
30 |    
31 | ## Parameter List
32 |   
33 |    **Property Name**  | **default**  | **Description** 
34 |    :--------------- | :--------- | :---------------------------------
35 |    *type*           |  -         | The component type name, needs to be com.minsheng.flume.source.DirectoryMonitorSource
36 |    *monitor_dir*    |  -         | Required. The directory whose files will be monitored. Files satisfying the conditions will be parsed and sent to the Flume channel
37 |    *meta_store_dir* | 
38 |    
39 |    
40 | 
41 | 


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/DirectoryMonitorSource.java:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
 2 | // Author: peng.he.ia@gmail.com <he peng>
 3 | package com.minsheng.flume.source;
 4 | 
 5 | import java.util.Map;
 6 | 
 7 | import org.apache.flume.Context;
 8 | import org.apache.flume.EventDrivenSource;
 9 | import org.apache.flume.conf.Configurable;
10 | import org.apache.flume.instrumentation.SourceCounter;
11 | import org.apache.flume.source.AbstractSource;
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 | 
15 | /**
16 |  * Flume source that hands configuration, start and stop over to a
17 |  * DirFileRecorder and keeps a SourceCounter for accepted events.
18 |  */
19 | public class DirectoryMonitorSource extends AbstractSource
20 |     implements EventDrivenSource, Configurable {
21 |   private static final Logger LOG =
22 |       LoggerFactory.getLogger(DirectoryMonitorSource.class);
23 | 
24 |   // Created on first use instead of in the constructor: the component name
25 |   // returned by getName() is only assigned after construction, so a counter
26 |   // built in the constructor would carry a null name.
27 |   private SourceCounter sourceCounter_;
28 | 
29 |   DirFileRecorder dir_recorder_ = null;
30 | 
31 |   public DirectoryMonitorSource() {
32 |     dir_recorder_ = new DirFileRecorder(this);
33 |   }
34 | 
35 |   /**
36 |    * Returns the source counter, creating it with the current component name
37 |    * the first time it is needed.
38 |    */
39 |   private SourceCounter EnsureSourceCounter() {
40 |     if (sourceCounter_ == null) {
41 |       sourceCounter_ = new SourceCounter(getName());
42 |     }
43 |     return sourceCounter_;
44 |   }
45 | 
46 |   /**
47 |    * Adds event_size to the counter of accepted events.
48 |    *
49 |    * @param event_size amount to add to the accepted-event count
50 |    */
51 |   public void UpdateSourceCounter(long event_size) {
52 |     EnsureSourceCounter().addToEventAcceptedCount(event_size);
53 |   }
54 | 
55 |   /**
56 |    * Logs every configuration parameter at debug level, makes sure the
57 |    * counter exists and forwards the context to the recorder.
58 |    */
59 |   @Override
60 |   public void configure(Context context) {
61 |     if (LOG.isDebugEnabled()) {
62 |       for (Map.Entry<String, String> entry :
63 |           context.getParameters().entrySet()) {
64 |         LOG.debug("*****key=" + entry.getKey() + " value=" + entry.getValue());
65 |       }
66 |     }
67 |     EnsureSourceCounter();
68 |     dir_recorder_.Configure(context);
69 |   }
70 | 
71 |   /**
72 |    * Starts the counter and the recorder, then lets AbstractSource record
73 |    * the started lifecycle state.
74 |    */
75 |   @Override
76 |   public void start() {
77 |     EnsureSourceCounter().start();
78 |     dir_recorder_.Start();
79 |     super.start();
80 |   }
81 | 
82 |   /**
83 |    * Stops the recorder and the counter, then lets AbstractSource record
84 |    * the stopped lifecycle state.
85 |    */
86 |   @Override
87 |   public void stop() {
88 |     dir_recorder_.Stop();
89 |     EnsureSourceCounter().stop();
90 |     super.stop();
91 |   }
92 | }


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/FlumeConstants.java:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
 2 | // Author: peng.he.ia@gmail.com <he peng>
 3 | package com.minsheng.flume.source;
 4 | 
 5 | public class FlumeConstants {
 6 |   public static String DIR_SEP = "/";
 7 |   public static String LINE_SEP = "\n";
 8 |   // \s means any space character('\t','\n',' ', '\f', '\r')
 9 |   public static String AUTO_DELETE_LINE_DEILMITER = "delete_line_delimiter";
10 |   public static String SHELL_RESULT_REGEX = "\\s+";
11 |   public static String INTEGER_REGEX = "[0-9]+";
12 |   public static int SHELL_RESULT_FIELD_NUM = 6;
13 |   // records > 10MB  are cut into pieces
14 |   public static int MAX_RECORD_LENGH = 1024 * 1024 * 10;
15 |   public static int READ_BUFFER_SIZE = 1024 * 1024 * 1; // 2MB
16 |   // we assume all records is smaller than 20MB, if we meet such a record
17 |   // the program just skip to process
18 |   public static int MAX_READ_BUFFER_SIZE = 1024 * 1024 * 20;  
19 |   public static String FILE_CHECK_INTERVAL = "file_check_interval_sec";
20 |   public static String FILE_SEND_INTERVAL = "file_send_interval_sec";
21 |   public static String FILE_NAME_INCLUDE = "file_name_include_pattern";
22 |   public static String FILE_NAME_EXCLUDE = "file_name_exclude_pattern";
23 |   public static String FIRST_LINE_PATTERN = "first_line_pattern";
24 |   public static String LAST_LINE_PATTERN = "last_line_pattern";
25 |   public static String FILE_CONTENT_INCLUDE = "file_content_include_pattern";
26 |   public static String FILE_CONTENT_EXCLUDE = "file_content_exclude_pattern";
27 |   public static String META_STORE_DIR = "meta_store_dir";
28 |   public static String MONITOR_DIR = "monitor_dir";
29 |   public static String SHELL_COMMAND[] = {"ls", "-il", "-o", "-g", 
30 |                                           "--time-style=+%m", "TARGET_DIR"};
31 |   static String[] GetShellCommand(String monitor_dir) {
32 |     SHELL_COMMAND[SHELL_COMMAND.length - 1] = monitor_dir;
33 |     return SHELL_COMMAND.clone();
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/FileInfo.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | 
  5 | public class FileInfo {
  6 |   public static String FIELD_SEPERATOR = "\t";
  7 |   public static int NUM_FIELDS = 4;
  8 |   public static int ID_INDEX = 0;
  9 |   public static int FILE_LENGTH_INDEX = 1;
 10 |   public static int OFFSET_INDEX = 2;
 11 |   public static int FILE_NAME_INDEX = 3;
 12 |   public static int LIFE_SPAN = 200;
 13 |   
 14 |   private String file_name_ = null;
 15 |   private Long file_length_ ;
 16 |   private Long offset_;  
 17 |   // file_tag is the unique-identifier of a file,it may be the inode
 18 |   // in linux sysmte or the hash value the absolute file name;
 19 |   private Integer id_;
 20 |   
 21 |   // when file under monitor directory is deleted, keep delete_delay_round
 22 |   // then delete this meta
 23 |   private int life_span_ = LIFE_SPAN;
 24 |   
 25 |   public static Integer GetIdFromName(String file_name) {
 26 |     return new Integer(file_name.hashCode());
 27 |   }
 28 |   
 29 |   public FileInfo() {
 30 |     file_name_ = "";
 31 |     file_length_ = 0L;
 32 |     offset_ = 0L;
 33 |     id_ = 0;
 34 |   }
 35 |   
 36 |   public FileInfo(String name, Long length, Long offset) {
 37 |     this.file_name_ = name;
 38 |     this.file_length_ = length;
 39 |     this.offset_ = offset;
 40 |     this.id_ = GetIdFromName(file_name_);
 41 |   }
 42 |   
 43 |   public FileInfo(String name, Long length, Long offset, Integer id) {
 44 |     this.file_name_ = name;
 45 |     this.file_length_ = length;
 46 |     this.offset_ = offset;
 47 |     this.id_ = id;
 48 |   }
 49 |   
 50 |   public void DecLifeSpan() {
 51 |     this.life_span_ -= 1;
 52 |   }
 53 |   
 54 |   public int get_life_span() {
 55 |     return this.life_span_;
 56 |   }
 57 |   
 58 |   public String get_file_name() {
 59 |     return this.file_name_;
 60 |   }
 61 |   
 62 |   public Long get_file_length() {
 63 |     return this.file_length_;
 64 |   }
 65 |   
 66 |   public Long get_offset() {
 67 |     return offset_;
 68 |   }
 69 |   
 70 |   public Integer get_id() {
 71 |     return id_;
 72 |   }
 73 |   
 74 |   public void set_file_name(String name) {
 75 |     file_name_ = name;
 76 |   }
 77 |   
 78 |   public void set_file_length(Long len) {
 79 |     file_length_ = len;
 80 |   }
 81 |   
 82 |   public void set_offset(Long offset) {
 83 |     offset_ = offset;
 84 |   }
 85 |   
 86 |   public void set_id(Integer id) {
 87 |     id_ = id;
 88 |   }
 89 |   
 90 |   public String toString() {
 91 |     StringBuilder builder = new StringBuilder();
 92 |     builder.append("inode=");
 93 |     builder.append(id_);
 94 |     builder.append(FIELD_SEPERATOR);
 95 |     builder.append("length=");
 96 |     builder.append(file_length_);
 97 |     builder.append(FIELD_SEPERATOR);
 98 |     builder.append("offset=");
 99 |     builder.append(offset_);
100 |     builder.append(FIELD_SEPERATOR);
101 |     builder.append("file_name=");
102 |     builder.append(file_name_);
103 |     return builder.toString();
104 |   }
105 |   
106 |   
107 |   public String GetWriteString() {
108 |     StringBuilder builder = new StringBuilder();
109 |     builder.append(id_);
110 |     builder.append(FIELD_SEPERATOR);
111 |     builder.append(file_length_);
112 |     builder.append(FIELD_SEPERATOR);
113 |     builder.append(offset_);
114 |     builder.append(FIELD_SEPERATOR);
115 |     builder.append(file_name_);
116 |     return builder.toString();
117 |   }
118 |   
119 |   public int hashCode() {
120 |     return id_;
121 |   }
122 | }


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/FileMapReaderWriter.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | 
  5 | import java.util.Map;
  6 | import java.io.BufferedReader;
  7 | import java.io.File;
  8 | import java.io.FileInputStream;
  9 | import java.io.FileNotFoundException;
 10 | import java.io.IOException;
 11 | import java.io.InputStreamReader;
 12 | import java.io.PrintWriter;
 13 | 
 14 | import org.slf4j.Logger;
 15 | import org.slf4j.LoggerFactory;
 16 | 
 17 | import com.google.common.base.Preconditions;
 18 | 
 19 | /**
 20 |  * Stores a map of FileInfo entries in a text file and loads it back.
 21 |  * The file holds one tab-separated record per line; lines starting with
 22 |  * '#' are skipped when loading.
 23 |  */
 24 | public class FileMapReaderWriter {
 25 |   private static final Logger LOG = LoggerFactory
 26 |       .getLogger(FileMapReaderWriter.class);
 27 |   // Header comment written as the first line of the meta file; it names the
 28 |   // fields in the order FileInfo.GetWriteString() emits them.
 29 |   private static String format_string_ = "# ${inode}\t${length}\t${offset}\t${file_name}";
 30 |   private String file_name_;
 31 | 
 32 |   public FileMapReaderWriter() {
 33 |     file_name_ = null;
 34 |   }
 35 | 
 36 |   /**
 37 |    * Sets the path of the meta file.
 38 |    *
 39 |    * @param name path of the meta file
 40 |    * @throws IllegalStateException if name is null or empty
 41 |    */
 42 |   public void Configure(String name) {
 43 |     Preconditions.checkState(null != name && !name.isEmpty(),
 44 |         "Map meta record file must be specified");
 45 |     file_name_ = name;
 46 |     LOG.info("map_record_meta_file=" + file_name_);
 47 |   }
 48 | 
 49 |   /**
 50 |    * Fills file_info from one record line.
 51 |    *
 52 |    * @param line tab-separated record with FileInfo.NUM_FIELDS fields
 53 |    * @param file_info entry to fill; it may already be partially updated
 54 |    *        when false is returned because a number could not be parsed
 55 |    * @return true if the line was parsed completely
 56 |    * @throws IllegalStateException if Configure has not been called
 57 |    */
 58 |   public boolean ParseLine(String line, FileInfo file_info) {
 59 |     Preconditions.checkState(file_name_ != null,
 60 |         "Plz call Configure to initialize before call other functions");
 61 |     String eles[] = line.split(FileInfo.FIELD_SEPERATOR);
 62 |     if (eles.length != FileInfo.NUM_FIELDS) {
 63 |       LOG.warn("Invalid record line:" + line);
 64 |       return false;
 65 |     }
 66 |     try {
 67 |       file_info.set_file_name(eles[FileInfo.FILE_NAME_INDEX]);
 68 |       file_info.set_file_length(Long.valueOf(eles[FileInfo.FILE_LENGTH_INDEX]));
 69 |       file_info.set_offset(Long.valueOf(eles[FileInfo.OFFSET_INDEX]));
 70 |       file_info.set_id(Integer.valueOf(eles[FileInfo.ID_INDEX]));
 71 |     } catch (NumberFormatException e) {
 72 |       LOG.warn("Invalid line:" + line);
 73 |       return false;
 74 |     }
 75 |     return true;
 76 |   }
 77 | 
 78 |   /**
 79 |    * Logs the size of the map and every entry at debug level.
 80 |    */
 81 |   public static void PrintMap(Map<Integer, FileInfo> file_info_map) {
 82 |     int cnter = 0;
 83 |     LOG.debug("Total num file in file_map:" + file_info_map.size());
 84 |     for (FileInfo file_info : file_info_map.values()) {
 85 |       LOG.debug("idx = " + cnter + " info = " + file_info.toString());
 86 |       // Advance the index so each entry is logged with its own position.
 87 |       cnter++;
 88 |     }
 89 |   }
 90 | 
 91 |   /**
 92 |    * Reads the meta file and puts every valid record into file_info_map,
 93 |    * keyed by the record's id. A missing file is not an error; invalid
 94 |    * lines are logged and skipped.
 95 |    */
 96 |   public synchronized void LoadMap(Map<Integer, FileInfo> file_info_map) {
 97 |     LOG.info("LoadFileMap from file: " + file_name_);
 98 |     BufferedReader file_reader = null;
 99 |     try {
100 |       file_reader = new BufferedReader(new InputStreamReader(
101 |           new FileInputStream(file_name_)));
102 |       String line = null;
103 |       while ((line = file_reader.readLine()) != null) {
104 |         if (line.startsWith("#")) {
105 |           continue;
106 |         }
107 |         FileInfo file_info = new FileInfo();
108 |         if (ParseLine(line, file_info)) {
109 |           file_info_map.put(file_info.get_id(), file_info);
110 |         } else {
111 |           LOG.warn("LoadMap invalid line, parse error: " + line);
112 |         }
113 |       }
114 |       PrintMap(file_info_map);
115 |     } catch (FileNotFoundException e) {
116 |       LOG.info("Map record file not exist, skip loading " + file_name_);
117 |     } catch (Exception e) {
118 |       LOG.warn("Map record file read error due to " + e.toString(), e);
119 |     } finally {
120 |       if (null != file_reader) {
121 |         try {
122 |           file_reader.close();
123 |         } catch (IOException e) {
124 |           LOG.warn("close file exception, " + e.toString());
125 |         }
126 |       }
127 |     }
128 |   }
129 | 
130 |   /**
131 |    * Rewrites the meta file with a header line followed by one record per
132 |    * entry of file_info_map. Failures are logged, not thrown.
133 |    */
134 |   public synchronized void WriteMap(Map<Integer, FileInfo> file_info_map) {
135 |     PrintWriter file_writter = null;
136 |     int cnter = 0;
137 |     try {
138 |       // Create the parent directory if it does not exist. getParentFile()
139 |       // is null when the path has no directory part; there is nothing to
140 |       // create then and the file can still be written.
141 |       File parent_dir = new File(file_name_).getParentFile();
142 |       if (parent_dir != null) {
143 |         parent_dir.mkdirs();
144 |       }
145 |       file_writter = new PrintWriter(file_name_);
146 |       file_writter.println(format_string_);
147 |       synchronized (file_info_map) {
148 |         for (FileInfo file_info : file_info_map.values()) {
149 |           file_writter.println(file_info.GetWriteString());
150 |           cnter++;
151 |         }
152 |       }
153 |       // PrintWriter does not throw on write errors; it only raises a flag
154 |       // that has to be queried explicitly.
155 |       if (file_writter.checkError()) {
156 |         LOG.warn("Write map meta failed due to I/O error on " + file_name_);
157 |       }
158 |     } catch (FileNotFoundException e) {
159 |       LOG.warn("File not found, you should never see this");
160 |     } catch (Exception e) {
161 |       LOG.warn("Write map meta failed due to " + e.toString(), e);
162 |     } finally {
163 |       if (null != file_writter) {
164 |         file_writter.close();
165 |       }
166 |       LOG.info("Write \'" + cnter + "\' records to map meta file" + file_name_ );
167 |     }
168 |   }
169 | 
170 | }


--------------------------------------------------------------------------------
/test/com/minsheng/flume/source/TestSimpleFileMonitor.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | import java.io.File;
  5 | import java.io.IOException;
  6 | import java.util.Map;
  7 | import java.util.HashMap;
  8 | import java.util.concurrent.*;
  9 | import java.util.regex.Pattern;
 10 | import java.lang.*;
 11 | 
 12 | import junit.framework.Assert;
 13 | import static org.junit.Assert.*;
 14 | 
 15 | import org.junit.Before;
 16 | import org.junit.After;
 17 | import org.junit.Test;
 18 | import org.apache.flume.Context;
 19 | 
 20 | import com.minsheng.flume.source.FileInfo;
 21 | import com.minsheng.flume.source.FlumeConstants;
 22 | import com.minsheng.flume.source.SimpleFileMonitor;
 23 | 
 24 | import org.slf4j.Logger;
 25 | import org.slf4j.LoggerFactory;
 26 | 
 27 | public class TestSimpleFileMonitor {
 28 |   private static final Logger LOG = LoggerFactory
 29 |       .getLogger(TestSimpleFileMonitor.class);
 30 |  
 31 |   private int file_num_ = 3;
 32 |   private String target_dir_;
 33 |   
 34 |   private String prefix_ = "file";
 35 |   private String include_suffix_ = ".test";
 36 |   private String include_file_regex_ = prefix_ + ".*" + include_suffix_;
 37 |   
 38 |   private String date_str_ = ".2013-01-17-12";
 39 |   private String exclude_suffix_ = include_suffix_ + date_str_;
 40 |   private String exclude_file_regex_ = prefix_ + ".*" + exclude_suffix_;
 41 |   
 42 |   private Pattern default_include_pattern_ = 
 43 |       Pattern.compile(".*");
 44 |   
 45 |   private Pattern full_exclude_pattern_ =
 46 |       Pattern.compile(exclude_file_regex_);
 47 |   
 48 |   Context default_context_;
 49 |   Context full_context_;
 50 |   
 51 |   SimpleFileMonitor default_monitor_;
 52 |   SimpleFileMonitor full_monitor_;
 53 |   
 54 |   
 55 |   @Before
 56 |   public void SetUp() {
 57 |     target_dir_ = "/tmp/ms_flume/monitor/";
 58 |     File file_target_dir = new File(target_dir_);
 59 |     LOG.info("Create test target directory: " + target_dir_);
 60 |     Assert.assertTrue("Create target dir failed", file_target_dir.mkdirs());
 61 |     
 62 |     LOG.info("Create child directory");
 63 |     for (int i = 0; i < file_num_; i++) {
 64 |       File child_dir = new File(target_dir_ +"/" + i);
 65 |       LOG.info("\tCreate  directory:" + child_dir.toString());
 66 |       Assert.assertTrue("Create child dir failed", child_dir.mkdirs());
 67 |     }
 68 |     
 69 |     try {
 70 |       LOG.info("Create include file in target directory");
 71 |       for (int i = 0; i < file_num_; i++) {
 72 |         File test_file = File.createTempFile(prefix_ + i, 
 73 |             include_suffix_, file_target_dir);
 74 |         LOG.info("\tCreate  include file:" + test_file.toString());
 75 |         Assert.assertTrue("Create failed for test file " + test_file.getName(),
 76 |             test_file.exists());
 77 |       }
 78 | 
 79 |       System.out
 80 |           .println("Create exclude file(start with '.') in target directory");
 81 |       for (int i = 0; i < file_num_; i++) {
 82 |         File test_file = File
 83 |             .createTempFile("." + prefix_ + i, include_suffix_, file_target_dir);
 84 |         LOG.info("\tCreate  hidden file:" + test_file.toString());
 85 |         Assert.assertTrue("Create failed for test file " + test_file.getName(),
 86 |             test_file.exists());
 87 |       }
 88 | 
 89 |       System.out
 90 |           .println("Create exclude file(with date suffix) in target directory");
 91 |       for (int i = 0; i < file_num_; i++) {
 92 |         File test_file = File.createTempFile(prefix_ + i,
 93 |             exclude_suffix_, file_target_dir);
 94 |         LOG.info("\tCreate  exclude file:" + test_file.toString());
 95 |         Assert.assertTrue("Create failed for test file " + test_file.getName(),
 96 |             test_file.exists());
 97 |       }
 98 |     } catch (IOException e) {
 99 |       LOG.info("IOException: " + e.getMessage());
100 |     }
101 |     
102 |     LOG.info("\n*****Create default flume context(only specify target dir)");
103 |     Map<String, String> params = new HashMap<String, String>();
104 |     params.put(FlumeConstants.MONITOR_DIR, target_dir_);
105 |     default_context_ = new Context(params);
106 |     
107 |     for (Map.Entry<String, String> s : params.entrySet()) {
108 |       LOG.info("key=" + s.getKey()
109 |           + " value=" + s.getValue());
110 |     }
111 |     
112 |     LOG.info("\n*****Create full flume context");
113 |     params.put(FlumeConstants.FILE_CHECK_INTERVAL, "3");
114 |     params.put(FlumeConstants.FILE_NAME_INCLUDE, include_file_regex_);
115 |     params.put(FlumeConstants.FILE_NAME_EXCLUDE, exclude_file_regex_);
116 | 
117 |     for (Map.Entry<String, String> s : params.entrySet()) {
118 |       LOG.info("key=" + s.getKey()
119 |           + " value=" + s.getValue());
120 |     }
121 |     full_context_ = new Context(params);
122 |   }
123 | 
124 |   @Test
125 |   public void TestDefaultMonitor() {
126 |     LOG.info("Start test default action at" + target_dir_);
127 |     default_monitor_ = new SimpleFileMonitor();
128 |     default_monitor_.Configure(default_context_);
129 |     default_monitor_.Start();
130 |     
131 |     try {
132 |       LOG.info("Slepp 5 sec for monitor to update");
133 |       Thread.sleep(5000L);
134 |     } catch (InterruptedException e) {
135 |       LOG.info("Sleep interrupted.");
136 |     }
137 |     
138 |     Map<Integer, FileInfo> my_map = new ConcurrentHashMap<Integer, FileInfo>();
139 |     Map<Integer, FileInfo> new_map = 
140 |         default_monitor_.GetLatestFileInfo(my_map);
141 |     LOG.info("Total valid file num: " + new_map.size());
142 |     Assert.assertTrue(new_map.size() == (file_num_ * 2));
143 |     for (FileInfo file_info : new_map.values()) {
144 |       LOG.info(file_info.toString());
145 |       Assert.assertTrue("include ilega files", 
146 |           default_include_pattern_.matcher(file_info.get_file_name()).matches());
147 |     }
148 |     default_monitor_.Stop();
149 |   }
150 |   
151 |   @Test
152 |   public void TestFullMonitor() {
153 |     LOG.info("Start test default action at" + target_dir_);
154 |     full_monitor_ = new SimpleFileMonitor();
155 |     full_monitor_.Configure(full_context_);
156 |     full_monitor_.Start();
157 |     
158 |     try {
159 |       LOG.info("Slepp 5 sec for monitor to update");
160 |       Thread.sleep(5000L);
161 |     } catch (InterruptedException e) {
162 |       LOG.info("Sleep interrupted.");
163 |     }
164 |     
165 |     Map<Integer, FileInfo> my_map = new ConcurrentHashMap<Integer, FileInfo>();
166 |     Map<Integer, FileInfo> new_map = 
167 |         full_monitor_.GetLatestFileInfo(my_map);
168 |     LOG.info("Total valid file num: " + new_map.size());
169 |     Assert.assertTrue(new_map.size() == (file_num_));
170 |     for (FileInfo file_info : new_map.values()) {
171 |       LOG.info(file_info.toString());
172 |       Assert.assertTrue("include files not in include pattern",
173 |               default_include_pattern_.matcher(file_info.get_file_name())
174 |                   .matches());
175 |       Assert.assertFalse("some file should be excludesd", full_exclude_pattern_
176 |           .matcher(file_info.get_file_name()).matches());
177 |     }
178 |     full_monitor_.Stop();
179 |   }
180 |   
181 |   public void RecursiveDelete(File file) {
182 |     if (file.isDirectory()) {
183 |       for (File f : file.listFiles()) {
184 |         RecursiveDelete(f);
185 |       }
186 |       file.delete();
187 |     } else {
188 |       file.delete();
189 |     }
190 |   }
191 | 
192 |   @After
193 |   public void CleanUp() {
194 |     File file = new File(target_dir_);
195 |     LOG.info("Clean target dir");
196 |     RecursiveDelete(file);
197 |     Assert.assertFalse(file.exists());
198 |   }
199 | }
200 | 


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/DirFileRecorder.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | 
  5 | 
  6 | import java.io.File;
  7 | import java.io.IOException;
  8 | import java.util.HashMap;
  9 | import java.util.List;
 10 | import java.util.Map;
 11 | import java.util.regex.Matcher;
 12 | import java.util.regex.Pattern;
 13 | import java.util.regex.PatternSyntaxException;
 14 | import java.util.concurrent.Executors;
 15 | import java.util.concurrent.ExecutorService;
 16 | import java.util.concurrent.Future;
 17 | import java.util.concurrent.ConcurrentHashMap;
 18 | import java.util.concurrent.ScheduledExecutorService;
 19 | import java.util.concurrent.ScheduledFuture;
 20 | import java.util.concurrent.TimeUnit;
 21 | 
 22 | import org.apache.flume.Event;
 23 | import org.apache.flume.event.EventBuilder;
 24 | import org.apache.flume.Context;
 25 | import org.apache.flume.event.EventBuilder;
 26 | import org.slf4j.Logger;
 27 | import org.slf4j.LoggerFactory;
 28 | 
 29 | import com.google.common.base.Preconditions;
 30 | 
 31 | /**
 32 |  * Periodically sends the new content of the files found in one monitored
 33 |  * directory to the flume channel and persists the per-file read offsets
 34 |  * through a FileMapReaderWriter.
 35 |  *
 36 |  * Expected call order: Configure, Start, Stop.
 37 |  */
 38 | public class DirFileRecorder {
 39 |   private static final Logger LOG = LoggerFactory
 40 |       .getLogger(DirFileRecorder.class);
 41 | 
 42 |   // how long Stop() waits for a send round that is still running
 43 |   private static final long STOP_WAIT_SECONDS = 10L;
 44 | 
 45 |   // id --> FileInfo, id may be either the inode in fs or the hash value of
 46 |   // file name(FileInfo.GetIdFromName).
 47 |   // volatile: replaced by the sender thread, read by the caller of Stop()
 48 |   private volatile Map<Integer, FileInfo> file_info_map_ = null;
 49 |   private FileMonitor file_monitor_ = null;
 50 |   private FileMapReaderWriter reader_writer_ = null;
 51 |   private FileParser file_parser_ = null;
 52 |   private boolean auto_delete_line_delimiter_ = false;
 53 | 
 54 |   // check and send content every 3 second
 55 |   private Long send_interval_ = 3L;
 56 |   private String meta_store_file_  = "";
 57 | 
 58 |   DirectoryMonitorSource monitor_source_ = null;
 59 | 
 60 |   private ScheduledExecutorService executor_service_ = null;
 61 |   private Runnable sender_runnable_ = null;
 62 |   private ScheduledFuture<?> sender_future_ = null;
 63 | 
 64 |   // Creates the collaborators; source is the flume source events go through.
 65 |   public DirFileRecorder(DirectoryMonitorSource source) {
 66 |     LOG.info("Init DirFileRecorder");
 67 |     file_info_map_ = new ConcurrentHashMap<Integer, FileInfo>();
 68 |     file_monitor_ = new SimpleFileMonitor();
 69 |     reader_writer_ = new FileMapReaderWriter();
 70 |     file_parser_ = new MultiLineParser();
 71 |     monitor_source_ = source;
 72 |   }
 73 | 
 74 |   /**
 75 |    * Reads the settings from the flume context and prepares the scheduler.
 76 |    * The meta file lives in META_STORE_DIR (default "./meta/") and is named
 77 |    * after the hash code of the monitored directory.
 78 |    */
 79 |   public void Configure(Context context) {
 80 |     LOG.info("Configure DirFileRecorder.");
 81 |     file_monitor_.Configure(context);
 82 |     String meta_dir = context.getString(FlumeConstants.META_STORE_DIR,
 83 |         "./meta/");
 84 |     send_interval_ = context.getLong(FlumeConstants.FILE_SEND_INTERVAL, 3L);
 85 |     String tmp_meta_store_file = meta_dir + FlumeConstants.DIR_SEP +
 86 |                        file_monitor_.GetMonitorDir().hashCode();
 87 |     File tmp_file = new File(tmp_meta_store_file);
 88 |     meta_store_file_ = tmp_file.getAbsolutePath();
 89 | 
 90 |     auto_delete_line_delimiter_ =
 91 |         context.getBoolean(FlumeConstants.AUTO_DELETE_LINE_DEILMITER, false);
 92 | 
 93 |     reader_writer_.Configure(meta_store_file_);
 94 | 
 95 |     file_parser_ = new MultiLineParser();
 96 |     file_parser_.Configure(context);
 97 | 
 98 |     executor_service_ = Executors.newScheduledThreadPool(1);
 99 |     sender_runnable_ = new SenderRunnable();
100 |     file_info_map_ = new ConcurrentHashMap<Integer, FileInfo>();
101 |   }
102 | 
103 |   /**
104 |    * Loads the stored file map, starts the monitor and schedules the sender
105 |    * every send_interval_ seconds.
106 |    */
107 |   public void Start() {
108 |     LOG.info("Start DirFileRecorder.");
109 |     reader_writer_.LoadMap(file_info_map_);
110 |     FileMapReaderWriter.PrintMap(file_info_map_);
111 |     file_monitor_.Start();
112 |     sender_future_ = executor_service_.scheduleAtFixedRate(sender_runnable_,
113 |            0L,
114 |            send_interval_.longValue(),
115 |            TimeUnit.SECONDS);
116 |   }
117 | 
118 |   /**
119 |    * Stops the monitor and the sender, then writes the file map one last time.
120 |    */
121 |   public void Stop() {
122 |     file_monitor_.Stop();
123 |     if (null == sender_future_) {
124 |       // Start() never ran, so the stored map was never loaded; writing the
125 |       // empty in-memory map now would overwrite the stored offsets
126 |       LOG.warn("Stop called before Start, skip writing file map");
127 |       return;
128 |     }
129 |     sender_future_.cancel(true);
130 |     executor_service_.shutdown();
131 |     try {
132 |       // wait for a round that is still running, so that it cannot write the
133 |       // meta file at the same time as the final WriteMap below
134 |       if (!executor_service_.awaitTermination(STOP_WAIT_SECONDS,
135 |           TimeUnit.SECONDS)) {
136 |         LOG.warn("Sender still running after {} seconds", STOP_WAIT_SECONDS);
137 |       }
138 |     } catch (InterruptedException e) {
139 |       LOG.warn("Interrupted while waiting for sender to stop");
140 |       Thread.currentThread().interrupt();
141 |     }
142 |     reader_writer_.WriteMap(file_info_map_);
143 |   }
144 | 
145 |   /**
146 |    * Sends the not yet processed records of every file in file_map as flume
147 |    * events and advances the file offsets.
148 |    *
149 |    * The offset of a file is advanced record by record, once the record has
150 |    * been handed to the channel or dropped, so a failure in the middle of a
151 |    * batch does not make the records already sent go out again.
152 |    *
153 |    * @return false when file_map is null or empty, true otherwise (the meta
154 |    *         file is currently rewritten after every round)
155 |    */
156 |   private boolean SendEvents(Map<Integer, FileInfo> file_map) {
157 |     if (null == file_map || file_map.isEmpty()) {
158 |       LOG.warn("file_map is null(wait for update) or file_map is empty");
159 |       return false;
160 |     }
161 |     if (LOG.isDebugEnabled()) {
162 |       LOG.debug("SendEvents, with total file num = {} dir {}",
163 |           file_map.size(), file_monitor_.GetMonitorDir());
164 |     }
165 |     // currently we update every time for debug
166 |     boolean should_update_meta = true;
167 |     long event_num = 0;
168 |     try {
169 |       for (FileInfo file_info : file_map.values()) {
170 |         if (file_info.get_offset() >= file_info.get_file_length()) {
171 |           // this file is already processed
172 |           if (LOG.isDebugEnabled()) {
173 |             LOG.debug("File done, skip: " + file_info.get_file_name());
174 |           }
175 |           continue;
176 |         }
177 | 
178 |         List<String> records = file_parser_.GetNextBatchRecords(
179 |             file_info.get_file_name(), file_info.get_offset());
180 |         long offset = file_info.get_offset();
181 |         for (String record : records) {
182 |           // NOTICE: every record ends with a '\n'; if flume adds a '\n' by
183 |           // itself, handle it here or switch the auto-add-new-line off.
184 |           byte[] record_bytes = record.getBytes();
185 |           if (file_parser_.ShouldDrop(record)) {
186 |             if (LOG.isDebugEnabled()) {
187 |               LOG.debug("Drop record: " + record);
188 |             }
189 |           } else {
190 |             Event event = EventBuilder.withBody(record_bytes);
191 |             monitor_source_.getChannelProcessor().processEvent(event);
192 |             event_num += 1;
193 |           }
194 |           // the record is done (sent or dropped): move the offset past it
195 |           offset += record_bytes.length;
196 |           file_info.set_offset(offset);
197 |         } // end for loop
198 |       } // end for loop
199 |       if (LOG.isDebugEnabled()) {
200 |         LOG.debug("Send Event Num this time: {} for dir {}", event_num,
201 |             file_monitor_.GetMonitorDir());
202 |       }
203 |       monitor_source_.UpdateSourceCounter(event_num);
204 |     } catch (Exception e) {
205 |       // hand the exception to the logger so the stack trace lands in the log
206 |       LOG.warn("Exception in SendEvents: " + e.getMessage(), e);
207 |     }
208 |     return should_update_meta;
209 |   }
210 | 
211 |   // Scheduled task: refreshes the file map from the monitor, sends the new
212 |   // records and writes the map to the meta file.
213 |   class SenderRunnable implements Runnable {
214 |     @Override
215 |     public void run() {
216 |       try {
217 |         if (LOG.isDebugEnabled()) {
218 |           LOG.debug("Before Update, file_map_size: {} dir {}",
219 |               file_info_map_.size(), file_monitor_.GetMonitorDir());
220 |           FileMapReaderWriter.PrintMap(file_info_map_);
221 |         }
222 |         Map<Integer, FileInfo> new_map =
223 |           file_monitor_.GetLatestFileInfo(file_info_map_);
224 |         if (null == new_map) {
225 |           // keep the previous map: a null map would break the next round and
226 |           // the final WriteMap in Stop()
227 |           LOG.warn("file_map is null(wait for update) for dir {}",
228 |               file_monitor_.GetMonitorDir());
229 |           return;
230 |         }
231 |         if (LOG.isDebugEnabled()) {
232 |           LOG.debug("After Update, file_map_size: {} dir {}",
233 |               new_map.size(), file_monitor_.GetMonitorDir());
234 |           FileMapReaderWriter.PrintMap(new_map);
235 |         }
236 | 
237 |         file_info_map_ = new_map;
238 |         if (SendEvents(new_map)) {
239 |           if (LOG.isDebugEnabled()) {
240 |             LOG.debug("Write file map for dir {}", file_monitor_.GetMonitorDir());
241 |           }
242 |           reader_writer_.WriteMap(new_map);
243 |         }
244 |       } catch (Exception e) {
245 |         // never let an exception escape: scheduleAtFixedRate would suppress
246 |         // all later runs of this task
247 |         LOG.warn("Exception in SenderRunable: " + e.getMessage(), e);
248 |       }
249 |     }
250 |   }
251 | 
252 | }
205 | 


--------------------------------------------------------------------------------
/test/com/minsheng/flume/source/TestDirFileRecorder.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | import java.io.File;
  5 | import java.io.IOException;
  6 | import java.io.FileNotFoundException;
  7 | import java.util.List;
  8 | import java.util.Map;
  9 | import java.util.HashMap;
 10 | import java.util.concurrent.*;
 11 | import java.util.regex.Pattern;
 12 | import java.lang.*;
 13 | import java.io.PrintWriter;
 14 | import junit.framework.Assert;
 15 | import static org.junit.Assert.*;
 16 | 
 17 | import org.junit.Before;
 18 | import org.junit.After;
 19 | import org.junit.Test;
 20 | import org.apache.flume.Context;
 21 | 
 22 | import com.minsheng.flume.source.FileInfo;
 23 | import com.minsheng.flume.source.FlumeConstants;
 24 | import com.minsheng.flume.source.MultiLineParser;
 25 | import com.minsheng.flume.source.SimpleFileMonitor;
 26 | 
 27 | import org.slf4j.Logger;
 28 | import org.slf4j.LoggerFactory;
 29 | 
 30 | 
 31 | public class TestDirFileRecorder {
 32 |   private static final Logger LOG = LoggerFactory
 33 |       .getLogger(TestDirFileRecorder.class);
 34 |   
 35 |   private File target_dir_ = new File("/tmp/ms_flume/parser");
 36 |   
 37 |   private String first_regex_ = "\\[\\[.*";
 38 |   private String last_regex_ = ".*END\\]\\]";
 39 |   private Pattern start_line_pattern_ = Pattern.compile(first_regex_);
 40 |   private Pattern end_line_pattern_ = Pattern.compile(last_regex_);
 41 |   private Pattern record_include_pattern_ = 
 42 |       Pattern.compile(".*INCLUDE_RECORD.*");
 43 |   private Pattern record_exclude_pattern_ = 
 44 |       Pattern.compile(".*EXCLUDE_RECORD.*");
 45 |   
 46 |   private String record_include_str_ = "\tINCLUDE_RECORD";
 47 |   private String record_exclude_str_ = "\tEXCLUDE_RECORD";
 48 |   
 49 |   
 50 |   public void CreateMIXFile(File file) {
 51 |     PrintWriter writer = null;
 52 |     try {
 53 |       writer = new PrintWriter(file);
 54 |       String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line";
 55 |       String end_line = "\trecord end line END]]";
 56 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
 57 |       
 58 |       // write record 1  ---include if include_record = .*
 59 |       writer.println(start_line);
 60 |       writer.println(end_line);
 61 |       
 62 |       // write record 1  ---include
 63 |       writer.println(start_line);
 64 |       writer.print(record_include_str_);
 65 |       writer.println(end_line);
 66 |       
 67 |    // write record 1  ---exclude
 68 |       writer.println(start_line);
 69 |       writer.print(record_exclude_str_);
 70 |       writer.println(end_line);
 71 |       
 72 |       // write record 2  -- include
 73 |       writer.println(start_line);
 74 |       writer.print(record_include_str_);
 75 |       writer.println(mid_line);
 76 |       writer.println(end_line);
 77 |       
 78 |       // write record 2  -- exclude
 79 |       writer.println(start_line);
 80 |       writer.print(record_exclude_str_);
 81 |       writer.println(mid_line);
 82 |       writer.println(end_line);
 83 | 
 84 |       // write random line and tailer witout header
 85 |       // we should see this as a new record
 86 |       writer.println("\tno header no header no header");
 87 |       writer.println(end_line);
 88 |       
 89 |       
 90 |       // write random line and tailer witout header
 91 |       // we should never see this, because it will wait the end or start line;
 92 |       writer.println(start_line);
 93 |       writer.println("\t no tailer no tailer no tailer");
 94 |     } catch (FileNotFoundException e) {
 95 |       LOG.error("Write content failed at file: " + file);
 96 |     } finally {
 97 |       if (writer != null) {
 98 |         writer.close();
 99 |         writer = null;
100 |       }
101 |     } 
102 |   }
103 |   
104 |   public void CreateFIRSTFile(File file) {
105 |     PrintWriter writer = null;
106 |     try {
107 |       writer = new PrintWriter(file);
108 |       String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line";
109 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
110 |       // write record 1
111 |       writer.println(start_line);
112 |       writer.print(record_include_str_);
113 |       // write record 2
114 |       writer.println(start_line);
115 |       writer.print(record_exclude_str_);
116 |       writer.println(mid_line);
117 |       
118 |       // write record 3
119 |       writer.println(start_line);
120 |       writer.println(mid_line);
121 |       writer.println(mid_line);
122 |       writer.println(mid_line);
123 |       writer.println(mid_line);
124 |       writer.println(mid_line);
125 |       writer.println(mid_line);
126 | 
127 |     } catch (FileNotFoundException e) {
128 |       LOG.error("Write content failed at file: " + file);
129 |     } finally {
130 |       if (writer != null) {
131 |         writer.close();
132 |         writer = null;
133 |       }
134 |     } 
135 |   }
136 |   
137 |   public void CreateLASTFile(File file) {
138 |     PrintWriter writer = null;
139 |     try {
140 |       writer = new PrintWriter(file);
141 |       String end_line = "[[INFO] 2013-01-17 13:54:32 reord end line";
142 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
143 |       // write record 1
144 |       writer.print(record_include_str_);
145 |       writer.println(end_line);
146 |       // write record 2
147 |       writer.println(mid_line);
148 |       writer.print(record_include_str_);
149 |       writer.println(end_line);
150 |       
151 |       // write record 3
152 | 
153 |       writer.println(mid_line);
154 |       writer.println(mid_line);
155 |       writer.println(mid_line);
156 |       writer.println(end_line);
157 |       
158 |       // useless line
159 |       writer.println(mid_line);
160 |       writer.println(mid_line);
161 |       writer.println(mid_line);
162 |       writer.print(record_exclude_str_);
163 |       writer.println(mid_line);
164 |       writer.println(mid_line);
165 |       writer.println(mid_line);
166 |     } catch (FileNotFoundException e) {
167 |       LOG.error("Write content failed at file: " + file);
168 |     } finally {
169 |       if (writer != null) {
170 |         writer.close();
171 |         writer = null;
172 |       }
173 |     } 
174 |   }
175 |   
176 |   public void CreateNONEFile(File file) {
177 |     PrintWriter writer = null;
178 |     try {
179 |       writer = new PrintWriter(file);
180 |       String end_line = "[[INFO] 2013-01-17 13:54:32 random arbitray";
181 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
182 |       // write record 1
183 |       writer.println(end_line);
184 |       writer.print(record_exclude_str_);
185 |       writer.print(record_include_str_);
186 |       // write record 2
187 |       writer.println(end_line);
188 |       writer.println(mid_line);
189 |     } catch (FileNotFoundException e) {
190 |       LOG.error("Write content failed at file: " + file);
191 |     } finally {
192 |       if (writer != null) {
193 |         writer.close();
194 |         writer = null;
195 |       }
196 |     } 
197 |   }
198 |   
199 | 
200 |   public void SetUp() {
201 |    LOG.info("Start test MultiLineParser");
202 |    LOG.info("Create test dir: " + target_dir_);
203 |    target_dir_.mkdirs();
204 |    Assert.assertTrue(target_dir_.exists());
205 | 
206 |   
207 |    
208 |    File first_file = new File(target_dir_ + "/" + "first.file");
209 |    CreateFIRSTFile(first_file);
210 |    
211 |    File last_file = new File(target_dir_ + "/" + "last.file");
212 |    CreateLASTFile(last_file);
213 |    
214 |    File none_file = new File(target_dir_ + "/" + "none.file");
215 |    CreateNONEFile(none_file);
216 |     
217 |   }
218 |   
219 |   public void WriteFile(File file, List<String> records) {
220 |     PrintWriter writer = null;
221 |     try {
222 |       writer = new PrintWriter(file);
223 |       for (String line : records) {
224 |         writer.print(line);
225 |       }
226 |     } catch (FileNotFoundException e) {
227 |       LOG.error("Write content failed at file: " + file);
228 |     } finally {
229 |       if (writer != null) {
230 |         writer.close();
231 |         writer = null;
232 |       }
233 |     } 
234 |   }
235 |   
236 | 
237 |   @Test
238 |   public void TestDirFile() {
239 |     LOG.info("Start test dir file recorder");
240 |   }
241 | }


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/SimpleFileMonitor.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | 
  5 | import java.io.IOException;
  6 | import java.util.Map;
  7 | import java.util.regex.Matcher;
  8 | import java.util.regex.Pattern;
  9 | import java.util.concurrent.ConcurrentHashMap;
 10 | import java.util.concurrent.Executors;
 11 | import java.util.concurrent.ScheduledExecutorService;
 12 | import java.util.concurrent.ScheduledFuture;
 13 | import java.util.concurrent.TimeUnit;
 14 | 
 15 | import org.apache.flume.Context;
 16 | import org.slf4j.Logger;
 17 | import org.slf4j.LoggerFactory;
 18 | 
 19 | import com.google.common.base.Preconditions;
 20 | import com.minsheng.util.Shell;
 21 | import com.minsheng.util.StringUtil;
 22 | 
 23 | public class SimpleFileMonitor extends FileMonitor {
 24 |   private static final Logger LOG = LoggerFactory
 25 |       .getLogger(SimpleFileMonitor.class);
 26 |   private static int ID_SHELL_IDX = 0;
 27 |   private static int FILE_NAME_SHELL_IDX = 5;
 28 |   private static int FILE_LENGTH_SHELL_IDX = 3;
 29 |   private static int FILE_META_IDX = 1;
 30 | 
 31 |   private String default_file_name_include_str_ = ".*";
 32 |   private String default_file_name_exclude_str = "^[.].*";
 33 |   private String file_name_file_name_include_pattern_str_ = null;
 34 |   // file start with "." is excluded
 35 |   private String file_name_file_name_exclude_pattern_str_ = null;
 36 |   private Pattern file_name_include_pattern_ = null;
 37 |   private Pattern file_name_exclude_pattern_ = null;
 38 | 
 39 |   private String ls_output_total_ = "total";
 40 |   private Pattern integer_pattern_ = null;
 41 | 
 42 |   private Pattern shell_result_pattern_ = null;
 43 |   private String monitor_dir_ = null;
 44 |   private Map<Integer, FileInfo> file_info_map_ = new ConcurrentHashMap<Integer, FileInfo>();
 45 |   private String shell_command_[] = null;
 46 |   private Shell.ShellCommandExecutor shell_executor_ = null;
 47 |   private ScheduledExecutorService executor_service_ = null;
 48 |   private Runnable shell_runnable_ = null;
 49 |   private ScheduledFuture<?> shell_future_ = null;
 50 |   private Long check_interval_sec_ = null;
 51 |   // 
 52 |   private Long default_interval_sec = 5L;
 53 |   private String shell_output_ = null;
 54 |   
 55 |   private boolean can_fetch_file_map_ = false;
 56 | 
 57 |   public SimpleFileMonitor() {
 58 |     super();
 59 |   }
 60 | 
 61 |   @Override
 62 |   public void Configure(Context context) {
 63 |     // TODO Auto-generated method stub
 64 |     monitor_dir_ = context.getString(FlumeConstants.MONITOR_DIR);
 65 |     Preconditions.checkState(monitor_dir_ != null,
 66 |         "you must specified  \'monitor_dir\' in config file");
 67 |     check_interval_sec_ = context.getLong(FlumeConstants.FILE_CHECK_INTERVAL,
 68 |         default_interval_sec);
 69 |     file_name_file_name_include_pattern_str_ = context.getString(
 70 |         FlumeConstants.FILE_NAME_INCLUDE, default_file_name_include_str_);
 71 |     file_name_file_name_exclude_pattern_str_ = context.getString(
 72 |         FlumeConstants.FILE_NAME_EXCLUDE, default_file_name_exclude_str);
 73 | 
 74 |     shell_result_pattern_ = Pattern.compile(FlumeConstants.SHELL_RESULT_REGEX);
 75 |     file_name_include_pattern_ = Pattern
 76 |         .compile(file_name_file_name_include_pattern_str_);
 77 |     file_name_exclude_pattern_ = Pattern
 78 |         .compile(file_name_file_name_exclude_pattern_str_);
 79 | 
 80 |     integer_pattern_ = Pattern.compile(FlumeConstants.INTEGER_REGEX);
 81 | 
 82 |     shell_command_ = FlumeConstants.GetShellCommand(monitor_dir_);
 83 |     shell_executor_ = new Shell.ShellCommandExecutor(shell_command_);
 84 |     executor_service_ = Executors.newScheduledThreadPool(1);
 85 |   }
 86 | 
 87 |   @Override
 88 |   public void Start() {
 89 |     // TODO Auto-generated method stub
 90 |     StringBuilder builder = new StringBuilder();
 91 |     builder.append("Start SimpleFileMonitor with [dir=");
 92 |     builder.append(monitor_dir_);
 93 |     builder.append(", file_name_file_name_include_pattern_str_=");
 94 |     builder.append(file_name_file_name_include_pattern_str_);
 95 |     builder.append(",file_name_file_name_exclude_pattern_str_=");
 96 |     builder.append(file_name_file_name_exclude_pattern_str_);
 97 |     builder.append(",check_interval_sec_=");
 98 |     builder.append(check_interval_sec_);
 99 |     LOG.info(builder.toString());
100 |     builder = null;
101 | 
102 |     shell_runnable_ = new ShellRunnable();
103 |     shell_future_ = executor_service_.scheduleAtFixedRate(shell_runnable_, 0L,
104 |         check_interval_sec_.longValue(), TimeUnit.SECONDS);
105 |   }
106 | 
107 |   @Override
108 |   public void Stop() {
109 |     // TODO Auto-generated method stub
110 |     LOG.info("Stop SimpleFileMonitor");
111 |     shell_future_.cancel(true);
112 |     executor_service_.shutdown();
113 |     file_info_map_ = null;
114 |   }
115 | 
116 |   // this function the latest_offset is synchronized at entry
117 |   @Override
118 |   public Map<Integer, FileInfo> GetLatestFileInfo(
119 |       Map<Integer, FileInfo> latest_offset) {
120 |     // TODO Auto-generated method stub
121 |     if (!can_fetch_file_map_) {
122 |       // wait for next round to update
123 |       return latest_offset;
124 |     }
125 |     
126 |     Map<Integer, FileInfo> new_file_map = null;
127 |     synchronized (this) {
128 |       if (latest_offset != null) {
129 |         for (FileInfo file_info : latest_offset.values()) {
130 |           if (this.file_info_map_.containsKey(file_info.get_id())) {
131 |             // update offset from latest_offset map to current map
132 |             this.file_info_map_.get(file_info.get_id()).set_offset(
133 |                 file_info.get_offset());
134 |           } else {
135 |             
136 |             // file is delete from monitor dir
137 |             // we run into a problem, inode will be reused when file deleted
138 |             // so we need to delete file as soon as possible
139 |             // if (file_info.get_life_span() > 0) {
140 |             //  file_info.DecLifeSpan();
141 |             //  this.file_info_map_.put(file_info.get_id(), file_info);
142 |             // }
143 |           }
144 |         }
145 |       }
146 |       new_file_map = file_info_map_;
147 |       // file_info_map_ = null;
148 |     }
149 |     return new_file_map;
150 |   }
151 |  
152 |   @Override
153 |   public String GetMonitorDir() {
154 |     // TODO Auto-generated method stub
155 |     return monitor_dir_;
156 |   }
157 | 
158 |   public void UpdateMapFromShellResult() {
159 |     Map<Integer, FileInfo> new_file_map = new ConcurrentHashMap<Integer, FileInfo>();
160 |     shell_output_ = shell_executor_.getOutput();
161 |     if (LOG.isDebugEnabled()) {
162 |       LOG.debug("Shelloutput****************\n" + shell_output_);
163 |     }
164 |     for (String line : shell_output_.split(FlumeConstants.LINE_SEP)) {
165 |       // String eles[] = shell_result_pattern_.split(line);
166 |       String eles[] = StringUtil.SplitAndTrim(shell_result_pattern_, line);
167 | 
168 |       /**
169 |        * when use ls -il -o -g --time-style=+%Y the first line of the output is
170 |        * a total infomation like: total 22 ID_SHLL_IDX ID_META_IDX xx
171 |        * FILE_LENGTH_SHELL_IDX xx FILE_NAME_IDX ${inode} drwxrwx-- 1 ${filesize}
172 |        * ${date} ${filename}
173 |        * 
174 |        * (1) for first line, just skip (2) for othern line, if length !=6, print
175 |        * warn information (3) for directory, skip (4) check ${inode} ${filesize)
176 |        * match integer pattern
177 |        * */
178 |       if (eles[ID_SHELL_IDX].toLowerCase().startsWith(ls_output_total_)) {
179 |         LOG.debug("Skip first line -- {}", line);
180 |         continue;
181 |       }
182 |       if (eles.length != FlumeConstants.SHELL_RESULT_FIELD_NUM) {
183 |         LOG.warn("Check system env, Invalid shell result,fields = {} line={} ",
184 |             eles.length, line);
185 |         continue;
186 |       }
187 | 
188 |       if (eles[FILE_META_IDX].startsWith("d")) {
189 |         // this file is a directory, skip
190 |         LOG.debug("Skip monitor directory: " + eles[FILE_NAME_SHELL_IDX]);
191 |         continue;
192 |       }
193 | 
194 |       if (!integer_pattern_.matcher(eles[ID_SHELL_IDX]).matches()
195 |           || !integer_pattern_.matcher(eles[FILE_LENGTH_SHELL_IDX]).matches()) {
196 |         if (LOG.isDebugEnabled()) {
197 |           StringBuilder builder = new StringBuilder();
198 |           builder.append("Skip invalid integer regex line:");
199 |           builder.append(line);
200 |           builder.append("id_shell_idx=");
201 |           builder.append(eles[ID_SHELL_IDX]);
202 |           builder.append(" file_length=");
203 |           builder.append(eles[FILE_LENGTH_SHELL_IDX]);
204 |           LOG.debug(builder.toString());
205 |           builder = null;
206 |         }
207 |         continue;
208 |       }
209 | 
210 |       Matcher include_matcher = file_name_include_pattern_
211 |           .matcher(eles[FILE_NAME_SHELL_IDX]);
212 |       Matcher exclude_matcher = file_name_exclude_pattern_
213 |           .matcher(eles[FILE_NAME_SHELL_IDX]);
214 | 
215 |       if (include_matcher.matches() && !exclude_matcher.matches()) {
216 |         if (LOG.isDebugEnabled()) {
217 |           LOG.debug("File accepted, " + eles[FILE_NAME_SHELL_IDX]);
218 |         }
219 |         // file in white list and not in black list is accepted
220 |         FileInfo file_info = new FileInfo();
221 |         // we already use INTEGER_REGEX to check the ID and Length must be number
222 |         // but just in case, we catch the exception
223 |         try {
224 |           file_info.set_id(Integer.valueOf(eles[ID_SHELL_IDX]));
225 |           file_info.set_file_length(Long.valueOf(eles[FILE_LENGTH_SHELL_IDX]));
226 |         } catch (NumberFormatException e) {
227 |           LOG.warn("Invalid shell result, number format error," + line);
228 |           continue;
229 |         }
230 |         StringBuilder abs_path_builder = new StringBuilder();
231 |         abs_path_builder.append(monitor_dir_);
232 |         abs_path_builder.append(FlumeConstants.DIR_SEP);
233 |         abs_path_builder.append(eles[FILE_NAME_SHELL_IDX]);
234 |         file_info.set_file_name(abs_path_builder.toString());
235 |         file_info.set_offset(0L);
236 |         new_file_map.put(file_info.get_id(), file_info);
237 |       } else {
238 |         if (LOG.isDebugEnabled()) {
239 |           LOG.debug("File rejected, " + eles[FILE_NAME_SHELL_IDX]);
240 |         }
241 |       }
242 |     }
243 |     synchronized (this) {
244 |       file_info_map_ = new_file_map;
245 |       can_fetch_file_map_ = true;
246 |     }
247 |   }
248 | 
249 |   class ShellRunnable implements Runnable {
250 |     @Override
251 |     public void run() {
252 |       // TODO Auto-generated method stub
253 |       try {
254 |         shell_executor_.execute();
255 |         UpdateMapFromShellResult();
256 |       } catch (Exception e) {
257 |         LOG.warn("Execute shell failed due to " + e.getMessage());
258 |         e.printStackTrace();
259 |       }
260 |     }
261 |   }
262 | }


--------------------------------------------------------------------------------
/src/com/minsheng/util/Shell.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.util;
  4 | 
  5 | import java.io.BufferedReader;
  6 | import java.io.File;
  7 | import java.io.IOException;
  8 | import java.io.InputStreamReader;
  9 | 
 10 | import java.util.Map;
 11 | import java.util.Timer;
 12 | import java.util.TimerTask;
 13 | import java.util.concurrent.atomic.AtomicBoolean;
 14 | 
 15 | import org.slf4j.Logger;
 16 | import org.slf4j.LoggerFactory;
 17 | 
 18 | /** 
 19 |  * A base class for running a Unix command.
 20 |  * 
 21 |  * <code>Shell</code> can be used to run unix commands like <code>du</code> or
 22 |  * <code>df</code>. It also offers facilities to gate commands by 
 23 |  * time-intervals.
 24 |  */
 25 | abstract public class Shell {
 26 |   
 27 |   public static final Logger LOG = LoggerFactory.getLogger(Shell.class);
 28 |   
 29 |   private static boolean IS_JAVA7_OR_ABOVE =
 30 |       System.getProperty("java.version").substring(0, 3).compareTo("1.7") >= 0;
 31 | 
 32 |   public static boolean isJava7OrAbove() {
 33 |     return IS_JAVA7_OR_ABOVE;
 34 |   }
 35 | 
 36 | 
 37 |   /** Windows CreateProcess synchronization object */
 38 |   public static final Object WindowsProcessLaunchLock = new Object();
 39 | 
 40 | 
 41 | 
 42 |   /** Return a regular expression string that match environment variables */
 43 |   public static String getEnvironmentVariableRegex() {
 44 |     return (WINDOWS) ? "%([A-Za-z_][A-Za-z0-9_]*?)%" :
 45 |       "\\$([A-Za-z_][A-Za-z0-9_]*)";
 46 |   }
 47 |   
 48 |   /**Time after which the executing script would be timedout*/
 49 |   protected long timeOutInterval = 0L;
 50 |   /** If or not script timed out*/
 51 |   private AtomicBoolean timedOut;
 52 | 
 53 | 
 54 |   /** Set to true on Windows platforms */
 55 |   public static final boolean WINDOWS /* borrowed from Path.WINDOWS */
 56 |                 = System.getProperty("os.name").startsWith("Windows");
 57 | 
 58 |   public static final boolean LINUX
 59 |                 = System.getProperty("os.name").startsWith("Linux");
 60 |   
 61 |   public static final boolean isSetsidAvailable = isSetsidSupported();
 62 |   private static boolean isSetsidSupported() {
 63 |     if (Shell.WINDOWS) {
 64 |       return false;
 65 |     }
 66 |     ShellCommandExecutor shexec = null;
 67 |     boolean setsidSupported = true;
 68 |     try {
 69 |       String[] args = {"setsid", "bash", "-c", "echo $$"};
 70 |       shexec = new ShellCommandExecutor(args);
 71 |       shexec.execute();
 72 |     } catch (IOException ioe) {
 73 |       LOG.debug("setsid is not available on this machine. So not using it.");
 74 |       setsidSupported = false;
 75 |     } finally { // handle the exit code
 76 |       if (LOG.isDebugEnabled()) {
 77 |         LOG.debug("setsid exited with exit code "
 78 |                  + (shexec != null ? shexec.getExitCode() : "(null executor)"));
 79 |       }
 80 |     }
 81 |     return setsidSupported;
 82 |   }
 83 | 
 84 |   /** Token separator regex used to parse Shell tool outputs */
 85 |   public static final String TOKEN_SEPARATOR_REGEX
 86 |                 = WINDOWS ? "[|\n\r]" : "[ \t\n\r\f]";
 87 | 
 88 |   private long    interval;   // refresh interval in msec
 89 |   private long    lastTime;   // last time the command was performed
 90 |   private Map<String, String> environment; // env for the command execution
 91 |   private File dir;
 92 |   private Process process; // sub process used to execute the command
 93 |   private int exitCode;
 94 | 
 95 |   /**If or not script finished executing*/
 96 |   private volatile AtomicBoolean completed;
 97 |   
 98 |   public Shell() {
 99 |     this(0L);
100 |   }
101 |   
102 |   /**
103 |    * @param interval the minimum duration to wait before re-executing the 
104 |    *        command.
105 |    */
106 |   public Shell( long interval ) {
107 |     this.interval = interval;
108 |     this.lastTime = (interval<0) ? 0 : -interval;
109 |   }
110 |   
111 |   /** set the environment for the command 
112 |    * @param env Mapping of environment variables
113 |    */
114 |   protected void setEnvironment(Map<String, String> env) {
115 |     this.environment = env;
116 |   }
117 | 
118 |   /** set the working directory 
119 |    * @param dir The directory where the command would be executed
120 |    */
121 |   protected void setWorkingDirectory(File dir) {
122 |     this.dir = dir;
123 |   }
124 | 
125 |   /** check to see if a command needs to be executed and execute if needed */
126 |   protected void run() throws IOException {
127 |     if (lastTime + interval > Time.now())
128 |       return;
129 |     exitCode = 0; // reset for next run
130 |     runCommand();
131 |   }
132 | 
133 |   /** Run a command */
134 |   private void runCommand() throws IOException { 
135 |     ProcessBuilder builder = new ProcessBuilder(getExecString());
136 |     Timer timeOutTimer = null;
137 |     ShellTimeoutTimerTask timeoutTimerTask = null;
138 |     timedOut = new AtomicBoolean(false);
139 |     completed = new AtomicBoolean(false);
140 |     
141 |     if (environment != null) {
142 |       builder.environment().putAll(this.environment);
143 |     }
144 |     if (dir != null) {
145 |       builder.directory(this.dir);
146 |     }
147 |     
148 |     if (Shell.WINDOWS) {
149 |       synchronized (WindowsProcessLaunchLock) {
150 |         // To workaround the race condition issue with child processes
151 |         // inheriting unintended handles during process launch that can
152 |         // lead to hangs on reading output and error streams, we
153 |         // serialize process creation. More info available at:
154 |         // http://support.microsoft.com/kb/315939
155 |         process = builder.start();
156 |       }
157 |     } else {
158 |       process = builder.start();
159 |     }
160 | 
161 |     if (timeOutInterval > 0) {
162 |       timeOutTimer = new Timer("Shell command timeout");
163 |       timeoutTimerTask = new ShellTimeoutTimerTask(
164 |           this);
165 |       //One time scheduling.
166 |       timeOutTimer.schedule(timeoutTimerTask, timeOutInterval);
167 |     }
168 |     final BufferedReader errReader = 
169 |             new BufferedReader(new InputStreamReader(process
170 |                                                      .getErrorStream()));
171 |     BufferedReader inReader = 
172 |             new BufferedReader(new InputStreamReader(process
173 |                                                      .getInputStream()));
174 |     final StringBuffer errMsg = new StringBuffer();
175 |      
176 |     // read error and input streams as this would free up the buffers
177 |     // free the error stream buffer
178 |     Thread errThread = new Thread() {
179 |       @Override
180 |       public void run() {
181 |         try {
182 |           String line = errReader.readLine();
183 |           while((line != null) && !isInterrupted()) {
184 |             errMsg.append(line);
185 |             errMsg.append(System.getProperty("line.separator"));
186 |             line = errReader.readLine();
187 |           }
188 |         } catch(IOException ioe) {
189 |           LOG.warn("Error reading the error stream", ioe);
190 |         }
191 |       }
192 |     };
193 |     try {
194 |       errThread.start();
195 |     } catch (IllegalStateException ise) { }
196 |     try {
197 |       parseExecResult(inReader); // parse the output
198 |       // clear the input stream buffer
199 |       String line = inReader.readLine();
200 |       while(line != null) { 
201 |         line = inReader.readLine();
202 |       }
203 |       // wait for the process to finish and check the exit code
204 |       exitCode  = process.waitFor();
205 |       try {
206 |         // make sure that the error thread exits
207 |         errThread.join();
208 |       } catch (InterruptedException ie) {
209 |         LOG.warn("Interrupted while reading the error stream", ie);
210 |       }
211 |       completed.set(true);
212 |       //the timeout thread handling
213 |       //taken care in finally block
214 |       if (exitCode != 0) {
215 |         throw new ExitCodeException(exitCode, errMsg.toString());
216 |       }
217 |     } catch (InterruptedException ie) {
218 |       throw new IOException(ie.toString());
219 |     } finally {
220 |       if (timeOutTimer != null) {
221 |         timeOutTimer.cancel();
222 |       }
223 |       // close the input stream
224 |       try {
225 |         inReader.close();
226 |       } catch (IOException ioe) {
227 |         LOG.warn("Error while closing the input stream", ioe);
228 |       }
229 |       try {
230 |         if (!completed.get()) {
231 |           errThread.interrupt();
232 |           errThread.join();
233 |         }
234 |       } catch (InterruptedException ie) {
235 |         LOG.warn("Interrupted while joining errThread");
236 |       }
237 |       try {
238 |         errReader.close();
239 |       } catch (IOException ioe) {
240 |         LOG.warn("Error while closing the error stream", ioe);
241 |       }
242 |       process.destroy();
243 |       lastTime = Time.now();
244 |     }
245 |   }
246 | 
247 |   /** return an array containing the command name & its parameters */ 
248 |   protected abstract String[] getExecString();
249 |   
250 |   /** Parse the execution result */
251 |   protected abstract void parseExecResult(BufferedReader lines)
252 |   throws IOException;
253 | 
254 |   /** get the current sub-process executing the given command 
255 |    * @return process executing the command
256 |    */
257 |   public Process getProcess() {
258 |     return process;
259 |   }
260 | 
261 |   /** get the exit code 
262 |    * @return the exit code of the process
263 |    */
264 |   public int getExitCode() {
265 |     return exitCode;
266 |   }
267 | 
268 |   /**
269 |    * This is an IOException with exit code added.
270 |    */
271 |   public static class ExitCodeException extends IOException {
272 |     int exitCode;
273 |     
274 |     public ExitCodeException(int exitCode, String message) {
275 |       super(message);
276 |       this.exitCode = exitCode;
277 |     }
278 |     
279 |     public int getExitCode() {
280 |       return exitCode;
281 |     }
282 |   }
283 |   
284 |   /**
285 |    * A simple shell command executor.
286 |    * 
287 |    * <code>ShellCommandExecutor</code>should be used in cases where the output 
288 |    * of the command needs no explicit parsing and where the command, working 
289 |    * directory and the environment remains unchanged. The output of the command 
290 |    * is stored as-is and is expected to be small.
291 |    */
292 |   public static class ShellCommandExecutor extends Shell {
293 |     
294 |     private String[] command;
295 |     private StringBuffer output;
296 |     
297 |     
298 |     public ShellCommandExecutor(String[] execString) {
299 |       this(execString, null);
300 |     }
301 |     
302 |     public ShellCommandExecutor(String[] execString, File dir) {
303 |       this(execString, dir, null);
304 |     }
305 |    
306 |     public ShellCommandExecutor(String[] execString, File dir, 
307 |                                  Map<String, String> env) {
308 |       this(execString, dir, env , 0L);
309 |     }
310 | 
311 |     /**
312 |      * Create a new instance of the ShellCommandExecutor to execute a command.
313 |      * 
314 |      * @param execString The command to execute with arguments
315 |      * @param dir If not-null, specifies the directory which should be set
316 |      *            as the current working directory for the command.
317 |      *            If null, the current working directory is not modified.
318 |      * @param env If not-null, environment of the command will include the
319 |      *            key-value pairs specified in the map. If null, the current
320 |      *            environment is not modified.
321 |      * @param timeout Specifies the time in milliseconds, after which the
322 |      *                command will be killed and the status marked as timedout.
323 |      *                If 0, the command will not be timed out. 
324 |      */
325 |     public ShellCommandExecutor(String[] execString, File dir, 
326 |         Map<String, String> env, long timeout) {
327 |       command = execString.clone();
328 |       if (dir != null) {
329 |         setWorkingDirectory(dir);
330 |       }
331 |       if (env != null) {
332 |         setEnvironment(env);
333 |       }
334 |       timeOutInterval = timeout;
335 |     }
336 |         
337 | 
338 |     /** Execute the shell command. */
339 |     public void execute() throws IOException {
340 |       this.run();    
341 |     }
342 | 
343 |     @Override
344 |     public String[] getExecString() {
345 |       return command;
346 |     }
347 | 
348 |     @Override
349 |     protected void parseExecResult(BufferedReader lines) throws IOException {
350 |       output = new StringBuffer(1024);
351 |       char[] buf = new char[512];
352 |       int nRead;
353 |       while ( (nRead = lines.read(buf, 0, buf.length)) > 0 ) {
354 |         output.append(buf, 0, nRead);
355 |       }
356 |     }
357 |     
358 |     /** Get the output of the shell command.*/
359 |     public String getOutput() {
360 |       return (output == null) ? "" : output.toString();
361 |     }
362 | 
363 |     /**
364 |      * Returns the commands of this instance.
365 |      * Arguments with spaces in are presented with quotes round; other
366 |      * arguments are presented raw
367 |      *
368 |      * @return a string representation of the object.
369 |      */
370 |     @Override
371 |     public String toString() {
372 |       StringBuilder builder = new StringBuilder();
373 |       String[] args = getExecString();
374 |       for (String s : args) {
375 |         if (s.indexOf(' ') >= 0) {
376 |           builder.append('"').append(s).append('"');
377 |         } else {
378 |           builder.append(s);
379 |         }
380 |         builder.append(' ');
381 |       }
382 |       return builder.toString();
383 |     }
384 |   }
385 |   
386 |   /**
387 |    * To check if the passed script to shell command executor timed out or
388 |    * not.
389 |    * 
390 |    * @return if the script timed out.
391 |    */
392 |   public boolean isTimedOut() {
393 |     return timedOut.get();
394 |   }
395 |   
396 |   /**
397 |    * Set if the command has timed out.
398 |    * 
399 |    */
400 |   private void setTimedOut() {
401 |     this.timedOut.set(true);
402 |   }
403 |   
404 |   /** 
405 |    * Static method to execute a shell command. 
406 |    * Covers most of the simple cases without requiring the user to implement  
407 |    * the <code>Shell</code> interface.
408 |    * @param cmd shell command to execute.
409 |    * @return the output of the executed command.
410 |    */
411 |   public static String execCommand(String ... cmd) throws IOException {
412 |     return execCommand(null, cmd, 0L);
413 |   }
414 |   
415 |   /** 
416 |    * Static method to execute a shell command. 
417 |    * Covers most of the simple cases without requiring the user to implement  
418 |    * the <code>Shell</code> interface.
419 |    * @param env the map of environment key=value
420 |    * @param cmd shell command to execute.
421 |    * @param timeout time in milliseconds after which script should be marked timeout
422 |    * @return the output of the executed command.o
423 |    */
424 |   
425 |   public static String execCommand(Map<String, String> env, String[] cmd,
426 |       long timeout) throws IOException {
427 |     ShellCommandExecutor exec = new ShellCommandExecutor(cmd, null, env, 
428 |                                                           timeout);
429 |     exec.execute();
430 |     return exec.getOutput();
431 |   }
432 | 
433 |   /** 
434 |    * Static method to execute a shell command. 
435 |    * Covers most of the simple cases without requiring the user to implement  
436 |    * the <code>Shell</code> interface.
437 |    * @param env the map of environment key=value
438 |    * @param cmd shell command to execute.
439 |    * @return the output of the executed command.
440 |    */
441 |   public static String execCommand(Map<String,String> env, String ... cmd) 
442 |   throws IOException {
443 |     return execCommand(env, cmd, 0L);
444 |   }
445 |   
446 |   /**
447 |    * Timer which is used to timeout scripts spawned off by shell.
448 |    */
449 |   private static class ShellTimeoutTimerTask extends TimerTask {
450 | 
451 |     private Shell shell;
452 | 
453 |     public ShellTimeoutTimerTask(Shell shell) {
454 |       this.shell = shell;
455 |     }
456 | 
457 |     @Override
458 |     public void run() {
459 |       Process p = shell.getProcess();
460 |       try {
461 |         p.exitValue();
462 |       } catch (Exception e) {
463 |         //Process has not terminated.
464 |         //So check if it has completed 
465 |         //if not just destroy it.
466 |         if (p != null && !shell.completed.get()) {
467 |           shell.setTimedOut();
468 |           p.destroy();
469 |         }
470 |       }
471 |     }
472 |   }
473 | }
474 | 


--------------------------------------------------------------------------------
/test/com/minsheng/flume/source/TestMultiLineParser.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | import java.io.File;
  5 | import java.io.IOException;
  6 | import java.io.FileNotFoundException;
  7 | import java.util.List;
  8 | import java.util.ArrayList;
  9 | import java.util.Map;
 10 | import java.util.HashMap;
 11 | import java.util.concurrent.*;
 12 | import java.util.regex.Pattern;
 13 | import java.lang.*;
 14 | import java.io.PrintWriter;
 15 | import junit.framework.Assert;
 16 | import static org.junit.Assert.*;
 17 | 
 18 | import org.junit.Before;
 19 | import org.junit.After;
 20 | import org.junit.Test;
 21 | import org.apache.flume.Context;
 22 | 
 23 | import com.minsheng.flume.source.FileInfo;
 24 | import com.minsheng.flume.source.FlumeConstants;
 25 | import com.minsheng.flume.source.MultiLineParser;
 26 | import com.minsheng.flume.source.SimpleFileMonitor;
 27 | 
 28 | import org.slf4j.Logger;
 29 | import org.slf4j.LoggerFactory;
 30 | 
 31 | 
 32 | public class TestMultiLineParser {
 33 |   private static final Logger LOG = LoggerFactory
 34 |       .getLogger(TestMultiLineParser.class);
 35 |   
 36 |   private File target_dir_ = new File("/tmp/ms_flume/parser");
 37 |   
 38 |   private String first_regex_ = "\\[\\[.*";
 39 |   private String last_regex_ = ".*END\\]\\]";
 40 |   private Pattern start_line_pattern_ = Pattern.compile(first_regex_);
 41 |   private Pattern end_line_pattern_ = Pattern.compile(last_regex_);
 42 |   private Pattern record_include_pattern_ = 
 43 |       Pattern.compile(".*INCLUDE_RECORD.*");
 44 |   private Pattern record_exclude_pattern_ = 
 45 |       Pattern.compile(".*EXCLUDE_RECORD.*");
 46 |   
 47 |   private String record_include_str_ = "\tINCLUDE_RECORD";
 48 |   private String record_exclude_str_ = "\tEXCLUDE_RECORD";
 49 |   
 50 |   
 51 |   public void CreateMIXFile(File file) {
 52 |     PrintWriter writer = null;
 53 |     try {
 54 |       writer = new PrintWriter(file);
 55 |       String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line";
 56 |       String end_line = "\trecord end line END]]";
 57 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
 58 |       
 59 |       // write record 1  ---include if include_record = .*
 60 |       writer.println(start_line);
 61 |       writer.println(end_line);
 62 |       
 63 |       // write record 1  ---include
 64 |       writer.println(start_line);
 65 |       writer.print(record_include_str_);
 66 |       writer.println(end_line);
 67 |       
 68 |    // write record 1  ---exclude
 69 |       writer.println(start_line);
 70 |       writer.print(record_exclude_str_);
 71 |       writer.println(end_line);
 72 |       
 73 |       // write record 2  -- include
 74 |       writer.println(start_line);
 75 |       writer.print(record_include_str_);
 76 |       writer.println(mid_line);
 77 |       writer.println(end_line);
 78 |       
 79 |       // write record 2  -- exclude
 80 |       writer.println(start_line);
 81 |       writer.print(record_exclude_str_);
 82 |       writer.println(mid_line);
 83 |       writer.println(end_line);
 84 | 
 85 |       // write random line and tailer witout header
 86 |       // we should see this as a new record
 87 |       // writer.println("\tno header no header no header");
 88 |       // writer.println(end_line);
 89 |       
 90 |       
 91 |       // write random line and tailer witout header
 92 |       // we should never see this, because it will wait the end or start line;
 93 |       writer.println(start_line);
 94 |       writer.println("\t no tailer no tailer no tailer");
 95 |     } catch (FileNotFoundException e) {
 96 |       LOG.error("Write content failed at file: " + file);
 97 |     } finally {
 98 |       if (writer != null) {
 99 |         writer.close();
100 |         writer = null;
101 |       }
102 |     } 
103 |   }
104 |   
105 |   public void CreateFIRSTFile(File file) {
106 |     PrintWriter writer = null;
107 |     try {
108 |       writer = new PrintWriter(file);
109 |       String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line";
110 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
111 |       // write record 1
112 |       writer.println(start_line);
113 |       writer.print(record_include_str_);
114 |       // write record 2
115 |       writer.println(start_line);
116 |       writer.print(record_exclude_str_);
117 |       writer.println(mid_line);
118 |       
119 |       // write record 3
120 |       writer.println(start_line);
121 |       writer.println(mid_line);
122 |       writer.println(mid_line);
123 |       writer.println(mid_line);
124 |       writer.println(mid_line);
125 |       writer.println(mid_line);
126 |       writer.println(mid_line);
127 | 
128 |     } catch (FileNotFoundException e) {
129 |       LOG.error("Write content failed at file: " + file);
130 |     } finally {
131 |       if (writer != null) {
132 |         writer.close();
133 |         writer = null;
134 |       }
135 |     } 
136 |   }
137 |   
138 |   public void CreateLASTFile(File file) {
139 |     PrintWriter writer = null;
140 |     try {
141 |       writer = new PrintWriter(file);
142 |       String end_line = "record end line END]]";
143 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
144 |       // write record 1
145 |       writer.print(record_include_str_);
146 |       writer.println();
147 |       writer.println(end_line);
148 |       // write record 2
149 |       writer.println(mid_line);
150 |       writer.print(record_include_str_);
151 |       writer.println(end_line);
152 |       
153 |       // write record 3
154 | 
155 |       writer.println(mid_line);
156 |       writer.println(mid_line);
157 |       writer.println(mid_line);
158 |       writer.println();
159 |       writer.println(end_line);
160 |       
161 |       // useless line
162 |       writer.println(mid_line);
163 |       writer.println(mid_line);
164 |       writer.println();
165 |       writer.println();
166 |     } catch (FileNotFoundException e) {
167 |       LOG.error("Write content failed at file: " + file);
168 |     } finally {
169 |       if (writer != null) {
170 |         writer.close();
171 |         writer = null;
172 |       }
173 |     } 
174 |   }
175 |   
176 |   public void CreateNONEFile(File file) {
177 |     PrintWriter writer = null;
178 |     try {
179 |       writer = new PrintWriter(file);
180 |       String end_line = "[[INFO] 2013-01-17 13:54:32 random arbitray";
181 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
182 |       // write record 1
183 |       writer.println(end_line);
184 |       writer.print(record_exclude_str_);
185 |       writer.println();
186 |       writer.print(record_include_str_);
187 |       // write record 2
188 |       writer.println(end_line);
189 |       writer.println(mid_line);
190 |       writer.println();
191 |     } catch (FileNotFoundException e) {
192 |       LOG.error("Write content failed at file: " + file);
193 |     } finally {
194 |       if (writer != null) {
195 |         writer.close();
196 |         writer = null;
197 |       }
198 |     } 
199 |   }
200 |   
201 |   public void CreateFIRSTBigRecordFile(File file) {
202 |     PrintWriter writer = null;
203 |     try {
204 |       writer = new PrintWriter(file);
205 |       String start_line = "[[INFO] 2013-01-17 13:54:32 reord first line";
206 |       String mid_line = "\tmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm";
207 |       // write record 1
208 |       writer.println(start_line);
209 |       writer.println();
210 |       writer.println(mid_line);
211 |       writer.println();
212 |       // write record 3
213 |       writer.println(start_line);
214 |       writer.println(mid_line);
215 |       writer.println(mid_line);
216 |       writer.println(mid_line);
217 |       writer.println(mid_line);
218 |       writer.println(mid_line);
219 |       writer.println(mid_line);
220 |       writer.println(mid_line);
221 |       writer.println(mid_line);
222 |       writer.println(mid_line);
223 |       writer.println(mid_line);
224 |       writer.println();
225 | 
226 |     } catch (FileNotFoundException e) {
227 |       LOG.error("Write content failed at file: " + file);
228 |     } finally {
229 |       if (writer != null) {
230 |         writer.close();
231 |         writer = null;
232 |       }
233 |     } 
234 |   }
235 |   
236 |   @Before 
237 |   public void SetUp() {
238 |    LOG.info("Start test MultiLineParser");
239 |    LOG.info("Create test dir: " + target_dir_);
240 |    target_dir_.mkdirs();
241 |    Assert.assertTrue(target_dir_.exists());
242 |   }
243 |   
244 |   public void WriteFile(File file, List<String> records) {
245 |     PrintWriter writer = null;
246 |     try {
247 |       writer = new PrintWriter(file);
248 |       for (String line : records) {
249 |         writer.print(line);
250 |       }
251 |     } catch (FileNotFoundException e) {
252 |       LOG.error("Write content failed at file: " + file);
253 |     } finally {
254 |       if (writer != null) {
255 |         writer.close();
256 |         writer = null;
257 |       }
258 |     } 
259 |   }
260 |   
261 |   @Test
262 |   public void TestMixParse() {
263 |     LOG.info("Start test mix file mode");
264 |     Map<String, String> params = new HashMap<String, String>();
265 |     params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD.");
266 |     params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD.");
267 |     params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_);
268 |     params.put(FlumeConstants.LAST_LINE_PATTERN, last_regex_);
269 |     Context context = new Context(params);
270 |     
271 |     File mix_file = new File(target_dir_ + "/" + "mix.file");
272 |     File mix_file_handled = new File(target_dir_ + "/" + "mix.file.hd");
273 |     CreateMIXFile(mix_file);
274 |     
275 |     MultiLineParser parser = new MultiLineParser();
276 |     parser.Configure(context);
277 |     List<String> records = parser.GetNextBatchRecords(mix_file.toString(), 0L);
278 |     int cnter = 0;
279 |     int byte_length = 0;
280 |     for (String record : records) {
281 |       cnter++;
282 |       LOG.info("the \'" + cnter + "\' record=" + record);
283 |       byte_length += record.getBytes().length;
284 |     }
285 |     WriteFile(mix_file_handled, records);
286 |     LOG.info("***********Summary Info**************");
287 |     LOG.info("File: " + mix_file);
288 |     LOG.info("Processed Record Num:" + cnter);
289 |     LOG.info("File Length: " + mix_file.length());
290 |     LOG.info("Process Bytes:" + byte_length);
291 |     LOG.info("*************************************");
292 |   }
293 |   
294 |   @Test
295 |   public void TestFirstParse() {
296 |     LOG.info("Start test first file mode");
297 |     Map<String, String> params = new HashMap<String, String>();
298 |     params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD.");
299 |     params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD.");
300 |     params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_);
301 |     Context context = new Context(params);
302 |     
303 |     File first_file = new File(target_dir_ + "/" + "first.file");
304 |     File first_file_handled = new File(target_dir_ + "/" + "first.file.hd");
305 |     CreateFIRSTFile(first_file);
306 |     
307 |     MultiLineParser parser = new MultiLineParser();
308 |     parser.Configure(context);
309 |     List<String> records = parser.GetNextBatchRecords(first_file.toString(), 0L);
310 |     int cnter = 0;
311 |     int byte_length = 0;
312 |     for (String record : records) {
313 |       cnter++;
314 |       LOG.info("the \'" + cnter + "\' record=" + record);
315 |       byte_length += record.getBytes().length;
316 |     }
317 |     WriteFile(first_file_handled, records);
318 |     LOG.info("***********Summary Info**************");
319 |     LOG.info("File: " + first_file);
320 |     LOG.info("Processed Record Num:" + cnter);
321 |     LOG.info("File Length: " + first_file.length());
322 |     LOG.info("Process Bytes:" + byte_length);
323 |     LOG.info("*************************************");
324 |   }
325 |   
326 |   @Test
327 |   public void TestLastParse() {
328 |     LOG.info("Start test last file mode");
329 |     Map<String, String> params = new HashMap<String, String>();
330 |     params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD.");
331 |     params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_);
332 |     Context context = new Context(params);
333 |     
334 |     File last_file = new File(target_dir_ + "/" + "last.file");
335 |     File last_file_handled = new File(target_dir_ + "/" + "last.file.hd");
336 |     CreateLASTFile(last_file);
337 |     
338 |     MultiLineParser parser = new MultiLineParser();
339 |     parser.Configure(context);
340 |     List<String> records = parser.GetNextBatchRecords(last_file.toString(), 0L);
341 |     int cnter = 0;
342 |     int byte_length = 0;
343 |     for (String record : records) {
344 |       cnter++;
345 |       LOG.info("the \'" + cnter + "\' record=" + record);
346 |       byte_length += record.getBytes().length;
347 |     }
348 |     WriteFile(last_file_handled, records);
349 |     LOG.info("***********Summary Info**************");
350 |     LOG.info("File: " + last_file);
351 |     LOG.info("Processed Record Num:" + cnter);
352 |     LOG.info("File Length: " + last_file.length());
353 |     LOG.info("Process Bytes:" + byte_length);
354 |     LOG.info("*************************************");
355 |   }
356 |   
357 | 
358 |   @Test
359 |   public void TestNoneParse() {
360 |     LOG.info("Start test none  mode");
361 |     Map<String, String> params = new HashMap<String, String>();
362 |     params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD.");
363 |     params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD.");
364 |     params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_);
365 |     Context context = new Context(params);
366 |     
367 |     File none_file = new File(target_dir_ + "/" + "none.file");
368 |     File none_file_handled = new File(target_dir_ + "/" + "none.file.hd");
369 |     CreateNONEFile(none_file);
370 |     
371 |     MultiLineParser parser = new MultiLineParser();
372 |     parser.Configure(context);
373 |     List<String> records = parser.GetNextBatchRecords(none_file.toString(), 0L);
374 |     int cnter = 0;
375 |     int byte_length = 0;
376 |     for (String record : records) {
377 |       cnter++;
378 |       LOG.info("the \'" + cnter + "\' record=" + record);
379 |       byte_length += record.getBytes().length;
380 |     }
381 |     WriteFile(none_file_handled, records);
382 |     LOG.info("***********Summary Info**************");
383 |     LOG.info("File: " + none_file);
384 |     LOG.info("Processed Record Num:" + cnter);
385 |     LOG.info("File Length: " + none_file.length());
386 |     LOG.info("Process Bytes:" + byte_length);
387 |     LOG.info("*************************************");
388 |   }
389 |   
390 |   @Test
391 |   public void TestFirstBigLine() {
392 |     LOG.info("Start test first file mode with very small buffersize");
393 |     Map<String, String> params = new HashMap<String, String>();
394 |     params.put(FlumeConstants.FILE_CONTENT_INCLUDE, ".*INCLUDE_RECORD.");
395 |     params.put(FlumeConstants.FILE_CONTENT_EXCLUDE, ".*EXCLUDE_RECORD.");
396 |     params.put(FlumeConstants.FIRST_LINE_PATTERN, first_regex_);
397 |     params.put("read_buffer_size", "10");
398 |     params.put("max_read_buffer_size", "200");
399 |     params.put("max_record_size", "100");
400 |     Context context = new Context(params);
401 |     
402 |     File first_file = new File(target_dir_ + "/" + "first.bigrecord.file");
403 |     File first_file_handled = new File(target_dir_ + "/" + 
404 |                                         "first.bigrecord.file.hd");
405 |     CreateFIRSTBigRecordFile(first_file);
406 |     
407 |     MultiLineParser parser = new MultiLineParser();
408 |     parser.Configure(context);
409 |     int cnter = 0;
410 |     int max_round = 20;
411 |     Long offset = 0L;
412 |     
413 |     List<String> total_records = new ArrayList<String>();
414 |     while (offset < first_file.length() && cnter++ < max_round) {
415 |       List<String> records = parser.GetNextBatchRecords(first_file.toString(),
416 |           offset);
417 |       int record_cnter = 0;
418 |       int byte_length = 0;
419 |       for (String record : records) {
420 |         record_cnter++;
421 |         LOG.info("the \'" + record_cnter + "\' record=" + record);
422 |         byte_length += record.getBytes().length;
423 |       }
424 |       offset += byte_length;
425 |       total_records.addAll(records);
426 |     }
427 |     WriteFile(first_file_handled, total_records);  
428 |     
429 |     LOG.info("***********Summary Info**************");
430 |     LOG.info("File: " + first_file);
431 |     LOG.info("Processed Record Num:" + cnter);
432 |     LOG.info("File Length: " + first_file.length());
433 |     LOG.info("Process Bytes:" + offset);
434 |     LOG.info("*************************************");
435 |   }
436 | }


--------------------------------------------------------------------------------
/src/com/minsheng/flume/source/MultiLineParser.java:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2014 Minsheng.Corp. All rights reserved
  2 | // Author: peng.he.ia@gmail.com <he peng>
  3 | package com.minsheng.flume.source;
  4 | 
  5 | import java.io.FileNotFoundException;
  6 | import java.io.IOException;
  7 | import java.io.RandomAccessFile;
  8 | import java.util.ArrayList;
  9 | import java.util.List;
 10 | import java.util.regex.Pattern;
 11 | import java.util.regex.Matcher;
 12 | 
 13 | import org.apache.flume.Context;
 14 | import org.slf4j.Logger;
 15 | import org.slf4j.LoggerFactory;
 16 | 
 17 | import com.minsheng.util.StringUtil;
 18 | 
 19 | public class MultiLineParser extends FileParser {
 20 |   private static final Logger LOG = LoggerFactory
 21 |       .getLogger(MultiLineParser.class);
 22 | 
 23 |   private int RECORD_INIT_SIZE = 1024;
 24 |   private int max_record_size_ = FlumeConstants.MAX_RECORD_LENGH;
 25 |   private int max_buffer_size_ = FlumeConstants.MAX_READ_BUFFER_SIZE;
 26 |   // start from 1MB
 27 |   private int buffer_size_ = FlumeConstants.READ_BUFFER_SIZE;
 28 |   private byte[] read_buffer_ = null;
 29 |   private String content_str_ = null;
 30 |   /*
 31 |    * when use ParseFirst mode, we need a signal to indicate: hi, body,this is
 32 |    * the last line (without the end line pattern), we should treat it as a
 33 |    * record, we do not need to wait data
 34 |    */
 35 |   private boolean is_end_of_file_ = false;
 36 | 
 37 |   /*
 38 |    * this pattern is for filter record; default all records are accepted;
 39 |    */
 40 |   
 41 |   private String default_content_include_str_ = "[\\s\\S]*"; // match all,
 42 |   private String file_content_include_pattern_str_ 
 43 |         = default_content_include_str_;
 44 |   // default no exclude
 45 |   private String file_content_exclude_pattern_str_ = null;
 46 |   private Pattern record_include_pattern_ = null;
 47 |   private Pattern record_exclude_pattern_ = null;
 48 | 
  /*
   * all combinations:
   *
   * MIX -- first & last
   *   ret1 = line.match(first), ret2 = line.match(last)
   *   (1) ret1 == true && ret2 == true: current line is a new record
   *   (2) ret1 == true && ret2 == false: read next line and do the process
   *       again
   *   (3) ret1 == false && ret2 == true: current line is the last line in
   *       current record
   *   (4) ret1 == false && ret2 == false: invalid line; it means maybe the
   *       "first_line_pattern" and the "last_line_pattern" should be modified
   *       for matching
   *
   * FIRST -- first
   *   ret1 = line.match(first)
   *   (1) ret1 == true: current line is the start line of a new record; it
   *       means the previous line is the last line of the current record
   *   (2) ret1 == false: current line belongs to the current record, read
   *       next line to process
   *
   * LAST -- last
   *   ret1 = line.match(last)
   *   (1) ret1 == true: current line is the last line of current record, and
   *       next line is the start line of the next record
   *   (2) ret1 == false: current line belongs to the current record; read
   *       next line to process
   *
   * NONE -- none
   *   use default strategy, treat every line as a new record
   *
   * default is NONE
   */
 67 |   enum ParseType {
 68 |     MIX, FIRST, LAST, NONE
 69 |   };
 70 | 
 71 |   private ParseType parse_type_ = ParseType.NONE;
 72 |   private String first_line_pattern_str_ = null;
 73 |   private String last_line_pattern_str_ = null;
 74 |   private Pattern first_line_pattern_ = null;
 75 |   private Pattern last_line_pattern_ = null;
 76 | 
 77 |   /*
 78 |    * parse_type
 79 |    */
 80 | 
 81 |   public MultiLineParser() {
 82 |     super();
 83 |   }
 84 | 
 85 |   @Override
 86 |   public List<String> GetNextBatchRecords(String file_name, Long offset) {
 87 |     // TODO Auto-generated method stub
 88 |     RandomAccessFile file_reader = null;
 89 |     try {
 90 |       file_reader = new RandomAccessFile(file_name, "r");
 91 |       if (file_reader.length() < offset) {
 92 |         LOG.warn("File length small than read offset, truncate(rename)? offset="
 93 |             + offset + " file_length=" + file_reader.length());
 94 |         return new ArrayList<String>();
 95 |       }
 96 |       file_reader.seek(offset.longValue());
 97 |       if (LOG.isDebugEnabled()) {
 98 |         LOG.debug("Random Read: file {}, start {}, buffer_size {}" + buffer_size_,
 99 |             file_name, offset);
100 |       }
101 |       int read_bytes = file_reader.read(read_buffer_, 0,
102 |           read_buffer_.length);
103 |       // if read_bytes == -1, means stream is at end of file,
104 |       List<String> records = ParseRecord(read_bytes);
105 |       if (records == null) {
106 |         // the buffer is two large, we treat this as error
107 |         LOG.error("SUPER-WARNING, we meet super-huge line, which is larger than " 
108 |              + buffer_size_ + "MB, the following process will skip this file:" + 
109 |              file_name + " read_bytes=" + read_bytes + " offset=" + offset);
110 |         return new ArrayList<String>();
111 |       }
112 |       return records;
113 |     } catch (FileNotFoundException e) {
114 |       LOG.warn("target monitor file not exist, " + file_name);
115 |     } catch (IOException e) {
116 |       LOG.error("Read file error due to " + e.getMessage());
117 |     } finally {
118 |       if (file_reader != null) {
119 |         try {
120 |           file_reader.close();
121 |           file_reader = null;
122 |         } catch (IOException e) {
123 | 
124 |         }
125 |       }
126 |     }
127 |     return new ArrayList<String>();
128 |   }
129 | 
130 |   /*
131 |    * MIX -- first & last --ret1 = line.match(first),ret2 = line.match(last) (1)
132 |    * ret1 == true && ret2 == true current line is a new record (2) ret1 == true
133 |    * && ret2 == false read next line and do the process again (3) ret1 == false
134 |    * && ret2 == true current line is the last line in current record (4) ret1 ==
135 |    * false && ret2 == false invalid line means maybe the "first_line_pattern"
136 |    * and the "last_line_pattern" should be modified for matching
137 |    */
138 |   private List<String> ParseMIX(String lines[], int end_idx) {
139 |     StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
140 |     List<String> records = new ArrayList<String>();
141 |     for (int i = 0; i < end_idx; i++) {
142 |       boolean match_first = first_line_pattern_.matcher(lines[i]).matches();
143 |       boolean match_last = last_line_pattern_.matcher(lines[i]).matches();
144 |       if (LOG.isDebugEnabled()) {
145 |         LOG.debug("Process Line: " + lines[i]);
146 |       }
147 |       if (match_first) {
148 |         if (record.length() > 0) {
149 |           // means the previous line is also the end line of current record
150 |           records.add(record.toString());
151 |           if (LOG.isDebugEnabled()) {
152 |             LOG.debug("MATCH START, Get a new record(which miss its end line):"
153 |                 + record.toString());
154 |           }
155 |         }
156 |         // create a new record
157 |         record = new StringBuilder(RECORD_INIT_SIZE);
158 |         if (match_last) {
159 |           // current line is a record(first last both matched);
160 |           // records.add(lines[i] + FlumeConstants.LINE_SEP);
161 |           // we use our Split method, no need to add LINE_SEP
162 |           records.add(lines[i]);
163 |           if (LOG.isDebugEnabled()) {
164 |             LOG.debug("MATCH START-LAST, Get a new record:" + lines[i]);
165 |           }
166 |         } else {
167 |           record.append(lines[i]);
168 |           // record.append(FlumeConstants.LINE_SEP);
169 |         }
170 |       } else if (match_last) {
171 |         record.append(lines[i]);
172 |         // record.append(FlumeConstants.LINE_SEP);
173 |         records.add(record.toString());
174 |         if (LOG.isDebugEnabled()) {
175 |           LOG.debug("MATCH LAST, Get a new record:" + record.toString());
176 |         }
177 | 
178 |         record = null;
179 |         record = new StringBuilder(RECORD_INIT_SIZE);
180 |       } else {
181 |         record.append(lines[i]);
182 |         // this is a middle line, we recovery it's '\n' character
183 |         // in parserecord, new Split called
184 |         // record.append(FlumeConstants.LINE_SEP);
185 |       }
186 |     }
187 |     
188 |     HandleSpecialSituation(record, records);
189 |     record = null;
190 |     return records;
191 |   }
192 | 
193 |   private List<String> ParseFIRST(String[] lines, int end_idx) {
194 |     StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
195 |     List<String> records = new ArrayList<String>();
196 |     for (int i = 0; i < end_idx; i++) {
197 |       boolean match_first = first_line_pattern_.matcher(lines[i]).matches();
198 |       if (match_first) {
199 |         if (record.length() > 0) {
200 |           // means the previous line is also the end line of current record
201 |           records.add(record.toString());
202 |           // create a new record
203 |           if (LOG.isDebugEnabled()) {
204 |             LOG.debug("MATCH first, get a new record: " + record.toString());
205 |           }
206 |           record = null;
207 |           record = new StringBuilder(RECORD_INIT_SIZE);
208 |         }
209 |         record.append(lines[i]);
210 |         // record.append(FlumeConstants.LINE_SEP);
211 |       } else {
212 |         record.append(lines[i]);
213 |         // record.append(FlumeConstants.LINE_SEP);
214 |       }
215 |     }
216 |     
217 |     HandleSpecialSituation(record, records);
218 |     record = null;
219 |     return records;
220 |   }
221 | 
222 |   private List<String> ParseLAST(String[] lines, int end_idx) {
223 |     StringBuilder record = new StringBuilder(RECORD_INIT_SIZE);
224 |     List<String> records = new ArrayList<String>();
225 |     for (int i = 0; i < end_idx; i++) {
226 |       boolean match_last = last_line_pattern_.matcher(lines[i]).matches();
227 |       if (match_last) {
228 |         record.append(lines[i]);
229 |         // record.append(FlumeConstants.LINE_SEP);
230 |         records.add(record.toString());
231 |         LOG.debug("MATCH last, get a new record" + record.toString());
232 |         record = null;
233 |         record = new StringBuilder(RECORD_INIT_SIZE);
234 |       } else {
235 |         record.append(lines[i]);
236 |         // record.append(FlumeConstants.LINE_SEP);
237 |       }
238 |     }
239 |     /*
240 |      * means the last record is incomplete, we need more data to handle the last
241 |      * record, just skip if (record.length() >0) {
242 |      * 
243 |      * }
244 |      */
245 |     record = null;
246 |     return records;
247 |   }
248 |   
249 |   
250 |   private boolean ExpandReadBuffer() {
251 |     if (LOG.isDebugEnabled()) {
252 |       LOG.debug("ExpandReadBuffer called, current buffer size:" + buffer_size_);
253 |     }
254 |     if ((buffer_size_ << 1) > max_buffer_size_) {
255 |       // this is the only place we return null instead of empty list
256 |       return false;
257 |     }
258 |     read_buffer_ = null;
259 |     buffer_size_ = buffer_size_ * 2;
260 |     read_buffer_ = new byte[buffer_size_];
261 |     return true;
262 |   }
263 |   
264 |   private void HandleSpecialSituation(StringBuilder record, 
265 |  List<String> records) {
266 |     // end of file
267 |     if (is_end_of_file_ && record.length() > 0) {
268 |       // this is the end line of the record
269 |       records.add(record.toString());
270 |       if (LOG.isDebugEnabled()) {
271 |         LOG.debug("end of file in First or mix mode, get a new record"
272 |             + record.toString());
273 |       }
274 |     }
275 |     
276 |     /*
277 |      * we processed all the lines, but we do not get a record until now, it
278 |      * means this must be a very large record, currently we just get a piece of
279 |      * this record. on assumption this situation is rarely
280 |      */
281 |     if (records.size() == 0) {
282 |       if (record.length() > max_record_size_) {
283 |         records.add(record.toString());
284 |         if (LOG.isDebugEnabled()) {
285 |           LOG.debug("Big record, get a new record(part of) " + record.toString());
286 |         }
287 |       } 
288 |     }
289 |   }
290 | 
291 |   private List<String> ParseNONE(String[] lines, int end_idx) {
292 |     if (LOG.isDebugEnabled()) {
293 |       LOG.debug("ParseNONE mode, handle line num" + lines.length);
294 |     }
295 |     List<String> records = new ArrayList<String>();
296 |     for (int i = 0; i < end_idx; i++) {
297 |       // take each line as a record
298 |       records.add(lines[i]);
299 |     }
300 |     return records;
301 |   }
302 | 
303 |   private List<String> ParseRecord(int read_bytes) {
304 |     if (-1 == read_bytes) {
305 |       // no more data can be read
306 |       return new ArrayList<String>();
307 |     }
308 | 
309 |     // only used for ParseFirst
310 |     is_end_of_file_ = false;
311 |     if (read_bytes < read_buffer_.length) {
312 |       is_end_of_file_ = true;
313 |     }
314 |     
315 |     content_str_ = new String(read_buffer_, 0, read_bytes);
316 |     /*
317 |      * special situation: very-very big line, content in read_buffer_ is part of
318 |      * a line, in this situation, we return null, and double the buffer size
319 |      */
320 |     if (!is_end_of_file_ && content_str_.indexOf(FlumeConstants.LINE_SEP) == -1) {
321 |       if (LOG.isDebugEnabled()) {
322 |         LOG.debug("cannot find a linesep in this content, content_size: {}, content ={}",
323 |             content_str_.length(), content_str_);
324 |       }
325 |       if (!ExpandReadBuffer()) {
326 |         return null;
327 |       }
328 |       return new ArrayList<String>();
329 |     }
330 |     
331 |     int last_line_sep = content_str_.lastIndexOf(FlumeConstants.LINE_SEP);
332 |     
333 |     /**when use String.split,it will split "123\n\n" to:
334 |      *    "123"
335 |      * whe use StringUtil.Split, it split "123\n\n"to:
336 |      *   "123\n"
337 |      *   "\n"
338 |      * we need StringUtil.Split
339 |      * */  
340 |     // String[] lines = content_str_.split(FlumeConstants.LINE_SEP);
341 |     String[] lines = StringUtil.Split(content_str_, FlumeConstants.LINE_SEP);
342 | 
343 |     /*
344 |      * this means,the last line is a complete line, we can handle it otherwise,
345 |      * it is just part of the line, drop it in this process round
346 |      */
347 |     int end_idx = lines.length;
348 |     if (!is_end_of_file_ && last_line_sep != (content_str_.length() - 1)) {
349 |       // the last line is not a complete line
350 |       end_idx = end_idx - 1;
351 |     }
352 |    
353 |     List<String> records = null;
354 |     switch (parse_type_) {
355 |     case MIX:
356 |       records = ParseMIX(lines, end_idx);
357 |       break;
358 |     case FIRST:
359 |       records = ParseFIRST(lines, end_idx);
360 |       break;
361 |     case LAST:
362 |       records = ParseLAST(lines, end_idx);
363 |       break;
364 |     default:
365 |       records = ParseNONE(lines, end_idx);
366 |       break;
367 |     }
368 |     
369 |     if (!is_end_of_file_ && records.size() == 0 && content_str_.length() < max_record_size_) {
370 |       /**
371 |        * this is a tough situation current data in lines is part of a record,
372 |        * but current data is small than max_record_size, this will make it into
373 |        * the following loop: (1) read data into buffer; (2) parse data, found no
374 |        * new record( records.size == 0 current lines is a part of the record)
375 |        * (3) current data is small than max record, so no record generate, and
376 |        * buffer do not double the size; (4) next round, go to the (1),with no
377 |        * change of all meata data(offset,buffersize) data
378 |        * */
379 |       if (LOG.isDebugEnabled()) {
380 |         LOG.debug("read lines {}, but no record, expand buffer", lines.length);
381 |       }
382 |       if (!ExpandReadBuffer()) {
383 |         return null;
384 |       }
385 |     }
386 |     return records;
387 |   }
388 | 
389 |   /*
390 |    * @return false -- this record is valid, keep it true -- this record is
391 |    * invalid, drop it
392 |    */
393 |   public boolean ShouldDrop(String record) {
394 |     Matcher in_matcher = record_include_pattern_.matcher(record);
395 | 
396 |     if (!in_matcher.matches()) {
397 |       // not in white list, drop it;(default pattern matches all)
398 |       return true;
399 |     }
400 | 
401 |     if (record_exclude_pattern_ != null) {
402 |       Matcher ex_matcher = record_exclude_pattern_.matcher(record);
403 |       if (ex_matcher.matches()) {
404 |         // in black list, should drop
405 |         return true;
406 |       }
407 |     }
408 |     // in white list and not in black list, this record is legal, keep it
409 |     return false;
410 |   }
411 | 
412 |   @Override
413 |   public void Configure(Context context) {
414 |     // TODO Auto-generated method stub
415 |     LOG.info("Config MultiLineParser");
416 |     buffer_size_ = context.getInteger("read_buffer_size", 
417 |         FlumeConstants.READ_BUFFER_SIZE).intValue();
418 |     read_buffer_ = new byte[buffer_size_];
419 |     max_buffer_size_ = context.getInteger("max_read_buffer_size", 
420 |         FlumeConstants.MAX_READ_BUFFER_SIZE).intValue();
421 |     max_record_size_ = context.getInteger("max_record_size",
422 |         FlumeConstants.MAX_RECORD_LENGH);
423 |     
424 |     file_content_include_pattern_str_ = context.getString(
425 |         FlumeConstants.FILE_CONTENT_INCLUDE, default_content_include_str_);
426 |     file_content_exclude_pattern_str_ = context
427 |         .getString(FlumeConstants.FILE_CONTENT_EXCLUDE);
428 | 
429 |     record_include_pattern_ = Pattern
430 |         .compile(file_content_include_pattern_str_);
431 |     if (file_content_exclude_pattern_str_ != null) {
432 |       record_exclude_pattern_ = Pattern
433 |           .compile(file_content_exclude_pattern_str_);
434 |     }
435 | 
436 |     first_line_pattern_str_ = context
437 |         .getString(FlumeConstants.FIRST_LINE_PATTERN);
438 |     last_line_pattern_str_ = context
439 |         .getString(FlumeConstants.LAST_LINE_PATTERN);
440 |     if (first_line_pattern_str_ != null) {
441 |       first_line_pattern_ = Pattern.compile(first_line_pattern_str_);
442 |     }
443 |     if (last_line_pattern_str_ != null) {
444 |       last_line_pattern_ = Pattern.compile(last_line_pattern_str_);
445 |     }
446 | 
447 |     if (first_line_pattern_ != null && last_line_pattern_ != null) {
448 |       parse_type_ = ParseType.MIX;
449 |     } else if (first_line_pattern_ != null) {
450 |       parse_type_ = ParseType.FIRST;
451 |     } else if (last_line_pattern_ != null) {
452 |       parse_type_ = ParseType.LAST;
453 |     } else {
454 |       parse_type_ = ParseType.NONE;
455 |     }
456 | 
457 |     StringBuilder builder = new StringBuilder();
458 |     builder.append("Config MultiLineParser with [");
459 |     builder.append("read_buffer_size(init)=");
460 |     builder.append(buffer_size_);
461 |     builder.append(",max_buffer_size=");
462 |     builder.append(max_buffer_size_);
463 |     builder.append(",max_record_size=");
464 |     builder.append(max_record_size_);
465 |     builder.append(",first_line_pattern_str_=");
466 |     builder.append(first_line_pattern_str_);
467 |     builder.append(",last_line_pattern_str_=");
468 |     builder.append(last_line_pattern_str_);
469 |     builder.append(",file_content_include_pattern_str_=");
470 |     builder.append(file_content_include_pattern_str_);
471 |     builder.append(",record_include_pattern_=");
472 |     builder.append(record_include_pattern_ == null ? "null"
473 |         : record_include_pattern_.toString());
474 |     builder.append(",file_content_exclude_pattern_str_=");
475 |     builder.append(file_content_exclude_pattern_str_);
476 |     builder.append(",record_exclude_pattern_=");
477 |     builder.append(record_exclude_pattern_ == null ? "null"
478 |         : record_exclude_pattern_.toString());
479 |     builder.append(", parse_type=");
480 |     builder.append("" + parse_type_);
481 |     builder.append("]");
482 |     LOG.info(builder.toString());
483 |     builder = null;
484 |   }
485 | }


--------------------------------------------------------------------------------