├── .gitignore
├── LICENSE.txt
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com
        │       └── lucidworks
        │           └── dq
        │               ├── data
        │               │   ├── DateChecker.java
        │               │   ├── DeleteByIds.java
        │               │   ├── DocCount.java
        │               │   ├── DumpIds.java
        │               │   ├── EmptyFieldStats.java
        │               │   ├── SolrToCsv.java
        │               │   ├── SolrToSolr.java
        │               │   ├── TermCodepointStats.java
        │               │   ├── TermStats.java
        │               │   └── TestArgs.java
        │               ├── diff
        │               │   ├── DiffEmptyFieldStats.java
        │               │   ├── DiffIds.java
        │               │   ├── DiffSchema.java
        │               │   └── DiffSolrConfig.java
        │               ├── logs
        │               │   ├── LogEntry.java
        │               │   ├── LogEntryBase.java
        │               │   ├── LogEntryFromSolr.java
        │               │   ├── LogEntryGroup.java
        │               │   ├── LogEntryGroupFromSolr.java
        │               │   ├── LogEntryReference.java
        │               │   ├── LogEntryReferenceBase.java
        │               │   ├── LogFile.java
        │               │   ├── LogFileBase.java
        │               │   ├── LogFileFromSolr.java
        │               │   ├── LogFileRepo.java
        │               │   └── LogFileRepoBase.java
        │               ├── schema
        │               │   ├── Schema.java
        │               │   ├── SchemaBase.java
        │               │   ├── SchemaFromLocalCore_broken.java
        │               │   ├── SchemaFromRest.java
        │               │   ├── SchemaFromRestAdHock.java
        │               │   ├── SchemaFromXml.java
        │               │   ├── SchemalessPlus.java
        │               │   ├── SolrConfig.java
        │               │   ├── SolrConfigBase.java
        │               │   └── SolrConfigFromXml.java
        │               ├── util
        │               │   ├── CharUtils.java
        │               │   ├── CmdLineLauncher.java
        │               │   ├── DateUtils.java
        │               │   ├── HasDescription.java
        │               │   ├── HashAndShard.java
        │               │   ├── IO_Utils.java
        │               │   ├── LLR.java
        │               │   ├── LLR.java-new
        │               │   ├── SetUtils.java
        │               │   ├── SolrUtils.java
        │               │   ├── StatsUtils.java
        │               │   ├── StringUtils.java
        │               │   └── TupleEntropy.java
        │               └── zk_experiment
        │                   └── ZkSmartClient.java
        └── resources
            ├── DQ-Prototype-and-SolrJ.key
            ├── DQ-Prototype-and-SolrJ.pdf
            ├── sample-reports
            │   ├── README.txt
            │   ├── dates-curve-fitting.txt
            │   ├── llr-larger-sample.txt
            │   ├── llr-tiny-sample.txt
            │   ├── populated-fields-diff.txt
            │   ├── populated-fields-single-extended-options.txt
            │   ├── populated-fields-single.txt
            │   ├── report-terms-via-termsReqHandler.txt
            │   ├── schema-info-diff.txt
            │   ├── schema-info-single.txt
            │   ├── term-counts.txt
            │   ├── term-lengths.txt
            │   ├── unicode-format1.txt
            │   └── unicode-format2.txt
            ├── schema-461.xml
            └── schema-481.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Eclipse
4 | .classpath
5 | .project
6 | .settings
7 |
8 | # Package Files #
9 | *.jar
10 | *.war
11 | *.ear
12 | /target
13 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
2 |
3 | You may obtain a copy of the License at:
4 | http://www.apache.org/licenses/LICENSE-2.0
5 |
6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
7 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.lucidworks</groupId>
  <artifactId>data-quality-java</artifactId>
  <packaging>jar</packaging>
  <version>1.0-SNAPSHOT</version>
  <name>data-quality-java</name>
  <url>http://maven.apache.org</url>

  <description>Data-Quality Checks</description>

  <properties>
    <solr.version>4.10.3</solr.version>
    <!-- a second property with value 1.6.4 appears here; its element name did not survive -->
  </properties>

  <dependencies>

    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>${solr.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-core</artifactId>
      <version>${solr.version}</version>
    </dependency>

    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.2</version>
    </dependency>

    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.2.4</version>
    </dependency>

    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.6.4</version>
    </dependency>

    <!-- ... -->

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.1</version>
    </dependency>

  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.2</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <!-- a boolean setting with value false appears here; its element name did not survive -->
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.lucidworks.dq.util.CmdLineLauncher</mainClass>
                </transformer>
              </transformers>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
--------------------------------------------------------------------------------
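Build note: the maven-shade-plugin configuration above binds to the package phase and registers com.lucidworks.dq.util.CmdLineLauncher as the jar's Main-Class, so running "mvn package" should produce a single self-contained jar, by default target/data-quality-java-1.0-SNAPSHOT.jar given the artifactId and version. The individual tools are presumably invoked through that launcher, roughly "java -jar target/data-quality-java-1.0-SNAPSHOT.jar DocCount -u http://localhost:8983/solr/collection1"; CmdLineLauncher itself is not included in this snapshot, so the exact tool-name dispatch is an assumption, and each tool's HELP_USAGE string documents its own arguments.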
/src/main/java/com/lucidworks/dq/data/DeleteByIds.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.data;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStreamReader;
7 | import java.io.PrintWriter;
8 | import java.io.StringWriter;
9 | import java.nio.charset.Charset;
10 | import java.nio.charset.CharsetDecoder;
11 | import java.nio.charset.CodingErrorAction;
12 | import java.text.MessageFormat;
13 | import java.text.NumberFormat;
14 | import java.util.Arrays;
15 | import java.util.Collection;
16 | import java.util.LinkedHashMap;
17 | import java.util.LinkedHashSet;
18 | import java.util.LinkedList;
19 | import java.util.List;
20 | import java.util.Map;
21 | import java.util.Map.Entry;
22 | import java.util.Set;
23 |
24 | import org.apache.commons.cli.CommandLine;
25 | import org.apache.commons.cli.CommandLineParser;
26 | import org.apache.commons.cli.HelpFormatter;
27 | import org.apache.commons.cli.OptionBuilder;
28 | import org.apache.commons.cli.Options;
29 | import org.apache.commons.cli.ParseException;
30 | import org.apache.commons.cli.PosixParser;
31 | import org.apache.solr.client.solrj.SolrServerException;
32 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
33 |
34 | import com.lucidworks.dq.util.HasDescription;
35 | import com.lucidworks.dq.util.SetUtils;
36 | import com.lucidworks.dq.util.SolrUtils;
37 |
38 | public class DeleteByIds /*implements HasDescription*/ {
39 |
40 | static String HELP_WHAT_IS_IT = "Delete documents by their ID, either passed on the command line, or from a file, or from standard in / stdin.";
41 | static String HELP_USAGE = "DeleteByIds -u http://localhost:8983/collection1 --ids 1234 5678 ... or --input_file ids_to_delete.txt";
42 |
43 | public static String getShortDescription() {
44 | return HELP_WHAT_IS_IT;
45 | }
46 |
47 | static int DEFAULT_BATCH_SIZE = 1000;
48 |
49 | static Options options;
50 |
51 | // We use List instead of Set because that's what SolrJ expects in deleteById
52 | static List<String> readIdsFromFile( String targetFile, CharsetDecoder deccoder ) throws IOException {
53 | List<String> ids = new LinkedList<>();
54 | BufferedReader in = null;
55 | if( null!=targetFile && ! targetFile.equals("-") ) {
56 | in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder));
57 | } else {
58 | in = new BufferedReader(new InputStreamReader(System.in, deccoder));
59 | }
60 | String line;
61 | while ((line = in.readLine()) != null) {
62 | // skip completely blank lines, but doesn't do any trimming
63 | if ( line.length()<1 ) {
64 | continue;
65 | }
66 | ids.add( line );
67 | }
68 | in.close();
69 | return ids;
70 | }
71 |
72 | static void helpAndExit() {
73 | helpAndExit( null, 1 );
74 | }
75 | static void helpAndExit( String optionalError, int errorCode ) {
76 | HelpFormatter formatter = new HelpFormatter();
77 | if ( null==optionalError ) {
78 | System.err.println( HELP_WHAT_IS_IT );
79 | }
80 | else {
81 | // log.error( optionalError );
82 | System.err.println( optionalError );
83 | }
84 | // stdout
85 | //formatter.printHelp( HELP_USAGE, options, true );
86 | // stderr
87 | PrintWriter pw = new PrintWriter(System.err);
88 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
89 | pw.flush();
90 | System.exit( errorCode );
91 | }
92 |
93 | public static void main( String [] argv ) throws Exception {
94 |
95 | options = new Options();
96 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
97 | options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required if no other args passed" );
98 | options.addOption( "p", "port", true, "Port for Solr, default=8983" );
99 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
100 | options.addOption( "f", "input_file", true, "File to read IDs from, one ID per line (skips 0 length lines, not counting newlines) (Use \"-\" for stdin / standard in)" );
101 | options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" );
102 | options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" );
103 |
104 | options.addOption( OptionBuilder.withLongOpt( "batch_size" )
105 | .withDescription( "Batch size, 1=doc-by-doc, 0=all-at-once (be careful memory-wise), default="+DEFAULT_BATCH_SIZE )
106 | .hasArg()
107 | .withType( Number.class ) // NOT Long.class
108 | .create( "b" )
109 | );
110 |
111 | options.addOption( OptionBuilder.withLongOpt( "ids" )
112 | .withDescription( "Pass one or more IDs on the command line" )
113 | .hasArgs() // PLURAL!
114 | .create( "i" )
115 | );
116 |
117 | if ( argv.length < 1 ) {
118 | helpAndExit( "Must specify at least url or host", 1 );
119 | }
120 | CommandLine cmd = null;
121 | try {
122 | CommandLineParser parser = new PosixParser();
123 | cmd = parser.parse( options, argv );
124 | }
125 | catch( ParseException exp ) {
126 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
127 | }
128 | String fullUrl = cmd.getOptionValue( "url" );
129 | String host = cmd.getOptionValue( "host" );
130 | String port = cmd.getOptionValue( "port" );
131 | String coll = cmd.getOptionValue( "collection" );
132 | if ( null==fullUrl && null==host ) {
133 | helpAndExit( "Must specify at least url or host (b)", 3 );
134 | }
135 | if ( null!=fullUrl && null!=host ) {
136 | helpAndExit( "Must not specify both url and host", 4 );
137 | }
138 | // Init
139 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
140 | HttpSolrServer solr;
141 | if ( null!=fullUrl ) {
142 | solr = SolrUtils.getServer( fullUrl );
143 | }
144 | else {
145 | // Utils handle null values
146 | solr = SolrUtils.getServer( host, port, coll );
147 | }
148 |
149 | int batchSize = DEFAULT_BATCH_SIZE;
150 | Long batchObj = (Long) cmd.getParsedOptionValue( "batch_size" );
151 | if ( null!=batchObj ) {
152 | if ( batchObj.longValue() < 0L ) {
153 | helpAndExit( "batch_size must be >= 0", 5 );
154 | }
155 | batchSize = batchObj.intValue();
156 | }
157 |
158 | String encodingStr = cmd.getOptionValue( "encoding" );
159 | // Didn't set encoding
160 | if ( null==encodingStr || encodingStr.trim().length()<1 ) {
161 | encodingStr = "UTF-8";
162 | }
163 | // Did set encoding
164 | else {
165 | // But didn't set input file
166 | if ( null == cmd.getOptionValue( "input_file" ) ) {
167 | helpAndExit( "Encoding only applicable when reading from input file or standard in / stdin; operating system handles command line argument encoding", 6 );
168 | }
169 | }
170 | boolean strictEncoding = true;
171 | if(cmd.hasOption("loose_encoding")) {
172 | strictEncoding = false;
173 | if ( null == cmd.getOptionValue( "input_file" ) ) {
174 | helpAndExit( "loose_encoding only applicable when reading from input file or standard in / stdin; operating system handles command line argument encoding", 7 );
175 | }
176 | }
177 | // Setup IO encoding
178 | Charset charset = Charset.forName( encodingStr );
179 | // Input uses Decoder
180 | CharsetDecoder decoder = charset.newDecoder();
181 | if ( strictEncoding ) {
182 | decoder.onMalformedInput( CodingErrorAction.REPORT );
183 | }
184 |
185 | String inputFile = cmd.getOptionValue( "input_file" );
186 |
187 | String [] cmdLineIds = cmd.getOptionValues( "ids" );
188 |
189 | if ( null==inputFile && null==cmdLineIds ) {
190 | helpAndExit( "Must use at least one of --input_file or --ids ..., OK to use both. For standard in / stdin use --input_file -", 8 );
191 | }
192 |
193 | // We use List instead of Set because that's what SolrJ expects in deleteById
194 | List<String> ids = new LinkedList<>();
195 | if ( null!=inputFile ) {
196 | ids = readIdsFromFile( inputFile, decoder );
197 | }
198 | if ( null!=cmdLineIds ) {
199 | ids.addAll( Arrays.asList( cmdLineIds ) );
200 | }
201 |
202 | if ( batchSize < 1 ) {
203 | solr.deleteById(ids);
204 | }
205 | else if ( batchSize == 1 ) {
206 | for ( String id : ids ) {
207 | solr.deleteById( id );
208 | }
209 | }
210 | else {
211 | for ( int start = 0; start < ids.size(); start += batchSize ) {
212 | int end = start + batchSize;
213 | if ( end > ids.size() ) {
214 | end = ids.size();
215 | }
216 | List<String> sublist = ids.subList( start, end );
217 | solr.deleteById( sublist );
218 | }
219 | }
220 | // Wait for disk commit and new searcher to fire up
221 | // TODO: maybe have other commit options, although this is probably the safest
222 | solr.commit( true, true );
223 |
224 | }
225 | }
--------------------------------------------------------------------------------
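For readers new to SolrJ, the delete-and-commit sequence that DeleteByIds wraps comes down to a few client calls. A minimal stand-alone sketch, assuming the same SolrJ 4.x HttpSolrServer API used above and a hypothetical core URL:

import java.util.Arrays;
import java.util.List;

import org.apache.solr.client.solrj.impl.HttpSolrServer;

public class DeleteByIdsSketch {
  public static void main( String[] args ) throws Exception {
    // Hypothetical core URL; point this at your own Solr instance
    HttpSolrServer solr = new HttpSolrServer( "http://localhost:8983/solr/collection1" );
    // One round trip for the whole batch, like the batchSize > 1 branch above (which slices the list via subList)
    List<String> ids = Arrays.asList( "1234", "5678" );
    solr.deleteById( ids );
    // Hard commit and wait for the new searcher, matching solr.commit( true, true ) in DeleteByIds
    solr.commit( true, true );
    solr.shutdown();
  }
}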
/src/main/java/com/lucidworks/dq/data/DocCount.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.data;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.text.MessageFormat;
6 | import java.text.NumberFormat;
7 | import java.util.Collection;
8 | import java.util.LinkedHashMap;
9 | import java.util.LinkedHashSet;
10 | import java.util.Map;
11 | import java.util.Map.Entry;
12 | import java.util.Set;
13 |
14 | import org.apache.commons.cli.CommandLine;
15 | import org.apache.commons.cli.CommandLineParser;
16 | import org.apache.commons.cli.HelpFormatter;
17 | import org.apache.commons.cli.Options;
18 | import org.apache.commons.cli.ParseException;
19 | import org.apache.commons.cli.PosixParser;
20 | import org.apache.solr.client.solrj.SolrServerException;
21 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
22 |
23 | import com.lucidworks.dq.util.HasDescription;
24 | import com.lucidworks.dq.util.SetUtils;
25 | import com.lucidworks.dq.util.SolrUtils;
26 |
27 | public class DocCount /*implements HasDescription*/ {
28 |
29 | static String HELP_WHAT_IS_IT = "Count of active documents in a collection to standard out / stdout.";
30 | static String HELP_USAGE = "DocCount -u http://localhost:8983 (output sent to stdout)";
31 |
32 | public static String getShortDescription() {
33 | return HELP_WHAT_IS_IT;
34 | }
35 |
36 | static Options options;
37 |
38 | HttpSolrServer solrServer;
39 |
40 | static void helpAndExit() {
41 | helpAndExit( null, 1 );
42 | }
43 | static void helpAndExit( String optionalError, int errorCode ) {
44 | HelpFormatter formatter = new HelpFormatter();
45 | if ( null==optionalError ) {
46 | System.err.println( HELP_WHAT_IS_IT );
47 | }
48 | else {
49 | // log.error( optionalError );
50 | System.err.println( optionalError );
51 | }
52 | // stdout
53 | //formatter.printHelp( HELP_USAGE, options, true );
54 | // stderr
55 | PrintWriter pw = new PrintWriter(System.err);
56 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
57 | pw.flush();
58 | System.exit( errorCode );
59 | }
60 |
61 | public static void main( String [] argv ) throws Exception {
62 |
63 | options = new Options();
64 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
65 | options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required if no other args passed" );
66 | options.addOption( "p", "port", true, "Port for Solr, default=8983" );
67 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
68 | if ( argv.length < 1 ) {
69 | helpAndExit( "Must specify at least url or host", 1 );
70 | }
71 | CommandLine cmd = null;
72 | try {
73 | CommandLineParser parser = new PosixParser();
74 | cmd = parser.parse( options, argv );
75 | }
76 | catch( ParseException exp ) {
77 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
78 | }
79 | String fullUrl = cmd.getOptionValue( "url" );
80 | String host = cmd.getOptionValue( "host" );
81 | String port = cmd.getOptionValue( "port" );
82 | String coll = cmd.getOptionValue( "collection" );
83 | if ( null==fullUrl && null==host ) {
84 | helpAndExit( "Must specify at least url or host (b)", 3 );
85 | }
86 | if ( null!=fullUrl && null!=host ) {
87 | helpAndExit( "Must not specify both url and host", 4 );
88 | }
89 | // Init
90 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
91 | HttpSolrServer solr;
92 | if ( null!=fullUrl ) {
93 | solr = SolrUtils.getServer( fullUrl );
94 | }
95 | else {
96 | // Utils handle null values
97 | solr = SolrUtils.getServer( host, port, coll );
98 | }
99 |
100 | long count = SolrUtils.getTotalDocCount( solr );
101 | System.out.println( count );
102 |
103 | }
104 | }
--------------------------------------------------------------------------------
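Because DocCount prints nothing but the bare number, it is convenient in scripts: for example, capture the count before and after a DeleteByIds or re-indexing run and compare the two values to confirm how many documents actually changed.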
/src/main/java/com/lucidworks/dq/data/DumpIds.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.data;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.text.MessageFormat;
6 | import java.text.NumberFormat;
7 | import java.util.Collection;
8 | import java.util.LinkedHashMap;
9 | import java.util.LinkedHashSet;
10 | import java.util.Map;
11 | import java.util.Map.Entry;
12 | import java.util.Set;
13 |
14 | import org.apache.commons.cli.CommandLine;
15 | import org.apache.commons.cli.CommandLineParser;
16 | import org.apache.commons.cli.HelpFormatter;
17 | import org.apache.commons.cli.Options;
18 | import org.apache.commons.cli.ParseException;
19 | import org.apache.commons.cli.PosixParser;
20 | import org.apache.solr.client.solrj.SolrServerException;
21 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
22 |
23 | import com.lucidworks.dq.util.HasDescription;
24 | import com.lucidworks.dq.util.SetUtils;
25 | import com.lucidworks.dq.util.SolrUtils;
26 |
27 | public class DumpIds /*implements HasDescription*/ {
28 |
29 | static String HELP_WHAT_IS_IT = "Dump all the IDs from a collection to standard out / stdout.";
30 | static String HELP_USAGE = "DumpIds -u http://localhost:8983 (output sent to stdout)";
31 | // final static Logger log = LoggerFactory.getLogger( FieldStats.class );
32 |
33 | public static String getShortDescription() {
34 | return HELP_WHAT_IS_IT;
35 | }
36 |
37 | static Options options;
38 |
39 | HttpSolrServer solrServer;
40 |
41 | // TODO: refactor to allow options to be settable after constructor is run
42 | public DumpIds( HttpSolrServer server ) throws SolrServerException {
43 | this.solrServer = server;
44 | }
45 | public HttpSolrServer getSolrServer() {
46 | return this.solrServer;
47 | }
48 |
49 | void dumpIds() throws SolrServerException {
50 | Set<String> ids = SolrUtils.getAllIds( getSolrServer() );
51 | for ( String id : ids ) {
52 | System.out.println( id );
53 | }
54 | }
55 |
56 | static void helpAndExit() {
57 | helpAndExit( null, 1 );
58 | }
59 | static void helpAndExit( String optionalError, int errorCode ) {
60 | HelpFormatter formatter = new HelpFormatter();
61 | if ( null==optionalError ) {
62 | // log.info( HELP_WHAT_IS_IT );
63 | System.out.println( HELP_WHAT_IS_IT );
64 | }
65 | else {
66 | // log.error( optionalError );
67 | System.err.println( optionalError );
68 | }
69 | formatter.printHelp( HELP_USAGE, options, true );
70 | System.exit( errorCode );
71 | }
72 |
73 | public static void main( String [] argv ) throws Exception {
74 |
75 | options = new Options();
76 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
77 | options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required if no other args passed" );
78 | options.addOption( "p", "port", true, "Port for Solr, default=8983" );
79 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
80 | if ( argv.length < 1 ) {
81 | helpAndExit( "Must specify at least url or host", 1 );
82 | }
83 | CommandLine cmd = null;
84 | try {
85 | CommandLineParser parser = new PosixParser();
86 | cmd = parser.parse( options, argv );
87 | }
88 | catch( ParseException exp ) {
89 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
90 | }
91 | String fullUrl = cmd.getOptionValue( "url" );
92 | String host = cmd.getOptionValue( "host" );
93 | String port = cmd.getOptionValue( "port" );
94 | String coll = cmd.getOptionValue( "collection" );
95 | if ( null==fullUrl && null==host ) {
96 | helpAndExit( "Must specify at least url or host (b)", 3 );
97 | }
98 | if ( null!=fullUrl && null!=host ) {
99 | helpAndExit( "Must not specify both url and host", 4 );
100 | }
101 | // Init
102 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
103 | HttpSolrServer solr;
104 | if ( null!=fullUrl ) {
105 | solr = SolrUtils.getServer( fullUrl );
106 | }
107 | else {
108 | // Utils handle null values
109 | solr = SolrUtils.getServer( host, port, coll );
110 | }
111 |
112 | // System.out.println( "Solr = " + solr.getBaseURL() );
113 | // EmptyFieldStats fs = new EmptyFieldStats( solr );
114 | DumpIds di = new DumpIds( solr );
115 | di.dumpIds();
116 |
117 | }
118 | }
--------------------------------------------------------------------------------
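Usage note: DumpIds writes one ID per line to stdout, and DeleteByIds (above) accepts --input_file - to read IDs from stdin, so the two can be chained to pull the ID list out of one core and delete those IDs from another. The exact command depends on how the shaded jar and CmdLineLauncher are invoked (see the build note after pom.xml), but conceptually it is: DumpIds -u <source-core-url> | DeleteByIds -u <target-core-url> --input_file -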
/src/main/java/com/lucidworks/dq/diff/DiffEmptyFieldStats.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.diff;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.text.MessageFormat;
6 | import java.text.NumberFormat;
7 | import java.util.LinkedHashSet;
8 | import java.util.Set;
9 |
10 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
11 |
12 | import com.lucidworks.dq.data.EmptyFieldStats;
13 | import com.lucidworks.dq.schema.Schema;
14 | import com.lucidworks.dq.schema.SchemaFromRest;
15 | import com.lucidworks.dq.schema.SchemaFromXml;
16 | import com.lucidworks.dq.util.HasDescription;
17 | import com.lucidworks.dq.util.SetUtils;
18 | import com.lucidworks.dq.util.SolrUtils;
19 |
20 | import org.apache.commons.cli.CommandLine;
21 | import org.apache.commons.cli.CommandLineParser;
22 | import org.apache.commons.cli.HelpFormatter;
23 | import org.apache.commons.cli.Options;
24 | import org.apache.commons.cli.ParseException;
25 | import org.apache.commons.cli.PosixParser;
26 |
27 | public class DiffEmptyFieldStats /*implements HasDescription*/ {
28 | static String HELP_WHAT_IS_IT = "Compare fields that aren't fully populated between two cores/collections.";
29 | static String HELP_USAGE = "DiffEmptyFieldStats";
30 | // final static Logger log = LoggerFactory.getLogger( TermStats.class );
31 |
32 | public static String getShortDescription() {
33 | return HELP_WHAT_IS_IT;
34 | }
35 |
36 | static Options options;
37 |
38 | public static String generateReport( EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB, String labelA, String labelB ) throws Exception {
39 | StringWriter sw = new StringWriter();
40 | PrintWriter out = new PrintWriter(sw);
41 |
42 | out.println( "========== Differences Report ==========" );
43 | out.println( "Schema A = " + labelA );
44 | out.println( "Schema B = " + labelB );
45 |
46 | out.println();
47 | addSimpleStatToReport( out, "A: Total Active Docs", fieldStatsA.getTotalDocCount() );
48 | addSimpleStatToReport( out, "B: Total Active Docs", fieldStatsB.getTotalDocCount() );
49 |
50 | out.println();
51 | Set<String> fieldsA = fieldStatsA.getAllFieldNames();
52 | Set<String> fieldsB = fieldStatsB.getAllFieldNames();
53 | addSetComparisonToReport( out, fieldsA, fieldsB, "All Fields" );
54 |
55 | out.println();
56 | addAllFieldStatsToReport( out, fieldStatsA, fieldStatsB );
57 |
58 |
59 | // // Simple Values
60 | // // -------------
61 | // // Name
62 | // String nameA = schemaA.getSchemaName();
63 | // String nameB = schemaB.getSchemaName();
64 | // addStringComparisionToReport( out, nameA, nameB, "Schema Name" );
65 | // // Version
66 | // float versA = schemaA.getSchemaVersion();
67 | // float versB = schemaB.getSchemaVersion();
68 | // out.print( "Schema Version: " );
69 | // if ( versA == versB ) {
70 | // out.println( "Both = '" + versA + "'" );
71 | // }
72 | // else {
73 | // out.println( "\tA = '" + versA + "'" );
74 | // out.println( "\tB = '" + versB + "'" );
75 | // }
76 |
77 | // // Complex Values
78 | // // --------------
79 | // // Fields
80 | // Set fieldsA = schemaA.getAllSchemaFieldNames();
81 | // Set fieldsB = schemaB.getAllSchemaFieldNames();
82 | // addSetComparisonToReport( out, fieldsA, fieldsB, "Fields" );
83 | // // Dynamic Field Patterns
84 | // // TODO: Verify that order is being preserved through the entire process
85 | // Set patternsA = schemaA.getAllDynamicFieldPatterns();
86 | // Set patternsB = schemaB.getAllDynamicFieldPatterns();
87 | // addSetComparisonToReport( out, patternsA, patternsB, "Dynamic-Field Patterns", true );
88 |
89 | String outStr = sw.toString();
90 | return outStr;
91 | }
92 |
93 | static void addAllFieldStatsToReport( PrintWriter out, EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB ) {
94 | Set<String> fieldsA = fieldStatsA.getAllFieldNames();
95 | Set<String> fieldsB = fieldStatsB.getAllFieldNames();
96 | Set<String> allFields = SetUtils.union_nonDestructive( fieldsA, fieldsB );
97 |
98 | // Fully Populated
99 | Set<String> fullFieldsA = fieldStatsA.getFullyPopulatedIndexedFields();
100 | Set<String> fullFieldsB = fieldStatsB.getFullyPopulatedIndexedFields();
101 | // Subset
102 | Set<String> fullFieldsBoth = SetUtils.intersection_nonDestructive( fullFieldsA, fullFieldsB );
103 |
104 | // Empty
105 | Set<String> emptyFieldsA = fieldStatsA.getFieldsWithNoIndexedValues();
106 | Set<String> emptyFieldsB = fieldStatsB.getFieldsWithNoIndexedValues();
107 | // Subset
108 | Set<String> emptyFieldsBoth = SetUtils.intersection_nonDestructive( emptyFieldsA, emptyFieldsB );
109 |
110 | // All Other Fields
111 | // We can only summarize the subsets of completely full and completely empty fields in both collections
112 | // All other fields need to be listed in the detailed report
113 | Set<String> detailFields = new LinkedHashSet<>();
114 | detailFields.addAll( allFields );
115 | detailFields.removeAll( fullFieldsBoth );
116 | detailFields.removeAll( emptyFieldsBoth );
117 |
118 | out.println( "Populated at 100% in Both A and B: " + fullFieldsBoth );
119 | out.println();
120 | out.println( "No Indexed Values / 0% in Both A and B: " + emptyFieldsBoth );
121 | out.println();
122 |
123 | out.println( "Partially Populated Fields and Percentages, A / B:" );
124 | for ( String name : detailFields ) {
125 | Long countA = null;
126 | if ( fieldStatsA.getIndexedValueCounts().containsKey(name) ) {
127 | countA = fieldStatsA.getIndexedValueCounts().get(name);
128 | }
129 | Double percentA = null;
130 | if ( fieldStatsA.getIndexedValuePercentages().containsKey(name) ) {
131 | percentA = fieldStatsA.getIndexedValuePercentages().get( name );
132 | }
133 | Long countB = null;
134 | if ( fieldStatsB.getIndexedValueCounts().containsKey(name) ) {
135 | countB = fieldStatsB.getIndexedValueCounts().get(name);
136 | }
137 | Double percentB = null;
138 | if ( fieldStatsB.getIndexedValuePercentages().containsKey(name) ) {
139 | percentB = fieldStatsB.getIndexedValuePercentages().get( name );
140 | }
141 | addStatsPairAndPercentToReport( out, name, countA, countB, percentA, percentB, "\t" );
142 | }
143 | }
144 |
145 | static void addSimpleStatToReport( PrintWriter out, String label, long stat ) {
146 | String statStr = NumberFormat.getNumberInstance().format( stat );
147 | out.println( "" + label + ": " + statStr );
148 | }
149 |
150 | static void addStringComparisionToReport( PrintWriter out, String thingA, String thingB, String attrLabel ) {
151 | out.print( attrLabel + ":" );
152 | if ( null!=thingA && null!=thingB && thingA.equals(thingB) ) {
153 | out.println( " Both = '" + thingA + "'" );
154 | }
155 | else {
156 | out.println();
157 | out.println( "\tA = '" + thingA + "'" );
158 | out.println( "\tB = '" + thingB + "'" );
159 | }
160 | }
161 |
162 | static void addStatsPairAndPercentToReport( PrintWriter out, String label, Long statA, Long statB, Double percA, Double percB, String optIndent ) {
163 | if ( null!=optIndent ) {
164 | out.print( optIndent );
165 | }
166 | String statStrA = null!=statA ? NumberFormat.getNumberInstance().format( statA ) : "(not in A)";
167 | String statStrB = null!=statB ? NumberFormat.getNumberInstance().format( statB ) : "(not in B)";
168 | String percStrA = null!=percA ? " (" + MessageFormat.format( "{0,number,#.##%}" + ")", percA ) : "";
169 | String percStrB = null!=percB ? " (" + MessageFormat.format( "{0,number,#.##%}" + ")", percB ) : "";
170 | out.println( "" + label + ": " + statStrA + percStrA + " / " + statStrB + percStrB );
171 | }
172 |
173 |
174 | static void addSetComparisonToReport( PrintWriter out, Set<String> setA, Set<String> setB, String attrLabel ) {
175 | addSetComparisonToReport( out, setA, setB, attrLabel, false );
176 | }
177 | static void addSetComparisonToReport( PrintWriter out, Set<String> setA, Set<String> setB, String attrLabel, boolean checkOrder ) {
178 | Set<String> inBoth = SetUtils.intersection_nonDestructive( setA, setB );
179 | Set<String> inAOnly = SetUtils.inAOnly_nonDestructive( setA, setB );
180 | Set<String> inBOnly = SetUtils.inBOnly_nonDestructive( setA, setB );
181 | out.println();
182 | out.print( attrLabel + ":" );
183 | if ( inBoth.isEmpty() && inAOnly.isEmpty() && inBOnly.isEmpty() ) {
184 | out.println( " None!" );
185 | }
186 | else {
187 | out.println();
188 | if ( ! inBoth.isEmpty() ) {
189 | if ( ! checkOrder ) {
190 | out.println( "\tIn both = '" + inBoth + "'" );
191 | }
192 | else {
193 | // Note: Sets don't normally preserve order but I've been careful
194 | // to use LinkedHashSet and LinkedHashMap, which DO
195 | Set<String> commonA = SetUtils.intersection_nonDestructive( setA, setB );
196 | Set<String> commonB = SetUtils.intersection_nonDestructive( setB, setA );
197 | boolean inSameOrder = SetUtils.sameAndInSameOrder( commonA, commonB );
198 | if ( inSameOrder ) {
199 | out.println( "\tIn both and SAME relative order = '" + inBoth + "'" );
200 | }
201 | else {
202 | out.println( "\tIn both but DIFFERENT relative order:" );
203 | out.println( "\t\tCommon, order in A = '" + commonA + "'" );
204 | out.println( "\t\tCommon, order in B = '" + commonB + "'" );
205 | }
206 | }
207 | }
208 | if ( ! inAOnly.isEmpty() ) {
209 | out.println( "\tA only = '" + inAOnly + "'" );
210 | }
211 | if ( ! inBOnly.isEmpty() ) {
212 | out.println( "\tB only = '" + inBOnly + "'" );
213 | }
214 | }
215 | }
216 |
217 | static void helpAndExit() {
218 | helpAndExit( null, 1 );
219 | }
220 | static void helpAndExit( String optionalError, int errorCode ) {
221 | HelpFormatter formatter = new HelpFormatter();
222 | if ( null==optionalError ) {
223 | // log.info( HELP_WHAT_IS_IT );
224 | System.out.println( HELP_WHAT_IS_IT );
225 | }
226 | else {
227 | // log.error( optionalError );
228 | System.err.println( optionalError );
229 | }
230 | formatter.printHelp( HELP_USAGE, options, true );
231 | System.exit( errorCode );
232 | }
233 |
234 | public static void main( String[] argv ) throws Exception {
235 | options = new Options();
236 | options.addOption( "u", "url_a", true, "URL for first Solr, OR set host, port and possibly collection" );
237 | options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" );
238 | options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" );
239 | options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" );
240 | options.addOption( "U", "url_b", true, "URL for second Solr, OR set host, port and possibly collection" );
241 | options.addOption( "H", "host_b", true, "IP address for second Solr, default=localhost" );
242 | options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" );
243 | options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" );
244 |
245 | if ( argv.length < 1 ) {
246 | helpAndExit();
247 | }
248 | CommandLine cmd = null;
249 | try {
250 | CommandLineParser parser = new PosixParser();
251 | // CommandLineParser parser = new DefaultParser();
252 | cmd = parser.parse( options, argv );
253 | }
254 | catch( ParseException exp ) {
255 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
256 | }
257 | // Already using -h for host, don't really need help, just run with no options
258 | //if ( cmd.hasOption("help") ) {
259 | // helpAndExit();
260 | //}
261 |
262 | String fullUrlA = cmd.getOptionValue( "url_a" );
263 | String hostA = cmd.getOptionValue( "host_a" );
264 | String portA = cmd.getOptionValue( "port_a" );
265 | String collA = cmd.getOptionValue( "collection_a" );
266 | if ( null==fullUrlA && null==hostA ) {
267 | helpAndExit( "Must specify at least url or host for first Solr", 3 );
268 | }
269 | if ( null!=fullUrlA && null!=hostA ) {
270 | helpAndExit( "Must not specify both url and host for first Solr", 4 );
271 | }
272 |
273 | String fullUrlB = cmd.getOptionValue( "url_b" );
274 | String hostB = cmd.getOptionValue( "host_b" );
275 | String portB = cmd.getOptionValue( "port_b" );
276 | String collB = cmd.getOptionValue( "collection_b" );
277 | if ( null==fullUrlB && null==hostB ) {
278 | helpAndExit( "Must specify at least url or host for second Solr", 3 );
279 | }
280 | if ( null!=fullUrlB && null!=hostB ) {
281 | helpAndExit( "Must not specify both url and host for second Solr", 4 );
282 | }
283 |
284 | // Init
285 | // HttpSolrServer solrA = SolrUtils.getServer( HOST1, PORT1, COLL1 );
286 | HttpSolrServer solrA;
287 | if ( null!=fullUrlA ) {
288 | solrA = SolrUtils.getServer( fullUrlA );
289 | }
290 | else {
291 | // Utils handle null values
292 | solrA = SolrUtils.getServer( hostA, portA, collA );
293 | }
294 | System.out.println( "First Solr / Solr A = " + solrA.getBaseURL() );
295 | // HttpSolrServer solrB = SolrUtils.getServer( HOST2, PORT2, COLL2 );
296 | HttpSolrServer solrB;
297 | if ( null!=fullUrlB ) {
298 | solrB = SolrUtils.getServer( fullUrlB );
299 | }
300 | else {
301 | // Utils handle null values
302 | solrB = SolrUtils.getServer( hostB, portB, collB );
303 | }
304 | System.out.println( "Second Solr / Solr B = " + solrB.getBaseURL() );
305 |
306 | String labelA = solrA.getBaseURL();
307 | EmptyFieldStats fieldsStatsA = new EmptyFieldStats( solrA );
308 | String reportA = fieldsStatsA.generateReport( labelA );
309 |
310 | String labelB = solrB.getBaseURL();
311 | EmptyFieldStats fieldsStatsB = new EmptyFieldStats( solrB );
312 | String reportB = fieldsStatsB.generateReport( labelB );
313 |
314 | System.out.println( "========== Individual Reports ==========" );
315 | System.out.println();
316 | System.out.println( "---------- A: " + labelA + " ----------" );
317 | System.out.println( reportA );
318 | System.out.println( "---------- B: " + labelB + " ----------" );
319 | System.out.println( reportB );
320 |
321 | String report = generateReport( fieldsStatsA, fieldsStatsB, labelA, labelB );
322 | System.out.println( report );
323 | }
324 |
325 |
326 | static String HOST0 = "localhost";
327 | static String PORT0 = "8983";
328 | static String COLL0 = "demo_shard1_replica1";
329 | static String URL0 = "http://" + HOST0 + ":" + PORT0 + "/solr/" + COLL0;
330 | // + "/select?q=*:*&rows=" + ROWS + "&fl=id&wt=json&indent=on"
331 |
332 | static String HOST1 = "localhost";
333 | static String PORT1 = "8984"; // "8983";
334 | static String COLL1 = "collection1";
335 | static String URL1 = "http://" + HOST1 + ":" + PORT1 + "/solr/" + COLL1;
336 |
337 | static String HOST2 = "localhost";
338 | static String PORT2 = "8985"; // "8983";
339 | static String COLL2 = "collection1";
340 | static String URL2 = "http://" + HOST1 + ":" + PORT2 + "/solr/" + COLL2;
341 |
342 | }
--------------------------------------------------------------------------------
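The report methods above lean on SetUtils, whose source is not included in this snapshot. A minimal sketch of the set algebra those calls are used for, assuming the *_nonDestructive helpers behave like ordinary intersection and difference on copies (which is how the report code treats them):

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

public class SetComparisonSketch {
  public static void main( String[] args ) {
    // Hypothetical field-name sets from two collections
    Set<String> fieldsA = new LinkedHashSet<>( Arrays.asList( "id", "title", "body" ) );
    Set<String> fieldsB = new LinkedHashSet<>( Arrays.asList( "id", "title", "price" ) );

    // Equivalent of SetUtils.intersection_nonDestructive( fieldsA, fieldsB )
    Set<String> inBoth = new LinkedHashSet<>( fieldsA );
    inBoth.retainAll( fieldsB );   // [id, title]

    // Equivalent of SetUtils.inAOnly_nonDestructive( fieldsA, fieldsB )
    Set<String> aOnly = new LinkedHashSet<>( fieldsA );
    aOnly.removeAll( fieldsB );    // [body]

    // Equivalent of SetUtils.inBOnly_nonDestructive( fieldsA, fieldsB )
    Set<String> bOnly = new LinkedHashSet<>( fieldsB );
    bOnly.removeAll( fieldsA );    // [price]

    System.out.println( inBoth + " / " + aOnly + " / " + bOnly );
  }
}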
/src/main/java/com/lucidworks/dq/diff/DiffIds.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.diff;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.FileOutputStream;
7 | import java.io.IOException;
8 | import java.io.InputStreamReader;
9 | import java.io.OutputStreamWriter;
10 | import java.io.PrintStream;
11 | import java.io.PrintWriter;
12 | import java.nio.charset.Charset;
13 | import java.nio.charset.CharsetDecoder;
14 | import java.nio.charset.CharsetEncoder;
15 | import java.nio.charset.CodingErrorAction;
16 | import java.util.LinkedHashSet;
17 | import java.util.Set;
18 |
19 | import org.apache.commons.cli.CommandLine;
20 | import org.apache.commons.cli.CommandLineParser;
21 | import org.apache.commons.cli.HelpFormatter;
22 | import org.apache.commons.cli.Options;
23 | import org.apache.commons.cli.ParseException;
24 | import org.apache.commons.cli.PosixParser;
25 | import org.apache.solr.client.solrj.SolrQuery;
26 | import org.apache.solr.client.solrj.SolrServerException;
27 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
28 | import org.apache.solr.client.solrj.response.QueryResponse;
29 | import org.apache.solr.common.SolrDocument;
30 |
31 | import com.lucidworks.dq.util.HasDescription;
32 | import com.lucidworks.dq.util.SetUtils;
33 | import com.lucidworks.dq.util.SolrUtils;
34 |
35 | public class DiffIds /*implements HasDescription*/ {
36 | static String HELP_WHAT_IS_IT = "Compare IDs between two cores/collections.";
37 | static String HELP_USAGE = "DiffIds";
38 | // final static Logger log = LoggerFactory.getLogger( TermStats.class );
39 |
40 | static String MODE_REPORT = "full_report";
41 | static String MODE_A_ONLY = "a_only";
42 | static String MODE_B_ONLY = "b_only";
43 | static String MODE_INTERSECT = "intersect";
44 | static String MODE_UNION = "union";
45 | static String DEFAULT_MODE = MODE_REPORT;
46 | static Set<String> VALID_MODES = new LinkedHashSet<String>() {{
47 | add( MODE_REPORT );
48 | add( MODE_A_ONLY );
49 | add( MODE_B_ONLY );
50 | add( MODE_INTERSECT );
51 | add( MODE_UNION );
52 | }};
53 |
54 | public static String getShortDescription() {
55 | return HELP_WHAT_IS_IT;
56 | }
57 |
58 | public static String NL = System.getProperty("line.separator");
59 |
60 | // command line options
61 | static Options options;
62 |
63 | static Set<String> readIdsFromFile( File targetFile, CharsetDecoder deccoder ) throws IOException {
64 | Set<String> ids = new LinkedHashSet<>();
65 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder));
66 | String line;
67 | while ((line = in.readLine()) != null) {
68 | // skip completely blank lines, but doesn't do any trimming
69 | if ( line.length()<1 ) {
70 | continue;
71 | }
72 | ids.add( line );
73 | }
74 | in.close();
75 | return ids;
76 | }
77 |
78 | static void helpAndExit() {
79 | helpAndExit( null, 1 );
80 | }
81 | static void helpAndExit( String optionalError, int errorCode ) {
82 | HelpFormatter formatter = new HelpFormatter();
83 | if ( null==optionalError ) {
84 | // log.info( HELP_WHAT_IS_IT );
85 | System.err.println( HELP_WHAT_IS_IT );
86 | }
87 | else {
88 | // log.error( optionalError );
89 | System.err.println( optionalError );
90 | }
91 | // stdout
92 | //formatter.printHelp( HELP_USAGE, options, true );
93 | // stderr
94 | PrintWriter pw = new PrintWriter(System.err);
95 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
96 | pw.flush();
97 | System.exit( errorCode );
98 | }
99 |
100 | public static void main( String[] argv ) throws SolrServerException, IOException {
101 |
102 | options = new Options();
103 | options.addOption( "u", "url_a", true, "URL for first Solr, Eg http://localhost:8983/solr/collection1, OR set host, port and possibly collection" );
104 | options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" );
105 | options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" );
106 | options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" );
107 |
108 | options.addOption( "U", "url_b", true, "URL for second Solr, Eg http://localhost:8983/solr/collection2, OR set host, port and possibly collection" );
109 | options.addOption( "H", "host_b", true, "IP address for second Solr, default=localhost" );
110 | options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" );
111 | options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" );
112 |
113 | options.addOption( "f", "file_a", true, "Read IDs for A from a text file, one ID per line (skips 0 length lines, not counting newlines)" );
114 | options.addOption( "F", "file_b", true, "Read IDs for B from a text file, one ID per line (skips 0 length lines, not counting newlines)" );
115 |
116 | options.addOption( "o", "output_file", true, "Output file to create for the full report or ID list (default or \"-\" is stdout / standard out)" );
117 | options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" );
118 | options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" );
119 |
120 | options.addOption( "m", "mode", true,
121 | "What to output:"
122 | + " \"" + MODE_REPORT + "\" means fully formatted report (default)"
123 | + ", \"" + MODE_A_ONLY + "\" bare list of IDs only in A (one per line)"
124 | + ", \"" + MODE_B_ONLY + "\" IDs only in B"
125 | + ", \"" + MODE_INTERSECT + "\" IDs present in BOTH A AND B"
126 | + ", \"" + MODE_UNION + "\" IDs in A or B or in both (combines all IDs from both, but each ID will only appear once)"
127 | );
128 | if ( argv.length < 1 ) {
129 | helpAndExit();
130 | }
131 | CommandLine cmd = null;
132 | try {
133 | CommandLineParser parser = new PosixParser();
134 | // CommandLineParser parser = new DefaultParser();
135 | cmd = parser.parse( options, argv );
136 | }
137 | catch( ParseException exp ) {
138 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
139 | }
140 | // Already using -h for host, don't really need help, just run with no options
141 | //if ( cmd.hasOption("help") ) {
142 | // helpAndExit();
143 | //}
144 |
145 | String fullUrlA = cmd.getOptionValue( "url_a" );
146 | String hostA = cmd.getOptionValue( "host_a" );
147 | String portA = cmd.getOptionValue( "port_a" );
148 | String collA = cmd.getOptionValue( "collection_a" );
149 | String fileA = cmd.getOptionValue( "file_a" );
150 | int optsA = 0;
151 | optsA += (null!=fullUrlA) ? 1 : 0;
152 | optsA += (null!=hostA) ? 1 : 0;
153 | optsA += (null!=fileA) ? 1 : 0;
154 | if ( optsA < 1 ) {
155 | helpAndExit( "Must specify at least url or host or ids file for first Solr instance", 3 );
156 | }
157 | if ( optsA > 1 ) {
158 | helpAndExit( "Can only specify one of url, host or ids file for first Solr instance", 4 );
159 | }
160 |
161 | String fullUrlB = cmd.getOptionValue( "url_b" );
162 | String hostB = cmd.getOptionValue( "host_b" );
163 | String portB = cmd.getOptionValue( "port_b" );
164 | String collB = cmd.getOptionValue( "collection_b" );
165 | String fileB = cmd.getOptionValue( "file_b" );
166 | int optsB = 0;
167 | optsB += (null!=fullUrlB) ? 1 : 0;
168 | optsB += (null!=hostB) ? 1 : 0;
169 | optsB += (null!=fileB) ? 1 : 0;
170 | if ( optsB < 1 ) {
171 | helpAndExit( "Must specify at least url or host or ids file for second Solr instance", 3 );
172 | }
173 | if ( optsB > 1 ) {
174 | helpAndExit( "Can only specify one of url, host or ids file for second Solr instance", 4 );
175 | }
176 |
177 | // VALID_MODES
178 | String mode = cmd.getOptionValue( "mode" );
179 | if ( null!=mode ) {
180 | mode = mode.toLowerCase().trim();
181 | if ( ! VALID_MODES.contains(mode) ) {
182 | helpAndExit( "Invalid mode, must be one of: " + VALID_MODES, 5 );
183 | }
184 | }
185 | boolean isNormalReport = (null==mode) || mode.equals( MODE_REPORT );
186 |
187 | // File IO
188 | String outputFile = cmd.getOptionValue( "output_file" );
189 | String encodingStr = cmd.getOptionValue( "encoding" );
190 | if ( null==encodingStr || encodingStr.trim().length()<1 ) {
191 | encodingStr = "UTF-8";
192 | }
193 | boolean strictEncoding = true;
194 | if(cmd.hasOption("loose_encoding")) {
195 | strictEncoding = false;
196 | }
197 |
198 | // Setup IO encoding
199 | Charset charset = Charset.forName( encodingStr );
200 | // Input uses Decoder
201 | CharsetDecoder decoder = charset.newDecoder();
202 | // Output uses Encoder
203 | CharsetEncoder encoder = charset.newEncoder();
204 | if ( strictEncoding ) {
205 | decoder.onMalformedInput( CodingErrorAction.REPORT );
206 | encoder.onMalformedInput( CodingErrorAction.REPORT );
207 | }
208 |
209 | PrintWriter out = null;
210 | if( null!=outputFile && ! outputFile.equals("-") ) {
211 | out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoder), true);
212 | } else {
213 | out = new PrintWriter(new OutputStreamWriter(System.out, encoder), true);
214 | }
215 |
216 | // Init
217 | // HttpSolrServer solrA = new HttpSolrServer( URL1 );
218 | HttpSolrServer solrA = null;
219 | if ( null==fileA ) {
220 | if ( null!=fullUrlA ) {
221 | solrA = SolrUtils.getServer( fullUrlA );
222 | }
223 | else {
224 | // Utils handle null values
225 | solrA = SolrUtils.getServer( hostA, portA, collA );
226 | }
227 | if(isNormalReport) out.println( "First Solr / Solr A = " + solrA.getBaseURL() );
228 | }
229 | else {
230 | if(isNormalReport) out.println( "First Solr / Solr A read from file = " + fileA );
231 | }
232 |
233 | // HttpSolrServer solrB = new HttpSolrServer( URL2 );
234 | HttpSolrServer solrB = null;
235 | if ( null==fileB ) {
236 | if ( null!=fullUrlB ) {
237 | solrB = SolrUtils.getServer( fullUrlB );
238 | }
239 | else {
240 | // Utils handle null values
241 | solrB = SolrUtils.getServer( hostB, portB, collB );
242 | }
243 | if(isNormalReport) out.println( "Second Solr / Solr B = " + solrB.getBaseURL() );
244 | }
245 | else {
246 | if(isNormalReport) out.println( "Second Solr / Solr B read from file = " + fileB );
247 | }
248 |
249 | Set<String> idsA = (null!=solrA) ? SolrUtils.getAllIds( solrA ) : readIdsFromFile( new File(fileA), decoder );
250 | Set<String> idsB = (null!=solrB) ? SolrUtils.getAllIds( solrB ) : readIdsFromFile( new File(fileB), decoder );
251 |
252 | if ( isNormalReport ) {
253 | // Use non-destructive here since we use the lists more than once
254 | Set<String> aOnly = SetUtils.inAOnly_nonDestructive(idsA, idsB);
255 | Set<String> bOnly = SetUtils.inBOnly_nonDestructive(idsA, idsB);
256 | out.println( "A-only: " + aOnly );
257 | out.println( "B-only: " + bOnly );
258 | }
259 | else {
260 | Set<String> ids = null;
261 | if ( mode.equals(MODE_A_ONLY) ) {
262 | // destructive OK here since we're just doing 1 calculation
263 | ids = SetUtils.inAOnly_destructive( idsA, idsB );
264 | }
265 | else if ( mode.equals(MODE_B_ONLY) ) {
266 | ids = SetUtils.inBOnly_destructive( idsA, idsB );
267 | }
268 | else if ( mode.equals(MODE_INTERSECT) ) {
269 | ids = SetUtils.intersection_destructive( idsA, idsB );
270 | }
271 | else if ( mode.equals(MODE_UNION) ) {
272 | ids = SetUtils.union_destructive( idsA, idsB );
273 | }
274 | else {
275 | // This should never happen.
276 | // If it ever does, maybe somebody added to VALID_MODES but didn't add a case here
277 | throw new IllegalStateException( "Unknown mode \"" + mode + "\", check VALID_MODES" );
278 | }
279 |
280 | // Print the results
281 | for ( String id : ids ) {
282 | out.println( id );
283 | }
284 | }
285 | out.close();
286 | }
287 |
288 | }
289 |
--------------------------------------------------------------------------------
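The --mode option controls whether DiffIds emits the formatted report or a bare ID list that other tools can consume. For example, "-m a_only -o a_only_ids.txt" would write only the IDs present in the first core but missing from the second, one per line, ready to feed into DeleteByIds or a re-indexing script; b_only, intersect and union behave the same way for their respective set operations.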
/src/main/java/com/lucidworks/dq/logs/LogEntry.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.Collection;
4 |
5 | /*
6 | * Log entries can have structure.
7 | * Sometimes the structure isn't known when log entries are first ingested, they may come in as raw strings.
8 | * The idea is that a log entry could be fed into a process and then a more specific log entry comes out.
9 | * This process could be repeated for even more specific or normalized entries.
10 | * Ideally more evolved log entries can have the option of still referring back to their parent entries
11 | * for auditing or so that rules can be rerun.
12 | * Another issue is that some series of lines in a log file constitute a higher level log entry.
13 | * Some of the structure might be fixed text, whereas other items might be parameterizable.
14 | * Eg:
15 | * &name=dave
16 | * &name=mark
17 | * &name=satish
18 | * -> "name" is a fixed identifier, whereas values can vary.
19 | *
20 | * My post on Stack Overflow:
21 | * http://stackoverflow.com/questions/26518770/advanced-requirements-for-log-file-utilities-am-i-reinventing-the-wheel
22 | */
23 | interface LogEntry {
24 |
25 | String getRawText();
26 |
27 | Collection<LogEntryReference> getReferences();
28 | // TODO: should setters be defined in Interface?
29 | // void addReference( LogEntryReference ref );
30 |
31 | // getDate
32 | // getPath
33 | // getHandler
34 | // getParamsString
35 | // getParent
36 | // getChildren
37 | // getEntities
38 | // getEventLevel // Info, warn, error, default
39 |
40 | }
41 |
--------------------------------------------------------------------------------
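The comment block in LogEntry.java above describes raw entries being progressively refined into more specific entries. A hypothetical walk-through of that flow using the factory methods defined in the classes that follow; the example log line and its exact format are assumptions, since the parsing constants at the top of LogEntryFromSolr.java are missing from this snapshot:

package com.lucidworks.dq.logs;

// Hypothetical demonstration of the raw -> refined log entry flow described in LogEntry.java.
public class LogEntryRefinementSketch {
  public static void main( String[] args ) {
    // Assumed Solr-style request log line; the real pattern constants are not shown in this snapshot
    String rawLine = "INFO  - SolrCore [collection1] webapp=/solr path=/select"
        + " params={q=*:*&rows=10&wt=json} hits=42 status=0 QTime=3";

    // Step 1: wrap the raw text in a generic entry
    LogEntry raw = LogEntryBase.logEntryFromString( rawLine );

    // Step 2: try to refine it into a Solr-specific entry; null means "not a Solr request line"
    LogEntry refined = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( raw );
    if ( null != refined ) {
      LogEntryFromSolr solrEntry = (LogEntryFromSolr) refined;
      System.out.println( "params = " + solrEntry.getParamsString() );
      System.out.println( "names  = " + solrEntry.getParamNames() );
      System.out.println( "parsed = " + solrEntry.getParsedSolrParams() );
    }
  }
}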
/src/main/java/com/lucidworks/dq/logs/LogEntryBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collection;
5 |
6 | public class LogEntryBase implements LogEntry {
7 |
8 | String rawText;
9 | Collection<LogEntryReference> references = new ArrayList<>();
10 |
11 | LogEntryBase( String rawText ) {
12 | this.rawText = rawText;
13 | }
14 |
15 | @Override
16 | public String getRawText() {
17 | return rawText;
18 | }
19 | public void setRawText( String rawText ) {
20 | this.rawText = rawText;
21 | }
22 |
23 | public static LogEntry logEntryFromString( String rawText ) {
24 | return new LogEntryBase( rawText );
25 | }
26 |
27 |
28 | @Override
29 | public Collection<LogEntryReference> getReferences() {
30 | return references;
31 | }
32 |
33 | // @Override
34 | public void addReference(LogEntryReference ref) {
35 | references.add( ref );
36 | }
37 |
38 | /*
39 | * Throw exception so that derived classes are allowed to do so
40 | */
41 | public static void main(String[] args) throws Exception {
42 | for ( int i=0; i<args.length; i++ ) {
...
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryFromSolr.java:
--------------------------------------------------------------------------------
...
71 | Map<String,Collection<String>> parsedParamValues;
72 |
73 | // factory method
74 | public static LogEntry solrLogEntryFromBaseEntryOrNull( LogEntry entry ) {
75 | LogEntryFromSolr newEntry = new LogEntryFromSolr( entry );
76 | if ( newEntry.isSolrPattern() ) {
77 | return newEntry;
78 | }
79 | else {
80 | return null;
81 | }
82 | }
83 |
84 | LogEntryFromSolr( LogEntry entry ) {
85 | this( entry.getRawText() );
86 | this.earlierEntry = entry;
87 | init( entry.getRawText() );
88 | }
89 | LogEntryFromSolr(String rawText) {
90 | super( rawText );
91 | init( rawText );
92 | }
93 | // need init broken out so constructor1 can store earlierEntry before calling this
94 | void init( String rawText ) {
95 | this.originalText = rawText;
96 | paramsPattern = Pattern.compile( PARAMS_PATTERN_STR );
97 | paramsMatcher = paramsPattern.matcher( rawText );
98 | if ( paramsMatcher.find() ) {
99 | String matchStr = paramsMatcher.group();
100 | setRawText( matchStr );
101 | int overallStart = paramsMatcher.start();
102 | int overallEnd = paramsMatcher.end();
103 |
104 | int group = 1;
105 | paramsString = paramsMatcher.group( group );
106 | paramsStart = paramsMatcher.start( group );
107 | paramsEnd = paramsMatcher.end( group );
108 | // Make relative to overall pattern match
109 | paramsStart -= overallStart;
110 | // paramsEnd = overallEnd - paramsEnd;
111 | // Relative-to-end might not work in streaming apps since we wouldn't know where the end is
112 | paramsEnd -= overallStart;
113 |
114 |
115 | // TODO: look for other things like the handler, matches and qtime
116 |
117 | // Hookup references *if* we were created from an earlier log entry
118 | if ( null != this.earlierEntry ) {
119 | LogEntryReference ref = new LogEntryReferenceBase( this.earlierEntry, this, "LogEntryFromSolr" );
120 | // ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( paramsStart, paramsEnd );
121 | ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( overallStart, overallEnd );
122 | }
123 |
124 | doSimpleFieldParsing();
125 |
126 | isSolrPattern = true;
127 | }
128 | }
129 |
130 | public String makeParamNamesKey() {
131 | return StringUtils.join( getParsedSolrParams().keySet(), "|" );
132 | }
133 | public Set<String> getParamNames() {
134 | return getParsedSolrParams().keySet();
135 | }
136 | public Collection<String> getParamValues( String paramName ) {
137 | return getParsedSolrParams().get( paramName );
138 | }
139 |
140 | public static Map<String,Long> tabulateQueryArgCombos( Collection<LogEntryFromSolr> entries ) {
141 | Map<String,Long> counts = new HashMap<>();
142 | for ( LogEntryFromSolr e : entries ) {
143 | String key = e.makeParamNamesKey();
144 | SetUtils.incrementMapCounter( counts, key );
145 | }
146 | return counts;
147 | }
148 | // { composite-parameter-key -> { each-parameter-name-> { unique-value: count } } }
149 | public static Map<String,Map<String,Map<String,Long>>> tabulateQueryArgCombosAndValues( Collection<LogEntryFromSolr> entries ) {
150 | // Level 1: by Composite Key
151 | Map<String,Map<String,Map<String,Long>>> nestedCounts = new HashMap<>();
152 | // Foreach Raw Entry
153 | for ( LogEntryFromSolr e : entries ) {
154 |
155 | String overallKey = e.makeParamNamesKey();
156 | // Level 2: by Parameter Name
157 | Map<String,Map<String,Long>> paramsAndValues = null;
158 | if ( nestedCounts.containsKey(overallKey) ) {
159 | paramsAndValues = nestedCounts.get(overallKey);
160 | }
161 | else {
162 | paramsAndValues = new TreeMap<>(); // LinkedHashMap<>();
163 | nestedCounts.put( overallKey, paramsAndValues );
164 | }
165 |
166 | Set<String> paramNames = e.getParamNames();
167 | // Foreach Parameter Name
168 | for ( String name : paramNames ) {
169 | // Level 3: by Value
170 | Map<String,Long> tabulatedValues = null;
171 | if ( paramsAndValues.containsKey(name) ) {
172 | tabulatedValues = paramsAndValues.get(name);
173 | }
174 | else {
175 | tabulatedValues = new LinkedHashMap<>();
176 | paramsAndValues.put( name, tabulatedValues );
177 | }
178 | Collection<String> rawValues = e.getParamValues( name );
179 | for ( String rv : rawValues ) {
180 | Long count = 0L;
181 | if ( tabulatedValues.containsKey(rv) ) {
182 | count = tabulatedValues.get(rv);
183 | }
184 | count += 1L;
185 | tabulatedValues.put( rv, count );
186 | }
187 |
188 | } // End Foreach Parameter Name
189 |
190 | } // End Foreach Raw Entry
191 |
192 | return nestedCounts;
193 | }
194 |
195 | void doSimpleFieldParsing() {
196 | parseHandlerName();
197 | parseCollectionName();
198 | parseHits();
199 | parseStatus();
200 | parseQTime();
201 | }
202 | void parseHandlerName() {
203 | handlerName = StringUtils.parseAndCatchGroupAsStringOrNull( HANDLER_PATTERN_STR, getOriginalText(), 1 );
204 | }
205 | void parseCollectionName() {
206 | collectionName = StringUtils.parseAndCatchGroupAsStringOrNull( COLLECTION_PATTERN_STR, getOriginalText(), 1 );
207 | }
208 | void parseHits() {
209 | hits = StringUtils.parseAndCatchGroupAsLongOrNull( HITS_PATTERN_STR, getOriginalText(), 1 );
210 | }
211 | void parseStatus() {
212 | status = StringUtils.parseAndCatchGroupAsLongOrNull( STATUS_PATTERN_STR, getOriginalText(), 1 );
213 | }
214 | void parseQTime() {
215 | qTime = StringUtils.parseAndCatchGroupAsLongOrNull( QTIME_PATTERN_STR, getOriginalText(), 1 );
216 | }
217 |
218 | // Not thread safe, but OK for now, for single thread utility
219 | public Map<String,Collection<String>> getParsedSolrParams() {
220 | if ( null==parsedParamValues ) {
221 | parsedParamValues = StringUtils.parseCgiParameters( getParamsString() );
222 | }
223 | return parsedParamValues;
224 | }
225 |
226 | public boolean isSolrPattern() {
227 | return isSolrPattern;
228 | }
229 |
230 | public String getParamsString() {
231 | return paramsString;
232 | }
233 |
234 | String getOriginalText() {
235 | return originalText;
236 | }
237 |
238 | String getHandlerName() {
239 | return handlerName;
240 | }
241 | String getCollectionName() {
242 | return collectionName;
243 | }
244 | // Don't really need Longs here, but it's what utility returns
245 | /*
246 | * get number of Matches
247 | */
248 | Long getHits() {
249 | return hits;
250 | }
251 | /*
252 | * Similar to HTTP Numeric Status Code
253 | * Eg: 200, 500, etc.
254 | */
255 | Long getStatus() {
256 | return status;
257 | }
258 | /*
259 | * Query time in milliseconds
260 | * may not include transmission time of payload to requesting client
261 | */
262 | Long getQTime() {
263 | return qTime;
264 | }
265 |
266 | public static void main(String[] args) throws IOException {
267 | for ( int i=0; i<args.length; i++ ) {
268 | LogFileRepo repo = new LogFileRepoBase( args[i] );
271 | Collection<LogFile> logs = repo.findLogFiles();
272 | for ( LogFile lf : logs ) {
273 | lf.read();
274 | Collection<LogEntry> rawEntries = lf.getEntries();
275 | Collection<LogEntryFromSolr> solrEntries = new ArrayList<>();
276 | for ( LogEntry rawEntry : rawEntries ) {
277 | // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry );
278 | LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry );
279 | // if ( solrEntry.isSolrPattern() )
280 | if ( null != solrEntry )
281 | {
282 | solrEntries.add( (LogEntryFromSolr) solrEntry );
283 | }
284 | }
285 |
286 | // Tabulate
287 | Map<String,Long> queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries );
288 | // composite-parameter-key -> each-parameter-name-> unique-value -> count
289 | Map<String,Map<String,Map<String,Long>>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries );
290 | queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts );
291 | queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts );
292 |
293 | // Report
294 | for ( Entry<String,Long> e1 : queryTypeCounts.entrySet() ) {
295 | String queryType = e1.getKey();
296 | Long queryTypeCount = e1.getValue();
297 | System.out.println( "" + queryTypeCount + " " + queryType );
298 | Map<String,Map<String,Long>> statsForQueryType = detailedStats.get( queryType );
299 | for ( Entry<String,Map<String,Long>> e2 : statsForQueryType.entrySet() ) {
300 | String paramName = e2.getKey();
301 | System.out.println( "\t" + paramName + ":" );
302 | Map<String,Long> paramValues = e2.getValue();
303 | paramValues = SetUtils.sortMapByValues( paramValues );
304 | paramValues = SetUtils.reverseMapEntryKeyOrder( paramValues );
305 | for ( Entry<String,Long> e3 : paramValues.entrySet() ) {
306 | String value = e3.getKey();
307 | Long valueCount = e3.getValue();
308 | System.out.println( "\t\t" + valueCount + " " + value );
309 | }
310 | }
311 | }
312 | }
313 | // System.out.println( repo );
314 | }
315 |
316 | }
317 |
318 | }
319 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryGroup.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.Collection;
4 |
5 | /*
6 | * TODO: Do we really need this?
7 | * Pro: good abstraction, might develop additional features
8 | * Con: converting back and forth between this and Collection
9 | */
10 | public interface LogEntryGroup /*extends Collection*/ {
11 | Collection<LogEntry> getEntries();
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryGroupFromSolr.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.Collection;
4 |
5 | public class LogEntryGroupFromSolr implements LogEntryGroup {
6 |
7 | @Override
8 | public Collection<LogEntry> getEntries() {
9 | // TODO Auto-generated method stub
10 | return null;
11 | }
12 |
13 | public static void main(String[] args) {
14 | // TODO Auto-generated method stub
15 |
16 | }
17 |
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryReference.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.Collection;
4 |
5 | public interface LogEntryReference {
6 | Collection<LogEntry> getEarlierEntries();
7 | Collection<LogEntry> getLaterEntries();
8 | //void addEarlierEntry( LogEntry entry );
9 | //void addLaterEntry( LogEntry entry );
10 |
11 | String getComment();
12 | //void setComment( String comment );
13 |
14 | int getRelativeStart();
15 | int getRelativeEnd();
16 | //void setRelativeRegionOfInterest( int fromStart, int fromEnd );
17 | //void setRelativeStart( int fromStart );
18 | //void setRelativeEnd( int fromEnd );
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryReferenceBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collection;
5 |
6 | public class LogEntryReferenceBase implements LogEntryReference {
7 |
8 | String comment;
9 | // LogEntryGroup is approx Collection
10 | Collection<LogEntry> earlierEntries = new ArrayList<>();
11 | Collection<LogEntry> laterEntries = new ArrayList<>();
12 |
13 | int relativeRegionOfInterestStart;
14 | int relativeRegionOfInterestEnd;
15 |
16 | public LogEntryReferenceBase() { }
17 |
18 | public LogEntryReferenceBase( LogEntry earlierEntry, LogEntry laterEntry, String comment ) {
19 | this();
20 | // Link to log entries
21 | addEarlierEntry( earlierEntry );
22 | addLaterEntry( laterEntry );
23 | // Link log entries back to us
24 | ( (LogEntryBase)earlierEntry ).addReference( this );
25 | ( (LogEntryBase)laterEntry ).addReference( this );
26 | setComment( comment );
27 | }
28 |
29 | @Override
30 | public Collection<LogEntry> getEarlierEntries() {
31 | return earlierEntries;
32 | }
33 | public void addEarlierEntry( LogEntry entry ) {
34 | earlierEntries.add( entry );
35 | }
36 |
37 | @Override
38 | public Collection<LogEntry> getLaterEntries() {
39 | return laterEntries;
40 | }
41 | public void addLaterEntry( LogEntry entry ) {
42 | laterEntries.add( entry );
43 | }
44 |
45 | @Override
46 | public String getComment() {
47 | return comment;
48 | }
49 | public void setComment( String comment ) {
50 | this.comment = comment;
51 | }
52 |
53 | @Override
54 | public int getRelativeStart() {
55 | return relativeRegionOfInterestStart;
56 | }
57 | @Override
58 | public int getRelativeEnd() {
59 | return relativeRegionOfInterestEnd;
60 | }
61 | //@Override
62 | public void setRelativeRegionOfInterest( int fromStart, int fromEnd ) {
63 | relativeRegionOfInterestStart = fromStart;
64 | relativeRegionOfInterestEnd = fromEnd;
65 | }
66 | //@Override
67 | public void setRelativeStart( int fromStart ) {
68 | this.relativeRegionOfInterestStart = fromStart;
69 | }
70 | //@Override
71 | public void setRelativeEnd( int fromEnd ) {
72 | this.relativeRegionOfInterestEnd = fromEnd;
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFile.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | public interface LogFile extends LogEntryGroup {
7 |
8 | void read() throws IOException;
9 |
10 | // Inherits getEntries() from super
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.FileNotFoundException;
7 | import java.io.IOException;
8 | import java.io.InputStreamReader;
9 | import java.io.UnsupportedEncodingException;
10 | import java.util.ArrayList;
11 | import java.util.Collection;
12 |
13 | public class LogFileBase implements LogFile {
14 |
15 | // TODO: could leave this NULL until they've called .process() ?
16 | Collection<LogEntry> entries = new ArrayList<>();
17 | File sourceFile;
18 |
19 | // Public "factory" methods
20 | public static LogFile logFileFromDiskFile( File inFile ) throws IOException {
21 | return new LogFileBase( inFile );
22 | }
23 | public static LogFile logFileFromDiskFile( String fileName ) throws IOException {
24 | return new LogFileBase( new File(fileName) );
25 | }
26 |
27 | LogFileBase( File sourceFile ) {
28 | this.sourceFile = sourceFile;
29 | }
30 |
31 | // Break processing logic out from the constructor
32 | // in case we want to defer it
33 | @Override
34 | public void read() throws IOException {
35 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(sourceFile), "UTF-8"));
36 | while( true ) {
37 | String line = in.readLine();
38 | if ( null==line ) {
39 | break;
40 | }
41 | LogEntry entry = LogEntryBase.logEntryFromString( line );
42 | entries.add( entry );
43 | }
44 | in.close();
45 | }
46 |
47 | @Override
48 | public Collection<LogEntry> getEntries() {
49 | return entries;
50 | }
51 |
52 |
53 | public static void main(String[] args) throws IOException {
54 | for ( int i=0; i<args.length; i++ ) {
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileRepo.java:
--------------------------------------------------------------------------------
9 | //Collection<LogFile> findLogFiles( File startingDirOrFile );
10 | //Collection findLogFiles( Collection startingDirOrFiles );
11 | Collection<LogFile> findLogFiles();
12 |
13 | // TODO: maybe Log *File* Repo is a filesystem impl of a more generic Log Unit Source Repo
14 | // TODO: although we really do need setters, should they be defined in the interface?
15 | String getIncludePattern();
16 | void setIncludePattern( String pattern );
17 | boolean getIncludeCompressedFiles();
18 | void setIncludeCompressedFiles( boolean flag );
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileRepoBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.logs;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.Collection;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Queue;
10 | import java.util.concurrent.ConcurrentLinkedQueue;
11 |
12 | import com.lucidworks.dq.util.SetUtils;
13 |
14 | public class LogFileRepoBase implements LogFileRepo {
15 |
16 | Collection<File> myQueue = new ConcurrentLinkedQueue<>();
17 |
18 | File startingDirOrFile;
19 |
20 | // Regex, Optional
21 | String includePattern;
22 |
23 | boolean shouldIncludeCompressedFiles;
24 |
25 | public LogFileRepoBase( String startingDirOrFile ) {
26 | this( new File(startingDirOrFile) );
27 | }
28 | public LogFileRepoBase( File startingDirOrFile ) {
29 | this.startingDirOrFile = startingDirOrFile;
30 | }
31 |
32 | @Override
33 | public Collection<LogFile> findLogFiles() {
34 | traverse( myQueue, startingDirOrFile );
35 | Collection<LogFile> outList = new ArrayList<>();
36 | for ( File f : myQueue ) {
37 | LogFile lf = new LogFileBase( f );
38 | outList.add( lf );
39 | }
40 | return outList;
41 | }
42 |
43 | @Override
44 | public void setIncludePattern(String pattern) {
45 | this.includePattern = pattern;
46 | }
47 | @Override
48 | public String getIncludePattern() {
49 | return includePattern;
50 | }
51 |
52 | @Override
53 | public void setIncludeCompressedFiles(boolean flag) {
54 | this.shouldIncludeCompressedFiles = flag;
55 | }
56 | @Override
57 | public boolean getIncludeCompressedFiles() {
58 | return shouldIncludeCompressedFiles;
59 | }
60 |
61 | //Lookup all the files
62 | //traverse( myQueue, "someDirName" );
63 | //Or simpler
64 | //Collection<File> files = new LinkedHashSet<>();
65 | //traverse( files, "someDirName" );
66 |
67 | //TODO: would be better to pass in method to call
68 | void traverse( Collection<File> queue, String startDir ) {
69 | traverse( queue, new File(startDir) );
70 | }
71 | void traverse( Collection<File> queue, File candidate ) {
72 | if( candidate.isFile() ) {
73 | if ( null==getIncludePattern() || candidate.toString().matches(getIncludePattern()) ) {
74 | queue.add( candidate );
75 | }
76 | }
77 | // Else probably a directory
78 | else if ( candidate.isDirectory() ) {
79 | File [] entries = candidate.listFiles();
80 | for ( File f : entries ) {
81 | traverse( queue, f );
82 | }
83 | }
84 | else {
85 | System.out.println( "ERROR: Neither file nor directory: " + candidate );
86 | }
87 | }
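// Illustrative usage sketch (added; path and pattern are hypothetical, not from the original source):
//   LogFileRepo repo = new LogFileRepoBase( "/var/solr/logs" );
//   repo.setIncludePattern( ".*\\.log" );
//   Collection<LogFile> logs = repo.findLogFiles();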
88 |
89 | public static void main(String[] args) throws IOException {
90 | // Moved to LogEntryFromSolr main
91 |
92 | // for ( int i=0; i<args.length; i++ ) {
94 | // Collection<LogFile> logs = repo.findLogFiles();
95 | // for ( LogFile lf : logs ) {
96 | // lf.read();
97 | // Collection rawEntries = lf.getEntries();
98 | // Collection solrEntries = new ArrayList<>();
99 | // for ( LogEntry rawEntry : rawEntries ) {
100 | // // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry );
101 | // LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry );
102 | // // if ( solrEntry.isSolrPattern() )
103 | // if ( null != solrEntry )
104 | // {
105 | // solrEntries.add( (LogEntryFromSolr) solrEntry );
106 | // }
107 | // }
108 | // Map<String,Long> queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries );
109 | // // composite-parameter-key -> each-parameter-name-> unique-value -> count
110 | // Map<String,Map<String,Map<String,Long>>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries );
111 | // queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts );
112 | // queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts );
113 | // for ( Entry<String,Long> e1 : queryTypeCounts.entrySet() ) {
114 | // String queryType = e1.getKey();
115 | // Long queryTypeCount = e1.getValue();
116 | // System.out.println( "" + queryTypeCount + " " + queryType );
117 | // Map<String,Map<String,Long>> statsForQueryType = detailedStats.get( queryType );
118 | // for ( Entry<String,Map<String,Long>> e2 : statsForQueryType.entrySet() ) {
119 | // String paramName = e2.getKey();
120 | // System.out.println( "\t" + paramName + ":" );
121 | // Map<String,Long> paramValues = e2.getValue();
122 | // paramValues = SetUtils.sortMapByValues( paramValues );
123 | // paramValues = SetUtils.reverseMapEntryKeyOrder( paramValues );
124 | // for ( Entry<String,Long> e3 : paramValues.entrySet() ) {
125 | // String value = e3.getKey();
126 | // Long valueCount = e3.getValue();
127 | // System.out.println( "\t\t" + valueCount + " " + value );
128 | // }
129 | // }
130 | // }
131 | // }
132 | // // System.out.println( repo );
133 | // }
134 |
135 | }
136 |
137 |
138 | }
139 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/Schema.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.util.Map;
4 | import java.util.Set;
5 |
6 | public interface Schema {
7 |
8 | // TODO: move throws Exception down to implementation level
9 | // and errors buffer
10 |
11 | public float getSchemaVersion() throws Exception;
12 |
13 | public String getSchemaName() throws Exception;
14 |
15 | public String getUniqueKeyFieldName() throws Exception;
16 |
17 | public String getSimilarityModelClassName() throws Exception;
18 |
19 | public String getDefaultOperator() throws Exception;
20 |
21 | public String getDefaultSearchField() throws Exception;
22 |
23 | public Map<String,Set<String>> getAllDeclaredAndDynamicFieldsByType() throws Exception;
24 |
25 | public Set<String> getAllSchemaFieldNames() throws Exception;
26 |
27 | public Set<String> getAllDynamicFieldPatterns() throws Exception;
28 |
29 | public Set<String> getAllFieldTypeNames() throws Exception;
30 |
31 | public Set<String> getAllCopyFieldSourceNames() throws Exception;
32 |
33 | public Set<String> getAllCopyFieldDestinationNames() throws Exception;
34 |
35 | public Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception;
36 |
37 | public Set<String> getCopyFieldSourcesForDestination(String destName) throws Exception;
38 |
39 | public String generateReport() throws Exception;
40 |
41 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SchemaBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.util.LinkedHashSet;
6 | import java.util.Map;
7 | import java.util.Set;
8 |
9 | public abstract class SchemaBase implements Schema {
10 |
11 | // Also helpful for debugging code
12 | @Override
13 | public String generateReport() throws Exception {
14 | StringWriter sw = new StringWriter();
15 | PrintWriter out = new PrintWriter(sw);
16 |
17 | // Singular Values
18 | String name = getSchemaName();
19 | out.println( "Schema Name: " + name );
20 | float vers = getSchemaVersion();
21 | out.println("Schema Version: " + vers);
22 | String key = getUniqueKeyFieldName();
23 | out.println( "Key Field: " + key );
24 | String defOp = getDefaultOperator();
25 | out.println( "Default Operator: " + defOp );
26 | String sim = getSimilarityModelClassName();
27 | out.println( "Similarity Class Name: " + sim );
28 | String defField = getDefaultSearchField();
29 | out.println( "Default Search Field: " + defField );
30 |
31 | // Complex Values
32 | Set<String> fields = getAllSchemaFieldNames();
33 | out.println();
34 | out.println( "Fields: " + fields );
35 |
36 | Set<String> dynFields = getAllDynamicFieldPatterns();
37 | out.println();
38 | out.println( "Dynamic Field Patterns: " + dynFields );
39 |
40 | Set<String> typeNames = getAllFieldTypeNames();
41 | out.println();
42 | out.println( "Types: " + typeNames );
43 |
44 | Map<String,Set<String>> typesAndNames = getAllDeclaredAndDynamicFieldsByType();
45 | out.println();
46 | out.println( "Type -> Fields: (declared and dynamic patterns)" );
47 | out.println( "\t(" + typesAndNames.size() + " types)" );
48 | for ( String type : typesAndNames.keySet() ) {
49 | out.println( "\t" + type + ":" );
50 | Set<String> typeFields = typesAndNames.get( type );
51 | out.println( "\t\t(" + typeFields.size() + " fields)" );
52 | for ( String field : typeFields ) {
53 | out.println( "\t\t" + field );
54 | }
55 | }
56 |
57 |
58 | Set<String> sourceNames = getAllCopyFieldSourceNames();
59 | out.println();
60 | out.println( "Copy Sources: " + sourceNames );
61 | for ( String source : sourceNames ) {
62 | Set<String> tmpDests = getCopyFieldDestinationsForSource(source);
63 | out.println( "\tFrom: '"+ source + "' To " + tmpDests );
64 | }
65 |
66 | Set<String> destNames = getAllCopyFieldDestinationNames();
67 | out.println();
68 | out.println( "Copy Destinations: " + destNames );
69 | for ( String dest : destNames ) {
70 | Set<String> tmpSrcs = getCopyFieldSourcesForDestination( dest );
71 | out.println( "\tDest: '"+ dest + "' From " + tmpSrcs );
72 | }
73 |
74 | String outStr = sw.toString();
75 | return outStr;
76 | }
77 |
78 | static void utilTabulateFieldTypeAndName( Map<String,Set<String>> map, String type, String name ) {
79 | if ( map.containsKey(type) ) {
80 | map.get(type).add( name );
81 | }
82 | else {
83 | Set<String> vector = new LinkedHashSet<>();
84 | vector.add( name );
85 | map.put( type, vector );
86 | }
87 | }
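// Descriptive note (added): this helper just appends a field name to the per-type set,
// creating the set on first use. For example, calling it with ("string","id") and then
// ("string","name") leaves map.get("string") containing [id, name] in insertion order.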
88 |
89 | @Override
90 | public abstract float getSchemaVersion() throws Exception;
91 | @Override
92 | public abstract String getSchemaName() throws Exception;
93 | @Override
94 | public abstract String getUniqueKeyFieldName() throws Exception;
95 | @Override
96 | public abstract String getSimilarityModelClassName() throws Exception;
97 | @Override
98 | public abstract String getDefaultOperator() throws Exception;
99 | @Override
100 | public abstract String getDefaultSearchField() throws Exception;
101 | @Override
102 | public abstract Set<String> getAllSchemaFieldNames() throws Exception;
103 | @Override
104 | public abstract Set<String> getAllDynamicFieldPatterns() throws Exception;
105 | @Override
106 | public abstract Set<String> getAllFieldTypeNames() throws Exception;
107 | @Override
108 | public abstract Set<String> getAllCopyFieldSourceNames() throws Exception;
109 | @Override
110 | public abstract Set<String> getAllCopyFieldDestinationNames() throws Exception;
111 | @Override
112 | public abstract Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception;
113 | @Override
114 | public abstract Set<String> getCopyFieldSourcesForDestination(String destName) throws Exception;
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SchemaFromLocalCore_broken.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.util.LinkedHashMap;
4 | import java.util.LinkedHashSet;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Map.Entry;
8 | import java.util.Properties;
9 | import java.util.Set;
10 |
11 | import org.apache.solr.common.util.NamedList;
12 | import org.apache.solr.core.ConfigSolr;
13 | import org.apache.solr.core.ConfigSolrXmlOld;
14 | import org.apache.solr.core.CoreContainer;
15 | import org.apache.solr.core.SolrCore;
16 | import org.apache.solr.core.SolrResourceLoader;
17 | import org.apache.solr.request.LocalSolrQueryRequest;
18 | import org.apache.solr.request.SolrQueryRequest;
19 | import org.apache.solr.schema.CopyField;
20 | import org.apache.solr.schema.FieldType;
21 | import org.apache.solr.schema.IndexSchema;
22 | import org.apache.solr.schema.IndexSchema.DynamicField;
23 | import org.apache.solr.schema.SchemaField;
24 |
25 | public class SchemaFromLocalCore_broken extends SchemaBase implements Schema {
26 |
27 | static String PATH1 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example";
28 | static String PATH2 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr";
29 | static String PATH3 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr/collection1";
30 |
31 | private IndexSchema schema;
32 |
33 | public SchemaFromLocalCore_broken( String path, String optCoreName ) {
34 | // TODO: currently broken, trouble finding info online, postponing for now
35 | SolrResourceLoader loader = new SolrResourceLoader( path );
36 | String confDir = loader.getConfigDir();
37 | String dataDir = loader.getDataDir();
38 | String instanceDir = loader.getInstanceDir();
39 | Properties props = loader.getCoreProperties();
40 | System.out.println( "path = " + path );
41 | System.out.println( "confDir = " + confDir );
42 | System.out.println( "dataDir = " + dataDir );
43 | System.out.println( "instanceDir = " + instanceDir );
44 | System.out.println( "props = " + props );
45 | ConfigSolr config = ConfigSolr.fromSolrHome( loader, path );
46 | CoreContainer container = new CoreContainer( loader, config );
47 | if ( container.getCores().isEmpty() ) {
48 | throw new IllegalArgumentException( "No cores found at " + path );
49 | }
50 | String coreName = optCoreName!=null ? optCoreName : ConfigSolrXmlOld.DEFAULT_DEFAULT_CORE_NAME;
51 | SolrCore core = container.getCore( coreName );
52 | if ( null==core ) {
53 | throw new IllegalArgumentException( "Unable to find core \"" + coreName + "\" at " + path );
54 | }
55 | // SolrQueryRequest req = new LocalSolrQueryRequest( core, "*:*", null, 0, 0, null );
56 | NamedList args = new NamedList();
57 | SolrQueryRequest req = new LocalSolrQueryRequest( core, args );
58 | schema = req.getSchema();
59 | };
60 |
61 | public float getSchemaVersion() throws Exception {
62 | return schema.getVersion();
63 | }
64 |
65 | public String getSchemaName() throws Exception {
66 | return schema.getSchemaName();
67 | }
68 |
69 | public String getUniqueKeyFieldName() throws Exception {
70 | return schema.getUniqueKeyField().getName();
71 | }
72 |
73 | public String getSimilarityModelClassName() throws Exception {
74 | return schema.getSimilarity().getClass().getName();
75 | }
76 |
77 | // TODO: not sure where this comes from
78 | public String getDefaultOperator() throws Exception {
79 | return null;
80 | }
81 |
82 | public String getDefaultSearchField() throws Exception {
83 | return schema.getDefaultSearchFieldName();
84 | }
85 |
86 | public Map<String,Set<String>> getAllDeclaredAndDynamicFieldsByType() {
87 | Map<String,Set<String>> out = new LinkedHashMap<>();
88 | return out;
89 | //return null;
90 | }
91 |
92 | public Set<String> getAllSchemaFieldNames() throws Exception {
93 | Map<String,SchemaField> fields = schema.getFields();
94 | return fields.keySet();
95 | // return new LinkedHashSet<>( fields.keySet() );
96 | }
97 |
98 | public Set<String> getAllDynamicFieldPatterns() throws Exception {
99 | DynamicField[] dynFields = schema.getDynamicFields();
100 | Set<String> out = new LinkedHashSet<>();
101 | for ( DynamicField df : dynFields ) {
102 | out.add( df.getRegex() );
103 | }
104 | return out;
105 | }
106 |
107 | public Set<String> getAllFieldTypeNames() throws Exception {
108 | Map<String,FieldType> types = schema.getFieldTypes();
109 | return types.keySet();
110 | }
111 |
112 | public Set<String> getAllCopyFieldSourceNames() throws Exception {
113 | Map<String,List<CopyField>> copyMap = schema.getCopyFieldsMap();
114 | return copyMap.keySet();
115 | }
116 |
117 | public Set<String> getAllCopyFieldDestinationNames() throws Exception {
118 | Set<String> out = new LinkedHashSet<>();
119 | Map<String,List<CopyField>> copyMap = schema.getCopyFieldsMap();
120 | for ( Entry<String,List<CopyField>> copyEntry : copyMap.entrySet() ) {
121 | // String srcFieldName = copyEntry.getKey();
122 | List<CopyField> copyList = copyEntry.getValue();
123 | for ( CopyField cf : copyList ) {
124 | SchemaField destField = cf.getDestination();
125 | out.add( destField.getName() );
126 | }
127 | }
128 | return out;
129 | }
130 |
131 | public Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception {
132 | Set<String> out = new LinkedHashSet<>();
133 | List<CopyField> copyList = schema.getCopyFieldsList( sourceName );
134 | if ( null==copyList || copyList.isEmpty() ) {
135 | return out;
136 | }
137 | for ( CopyField cf : copyList ) {
138 | SchemaField destField = cf.getDestination();
139 | out.add( destField.getName() );
140 | }
141 | return out;
142 | }
143 |
144 | public Set<String> getCopyFieldSourcesForDestination(String targetDestName) throws Exception {
145 | Set<String> out = new LinkedHashSet<>();
146 | Map<String,List<CopyField>> copyMap = schema.getCopyFieldsMap();
147 | for ( Entry<String,List<CopyField>> copyEntry : copyMap.entrySet() ) {
148 | String srcFieldName = copyEntry.getKey();
149 | List<CopyField> copyList = copyEntry.getValue();
150 | for ( CopyField cf : copyList ) {
151 | SchemaField destField = cf.getDestination();
152 | String destFieldName = destField.getName();
153 | if ( destFieldName.equals(targetDestName) ) {
154 | out.add( srcFieldName );
155 | }
156 | }
157 | }
158 | return out;
159 | }
160 |
161 | // public String generateReport() throws Exception;
162 |
163 | public static void main( String[] argv ) throws Exception {
164 | Schema schema = new SchemaFromLocalCore_broken( PATH3, null );
165 | schema.generateReport();
166 | }
167 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfig.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.util.Collection;
4 |
5 | import javax.xml.xpath.XPathExpressionException;
6 |
7 | public interface SolrConfig {
8 |
9 | public String generateReport() throws Exception;
10 |
11 | // Can't return float, could be const or config
12 | public String getLuceneMatchVersion() throws Exception;
13 |
14 | // Can't return bool, could be const or config
15 | public String getAbortOnConfigurationError() throws Exception;
16 |
17 | public Collection<String> getRequestHandlers() throws Exception;
18 |
19 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfigBase.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.util.Collection;
6 |
7 | public abstract class SolrConfigBase implements SolrConfig {
8 |
9 | @Override
10 | public String generateReport() throws Exception {
11 | StringWriter sw = new StringWriter();
12 | PrintWriter out = new PrintWriter(sw);
13 |
14 | // Singular Values
15 |
16 | String version = getLuceneMatchVersion();
17 | out.println( "Lucene Match Version = " + version );
18 | String abort = getAbortOnConfigurationError();
19 | out.println( "Abort on config error = " + abort );
20 |
21 | // Complex Values
22 |
23 | Collection<String> handlers = getRequestHandlers();
24 | out.println();
25 | out.println( "Request Handlers and Classes:" );
26 | for ( String handler : handlers ) {
27 | out.println( "\t" + handler );
28 | }
29 |
30 | String outStr = sw.toString();
31 | return outStr;
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfigFromXml.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.schema;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.net.URL;
8 | import java.util.ArrayList;
9 | import java.util.Collection;
10 | import java.util.LinkedHashSet;
11 | import java.util.Set;
12 |
13 | import javax.xml.parsers.DocumentBuilder;
14 | import javax.xml.parsers.DocumentBuilderFactory;
15 | import javax.xml.parsers.ParserConfigurationException;
16 | import javax.xml.xpath.XPath;
17 | import javax.xml.xpath.XPathConstants;
18 | import javax.xml.xpath.XPathExpressionException;
19 | import javax.xml.xpath.XPathFactory;
20 |
21 | import org.w3c.dom.Document;
22 | import org.w3c.dom.NamedNodeMap;
23 | import org.w3c.dom.NodeList;
24 | import org.xml.sax.SAXException;
25 | import org.w3c.dom.Node;
26 |
27 | public class SolrConfigFromXml extends SolrConfigBase implements SolrConfig {
28 | // get from resources folder
29 | static String CONFIG_FILE_NAME = "solrconfig-480.xml";
30 |
31 | Document document;
32 | XPathFactory xpathFactory = XPathFactory.newInstance();
33 | private final String prefix = null;
34 | private final String name = "";
35 |
36 | // Note: Some of this code was copied from:
37 | // * Solr's IndexSchema.java
38 | // * Solr's Config.java
39 |
40 |
41 | public SolrConfigFromXml() throws ParserConfigurationException, IOException, SAXException {
42 | // this( SCHEMA_FILE_NAME );
43 | //URL schemaPath = this.getClass().getResource( CONFIG_FILE_NAME );
44 | //init( schemaPath );
45 | init( (URL) null );
46 | }
47 | public SolrConfigFromXml( File schemaPath ) throws ParserConfigurationException, SAXException, IOException {
48 | // URI uri = schemaPath.toURI();
49 | // URL url = uri.toURL();
50 | // init( url );
51 | InputStream is = new FileInputStream( schemaPath );
52 | init( is );
53 | }
54 | public SolrConfigFromXml( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
55 | init( schemaPath );
56 | }
57 | void init( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
58 | if ( null==schemaPath ) {
59 | schemaPath = this.getClass().getClassLoader().getResource( CONFIG_FILE_NAME );
60 | }
61 | InputStream is = schemaPath.openConnection().getInputStream();
62 | init( is );
63 | }
64 | void init( InputStream in ) throws ParserConfigurationException, SAXException, IOException {
65 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
66 | DocumentBuilder builder = factory.newDocumentBuilder();
67 | this.document = builder.parse( in );
68 | xpathFactory = XPathFactory.newInstance();
69 | }
70 |
71 | // Can't return float, could be const or config
72 | /* (non-Javadoc)
73 | * @see com.lucidworks.dq.schema.SolrConfig#getLuceneMatchVersion()
74 | */
75 | @Override
76 | public String getLuceneMatchVersion() throws Exception {
77 | XPath xpath = xpathFactory.newXPath();
78 | // "/config/luceneMatchVersion"
79 | String expression = stepsToPath(CONFIG, LUCENE_VERSION);
80 | // float version = getFloat(expression, 0.0f);
81 | Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
82 | String payload = null;
83 | if ( null!=nd ) {
84 | // payload = nd.getNodeValue();
85 | payload = nd.getTextContent();
86 | }
87 | return payload;
88 | }
89 |
90 | // Can't return bool, could be const or config
91 | /* (non-Javadoc)
92 | * @see com.lucidworks.dq.schema.SolrConfig#getAbortOnConfigurationError()
93 | */
94 | @Override
95 | public String getAbortOnConfigurationError() throws Exception {
96 | XPath xpath = xpathFactory.newXPath();
97 | // "/config/abortOnConfigurationError"
98 | String expression = stepsToPath(CONFIG, ABORT);
99 | Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
100 | String payload = null;
101 | if ( null!=nd ) {
102 | payload = nd.getTextContent();
103 | }
104 | return payload;
105 | }
106 |
107 | // TODO: getLibs:
108 | // TODO: getDataDir: ${solr.data.dir:}
109 | // TODO: getDirectoryFactory:
110 | // TODO: getIndexConfig (nested!):
111 | // TODO:
112 | // TODO:
113 | // TODO: nested
114 | // TODO: Nested:
115 | // TODO: Nested:
116 | // * TODO: Request Handlers, Nested!
117 | //
118 | //
119 | //
120 | //
121 | //
122 | //
123 | //
124 | //
125 | // Parts copied from Solr's IndexSchema .loadFields
126 | /* (non-Javadoc)
127 | * @see com.lucidworks.dq.schema.SolrConfig#getRequestHandlers()
128 | */
129 | @Override
130 | public Collection<String> getRequestHandlers() throws XPathExpressionException {
131 | Collection<String> out = new ArrayList<>();
132 | XPath xpath = xpathFactory.newXPath();
133 | // /schema/fields/field | /schema/fields/dynamicField
134 | // | /schema/field | /schema/dynamicField
135 | // Note: could remove OR and eliminate node name check, but this is closer to Solr code
136 | String expression = stepsToPath(CONFIG, HANDLER);
137 | NodeList nodes = (NodeList)xpath.evaluate(expression, document, XPathConstants.NODESET);
139 | for (int i=0; i<nodes.getLength(); i++) {
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/CharUtils.java:
--------------------------------------------------------------------------------
19 | static final Map<Integer,String> TYPES = new HashMap<Integer,String>() {{
20 | // put( 1, "R / DIRECTIONALITY_RIGHT_TO_LEFT" );
21 | // put( 2, "AL / DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC" );
22 | // put( 11, "S / DIRECTIONALITY_SEGMENT_SEPARATOR" );
23 | // put( 12, "WS / DIRECTIONALITY_WHITESPACE" );
24 | put( 1, "Lu_UPPERCASE_LETTER" );
25 | put( 2, "Ll_LOWERCASE_LETTER" );
26 | put( 3, "Lt_TITLECASE_LETTER" );
27 | put( 4, "Lm_MODIFIER_LETTER" );
28 | put( 5, "Lo_OTHER_LETTER" );
29 | put( 6, "Mn_NON_SPACING_MARK" );
30 | put( 7, "Me_ENCLOSING_MARK" );
31 | put( 8 , "Mc_COMBINING_SPACING_MARK" );
32 | put( 9, "Nd_DECIMAL_DIGIT_NUMBER" );
33 | put( 11, "No_OTHER_NUMBER" );
34 | put( 12, "Zs_SPACE_SEPARATOR" );
35 | put( 13, "Zl_LINE_SEPARATOR" );
36 | put( 14, "Zp_PARAGRAPH_SEPARATOR" );
37 | put( 15, "Cc_CONTROL" );
38 | put( 16, "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
39 | // 17?
40 | put( 18, "Co_PRIVATE_USE" );
41 | put( 19, "Cs_SURROGATE" );
42 | put( 20, "Pd_DASH_PUNCTUATION" );
43 | put( 21, "Ps_START_PUNCTUATION" );
44 | put( 22, "Pe_END_PUNCTUATION" );
45 | put( 23, "Pc_CONNECTOR_PUNCTUATION" );
46 | put( 24, "Po_OTHER_PUNCTUATION" );
47 | put( 25, "Sm_MATH_SYMBOL" );
48 | put( 26, "Sc_CURRENCY_SYMBOL" );
49 | put( 27, "Sk_MODIFIER_SYMBOL" );
50 | put( 28, "So_OTHER_SYMBOL" );
51 | put( 29, "Pi_INITIAL_QUOTE_PUNCTUATION" );
52 | put( 30, "Pf_FINAL_QUOTE_PUNCTUATION" );
53 | }};
54 |
55 | static final Map<String,String> ALIASES_SHORT_TO_LONG = new HashMap<String,String>() {{
56 | // Custom
57 | put( "Qm", QUESTION_MARK_NAME );
58 |
59 | // Script
60 | put( "Com", "COMMON" );
61 | put( "Lat", "LATIN" );
62 |
63 | // Block
64 | put( "Basic", "BASIC_LATIN" );
65 | put( "L1Sup", "LATIN_1_SUPPLEMENT" );
66 | put( "GenPunct", "GENERAL_PUNCTUATION" );
67 | put( "LetterSym", "LETTERLIKE_SYMBOLS" );
68 |
69 | // Types
70 | put( "UPPER", "Lu_UPPERCASE_LETTER" );
71 | put( "lower", "Ll_LOWERCASE_LETTER" );
72 | put( "Title", "Lt_TITLECASE_LETTER" );
73 | put( "ModL", "Lm_MODIFIER_LETTER" );
74 | put( "OtherL", "Lo_OTHER_LETTER" );
75 | put( "NonSpc", "Mn_NON_SPACING_MARK" );
76 | put( "Encl", "Me_ENCLOSING_MARK" );
77 | put( "Combining" , "Mc_COMBINING_SPACING_MARK" );
78 | put( "Digit", "Nd_DECIMAL_DIGIT_NUMBER" );
79 | put( "OtherNum", "No_OTHER_NUMBER" );
80 | put( "Space", "Zs_SPACE_SEPARATOR" );
81 | put( "Line", "Zl_LINE_SEPARATOR" );
82 | put( "Para", "Zp_PARAGRAPH_SEPARATOR" );
83 | put( "Ctrl", "Cc_CONTROL" );
84 | put( "Fmt", "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
85 | // 17?
86 | put( "Priv", "Co_PRIVATE_USE" );
87 | put( "Sur", "Cs_SURROGATE" );
88 | put( "Dash", "Pd_DASH_PUNCTUATION" );
89 | put( "Start", "Ps_START_PUNCTUATION" );
90 | put( "End", "Pe_END_PUNCTUATION" );
91 | put( "Conn", "Pc_CONNECTOR_PUNCTUATION" );
92 | put( "OtherP", "Po_OTHER_PUNCTUATION" );
93 | put( "Math", "Sm_MATH_SYMBOL" );
94 | put( "Currency", "Sc_CURRENCY_SYMBOL" );
95 | put( "ModSym", "Sk_MODIFIER_SYMBOL" );
96 | put( "OtherSym", "So_OTHER_SYMBOL" );
97 | put( "StartQ", "Pi_INITIAL_QUOTE_PUNCTUATION" );
98 | put( "EndQ", "Pf_FINAL_QUOTE_PUNCTUATION" );
99 | }};
100 |
101 | static final Map<String,String> ALIASES_LONG_TO_SHORT = new HashMap<String,String>();
102 | static {
103 | for ( Entry<String,String> entry : ALIASES_SHORT_TO_LONG.entrySet() ) {
104 | String shortName = entry.getKey();
105 | String longName = entry.getValue();
106 | ALIASES_LONG_TO_SHORT.put( longName, shortName );
107 | }
108 | }
109 |
110 | // Compound Aliases
111 | // Note: reversed order of initialization here
112 | static final Map<String,String> COMPOUND_ALIASES_LONG_TO_SHORT = new HashMap<String,String>() {{
113 | put( "Com-Basic-Space", "space" );
114 | put( "Lat-Basic-UPPER", "UPPER" );
115 | put( "Lat-Basic-lower", "lower" );
116 | put( "Com-Basic-Conn", "Connector" );
117 | put( "Com-Basic-Currency", "Currency" );
118 | put( "Com-Basic-Digit", "Digit" );
119 | put( "Com-Basic-OtherP", "OtherPunct" );
120 | put( "Com-L1Sup-OtherSym", "OtherSym" );
121 | put( "Com-Basic-Start", "Start" );
122 | put( "Com-Basic-End", "Stop" );
123 | put( "Com-Basic-Math", "Math" );
124 | put( "Com-Basic-Dash", "Dash1" );
125 | put( "Com-GenPunct-Dash", "Dash2" );
126 | put( "Com-LetterSym-OtherSym", "LetterSymbol" );
127 | put( "Com-Basic-Qm", "QuestionMark" ); // add suffix 1 when needed
128 | }};
129 | static final Map<String,String> COMPOUND_ALIASES_SHORT_TO_LONG = new HashMap<String,String>();
130 | static {
131 | for ( Entry<String,String> entry : COMPOUND_ALIASES_LONG_TO_SHORT.entrySet() ) {
132 | String longName = entry.getKey();
133 | String shortName = entry.getValue();
134 | COMPOUND_ALIASES_SHORT_TO_LONG.put( shortName, longName );
135 | }
136 | }
137 |
138 | static String generateReport() {
139 | return generateReportForRange( 0, 255 );
140 | }
141 | static String generateReportForRange( int min, int max ) {
142 | StringWriter sw = new StringWriter();
143 | PrintWriter out = new PrintWriter(sw);
144 |
145 | for ( int i=min; i<=max; i++ ) {
146 | addCharInfoToReport( out, i );
147 | }
148 |
149 | String outStr = sw.toString();
150 | return outStr;
151 | }
152 | static String generateReportForPoints( int ... codePoints ) {
153 | StringWriter sw = new StringWriter();
154 | PrintWriter out = new PrintWriter(sw);
155 |
156 | for ( int i : codePoints ) {
157 | addCharInfoToReport( out, i );
158 | }
159 |
160 | String outStr = sw.toString();
161 | return outStr;
162 | }
163 | static void addCharInfoToReport( PrintWriter out, int codePoint ) {
164 | out.print( "" + codePoint );
165 | out.print( ", " );
166 | out.print( String.format("%X", codePoint) );
167 | out.print( ": " );
168 | if ( codePoint >= 32 ) {
169 | Character c = new Character( (char)codePoint );
170 | if ( ! Character.isSupplementaryCodePoint( codePoint ) ) {
171 | out.print( " c='"+c+"'" );
172 | }
173 | // Extended / Supplemental Unicode
174 | else {
175 | // also StringBuffer appendCodePoint(int cp)
176 | char[] chars = Character.toChars( codePoint );
177 | out.print( " c='" );
178 | for ( char cS : chars ) {
179 | out.print( cS );
180 | }
181 | out.print( "'" );
182 | }
183 | }
184 | boolean isDef = Character.isDefined( codePoint );
185 | out.print( " isDef="+isDef );
186 | boolean isValid = Character.isValidCodePoint( codePoint );
187 | out.print( " isValid="+isValid );
188 | boolean isCtrl = Character.isISOControl( codePoint );
189 | out.print( " isCtrl="+isCtrl );
190 | boolean isBmp = Character.isBmpCodePoint( codePoint );
191 | out.print( " isBmp="+isBmp );
192 | boolean isSupp = Character.isSupplementaryCodePoint( codePoint );
193 | out.print( " isSupp="+isSupp );
194 | boolean isAlpha = Character.isAlphabetic( codePoint );
195 | out.print( " isAlpha="+isAlpha );
196 | boolean isLetter = Character.isLetter( codePoint );
197 | out.print( " isLetter="+isLetter );
198 | boolean isDigit = Character.isDigit( codePoint );
199 | out.print( " isDigit="+isDigit );
200 | int type = Character.getType( codePoint );
201 | String typeStr = "" + type;
202 | if ( TYPES.containsKey(type) ) {
203 | typeStr += " " + TYPES.get(type);
204 | }
205 | else {
206 | typeStr += " (no-TYPES-entry)";
207 | }
208 | out.print( " type="+typeStr );
209 | String block = null;
210 | String script = null;
211 | try {
212 | block = UnicodeBlock.of( codePoint ).toString();
213 | script = UnicodeScript.of( codePoint ).toString();
214 | }
215 | catch( Exception e ) { }
216 | out.print( " script="+script );
217 | out.print( " block="+block );
218 | String name = Character.getName( codePoint );
219 | out.print( " name="+name );
220 | out.println();
221 | }
222 |
223 | public static String getScriptName_LongForm( int codePoint ) {
224 | String script = "Unknown_Unicode_Script";
225 | try {
226 | script = UnicodeScript.of( codePoint ).toString();
227 | }
228 | catch( Exception e ) { }
229 | return script;
230 | }
231 | public static String getScriptName_ShortForm( int codePoint ) {
232 | String longName = getScriptName_LongForm( codePoint );
233 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
234 | return ALIASES_LONG_TO_SHORT.get(longName);
235 | }
236 | else {
237 | return longName;
238 | }
239 | }
240 | public static String getBlockName_LongForm( int codePoint ) {
241 | String block = "Unknown_Unicode_Block";
242 | try {
243 | block = UnicodeBlock.of( codePoint ).toString();
244 | }
245 | catch( Exception e ) { }
246 | return block;
247 | }
248 | public static String getBlockName_ShortForm( int codePoint ) {
249 | String longName = getBlockName_LongForm( codePoint );
250 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
251 | return ALIASES_LONG_TO_SHORT.get(longName);
252 | }
253 | else {
254 | return longName;
255 | }
256 | }
257 | public static String getTypeName_LongForm( int codePoint ) {
258 | int type = Character.getType( codePoint );
259 | String typeStr = "";
260 | if ( codePoint == QUESTION_MARK_CODEPOINT ) {
261 | typeStr = QUESTION_MARK_NAME;
262 | }
263 | else if ( TYPES.containsKey(type) ) {
264 | typeStr = TYPES.get(type);
265 | }
266 | else {
267 | typeStr = "" + type + "_No_TYPES_Entry";
268 | }
269 | return typeStr;
270 | }
271 | public static String getTypeName_ShortForm( int codePoint ) {
272 | String longName = getTypeName_LongForm( codePoint );
273 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
274 | return ALIASES_LONG_TO_SHORT.get(longName);
275 | }
276 | else {
277 | return longName;
278 | }
279 | }
280 | // returns "script-block-type"
281 | public static String getCompoundClassifier_LongForm( int codePoint ) {
282 | return getScriptName_LongForm(codePoint)
283 | + "-" + getBlockName_LongForm(codePoint)
284 | + "-" + getTypeName_LongForm(codePoint)
285 | ;
286 | }
287 | public static String getCompoundClassifier_ShortForm( int codePoint ) {
288 | String candidate = getScriptName_ShortForm(codePoint)
289 | + "-" + getBlockName_ShortForm(codePoint)
290 | + "-" + getTypeName_ShortForm(codePoint)
291 | ;
292 | if ( COMPOUND_ALIASES_LONG_TO_SHORT.containsKey(candidate) ) {
293 | return COMPOUND_ALIASES_LONG_TO_SHORT.get( candidate );
294 | }
295 | else {
296 | return candidate;
297 | }
298 | }
299 |
300 | public static Map<String,Long> classifyString_LongForm( String inStr ) {
301 | return classifyString_LongForm( inStr, null );
302 | }
303 | public static Map<String,Long> classifyString_LongForm( String inStr, Map<String,Long> stats ) {
304 | // Automatically sorts by key-order
305 | if ( null==stats ) {
306 | // In order by key, easier for overall tabulation
307 | stats = new TreeMap<>();
308 | }
309 | if ( null==inStr || inStr.isEmpty() ) {
310 | return stats;
311 | }
312 | // Special looping to allow for Supplementary Unicode Characters (> 65k)
313 | int length = inStr.length();
314 | for (int offset = 0; offset < length; ) {
315 | int codePoint = inStr.codePointAt( offset );
316 | String charKey = getCompoundClassifier_LongForm( codePoint );
317 | // Tabulate
318 | long count = 0L;
319 | if ( stats.containsKey(charKey) ) {
320 | count = stats.get( charKey );
321 | }
322 | count++;
323 | stats.put( charKey, count );
324 | // Advance
325 | offset += Character.charCount( codePoint );
326 | }
327 | return stats;
328 | }
329 | public static Map<String,Long> classifyString_ShortForm( String inStr ) {
330 | return classifyString_ShortForm( inStr, null );
331 | }
332 | // TODO: code very similar to LongForm, combine
333 | public static Map<String,Long> classifyString_ShortForm( String inStr, Map<String,Long> stats ) {
334 | // Automatically sorts by key-order
335 | if ( null==stats ) {
336 | // In order by key, easier for overall tabulation
337 | stats = new TreeMap<>();
338 | }
339 | if ( null==inStr || inStr.isEmpty() ) {
340 | return stats;
341 | }
342 | // Special looping to allow for Supplementary Unicode Characters (> 65k)
343 | int length = inStr.length();
344 | for (int offset = 0; offset < length; ) {
345 | int codePoint = inStr.codePointAt( offset );
346 | String charKey = getCompoundClassifier_ShortForm( codePoint );
347 | // Tabulate
348 | long count = 0L;
349 | if ( stats.containsKey(charKey) ) {
350 | count = stats.get( charKey );
351 | }
352 | count++;
353 | stats.put( charKey, count );
354 | // Advance
355 | offset += Character.charCount( codePoint );
356 | }
357 | return stats;
358 | }
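// Worked example (added for illustration): with the tables above,
// classifyString_ShortForm( "Hello!" ) should yield roughly { OtherPunct=1, UPPER=1, lower=4 },
// since 'H' maps to Lat-Basic-UPPER, 'e','l','l','o' map to Lat-Basic-lower, and '!' maps to
// Com-Basic-OtherP, which the compound-alias table above shortens.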
359 |
360 | public static void main( String [] argv ) {
361 | // U+306E, dec:12398
362 | System.out.println( "Japanese \"no\": '\u306e'" );
363 | // U+4e00 19968, U+4e8c 20108, U+4e09 19977
364 | System.out.println( "Chinese 1 2 3: '\u4e00\u4e8c\u4e09'" );
365 | // U+1D11E, dec:119070
366 | System.out.println( "Extended: Musical G-clef: '\uD834\uDD1E'" );
367 | // U+1F37A, dec:127866
368 | System.out.println( "Extended: Beer Mug: '\uD83C\uDF7A'" );
369 |
370 | // String report = generateReportForRange( 0, 255 );
371 | String report = generateReportForPoints( 12398, 19968, 20108, 19977, 119070, 127866 );
372 | System.out.print( report );
373 | }
374 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.lang.reflect.Field;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.lang.reflect.Method;
6 | import java.util.LinkedHashMap;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 |
10 | public class CmdLineLauncher {
11 | // TODO: currently using static init but
12 | // refactoring would require that all classes use lightweight null constructor
13 | // static final Map<String,Class<? extends HasDescription>> CLASSES = new LinkedHashMap<String,Class<? extends HasDescription>>()
14 | static final Map<String,Class<?>> CLASSES = new LinkedHashMap<String,Class<?>>()
15 | {{
16 | put( "empty_fields", com.lucidworks.dq.data.EmptyFieldStats.class );
17 | put( "term_stats", com.lucidworks.dq.data.TermStats.class );
18 | put( "code_points", com.lucidworks.dq.data.TermCodepointStats.class );
19 | put( "date_checker", com.lucidworks.dq.data.DateChecker.class );
20 | put( "diff_empty_fields", com.lucidworks.dq.diff.DiffEmptyFieldStats.class );
21 | put( "diff_ids", com.lucidworks.dq.diff.DiffIds.class );
22 | put( "diff_schema", com.lucidworks.dq.diff.DiffSchema.class );
23 | put( "diff_config", com.lucidworks.dq.diff.DiffSolrConfig.class );
24 | put( "doc_count", com.lucidworks.dq.data.DocCount.class );
25 | put( "dump_ids", com.lucidworks.dq.data.DumpIds.class );
26 | put( "delete_by_ids", com.lucidworks.dq.data.DeleteByIds.class );
27 | put( "solr_to_solr", com.lucidworks.dq.data.SolrToSolr.class );
28 | put( "solr_to_csv", com.lucidworks.dq.data.SolrToCsv.class );
29 | put( "hash_and_shard", com.lucidworks.dq.util.HashAndShard.class );
30 | }};
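// Descriptive note (added): running this launcher with no arguments prints the table above,
// one "name: description" line per command; with a command name as the first argument it
// reflectively invokes that class's main() with the remaining arguments.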
31 | public static void main( String[] argv ) {
32 | if( argv.length < 1 ) {
33 | System.out.println( "Pass a command name on the command line to see help for that class:" );
34 | // for( Entry<String,Class<? extends HasDescription>> entry : CLASSES.entrySet() )
35 | for( Entry<String,Class<?>> entry : CLASSES.entrySet() )
36 | {
37 | String cmdName = entry.getKey();
38 | // Class<? extends HasDescription> clazz = entry.getValue();
39 | Class<?> clazz = entry.getValue();
40 |
41 | String desc = null;
42 | try {
43 | Method descMeth = clazz.getMethod( "getShortDescription" );
44 | desc = (String) descMeth.invoke( null, (Object[]) null );
45 | // Field f = clazz.getDeclaredField( "HELP_WHAT_IS_IT" );
46 | // desc = (String) f.get(null);
47 | } catch (SecurityException | IllegalArgumentException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
48 | // TODO Auto-generated catch block
49 | e.printStackTrace();
50 | }
51 |
52 | // System.out.println( cmdName + ": " + desc );
53 | System.out.printf( "%20s: %s\n", cmdName, desc );
54 | }
55 | }
56 | // Has a command name
57 | else {
58 | String cmdName = argv[ 0 ];
59 | if ( CLASSES.containsKey(cmdName) ) {
60 | // Copy over all but the first arg
61 | String [] argv2 = new String[ argv.length - 1 ];
62 | for ( int i=1; i<argv.length; i++ ) {
63 | argv2[ i - 1 ] = argv[ i ];
64 | }
65 | Class<?> clazz = CLASSES.get(cmdName);
66 | try {
67 | Method main = clazz.getMethod( "main", String[].class );
68 | // main.invoke( null, argv2 );
69 | // main.invoke( null, (Object[]) argv2 );
70 | main.invoke( null, (Object) argv2 );
71 | } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
72 | // TODO Auto-generated catch block
73 | e.printStackTrace();
74 | System.exit(2);
75 | }
76 | }
77 | else {
78 | System.err.println( "Command \"" + cmdName + "\" not found in " + CLASSES.keySet() );
79 | System.exit(2);
80 | }
81 | }
82 | }
83 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/DateUtils.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.text.DateFormat;
4 | import java.text.ParseException;
5 | import java.text.SimpleDateFormat;
6 | import java.util.ArrayList;
7 | import java.util.Collection;
8 | import java.util.Date;
9 | import java.util.List;
10 | import java.util.TimeZone;
11 |
12 | public class DateUtils {
13 |
14 | public static final String JAVA_FORMAT = "EEE MMM dd HH:mm:ss z yyyy";
15 | public static final String ZULU_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
16 | // public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.S";
17 | public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.SSS";
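// Examples of the formats above (added for illustration):
//   JAVA_FORMAT:        "Wed Mar 05 14:30:00 PST 2014"
//   ZULU_FORMAT:        "2014-03-05T22:30:00Z"  (the same instant, rendered in GMT)
//   COMPACT_LOG_FORMAT: "2014-03-05_14:30:00.123"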
18 |
19 | public static String getLocalTimestamp( Date inDate ) {
20 | DateFormat compactFormatter = new SimpleDateFormat( COMPACT_LOG_FORMAT );
21 | // NOT setting timezone
22 | return compactFormatter.format( inDate );
23 | }
24 | public static String getLocalTimestamp() {
25 | return getLocalTimestamp( new Date() );
26 | }
27 | public static String getLocalTimestamp( long ms ) {
28 | return getLocalTimestamp( new Date(ms) );
29 | }
30 | public static String javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException {
31 | java.util.Date dateObj = javaDefault2Date_str2date( inDate );
32 | String outDateStr = date2SolrXmlZulu_date2str( dateObj );
33 | return outDateStr;
34 | }
35 | public static String solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException {
36 | java.util.Date dateObj = solrXmlZulu2Date_str2date( inDate );
37 | String outDateStr = date2JavaDefault_date2str( dateObj );
38 | return outDateStr;
39 | }
40 | public static String _javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException {
41 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
42 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
43 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
44 | java.util.Date tmpDate = javaFormatter.parse( inDate );
45 | String outDate = zuluFormatter.format( tmpDate );
46 | return outDate;
47 | }
48 | public static String _solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException {
49 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
50 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
51 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
52 | java.util.Date tmpDate = zuluFormatter.parse( inDate );
53 | String outDate = javaFormatter.format( tmpDate );
54 | return outDate;
55 | }
56 |
57 | public static String date2SolrXmlZulu_date2str( java.util.Date inDate ) throws ParseException {
58 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
59 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
60 | String outDate = zuluFormatter.format( inDate );
61 | return outDate;
62 | }
63 | public static String date2JavaDefault_date2str( java.util.Date inDate ) throws ParseException {
64 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
65 | String outDate = javaFormatter.format( inDate );
66 | return outDate;
67 | }
68 |
69 | public static java.util.Date javaDefault2Date_str2date( String inDate ) throws ParseException {
70 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
71 | java.util.Date outDate = javaFormatter.parse( inDate );
72 | return outDate;
73 | }
74 | public static java.util.Date solrXmlZulu2Date_str2date( String inDate ) throws ParseException {
75 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
76 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
77 | java.util.Date outDate = zuluFormatter.parse( inDate );
78 | return outDate;
79 | }
80 |
81 | public static List<Double> dates2Doubles( Collection<Date> dates ) {
82 | List<Double> out = new ArrayList<>();
83 | for ( Date d : dates ) {
84 | out.add( new Double( d.getTime() ) );
85 | }
86 | return out;
87 | }
88 | public static Double date2Double( Date d ) {
89 | return new Double( d.getTime() ).doubleValue();
90 | }
91 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/HasDescription.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | // TODO: future... Refactor and add lightweight null constructors
4 | // see also util.CmdLineLauncher
5 | public interface HasDescription {
6 | String getShortDescription();
7 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/HashAndShard.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.apache.solr.common.cloud.DocRouter.Range;
7 | import org.apache.solr.common.util.Hash;
8 |
9 | public class HashAndShard {
10 |
11 | // Should correspond to:
12 | // http://localhost:8983/solr/collection1/select?q=*&fl=*,[shard]
13 |
14 | static String HELP_WHAT_IS_IT = "Calculate hash and shard for a document ID";
15 | static String HELP_USAGE = "HashAndShard docId [numberOfShards [-q]] # shards can be decimal, hex, octal, etc";
16 | public static String getShortDescription() {
17 | return HELP_WHAT_IS_IT;
18 | }
19 |
20 |
21 | /* From:
22 | * solr-lucene-490-src/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java
23 | */
24 | private static int bits = 16;
25 | static List<Range> partitionRange( int partitions ) {
26 | int min = Integer.MIN_VALUE; // -2^31 = -2147483648 = -2,147,483,648
27 | int max = Integer.MAX_VALUE; // 2^31-1 = 2147483647 = 2,147,483,647
28 |
29 | // assert max >= min;
30 | // if (partitions == 0) return Collections.EMPTY_LIST;
31 | long rangeSize = (long) max - (long) min;
32 | long rangeStep = Math.max(1, rangeSize / partitions);
33 |
34 | List<Range> ranges = new ArrayList<>(partitions);
35 |
36 | long start = min;
37 | long end = start;
38 |
39 | // keep track of the idealized target to avoid accumulating rounding errors
40 | long targetStart = min;
41 | long targetEnd = targetStart;
42 |
43 | // Round to avoid splitting hash domains across ranges if such rounding is not significant.
44 | // With default bits==16, one would need to create more than 4000 shards before this
45 | // becomes false by default.
46 | int mask = 0x0000ffff;
47 | boolean round = rangeStep >= (1 << bits) * 16;
48 |
49 | while (end < max) {
50 | targetEnd = targetStart + rangeStep;
51 | end = targetEnd;
52 |
53 | if (round && ((end & mask) != mask)) {
54 | // round up or down?
55 | int increment = 1 << bits; // 0x00010000
56 | long roundDown = (end | mask) - increment;
57 | long roundUp = (end | mask) + increment;
58 | if (end - roundDown < roundUp - end && roundDown > start) {
59 | end = roundDown;
60 | } else {
61 | end = roundUp;
62 | }
63 | }
64 |
65 | // make last range always end exactly on MAX_VALUE
66 | if (ranges.size() == partitions - 1) {
67 | end = max;
68 | }
69 | ranges.add(new Range((int) start, (int) end));
70 | start = end + 1L;
71 | targetStart = targetEnd + 1L;
72 | }
73 |
74 | return ranges;
75 | }
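// Worked example (computed from the loop above, which mirrors CompositeIdRouter):
// partitionRange(2) yields two ranges, 0x80000000..0xffffffff and 0x00000000..0x7fffffff,
// i.e. the signed 32-bit hash space split at zero; findShardForHash() below then reports
// shard 1 for negative hashes and shard 2 for non-negative ones.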
76 |
77 | static void printRanges( List<Range> ranges, Integer hash ) {
78 | int shardCounter = 0;
79 | for ( Range r : ranges ) {
80 | shardCounter++;
81 | System.out.println( "Shard # " + shardCounter );
82 | System.out.println( "\tRange: "
83 | + String.format("0x%8s", Integer.toHexString(r.min)).replace(' ', '0')
84 | + " to "
85 | + String.format("0x%8s", Integer.toHexString(r.max)).replace(' ', '0')
86 | );
87 | if ( null!=hash ) {
88 | if ( hash >= r.min && hash <= r.max ) {
89 | System.out.println( "\tcontains "
90 | + String.format("0x%8s", Integer.toHexString(hash)).replace(' ', '0')
91 | );
92 | }
93 | }
94 | }
95 | }
96 | static int findShardForHash( List<Range> ranges, Integer hash ) {
97 | int shardCounter = 0;
98 | for ( Range r : ranges ) {
99 | shardCounter++;
100 | if ( hash >= r.min && hash <= r.max ) {
101 | return shardCounter;
102 | }
103 | }
104 | return -1;
105 | }
106 |
107 | public static void main(String[] args) {
108 | if ( args.length < 1 || args.length > 3 ) {
109 | System.err.println( "Error: syntax: " + HELP_USAGE );
110 | System.exit(1);
111 | }
112 | String docId = args[0];
113 | if ( docId.length() < 1 ) {
114 | System.err.println( "Error: empty docId" );
115 | System.exit(2);
116 | }
117 | String numShardsStr = args.length >= 2 ? args[1] : null;
118 | String quietStr = args.length >= 3 ? args[2] : null;
119 | boolean quiet = null!=quietStr && quietStr.equalsIgnoreCase("-q");
120 |
121 | int signedHash = Hash.murmurhash3_x86_32( docId, 0, docId.length(), 0 );
122 | long unsignedHash = signedHash & 0x00000000ffffffffL;
123 | if ( ! quiet ) {
124 | System.out.println( "docId: \"" + docId + '"' );
125 | System.out.println( "32-bit Hash (signed decimal int): " + signedHash );
126 | System.out.println( "32-bit Hash (unsigned dec int): " + unsignedHash );
127 | System.out.println( "32-bit Hash (hex): " + String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') );
128 | System.out.println( "32-bit Hash (binary): " + String.format("%32s", Integer.toBinaryString(signedHash)).replace(' ', '0') );
129 | }
130 | else {
131 | System.out.print( docId + " " );
132 | System.out.print( String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') );
133 | }
134 |
135 | if ( null != numShardsStr ) {
136 | Integer numShards = null;
137 | try {
138 | numShards = Integer.decode( numShardsStr );
139 | }
140 | catch( NumberFormatException e ) {
141 | System.err.println( "Error parsing numberOfShards: " + e );
142 | System.exit(3);
143 | }
144 | if ( numShards <= 0 ) {
145 | System.err.println( "Error: numberOfShards must be > 0; got " + numShards );
146 | System.exit(4);
147 | }
148 | // WRONG!
149 | // long shardNumber = (unsignedHash % numShards) + 1;
150 | // System.out.println( "Route to Shard (base-ONE): " + shardNumber );
151 |
152 | List<Range> ranges = partitionRange( numShards );
153 |
154 | if ( ! quiet ) {
155 | System.out.println( "Number of Shards: " + numShards );
156 |
157 | printRanges( ranges, signedHash );
158 | }
159 | else {
160 | int targetShard = findShardForHash( ranges, signedHash );
161 | System.out.print( " " + targetShard );
162 | }
163 | }
164 | if ( quiet ) {
165 | System.out.println();
166 | }
167 |
168 | }
169 |
170 | }
171 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/IO_Utils.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.URI;
6 | import java.net.URISyntaxException;
7 | import java.nio.file.CopyOption;
8 | import java.nio.file.FileSystem;
9 | import java.nio.file.FileSystems;
10 | import java.nio.file.FileVisitResult;
11 | import java.nio.file.Files;
12 | import java.nio.file.Path;
13 | import java.nio.file.Paths;
14 | import java.nio.file.SimpleFileVisitor;
15 | import java.nio.file.StandardCopyOption;
16 | import java.nio.file.attribute.BasicFileAttributes;
17 | import java.util.Collections;
18 |
19 |
20 | public class IO_Utils {
21 |
22 | public static File materializeSolrHomeIntoTemp() throws IOException, URISyntaxException {
23 | String prefix = "solr_dq_utils_";
24 | String topName = "solr_home";
25 | //String magicName = "configsets";
26 | Path baseTempDir = Files.createTempDirectory( prefix );
27 | // File destinationDir = new File( baseTempDir.toFile(), magicName );
28 | File destinationDir = new File( baseTempDir.toFile(), topName );
29 | if ( ! destinationDir.mkdirs() ) {
30 | throw new IOException( "Unable to create path \"" + destinationDir + "\"" );
31 | }
32 | // System.out.println( "Created \"" + destinationDir + "\"" );
33 | IO_Utils iou = new IO_Utils();
34 |
35 | //String sourcePathWithinJar = "/";
36 | // ^-- gets all classes from every combined jar
37 |
38 | //String sourcePathWithinJar = "configsets";
39 | // ^-- Exception in thread "main" java.lang.IllegalArgumentException, no details
40 |
41 | // String sourcePathWithinJar = "/" + magicName;
42 | String sourcePathWithinJar = "/" + topName;
43 |
44 | // String destinationPathInFilesystem = "/Users/mbennett/tmp_test_copy";
45 | // ^-- Doesn't create spanning .../configsets/... dir, just subdirectories of it
46 |
47 | // iou.copyFromJar( sourcePathWithinJar, Paths.get(destinationPathInFilesystem) );
48 | iou.copyFromJar( sourcePathWithinJar, Paths.get(destinationDir.toString()) );
49 | return destinationDir;
50 | }
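// Illustrative result (the exact temp path varies by OS and run): the returned directory looks like
//   <java.io.tmpdir>/solr_dq_utils_<random>/solr_home
// populated with whatever "/solr_home" resources are packaged on the classpath.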
51 |
52 | // Parts taken from:
53 | // * http://stackoverflow.com/a/24316335/295802
54 | // * http://codingjunkie.net/java-7-copy-move/
55 | // Usage: copyFromJar("/path/to/the/template/in/jar", Paths.get("/tmp/from-jar"))
56 | public void copyFromJar(String source, final Path target) throws URISyntaxException, IOException {
57 | System.out.println( "source str = \"" + source + "\"" );
58 |
59 |
60 | // getClass is defined in Object
61 | URI resource = getClass().getResource("").toURI();
62 |
63 | // ... ? FileSystems.newFileSystem(...)
64 | // ^-- java.lang.IllegalArgumentException: Path component should be '/'
65 | // at least when run in Eclipse (non .jar packaging)
66 | //URI resource = getClass().getResource("/").toURI();
67 |
68 | System.out.println( "URI Resource = \"" + resource + "\"" );
69 | // ^-- Interactive: "file:/Users/mbennett/data/dev/DQ/data-quality-github/target/classes/"
70 | // ^-- Run Uberjar: "jar:file:/Users/mbennett/data/dev/DQ/data-quality-github/target/data-quality-java-1.0-SNAPSHOT.jar!/com/lucidworks/dq/util/"
71 |
72 | // jar:file: - Running from packaged jar
73 | if ( resource.toString().startsWith("jar:file:" ) ) {
74 | FileSystem fileSystem = FileSystems.newFileSystem(
75 | resource,
76 | Collections.<String, Object>emptyMap()
77 | );
78 |
79 | final Path jarPath = fileSystem.getPath(source);
80 |
81 | // Recursive copy
82 | // TODO: looks similar to other recursive copy below, maybe combine
83 | Files.walkFileTree(jarPath, new SimpleFileVisitor<Path>() {
84 |
85 | private Path currentTarget;
86 |
87 | @Override
88 | public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
89 | currentTarget = target.resolve(jarPath.relativize(dir).toString());
90 | Files.createDirectories(currentTarget);
91 | return FileVisitResult.CONTINUE;
92 | }
93 |
94 | @Override
95 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
96 | //System.out.println( "Copying \"" + file.toString() + "\" ..." );
97 | Files.copy(file, target.resolve(jarPath.relativize(file).toString()), StandardCopyOption.REPLACE_EXISTING);
98 | return FileVisitResult.CONTINUE;
99 | }
100 |
101 | });
102 |
103 | }
104 | // file: - Running from Eclipse or other non-packaged runner
105 | else if ( resource.toString().startsWith("file:" ) ) {
106 | // Our resource is relative root level, not this specific package
107 | URI resource2 = getClass().getResource("/").toURI();
108 | File base = new File( resource2.getPath() );
109 | File srcDir = new File( base, source );
110 | final Path fromPath = srcDir.toPath();
111 | final Path toPath = target;
112 |
113 | // Recursive copy
114 | // TODO: looks similar to other recursive copy above, maybe combine
115 | Files.walkFileTree(fromPath, new SimpleFileVisitor<Path>() {
116 |
117 | @Override
118 | public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
119 | Path targetPath = toPath.resolve(fromPath.relativize(dir));
120 | if ( ! Files.exists(targetPath) ){
121 | Files.createDirectory(targetPath);
122 | }
123 | return FileVisitResult.CONTINUE;
124 | }
125 |
126 | @Override
127 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
128 | Files.copy(file, toPath.resolve(fromPath.relativize(file)), StandardCopyOption.REPLACE_EXISTING);
129 | return FileVisitResult.CONTINUE;
130 | }
131 |
132 | });
133 |
134 |
135 | /***
136 | // TODO: recursive copy from filesystem
137 | // Files.copy( new File(source).toPath(), target, StandardCopyOption.REPLACE_EXISTING );
138 | // ^-- No, only has "/solr_home"
139 | // and "resource" is too far down:
140 | // Gives: /Users/mbennett/data/dev/DQ/data-quality-github/target/classes/com/lucidworks/dq/util
141 | // Need: /Users/mbennett/data/dev/DQ/data-quality-github/target/classes/solr_home
142 | URI resource2 = getClass().getResource("/").toURI();
143 | // gives! file:/Users/mbennett/data/dev/DQ/data-quality-github/target/classes/
144 | System.out.println( "URI Resource2 = \"" + resource2 + "\"" );
145 | File base = new File( resource2.getPath() );
146 | File srcDir = new File( base, source );
147 | Path srcPath = srcDir.toPath();
148 | System.out.println( "srcPath = \"" + srcPath + "\"" );
149 | System.out.println( "target = \"" + target + "\"" );
150 | //Files.copy( srcPath, target, StandardCopyOption.REPLACE_EXISTING );
151 |
152 | // EnumSet opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
153 | // TreeCopier tc = new TreeCopier(source[i], dest, prompt, preserve);
154 | // Files.walkFileTree(source[i], opts, Integer.MAX_VALUE, tc);
155 |
156 | ***/
157 |
158 |
159 | }
160 | else {
161 | throw new IllegalArgumentException( "Don't know how to handle " + resource );
162 | }
163 |
164 | }
165 |
166 |
167 |
168 |
169 | public static void main(String[] args) throws URISyntaxException, IOException {
170 | //File configSetsDir = materializeConfigsetsInTemp();
171 | File configSetsDir = materializeSolrHomeIntoTemp();
172 | System.out.println( "ConfigSets = " + configSetsDir );
173 |
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/LLR.java-new:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 | import java.util.Collection;
6 | import java.util.LinkedHashMap;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.Set;
10 | import java.util.TreeMap;
11 | import java.util.TreeSet;
12 |
13 | import org.apache.solr.client.solrj.SolrServerException;
14 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
15 |
16 | public class LLR {
17 |
18 | Map<String,Long> wordsA;
19 | Map<String,Long> wordsB;
20 | // TODO: consider this, BUT threshold for A OR B, or A AND B ?
21 | // long minWordsThreshold = 0L;
22 |
23 | // Column Totals
24 | double sumA = 0.0;
25 | double sumB = 0.0;
26 | // K Total
27 | double grandTotal;
28 | // Row Totals
29 | Map<String,Double> rowTotals = new LinkedHashMap<>();
30 |
31 | // Set<String> allWordsAboveThreshold = new TreeSet<>();
32 | Set<String> allWords = new TreeSet<>();
33 |
34 | Map<String,Double> scoresByWord = new TreeMap<>();
35 | Map<String,Double> sortedScoresByWord = new TreeMap<>();
36 |
37 | // Performance Stat
38 | long plogp_counter = 0L;
39 |
40 | public LLR( Map<String,Long> wordsA, Map<String,Long> wordsB /*, Long optThreshold*/ ) {
41 | this.wordsA = wordsA;
42 | this.wordsB = wordsB;
43 | //if ( null!=optThreshold && optThreshold.longValue() > 0L ) {
44 | // this.minWordsThreshold = optThreshold.longValue();
45 | //}
46 | doInitialCalculations();
47 | calcAllWords();
48 | sortWords();
49 | }
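// Illustrative use of the constructor above (hypothetical word counts; mirrors the commented-out
// samples in main() further down):
//   Map<String,Long> a = new LinkedHashMap<>();  a.put( "apples", 25L );  a.put( "bananas", 30L );
//   Map<String,Long> b = new LinkedHashMap<>();  b.put( "apples", 20L );  b.put( "bananas", 35L );
//   System.out.print( new LLR( a, b ).generateReport( "A -> B" ) );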
50 |
51 | public void doInitialCalculations() {
52 |
53 | // Column Totals
54 | // -------------
55 | // sumA = sumWithThreshold( wordsA.values() );
56 | // sumB = sumWithThreshold( wordsB.values() );
57 | sumA = new Double( StatsUtils.sumList_Longs(wordsA.values()) ).doubleValue();
58 | sumB = new Double( StatsUtils.sumList_Longs(wordsB.values()) ).doubleValue();
59 | if ( sumA<=0.0 || sumB<=0.0 ) {
60 | throw new IllegalArgumentException( "Must have non-zero word counts: A=" + sumA + ", B=" + sumB );
61 | }
62 |
63 | // K Total
64 | grandTotal = sumA + sumB;
65 |
66 | // Row Totals
67 | // ----------
68 | allWords.addAll( wordsA.keySet() );
69 | allWords.addAll( wordsB.keySet() );
70 | for ( String word : allWords ) {
71 | Long countA = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
72 | Long countB = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
73 | rowTotals.put( word, new Double(countA + countB) );
74 | }
75 |
76 | }
77 |
78 | public void calcAllWords() {
79 | for ( String word : allWords ) {
80 | // double g2 = calcG2_viaDunning( word );
81 | double g2 = calcG2_viaTraditional( word );
82 | scoresByWord.put( word, g2 );
83 | }
84 | }
85 |
86 |
87 | // TODO: G2 is the same as -2 log lambda ?
88 | // http://scg.unibe.ch/archive/papers/Kuhn09aLogLikelihoodRatio.pdf
89 | // Before Sign:
90 | // food: 0.0
91 | // bananas: 0.46192170199964266
92 | // apples: 0.6291706616789554
93 | // carrots: 60.03320678316349
94 | // candy: 60.03320678316351
95 | // After Sign:
96 | // candy: -60.03320678316351
97 | // bananas: -0.46192170199964266
98 | // food: 0.0
99 | // apples: 0.6291706616789554
100 | // carrots: 60.03320678316349
101 | double calcG2_viaTraditional( String word ) {
102 | boolean debug = false;
103 | if(debug) System.out.println( "\n=== Calculating G2 via Traditional formula for \"" + word + "\" ===" );
104 | // Simple terms
105 | double k1 = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
106 | double k2 = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
107 | double n1 = sumA;
108 | double n2 = sumB;
109 | double p1 = k1 / n1;
110 | double p2 = k2 / n2;
111 | if(debug) System.out.println( "Corpus A: k1, n1, p1: " + k1 + ", " + n1 + ", " + p1 );
112 | if(debug) System.out.println( "Corpus B: k2, n2, p2: " + k2 + ", " + n2 + ", " + p2 );
113 | double p = (k1 + k2) / (n1 + n2); // rowCount / grandTotal
114 | if(debug) System.out.println( "Combined: k1+2, n1+2, p1+2: " + (k1+k2) + ", " + (n1+n2) + ", " + p );
115 | // Factors
116 | double factorA = Math.log( L(p1,k1,n1) );
117 | double factorB = Math.log( L(p2,k2,n2) );
118 | double factorC = Math.log( L(p,k1,n1) );
119 | double factorD = Math.log( L(p,k2,n2) );
120 | double sign = sign( p1, p2 );
121 | // Result
122 | double out = sign * 2.0 * ( factorA + factorB - factorC - factorD );
123 | if(debug) System.out.println( "out = +/-sign * 2.0 * ( factorA + factorB - factorC - factorD )" );
124 | if(debug) System.out.println( "Sign and Factors A, B, C, D: " + sign + ", " + factorA + ", " + factorB + ", " + factorC + ", " + factorD );
125 | if(debug) System.out.println( "out = " + out );
126 | return out;
127 | }
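// In conventional notation the method above computes the signed log-likelihood ratio statistic
//   G2 = sign(p1 - p2) * 2 * [ ln L(p1,k1,n1) + ln L(p2,k2,n2) - ln L(p,k1,n1) - ln L(p,k2,n2) ]
// with L(p,k,n) = p^k * (1-p)^(n-k) as implemented by L() below; this is -2 ln(lambda) for the
// two-binomial likelihood-ratio test, apart from the sign that records which corpus is heavier.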
128 |
129 | // TODO: this is Binomial Likelihood ?
130 | // k = word count
131 | // n = total words in corpus (non-unique)
132 | // p = k/n, BUT might use different k and n
133 | static double L( double p, double k, double n ) {
134 | double part1 = Math.pow( p, k );
135 | double part2 = Math.pow( (1.0-p), (n-k) );
136 | return part1 * part2;
137 | }
138 |
139 | // TODO: confirm meaning of +/-
140 | // plus = heavier in first collection
141 | // minus = heavier in second collection
142 | static double sign( double p1, double p2 ) {
143 | if ( p1 - p2 >= 0.0 ) {
144 | return 1.0;
145 | }
146 | else {
147 | return -1.0;
148 | }
149 | }
150 |
151 | // Each word is done individually, across both collections
152 | // food: 1.7319479184152442E-13
153 | // bananas: 0.4619217019995059
154 | // apples: 0.6291706616789394
155 | // candy: 60.03320678316341
156 | // carrots: 60.03320678316341
157 | double calcG2_viaDunning( String word ) {
158 | boolean debug = false;
159 | if(debug) System.out.println( "\n=== Calculating G2 via Dunning Entropy formula for \"" + word + "\" ===" );
160 | // Calc H_rowSums
161 | // ---------------
162 | double row1Total = rowTotals.get(word);
163 | double row2Total = grandTotal - row1Total;
164 | if(debug) System.out.println( "Row Totals: " + row1Total + " " + row2Total );
165 | // plnp = probability * log (probability), log = natural log
166 | double plogpRow1 = 0.0;
167 | if ( row1Total > 0.0 ) {
168 | double prob = row1Total / grandTotal;
169 | plogpRow1 = prob * Math.log(prob);
170 | plogp_counter++;
171 | }
172 | double plogpRow2 = 0.0;
173 | if ( row2Total > 0.0 ) {
174 | double prob = row2Total / grandTotal;
175 | plogpRow2 = prob * Math.log(prob);
176 | plogp_counter++;
177 | }
178 | double H_rowSums = -1.0 * ( plogpRow1 + plogpRow2 );
179 | if(debug) System.out.println( "Row plogp 1 & 2 and H_rowSums: " + plogpRow1 + " " + plogpRow2 + " " + H_rowSums );
180 |
181 | // Calc H_colSums
182 | // --------------
183 | // We checked column sums earlier
184 | double probCol1 = sumA / grandTotal;
185 | double plogpCol1 = probCol1 * Math.log( probCol1 );
186 | plogp_counter++;
187 | double probCol2 = sumB / grandTotal;
188 | double plogpCol2 = probCol2 * Math.log( probCol2 );
189 | plogp_counter++;
190 | double H_colSums = -1.0 * ( plogpCol1 + plogpCol2 );
191 | if(debug) System.out.println( "Column plogp 1 & 2 and H_colSums: " + plogpCol1 + " " + plogpCol2 + " " + H_colSums );
192 |
193 | // Calc H_k
194 | // -----------
195 | // column 1 counts
196 | double k_11 = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
197 | double k_21 = sumA - k_11; // all other counts
198 | // column 2 counts
199 | double k_12 = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
200 | double k_22 = sumB - k_12; // all other counts
201 | if(debug) System.out.println( "K counts:\n\t" + k_11 + " " + k_12 + "\n\t" + k_21 + " " + k_22 );
202 | // probabilities
203 | double prob_11 = k_11 / grandTotal;
204 | double prob_21 = k_21 / grandTotal;
205 | double prob_12 = k_12 / grandTotal;
206 | double prob_22 = k_22 / grandTotal;
207 | // p log( p )
208 | // method has its own counter
209 | double plogp_11 = plogp( prob_11 );
210 | double plogp_21 = plogp( prob_21 );
211 | double plogp_12 = plogp( prob_12 );
212 | double plogp_22 = plogp( prob_22 );
213 | // finally H_k
214 | double H_k = -1.0 * ( plogp_11 + plogp_21 + plogp_12 + plogp_22 );
215 | if(debug) System.out.println( "K plogp:\n\t" + plogp_11 + " " + plogp_12 + "\n\t" + plogp_21 + " " + plogp_22 );
216 | if(debug) System.out.println( "H_k = " + H_k );
217 |
218 | // Dunning's formula
219 | // http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
220 | // double G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums );
221 | // if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums )" );
222 | // if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_k + " - " + H_rowSums + " - " + H_colSums + " )" );
223 |
224 | // Revised, see http://math.stackexchange.com/questions/693114/wrong-result-from-llr-using-dunning-entropy-method
225 | double G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k );
226 | if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k )" );
227 | if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_rowSums + " + " + H_colSums + " - " + H_k + " )" );
228 |
229 | return G2;
230 | }
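// Equivalently: with H defined as -sum(p * ln p), G2 = 2 * N * ( H(rowSums) + H(colSums) - H(cells) )
// is 2 * N times the mutual information of the 2x2 word/corpus contingency table, which is why the
// entropy form here and calcG2_viaTraditional() agree (the sample outputs in the comments above
// differ only by the added sign and floating-point noise).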
231 |
232 | // Calculates p * log( p )
233 | // natural log
234 | // but returns 0.0 if p is 0
235 | // TODO: maybe some implementations just add 1 to all counts?
236 | double plogp( double prob ) {
237 | if ( prob > 0.0 ) {
238 | plogp_counter++;
239 | return prob * Math.log( prob );
240 | }
241 | else {
242 | return 0.0;
243 | }
244 | }
245 |
246 | void sortWords() {
247 | // Map scoresByWord = new TreeMap<>();
248 | // Map sortedScoresByWord = new TreeMap<>();
249 | sortedScoresByWord = SetUtils.sortMapByValues( scoresByWord );
250 | }
251 |
252 | // double pLogP_KOverallWordA( String word ) {
253 | // double prob = probKOverallWordA( word );
254 | // if ( prob > 0.0 ) {
255 | // return prob * Math.log( prob );
256 | // }
257 | // else {
258 | // return 0.0;
259 | // }
260 | // }
261 | // double pLogP_KOverallWordB( String word ) {
262 | // double prob = probKOverallWordB( word );
263 | // if ( prob > 0.0 ) {
264 | // return prob * Math.log( prob );
265 | // }
266 | // else {
267 | // return 0.0;
268 | // }
269 | // }
270 | // double probKOverallWordA( String word ) {
271 | // return probKOverallWord( word, wordsA );
272 | // }
273 | // double probKOverallWordB( String word ) {
274 | // return probKOverallWord( word, wordsB );
275 | // }
276 | // double probKOverallWord( String word, Map countMap ) {
277 | // long count = countMap.containsKey(word) ? countMap.get(word) : 0L;
278 | // double prob = (double) count / grandTotal;
279 | // return prob;
280 | // }
281 |
282 | // double sumWithThreshold( Collection counts ) {
283 | // double out = 0.0;
284 | // for ( Long c : counts ) {
285 | // if ( c >= minWordsThreshold ) {
286 | // out += c;
287 | // }
288 | // }
289 | // return out;
290 | // }
291 |
292 | public String generateReport( String optLabel ) {
293 | StringWriter sw = new StringWriter();
294 | PrintWriter out = new PrintWriter(sw);
295 |
296 | int sampleSize = 5;
297 |
298 | if ( null!=optLabel ) {
299 | out.println( "----------- " + optLabel + " -----------" );
300 | }
301 |
302 | out.println();
303 | out.println( "Corpus A unique / total words: " + wordsA.size() + " / " + sumA );
304 | out.println( "Corpus B unique / total words: " + wordsB.size() + " / " + sumB );
305 | out.println( "Combined unique / total words: " + allWords.size() + " / " + grandTotal );
306 | out.println( "Number of p log(p) calculations: " + plogp_counter );
307 | out.println();
308 |
309 | if ( sortedScoresByWord.size() <= 2 * sampleSize + 1 ) {
310 | addTermsSliceToReport( out, "All Term Changes", sortedScoresByWord );
311 | }
312 | else {
313 | Map<String,Double> firstTerms = SetUtils.mapHead( sortedScoresByWord, sampleSize );
314 | addTermsSliceToReport( out, "Term Changes, first " + sampleSize + " entries", firstTerms );
315 | Map<String,Double> lastTerms = SetUtils.mapTail( sortedScoresByWord, sampleSize );
316 | addTermsSliceToReport( out, "Term Changes, last " + sampleSize + " entries", lastTerms );
317 | }
318 |
319 | String outStr = sw.toString();
320 | return outStr;
321 | }
322 | void addTermsSliceToReport( PrintWriter out, String label, Map<String,Double> terms ) {
323 | out.println( "" + label + ":" );
324 | for ( Entry<String,Double> wordEntry : terms.entrySet() ) {
325 | String word = wordEntry.getKey();
326 | double g2 = wordEntry.getValue();
327 | out.println( "\t" + word + ": " + g2 );
328 | }
329 | }
330 |
331 | public static void main( String[] argv ) throws SolrServerException {
332 | // Map corpusA = new LinkedHashMap() {{
333 | // // 100k docs total
334 | // put( "blog", 25L ); // test word
335 | // put( "computer", 3200L ); // other words
336 | // put( "internet", 96775L ); // other words
337 | // }};
338 | // Map corpusB = new LinkedHashMap() {{
339 | // // 200k docs total
340 | // put( "blog", 2500L ); // test word
341 | // put( "computer", 6000L ); // other words
342 | // put( "internet", 191500L ); // other words
343 | // }};
344 |
345 | // // Example posted online
346 | // Map corpusA = new LinkedHashMap() {{
347 | // // 100k docs total
348 | // put( "spam", 40000L ); // test word
349 | // put( "other words", 60000L ); // other words
350 | // }};
351 | // Map corpusB = new LinkedHashMap() {{
352 | // // 200k docs total
353 | // put( "spam", 120000L ); // test word
354 | // put( "other words", 80000L ); // other words
355 | // }};
356 |
357 | // Map corpusA = new LinkedHashMap() {{
358 | // put( "apples", 25L );
359 | // put( "bananas", 30L );
360 | // put( "carrots", 40L );
361 | // put( "food", 100L );
362 | // }};
363 | // Map corpusB = new LinkedHashMap() {{
364 | // put( "apples", 20L ); // down by 5
365 | // put( "bananas", 35L ); // up by 5
366 | // put( "candy", 40L ); // carrots -> candy!
367 | // put( "food", 100L ); // unchanged, and total unchanged
368 | // }};
369 |
370 |
371 | HttpSolrServer solrA = SolrUtils.getServer( "localhost", 8984 );
372 | HttpSolrServer solrB = SolrUtils.getServer( "localhost", 8985 );
373 | String fieldName = "text";
374 | // Set corpusA = SolrUtils.getTermsForField_ViaTermsRequest( solrA, fieldName );
375 | // Set corpusB = SolrUtils.getTermsForField_ViaTermsRequest( solrB, fieldName );
376 | Map<String,Long> corpusA = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrA, fieldName );
377 | Map<String,Long> corpusB = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrB, fieldName );
378 |
379 | LLR llr = new LLR( corpusA, corpusB );
380 | String report = llr.generateReport( "A -> B" );
381 | System.out.print( report );
382 |
383 | }
384 | }
--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/SetUtils.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collection;
5 | import java.util.Date;
6 | import java.util.Iterator;
7 | import java.util.LinkedHashMap;
8 | import java.util.LinkedHashSet;
9 | import java.util.List;
10 | import java.util.Map;
11 | import java.util.Map.Entry;
12 | import java.util.Set;
13 | import java.util.TreeMap;
14 | import java.util.TreeSet;
15 |
16 | public class SetUtils {
17 |
18 | public static void incrementMapCounter( Map<String, Long> tabulationMap, String key ) {
19 | Long value = 0L;
20 | if ( tabulationMap.containsKey(key) ) {
21 | value = tabulationMap.get(key);
22 | }
23 | value += 1L;
24 | tabulationMap.put( key, value );
25 | }
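// Illustrative use (hypothetical "tokens" collection): tabulating term frequencies into a map, e.g.
//   Map<String, Long> counts = new LinkedHashMap<>();
//   for ( String token : tokens ) { SetUtils.incrementMapCounter( counts, token ); }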
26 |
27 | /***
28 | public static void incrementMapCounter( Map