├── .gitignore
├── LICENSE.txt
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
            └── com
            │   └── lucidworks
            │       └── dq
            │           ├── data
            │               ├── DateChecker.java
            │               ├── DeleteByIds.java
            │               ├── DocCount.java
            │               ├── DumpIds.java
            │               ├── EmptyFieldStats.java
            │               ├── SolrToCsv.java
            │               ├── SolrToSolr.java
            │               ├── TermCodepointStats.java
            │               ├── TermStats.java
            │               └── TestArgs.java
            │           ├── diff
            │               ├── DiffEmptyFieldStats.java
            │               ├── DiffIds.java
            │               ├── DiffSchema.java
            │               └── DiffSolrConfig.java
            │           ├── logs
            │               ├── LogEntry.java
            │               ├── LogEntryBase.java
            │               ├── LogEntryFromSolr.java
            │               ├── LogEntryGroup.java
            │               ├── LogEntryGroupFromSolr.java
            │               ├── LogEntryReference.java
            │               ├── LogEntryReferenceBase.java
            │               ├── LogFile.java
            │               ├── LogFileBase.java
            │               ├── LogFileFromSolr.java
            │               ├── LogFileRepo.java
            │               └── LogFileRepoBase.java
            │           ├── schema
            │               ├── Schema.java
            │               ├── SchemaBase.java
            │               ├── SchemaFromLocalCore_broken.java
            │               ├── SchemaFromRest.java
            │               ├── SchemaFromRestAdHock.java
            │               ├── SchemaFromXml.java
            │               ├── SchemalessPlus.java
            │               ├── SolrConfig.java
            │               ├── SolrConfigBase.java
            │               └── SolrConfigFromXml.java
            │           ├── util
            │               ├── CharUtils.java
            │               ├── CmdLineLauncher.java
            │               ├── DateUtils.java
            │               ├── HasDescription.java
            │               ├── HashAndShard.java
            │               ├── IO_Utils.java
            │               ├── LLR.java
            │               ├── LLR.java-new
            │               ├── SetUtils.java
            │               ├── SolrUtils.java
            │               ├── StatsUtils.java
            │               ├── StringUtils.java
            │               └── TupleEntropy.java
            │           └── zk_experiment
            │               └── ZkSmartClient.java
        └── resources
            ├── DQ-Prototype-and-SolrJ.key
            ├── DQ-Prototype-and-SolrJ.pdf
            ├── sample-reports
                ├── README.txt
                ├── dates-curve-fitting.txt
                ├── llr-larger-sample.txt
                ├── llr-tiny-sample.txt
                ├── populated-fields-diff.txt
                ├── populated-fields-single-extended-options.txt
                ├── populated-fields-single.txt
                ├── report-terms-via-termsReqHandler.txt
                ├── schema-info-diff.txt
                ├── schema-info-single.txt
                ├── term-counts.txt
                ├── term-lengths.txt
                ├── unicode-format1.txt
                └── unicode-format2.txt
            ├── schema-461.xml
            └── schema-481.xml


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | 
 3 | # Eclipse
 4 | .classpath
 5 | .project
 6 | .settings
 7 | 
 8 | # Package Files #
 9 | *.jar
10 | *.war
11 | *.ear
12 | /target
13 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
2 | 
3 | You may obtain a copy of the License at:
4 |     http://www.apache.org/licenses/LICENSE-2.0
5 | 
6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
7 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <groupId>com.lucidworks</groupId>
  5 |   <artifactId>data-quality-java</artifactId>
  6 |   <packaging>jar</packaging>
  7 |   <version>1.0-SNAPSHOT</version>
  8 |   <name>data-quality-java</name>
  9 |   <url>http://maven.apache.org</url>
 10 | 
 11 | 
 12 |   <description>Data-Quality Checks</description>
 13 | 
 14 |   <properties>
 15 |       <solr.version>4.10.3</solr.version>
 16 |       <slf4j.version>1.6.4</slf4j.version>
 17 |   </properties>
 18 | 
 19 | 
 20 | 
 21 | <dependencies>
 22 | 
 23 |     <dependency>
 24 |         <groupId>org.apache.solr</groupId>
 25 |         <artifactId>solr-solrj</artifactId>
 26 |         <version>${solr.version}</version>
 27 |     </dependency>
 28 |     <dependency>
 29 |         <groupId>org.apache.solr</groupId>
 30 |         <artifactId>solr-core</artifactId>
 31 |         <version>${solr.version}</version>
 32 |     </dependency>
 33 | 
 34 |     <!-- GetOpts, etc -->
 35 |     <dependency>
 36 |         <groupId>commons-cli</groupId>
 37 |         <artifactId>commons-cli</artifactId>
 38 |         <version>1.2</version>
 39 |     </dependency>
 40 | 
 41 | 
 42 |     <dependency>
 43 |       <groupId>com.google.code.gson</groupId>
 44 |       <artifactId>gson</artifactId>
 45 |       <version>2.2.4</version>
 46 |       <!-- <scope>compile</scope> -->
 47 |     </dependency>
 48 | 
 49 |     <dependency>
 50 |       <groupId>org.codehaus.jackson</groupId>
 51 |       <artifactId>jackson-mapper-asl</artifactId>
 52 |       <version>1.6.4</version>
 53 |       <!-- <scope>compile</scope> -->
 54 |     </dependency>
 55 | 
 56 | <!-- Jersey... later
 57 |     https://jersey.java.net/documentation/latest/getting-started.html
 58 | -->
 59 | 
 60 | <!--
 61 | <dependency>
 62 |     <groupId>org.apache.lucene</groupId>
 63 |     <artifactId>lucene-core</artifactId>
 64 |     <version>${solr.version}</version>
 65 | </dependency>
 66 | <dependency>
 67 |     <groupId>org.apache.lucene</groupId>
 68 |     <artifactId>lucene-analyzers-common</artifactId>
 69 |     <version>${solr.version}</version>
 70 | </dependency>
 71 | -->
 72 | 
 73 |     <dependency>
 74 |       <groupId>junit</groupId>
 75 |       <artifactId>junit</artifactId>
 76 |       <version>3.8.1</version>
 77 |       <scope>test</scope>
 78 |     </dependency>
 79 | 
 80 | 
 81 |     <!-- slf4j: may not need everything -->
 82 |     <dependency>
 83 |         <groupId>commons-logging</groupId>
 84 |         <artifactId>commons-logging</artifactId>
 85 |         <version>1.1.1</version>
 86 |     </dependency>
 87 | 
 88 | 
 89 | 
 90 |   </dependencies>
 91 | 
 92 | 
 93 |   <build>
 94 |     <plugins>
 95 |       <plugin>
 96 |         <artifactId>maven-compiler-plugin</artifactId>
 97 |         <version>2.3.2</version>
 98 |         <configuration>
 99 |           <source>1.7</source>
100 |           <target>1.7</target>
101 |         </configuration>
102 |       </plugin>
103 | 
104 |         <!-- Shade / Self-contained JAR -->
105 |         <plugin>
106 |             <groupId>org.apache.maven.plugins</groupId>
107 |             <artifactId>maven-shade-plugin</artifactId>
108 |             <!-- <version>2.0</version> -->
109 |             <version>2.2</version>
110 |             <executions>
111 |                 <execution>
112 |                     <phase>package</phase>
113 |                     <goals>
114 |                         <goal>shade</goal>
115 |                     </goals>
116 |                     <configuration>
117 |                         <!-- <minimizeJar>true</minimizeJar> -->
118 |                         <minimizeJar>false</minimizeJar>
119 |                         <transformers>
120 |                             <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
121 |                                 <mainClass>com.lucidworks.dq.util.CmdLineLauncher</mainClass>
122 |                             </transformer>
123 |                         </transformers>
124 | 
125 |                         <!-- fix SecurityException: Invalid signature file digest for Manifest main attributes -->
126 |                         <filters>
127 |                             <filter>
128 |                                 <artifact>*:*</artifact>
129 |                                 <excludes>
130 |                                     <exclude>META-INF/*.SF</exclude>
131 |                                     <exclude>META-INF/*.DSA</exclude>
132 |                                     <exclude>META-INF/*.RSA</exclude>
133 |                                 </excludes>
134 |                             </filter>
135 |                         </filters>
136 | 
137 |                     </configuration>
138 |                 </execution>
139 |             </executions>
140 |         </plugin>
141 | 
142 |     </plugins>
143 |   </build>
144 | 
145 | </project>
146 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/data/DeleteByIds.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.data;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileInputStream;
  5 | import java.io.IOException;
  6 | import java.io.InputStreamReader;
  7 | import java.io.PrintWriter;
  8 | import java.io.StringWriter;
  9 | import java.nio.charset.Charset;
 10 | import java.nio.charset.CharsetDecoder;
 11 | import java.nio.charset.CodingErrorAction;
 12 | import java.text.MessageFormat;
 13 | import java.text.NumberFormat;
 14 | import java.util.Arrays;
 15 | import java.util.Collection;
 16 | import java.util.LinkedHashMap;
 17 | import java.util.LinkedHashSet;
 18 | import java.util.LinkedList;
 19 | import java.util.List;
 20 | import java.util.Map;
 21 | import java.util.Map.Entry;
 22 | import java.util.Set;
 23 | 
 24 | import org.apache.commons.cli.CommandLine;
 25 | import org.apache.commons.cli.CommandLineParser;
 26 | import org.apache.commons.cli.HelpFormatter;
 27 | import org.apache.commons.cli.OptionBuilder;
 28 | import org.apache.commons.cli.Options;
 29 | import org.apache.commons.cli.ParseException;
 30 | import org.apache.commons.cli.PosixParser;
 31 | import org.apache.solr.client.solrj.SolrServerException;
 32 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 33 | 
 34 | import com.lucidworks.dq.util.HasDescription;
 35 | import com.lucidworks.dq.util.SetUtils;
 36 | import com.lucidworks.dq.util.SolrUtils;
 37 | 
 38 | public class DeleteByIds /*implements HasDescription*/ {
 39 | 
 40 |   static String HELP_WHAT_IS_IT = "Delete documents by their ID, either passed on the command line, or from a file, or from standard in / stdin.";
 41 |   static String HELP_USAGE = "DeleteByIds -u http://localhost:8983/collection1 --ids 1234 5678 ... or --input_file ids_to_delete.txt";
 42 | 
 43 |   public static String getShortDescription() {
 44 |     return HELP_WHAT_IS_IT;
 45 |   }
 46 | 
 47 |   static int DEFAULT_BATCH_SIZE = 1000;
 48 | 
 49 |   static Options options;
 50 | 
 51 |   // We use List<String> instead of Set<String> because that's what SolrJ expects in deleteById
 52 |   static List<String> readIdsFromFile( String targetFile, CharsetDecoder deccoder ) throws IOException {
 53 |     List<String> ids = new LinkedList<String>();
 54 |     BufferedReader in = null;
 55 |     if( null!=targetFile && ! targetFile.equals("-") ) {
 56 |       in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder));
 57 |     } else {
 58 |       in = new BufferedReader(new InputStreamReader(System.in, deccoder));
 59 |     }
 60 |     String line;
 61 |     while ((line = in.readLine()) != null) {
 62 |       // skip completely blank lines, but doesn't do any trimming
 63 |       if ( line.length()<1 ) {
 64 |         continue;
 65 |       }
 66 |       ids.add( line );
 67 |     }
 68 |     in.close();
 69 |     return ids;
 70 |   }
 71 | 
 72 |   static void helpAndExit() {
 73 |     helpAndExit( null, 1 );
 74 |   }
 75 |   static void helpAndExit( String optionalError, int errorCode ) {
 76 |     HelpFormatter formatter = new HelpFormatter();
 77 |     if ( null==optionalError ) {
 78 |       System.err.println( HELP_WHAT_IS_IT );
 79 |     }
 80 |     else {
 81 |       // log.error( optionalError );
 82 |       System.err.println( optionalError );
 83 |     }
 84 |     // stdout
 85 |     //formatter.printHelp( HELP_USAGE, options, true );
 86 |     // stderr
 87 |     PrintWriter pw = new PrintWriter(System.err);
 88 |     formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
 89 |     pw.flush();
 90 |     System.exit( errorCode );
 91 |   }
 92 | 
 93 |   public static void main( String [] argv ) throws Exception {
 94 | 
 95 |     options = new Options();
 96 |     options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
 97 |     options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" );
 98 |     options.addOption( "p", "port", true, "Port for Solr, default=8983" );
 99 |     options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
100 |     options.addOption( "f", "input_file", true, "File to read IDs from, one ID per line (skips 0 length lines, not counting newlines) (Use \"-\" for stdout / standard out)" );
101 |     options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" );
102 |     options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" );
103 | 
104 |     options.addOption( OptionBuilder.withLongOpt( "batch_size" )
105 |         .withDescription( "Batch size, 1=doc-by-doc, 0=all-at-once (be careful memory-wise), default="+DEFAULT_BATCH_SIZE )
106 |         .hasArg()
107 |         .withType( Number.class ) // NOT Long.class
108 |         .create( "b" )
109 |         );
110 | 
111 |     options.addOption( OptionBuilder.withLongOpt( "ids" )
112 |         .withDescription( "Pass one or more IDs on the command line" )
113 |         .hasArgs()   // PLURAL!
114 |         .create( "i" )
115 |         );
116 | 
117 |     if ( argv.length < 1 ) {
118 |       helpAndExit( "Must specifify at least url or host", 1 );
119 |     }
120 |     CommandLine cmd = null;
121 |     try {
122 |       CommandLineParser parser = new PosixParser();
123 |       cmd = parser.parse( options, argv );
124 |     }
125 |     catch( ParseException exp ) {
126 |       helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
127 |     }
128 |     String fullUrl = cmd.getOptionValue( "url" );
129 |     String host = cmd.getOptionValue( "host" );
130 |     String port = cmd.getOptionValue( "port" );
131 |     String coll = cmd.getOptionValue( "collection" );
132 |     if ( null==fullUrl && null==host ) {
133 |       helpAndExit( "Must specifify at least url or host (b)", 3 );
134 |     }
135 |     if ( null!=fullUrl && null!=host ) {
136 |       helpAndExit( "Must not specifify both url and host", 4 );
137 |     }
138 |     // Init
139 |     // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
140 |     HttpSolrServer solr;
141 |     if ( null!=fullUrl ) {
142 |       solr = SolrUtils.getServer( fullUrl );
143 |     }
144 |     else {
145 |       // Utils handle null values
146 |       solr = SolrUtils.getServer( host, port, coll );    
147 |     }
148 | 
149 |     int batchSize = DEFAULT_BATCH_SIZE;
150 |     Long batchObj = (Long) cmd.getParsedOptionValue( "batch_size" );
151 |     if ( null!=batchObj ) {
152 |       if ( batchObj.longValue() < 0L ) {
153 |         helpAndExit( "batch_size must be >= 0", 5 );
154 |       }
155 |       batchSize = batchObj.intValue();
156 |     }
157 | 
158 |     String encodingStr = cmd.getOptionValue( "encoding" );
159 |     // Didn't set encoding
160 |     if ( null==encodingStr || encodingStr.trim().length()<1 ) {
161 |       encodingStr = "UTF-8";
162 |     }
163 |     // Did set encoding
164 |     else {
165 |       // But didn't set input file
166 |       if ( null == cmd.getOptionValue( "input_file" ) ) {
167 |         helpAndExit( "Encoding only applicable when reading from input file or standard in / stdiin; operating system handles command line argument encoding", 6 );
168 |       }
169 |     }
170 |     boolean strictEncoding = true;
171 |     if(cmd.hasOption("loose_encoding")) {
172 |       strictEncoding = false;
173 |       if ( null == cmd.getOptionValue( "input_file" ) ) {
174 |         helpAndExit( "loose_encoding only applicable when reading from input file or standard in / stdiin; operating system handles command line argument encoding", 7 );
175 |       }
176 |     }
177 |     // Setup IO encoding
178 |     Charset charset = Charset.forName( encodingStr );
179 |     // Input uses Decoder
180 |     CharsetDecoder decoder = charset.newDecoder();
181 |     if ( strictEncoding ) {
182 |       decoder.onMalformedInput( CodingErrorAction.REPORT );
183 |     }
184 | 
185 |     String inputFile = cmd.getOptionValue( "input_file" );
186 | 
187 |     String [] cmdLineIds = cmd.getOptionValues( "ids" );
188 | 
189 |     if ( null==inputFile && null==cmdLineIds ) {
190 |       helpAndExit( "Must use at least one of --input_file or --ids ..., OK to use both. For standard in / stdin use --input_file -", 8 );
191 |     }
192 | 
193 |     // We use List<String> instead of Set<String> because that's what SolrJ expects in deleteById
194 |     List<String> ids = new LinkedList<String>();
195 |     if ( null!=inputFile ) {
196 |       ids = readIdsFromFile( inputFile, decoder );
197 |     }
198 |     if ( null!=cmdLineIds ) {
199 |       ids.addAll( Arrays.asList( cmdLineIds ) );
200 |     }
201 | 
202 |     if ( batchSize < 1 ) {
203 |       solr.deleteById(ids);
204 |     }
205 |     else if ( batchSize == 1 ) {
206 |       for ( String id : ids ) {
207 |         solr.deleteById( id );
208 |       }
209 |     }
210 |     else {
211 |       for ( int start = 0; start < ids.size(); start += batchSize ) {
212 |         int end = start + batchSize;
213 |         if ( end > ids.size() ) {
214 |           end = ids.size();
215 |         }
216 |         List<String> sublist = ids.subList( start, end );
217 |         solr.deleteById( sublist );
218 |       }
219 |     }
220 |     // Wait for disk commit and new searcher to fire up
221 |     // TODO: maybe have other commit options, although this is probably the safest
222 |     solr.commit( true, true );
223 | 
224 |   }
225 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/data/DocCount.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.data;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.text.MessageFormat;
  6 | import java.text.NumberFormat;
  7 | import java.util.Collection;
  8 | import java.util.LinkedHashMap;
  9 | import java.util.LinkedHashSet;
 10 | import java.util.Map;
 11 | import java.util.Map.Entry;
 12 | import java.util.Set;
 13 | 
 14 | import org.apache.commons.cli.CommandLine;
 15 | import org.apache.commons.cli.CommandLineParser;
 16 | import org.apache.commons.cli.HelpFormatter;
 17 | import org.apache.commons.cli.Options;
 18 | import org.apache.commons.cli.ParseException;
 19 | import org.apache.commons.cli.PosixParser;
 20 | import org.apache.solr.client.solrj.SolrServerException;
 21 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 22 | 
 23 | import com.lucidworks.dq.util.HasDescription;
 24 | import com.lucidworks.dq.util.SetUtils;
 25 | import com.lucidworks.dq.util.SolrUtils;
 26 | 
 27 | public class DocCount /*implements HasDescription*/ {
 28 | 
 29 |   static String HELP_WHAT_IS_IT = "Count of active documents in a collection to standard out / stdout.";
 30 |   static String HELP_USAGE = "DocCount -u http://localhost:8983 (output sent to stdout)";
 31 | 
 32 |   public static String getShortDescription() {
 33 |     return HELP_WHAT_IS_IT;
 34 |   }
 35 | 
 36 |   static Options options;
 37 |   
 38 |   HttpSolrServer solrServer;
 39 | 
 40 |   static void helpAndExit() {
 41 |     helpAndExit( null, 1 );
 42 |   }
 43 |   static void helpAndExit( String optionalError, int errorCode ) {
 44 |     HelpFormatter formatter = new HelpFormatter();
 45 |     if ( null==optionalError ) {
 46 |       System.err.println( HELP_WHAT_IS_IT );
 47 |     }
 48 |     else {
 49 |       // log.error( optionalError );
 50 |       System.err.println( optionalError );
 51 |     }
 52 |     // stdout
 53 |     //formatter.printHelp( HELP_USAGE, options, true );
 54 |     // stderr
 55 |     PrintWriter pw = new PrintWriter(System.err);
 56 |     formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
 57 |     pw.flush();
 58 |     System.exit( errorCode );
 59 |   }
 60 | 
 61 |   public static void main( String [] argv ) throws Exception {
 62 | 
 63 |     options = new Options();
 64 |     options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
 65 |     options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" );
 66 |     options.addOption( "p", "port", true, "Port for Solr, default=8983" );
 67 |     options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
 68 |     if ( argv.length < 1 ) {
 69 |       helpAndExit( "Must specifify at least url or host", 1 );
 70 |     }
 71 |     CommandLine cmd = null;
 72 |     try {
 73 |       CommandLineParser parser = new PosixParser();
 74 |       cmd = parser.parse( options, argv );
 75 |     }
 76 |     catch( ParseException exp ) {
 77 |       helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
 78 |     }
 79 |     String fullUrl = cmd.getOptionValue( "url" );
 80 |     String host = cmd.getOptionValue( "host" );
 81 |     String port = cmd.getOptionValue( "port" );
 82 |     String coll = cmd.getOptionValue( "collection" );
 83 |     if ( null==fullUrl && null==host ) {
 84 |       helpAndExit( "Must specifify at least url or host (b)", 3 );
 85 |     }
 86 |     if ( null!=fullUrl && null!=host ) {
 87 |       helpAndExit( "Must not specifify both url and host", 4 );
 88 |     }
 89 |     // Init
 90 |     // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
 91 |     HttpSolrServer solr;
 92 |     if ( null!=fullUrl ) {
 93 |       solr = SolrUtils.getServer( fullUrl );
 94 |     }
 95 |     else {
 96 |       // Utils handle null values
 97 |       solr = SolrUtils.getServer( host, port, coll );    
 98 |     }
 99 | 
100 |     long count = SolrUtils.getTotalDocCount( solr );
101 |     System.out.println( count );
102 | 
103 |   }
104 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/data/DumpIds.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.data;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.text.MessageFormat;
  6 | import java.text.NumberFormat;
  7 | import java.util.Collection;
  8 | import java.util.LinkedHashMap;
  9 | import java.util.LinkedHashSet;
 10 | import java.util.Map;
 11 | import java.util.Map.Entry;
 12 | import java.util.Set;
 13 | 
 14 | import org.apache.commons.cli.CommandLine;
 15 | import org.apache.commons.cli.CommandLineParser;
 16 | import org.apache.commons.cli.HelpFormatter;
 17 | import org.apache.commons.cli.Options;
 18 | import org.apache.commons.cli.ParseException;
 19 | import org.apache.commons.cli.PosixParser;
 20 | import org.apache.solr.client.solrj.SolrServerException;
 21 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 22 | 
 23 | import com.lucidworks.dq.util.HasDescription;
 24 | import com.lucidworks.dq.util.SetUtils;
 25 | import com.lucidworks.dq.util.SolrUtils;
 26 | 
 27 | public class DumpIds /*implements HasDescription*/ {
 28 | 
 29 |   static String HELP_WHAT_IS_IT = "Dump all the IDs from a collection to standard out / stdout.";
 30 |   static String HELP_USAGE = "DumpIds -u http://localhost:8983 (output sent to stdout)";
 31 |   // final static Logger log = LoggerFactory.getLogger( FieldStats.class );
 32 | 
 33 |   public static String getShortDescription() {
 34 |     return HELP_WHAT_IS_IT;
 35 |   }
 36 | 
 37 |   static Options options;
 38 |   
 39 |   HttpSolrServer solrServer;
 40 | 
 41 |   // TODO: refactor to allow options to be settable after constructor is run
 42 |   public DumpIds( HttpSolrServer server ) throws SolrServerException {
 43 |     this.solrServer = server;
 44 |   }
 45 |   public HttpSolrServer getSolrServer() {
 46 |     return this.solrServer;
 47 |   }
 48 | 
 49 |   void dumpIds() throws SolrServerException {
 50 |     Set<String> ids = SolrUtils.getAllIds( getSolrServer() );
 51 |     for ( String id : ids ) {
 52 |       System.out.println( id );
 53 |     }
 54 |   }
 55 | 
 56 |   static void helpAndExit() {
 57 |     helpAndExit( null, 1 );
 58 |   }
 59 |   static void helpAndExit( String optionalError, int errorCode ) {
 60 |     HelpFormatter formatter = new HelpFormatter();
 61 |     if ( null==optionalError ) {
 62 |       // log.info( HELP_WHAT_IS_IT );
 63 |       System.out.println( HELP_WHAT_IS_IT );
 64 |     }
 65 |     else {
 66 |       // log.error( optionalError );
 67 |       System.err.println( optionalError );
 68 |     }
 69 |     formatter.printHelp( HELP_USAGE, options, true );
 70 |     System.exit( errorCode );
 71 |   }
 72 | 
 73 |   public static void main( String [] argv ) throws Exception {
 74 | 
 75 |     options = new Options();
 76 |     options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" );
 77 |     options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" );
 78 |     options.addOption( "p", "port", true, "Port for Solr, default=8983" );
 79 |     options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" );
 80 |     if ( argv.length < 1 ) {
 81 |       helpAndExit( "Must specifify at least url or host", 1 );
 82 |     }
 83 |     CommandLine cmd = null;
 84 |     try {
 85 |       CommandLineParser parser = new PosixParser();
 86 |       cmd = parser.parse( options, argv );
 87 |     }
 88 |     catch( ParseException exp ) {
 89 |       helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
 90 |     }
 91 |     String fullUrl = cmd.getOptionValue( "url" );
 92 |     String host = cmd.getOptionValue( "host" );
 93 |     String port = cmd.getOptionValue( "port" );
 94 |     String coll = cmd.getOptionValue( "collection" );
 95 |     if ( null==fullUrl && null==host ) {
 96 |       helpAndExit( "Must specifify at least url or host (b)", 3 );
 97 |     }
 98 |     if ( null!=fullUrl && null!=host ) {
 99 |       helpAndExit( "Must not specifify both url and host", 4 );
100 |     }
101 |     // Init
102 |     // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL );
103 |     HttpSolrServer solr;
104 |     if ( null!=fullUrl ) {
105 |       solr = SolrUtils.getServer( fullUrl );
106 |     }
107 |     else {
108 |       // Utils handle null values
109 |       solr = SolrUtils.getServer( host, port, coll );    
110 |     }
111 | 
112 |     // System.out.println( "Solr = " + solr.getBaseURL() );
113 |     // EmptyFieldStats fs = new EmptyFieldStats( solr );
114 |     DumpIds di = new DumpIds( solr );
115 |     di.dumpIds();
116 | 
117 |   }
118 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/diff/DiffEmptyFieldStats.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.diff;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.text.MessageFormat;
  6 | import java.text.NumberFormat;
  7 | import java.util.LinkedHashSet;
  8 | import java.util.Set;
  9 | 
 10 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 11 | 
 12 | import com.lucidworks.dq.data.EmptyFieldStats;
 13 | import com.lucidworks.dq.schema.Schema;
 14 | import com.lucidworks.dq.schema.SchemaFromRest;
 15 | import com.lucidworks.dq.schema.SchemaFromXml;
 16 | import com.lucidworks.dq.util.HasDescription;
 17 | import com.lucidworks.dq.util.SetUtils;
 18 | import com.lucidworks.dq.util.SolrUtils;
 19 | 
 20 | import org.apache.commons.cli.CommandLine;
 21 | import org.apache.commons.cli.CommandLineParser;
 22 | import org.apache.commons.cli.HelpFormatter;
 23 | import org.apache.commons.cli.Options;
 24 | import org.apache.commons.cli.ParseException;
 25 | import org.apache.commons.cli.PosixParser;
 26 | 
 27 | public class DiffEmptyFieldStats /*implements HasDescription*/ {
 28 |   static String HELP_WHAT_IS_IT = "Compare fields that aren't fully populated between two cores/collections.";
 29 |   static String HELP_USAGE = "DiffEmptyFieldStats";
 30 |   // final static Logger log = LoggerFactory.getLogger( TermStats.class );
 31 | 
 32 |   public static String getShortDescription() {
 33 |     return HELP_WHAT_IS_IT;
 34 |   }
 35 |   
 36 |   static Options options;
 37 | 
 38 |   public static String generateReport( EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB, String labelA, String labelB ) throws Exception {
 39 |     StringWriter sw = new StringWriter();
 40 |     PrintWriter out = new PrintWriter(sw);
 41 | 
 42 |     out.println( "========== Differences Report ==========" );
 43 |     out.println( "Schema A = " + labelA );
 44 |     out.println( "Schema B = " + labelB );
 45 | 
 46 |     out.println();
 47 |     addSimpleStatToReport( out, "A: Total Active Docs", fieldStatsA.getTotalDocCount() );
 48 |     addSimpleStatToReport( out, "B: Total Active Docs", fieldStatsB.getTotalDocCount() );
 49 | 
 50 |     out.println();
 51 |     Set<String> fieldsA = fieldStatsA.getAllFieldNames();
 52 |     Set<String> fieldsB = fieldStatsB.getAllFieldNames();
 53 |     addSetComparisonToReport( out, fieldsA, fieldsB, "All Fields" );
 54 | 
 55 |     out.println();
 56 |     addAllFieldStatsToReport( out, fieldStatsA, fieldStatsB );
 57 | 
 58 | 
 59 | //  // Simple Values
 60 | //  // -------------
 61 | //  // Name
 62 | //  String nameA = schemaA.getSchemaName();
 63 | //  String nameB = schemaB.getSchemaName();
 64 | //  addStringComparisionToReport( out, nameA, nameB, "Schema Name" );
 65 | //  // Version
 66 | //  float versA = schemaA.getSchemaVersion();
 67 | //  float versB = schemaB.getSchemaVersion();
 68 | //  out.print( "Schema Version: " );
 69 | //  if ( versA == versB ) {
 70 | //    out.println( "Both = '" + versA + "'" );
 71 | //  }
 72 | //  else {
 73 | //    out.println( "\tA = '" + versA + "'" );
 74 | //    out.println( "\tB = '" + versB + "'" );
 75 | //  }
 76 | 
 77 | //  // Complex Values
 78 | //  // --------------
 79 | //  // Fields
 80 | //  Set<String> fieldsA = schemaA.getAllSchemaFieldNames();
 81 | //  Set<String> fieldsB = schemaB.getAllSchemaFieldNames();
 82 | //  addSetComparisonToReport( out, fieldsA, fieldsB, "Fields" );
 83 | //  // Dynamic Field Patterns
 84 | //  // TODO: Verify that order is being preserved through the entire process
 85 | //  Set<String> patternsA = schemaA.getAllDynamicFieldPatterns();
 86 | //  Set<String> patternsB = schemaB.getAllDynamicFieldPatterns();
 87 | //  addSetComparisonToReport( out, patternsA, patternsB, "Dynamic-Field Patterns", true );
 88 | 
 89 |     String outStr = sw.toString();
 90 |     return outStr;
 91 |   }
 92 | 
 93 |   static void addAllFieldStatsToReport( PrintWriter out, EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB ) {
 94 |     Set<String> fieldsA = fieldStatsA.getAllFieldNames();
 95 |     Set<String> fieldsB = fieldStatsB.getAllFieldNames();
 96 |     Set<String> allFields = SetUtils.union_nonDestructive( fieldsA, fieldsB );
 97 | 
 98 |     // Fully Populated
 99 |     Set<String> fullFieldsA = fieldStatsA.getFullyPopulatedIndexedFields();
100 |     Set<String> fullFieldsB = fieldStatsB.getFullyPopulatedIndexedFields();
101 |     // Subset
102 |     Set<String> fullFieldsBoth = SetUtils.intersection_nonDestructive( fullFieldsA, fullFieldsB );
103 | 
104 |     // Empty
105 |     Set<String> emptyFieldsA = fieldStatsA.getFieldsWithNoIndexedValues();
106 |     Set<String> emptyFieldsB = fieldStatsB.getFieldsWithNoIndexedValues();
107 |     // Subset
108 |     Set<String> emptyFieldsBoth = SetUtils.intersection_nonDestructive( emptyFieldsA, emptyFieldsB );
109 | 
110 |     // All Other Fields
111 |     // We can only summarize the subsets of completely full and completely empty fields in both collections
112 |     // All other fields need to be listed in the detailed report
113 |     Set<String> detailFields = new LinkedHashSet<>();
114 |     detailFields.addAll( allFields );
115 |     detailFields.removeAll( fullFieldsBoth );
116 |     detailFields.removeAll( emptyFieldsBoth );
117 | 
118 |     out.println( "Populated at 100% in Both A and B: " + fullFieldsBoth );
119 |     out.println();
120 |     out.println( "No Indexed Values / 0% in Both A and B: " + emptyFieldsBoth );
121 |     out.println();
122 | 
123 |     out.println( "Partially Populated Fields and Percentages, A / B:" );
124 |     for ( String name : detailFields ) {
125 |       Long countA = null;
126 |       if ( fieldStatsA.getIndexedValueCounts().containsKey(name) ) {
127 |         countA = fieldStatsA.getIndexedValueCounts().get(name);
128 |       }
129 |       Double percentA = null;
130 |       if ( fieldStatsA.getIndexedValuePercentages().containsKey(name) ) {
131 |         percentA = fieldStatsA.getIndexedValuePercentages().get( name );
132 |       }
133 |       Long countB = null;
134 |       if ( fieldStatsB.getIndexedValueCounts().containsKey(name) ) {
135 |         countB = fieldStatsB.getIndexedValueCounts().get(name);
136 |       }
137 |       Double percentB = null;
138 |       if ( fieldStatsB.getIndexedValuePercentages().containsKey(name) ) {
139 |         percentB = fieldStatsB.getIndexedValuePercentages().get( name );
140 |       }
141 |       addStatsPairAndPercentToReport( out, name, countA, countB, percentA, percentB, "\t" );
142 |     }
143 |   }
144 | 
145 |   static void addSimpleStatToReport( PrintWriter out, String label, long stat ) {
146 |     String statStr = NumberFormat.getNumberInstance().format( stat );
147 |     out.println( "" + label + ": " + statStr );
148 |   }
149 | 
150 |   static void addStringComparisionToReport( PrintWriter out, String thingA, String thingB, String attrLabel ) {
151 |     out.print( attrLabel + ":" );
152 |     if ( null!=thingA && null!=thingB && thingA.equals(thingB) ) {
153 |       out.println( " Both = '" + thingA + "'" );
154 |     }
155 |     else {
156 |       out.println();
157 |       out.println( "\tA = '" + thingA + "'" );
158 |       out.println( "\tB = '" + thingB + "'" );
159 |     }  
160 |   }
161 | 
162 |   static void addStatsPairAndPercentToReport( PrintWriter out, String label, Long statA, Long statB, Double percA, Double percB, String optIndent ) {
163 |     if ( null!=optIndent ) {
164 |       out.print( optIndent );
165 |     }
166 |     String statStrA = null!=statA ? NumberFormat.getNumberInstance().format( statA ) : "(not in A)";
167 |     String statStrB = null!=statB ? NumberFormat.getNumberInstance().format( statB ) : "(not in B)";
168 |     String percStrA = null!=percA ? " (" + MessageFormat.format( "{0,number,#.##%}" + ")", percA ) : "";
169 |     String percStrB = null!=percB ? " (" + MessageFormat.format( "{0,number,#.##%}" + ")", percB ) : "";
170 |     out.println( "" + label + ": " + statStrA + percStrA + " / " + statStrB + percStrB );
171 |   }
172 | 
173 | 
174 |   static void addSetComparisonToReport( PrintWriter out, Set<String> setA, Set<String> setB, String attrLabel ) {
175 |     addSetComparisonToReport( out, setA, setB, attrLabel, false );
176 |   }
177 |   static void addSetComparisonToReport( PrintWriter out, Set<String> setA, Set<String> setB, String attrLabel, boolean checkOrder ) {
178 |     Set<String> inBoth = SetUtils.intersection_nonDestructive( setA, setB );
179 |     Set<String> inAOnly = SetUtils.inAOnly_nonDestructive( setA, setB );
180 |     Set<String> inBOnly = SetUtils.inBOnly_nonDestructive( setA, setB );
181 |     out.println();
182 |     out.print( attrLabel + ":" );
183 |     if ( inBoth.isEmpty() && inAOnly.isEmpty() && inBOnly.isEmpty() ) {
184 |       out.println( " None!" );
185 |     }
186 |     else {
187 |       out.println();
188 |       if ( ! inBoth.isEmpty() ) {
189 |         if ( ! checkOrder ) {
190 |           out.println( "\tIn both = '" + inBoth + "'" );
191 |         }
192 |         else {
193 |           // Note: Sets don't normally perserve order but I've been careful
194 |           // to use LinkedHashSet and LinkedHashMap, which DO
195 |           Set<String> commonA = SetUtils.intersection_nonDestructive( setA, setB );
196 |           Set<String> commonB = SetUtils.intersection_nonDestructive( setB, setA );
197 |           boolean inSameOrder = SetUtils.sameAndInSameOrder( commonA, commonB );
198 |           if ( inSameOrder ) {
199 |             out.println( "\tIn both and SAME relative order = '" + inBoth + "'" );
200 |           }
201 |           else {
202 |             out.println( "\tIn both but DIFFERENT relative order:" );
203 |             out.println( "\t\tCommon, order in A = '" + commonA + "'" );
204 |             out.println( "\t\tCommon, order in B = '" + commonB + "'" );
205 |           }
206 |         }
207 |       }
208 |       if ( ! inAOnly.isEmpty() ) {
209 |         out.println( "\tA only = '" + inAOnly + "'" );
210 |       }
211 |       if ( ! inBOnly.isEmpty() ) {
212 |         out.println( "\tB only = '" + inBOnly + "'" );
213 |       }
214 |     }
215 |   }
216 | 
217 |   static void helpAndExit() {
218 |     helpAndExit( null, 1 );
219 |   }
220 |   static void helpAndExit( String optionalError, int errorCode ) {
221 |     HelpFormatter formatter = new HelpFormatter();
222 |     if ( null==optionalError ) {
223 |       // log.info( HELP_WHAT_IS_IT );
224 |       System.out.println( HELP_WHAT_IS_IT );
225 |     }
226 |     else {
227 |       // log.error( optionalError );
228 |       System.err.println( optionalError );
229 |     }
230 |     formatter.printHelp( HELP_USAGE, options, true );
231 |     System.exit( errorCode );
232 |   }
233 | 
234 |   public static void main( String[] argv ) throws Exception {
235 |     options = new Options();
236 |     options.addOption( "u", "url_a", true, "URL for first Solr, OR set host, port and possibly collection" );
237 |     options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" );
238 |     options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" );
239 |     options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" );
240 |     options.addOption( "U", "url_b", true, "URL for second Solr, OR set host, port and possibly collection" );
241 |     options.addOption( "H", "host_b", true, "IP address for second Solr, default=localhost" );
242 |     options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" );
243 |     options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" );
244 | 
245 |     if ( argv.length < 1 ) {
246 |       helpAndExit();
247 |     }
248 |     CommandLine cmd = null;
249 |     try {
250 |       CommandLineParser parser = new PosixParser();
251 |       // CommandLineParser parser = new DefaultParser();
252 |       cmd = parser.parse( options, argv );
253 |     }
254 |     catch( ParseException exp ) {
255 |       helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
256 |     }
257 |     // Already using -h for host, don't really need help, just run with no options
258 |     //if ( cmd.hasOption("help") ) {
259 |     //  helpAndExit();
260 |     //}
261 | 
262 |     String fullUrlA = cmd.getOptionValue( "url_a" );
263 |     String hostA = cmd.getOptionValue( "host_a" );
264 |     String portA = cmd.getOptionValue( "port_a" );
265 |     String collA = cmd.getOptionValue( "collection_a" );
266 |     if ( null==fullUrlA && null==hostA ) {
267 |       helpAndExit( "Must specifify at least url or host for first Solr", 3 );
268 |     }
269 |     if ( null!=fullUrlA && null!=hostA ) {
270 |       helpAndExit( "Must not specifify both url and host for first Solr", 4 );
271 |     }
272 | 
273 |     String fullUrlB = cmd.getOptionValue( "url_b" );
274 |     String hostB = cmd.getOptionValue( "host_b" );
275 |     String portB = cmd.getOptionValue( "port_b" );
276 |     String collB = cmd.getOptionValue( "collection_b" );
277 |     if ( null==fullUrlB && null==hostB ) {
278 |       helpAndExit( "Must specifify at least url or host for second Solr", 3 );
279 |     }
280 |     if ( null!=fullUrlB && null!=hostB ) {
281 |       helpAndExit( "Must not specifify both url and host for second Solr", 4 );
282 |     }
283 | 
284 |     // Init
285 |     // HttpSolrServer solrA = SolrUtils.getServer( HOST1, PORT1, COLL1 );
286 |     HttpSolrServer solrA;
287 |     if ( null!=fullUrlA ) {
288 |       solrA = SolrUtils.getServer( fullUrlA );
289 |     }
290 |     else {
291 |       // Utils handle null values
292 |       solrA = SolrUtils.getServer( hostA, portA, collA );    
293 |     }
294 |     System.out.println( "First Solr / Solr A = " + solrA.getBaseURL() );
295 |     // HttpSolrServer solrB = SolrUtils.getServer( HOST2, PORT2, COLL2 );
296 |     HttpSolrServer solrB;
297 |     if ( null!=fullUrlB ) {
298 |       solrB = SolrUtils.getServer( fullUrlB );
299 |     }
300 |     else {
301 |       // Utils handle null values
302 |       solrB = SolrUtils.getServer( hostB, portB, collB );    
303 |     }
304 |     System.out.println( "Second Solr / Solr B = " + solrB.getBaseURL() );
305 | 
306 |     String labelA = solrA.getBaseURL();
307 |     EmptyFieldStats fieldsStatsA = new EmptyFieldStats( solrA );
308 |     String reportA = fieldsStatsA.generateReport( labelA );
309 | 
310 |     String labelB = solrB.getBaseURL();
311 |     EmptyFieldStats fieldsStatsB = new EmptyFieldStats( solrB );
312 |     String reportB = fieldsStatsB.generateReport( labelB );
313 | 
314 |     System.out.println( "========== Individual Reports ==========" );
315 |     System.out.println();
316 |     System.out.println( "---------- A: " + labelA + " ----------" );
317 |     System.out.println( reportA );
318 |     System.out.println( "---------- B: " + labelB + " ----------" );
319 |     System.out.println( reportB );
320 | 
321 |     String report = generateReport( fieldsStatsA, fieldsStatsB, labelA, labelB );
322 |     System.out.println( report );
323 |   }
324 | 
325 | 
326 |   static String HOST0 = "localhost";
327 |   static String PORT0 = "8983";
328 |   static String COLL0 = "demo_shard1_replica1";
329 |   static String URL0 = "http://" + HOST0 + ":" + PORT0 + "/solr/" + COLL0;
330 |   // + "/select?q=*:*&rows=" + ROWS + "&fl=id&wt=json&indent=on"
331 | 
332 |   static String HOST1 = "localhost";
333 |   static String PORT1 = "8984"; // "8983";
334 |   static String COLL1 = "collection1";
335 |   static String URL1 = "http://" + HOST1 + ":" + PORT1 + "/solr/" + COLL1;
336 | 
337 |   static String HOST2 = "localhost";
338 |   static String PORT2 = "8985"; // "8983";
339 |   static String COLL2 = "collection1";
340 |   static String URL2 = "http://" + HOST1 + ":" + PORT2 + "/solr/" + COLL2;
341 | 
342 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/diff/DiffIds.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.diff;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileInputStream;
  6 | import java.io.FileOutputStream;
  7 | import java.io.IOException;
  8 | import java.io.InputStreamReader;
  9 | import java.io.OutputStreamWriter;
 10 | import java.io.PrintStream;
 11 | import java.io.PrintWriter;
 12 | import java.nio.charset.Charset;
 13 | import java.nio.charset.CharsetDecoder;
 14 | import java.nio.charset.CharsetEncoder;
 15 | import java.nio.charset.CodingErrorAction;
 16 | import java.util.LinkedHashSet;
 17 | import java.util.Set;
 18 | 
 19 | import org.apache.commons.cli.CommandLine;
 20 | import org.apache.commons.cli.CommandLineParser;
 21 | import org.apache.commons.cli.HelpFormatter;
 22 | import org.apache.commons.cli.Options;
 23 | import org.apache.commons.cli.ParseException;
 24 | import org.apache.commons.cli.PosixParser;
 25 | import org.apache.solr.client.solrj.SolrQuery;
 26 | import org.apache.solr.client.solrj.SolrServerException;
 27 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 28 | import org.apache.solr.client.solrj.response.QueryResponse;
 29 | import org.apache.solr.common.SolrDocument;
 30 | 
 31 | import com.lucidworks.dq.util.HasDescription;
 32 | import com.lucidworks.dq.util.SetUtils;
 33 | import com.lucidworks.dq.util.SolrUtils;
 34 | 
 35 | public class DiffIds /*implements HasDescription*/ {
 36 |   static String HELP_WHAT_IS_IT = "Compare IDs between two cores/collections.";
 37 |   static String HELP_USAGE = "DiffIds";
 38 |   // final static Logger log = LoggerFactory.getLogger( TermStats.class );
 39 | 
 40 |   static String MODE_REPORT = "full_report";
 41 |   static String MODE_A_ONLY = "a_only";
 42 |   static String MODE_B_ONLY = "b_only";
 43 |   static String MODE_INTERSECT = "intersect";
 44 |   static String MODE_UNION = "union";
 45 |   static String DEFAULT_MODE = MODE_REPORT;
 46 |   static Set<String> VALID_MODES = new LinkedHashSet<String>() {{
 47 |     add( MODE_REPORT );
 48 |     add( MODE_A_ONLY );
 49 |     add( MODE_B_ONLY );
 50 |     add( MODE_INTERSECT );
 51 |     add( MODE_UNION );
 52 |   }};
 53 | 
 54 |   public static String getShortDescription() {
 55 |     return HELP_WHAT_IS_IT;
 56 |   }
 57 |   
 58 |   public static String NL = System.getProperty("line.separator");
 59 | 
 60 |   // command line options
 61 |   static Options options;
 62 | 
 63 |   static Set<String> readIdsFromFile( File targetFile, CharsetDecoder deccoder ) throws IOException {
 64 |     Set<String> ids = new LinkedHashSet<String>();
 65 |     BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder));
 66 |     String line;
 67 |     while ((line = in.readLine()) != null) {
 68 |       // skip completely blank lines, but doesn't do any trimming
 69 |       if ( line.length()<1 ) {
 70 |         continue;
 71 |       }
 72 |       ids.add( line );
 73 |     }
 74 |     in.close();
 75 |     return ids;
 76 |   }
 77 | 
 78 |   static void helpAndExit() {
 79 |     helpAndExit( null, 1 );
 80 |   }
 81 |   static void helpAndExit( String optionalError, int errorCode ) {
 82 |     HelpFormatter formatter = new HelpFormatter();
 83 |     if ( null==optionalError ) {
 84 |       // log.info( HELP_WHAT_IS_IT );
 85 |       System.err.println( HELP_WHAT_IS_IT );
 86 |     }
 87 |     else {
 88 |       // log.error( optionalError );
 89 |       System.err.println( optionalError );
 90 |     }
 91 |     // stdout
 92 |     //formatter.printHelp( HELP_USAGE, options, true );
 93 |     // stderr
 94 |     PrintWriter pw = new PrintWriter(System.err);
 95 |     formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true );
 96 |     pw.flush();
 97 |     System.exit( errorCode );
 98 |   }
 99 | 
100 |   public static void main( String[] argv ) throws SolrServerException, IOException {
101 | 
102 |     options = new Options();
103 |     options.addOption( "u", "url_a", true, "URL for first Solr, Eg http://localhost:8983/solr/collection1, OR set host, port and possibly collection" );
104 |     options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" );
105 |     options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" );
106 |     options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" );
107 | 
108 |     options.addOption( "U", "url_b", true, "URL for second Solr, Eg http://localhost:8983/solr/collection2, OR set host, port and possibly collection" );
109 |     options.addOption( "H", "host_b", true, "IP address for second Solr, default=localhost" );
110 |     options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" );
111 |     options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" );
112 | 
113 |     options.addOption( "f", "file_a", true, "Read IDs for A from a text file, one ID per line (skips 0 length lines, not counting newlines)" );
114 |     options.addOption( "F", "file_b", true, "Read IDs for B from a text file, one ID per line (skips 0 length lines, not counting newlines)" );
115 | 
116 |     options.addOption( "o", "output_file", true, "Output file to create for the full report or ID list (default or \"-\" is stdout / standard out)" );
117 |     options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" );
118 |     options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" );
119 | 
120 |     options.addOption( "m", "mode", true,
121 |         "What to output:"
122 |             + " \"" + MODE_REPORT + "\" means fully formatted report (default)"
123 |             + ", \"" + MODE_A_ONLY + "\" bare list of IDs only in A (one per line)"
124 |             + ", \"" + MODE_B_ONLY + "\" IDs only in B"
125 |             + ", \"" + MODE_INTERSECT + "\" IDs preent in BOTH A AND B"
126 |             + ", \"" + MODE_UNION + "\" IDs in A or B or in both (combines all IDs from both, but each ID will only appear once)"
127 |         );
128 |     if ( argv.length < 1 ) {
129 |       helpAndExit();
130 |     }
131 |     CommandLine cmd = null;
132 |     try {
133 |       CommandLineParser parser = new PosixParser();
134 |       // CommandLineParser parser = new DefaultParser();
135 |       cmd = parser.parse( options, argv );
136 |     }
137 |     catch( ParseException exp ) {
138 |       helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 );
139 |     }
140 |     // Already using -h for host, don't really need help, just run with no options
141 |     //if ( cmd.hasOption("help") ) {
142 |     //  helpAndExit();
143 |     //}
144 | 
145 |     String fullUrlA = cmd.getOptionValue( "url_a" );
146 |     String hostA = cmd.getOptionValue( "host_a" );
147 |     String portA = cmd.getOptionValue( "port_a" );
148 |     String collA = cmd.getOptionValue( "collection_a" );
149 |     String fileA = cmd.getOptionValue( "file_a" );
150 |     int optsA = 0;
151 |     optsA += (null!=fullUrlA) ? 1 : 0;
152 |     optsA += (null!=hostA) ? 1 : 0;
153 |     optsA += (null!=fileA) ? 1 : 0;
154 |     if ( optsA < 1 ) {
155 |       helpAndExit( "Must specifify at least url or host or ids file for first Solr instance", 3 );
156 |     }
157 |     if ( optsA > 1 ) {
158 |       helpAndExit( "Can only specifify one of url, host or ids file for first Solr instance", 4 );
159 |     }
160 | 
161 |     String fullUrlB = cmd.getOptionValue( "url_b" );
162 |     String hostB = cmd.getOptionValue( "host_b" );
163 |     String portB = cmd.getOptionValue( "port_b" );
164 |     String collB = cmd.getOptionValue( "collection_b" );
165 |     String fileB = cmd.getOptionValue( "file_b" );
166 |     int optsB = 0;
167 |     optsB += (null!=fullUrlB) ? 1 : 0;
168 |     optsB += (null!=hostB) ? 1 : 0;
169 |     optsB += (null!=fileB) ? 1 : 0;
170 |     if ( optsB < 1 ) {
171 |       helpAndExit( "Must specifify at least url or host or ids file for second Solr instance", 3 );
172 |     }
173 |     if ( optsB > 1 ) {
174 |       helpAndExit( "Can only specifify one of url, host or ids file for second Solr instance", 4 );
175 |     }
176 | 
177 |     // VALID_MODES
178 |     String mode = cmd.getOptionValue( "mode" );
179 |     if ( null!=mode ) {
180 |       mode = mode.toLowerCase().trim();
181 |       if ( ! VALID_MODES.contains(mode) ) {
182 |         helpAndExit( "Invalid mode, must be one of: " + VALID_MODES, 5 );
183 |       }
184 |     }
185 |     boolean isNormalReport = (null==mode) || mode.equals( MODE_REPORT );
186 | 
187 |     // File IO
188 |     String outputFile = cmd.getOptionValue( "output_file" );
189 |     String encodingStr = cmd.getOptionValue( "encoding" );
190 |     if ( null==encodingStr || encodingStr.trim().length()<1 ) {
191 |       encodingStr = "UTF-8";
192 |     }
193 |     boolean strictEncoding = true;
194 |     if(cmd.hasOption("loose_encoding")) {
195 |       strictEncoding = false;
196 |     }
197 | 
198 |     // Setup IO encoding
199 |     Charset charset = Charset.forName( encodingStr );
200 |     // Input uses Decoder
201 |     CharsetDecoder decoder = charset.newDecoder();
202 |     // Output uses Encoder
203 |     CharsetEncoder encoder = charset.newEncoder();
204 |     if ( strictEncoding ) {
205 |       decoder.onMalformedInput( CodingErrorAction.REPORT );
206 |       encoder.onMalformedInput( CodingErrorAction.REPORT );
207 |     }
208 | 
209 |     PrintWriter out = null;
210 |     if( null!=outputFile && ! outputFile.equals("-") ) {
211 |       out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoder), true);
212 |     } else {
213 |       out = new PrintWriter(new OutputStreamWriter(System.out, encoder), true);
214 |     }
215 | 
216 |     // Init
217 |     // HttpSolrServer solrA = new HttpSolrServer( URL1 );
218 |     HttpSolrServer solrA = null;
219 |     if ( null==fileA ) {
220 |       if ( null!=fullUrlA ) {
221 |         solrA = SolrUtils.getServer( fullUrlA );
222 |       }
223 |       else {
224 |         // Utils handle null values
225 |         solrA = SolrUtils.getServer( hostA, portA, collA );    
226 |       }
227 |       if(isNormalReport) out.println( "First Solr / Solr A = " + solrA.getBaseURL() );
228 |     }
229 |     else {
230 |       if(isNormalReport) out.println( "First Solr / Solr A read from file = " + fileA );
231 |     }
232 | 
233 |     // HttpSolrServer solrB = new HttpSolrServer( URL2 );
234 |     HttpSolrServer solrB = null;
235 |     if ( null==fileB ) {
236 |       if ( null!=fullUrlB ) {
237 |         solrB = SolrUtils.getServer( fullUrlB );
238 |       }
239 |       else {
240 |         // Utils handle null values
241 |         solrB = SolrUtils.getServer( hostB, portB, collB );    
242 |       }
243 |       if(isNormalReport) out.println( "Second Solr / Solr B = " + solrB.getBaseURL() );
244 |     }
245 |     else {
246 |       if(isNormalReport) out.println( "Second Solr / Solr B read from file = " + fileB );
247 |     }
248 | 
249 |     Set<String> idsA = (null!=solrA) ? SolrUtils.getAllIds( solrA ) : readIdsFromFile( new File(fileA), decoder );
250 |     Set<String> idsB = (null!=solrB) ? SolrUtils.getAllIds( solrB ) : readIdsFromFile( new File(fileB), decoder );
251 | 
252 |     if ( isNormalReport ) {
253 |       // Use non-destructive here since we use the lists more than once
254 |       Set<String> aOnly = SetUtils.inAOnly_nonDestructive(idsA, idsB);
255 |       Set<String> bOnly = SetUtils.inBOnly_nonDestructive(idsA, idsB);
256 |       out.println( "A-only: " + aOnly );
257 |       out.println( "B-only: " + bOnly );
258 |     }
259 |     else {
260 |       Set<String> ids = null;
261 |       if ( mode.equals(MODE_A_ONLY) ) {
262 |         // destructive OK here since we're just doing 1 calculation
263 |         ids = SetUtils.inAOnly_destructive( idsA, idsB );
264 |       }
265 |       else if ( mode.equals(MODE_B_ONLY) ) {
266 |         ids = SetUtils.inBOnly_destructive( idsA, idsB );
267 |       }
268 |       else if ( mode.equals(MODE_INTERSECT) ) {
269 |         ids = SetUtils.intersection_destructive( idsA, idsB );  
270 |       }
271 |       else if ( mode.equals(MODE_UNION) ) {
272 |         ids = SetUtils.union_destructive( idsA, idsB );  
273 |       }
274 |       else {
275 |         // This should never happen.
276 |         // If it ever does, maybe somebody added to VALID_MODES but didn't add a case here
277 |         throw new IllegalStateException( "Unknown mode \"" + mode + "\", check VALID_MODES" );
278 |       }
279 | 
280 |       // Print the results
281 |       for ( String id : ids ) {
282 |         out.println( id );
283 |       }
284 |     }
285 |     out.close();
286 |   }
287 | 
288 | }
289 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntry.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | /*
 6 |  * Log entries can have structure.
 7 |  * Sometimes the structure isn't known when log entries are first ingested, they may come in as raw strings.
 8 |  * The idea is that a log entry could be fed into a process and then a more specific log entry comes out.
 9 |  * This process could be repeated for even more specific or normalized entries.
10 |  * Ideally more evolved log entries can have the option of still referring back to their parent entries
11 |  * for auditing or so that rules can be rerun.
12 |  * Another issue is that some series of lines in a log file constitute a higher level log entry.
13 |  * Some of the strecture might be fixed text, whereas other items might be parameterizable.
14 |  * Eg:
15 |  *  &name=dave
16 |  *  &name=mark
17 |  *  &name=satish
18 |  *  -> "name" is a fixed identifier, whereas values can vary.
19 |  *  
20 |  *  My post on Stack Overflow:
21 |  *  http://stackoverflow.com/questions/26518770/advanced-requirements-for-log-file-utilities-am-i-reinventing-the-wheel
22 |  */
23 | interface LogEntry {
24 | 
25 |   String getRawText();
26 |   
27 |   Collection<LogEntryReference> getReferences();
28 |   // TODO: should setters be defined in Interface?
29 |   // void addReference( LogEntryReference ref );
30 | 
31 |   // getDate
32 |   // getPath
33 |   // getHandler
34 |   // getParamsString
35 |   // getParent
36 |   // getChildren
37 |   // getEntities
38 |   // getEventLevel // Info, warn, error, default
39 |   
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryBase.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.Collection;
 5 | 
 6 | public class LogEntryBase implements LogEntry {
 7 | 
 8 |   String rawText;
 9 |   Collection<LogEntryReference> references = new ArrayList<>();
10 | 
11 |   LogEntryBase( String rawText ) {
12 |     this.rawText = rawText;
13 |   }
14 | 
15 |   @Override
16 |   public String getRawText() {
17 |     return rawText;
18 |   }
19 |   public void setRawText( String rawText ) {
20 |     this.rawText = rawText;
21 |   }
22 | 
23 |   public static LogEntry logEntryFromString( String rawText ) {
24 |     return new LogEntryBase( rawText );
25 |   }
26 | 
27 | 
28 |   @Override
29 |   public Collection<LogEntryReference> getReferences() {
30 |     return references;
31 |   }
32 | 
33 |   // @Override
34 |   public void addReference(LogEntryReference ref) {
35 |     references.add( ref );
36 |   }
37 | 
38 |   /*
39 |    * Throw exception so that derived classes are allowed to do so
40 |    */
41 |   public static void main(String[] args) throws Exception {
42 |     for ( int i=0; i<args.length; i++ ) {
43 |       LogEntry entry = logEntryFromString( args[i] );
44 |       System.out.println( entry );
45 |     }
46 |   }
47 | 
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryFromSolr.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.logs;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.Collection;
  6 | import java.util.HashMap;
  7 | import java.util.LinkedHashMap;
  8 | import java.util.Map;
  9 | import java.util.Map.Entry;
 10 | import java.util.Set;
 11 | import java.util.TreeMap;
 12 | import java.util.regex.Matcher;
 13 | import java.util.regex.Pattern;
 14 | 
 15 | import com.lucidworks.dq.util.SetUtils;
 16 | import com.lucidworks.dq.util.StringUtils;
 17 | 
 18 | public class LogEntryFromSolr extends LogEntryBase {
 19 | 
 20 |   // Two types of lines:
 21 |   //
 22 |   // core.2014_10_10.log
 23 |   // 2014-10-10 00:05:50,525 INFO core.SolrCore - [DT_Products_CERT] webapp= path=/select params={pf=product_short_name^2+product_long_name^2+brand_name^1+sub_brand_name^1+tier1_names^1+tier2_names^1+tier3_names^1+tier4_names^1+product_description^2+text_all^1&sort=sales_rank+asc,&indent=false&q=*:*&qf=product_short_name^20+product_long_name^5+brand_name^10+sub_brand_name^10+tier1_names^5+tier2_names^5+tier3_names^5+tier4_names^5+product_description^2+text_all^1&q.alt=*:*&wt=json&fq=store_id:(32)&rows=30&defType=edismax} hits=24177 status=0 QTime=0  
 24 |   //
 25 |   // core.request.2014_10_13.log
 26 |   // 10.150.226.22 -  -  [13/Oct/2014:12:19:49 +0000] "GET /solr/DT_Products_CERT/select?q.alt=*:*&defType=edismax&q=%2A%3A%2A&fq=upc:(12364)&facet=true&facet.field=availability&f.availability.facet.sort=c&facet.field=ways_to_save&f.ways_to_save.facet.sort=c&facet.range=price_value&f.price_value.facet.range.start=0&f.price_value.facet.range.end=1000&f.price_value.facet.range.gap=50&facet.field=tier2_facets&f.tier2_facets.facet.sort=c&fl=upc%2Cupc_typ_ct%2Cid%2Cscore%2Cstore_id%2Cproduct_id%2Cproduct_short_name%2Cproduct_long_name%2Cproduct_description%2Cuom%2Cselling_size%2Cis_random_weight%2Cimages%2Cbrand_name%2Csub_brand_name%2Cbrand_is_meijer%2Cis_active%2Cis_sellable%2Cis_killed%2Cis_click_and_collect%2Cis_ship_to_home%2Cis_quick_shop%2Cis_alcohol%2Cis_tobacco%2Cis_age_restricted%2Cis_hazardous_material%2Cis_prepared_item%2Cis_organic%2Cis_primary_upc%2Cprice_text%2Cprice_value%2Csale_price_value%2Csale_price_text%2Csavings_value%2Csavings_value_text%2Chas_mperks_ofers%2Cis_local%2Ccool%2Cnutr_calories%2Cnutr_protien%2Cnutr_carbs%2Cmutr_fat%2Clast_updated%2Clast_updated_by%2Cweight_each%2Chot_item_flag%2Cis_substitutable%2Ctier1_ids%2Ctier2_ids%2Ctier3_ids%2Ctier4_ids%2Ctier1_facets%2Ctier2_facets%2Ctier3_facets%2Ctier4_facets%2Ctier1_names%2Ctier2_names%2Ctier3_names%2Ctier4_names%2Cmperks_offers&sort=sales_rank%20asc,&qf=product_short_name%5E20%20product_long_name%5E5%20brand_name%5E10%20sub_brand_name%5E10%20tier1_names%5E5%20tier2_names%5E5%20tier3_names%5E5%20tier4_names%5E5%20product_description%5E2%20text_all%5E1&pf=product_short_name%5E2%20product_long_name%5E2%20brand_name%5E1%20sub_brand_name%5E1%20tier1_names%5E1%20tier2_names%5E1%20tier3_names%5E1%20tier4_names%5E1%20product_description%5E2%20text_all%5E1&wt=json&indent=false HTTP/1.1" 200 2646  16
 27 |   
 28 |   // We overwrite rawText with params match
 29 |   String originalText;
 30 | 
 31 |   /*
 32 |    * When the constructor is called, we don't know for sure whether
 33 |    * this is a Solr log entry or not.
 34 |    * Although constructor is obligated to return an object,
 35 |    * the static factory will return null if we aren't.
 36 |    */
 37 |   boolean isSolrPattern = false;
 38 | 
 39 |   /*
 40 |    * If we're created from a less specific log entry
 41 |    */
 42 |   LogEntry earlierEntry;
 43 | 
 44 |   // Match params={q=...&...}
 45 |   static String PARAMS_PATTERN_STR = "params=[{]([^}]+)[}]";
 46 |   Pattern paramsPattern;
 47 |   Matcher paramsMatcher;
 48 | 
 49 |   // Match path=/handler_name
 50 |   // "Illegal repetition"
 51 |   static String HANDLER_PATTERN_STR = " path=([^ ]+) ";
 52 |   String handlerName;
 53 | 
 54 |   // LWS:  - [DT_Products_CERT] webapp=
 55 |   static String COLLECTION_PATTERN_STR = " - \\[([^\\]]+)\\] webapp=";
 56 |   String collectionName;
 57 | 
 58 |   // hits=19644 status=0 QTime=141
 59 |   static String HITS_PATTERN_STR = " hits=([0-9]+)";
 60 |   static String STATUS_PATTERN_STR = " status=([0-9]+)";
 61 |   static String QTIME_PATTERN_STR = " QTime=([0-9]+)";
 62 |   // Don't really need Longs here, but it's what utility returns
 63 |   Long hits;
 64 |   Long status;
 65 |   Long qTime;
 66 |   
 67 |   String paramsString;
 68 |   int paramsStart = -1;
 69 |   int paramsEnd = -1;
 70 |   
 71 |   Map<String,Collection<String>> parsedParamValues;
 72 | 
 73 |   // factory method
 74 |   public static LogEntry solrLogEntryFromBaseEntryOrNull( LogEntry entry ) {
 75 |     LogEntryFromSolr newEntry = new LogEntryFromSolr( entry );
 76 |     if ( newEntry.isSolrPattern() ) {
 77 |       return newEntry;
 78 |     }
 79 |     else {
 80 |       return null;
 81 |     }
 82 |   }
 83 | 
 84 |   LogEntryFromSolr( LogEntry entry ) {
 85 |     this( entry.getRawText() );
 86 |     this.earlierEntry = entry;
 87 |     init( entry.getRawText() );
 88 |   }
 89 |   LogEntryFromSolr(String rawText) {
 90 |     super( rawText );
 91 |     init( rawText );
 92 |   }
 93 |   // need init broken out so constructor1 can store earlierEntry before calling this
 94 |   void init( String rawText ) {
 95 |     this.originalText = rawText;
 96 |     paramsPattern = Pattern.compile( PARAMS_PATTERN_STR );
 97 |     paramsMatcher = paramsPattern.matcher( rawText );
 98 |     if ( paramsMatcher.find() ) {
 99 |       String matchStr = paramsMatcher.group();
100 |       setRawText( matchStr );
101 |       int overallStart = paramsMatcher.start();
102 |       int overallEnd = paramsMatcher.end();
103 | 
104 |       int group = 1;
105 |       paramsString = paramsMatcher.group( group );
106 |       paramsStart = paramsMatcher.start( group );
107 |       paramsEnd = paramsMatcher.end( group );
108 |       // Make relative to overall pattern match
109 |       paramsStart -= overallStart;
110 |       // paramsEnd = overallEnd - paramsEnd;
111 |       // Relative-to-end might not work in streaming apps since we wouldn't know where the end is
112 |       paramsEnd -= overallStart;
113 | 
114 |       
115 |       // TODO: look for other things like the handler, matches and qtime
116 |       
117 |       // Hookup references *if* we were created from an earlier log entry
118 |       if ( null != this.earlierEntry ) {
119 |         LogEntryReference ref = new LogEntryReferenceBase( this.earlierEntry, this, "LogEntryFromSolr" );
120 |         // ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( paramsStart, paramsEnd );
121 |         ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( overallStart, overallEnd );
122 |       }
123 | 
124 |       doSimpleFieldParsing();
125 | 
126 |       isSolrPattern = true;
127 |     }
128 |   }
129 | 
130 |   public String makeParamNamesKey() {
131 |     return StringUtils.join( getParsedSolrParams().keySet(), "|" );
132 |   }
133 |   public Set<String> getParamNames() {
134 |     return getParsedSolrParams().keySet();
135 |   }
136 |   public Collection<String> getParamValues( String paramName ) {
137 |     return getParsedSolrParams().get( paramName );
138 |   }
139 | 
140 |   public static Map<String,Long> tabulateQueryArgCombos( Collection<LogEntryFromSolr> entries ) {
141 |     Map<String,Long> counts = new HashMap<>();
142 |     for ( LogEntryFromSolr e : entries ) {
143 |       String key = e.makeParamNamesKey();
144 |       SetUtils.incrementMapCounter( counts, key );
145 |     }
146 |     return counts;
147 |   }
148 |   // { composite-parameter-key -> { each-parameter-name-> { unique-value: count } } }
149 |   public static Map<String,Map<String,Map<String,Long>>> tabulateQueryArgCombosAndValues( Collection<LogEntryFromSolr> entries ) {
150 |     // Level 1: by Composite Key
151 |     Map<String,Map<String,Map<String,Long>>> nestedCounts = new HashMap<>();
152 |     // Foreach Raw Entry
153 |     for ( LogEntryFromSolr e : entries ) {
154 | 
155 |       String overallKey = e.makeParamNamesKey();
156 |       // Level 2: by Parameter Name
157 |       Map<String,Map<String,Long>> paramsAndValues = null;
158 |       if ( nestedCounts.containsKey(overallKey) ) {
159 |         paramsAndValues = nestedCounts.get(overallKey);
160 |       }
161 |       else {
162 |         paramsAndValues = new TreeMap<>(); // LinkedHashMap<>();
163 |         nestedCounts.put( overallKey, paramsAndValues );
164 |       }
165 | 
166 |       Set<String> paramNames = e.getParamNames();
167 |       // Foreach Parameter Name
168 |       for ( String name : paramNames ) {
169 |         // Level 3: by Value
170 |         Map<String,Long> tabulatedValues = null;
171 |         if ( paramsAndValues.containsKey(name) ) {
172 |           tabulatedValues = paramsAndValues.get(name);
173 |         }
174 |         else {
175 |           tabulatedValues = new LinkedHashMap<>();
176 |           paramsAndValues.put( name, tabulatedValues );
177 |         }
178 |         Collection<String> rawValues = e.getParamValues( name );
179 |         for ( String rv : rawValues ) {
180 |           Long count = 0L;
181 |           if ( tabulatedValues.containsKey(rv) ) {
182 |             count = tabulatedValues.get(rv);
183 |           }
184 |           count += 1L;
185 |           tabulatedValues.put( rv, count );
186 |         }
187 | 
188 |       }  // End Foreach Parameter Name
189 |       
190 |     }  // End Foreach Raw Entry
191 | 
192 |     return nestedCounts;
193 |   }
194 | 
195 |   void doSimpleFieldParsing() {
196 |     parseHandlerName();
197 |     parseCollectionName();
198 |     parseHits();
199 |     parseStatus();
200 |     parseQTime();
201 |   }
202 |   void parseHandlerName() {
203 |     handlerName = StringUtils.parseAndCatchGroupAsStringOrNull( HANDLER_PATTERN_STR, getOriginalText(), 1 );
204 |   }
205 |   void parseCollectionName() {
206 |     collectionName = StringUtils.parseAndCatchGroupAsStringOrNull( COLLECTION_PATTERN_STR, getOriginalText(), 1 );
207 |   }
208 |   void parseHits() {
209 |     hits = StringUtils.parseAndCatchGroupAsLongOrNull( HITS_PATTERN_STR, getOriginalText(), 1 );
210 |   }
211 |   void parseStatus() {
212 |     status = StringUtils.parseAndCatchGroupAsLongOrNull( STATUS_PATTERN_STR, getOriginalText(), 1 );
213 |   }
214 |   void parseQTime() {
215 |     qTime = StringUtils.parseAndCatchGroupAsLongOrNull( QTIME_PATTERN_STR, getOriginalText(), 1 );
216 |   }
217 | 
218 |   // Not thread safe, but OK for now, for single thread utility
219 |   public Map<String,Collection<String>> getParsedSolrParams() {
220 |     if ( null==parsedParamValues ) {
221 |       parsedParamValues = StringUtils.parseCgiParameters( getParamsString() );
222 |     }
223 |     return parsedParamValues;
224 |   }
225 | 
226 |   public boolean isSolrPattern() {
227 |     return isSolrPattern;
228 |   }
229 |   
230 |   public String getParamsString() {
231 |     return paramsString;
232 |   }
233 | 
234 |   String getOriginalText() {
235 |     return originalText;
236 |   }
237 | 
238 |   String getHandlerName() {
239 |     return handlerName;
240 |   }
241 |   String getCollectionName() {
242 |     return collectionName;
243 |   }
244 |   // Don't really need Longs here, but it's what utility returns
245 |   /*
246 |    * get number of Matches
247 |    */
248 |   Long getHits() {
249 |     return hits;
250 |   }
251 |   /*
252 |    * Similar to HTTP Numeric Status Code
253 |    * Eg: 200, 500, etc.
254 |    */
255 |   Long getStatus() {
256 |     return status;
257 |   }
258 |   /*
259 |    * Query time in milliseconds
260 |    * may not include transmission time of payload to requesting client
261 |    */
262 |   Long getQTime() {
263 |     return qTime;
264 |   }
265 | 
266 |   public static void main(String[] args) throws IOException {
267 |     for ( int i=0; i<args.length; i++ ) {
268 | 
269 |       // Locate Files
270 |       LogFileRepo repo = new LogFileRepoBase( args[i] );
271 |       Collection<LogFile> logs = repo.findLogFiles();
272 |       for ( LogFile lf : logs ) {
273 |         lf.read();
274 |         Collection<LogEntry> rawEntries = lf.getEntries();
275 |         Collection<LogEntryFromSolr> solrEntries = new ArrayList<>();
276 |         for ( LogEntry rawEntry : rawEntries ) {
277 |           // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry );
278 |           LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry );
279 |           // if ( solrEntry.isSolrPattern() )
280 |           if ( null != solrEntry )
281 |           {
282 |             solrEntries.add( (LogEntryFromSolr) solrEntry );
283 |           }
284 |         }
285 | 
286 |         // Tabulate
287 |         Map<String,Long> queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries );
288 |         // composite-parameter-key -> each-parameter-name-> unique-value -> count
289 |         Map<String,Map<String,Map<String,Long>>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries );
290 |         queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts );
291 |         queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts );
292 | 
293 |         // Report
294 |         for ( Entry<String, Long> e1 : queryTypeCounts.entrySet() ) {
295 |           String queryType = e1.getKey();
296 |           Long queryTypeCount = e1.getValue();
297 |           System.out.println( "" + queryTypeCount + " " + queryType );
298 |           Map<String,Map<String,Long>> statsForQueryType = detailedStats.get( queryType );
299 |           for ( Entry<String, Map<String, Long>> e2 : statsForQueryType.entrySet() ) {
300 |             String paramName = e2.getKey();
301 |             System.out.println( "\t" + paramName + ":" );
302 |             Map<String, Long> paramValues = e2.getValue();
303 |             paramValues = SetUtils.sortMapByValues( paramValues );
304 |             paramValues = SetUtils.reverseMapEntryKeyOrder( paramValues );
305 |             for ( Entry<String, Long> e3 : paramValues.entrySet() ) {
306 |               String value = e3.getKey();
307 |               Long valueCount = e3.getValue();
308 |               System.out.println( "\t\t" + valueCount + " " + value );
309 |             }
310 |           }
311 |         }
312 |       }
313 |       // System.out.println( repo );
314 |     }
315 | 
316 |   }
317 | 
318 | }
319 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryGroup.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | /*
 6 |  * TODO: Do we really need this?
 7 |  * Pro: good abstraction, might developer additional features
 8 |  * Con: converting back and forth between this and Collection<LogEntry>
 9 |  */
10 | public interface LogEntryGroup /*extends Collection<LogEntry>*/ {
11 |   Collection<LogEntry> getEntries();
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryGroupFromSolr.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | public class LogEntryGroupFromSolr implements LogEntryGroup {
 6 | 
 7 |   @Override
 8 |   public Collection<LogEntry> getEntries() {
 9 |     // TODO Auto-generated method stub
10 |     return null;
11 |   }
12 | 
13 |   public static void main(String[] args) {
14 |     // TODO Auto-generated method stub
15 | 
16 |   }
17 | 
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryReference.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | public interface LogEntryReference {
 6 |   Collection<LogEntry> getEarlierEntries();
 7 |   Collection<LogEntry> getLaterEntries();
 8 |   //void addEarlierEntry( LogEntry entry );
 9 |   //void addLaterEntry( LogEntry entry );
10 | 
11 |   String getComment();
12 |   //void setComment( String comment );
13 | 
14 |   int getRelativeStart();
15 |   int getRelativeEnd();
16 |   //void setRelativeRegionOfInterest( int fromStart, int fromEnd );
17 |   //void setRelativeStart( int fromStart );
18 |   //void setRelativeEnd( int fromEnd );
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogEntryReferenceBase.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.Collection;
 5 | 
 6 | public class LogEntryReferenceBase implements LogEntryReference {
 7 | 
 8 |   String comment;
 9 |   // LogEntryGroup is approx Collection<LogEntry>
10 |   Collection<LogEntry> earlierEntries = new ArrayList<>();
11 |   Collection<LogEntry> laterEntries = new ArrayList<>();
12 | 
13 |   int relativeRegionOfInterestStart;
14 |   int relativeRegionOfInterestEnd;
15 |   
16 |   public LogEntryReferenceBase() { }
17 |   
18 |   public LogEntryReferenceBase( LogEntry earlierEntry, LogEntry laterEntry, String comment ) {
19 |     this();
20 |     // Link to log entries
21 |     addEarlierEntry( earlierEntry );
22 |     addLaterEntry( laterEntry );
23 |     // Link log entries back to us
24 |     ( (LogEntryBase)earlierEntry ).addReference( this );
25 |     ( (LogEntryBase)laterEntry ).addReference( this );
26 |     setComment( comment );
27 |   }
28 | 
29 |   @Override
30 |   public Collection<LogEntry> getEarlierEntries() {
31 |     return earlierEntries;
32 |   }
33 |   public void addEarlierEntry( LogEntry entry ) {
34 |     earlierEntries.add( entry );
35 |   }
36 | 
37 |   @Override
38 |   public Collection<LogEntry> getLaterEntries() {
39 |     return laterEntries;
40 |   }
41 |   public void addLaterEntry( LogEntry entry ) {
42 |     laterEntries.add( entry );
43 |   }
44 | 
45 |   @Override
46 |   public String getComment() {
47 |     return comment;
48 |   }
49 |   public void setComment( String comment ) {
50 |     this.comment = comment;
51 |   }
52 | 
53 |   @Override
54 |   public int getRelativeStart() {
55 |     return relativeRegionOfInterestStart;
56 |   }
57 |   @Override
58 |   public int getRelativeEnd() {
59 |     return relativeRegionOfInterestEnd;
60 |   }
61 |   //@Override
62 |   public void setRelativeRegionOfInterest( int fromStart, int fromEnd ) {
63 |     relativeRegionOfInterestStart = fromStart;
64 |     relativeRegionOfInterestEnd = fromEnd;
65 |   }
66 |   //@Override
67 |   public void setRelativeStart( int fromStart ) {
68 |     this.relativeRegionOfInterestStart = fromStart;
69 |   }
70 |   //@Override
71 |   public void setRelativeEnd( int fromEnd ) {
72 |     this.relativeRegionOfInterestEnd = fromEnd;
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFile.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | 
 6 | public interface LogFile extends LogEntryGroup {
 7 | 
 8 |   void read() throws IOException;
 9 |   
10 |   // Inherits getEntries() from super
11 | 
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileBase.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileInputStream;
 6 | import java.io.FileNotFoundException;
 7 | import java.io.IOException;
 8 | import java.io.InputStreamReader;
 9 | import java.io.UnsupportedEncodingException;
10 | import java.util.ArrayList;
11 | import java.util.Collection;
12 | 
13 | public class LogFileBase implements LogFile {
14 |   
15 |   // TODO: could leave this NULL until they've called .process() ?
16 |   Collection<LogEntry> entries = new ArrayList<>();
17 |   File sourceFile;
18 | 
19 |   // Public "factory" methods
20 |   public static LogFile logFileFromDiskFile( File inFile ) throws IOException {
21 |     return new LogFileBase( inFile );
22 |   }
23 |   public static LogFile logFileFromDiskFile( String fileName ) throws IOException {
24 |     return new LogFileBase( new File(fileName) );
25 |   }
26 | 
27 |   LogFileBase( File sourceFile ) {
28 |     this.sourceFile = sourceFile;
29 |   }
30 | 
31 |   // Break out processing logic out from constructor
32 |   // in case we want to defer it
33 |   @Override
34 |   public void read() throws IOException {
35 |     BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(sourceFile), "UTF-8"));
36 |     while( true ) {
37 |         String line = in.readLine();
38 |         if ( null==line ) {
39 |             break;
40 |         }
41 |         LogEntry entry = LogEntryBase.logEntryFromString( line );
42 |         entries.add( entry );
43 |     }
44 |     in.close();
45 |   }
46 |   
47 |   @Override
48 |   public Collection<LogEntry> getEntries() {
49 |     return entries;
50 |   }
51 | 
52 | 
53 |   public static void main(String[] args) throws IOException {
54 |     for ( int i=0; i<args.length; i++ ) {
55 |       LogFile entry = logFileFromDiskFile( args[i] );
56 |       System.out.println( entry );
57 |     }
58 |   }
59 | 
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileFromSolr.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.io.File;
 4 | 
 5 | public class LogFileFromSolr extends LogFileBase {
 6 | 
 7 |   LogFileFromSolr(File sourceFile) {
 8 |     super(sourceFile);
 9 | 
10 |   }
11 | 
12 |   public static void main(String[] args) {
13 | 
14 |   }
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileRepo.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.logs;
 2 | 
 3 | import java.io.File;
 4 | import java.util.Collection;
 5 | 
 6 | public interface LogFileRepo {
 7 |   // TODO: handle streaming, via socket, and one-at-a-time iteration
 8 |   // Actually these could be considered *candidate* log files...
 9 |   //Collection<LogFile> findLogFiles( File startingDirOrFile );
10 |   //Collection<LogFile> findLogFiles( Collection<File> startingDirOrFiles );
11 |   Collection<LogFile> findLogFiles();
12 | 
13 |   // TODO: maybe Log *File* Repo is a filesystem impl of a more generic Log Unit Source Repo
14 |   // TODO: although we really do need setters, should they be defined in the interface?
15 |   String getIncludePattern();
16 |   void setIncludePattern( String pattern );
17 |   boolean getIncludeCompressedFiles();
18 |   void setIncludeCompressedFiles( boolean flag );
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/logs/LogFileRepoBase.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.logs;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.util.ArrayList;
  6 | import java.util.Collection;
  7 | import java.util.Map;
  8 | import java.util.Map.Entry;
  9 | import java.util.Queue;
 10 | import java.util.concurrent.ConcurrentLinkedQueue;
 11 | 
 12 | import com.lucidworks.dq.util.SetUtils;
 13 | 
 14 | public class LogFileRepoBase implements LogFileRepo {
 15 | 
 16 |   Collection<File> myQueue = new ConcurrentLinkedQueue<>();
 17 |  
 18 |   File startingDirOrFile;
 19 | 
 20 |   // Regex, Optional
 21 |   String includePattern;
 22 |   
 23 |   boolean shouldIncludeCompressedFiles;
 24 | 
 25 |   public LogFileRepoBase( String startingDirOrFile ) {
 26 |     this( new File(startingDirOrFile) );
 27 |   }
 28 |   public LogFileRepoBase( File startingDirOrFile ) {
 29 |     this.startingDirOrFile = startingDirOrFile;    
 30 |   }
 31 | 
 32 |   @Override
 33 |   public Collection<LogFile> findLogFiles() {
 34 |     traverse( myQueue, startingDirOrFile );
 35 |     Collection<LogFile> outList = new ArrayList<>();
 36 |     for ( File f : myQueue ) {
 37 |       LogFile lf = new LogFileBase( f );
 38 |       outList.add( lf );
 39 |     }
 40 |     return outList;
 41 |   }
 42 | 
 43 |   @Override
 44 |   public void setIncludePattern(String pattern) {
 45 |     this.includePattern = pattern;
 46 |   }
 47 |   @Override
 48 |   public String getIncludePattern() {
 49 |     return includePattern;
 50 |   }
 51 | 
 52 |   @Override
 53 |   public void setIncludeCompressedFiles(boolean flag) {
 54 |     this.shouldIncludeCompressedFiles = flag;
 55 |   }
 56 |   @Override
 57 |   public boolean getIncludeCompressedFiles() {
 58 |     return shouldIncludeCompressedFiles;
 59 |   }
 60 |   
 61 |   //Lookup all the files
 62 |   //traverse( myQueue, "someDirName", null );
 63 |   //Or simpler
 64 |   //Collection files = LinkedHashSet<File>();
 65 |   //traverse( files, "someDirName", null );
 66 | 
 67 |   //TODO: would be better to pass in method to call
 68 |   void traverse( Collection<File>queue, String startDir ) {
 69 |     traverse( queue, new File(startDir) );
 70 |   }
 71 |   void traverse( Collection<File>queue, File candidate ) {
 72 |     if( candidate.isFile() ) {
 73 |       if ( null==getIncludePattern() || candidate.toString().matches(getIncludePattern()) ) {
 74 |         queue.add( candidate );
 75 |       }
 76 |     }
 77 |     // Else probably a directory
 78 |     else if ( candidate.isDirectory() ) {
 79 |       File [] entries = candidate.listFiles();
 80 |       for ( File f : entries ) {
 81 |         traverse( queue, f );
 82 |       }
 83 |     }
 84 |     else {
 85 |       System.out.println( "ERROR: Neither file nor directory: " + candidate );
 86 |     }
 87 |   }
 88 | 
 89 |   public static void main(String[] args) throws IOException {
 90 |     // Moved to LogEntryFromSolr main
 91 | 
 92 | //    for ( int i=0; i<args.length; i++ ) {
 93 | //      LogFileRepo repo = new LogFileRepoBase( args[i] );
 94 | //      Collection<LogFile> logs = repo.findLogFiles();
 95 | //      for ( LogFile lf : logs ) {
 96 | //        lf.read();
 97 | //        Collection<LogEntry> rawEntries = lf.getEntries();
 98 | //        Collection<LogEntryFromSolr> solrEntries = new ArrayList<>();
 99 | //        for ( LogEntry rawEntry : rawEntries ) {
100 | //          // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry );
101 | //          LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry );
102 | //          // if ( solrEntry.isSolrPattern() )
103 | //          if ( null != solrEntry )
104 | //          {
105 | //            solrEntries.add( (LogEntryFromSolr) solrEntry );
106 | //          }
107 | //        }
108 | //        Map<String,Long> queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries );
109 | //        // composite-parameter-key -> each-parameter-name-> unique-value -> count
110 | //        Map<String,Map<String,Map<String,Long>>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries );
111 | //        queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts );
112 | //        queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts );
113 | //        for ( Entry<String, Long> e1 : queryTypeCounts.entrySet() ) {
114 | //          String queryType = e1.getKey();
115 | //          Long queryTypeCount = e1.getValue();
116 | //          System.out.println( "" + queryTypeCount + " " + queryType );
117 | //          Map<String,Map<String,Long>> statsForQueryType = detailedStats.get( queryType );
118 | //          for ( Entry<String, Map<String, Long>> e2 : statsForQueryType.entrySet() ) {
119 | //            String paramName = e2.getKey();
120 | //            System.out.println( "\t" + paramName + ":" );
121 | //            Map<String, Long> paramValues = e2.getValue();
122 | //            paramValues = SetUtils.sortMapByValues( paramValues );
123 | //            paramValues = SetUtils.reverseMapEntryKeyOrder( paramValues );
124 | //            for ( Entry<String, Long> e3 : paramValues.entrySet() ) {
125 | //              String value = e3.getKey();
126 | //              Long valueCount = e3.getValue();
127 | //              System.out.println( "\t\t" + valueCount + " " + value );
128 | //            }
129 | //          }
130 | //        }
131 | //      }
132 | //      // System.out.println( repo );
133 | //    }
134 | 
135 |   }
136 |   
137 |   
138 | }
139 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/Schema.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.schema;
 2 | 
 3 | import java.util.Map;
 4 | import java.util.Set;
 5 | 
 6 | public interface Schema {
 7 | 
 8 |   // TODO: move throws Exception down to implementation level
 9 |   // and errors buffer
10 | 
11 |   public float getSchemaVersion() throws Exception;
12 | 
13 |   public String getSchemaName() throws Exception;
14 | 
15 |   public String getUniqueKeyFieldName() throws Exception;
16 | 
17 |   public String getSimilarityModelClassName() throws Exception;
18 | 
19 |   public String getDefaultOperator() throws Exception;
20 | 
21 |   public String getDefaultSearchField() throws Exception;
22 | 
23 |   public Map<String, Set<String>> getAllDeclaredAndDynamicFieldsByType() throws Exception;
24 | 
25 |   public Set<String> getAllSchemaFieldNames() throws Exception;
26 | 
27 |   public Set<String> getAllDynamicFieldPatterns() throws Exception;
28 | 
29 |   public Set<String> getAllFieldTypeNames() throws Exception;
30 | 
31 |   public Set<String> getAllCopyFieldSourceNames() throws Exception;
32 | 
33 |   public Set<String> getAllCopyFieldDestinationNames() throws Exception;
34 | 
35 |   public Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception;
36 | 
37 |   public Set<String> getCopyFieldSourcesForDestination(String destName) throws Exception;
38 | 
39 |   public String generateReport() throws Exception;
40 | 
41 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SchemaBase.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.schema;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.util.LinkedHashSet;
  6 | import java.util.Map;
  7 | import java.util.Set;
  8 | 
  9 | public abstract class SchemaBase implements Schema {
 10 | 
 11 |   // Also helpful for debugging code
 12 |   @Override
 13 |   public String generateReport() throws Exception {
 14 |     StringWriter sw = new StringWriter();
 15 |     PrintWriter out = new PrintWriter(sw);
 16 | 
 17 |     // Singular Values
 18 |     String name = getSchemaName();
 19 |     out.println( "Schema Name: " + name );
 20 |     float vers = getSchemaVersion();
 21 |     out.println("Schema Version: " + vers);
 22 |     String key = getUniqueKeyFieldName();
 23 |     out.println( "Key Field: " + key );
 24 |     String defOp = getDefaultOperator();
 25 |     out.println( "Default Operator: " + defOp );
 26 |     String sim = getSimilarityModelClassName();
 27 |     out.println( "Similarity Class Name: " + sim );
 28 |     String defField = getDefaultSearchField();
 29 |     out.println( "Default Search Field: " + defField );
 30 | 
 31 |     // Complex Values
 32 |     Set<String> fields = getAllSchemaFieldNames();
 33 |     out.println();
 34 |     out.println( "Fields: " + fields );
 35 | 
 36 |     Set<String> dynFields = getAllDynamicFieldPatterns();
 37 |     out.println();
 38 |     out.println( "Dynamic Field Patterns: " + dynFields );
 39 | 
 40 |     Set<String> typeNames = getAllFieldTypeNames();
 41 |     out.println();
 42 |     out.println( "Types: " + typeNames );
 43 | 
 44 |     Map<String, Set<String>> typesAndNames = getAllDeclaredAndDynamicFieldsByType();
 45 |     out.println();
 46 |     out.println( "Type -> Fields: (declared and dynamic patterns)" );
 47 |     out.println( "\t(" + typesAndNames.size() + " types)" );
 48 |     for ( String type : typesAndNames.keySet() ) {
 49 |       out.println( "\t" + type + ":" );
 50 |       Set<String> typeFields = typesAndNames.get( type );
 51 |       out.println( "\t\t(" + typeFields.size() + " fields)" );
 52 |       for ( String field : typeFields ) {
 53 |         out.println( "\t\t" + field );        
 54 |       }
 55 |     }
 56 |     
 57 |     
 58 |     Set<String> sourceNames = getAllCopyFieldSourceNames();
 59 |     out.println();
 60 |     out.println( "Copy Sources: " + sourceNames );
 61 |     for ( String source : sourceNames ) {
 62 |       Set<String> tmpDests = getCopyFieldDestinationsForSource(source);
 63 |       out.println( "\tFrom: '"+ source + "' To " + tmpDests );
 64 |     }
 65 | 
 66 |     Set<String> destNames = getAllCopyFieldDestinationNames();
 67 |     out.println();
 68 |     out.println( "Copy Destinations: " + destNames );
 69 |     for ( String dest : destNames ) {
 70 |       Set<String> tmpSrcs = getCopyFieldSourcesForDestination( dest );
 71 |       out.println( "\tDest: '"+ dest + "' From " + tmpSrcs );
 72 |     }
 73 | 
 74 |     String outStr = sw.toString();
 75 |     return outStr;
 76 |   }
 77 |   
 78 |   static void utilTabulateFieldTypeAndName( Map<String, Set<String>> map, String type, String name ) {
 79 |     if ( map.containsKey(type) ) {
 80 |       map.get(type).add( name );
 81 |     }
 82 |     else {
 83 |       Set<String> vector = new LinkedHashSet<>();
 84 |       vector.add( name );
 85 |       map.put( type, vector );
 86 |     }   
 87 |   }
 88 | 
 89 |   @Override
 90 |   public abstract float getSchemaVersion() throws Exception;
 91 |   @Override
 92 |   public abstract String getSchemaName() throws Exception;
 93 |   @Override
 94 |   public abstract String getUniqueKeyFieldName() throws Exception;
 95 |   @Override
 96 |   public abstract String getSimilarityModelClassName() throws Exception;
 97 |   @Override
 98 |   public abstract String getDefaultOperator() throws Exception;
 99 |   @Override
100 |   public abstract String getDefaultSearchField() throws Exception;
101 |   @Override
102 |   public abstract Set<String> getAllSchemaFieldNames() throws Exception;
103 |   @Override
104 |   public abstract Set<String> getAllDynamicFieldPatterns() throws Exception;
105 |   @Override
106 |   public abstract Set<String> getAllFieldTypeNames() throws Exception;
107 |   @Override
108 |   public abstract Set<String> getAllCopyFieldSourceNames() throws Exception;
109 |   @Override
110 |   public abstract Set<String> getAllCopyFieldDestinationNames() throws Exception;
111 |   @Override
112 |   public abstract Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception;
113 |   @Override
114 |   public abstract Set<String> getCopyFieldSourcesForDestination(String destName) throws Exception;
115 | 
116 | }
117 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SchemaFromLocalCore_broken.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.schema;
  2 | 
  3 | import java.util.LinkedHashMap;
  4 | import java.util.LinkedHashSet;
  5 | import java.util.List;
  6 | import java.util.Map;
  7 | import java.util.Map.Entry;
  8 | import java.util.Properties;
  9 | import java.util.Set;
 10 | 
 11 | import org.apache.solr.common.util.NamedList;
 12 | import org.apache.solr.core.ConfigSolr;
 13 | import org.apache.solr.core.ConfigSolrXmlOld;
 14 | import org.apache.solr.core.CoreContainer;
 15 | import org.apache.solr.core.SolrCore;
 16 | import org.apache.solr.core.SolrResourceLoader;
 17 | import org.apache.solr.request.LocalSolrQueryRequest;
 18 | import org.apache.solr.request.SolrQueryRequest;
 19 | import org.apache.solr.schema.CopyField;
 20 | import org.apache.solr.schema.FieldType;
 21 | import org.apache.solr.schema.IndexSchema;
 22 | import org.apache.solr.schema.IndexSchema.DynamicField;
 23 | import org.apache.solr.schema.SchemaField;
 24 | 
 25 | public class SchemaFromLocalCore_broken extends SchemaBase implements Schema {
 26 | 
 27 |   static String PATH1 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example";
 28 |   static String PATH2 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr";
 29 |   static String PATH3 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr/collection1";
 30 | 
 31 |   private IndexSchema schema;
 32 |   
 33 |   public SchemaFromLocalCore_broken( String path, String optCoreName ) {
 34 |     // TODO: currently broken, touble finding info online, postponing for now
 35 |     SolrResourceLoader loader = new SolrResourceLoader( path );
 36 |     String confDir = loader.getConfigDir();
 37 |     String dataDir = loader.getDataDir();
 38 |     String instanceDir = loader.getInstanceDir();
 39 |     Properties props = loader.getCoreProperties();
 40 |     System.out.println( "path = " + path );
 41 |     System.out.println( "confDir = " + confDir );
 42 |     System.out.println( "dataDir = " + dataDir );
 43 |     System.out.println( "instanceDir = " + instanceDir );
 44 |     System.out.println( "props = " + props );
 45 |     ConfigSolr config = ConfigSolr.fromSolrHome( loader, path );
 46 |     CoreContainer container = new CoreContainer( loader, config );
 47 |     if ( container.getCores().isEmpty() ) {
 48 |       throw new IllegalArgumentException( "No cores found at " + path );
 49 |     }
 50 |     String coreName = optCoreName!=null ? optCoreName : ConfigSolrXmlOld.DEFAULT_DEFAULT_CORE_NAME;
 51 |     SolrCore core = container.getCore( coreName );
 52 |     if ( null==core ) {
 53 |       throw new IllegalArgumentException( "Unable to find core \"" + coreName + "\" at " + path );
 54 |     }
 55 |     // SolrQueryRequest req = new LocalSolrQueryRequest( core, "*:*", null, 0, 0, null );
 56 |     NamedList args = new NamedList();
 57 |     SolrQueryRequest req = new LocalSolrQueryRequest( core, args );
 58 |     schema = req.getSchema();
 59 |   };
 60 | 
 61 |   public float getSchemaVersion() throws Exception {
 62 |     return schema.getVersion();
 63 |   }
 64 | 
 65 |   public String getSchemaName() throws Exception {
 66 |     return schema.getSchemaName();
 67 |   }
 68 | 
 69 |   public String getUniqueKeyFieldName() throws Exception {
 70 |     return schema.getUniqueKeyField().getName();
 71 |   }
 72 | 
 73 |   public String getSimilarityModelClassName() throws Exception {
 74 |     return schema.getSimilarity().getClass().getName();
 75 |   }
 76 | 
 77 |   // TODO: not sure where this comes from
 78 |   public String getDefaultOperator() throws Exception {
 79 |     return null;
 80 |   }
 81 | 
 82 |   public String getDefaultSearchField() throws Exception {
 83 |     return schema.getDefaultSearchFieldName();
 84 |   }
 85 | 
 86 |   public Map<String, Set<String>> getAllDeclaredAndDynamicFieldsByType() {
 87 |     Map<String, Set<String>> out = new LinkedHashMap<>();
 88 |     return out;
 89 |     //return null;
 90 |   }
 91 | 
 92 |   public Set<String> getAllSchemaFieldNames() throws Exception {
 93 |     Map<String, SchemaField> fields = schema.getFields();
 94 |     return fields.keySet();
 95 |     // return new LinkedHashSet<>( fields.keySet() );
 96 |   }
 97 | 
 98 |   public Set<String> getAllDynamicFieldPatterns() throws Exception {
 99 |     DynamicField[] dynFields = schema.getDynamicFields();
100 |     Set<String> out = new LinkedHashSet<>();
101 |     for ( DynamicField df : dynFields ) {
102 |       out.add( df.getRegex() );
103 |     }
104 |     return out;
105 |   }
106 | 
107 |   public Set<String> getAllFieldTypeNames() throws Exception {
108 |     Map<String, FieldType> types = schema.getFieldTypes();
109 |     return types.keySet();
110 |   }
111 | 
112 |   public Set<String> getAllCopyFieldSourceNames() throws Exception {
113 |     Map<String, List<CopyField>> copyMap = schema.getCopyFieldsMap();
114 |     return copyMap.keySet();
115 |   }
116 | 
117 |   public Set<String> getAllCopyFieldDestinationNames() throws Exception {
118 |     Set<String> out = new LinkedHashSet<>();
119 |     Map<String, List<CopyField>> copyMap = schema.getCopyFieldsMap();
120 |     for ( Entry<String, List<CopyField>> copyEntry : copyMap.entrySet() ) {
121 |       // String srcFieldName = copyEntry.getKey();
122 |       List<CopyField> copyList = copyEntry.getValue();
123 |       for ( CopyField cf : copyList ) {
124 |         SchemaField destField = cf.getDestination();
125 |         out.add( destField.getName() );
126 |       }
127 |     }
128 |     return out;
129 |   }
130 | 
131 |   public Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception {
132 |     Set<String> out = new LinkedHashSet<>();
133 |     List<CopyField> copyList = schema.getCopyFieldsList( sourceName );
134 |     if ( null==copyList || copyList.isEmpty() ) {
135 |       return out;
136 |     }
137 |     for ( CopyField cf : copyList ) {
138 |       SchemaField destField = cf.getDestination();
139 |       out.add( destField.getName() );
140 |     }
141 |     return out;
142 |   }
143 | 
144 |   public Set<String> getCopyFieldSourcesForDestination(String targetDestName) throws Exception {
145 |     Set<String> out = new LinkedHashSet<>();
146 |     Map<String, List<CopyField>> copyMap = schema.getCopyFieldsMap();
147 |     for ( Entry<String, List<CopyField>> copyEntry : copyMap.entrySet() ) {
148 |       String srcFieldName = copyEntry.getKey();
149 |       List<CopyField> copyList = copyEntry.getValue();
150 |       for ( CopyField cf : copyList ) {
151 |         SchemaField destField = cf.getDestination();
152 |         String destFieldName = destField.getName();
153 |         if ( destFieldName.equals(targetDestName) ) {
154 |           out.add( srcFieldName );
155 |         }
156 |       }
157 |     }
158 |     return out;
159 |   }
160 | 
161 |   // public String generateReport() throws Exception;
162 | 
163 |   public static void main( String[] argv ) throws Exception {
164 |     Schema schema = new SchemaFromLocalCore_broken( PATH3, null );
165 |     schema.generateReport();
166 |   }
167 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfig.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.schema;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | import javax.xml.xpath.XPathExpressionException;
 6 | 
 7 | public interface SolrConfig {
 8 | 
 9 |   public String generateReport() throws Exception;
10 | 
11 |   // Can't return float, could be const or config
12 |   public String getLuceneMatchVersion() throws Exception;
13 | 
14 |   // Can't return bool, could be const or config
15 |   public String getAbortOnConfigurationError() throws Exception;
16 | 
17 |   public Collection<String> getRequestHandlers() throws Exception;
18 | 
19 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfigBase.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.schema;
 2 | 
 3 | import java.io.PrintWriter;
 4 | import java.io.StringWriter;
 5 | import java.util.Collection;
 6 | 
 7 | public abstract class SolrConfigBase implements SolrConfig {
 8 | 
 9 |   @Override
10 |   public String generateReport() throws Exception {
11 |     StringWriter sw = new StringWriter();
12 |     PrintWriter out = new PrintWriter(sw);
13 | 
14 |     // Singular Values
15 | 
16 |     String version = getLuceneMatchVersion();
17 |     out.println( "Lucene Match Version = " + version );
18 |     String abort = getAbortOnConfigurationError();
19 |     out.println( "Abort on config error = " + abort );
20 |     
21 |     // Complex Values
22 | 
23 |     Collection<String> handlers = getRequestHandlers();
24 |     out.println();
25 |     out.println( "Request Handlers and Classes:" );
26 |     for ( String handler : handlers ) {
27 |       out.println( "\t" + handler );      
28 |     }
29 | 
30 |     String outStr = sw.toString();
31 |     return outStr;
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/schema/SolrConfigFromXml.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.schema;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.IOException;
  6 | import java.io.InputStream;
  7 | import java.net.URL;
  8 | import java.util.ArrayList;
  9 | import java.util.Collection;
 10 | import java.util.LinkedHashSet;
 11 | import java.util.Set;
 12 | 
 13 | import javax.xml.parsers.DocumentBuilder;
 14 | import javax.xml.parsers.DocumentBuilderFactory;
 15 | import javax.xml.parsers.ParserConfigurationException;
 16 | import javax.xml.xpath.XPath;
 17 | import javax.xml.xpath.XPathConstants;
 18 | import javax.xml.xpath.XPathExpressionException;
 19 | import javax.xml.xpath.XPathFactory;
 20 | 
 21 | import org.w3c.dom.Document;
 22 | import org.w3c.dom.NamedNodeMap;
 23 | import org.w3c.dom.NodeList;
 24 | import org.xml.sax.SAXException;
 25 | import org.w3c.dom.Node;
 26 | 
 27 | public class SolrConfigFromXml extends SolrConfigBase implements SolrConfig {
 28 |   // get from resources folder
 29 |   static String CONFIG_FILE_NAME = "solrconfig-480.xml";
 30 |   
 31 |   Document document;
 32 |   XPathFactory xpathFactory = XPathFactory.newInstance();
 33 |   private final String prefix = null;
 34 |   private final String name = "";
 35 | 
 36 |   // Note: Some of this code was copied from:
 37 |   // * Solr's IndexSchema.java
 38 |   // * Solr's Config.java
 39 | 
 40 | 
 41 |   public SolrConfigFromXml() throws ParserConfigurationException, IOException, SAXException {
 42 |     // this( SCHEMA_FILE_NAME );
 43 |     //URL schemaPath = this.getClass().getResource( CONFIG_FILE_NAME );
 44 |     //init( schemaPath );
 45 |     init( (URL) null );
 46 |   }
 47 |   public SolrConfigFromXml( File schemaPath ) throws ParserConfigurationException, SAXException, IOException {
 48 |     // URI uri = schemaPath.toURI();
 49 |     // URL url = uri.toURL();
 50 |     // init( url );
 51 |     InputStream is = new FileInputStream( schemaPath );
 52 |     init( is );
 53 |   }
 54 |   public SolrConfigFromXml( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
 55 |     init( schemaPath );
 56 |   }
 57 |   void init( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
 58 |     if ( null==schemaPath ) {
 59 |       schemaPath = this.getClass().getClassLoader().getResource( CONFIG_FILE_NAME );
 60 |     }
 61 |     InputStream is = schemaPath.openConnection().getInputStream();
 62 |     init( is );
 63 |   }
 64 |   void init( InputStream in ) throws ParserConfigurationException, SAXException, IOException {
 65 |     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
 66 |     DocumentBuilder builder = factory.newDocumentBuilder();
 67 |     this.document = builder.parse( in );
 68 |     xpathFactory = XPathFactory.newInstance();
 69 |   }
 70 | 
 71 |   // Can't return float, could be const or config
 72 |   /* (non-Javadoc)
 73 |    * @see com.lucidworks.dq.schema.SolrConfig#getLuceneMatchVersion()
 74 |    */
 75 |   @Override
 76 |   public String getLuceneMatchVersion() throws Exception {
 77 |     XPath xpath = xpathFactory.newXPath();
 78 |     // "/config/luceneMatchVersion"
 79 |     String expression = stepsToPath(CONFIG, LUCENE_VERSION);
 80 |     // float version = getFloat(expression, 0.0f);
 81 |     Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
 82 |     String payload = null;
 83 |     if ( null!=nd ) {
 84 |       // payload = nd.getNodeValue();
 85 |       payload = nd.getTextContent();
 86 |     }
 87 |     return payload;
 88 |   }
 89 | 
 90 |   // Can't return bool, could be const or config
 91 |   /* (non-Javadoc)
 92 |    * @see com.lucidworks.dq.schema.SolrConfig#getAbortOnConfigurationError()
 93 |    */
 94 |   @Override
 95 |   public String getAbortOnConfigurationError() throws Exception {
 96 |     XPath xpath = xpathFactory.newXPath();
 97 |     // "/config/abortOnConfigurationError"
 98 |     String expression = stepsToPath(CONFIG, ABORT);
 99 |     Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
100 |     String payload = null;
101 |     if ( null!=nd ) {
102 |       payload = nd.getTextContent();
103 |     }
104 |     return payload;
105 |   }
106 |   
107 |   // TODO: getLibs: <lib dir="../../dist/" regex="apache-solr-dataimporthandler-\d.*\.jar" />
108 |   // TODO: getDataDir: <dataDir>${solr.data.dir:}</dataDir>
109 |   // TODO: getDirectoryFactory: <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
110 |   // TODO: getIndexConfig (nested!): <indexConfig>
111 |   // TODO: <jmx />
112 |   // TODO: <updateHandler class="solr.DirectUpdateHandler2">
113 |   // TODO: nested <indexReaderFactory name="IndexReaderFactory" class="package.class">
114 |   // TODO: Nested: <query>
115 |   // TODO: Nested: <requestDispatcher>
116 |   // * TODO: Request Handlers, Nested!
117 |   //   <requestHandler name="/select" class="solr.SearchHandler">
118 |   //   <requestHandler name="/browse" class="solr.SearchHandler">
119 |   //   <requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
120 |   //   <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler" />
121 |   //   <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
122 |   //   <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler" startup="lazy" />
123 |   //   <requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler" >
124 |   //   <requestHandler name="/update/xslt" startup="lazy" class="solr.XsltUpdateRequestHandler"/>
125 |   // Parts copied from Solr's IndexSchema .loadFields
126 |   /* (non-Javadoc)
127 |    * @see com.lucidworks.dq.schema.SolrConfig#getRequestHandlers()
128 |    */
129 |   @Override
130 |   public Collection<String> getRequestHandlers() throws XPathExpressionException {
131 |     Collection<String> out = new ArrayList<>();
132 |     XPath xpath = xpathFactory.newXPath();
133 |     // /schema/fields/field | /schema/fields/dynamicField
134 |     // | /schema/field | /schema/dynamicField
135 |     // Note: could remove OR and eliminate node name check, but this is closer to Solr code
136 |     String expression = stepsToPath(CONFIG, HANDLER);
137 |     NodeList nodes = (NodeList)xpath.evaluate(expression, document, XPathConstants.NODESET);
138 |     for (int i=0; i<nodes.getLength(); i++) {
139 |       Node node = nodes.item(i);
140 |       NamedNodeMap attrs = node.getAttributes();
141 |       String name = getAttr(attrs, NAME, "handler_name" );
142 |       String classStr = getAttr(attrs, CLASS, "class_name" );
143 |       String key = "" + name + ":" + classStr;
144 |       out.add( key );
145 |     } 
146 |     return out;
147 |   }
148 | 
149 |   // from solr/core/src/java/org/apache/solr/schema/IndexSchema.java
150 |   /**
151 |    * Converts a sequence of path steps into a rooted path, by inserting slashes in front of each step.
152 |    * @param steps The steps to join with slashes to form a path
153 |    * @return a rooted path: a leading slash followed by the given steps joined with slashes
154 |    * Copied from Solr's IndexSchema.java
155 |    */
156 |   private String stepsToPath(String... steps) {
157 |     StringBuilder builder = new StringBuilder();
158 |     for (String step : steps) { builder.append(SLASH).append(step); }
159 |     return builder.toString();
160 |   } 
161 | 
162 |   // Copied from Solr's Config.java
163 |   float getFloat(String path) throws NumberFormatException, Exception {
164 |     return Float.parseFloat(getVal(path, true));
165 |   }
166 |   // Copied from Solr's Config.java
167 |   float getFloat(String path, float def) throws Exception {
168 |     String val = getVal(path, false);
169 |     return val!=null ? Float.parseFloat(val) : def;
170 |   }
171 |   String getVal(String path, boolean errIfMissing) throws Exception {
172 |     Node nd = getNode(path,errIfMissing);
173 |     if (nd==null) return null;
174 | 
175 |     // String txt = DOMUtil.getText(nd);
176 |     // TODO: node DOM Level 2 compatible, see Solr's DOMUtil.getText
177 |     String txt = nd.getTextContent();
178 | 
179 |     // log.debug(name + ' '+path+'='+txt);
180 |     return txt;
181 |   }
182 |   // Copied from Solr's Config.java
183 |   Node getNode(String path, boolean errifMissing) throws Exception {
184 |     return getNode(path, document, errifMissing);
185 |   }
186 |   // Copied from Solr's Config.java
187 |   Node getNode(String path, Document doc, boolean errIfMissing) throws Exception {
188 |     XPath xpath = xpathFactory.newXPath();
189 |     String xstr = normalize(path);
190 | 
191 |     try {
192 |       NodeList nodes = (NodeList)xpath.evaluate(xstr, doc,
193 |                                                 XPathConstants.NODESET);
194 |       if (nodes==null || 0 == nodes.getLength() ) {
195 |         if (errIfMissing) {
196 |           throw new RuntimeException(name + " missing "+path);
197 |         } else {
198 |           // log.debug(name + " missing optional " + path);
199 |           return null;
200 |         }
201 |       }
202 |       if ( 1 < nodes.getLength() ) {
203 |         throw new /*Solr*/ Exception( /*SolrException.ErrorCode.SERVER_ERROR,*/
204 |                                  name + " contains more than one value for config path: " + path);
205 |       }
206 |       Node nd = nodes.item(0);
207 |       // log.trace(name + ":" + path + "=" + nd);
208 |       return nd;
209 | 
210 |     } catch (XPathExpressionException e) {
211 |       // SolrException.log(log,"Error in xpath",e);
212 |       throw new /*Solr*/ Exception( /*SolrException.ErrorCode.SERVER_ERROR,*/"Error in xpath:" + xstr + " for " + name,e);
213 | //    } catch (SolrException e) {
214 | //      throw(e);
215 |     } catch (Throwable e) {
216 |       // SolrException.log(log,"Error in xpath",e);
217 |       throw new /*Solr*/Exception( /*SolrException.ErrorCode.SERVER_ERROR,*/"Error in xpath:" + xstr+ " for " + name,e);
218 |     }
219 |   }
220 |   // Copied from Solr's IndexSchema.java
221 |   private String normalize(String path) {
222 |     return (prefix==null || path.startsWith("/")) ? path : prefix+path;
223 |   }
224 |   // Copied from Solr's DOMUtil.java
225 |   public static String getAttr(NamedNodeMap attrs, String name, String missing_err) {
226 |     Node attr = attrs==null? null : attrs.getNamedItem(name);
227 |     if (attr==null) {
228 |       if (missing_err==null) return null;
229 |       throw new RuntimeException(missing_err + ": missing mandatory attribute '" + name + "'");
230 |     }
231 |     String val = attr.getNodeValue();
232 |     return val;
233 |   }
234 |   public static String getAttrOrNull(NamedNodeMap attrs, String name, String missing_err) {
235 |     try {
236 |       return getAttr( attrs, name, missing_err );
237 |     }
238 |     catch( RuntimeException e ) {
239 |       System.err.println( e );
240 |       return null;
241 |     }
242 |   }
243 | 
244 |   public static void main( String[] argv ) throws Exception {
245 |     SolrConfig s = new SolrConfigFromXml();
246 | 
247 |     String version = s.getLuceneMatchVersion();
248 |     System.out.println( "Lucene Match Version = " + version );
249 |     String abort = s.getAbortOnConfigurationError();
250 |     System.out.println( "Abort on config error = " + abort );
251 |     
252 |     Collection<String> handlers = s.getRequestHandlers();
253 |     System.out.println( "Request Handlers:" );
254 |     for ( String handler : handlers ) {
255 |       System.out.println( "\t" + handler );      
256 |     }
257 | 
258 |   }
259 | 
260 |   // Strings taken from SolrConfig.java
261 |   // but many are defined inline
262 |   public static final String ABORT = "abortOnConfigurationError";
263 |   public static final String CLASS = "class";
264 |   public static final String CONFIG = "config";
265 |   public static final String _DEFAULT = "default";  // TODO: request hnadler, others?, not implemented yet
266 |   public static final String HANDLER = "requestHandler";
267 |   public static final String LUCENE_VERSION = "luceneMatchVersion"; 
268 |   public static final String NAME = "name"; 
269 |   public static final String SLASH = "/";
270 |   public static final String _STARTUP = "startup";  // TODO: request hnadler, not implemented yet
271 | }
272 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/CharUtils.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.lang.Character.UnicodeBlock;
  6 | import java.lang.Character.UnicodeScript;
  7 | import java.util.HashMap;
  8 | import java.util.LinkedHashMap;
  9 | import java.util.Map;
 10 | import java.util.TreeMap;
 11 | import java.util.Map.Entry;
 12 | 
 13 | public class CharUtils {
 14 |   // Special handling, Unicode mangled character marker
 15 |   static final String QUESTION_MARK_STR       = "?";
 16 |   static final int    QUESTION_MARK_CODEPOINT = QUESTION_MARK_STR.codePointAt(0);
 17 |   static final String QUESTION_MARK_NAME      = "QUESTION_MARK";
 18 | 
 19 |   static final Map<Integer,String> TYPES = new HashMap<Integer,String>() {{
 20 |     // put( 1, "R / DIRECTIONALITY_RIGHT_TO_LEFT" );
 21 |     // put( 2, "AL / DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC" );
 22 |     // put( 11, "S / DIRECTIONALITY_SEGMENT_SEPARATOR" );
 23 |     // put( 12, "WS / DIRECTIONALITY_WHITESPACE" );
 24 |     put( 1, "Lu_UPPERCASE_LETTER" );
 25 |     put( 2, "Ll_LOWERCASE_LETTER" );
 26 |     put( 3, "Lt_TITLECASE_LETTER" );
 27 |     put( 4, "Lm_MODIFIER_LETTER" );
 28 |     put( 5, "Lo_OTHER_LETTER" );
 29 |     put( 6, "Mn_NON_SPACING_MARK" );
 30 |     put( 7, "Me_ENCLOSING_MARK" );
 31 |     put( 8 , "Mc_COMBINING_SPACING_MARK" );
 32 |     put( 9, "Nd_DECIMAL_DIGIT_NUMBER" );
 33 |     put( 11, "No_OTHER_NUMBER" );
 34 |     put( 12, "Zs_SPACE_SEPARATOR" );
 35 |     put( 13, "Zl_LINE_SEPARATOR" );
 36 |     put( 14, "Zp_PARAGRAPH_SEPARATOR" );
 37 |     put( 15, "Cc_CONTROL" );
 38 |     put( 16, "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
 39 |     // 17?
 40 |     put( 18, "Co_PRIVATE_USE" );
 41 |     put( 19, "Cs_SURROGATE" );
 42 |     put( 20, "Pd_DASH_PUNCTUATION" );
 43 |     put( 21, "Ps_START_PUNCTUATION" );
 44 |     put( 22, "Pe_END_PUNCTUATION" );
 45 |     put( 23, "Pc_CONNECTOR_PUNCTUATION" );
 46 |     put( 24, "Po_OTHER_PUNCTUATION" );
 47 |     put( 25, "Sm_MATH_SYMBOL" );
 48 |     put( 26, "Sc_CURRENCY_SYMBOL" );
 49 |     put( 27, "Sk_MODIFIER_SYMBOL" );
 50 |     put( 28, "So_OTHER_SYMBOL" );
 51 |     put( 29, "Pi_INITIAL_QUOTE_PUNCTUATION" );
 52 |     put( 30, "Pf_FINAL_QUOTE_PUNCTUATION" );
 53 |   }};
 54 | 
 55 |   static final Map<String,String> ALIASES_SHORT_TO_LONG = new HashMap<String,String>() {{
 56 |     // Custom
 57 |     put( "Qm", QUESTION_MARK_NAME );
 58 | 
 59 |     // Script
 60 |     put( "Com", "COMMON" );
 61 |     put( "Lat", "LATIN" );
 62 | 
 63 |     // Block
 64 |     put( "Basic", "BASIC_LATIN" );
 65 |     put( "L1Sup", "LATIN_1_SUPPLEMENT" );
 66 |     put( "GenPunct", "GENERAL_PUNCTUATION" );
 67 |     put( "LetterSym", "LETTERLIKE_SYMBOLS" );
 68 | 
 69 |     // Types
 70 |     put( "UPPER", "Lu_UPPERCASE_LETTER" );
 71 |     put( "lower", "Ll_LOWERCASE_LETTER" );
 72 |     put( "Title", "Lt_TITLECASE_LETTER" );
 73 |     put( "ModL", "Lm_MODIFIER_LETTER" );
 74 |     put( "OtherL", "Lo_OTHER_LETTER" );
 75 |     put( "NonSpc", "Mn_NON_SPACING_MARK" );
 76 |     put( "Encl", "Me_ENCLOSING_MARK" );
 77 |     put( "Combining" , "Mc_COMBINING_SPACING_MARK" );
 78 |     put( "Digit", "Nd_DECIMAL_DIGIT_NUMBER" );
 79 |     put( "OtherNum", "No_OTHER_NUMBER" );
 80 |     put( "Space", "Zs_SPACE_SEPARATOR" );
 81 |     put( "Line", "Zl_LINE_SEPARATOR" );
 82 |     put( "Para", "Zp_PARAGRAPH_SEPARATOR" );
 83 |     put( "Ctrl", "Cc_CONTROL" );
 84 |     put( "Fmt", "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
 85 |     // 17?
 86 |     put( "Priv", "Co_PRIVATE_USE" );
 87 |     put( "Sur", "Cs_SURROGATE" );
 88 |     put( "Dash", "Pd_DASH_PUNCTUATION" );
 89 |     put( "Start", "Ps_START_PUNCTUATION" );
 90 |     put( "End", "Pe_END_PUNCTUATION" );
 91 |     put( "Conn", "Pc_CONNECTOR_PUNCTUATION" );
 92 |     put( "OtherP", "Po_OTHER_PUNCTUATION" );
 93 |     put( "Math", "Sm_MATH_SYMBOL" );
 94 |     put( "Currency", "Sc_CURRENCY_SYMBOL" );
 95 |     put( "ModSym", "Sk_MODIFIER_SYMBOL" );
 96 |     put( "OtherSym", "So_OTHER_SYMBOL" );
 97 |     put( "StartQ", "Pi_INITIAL_QUOTE_PUNCTUATION" );
 98 |     put( "EndQ", "Pf_FINAL_QUOTE_PUNCTUATION" );
 99 |   }};
100 | 
101 |   static final Map<String,String> ALIASES_LONG_TO_SHORT = new HashMap<String,String>();
102 |   static {
103 |     for ( Entry<String,String> entry : ALIASES_SHORT_TO_LONG.entrySet() ) {
104 |       String shortName = entry.getKey();
105 |       String longName = entry.getValue();
106 |       ALIASES_LONG_TO_SHORT.put( longName, shortName );
107 |     }
108 |   }
109 | 
110 |   // Compound Aliases
111 |   // Note: reversed order of initialization here
112 |   static final Map<String,String> COMPOUND_ALIASES_LONG_TO_SHORT = new HashMap<String,String>() {{
113 |     put( "Com-Basic-Space", "space" );
114 |     put( "Lat-Basic-UPPER", "UPPER" );
115 |     put( "Lat-Basic-lower", "lower" );
116 |     put( "Com-Basic-Conn", "Connector" );
117 |     put( "Com-Basic-Currency", "Currency" );
118 |     put( "Com-Basic-Digit", "Digit" );
119 |     put( "Com-Basic-OtherP", "OtherPunct" );
120 |     put( "Com-L1Sup-OtherSym", "OtherSym" );
121 |     put( "Com-Basic-Start", "Start" );
122 |     put( "Com-Basic-End", "Stop" );
123 |     put( "Com-Basic-Math", "Math" );
124 |     put( "Com-Basic-Dash", "Dash1" );
125 |     put( "Com-GenPunct-Dash", "Dash2" );
126 |     put( "Com-LetterSym-OtherSym", "LetterSymbol" );
127 |     put( "Com-Basic-Qm", "QuestionMark" );  // add suffix 1 when needed
128 |   }};
129 |   static final Map<String,String> COMPOUND_ALIASES_SHORT_TO_LONG = new HashMap<String,String>();
130 |   static {
131 |     for ( Entry<String,String> entry : COMPOUND_ALIASES_LONG_TO_SHORT.entrySet() ) {
132 |       String longName = entry.getKey();
133 |       String shortName = entry.getValue();
134 |       COMPOUND_ALIASES_SHORT_TO_LONG.put( shortName, longName );
135 |     }
136 |   }
137 | 
138 |   static String generateReport() {
139 |     return generateReportForRange( 0, 255 );
140 |   }
141 |   static String generateReportForRange( int min, int max ) {
142 |     StringWriter sw = new StringWriter();
143 |     PrintWriter out = new PrintWriter(sw);
144 | 
145 |     for ( int i=min; i<=max; i++ ) {
146 |       addCharInfoToReport( out, i );
147 |     }
148 | 
149 |     String outStr = sw.toString();
150 |     return outStr;
151 |   }
152 |   static String generateReportForPoints( int ... codePoints ) {
153 |     StringWriter sw = new StringWriter();
154 |     PrintWriter out = new PrintWriter(sw);
155 | 
156 |     for ( int i : codePoints ) {
157 |       addCharInfoToReport( out, i );
158 |     }
159 | 
160 |     String outStr = sw.toString();
161 |     return outStr;
162 |   }
163 |   static void addCharInfoToReport( PrintWriter out, int codePoint ) {
164 |     out.print( "" + codePoint );
165 |     out.print( ", " );
166 |     out.print( String.format("%X", codePoint) );
167 |     out.print( ": " );
168 |     if ( codePoint >= 32 ) {
169 |       Character c = new Character( (char)codePoint );
170 |       if ( ! Character.isSupplementaryCodePoint( codePoint ) ) {
171 |         out.print( " c='"+c+"'" );
172 |       }
173 |       // Extended / Supplmental Unicode
174 |       else {
175 |         // also StringBuffer appendCodePoint(int cp)
176 |         char[] chars = Character.toChars( codePoint );
177 |         out.print( " c='" );
178 |         for ( char cS : chars ) {
179 |           out.print( cS );
180 |         }
181 |         out.print( "'" );
182 |       }
183 |     }
184 |     boolean isDef = Character.isDefined( codePoint );
185 |     out.print( " isDef="+isDef );
186 |     boolean isValid = Character.isValidCodePoint( codePoint );
187 |     out.print( " isValid="+isValid );
188 |     boolean isCtrl = Character.isISOControl( codePoint );
189 |     out.print( " isCtrl="+isCtrl );
190 |     boolean isBmp = Character.isBmpCodePoint( codePoint );
191 |     out.print( " isBmp="+isBmp );
192 |     boolean isSupp = Character.isSupplementaryCodePoint( codePoint );
193 |     out.print( " isSupp="+isSupp );
194 |     boolean isAlpha = Character.isAlphabetic( codePoint );
195 |     out.print( " isAlpha="+isAlpha );
196 |     boolean isLetter = Character.isLetter( codePoint );
197 |     out.print( " isLetter="+isLetter );
198 |     boolean isDigit = Character.isDigit( codePoint );
199 |     out.print( " isDigit="+isDigit );
200 |     int type = Character.getType( codePoint );
201 |     String typeStr = "" + type;
202 |     if ( TYPES.containsKey(type) ) {
203 |       typeStr += " " + TYPES.get(type);
204 |     }
205 |     else {
206 |       typeStr += " (no-TYPES-entry)";
207 |     }
208 |     out.print( " type="+typeStr );
209 |     String block = null;
210 |     String script = null;
211 |     try {
212 |       block = UnicodeBlock.of( codePoint ).toString();
213 |       script = UnicodeScript.of( codePoint ).toString();
214 |     }
215 |     catch( Exception e ) { }
216 |     out.print( " script="+script );
217 |     out.print( " block="+block );
218 |     String name = Character.getName( codePoint );
219 |     out.print( " name="+name );
220 |     out.println();
221 |   }
222 | 
223 |   public static String getScriptName_LongForm( int codePoint ) {
224 |     String script = "Unknown_Unicode_Script";
225 |     try {
226 |       script = UnicodeScript.of( codePoint ).toString();
227 |     }
228 |     catch( Exception e ) { }
229 |     return script;
230 |   }
231 |   public static String getScriptName_ShortForm( int codePoint ) {
232 |     String longName = getScriptName_LongForm( codePoint );
233 |     if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
234 |       return ALIASES_LONG_TO_SHORT.get(longName);
235 |     }
236 |     else {
237 |       return longName;
238 |     }
239 |   }
240 |   public static String getBlockName_LongForm( int codePoint ) {
241 |     String block = "Unknown_Unicode_Block";
242 |     try {
243 |       block = UnicodeBlock.of( codePoint ).toString();
244 |     }
245 |     catch( Exception e ) { }
246 |     return block;
247 |   }
248 |   public static String getBlockName_ShortForm( int codePoint ) {
249 |     String longName = getBlockName_LongForm( codePoint );
250 |     if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
251 |       return ALIASES_LONG_TO_SHORT.get(longName);
252 |     }
253 |     else {
254 |       return longName;
255 |     }
256 |   }
257 |   public static String getTypeName_LongForm( int codePoint ) {
258 |     int type = Character.getType( codePoint );
259 |     String typeStr = "";
260 |     if ( codePoint == QUESTION_MARK_CODEPOINT ) {
261 |       typeStr = QUESTION_MARK_NAME;
262 |     }
263 |     else if ( TYPES.containsKey(type) ) {
264 |       typeStr = TYPES.get(type);
265 |     }
266 |     else {
267 |       typeStr = "" + type + "_No_TYPES_Entry";
268 |     }
269 |     return typeStr;
270 |   }
271 |   public static String getTypeName_ShortForm( int codePoint ) {
272 |     String longName = getTypeName_LongForm( codePoint );
273 |     if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) {
274 |       return ALIASES_LONG_TO_SHORT.get(longName);
275 |     }
276 |     else {
277 |       return longName;
278 |     }
279 |   }
280 |   // returns "script-block-type"
281 |   public static String getCompoundClassifier_LongForm( int codePoint ) {
282 |     return      getScriptName_LongForm(codePoint)
283 |         + "-" + getBlockName_LongForm(codePoint)
284 |         + "-" + getTypeName_LongForm(codePoint)
285 |         ;
286 |   }
287 |   public static String getCompoundClassifier_ShortForm( int codePoint ) {
288 |     String candidate = getScriptName_ShortForm(codePoint)
289 |         + "-" + getBlockName_ShortForm(codePoint)
290 |         + "-" + getTypeName_ShortForm(codePoint)
291 |         ;
292 |     if ( COMPOUND_ALIASES_LONG_TO_SHORT.containsKey(candidate) ) {
293 |       return COMPOUND_ALIASES_LONG_TO_SHORT.get( candidate );
294 |     }
295 |     else {
296 |       return candidate;
297 |     }
298 |   }
299 | 
300 |   public static Map<String,Long> classifyString_LongForm( String inStr ) {
301 |     return classifyString_LongForm( inStr, null );
302 |   }
303 |   public static Map<String,Long> classifyString_LongForm( String inStr, Map<String,Long> stats ) {
304 |     // Automatically sorts by key-order
305 |     if ( null==stats ) {
306 |       // In order by key, easier for overall tabulation
307 |       stats = new TreeMap<>();
308 |     }
309 |     if ( null==inStr || inStr.isEmpty() ) {
310 |       return stats;
311 |     }
312 |     // Special looping to allow for Supplementary Unicode Characters (> 65k)
313 |     int length = inStr.length();
314 |     for (int offset = 0; offset < length; ) {
315 |       int codePoint = inStr.codePointAt( offset );
316 |       String charKey = getCompoundClassifier_LongForm( codePoint );
317 |       // Tabulate
318 |       long count = 0L;
319 |       if ( stats.containsKey(charKey) ) {
320 |         count = stats.get( charKey );
321 |       }
322 |       count++;
323 |       stats.put( charKey, count );
324 |       // Advance
325 |       offset += Character.charCount( codePoint );
326 |     }
327 |     return stats;
328 |   }
329 |   public static Map<String,Long> classifyString_ShortForm( String inStr ) {
330 |     return classifyString_ShortForm( inStr, null );
331 |   }
332 |   // TODO: code very similar to LongForm, combine
333 |   public static Map<String,Long> classifyString_ShortForm( String inStr, Map<String,Long> stats ) {
334 |     // Automatically sorts by key-order
335 |     if ( null==stats ) {
336 |       // In order by key, easier for overall tabulation
337 |       stats = new TreeMap<>();
338 |     }
339 |     if ( null==inStr || inStr.isEmpty() ) {
340 |       return stats;
341 |     }
342 |     // Special looping to allow for Supplementary Unicode Characters (> 65k)
343 |     int length = inStr.length();
344 |     for (int offset = 0; offset < length; ) {
345 |       int codePoint = inStr.codePointAt( offset );
346 |       String charKey = getCompoundClassifier_ShortForm( codePoint );
347 |       // Tabulate
348 |       long count = 0L;
349 |       if ( stats.containsKey(charKey) ) {
350 |         count = stats.get( charKey );
351 |       }
352 |       count++;
353 |       stats.put( charKey, count );
354 |       // Advance
355 |       offset += Character.charCount( codePoint );
356 |     }
357 |     return stats;
358 |   }
359 | 
360 |   public static void main( String [] argv ) {
361 |     // U+306E, dec:12398
362 |     System.out.println( "Japanese \"no\": '\u306e'" );
363 |     // U+4e00 19968, U+4e8c 20108, U+4e09 19977
364 |     System.out.println( "Chinese 1 2 3: '\u4e00\u4e8c\u4e09'" );
365 |     // U+1D11E, dec:119070
366 |     System.out.println( "Extended: Musical G-clef: '\uD834\uDD1E'" );
367 |     // U+1F37A, dec:127866
368 |     System.out.println( "Extended: Beer Mug: '\uD83C\uDF7A'" );
369 | 
370 |     // String report = generateReportForRange( 0, 255 );
371 |     String report = generateReportForPoints( 12398, 19968, 20108, 19977, 119070, 127866 );
372 |     System.out.print( report );
373 |   }
374 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.util;
 2 | 
 3 | import java.lang.reflect.Field;
 4 | import java.lang.reflect.InvocationTargetException;
 5 | import java.lang.reflect.Method;
 6 | import java.util.LinkedHashMap;
 7 | import java.util.Map;
 8 | import java.util.Map.Entry;
 9 | 
10 | public class CmdLineLauncher {
11 |   // TODO: currently using static init but
12 |   // fefactoring would require that all classes use lightweight null constructor
13 |   // static final Map<String, Class<? extends HasDescription>> CLASSES = new LinkedHashMap<String,Class<? extends HasDescription>>()
14 |   static final Map<String, Class<?>> CLASSES = new LinkedHashMap<String,Class<?>>()
15 |   {{
16 |     put( "empty_fields",      com.lucidworks.dq.data.EmptyFieldStats.class     );
17 |     put( "term_stats",        com.lucidworks.dq.data.TermStats.class           );
18 |     put( "code_points",       com.lucidworks.dq.data.TermCodepointStats.class  );
19 |     put( "date_checker",      com.lucidworks.dq.data.DateChecker.class         );
20 |     put( "diff_empty_fields", com.lucidworks.dq.diff.DiffEmptyFieldStats.class );
21 |     put( "diff_ids",          com.lucidworks.dq.diff.DiffIds.class             );
22 |     put( "diff_schema",       com.lucidworks.dq.diff.DiffSchema.class          );
23 |     put( "diff_config",       com.lucidworks.dq.diff.DiffSolrConfig.class      );
24 |     put( "doc_count",         com.lucidworks.dq.data.DocCount.class            );
25 |     put( "dump_ids",          com.lucidworks.dq.data.DumpIds.class             );
26 |     put( "delete_by_ids",     com.lucidworks.dq.data.DeleteByIds.class         );
27 |     put( "solr_to_solr",      com.lucidworks.dq.data.SolrToSolr.class          );
28 |     put( "solr_to_csv",       com.lucidworks.dq.data.SolrToCsv.class           );
29 |     put( "hash_and_shard",    com.lucidworks.dq.util.HashAndShard.class        );
30 |   }};
31 |   public static void main( String[] argv ) {
32 |     if( argv.length < 1 ) {
33 |       System.out.println( "Pass a command name on the command line to see help for that class:" );
34 |       // for( Entry<String, Class<? extends HasDescription>> entry : CLASSES.entrySet() )
35 |       for( Entry<String, Class<?>> entry : CLASSES.entrySet() )
36 |       {
37 |         String cmdName = entry.getKey();
38 |         // Class<? extends HasDescription> clazz = entry.getValue();
39 |         Class<?> clazz = entry.getValue();
40 | 
41 |         String desc = null;
42 |         try {
43 |           Method descMeth = clazz.getMethod( "getShortDescription" );
44 |           desc = (String) descMeth.invoke( null, (Object[]) null );
45 |           // Field f = clazz.getDeclaredField( "HELP_WHAT_IS_IT" );
46 |           // desc = (String) f.get(null);
47 |         } catch (SecurityException | IllegalArgumentException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
48 |           // TODO Auto-generated catch block
49 |           e.printStackTrace();
50 |         }
51 |         
52 |         // System.out.println( cmdName + ": " + desc );
53 |         System.out.printf( "%20s: %s\n", cmdName, desc );
54 |       }
55 |     }
56 |     // Has a command name
57 |     else {
58 |       String cmdName = argv[ 0 ];
59 |       if ( CLASSES.containsKey(cmdName) ) {
60 |         // Copy over all the first arg
61 |         String [] argv2 = new String[ argv.length - 1 ];
62 |         for ( int i=1; i<argv.length; i++ ) {
63 |           argv2[ i-1 ] = argv[ i ];
64 |         }
65 |         Class<?> clazz = CLASSES.get(cmdName);
66 |         try {
67 |           Method main = clazz.getMethod( "main", String[].class );
68 |           // main.invoke( null, argv2 );
69 |           // main.invoke( null, (Object[]) argv2 );
70 |           main.invoke( null, (Object) argv2 );
71 |         } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
72 |           // TODO Auto-generated catch block
73 |           e.printStackTrace();
74 |           System.exit(2);
75 |         }
76 |       }
77 |       else {
78 |         System.err.println( "Command \"" + cmdName + "\" not found in " + CLASSES.keySet() );
79 |         System.exit(2);
80 |       }
81 |     }
82 |   }
83 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/DateUtils.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.util;
 2 | 
 3 | import java.text.DateFormat;
 4 | import java.text.ParseException;
 5 | import java.text.SimpleDateFormat;
 6 | import java.util.ArrayList;
 7 | import java.util.Collection;
 8 | import java.util.Date;
 9 | import java.util.List;
10 | import java.util.TimeZone;
11 | 
12 | public class DateUtils {
13 | 
14 |   public static final String JAVA_FORMAT = "EEE MMM dd HH:mm:ss z yyyy";
15 |   public static final String ZULU_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
16 |   // public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.S";
17 |   public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.SSS";
18 | 
19 |   public static String getLocalTimestamp( Date inDate ) {
20 |     DateFormat compactFormatter = new SimpleDateFormat( COMPACT_LOG_FORMAT );
21 |     // NOT setting timezone
22 |     return compactFormatter.format( inDate );
23 |   }
24 |   public static String getLocalTimestamp() {
25 |     return getLocalTimestamp( new Date() );
26 |   }
27 |   public static String getLocalTimestamp( long ms ) {
28 |     return getLocalTimestamp( new Date(ms) );
29 |   }
30 |   public static String javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException {
31 |     java.util.Date dateObj = javaDefault2Date_str2date( inDate );
32 |     String outDateStr = date2SolrXmlZulu_date2str( dateObj );
33 |     return outDateStr;
34 |   }
35 |   public static String solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException {
36 |     java.util.Date dateObj = solrXmlZulu2Date_str2date( inDate );
37 |     String outDateStr = date2JavaDefault_date2str( dateObj );
38 |     return outDateStr;
39 |   }
40 |   public static String _javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException {
41 |     DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
42 |     DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
43 |     zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
44 |     java.util.Date tmpDate = javaFormatter.parse( inDate );
45 |     String outDate = zuluFormatter.format( tmpDate );
46 |     return outDate;
47 |   }
48 |   public static String _solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException {
49 |     DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
50 |     zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
51 |     DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
52 |     java.util.Date tmpDate = zuluFormatter.parse( inDate );
53 |     String outDate = javaFormatter.format( tmpDate );
54 |     return outDate;
55 |   }
56 | 
57 |   public static String date2SolrXmlZulu_date2str( java.util.Date inDate ) throws ParseException {
58 |     DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
59 |     zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
60 |     String outDate = zuluFormatter.format( inDate );
61 |     return outDate;
62 |   }
63 |   public static String date2JavaDefault_date2str( java.util.Date inDate ) throws ParseException {
64 |     DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
65 |     String outDate = javaFormatter.format( inDate );
66 |     return outDate;
67 |   }
68 | 
69 |   public static java.util.Date javaDefault2Date_str2date( String inDate ) throws ParseException {
70 |     DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT );
71 |     java.util.Date outDate = javaFormatter.parse( inDate );
72 |     return outDate;
73 |   }
74 |   public static java.util.Date solrXmlZulu2Date_str2date( String inDate ) throws ParseException {
75 |     DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT );
76 |     zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") );
77 |     java.util.Date outDate = zuluFormatter.parse( inDate );
78 |     return outDate;
79 |   }
80 | 
81 |   public static List<Double> dates2Doubles( Collection<Date> dates ) {
82 |     List<Double> out = new ArrayList<>();
83 |     for ( Date d : dates ) {
84 |       out.add(  new Double( d.getTime() )  );
85 |     }
86 |     return out;
87 |   }
88 |   public static Double date2Double( Date d ) {
89 |     return new Double( d.getTime() ).doubleValue();
90 |   }
91 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/HasDescription.java:
--------------------------------------------------------------------------------
1 | package com.lucidworks.dq.util;
2 | 
3 | // TODO: future... Refactor and add lightweight null constructors
4 | // see also util.CmdLineLauncher
5 | public interface HasDescription {
6 |   String getShortDescription();
7 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/HashAndShard.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import org.apache.solr.common.cloud.DocRouter.Range;
  7 | import org.apache.solr.common.util.Hash;
  8 | 
  9 | public class HashAndShard {
 10 |   
 11 |   // Should correspond to:
 12 |   // http://localhost:8983/solr/collection1/select?q=*&fl=*,[shard]
 13 | 
 14 |   static String HELP_WHAT_IS_IT = "Calculate hash and shard for a document ID";
 15 |   static String HELP_USAGE = "HashAndShard docId [numberOfShards [-q]]    # shards can be decimal, hex, octal, etc";
 16 |   public static String getShortDescription() {
 17 |     return HELP_WHAT_IS_IT;
 18 |   }
 19 | 
 20 | 
 21 |   /* From:
 22 |    * solr-lucene-490-src/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java
 23 |    */
 24 |   private static int bits = 16;
 25 |   static List<Range> partitionRange( int partitions ) {
 26 |     int min = Integer.MIN_VALUE; // -2^31   = -2147483648 = -2,147,483,648
 27 |     int max = Integer.MAX_VALUE; //  2^31-1 =  2147483647 =  2,147,483,647
 28 | 
 29 |     // assert max >= min;
 30 |     // if (partitions == 0) return Collections.EMPTY_LIST;
 31 |     long rangeSize = (long) max - (long) min;
 32 |     long rangeStep = Math.max(1, rangeSize / partitions);
 33 | 
 34 |     List<Range> ranges = new ArrayList<>(partitions);
 35 | 
 36 |     long start = min;
 37 |     long end = start;
 38 | 
 39 |     // keep track of the idealized target to avoid accumulating rounding errors
 40 |     long targetStart = min;
 41 |     long targetEnd = targetStart;
 42 | 
 43 |     // Round to avoid splitting hash domains across ranges if such rounding is not significant.
 44 |     // With default bits==16, one would need to create more than 4000 shards before this
 45 |     // becomes false by default.
 46 |     int mask = 0x0000ffff;
 47 |     boolean round = rangeStep >= (1 << bits) * 16;
 48 | 
 49 |     while (end < max) {
 50 |       targetEnd = targetStart + rangeStep;
 51 |       end = targetEnd;
 52 | 
 53 |       if (round && ((end & mask) != mask)) {
 54 |         // round up or down?
 55 |         int increment = 1 << bits;  // 0x00010000
 56 |         long roundDown = (end | mask) - increment;
 57 |         long roundUp = (end | mask) + increment;
 58 |         if (end - roundDown < roundUp - end && roundDown > start) {
 59 |           end = roundDown;
 60 |         } else {
 61 |           end = roundUp;
 62 |         }
 63 |       }
 64 | 
 65 |       // make last range always end exactly on MAX_VALUE
 66 |       if (ranges.size() == partitions - 1) {
 67 |         end = max;
 68 |       }
 69 |       ranges.add(new Range((int) start, (int) end));
 70 |       start = end + 1L;
 71 |       targetStart = targetEnd + 1L;
 72 |     }
 73 | 
 74 |     return ranges;
 75 |   }
 76 | 
 77 |   static void printRanges( List<Range> ranges, Integer hash ) {
 78 |     int shardCounter = 0;
 79 |     for ( Range r : ranges ) {
 80 |       shardCounter++;
 81 |       System.out.println( "Shard # " + shardCounter );
 82 |       System.out.println( "\tRange: "
 83 |           + String.format("0x%8s", Integer.toHexString(r.min)).replace(' ', '0')
 84 |           + " to "
 85 |           + String.format("0x%8s", Integer.toHexString(r.max)).replace(' ', '0')
 86 |           );
 87 |       if ( null!=hash ) {
 88 |         if ( hash >= r.min && hash <= r.max ) {
 89 |           System.out.println( "\tcontains "
 90 |               + String.format("0x%8s", Integer.toHexString(hash)).replace(' ', '0')
 91 |               );
 92 |         }
 93 |       }
 94 |     }
 95 |   }
 96 |   static int findShardForHash( List<Range> ranges, Integer hash ) {
 97 |     int shardCounter = 0;
 98 |     for ( Range r : ranges ) {
 99 |       shardCounter++;
100 |       if ( hash >= r.min && hash <= r.max ) {
101 |         return shardCounter;
102 |       }
103 |     }
104 |     return -1;
105 |   }
106 | 
107 |   public static void main(String[] args) {
108 |     if ( args.length < 1 || args.length > 3 ) {
109 |       System.err.println( "Error: syntax: " + HELP_USAGE );
110 |       System.exit(1);
111 |     }
112 |     String docId = args[0];
113 |     if ( docId.length() < 1 ) {
114 |       System.err.println( "Error: empty docId" );
115 |       System.exit(2);      
116 |     }
117 |     String numShardsStr = args.length >= 2 ? args[1] : null;
118 |     String quietStr = args.length >= 3 ? args[2] : null;
119 |     boolean quiet = null!=quietStr && quietStr.equalsIgnoreCase("-q");
120 | 
121 |     int signedHash = Hash.murmurhash3_x86_32( docId, 0, docId.length(), 0 );
122 |     long unsignedHash = signedHash & 0x00000000ffffffffL;
123 |     if ( ! quiet ) {
124 |       System.out.println( "docId: \"" + docId + '"' );
125 |       System.out.println( "32-bit Hash (signed decimal int): " + signedHash );
126 |       System.out.println( "32-bit Hash (unsigned dec int): " + unsignedHash );
127 |       System.out.println( "32-bit Hash (hex): " + String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') );
128 |       System.out.println( "32-bit Hash (binary): " + String.format("%32s", Integer.toBinaryString(signedHash)).replace(' ', '0') );
129 |     }
130 |     else {
131 |       System.out.print( docId + " " );
132 |       System.out.print( String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') );      
133 |     }
134 | 
135 |     if ( null != numShardsStr ) {
136 |       Integer numShards = null;
137 |       try {
138 |         numShards = Integer.decode( numShardsStr );
139 |       }
140 |       catch( NumberFormatException e ) {
141 |         System.err.println( "Error parsing numberOfShards: " + e );
142 |         System.exit(3);      
143 |       }
144 |       if ( numShards <= 0 ) {
145 |         System.err.println( "Error: numberOfShards must be > 0; got " + numShards );
146 |         System.exit(4);        
147 |       }
148 |       // WRONG!
149 |       // long shardNumber = (unsignedHash % numShards) + 1;
150 |       // System.out.println( "Route to Shard (base-ONE): " + shardNumber );
151 | 
152 |       List<Range> ranges = partitionRange( numShards );
153 | 
154 |       if ( ! quiet ) {
155 |         System.out.println( "Number of Shards: " + numShards );
156 | 
157 |         printRanges( ranges, signedHash );
158 |       }
159 |       else {
160 |         int targetShard = findShardForHash( ranges, signedHash );
161 |         System.out.print( " " + targetShard );
162 |       }
163 |     }
164 |     if ( quiet ) {
165 |       System.out.println();
166 |     }
167 |     
168 |   }
169 | 
170 | }
171 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/IO_Utils.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.net.URI;
  6 | import java.net.URISyntaxException;
  7 | import java.nio.file.CopyOption;
  8 | import java.nio.file.FileSystem;
  9 | import java.nio.file.FileSystems;
 10 | import java.nio.file.FileVisitResult;
 11 | import java.nio.file.Files;
 12 | import java.nio.file.Path;
 13 | import java.nio.file.Paths;
 14 | import java.nio.file.SimpleFileVisitor;
 15 | import java.nio.file.StandardCopyOption;
 16 | import java.nio.file.attribute.BasicFileAttributes;
 17 | import java.util.Collections;
 18 | 
 19 | 
 20 | public class IO_Utils {
 21 |   
 22 |   public static File materializeSolrHomeIntoTemp() throws IOException, URISyntaxException {
 23 |     String prefix = "solr_dq_utils_";
 24 |     String topName = "solr_home";
 25 |     //String magicName = "configsets";
 26 |     Path baseTempDir = Files.createTempDirectory( prefix );
 27 |     // File destinationDir = new File( baseTempDir.toFile(), magicName );
 28 |     File destinationDir = new File( baseTempDir.toFile(), topName );
 29 |     if ( ! destinationDir.mkdirs() ) {
 30 |       throw new IOException( "Unable to create path \"" + destinationDir + "\"" );
 31 |     }
 32 |     // System.out.println( "Created \"" + destinationDir + "\"" );
 33 |     IO_Utils iou = new IO_Utils();
 34 |   
 35 |     //String sourcePathWithinJar = "/";
 36 |     // ^-- gets all classes from every combined jar
 37 |     
 38 |     //String sourcePathWithinJar = "configsets";
 39 |     // ^-- Exception in thread "main" java.lang.IllegalArgumentException, no details
 40 |     
 41 |     // String sourcePathWithinJar = "/" + magicName;
 42 |     String sourcePathWithinJar = "/" + topName;
 43 |   
 44 |     // String destinationPathInFilesystem = "/Users/mbennett/tmp_test_copy";
 45 |     // ^-- Doesn't create spanning .../configsets/... dir, just subdirectories of it
 46 | 
 47 |     // iou.copyFromJar( sourcePathWithinJar, Paths.get(destinationPathInFilesystem) );
 48 |     iou.copyFromJar( sourcePathWithinJar, Paths.get(destinationDir.toString()) );
 49 |     return destinationDir;
 50 |   }
 51 | 
 52 |   // Parts take from:
 53 |   // * http://stackoverflow.com/a/24316335/295802
 54 |   // * http://codingjunkie.net/java-7-copy-move/
 55 |   // Usage: copyFromJar("/path/to/the/template/in/jar", Paths.get("/tmp/from-jar"))
 56 |   public void copyFromJar(String source, final Path target) throws URISyntaxException, IOException {
 57 |     System.out.println( "source str = \"" + source + "\"" );
 58 | 
 59 |     
 60 |     // getClass is defined in Object
 61 |     URI resource = getClass().getResource("").toURI();
 62 | 
 63 |     // ... ? FileSystems.newFileSystem(...)
 64 |     // ^-- java.lang.IllegalArgumentException: Path component should be '/'
 65 |     //     at least when run in Eclipse (non .jar packaging)
 66 |     //URI resource = getClass().getResource("/").toURI();
 67 | 
 68 |     System.out.println( "URI Resource = \"" + resource + "\"" );
 69 |     // ^-- Interactive: "file:/Users/mbennett/data/dev/DQ/data-quality-github/target/classes/"
 70 |     // ^-- Run Uberjar: "jar:file:/Users/mbennett/data/dev/DQ/data-quality-github/target/data-quality-java-1.0-SNAPSHOT.jar!/com/lucidworks/dq/util/"
 71 | 
 72 |     // jar:file: - Running from packaged jar
 73 |     if ( resource.toString().startsWith("jar:file:" ) ) {
 74 |       FileSystem fileSystem = FileSystems.newFileSystem(
 75 |           resource,
 76 |           Collections.<String, String>emptyMap()
 77 |           );
 78 |   
 79 |       final Path jarPath = fileSystem.getPath(source);
 80 |   
 81 |       // Recursive copy
 82 |       // TODO: looks similar to other recursive copy below, maybe combine
 83 |       Files.walkFileTree(jarPath, new SimpleFileVisitor<Path>() {
 84 |   
 85 |         private Path currentTarget;
 86 |   
 87 |         @Override
 88 |         public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
 89 |           currentTarget = target.resolve(jarPath.relativize(dir).toString());
 90 |           Files.createDirectories(currentTarget);
 91 |           return FileVisitResult.CONTINUE;
 92 |         }
 93 |   
 94 |         @Override
 95 |         public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
 96 |           //System.out.println( "Copying \"" + file.toString() + "\" ..." );
 97 |           Files.copy(file, target.resolve(jarPath.relativize(file).toString()), StandardCopyOption.REPLACE_EXISTING);
 98 |           return FileVisitResult.CONTINUE;
 99 |         }
100 |   
101 |       });
102 |     
103 |     }
104 |     // file: - Running from Eclipse or other non-packaged runner
105 |     else if ( resource.toString().startsWith("file:" ) ) {
106 |       // Our resource is relative root level, not this specific package
107 |       URI resource2 = getClass().getResource("/").toURI();
108 |       File base = new File( resource2.getPath() );
109 |       File srcDir = new File( base, source );
110 |       final Path fromPath = srcDir.toPath();
111 |       final Path toPath = target;
112 | 
113 |       // Recursive copy
114 |       // TODO: looks similar to other recursive copy above, maybe combine
115 |       Files.walkFileTree(fromPath, new SimpleFileVisitor<Path>() {
116 | 
117 |         @Override
118 |         public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
119 |           Path targetPath = toPath.resolve(fromPath.relativize(dir));
120 |           if ( ! Files.exists(targetPath) ){
121 |             Files.createDirectory(targetPath);
122 |           }
123 |           return FileVisitResult.CONTINUE;
124 |         }
125 | 
126 |         @Override
127 |         public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
128 |           Files.copy(file, toPath.resolve(fromPath.relativize(file)), StandardCopyOption.REPLACE_EXISTING);
129 |           return FileVisitResult.CONTINUE;
130 |         }
131 | 
132 |       });
133 |       
134 |       
135 |       /***
136 |       // TODO: recursive copy from filesystem
137 |       // Files.copy( new File(source).toPath(), target, StandardCopyOption.REPLACE_EXISTING );
138 |       // ^-- No, only has "/solr_home"
139 |       // and "resource" is too far down:
140 |       // Gives: /Users/mbennett/data/dev/DQ/data-quality-github/target/classes/com/lucidworks/dq/util
141 |       //  Need: /Users/mbennett/data/dev/DQ/data-quality-github/target/classes/solr_home
142 |       URI resource2 = getClass().getResource("/").toURI();
143 |       // gives! file:/Users/mbennett/data/dev/DQ/data-quality-github/target/classes/
144 |       System.out.println( "URI Resource2 = \"" + resource2 + "\"" );
145 |       File base = new File( resource2.getPath() );
146 |       File srcDir = new File( base, source );
147 |       Path srcPath = srcDir.toPath();
148 |       System.out.println( "srcPath = \"" + srcPath + "\"" );
149 |       System.out.println( "target = \"" + target + "\"" );
150 |       //Files.copy( srcPath, target, StandardCopyOption.REPLACE_EXISTING );
151 | 
152 |       // EnumSet<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
153 |       // TreeCopier tc = new TreeCopier(source[i], dest, prompt, preserve);
154 |       // Files.walkFileTree(source[i], opts, Integer.MAX_VALUE, tc);
155 | 
156 |       ***/
157 |     
158 |     
159 |     }
160 |     else {
161 |       throw new IllegalArgumentException( "Don't know how to handle " + resource );
162 |     }
163 | 
164 |   }
165 | 
166 | 
167 |   
168 |   
169 |   public static void main(String[] args) throws URISyntaxException, IOException {
170 |     //File configSetsDir = materializeConfigsetsInTemp();
171 |     File configSetsDir = materializeSolrHomeIntoTemp();
172 |     System.out.println( "ConfigSets = " + configSetsDir );
173 |     
174 |   }
175 | }
176 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/LLR.java-new:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.io.PrintWriter;
  4 | import java.io.StringWriter;
  5 | import java.util.Collection;
  6 | import java.util.LinkedHashMap;
  7 | import java.util.Map;
  8 | import java.util.Map.Entry;
  9 | import java.util.Set;
 10 | import java.util.TreeMap;
 11 | import java.util.TreeSet;
 12 | 
 13 | import org.apache.solr.client.solrj.SolrServerException;
 14 | import org.apache.solr.client.solrj.impl.HttpSolrServer;
 15 | 
 16 | public class LLR {
 17 | 
 18 |   Map<String,Long> wordsA;
 19 |   Map<String,Long> wordsB;
 20 |   // TODO: consider this, BUT threshold for A OR B, or A AND B ?
 21 |   // long minWordsThreshold = 0L;
 22 | 
 23 |   // Column Totals
 24 |   double sumA = 0.0;
 25 |   double sumB = 0.0;
 26 |   // K Total
 27 |   double grandTotal;
 28 |   // Row Totals
 29 |   Map<String,Double> rowTotals = new LinkedHashMap<>();
 30 | 
 31 |   // Set<String> allWordsAboveThreshold = new TreeSet<>();
 32 |   Set<String> allWords = new TreeSet<>();
 33 | 
 34 |   Map<String,Double> scoresByWord = new TreeMap<>();
 35 |   Map<String,Double> sortedScoresByWord = new TreeMap<>();
 36 | 
 37 |   // Peformance Stat
 38 |   long plogp_counter = 0L;
 39 | 
 40 |   public LLR( Map<String,Long> wordsA, Map<String,Long> wordsB /*, Long optThreshold*/ ) {
 41 | 	this.wordsA = wordsA;
 42 | 	this.wordsB = wordsB;
 43 | 	//if ( null!=optThreshold && optThreshold.longValue() > 0L ) {
 44 | 	//  this.minWordsThreshold = optThreshold.longValue();
 45 | 	//}
 46 | 	doInitialCalculations();
 47 | 	calcAllWords();
 48 | 	sortWords();
 49 |   }
 50 | 
 51 |   public void doInitialCalculations() {
 52 | 
 53 |     // Column Totals
 54 |     // -------------
 55 |     // sumA = sumWithThreshold( wordsA.values() );
 56 |     // sumB = sumWithThreshold( wordsB.values() );
 57 | 	sumA = new Double( StatsUtils.sumList_Longs(wordsA.values()) ).doubleValue();
 58 | 	sumB = new Double( StatsUtils.sumList_Longs(wordsB.values()) ).doubleValue();
 59 | 	if ( sumA<=0.0 || sumB<=0.0 ) {
 60 |       throw new IllegalArgumentException( "Must have non-zero word counts: A=" + sumA + ", B=" + sumB );
 61 | 	}
 62 | 
 63 |     // K Total
 64 |     grandTotal = sumA + sumB;
 65 | 
 66 |     // Row Totals
 67 |     // ----------
 68 |     allWords.addAll( wordsA.keySet() );
 69 |     allWords.addAll( wordsB.keySet() );
 70 |     for ( String word : allWords ) {
 71 |       Long countA = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
 72 |       Long countB = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
 73 |       rowTotals.put( word, new Double(countA + countB) );
 74 |     }
 75 | 
 76 |   }
 77 | 
 78 |   public void calcAllWords() {
 79 | 	for ( String word : allWords ) {
 80 | 	  // double g2 = calcG2_viaDunning( word );
 81 | 	  double g2 = calcG2_viaTraditional( word );
 82 | 	  scoresByWord.put( word, g2 );
 83 |     }
 84 |   }
 85 | 
 86 | 
 87 |   // TODO: G2 is the same as -2 log lambda ?
 88 |   // http://scg.unibe.ch/archive/papers/Kuhn09aLogLikelihoodRatio.pdf
 89 |   // Before Sign:
 90 |   //   food: 0.0
 91 |   //   bananas: 0.46192170199964266
 92 |   //   apples: 0.6291706616789554
 93 |   //   carrots: 60.03320678316349
 94 |   //   candy: 60.03320678316351
 95 |   // After Sign:
 96 |   //  candy: -60.03320678316351
 97 |   //  bananas: -0.46192170199964266
 98 |   //  food: 0.0
 99 |   //  apples: 0.6291706616789554
100 |   //  carrots: 60.03320678316349
101 |   double calcG2_viaTraditional( String word ) {
102 |     boolean debug = false;
103 |     if(debug) System.out.println( "\n=== Calculating G2 via Traditional formula for \"" + word + "\" ===" );
104 |     // Simple terms
105 |     double k1 = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
106 |     double k2 = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
107 |     double n1 = sumA;
108 |     double n2 = sumB;
109 |     double p1 = k1 / n1;
110 |     double p2 = k2 / n2;
111 |     if(debug) System.out.println( "Corpus A: k1, n1, p1: " + k1 + ", " + n1 + ", " + p1 );
112 |     if(debug) System.out.println( "Corpus B: k2, n2, p2: " + k2 + ", " + n2 + ", " + p2 );
113 |     double p = (k1 + k2) / (n1 + n2);  // rowCount / grandTotal
114 |     if(debug) System.out.println( "Combined: k1+2, n1+2, p1+2: " + (k1+k2) + ", " + (n1+n2) + ", " + p );
115 |     // Factors
116 |     double factorA = Math.log( L(p1,k1,n1) );
117 |     double factorB = Math.log( L(p2,k2,n2) );
118 |     double factorC = Math.log(  L(p,k1,n1) );
119 |     double factorD = Math.log(  L(p,k2,n2) );
120 |     double sign = sign( p1, p2 );
121 |     // Result
122 |     double out = sign * 2.0 * ( factorA + factorB - factorC - factorD );
123 |     if(debug) System.out.println( "out = +/-sign * 2.0 * ( factorA + factorB - factorC - factorD )" );
124 |     if(debug) System.out.println( "Sign and Factors A, B, C, D: " + sign + ", " + factorA + ", " + factorB + ", " + factorC + ", " + factorD );
125 |     if(debug) System.out.println( "out = " + out );
126 |     return out;
127 |   }
128 | 
129 |   // TODO: this is Binomial Likelihood ?
130 |   // k = word count
131 |   // n = total words in corpus (non-unique)
132 |   // p = k/n, BUT might use different k and n
133 |   static double L( double p, double k, double n ) {
134 | 	double part1 = Math.pow( p, k );
135 | 	double part2 = Math.pow( (1.0-p), (n-k) );
136 | 	return part1 * part2;
137 |   }
138 | 
139 |   // TODO: confirm meaning of +/-
140 |   // plus = heavier in first collection
141 |   // minus = heavier in second collection
142 |   static double sign( double p1, double p2 ) {
143 |     if ( p1 - p2 >= 0.0 ) {
144 |       return 1.0;
145 |     }
146 |     else {
147 |       return -1.0;
148 |     }
149 |   }
150 | 
151 |   // Each word is done individually, across both collections
152 |   //   food: 1.7319479184152442E-13
153 |   //   bananas: 0.4619217019995059
154 |   //   apples: 0.6291706616789394
155 |   //   candy: 60.03320678316341
156 |   //   carrots: 60.03320678316341
157 |   double calcG2_viaDunning( String word ) {
158 |     boolean debug = false;
159 |     if(debug) System.out.println( "\n=== Calculating G2 via Dunning Entropy formula for \"" + word + "\" ===" );
160 |     // Calc H_rowSums
161 | 	// ---------------
162 |     double row1Total = rowTotals.get(word);
163 |     double row2Total = grandTotal - row1Total;
164 |     if(debug) System.out.println( "Row Totals: " + row1Total + "  " + row2Total );
165 |     // plnp = probability * log (probability), log = natural log
166 |     double plogpRow1 = 0.0;
167 |     if ( row1Total > 0.0 ) {
168 |       double prob = row1Total / grandTotal;
169 |       plogpRow1 = prob * Math.log(prob);
170 |       plogp_counter++;
171 |     }
172 |     double plogpRow2 = 0.0;
173 |     if ( row2Total > 0.0 ) {
174 |       double prob = row2Total / grandTotal;
175 |       plogpRow2 = prob * Math.log(prob);
176 |       plogp_counter++;
177 |     }
178 |     double H_rowSums = -1.0 * ( plogpRow1 + plogpRow2 );
179 |     if(debug) System.out.println( "Row plogp 1 & 2 and H_rowSums: " + plogpRow1 + "  " + plogpRow2 + "  " + H_rowSums );
180 | 
181 |     // Calc H_colSums
182 |     // --------------
183 |     // We checked column sums earlier
184 |     double probCol1 = sumA / grandTotal;
185 |     double plogpCol1 = probCol1 * Math.log( probCol1 );
186 |     plogp_counter++;
187 |     double probCol2 = sumB / grandTotal;
188 |     double plogpCol2 = probCol2 * Math.log( probCol2 );
189 |     plogp_counter++;
190 |     double H_colSums = -1.0 * ( plogpCol1 + plogpCol2 );
191 |     if(debug) System.out.println( "Column plogp 1 & 2 and H_colSums: " + plogpCol1 + "  " + plogpCol2 + "  " + H_colSums );
192 | 
193 |     // Calc H_k
194 |     // -----------
195 |     // column 1 counts
196 |     double k_11 = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
197 |     double k_21 = sumA - k_11;  // all other counts
198 |     // column 2 counts
199 |     double k_12 = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
200 |     double k_22 = sumB - k_12;  // all other counts
201 |     if(debug) System.out.println( "K counts:\n\t" + k_11 + "  " + k_12 + "\n\t" + k_21 + "  " + k_22 );
202 |     // probabilities
203 |     double prob_11 = k_11 / grandTotal;
204 |     double prob_21 = k_21 / grandTotal;
205 |     double prob_12 = k_12 / grandTotal;
206 |     double prob_22 = k_22 / grandTotal;
207 |     // p log( p )
208 |     // method has its own counter
209 |     double plogp_11 = plogp( prob_11 );
210 |     double plogp_21 = plogp( prob_21 );
211 |     double plogp_12 = plogp( prob_12 );
212 |     double plogp_22 = plogp( prob_22 );
213 |     // finally H_k
214 |     double H_k = -1.0 * ( plogp_11 + plogp_21 + plogp_12 + plogp_22 );
215 |     if(debug) System.out.println( "K plogp:\n\t" + plogp_11 + "  " + plogp_12 + "\n\t" + plogp_21 + "  " + plogp_22 );
216 |     if(debug) System.out.println( "H_k = " + H_k );
217 | 
218 |     // Dunning's formula
219 |     // http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
220 | //    double G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums );
221 | //    if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums )" );
222 | //    if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_k + " - " + H_rowSums + " - " + H_colSums + " )" );
223 | 
224 |     // Revised, see http://math.stackexchange.com/questions/693114/wrong-result-from-llr-using-dunning-entropy-method
225 |     double G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k );
226 |     if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k )" );
227 |     if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_rowSums + " + " + H_colSums + " - " + H_k + " )" );
228 | 
229 |     return G2;
230 |   }
231 | 
232 |   // Calculates p * log( p )
233 |   // natural log
234 |   // but returns 0.0 if p is 0
235 |   // TODO: maybe some implementitons just add 1 to all counts?
236 |   double plogp( double prob ) {
237 | 	if ( prob > 0.0 ) {
238 | 	  plogp_counter++;
239 | 	  return prob * Math.log( prob );
240 | 	}
241 | 	else {
242 | 	  return 0.0;
243 | 	}
244 |   }
245 | 
246 |   void sortWords() {
247 | 	//  Map<String,Double> scoresByWord = new TreeMap<>();
248 | 	//  Map<String,Double> sortedScoresByWord = new TreeMap<>();
249 |     sortedScoresByWord = SetUtils.sortMapByValues( scoresByWord );
250 |   }
251 | 
252 | //  double pLogP_KOverallWordA( String word ) {
253 | //	double prob = probKOverallWordA( word );
254 | //	if ( prob > 0.0 ) {
255 | //      return prob * Math.log( prob );
256 | //	}
257 | //	else {
258 | //	  return 0.0;
259 | //	}
260 | //  }
261 | //  double pLogP_KOverallWordB( String word ) {
262 | //	double prob = probKOverallWordB( word );
263 | //	if ( prob > 0.0 ) {
264 | //      return prob * Math.log( prob );
265 | //	}
266 | //	else {
267 | //	  return 0.0;
268 | //	}
269 | //  }
270 | //  double probKOverallWordA( String word ) {
271 | //	return probKOverallWord( word, wordsA );
272 | //  }
273 | //  double probKOverallWordB( String word ) {
274 | //    return probKOverallWord( word, wordsB );
275 | //  }
276 | //  double probKOverallWord( String word, Map<String,Long> countMap ) {
277 | //    long count = countMap.containsKey(word) ? countMap.get(word) : 0L;
278 | //    double prob = (double) count / grandTotal;
279 | //    return prob;
280 | //  }
281 | 
282 | //  double sumWithThreshold( Collection<Long> counts ) {
283 | //    double out = 0.0;
284 | //	for ( Long c : counts ) {
285 | //      if ( c >= minWordsThreshold ) {
286 | //    	out += c;
287 | //      }
288 | //    }
289 | //    return out;
290 | //  }
291 | 
292 |   public String generateReport( String optLabel ) {
293 | 	StringWriter sw = new StringWriter();
294 |     PrintWriter out = new PrintWriter(sw);
295 | 
296 |     int sampleSize = 5;
297 | 
298 |     if ( null!=optLabel ) {
299 |     	out.println( "----------- " + optLabel + " -----------" );
300 |     }
301 | 
302 |     out.println();
303 |     out.println( "Corpus A unique / total words: " + wordsA.size() + " / " + sumA );
304 |     out.println( "Corpus B unique / total words: " + wordsB.size() + " / " + sumB );
305 |     out.println( "Combined unique / total words: " + allWords.size() + " / " + grandTotal );
306 |     out.println( "Number of p log(p) calculations: " + plogp_counter );
307 |     out.println();
308 | 
309 |     if ( sortedScoresByWord.size() <= 2 * sampleSize + 1 ) {
310 |       addTermsSliceToReport( out, "All Term Changes", sortedScoresByWord );
311 |     }
312 |     else {
313 |       Map<String,Double> firstTerms = SetUtils.mapHead( sortedScoresByWord, sampleSize );
314 |       addTermsSliceToReport( out, "Term Changes, first " + sampleSize + " entries", firstTerms );
315 |       Map<String,Double> lastTerms = SetUtils.mapTail( sortedScoresByWord, sampleSize );
316 |       addTermsSliceToReport( out, "Term Changes, last " + sampleSize + " entries", lastTerms );
317 |     }
318 | 
319 |     String outStr = sw.toString();
320 |     return outStr;
321 |   }
322 |   void addTermsSliceToReport( PrintWriter out, String label, Map<String,Double> terms ) {
323 |     out.println( "" + label + ":" );
324 |     for ( Entry<String, Double> wordEntry : terms.entrySet() ) {
325 |       String word = wordEntry.getKey();
326 |       double g2 = wordEntry.getValue();
327 |       out.println( "\t" + word + ": " + g2 );
328 |     }  
329 |   }
330 |   
331 |   public static void main( String[] argv ) throws SolrServerException {
332 | //	Map<String,Long> corpusA = new LinkedHashMap<String,Long>() {{
333 | //      // 100k docs total
334 | //	  put( "blog",        25L );  // test word
335 | //	  put( "computer",  3200L );  // other words
336 | //	  put( "internet", 96775L );  // other words
337 | //	}};
338 | //	Map<String,Long> corpusB = new LinkedHashMap<String,Long>() {{
339 | //      // 200k docs total
340 | //      put( "blog",       2500L ); // test word
341 | //      put( "computer",   6000L ); // other words
342 | //      put( "internet", 191500L ); // other words
343 | //    }};
344 | 
345 | //    // Example posted online
346 | //	Map<String,Long> corpusA = new LinkedHashMap<String,Long>() {{
347 | //	  // 100k docs total
348 | //      put( "spam",        40000L );  // test word
349 | //      put( "other words", 60000L );  // other words
350 | //    }};
351 | //    Map<String,Long> corpusB = new LinkedHashMap<String,Long>() {{
352 | //      // 200k docs total
353 | //      put( "spam",        120000L ); // test word
354 | //      put( "other words",  80000L ); // other words
355 | //    }};
356 | 
357 | //  Map<String,Long> corpusA = new LinkedHashMap<String,Long>() {{
358 | //    put( "apples",   25L );
359 | //    put( "bananas",  30L );
360 | //    put( "carrots",  40L );
361 | //    put( "food",    100L );
362 | //  }};
363 | //  Map<String,Long> corpusB = new LinkedHashMap<String,Long>() {{
364 | //    put( "apples",   20L ); // down by 5
365 | //    put( "bananas",  35L ); // up by 5
366 | //    put( "candy",    40L ); // carrots -> candy!
367 | //    put( "food",    100L ); // unchanged, and total unchanged
368 | //  }};
369 | 
370 | 
371 |     HttpSolrServer solrA = SolrUtils.getServer( "localhost", 8984 );   
372 |     HttpSolrServer solrB = SolrUtils.getServer( "localhost", 8985 );
373 |     String fieldName = "text";
374 |     // Set<String> corpusA = SolrUtils.getTermsForField_ViaTermsRequest( solrA, fieldName );
375 |     // Set<String> corpusB = SolrUtils.getTermsForField_ViaTermsRequest( solrB, fieldName );
376 |     Map<String, Long> corpusA = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrA, fieldName );
377 |     Map<String, Long> corpusB = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrB, fieldName );
378 | 
379 | 	LLR llr = new LLR( corpusA, corpusB );
380 |     String report = llr.generateReport( "A -> B" );
381 |     System.out.print( report );
382 | 
383 |   }
384 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/SetUtils.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Collection;
  5 | import java.util.Date;
  6 | import java.util.Iterator;
  7 | import java.util.LinkedHashMap;
  8 | import java.util.LinkedHashSet;
  9 | import java.util.List;
 10 | import java.util.Map;
 11 | import java.util.Map.Entry;
 12 | import java.util.Set;
 13 | import java.util.TreeMap;
 14 | import java.util.TreeSet;
 15 | 
 16 | public class SetUtils {
 17 | 
 18 |   public static void incrementMapCounter( Map<String,Long> tabulationMap, String key ) {
 19 |     Long value = 0L;
 20 |     if ( tabulationMap.containsKey(key) ) {
 21 |       value = tabulationMap.get(key);
 22 |     }
 23 |     value += 1L;
 24 |     tabulationMap.put( key, value );
 25 |   }
 26 | 
 27 | /***
 28 |   public static void incrementMapCounter( Map<Object,Number> tabulationMap, Object key ) {
 29 |     incrementMapCounter( tabulationMap, key, 1 );
 30 |   }
 31 |   public static void incrementMapCounter( Map<Object,Number> tabulationMap, Object key, Number increment ) {
 32 |     Number value = 0;
 33 |     if ( tabulationMap.containsKey(key) ) {
 34 |       value = tabulationMap.get(key);
 35 |     }
 36 |     // value += increment;
 37 |     // value = value + increment;
 38 |     value = value.doubleValue() + increment.doubleValue();
 39 |     tabulationMap.put( key, value );
 40 |   }
 41 | ***/
 42 | 
 43 | /***
 44 |   // posted to http://stackoverflow.com/questions/26551403/method-call-doesnt-match-method-signature-even-though-method-is-using-more-gene
 45 |   public static void incrementMapCounter( Map<Object,Number> tabulationMap, Object key ) {
 46 |     Number value = 0;
 47 |     if ( tabulationMap.containsKey(key) ) {
 48 |       value = tabulationMap.get(key);
 49 |     }
 50 |     value = value.doubleValue() + new Double(1);
 51 |     tabulationMap.put( key, value );
 52 |   }
 53 | ***/
 54 | /***
 55 |   // public static void incrementMapCounter( Map<?,? extends Number> tabulationMap, Object key )
 56 |   public static <K,V extends Number> void incrementMapCounter( Map<K,V> tabulationMap, K key )
 57 |   // public static void incrementMapCounter( Map<?,? extends Number> tabulationMap, <? extends Object> key )
 58 |   {
 59 |     Number value = 0;
 60 |     if ( tabulationMap.containsKey(key) ) {
 61 |       value = tabulationMap.get(key);
 62 |     }
 63 |     value = value.doubleValue() + new Double(1);
 64 |     tabulationMap.put( key, value );
 65 |   }
 66 | ***/
 67 | 
 68 |   /**
 69 |    * @deprecated use {@link StringUtils#join(Collection)} instead.  
 70 |    */
 71 |   @Deprecated
 72 |   public static String join( Collection<String> strings ) {
 73 |     return StringUtils.join( strings );
 74 |   }
 75 |   /**
 76 |    * @deprecated use {@link StringUtils#join(Collection, String)} instead.  
 77 |    */
 78 |   @Deprecated
 79 |   public static String join( Collection<String> strings, String delimiter ) {
 80 |     return StringUtils.join( strings, delimiter );
 81 |   }
 82 |   /**
 83 |    * @deprecated use {@link StringUtils#splitCsv(String)} instead.  
 84 |    */
 85 |   @Deprecated  
 86 |   public static Set<String> splitCsv( String inStr ) {
 87 |     return StringUtils.splitCsv( inStr );
 88 |   }
 89 | 
 90 |   // Assumes always using LinkedHashMap which keep things in predictable insertion order
 91 |   public static <K,V> Map<K,V> reverseMapEntryKeyOrder( Map<K,V> inEntries ) {
 92 |     List<K> keys = new ArrayList<>( inEntries.keySet() );
 93 |     List<V> values = new ArrayList<>( inEntries.values() );
 94 |     if ( keys.size() != values.size() ) {
 95 |       throw new IllegalStateException( "Number of of keys (" + keys.size() + ") != number of values (" + values.size() );
 96 |     }
 97 |     Map<K,V> out = new LinkedHashMap<>();
 98 |     for ( int i=keys.size()-1; i>=0; i-- ) {
 99 |       out.put( keys.get(i), values.get(i) );
100 |     }
101 |     return out;
102 |   }
103 | 
104 |   public static <K,V> Map<K,V> mapHead( Map<K,V> inEntries, int n ) {
105 |     if ( n < 1 ) {
106 |       throw new IllegalStateException( "Number of desired entries must be > 0, but n = " + n );
107 |     }
108 |     // TODO: safe to do this?
109 |     //if ( n >= inEntries.size() ) {
110 |     //  return inEntries;
111 |     //}
112 |     Map<K,V> out = new LinkedHashMap<>();
113 |     int counter = 0;
114 |     for ( Entry<K, V> entry : inEntries.entrySet() ) {
115 |       out.put( entry.getKey(), entry.getValue() );
116 |       counter++;
117 |       if ( counter >= n ) {
118 |         break;
119 |       }
120 |     }
121 |     // for ( int i=1; i<=n; i++ )
122 |     return out;
123 |   }
124 |   public static <K,V> Map<K,V> mapTail( Map<K,V> inEntries, int n ) {
125 |     if ( n < 1 ) {
126 |       throw new IllegalStateException( "Number of desired entries must be > 0, but n = " + n );
127 |     }
128 |     List<K> keys = new ArrayList<>( inEntries.keySet() );
129 |     List<V> values = new ArrayList<>( inEntries.values() );
130 |     if ( keys.size() != values.size() ) {
131 |       throw new IllegalStateException( "Number of of keys (" + keys.size() + ") != number of values (" + values.size() );
132 |     }
133 |     Map<K,V> out = new LinkedHashMap<>();
134 |     int start = inEntries.size() - n - 1;
135 |     if ( start<0 ) start = 0;
136 |     for ( int i=start; i<keys.size(); i++ ) {
137 |       out.put( keys.get(i), values.get(i) );
138 |     }
139 |     return out;
140 |   }
141 | 
142 |   public static <K,V> Map<K,V> sortMapByValues( Map<K,V> inMap ) {
143 |     // Inverting also sorts because we use TreeMap
144 |     Map<V,Set<K>> invertedMap = invertMapAndSort( inMap );
145 |     // This preserves the new order
146 |     Map<K,V> out = uninvertMap( invertedMap );
147 |     return out;
148 |   }
149 |   // using tree map for output, so automatically sorted
150 |   public static <K,V> Map<V,Set<K>> invertMapAndSort( Map<K,V> inMap ) {
151 |     Map<V,Set<K>> out = new TreeMap<>();
152 |     for ( Entry<K, V> entry : inMap.entrySet() ) {
153 |       K key = entry.getKey();
154 |       V value = entry.getValue();
155 |       if ( out.containsKey(value) ) {
156 |         Set<K> vector = out.get(value);
157 |         vector.add( key );
158 |       }
159 |       else {
160 |         Set<K> vector = new TreeSet<>();
161 |         vector.add( key );
162 |         out.put( value, vector );
163 |       }
164 |     }
165 |     return out;
166 |   }
167 |   // Preserve insertion order
168 |   public static <K,V> Map<K,V> uninvertMap( Map<V,Set<K>> inMap ) {
169 |     Map<K,V> out = new LinkedHashMap<>();
170 |     for ( Entry<V, Set<K>> entry : inMap.entrySet() ) {
171 |       V value = entry.getKey();
172 |       Set<K> keys = entry.getValue();
173 |       for ( K k : keys ) {
174 |         if ( out.containsKey(k) ) {
175 |           throw new IllegalArgumentException( "Duplicate entries for supposed unique key " + k );
176 |         }
177 |         out.put( k, value );
178 |       }
179 |     }
180 |     return out;
181 |   }
182 | 
183 |   public static boolean sameAndInSameOrder( Set<String> idsA, Set<String> idsB ) {
184 |     // Bunch of edge cases
185 |     // TODO: maybe move edge cases to same set
186 |     // TODO: other methods don't do null checking....
187 |     if ( null==idsA && null==idsB ) {
188 |       return true;
189 |     }
190 |     if ( null==idsA ) {
191 |       return null==idsB || idsB.isEmpty();
192 |     }
193 |     if ( null==idsB ) {
194 |       return null==idsA || idsA.isEmpty();
195 |     }
196 |     if ( idsA.isEmpty() && idsB.isEmpty() ) {
197 |       return true;
198 |     }
199 |     if ( idsA.size() != idsB.size() ) {
200 |       return false;
201 |     }
202 |     Set<String> onlyA = inAOnly_nonDestructive( idsA, idsB );
203 |     Set<String> onlyB = inBOnly_nonDestructive( idsA, idsB );
204 |     if ( ! onlyA.isEmpty() || ! onlyB.isEmpty() ) {
205 |       return false;
206 |     }
207 |     // OK, walk them together
208 |     // And we've checked the sizes
209 |     Iterator<String> itA = idsA.iterator();
210 |     Iterator<String> itB = idsB.iterator();
211 | 
212 |     // Note:
213 |     // The while and if checks look redundant
214 |     // but they handle the very unlikely edge case
215 |     // where one list is added to while we're looping
216 |     // and gets longer - that means FALSE
217 |     // but if loop just ended we'd accidently return true
218 |     while ( itA.hasNext() || itB.hasNext() ) {
219 |       if ( ! itA.hasNext() || ! itB.hasNext() ) {
220 |         return false;
221 |       }
222 |       String itemA = itA.next();
223 |       String itemB = itB.next();
224 |       if ( ! itemA.equals(itemB) ) {
225 |         return false;
226 |       }
227 |     }
228 |     // All tests have passed
229 |     return true;
230 |   }
231 | 
232 |   // TODO: refactor to handle anything implementing Collection
233 |   public static boolean sameAndInSameOrder( Collection<String> idsA, Collection<String> idsB ) {
234 |     // Bunch of edge cases
235 |     // TODO: maybe move edge cases to same set
236 |     // TODO: other methods don't do null checking....
237 |     if ( null==idsA && null==idsB ) {
238 |       return true;
239 |     }
240 |     if ( null==idsA ) {
241 |       return null==idsB || idsB.isEmpty();
242 |     }
243 |     if ( null==idsB ) {
244 |       return null==idsA || idsA.isEmpty();
245 |     }
246 |     if ( idsA.isEmpty() && idsB.isEmpty() ) {
247 |       return true;
248 |     }
249 |     if ( idsA.size() != idsB.size() ) {
250 |       return false;
251 |     }
252 |     Collection<String> onlyA = inAOnly_nonDestructive( idsA, idsB );
253 |     Collection<String> onlyB = inBOnly_nonDestructive( idsA, idsB );
254 |     if ( ! onlyA.isEmpty() || ! onlyB.isEmpty() ) {
255 |       return false;
256 |     }
257 |     // OK, walk them together
258 |     // And we've checked the sizes
259 |     Iterator<String> itA = idsA.iterator();
260 |     Iterator<String> itB = idsB.iterator();
261 | 
262 |     // Note:
263 |     // The while and if checks look redundant
264 |     // but they handle the very unlikely edge case
265 |     // where one list is added to while we're looping
266 |     // and gets longer - that means FALSE
267 |     // but if loop just ended we'd accidently return true
268 |     while ( itA.hasNext() || itB.hasNext() ) {
269 |       if ( ! itA.hasNext() || ! itB.hasNext() ) {
270 |         return false;
271 |       }
272 |       String itemA = itA.next();
273 |       String itemB = itB.next();
274 |       if ( ! itemA.equals(itemB) ) {
275 |         return false;
276 |       }
277 |     }
278 |     // All tests have passed
279 |     return true;
280 |   }
281 | 
282 |   // Non-Destructive
283 | 
284 |   public static Set<String> inAOnly_nonDestructive( Set<String> idsA, Set<String> idsB ) {
285 |     Set<String> out = new LinkedHashSet<>();
286 |     out.addAll( idsA );
287 |     out.removeAll( idsB );
288 |     return out;
289 |   }
290 |   // TODO: redo so it takes anything derived from Collection
291 |   public static Collection<String> inAOnly_nonDestructive( Collection<String> idsA, Collection<String> idsB ) {
292 |     Set<String> out = new LinkedHashSet<>();
293 |     out.addAll( idsA );
294 |     out.removeAll( idsB );
295 |     return out;
296 |   }
297 |   public static Set<String> inBOnly_nonDestructive( Set<String> idsA, Set<String> idsB ) {
298 |     return inAOnly_nonDestructive( idsB, idsA );
299 |   }
300 |   // TODO: redo so it takes anything derived from Collection
301 |   public static Collection<String> inBOnly_nonDestructive( Collection<String> idsA, Collection<String> idsB ) {
302 |     return inAOnly_nonDestructive( idsB, idsA );
303 |   }
304 |   public static Set<String> intersection_nonDestructive( Set<String> idsA, Set<String> idsB ) {
305 |     Set<String> out = new LinkedHashSet<>();
306 |     out.addAll( idsA );
307 |     out.retainAll( idsB );
308 |     return out;
309 |   }
310 |   public static Collection<String> intersection_nonDestructive( Collection<String> idsA, Collection<String> idsB ) {
311 |     Set<String> out = new LinkedHashSet<>();
312 |     out.addAll( idsA );
313 |     out.retainAll( idsB );
314 |     return out;
315 |   }
316 |   public static Set<String> union_nonDestructive( Set<String> idsA, Set<String> idsB ) {
317 |     Set<String> out = new LinkedHashSet<>();
318 |     out.addAll( idsA );
319 |     out.addAll( idsB );
320 |     return out;
321 |   }
322 | 
323 |   // Destructive
324 | 
325 |   public static Set<String> inAOnly_destructive( Set<String> idsA, Set<String> idsB ) {
326 |     idsA.removeAll( idsB );
327 |     return idsA;
328 |   }
329 |   public static Set<String> inBOnly_destructive( Set<String> idsA, Set<String> idsB ) {
330 |     return inAOnly_destructive( idsB, idsA );
331 |   }
332 |   public static Set<String> intersection_destructive( Set<String> idsA, Set<String> idsB ) {
333 |     idsA.retainAll( idsB );
334 |     return idsB;
335 |   }
336 |   public static Set<String> union_destructive( Set<String> idsA, Set<String> idsB ) {
337 |     idsA.addAll( idsB );
338 |     return idsA;
339 |   }
340 | 
341 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/StatsUtils.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Collection;
  5 | import java.util.Iterator;
  6 | import java.util.LinkedHashSet;
  7 | import java.util.List;
  8 | import java.util.Set;
  9 | 
 10 | public class StatsUtils {
 11 | 
 12 |   // Quick Stats
 13 |   // For ints, we still return long as a sum
 14 |   public static long sumList_Ints( Collection<Integer> in ) {
 15 |     long out = 0L;
 16 |     for ( Integer i : in ) {
 17 |       if ( null!=i ) {
 18 |         out += i;
 19 |       }
 20 |     }
 21 |     return out;
 22 |   }
 23 |   public static long sumList_Longs( Collection<Long> in ) {
 24 |     long out = 0L;
 25 |     for ( Long i : in ) {
 26 |       if ( null!=i ) {
 27 |         out += i;
 28 |       }
 29 |     }
 30 |     return out;
 31 |   }
 32 |   public static double sumList_Doubles( Collection<Double> in ) {
 33 |     double out = 0L;
 34 |     for ( Double i : in ) {
 35 |       if ( null!=i ) {
 36 |         out += i;
 37 |       }
 38 |     }
 39 |     return out;
 40 |   }
 41 |   public static int minList_Ints( Collection<Integer> in ) {
 42 |     int out = Integer.MAX_VALUE;
 43 |     for ( Integer i : in ) {
 44 |       if ( i < out ) {
 45 |         out = i;
 46 |       }
 47 |     }
 48 |     return out;
 49 |   }
 50 |   public static long minList_Longs( Collection<Long> in ) {
 51 |     long out = Long.MAX_VALUE;
 52 |     for ( Long i : in ) {
 53 |       if ( i < out ) {
 54 |         out = i;
 55 |       }
 56 |     }
 57 |     return out;
 58 |   }
 59 |   public static int maxList_Ints( Collection<Integer> in ) {
 60 |     int out = Integer.MIN_VALUE;
 61 |     for ( Integer i : in ) {
 62 |       if ( i > out ) {
 63 |         out = i;
 64 |       }
 65 |     }
 66 |     return out;
 67 |   }
 68 |   public static long maxList_Longs( Collection<Long> in ) {
 69 |     long out = Long.MIN_VALUE;
 70 |     for ( Long i : in ) {
 71 |       if ( i > out ) {
 72 |         out = i;
 73 |       }
 74 |     }
 75 |     return out;
 76 |   }
 77 |   public static double averageList_Ints( Collection<Integer> in ) {
 78 |     if ( null==in || in.isEmpty() ) {
 79 |       return Double.NaN;
 80 |     }
 81 |     Long sum = sumList_Ints( in );
 82 |     return (double) sum / (double) in.size();
 83 |     // return new Double(sum) / new Double(in.size());
 84 |   }
 85 |   public static double averageList_Longs( Collection<Long> in ) {
 86 |     if ( null==in || in.isEmpty() ) {
 87 |       return Double.NaN;
 88 |     }
 89 |     Long sum = sumList_Longs( in );
 90 |     return (double) sum / (double) in.size();
 91 |     // return new Double(sum) / new Double(in.size());
 92 |   }
 93 |   public static double averageList_Doubles( Collection<Double> in ) {
 94 |     if ( null==in || in.isEmpty() ) {
 95 |       return Double.NaN;
 96 |     }
 97 |     Double sum = sumList_Doubles( in );
 98 |     return (double) sum / (double) in.size();
 99 |     // return new Double(sum) / new Double(in.size());
100 |   }
101 |   // TODO: assumes full-population std, could add flag for sample, N-1 logic
102 |   public static double standardDeviationList_Ints( Collection<Integer> in ) {
103 |     if ( null==in || in.isEmpty() ) {
104 |       return 0.0;
105 |     }
106 |     double avg = averageList_Ints( in );
107 |     double sumOfDeltaSquared = 0.0;
108 |     for ( int i : in ) {
109 |       // Order doesn't matter since we square it
110 |       double delta = avg - (double)i;
111 |       sumOfDeltaSquared += delta * delta;
112 |     }
113 |     return Math.sqrt( sumOfDeltaSquared / (double)in.size() );
114 |   }
115 |   public static double standardDeviationList_Longs( Collection<Long> in ) {
116 |     if ( null==in || in.isEmpty() ) {
117 |       return 0.0;
118 |     }
119 |     double avg = averageList_Longs( in );
120 |     double sumOfDeltaSquared = 0.0;
121 |     for ( long i : in ) {
122 |       // Order doesn't matter since we square it
123 |       double delta = avg - (double) i;
124 |       sumOfDeltaSquared += delta * delta;
125 |     }
126 |     return Math.sqrt( sumOfDeltaSquared / (double)in.size() );
127 |   }
128 | 
129 |   public static List<Double> longs2Doubles( Collection<Long> longs ) {
130 |     List<Double> out = new ArrayList<>();
131 |     for ( Long l : longs ) {
132 |       Double d = new Double(l);
133 |       out.add(  new Double( l )  );
134 |     }
135 |     return out;
136 |   }
137 | 
138 |   // http://math.stackexchange.com/questions/350754/fitting-exponential-curve-to-data
139 |   // Returns [A,k] for y = A e^kx
140 |   public static double [] leastSquares_Exponential( List<Double> xList, List<Double> yList ) {
141 |     List<Double> xList2 = new ArrayList<>();
142 |     List<Double> yList2 = new ArrayList<>();
143 |     // Skip zeros!
144 |     for ( int i=0; i<xList.size(); i++ ) {
145 |       Double y = yList.get(i);
146 |       if ( y > 0.0 ) {
147 |         Double x = xList.get(i);
148 |         double y2 = Math.log(y);
149 |         xList2.add( x );
150 |         yList2.add( y2 );
151 |       }
152 |       // yList2.add( Math.log(d) );
153 |     }
154 |     double [] line = leastSquares_Line( xList2, yList2 );
155 |     double m = line[0];
156 |     double b = line[1];
157 |     double A = Math.exp( b );
158 |     double k = m;
159 |     double out[] = new double[2];
160 |     out[0] = A;
161 |     out[1] = k;
162 |     System.out.println( "leastSquares_Exponential: returning [A, k] = [" + A + ", " + k + "]" );
163 |     return out;
164 |   }
165 |   // Retruns [m, b] for y = mx+b
166 |   // http://hotmath.com/hotmath_help/topics/line-of-best-fit.html
167 |   public static double [] leastSquares_Line( List<Double> xList, List<Double> yList ) {
168 |     double m = 0;
169 |     double b = 0;
170 |     if ( xList.size() != yList.size() ) {
171 |       throw new IllegalStateException( "Number of of x values (" + xList.size() + ") != number of y (" + yList.size() );
172 |     }
173 |     if ( xList.size() > 0 ) {
174 |       double sumX = 0;
175 |       double sumY = 0;
176 |       double sumXY = 0;
177 |       double sumSquaredX = 0;
178 |       for ( int i=0; i<xList.size(); i++ ) {
179 |         double x = xList.get(i);
180 |         double y = yList.get(i);
181 |         sumX += x;
182 |         sumY += y;
183 |         sumXY += x * y;
184 |         sumSquaredX += x * x;
185 |       }
186 |       double meanX = averageList_Doubles( xList );
187 |       double meanY = averageList_Doubles( yList );
188 |       double n = (double) xList.size();
189 |       double m_mumerator = sumXY - sumX*sumY / n;
190 |       double m_denominator = sumSquaredX - sumX*sumX / n;
191 |       m = m_mumerator / m_denominator;
192 |       b = meanY - m * meanX;
193 |       System.out.println( "leastSquares_Line: x["+xList.size()+"] = " + xList );
194 |       System.out.println( "leastSquares_Line: y["+xList.size()+"] = " + yList );
195 |       System.out.println( "leastSquares_Line: sumX = " + sumX );
196 |       System.out.println( "leastSquares_Line: sumY = " + sumY );
197 |       System.out.println( "leastSquares_Line: sumXY = " + sumXY );
198 |       System.out.println( "leastSquares_Line: sumSquaredX = " + sumSquaredX );
199 |       System.out.println( "leastSquares_Line: meanX = " + meanX );
200 |       System.out.println( "leastSquares_Line: meanY = " + meanY );
201 |       System.out.println( "leastSquares_Line: n = " + n );
202 |       System.out.println( "leastSquares_Line: m_mumerator = " + m_mumerator );
203 |       System.out.println( "leastSquares_Line: m_denominator = " + m_denominator );
204 |       System.out.println( "leastSquares_Line: returning [m, b] = [" + m + ", " + b + "]" );
205 |     }
206 |     else {
207 |       System.err.println( "Warn: leastSquares_Line: no values, nothing to do, returning zeros" );
208 |     }
209 |     double out[] = new double[2];
210 |     out[0] = m;
211 |     out[1] = b;
212 |     return out;
213 |   }
214 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/StringUtils.java:
--------------------------------------------------------------------------------
  1 | package com.lucidworks.dq.util;
  2 | 
  3 | import java.io.UnsupportedEncodingException;
  4 | import java.net.URLDecoder;
  5 | import java.util.ArrayList;
  6 | import java.util.Collection;
  7 | import java.util.LinkedHashMap;
  8 | import java.util.LinkedHashSet;
  9 | import java.util.Map;
 10 | import java.util.Set;
 11 | import java.util.TreeMap;
 12 | import java.util.regex.Matcher;
 13 | import java.util.regex.Pattern;
 14 | 
 15 | public class StringUtils {
 16 | 
 17 |   public static String NL = System.getProperty("line.separator");
 18 | 
 19 |   public static String parseAndCatchGroupAsStringOrNull( String patternStr, String sourceText, int groupNumber ) {
 20 |     // Java caches pattern compilations
 21 |     Pattern pattern = Pattern.compile( patternStr );
 22 |     Matcher matcher = pattern.matcher( sourceText );   
 23 |     if ( matcher.find() ) {
 24 |       return matcher.group( groupNumber );
 25 |     }
 26 |     else {
 27 |       return null;
 28 |     }
 29 |   }
 30 | 
 31 |   public static Long parseAndCatchGroupAsLongOrNull( String patternStr, String sourceText, int groupNumber ) {
 32 |     String matchStr = parseAndCatchGroupAsStringOrNull( patternStr, sourceText, groupNumber );
 33 |     if ( null!=matchStr ) {
 34 |       // Java caches valueOf object results
 35 |       return Long.valueOf( matchStr );
 36 |     }
 37 |     else {
 38 |       return null;
 39 |     }
 40 |   }
 41 | 
 42 |   /*
 43 |    * Convert:
 44 |    *  color=red&fruit=apple&fruit=banana&desert=&lone+string
 45 |    * into:
 46 |    *  { color: [red], fruit: [apple, banana], desert: [""], content: [pizza] }
 47 |    * TODO: feels like reinventing the wheel, though want very specific rules applied...
 48 |    */
 49 |   public static Map<String,Collection<String>> parseCgiParameters( String rawText ) {
 50 | 
 51 |     // picky options we might expose later on
 52 |     boolean maintainInsertionOrder = false;
 53 |     boolean isCaseSensitiveKeys = true;
 54 |     boolean trimKeys = true;
 55 |     String defaultParamName = "content";
 56 |     String encoding = "UTF-8";
 57 |     // Value normalization might vary by parameter name
 58 |     // TODO: separate method to look for CSV and space delimited values
 59 |     // TODO: separate method to perhaps provide default values
 60 | 
 61 |     // Map<String,Collection<String>> outMap = maintainInsertionOrder ? new LinkedHashMap<>() : new TreeMap<>();
 62 |     Map<String,Collection<String>> outMap = null;
 63 |     if ( maintainInsertionOrder ) {
 64 |       outMap = new LinkedHashMap<>();
 65 |     }
 66 |     else {
 67 |       outMap = new TreeMap<>();
 68 |     }
 69 | 
 70 |     // Break on & and ? (usually just &)
 71 |     String [] args = rawText.split( "[?&]" );
 72 |     for ( int i=0; i<args.length; i++ ) {
 73 | 
 74 |       String arg = args[i];
 75 |       // Skip empty entries
 76 |       if ( arg.isEmpty() ) {
 77 |         continue;
 78 |       }
 79 | 
 80 |       // Break on FIRST equals sign with arg
 81 |       int equalsAt = arg.indexOf( '=' );
 82 |       String key = "";
 83 |       String value = "";
 84 |       if ( equalsAt >= 0 ) {
 85 |         if ( equalsAt > 0 ) {
 86 |           key = arg.substring( 0, equalsAt );
 87 |         }
 88 |         if ( equalsAt < arg.length() ) {
 89 |           value = arg.substring( equalsAt + 1 );
 90 |         }
 91 |       }
 92 |       else {
 93 |         key = arg;
 94 |       }
 95 | 
 96 |       // Normalize key and value
 97 |       try {
 98 |         key = URLDecoder.decode( key, encoding );
 99 |       } catch (UnsupportedEncodingException e) {
100 |         e.printStackTrace();
101 |       }
102 |       if ( trimKeys ) {
103 |         key = key.trim();
104 |       }
105 |       if ( key.isEmpty() ) {
106 |         key = defaultParamName;
107 |       }
108 |       if ( ! isCaseSensitiveKeys ) {
109 |         key = key.toLowerCase();
110 |       }
111 |       // normalization of values can be handled via additioanl methods
112 |       try {
113 |         value = URLDecoder.decode( value, encoding );
114 |       } catch (UnsupportedEncodingException e) {
115 |         e.printStackTrace();
116 |       }
117 | 
118 |       // Tabulate
119 |       Collection<String> values = null;
120 |       if ( outMap.containsKey(key) ) {
121 |         values = outMap.get(key);
122 |       }
123 |       else {
124 |         values = new ArrayList<>();
125 |         outMap.put( key, values );
126 |       }
127 |       values.add( value );
128 |       
129 |     }
130 |   
131 |     return outMap;
132 |   }
133 | 
134 |   // AKA multiplyString
135 |   // TODO: Or use org.apache.commons.lang.StringUtils.repeat(...) ???
136 |   public static String repeatString( String s, int n ) {
137 |     if ( n <= 0 ) { return ""; }
138 |     // http://stackoverflow.com/questions/1235179/simple-way-to-repeat-a-string-in-java
139 |     return new String(new char[n]).replace("\0", s);
140 |     // return String.format(String.format("%%0%dd", n), 0).replace("0",s);
141 |   }
142 | 
143 |   public static String join( Collection<String> strings ) {
144 |     return join( strings, ", " );
145 |   }
146 |   public static String join( Collection<String> strings, String delimiter ) {
147 |     StringBuffer out = new StringBuffer();
148 |     boolean isFirst = true;
149 |     for ( String s : strings ) {
150 |       if ( ! isFirst ) {
151 |         out.append( delimiter );
152 |       }
153 |       else {
154 |         isFirst = false;
155 |       }
156 |       out.append( s );
157 |     }
158 |     return new String( out );
159 |   }
160 | 
161 |   // Handy for comma separated field names list, etc
162 |   public static Set<String> splitCsv( String inStr ) {
163 |     String[] fieldsAry = inStr.split( ",\\s*" );
164 |     // maintains order of insertion
165 |     Set<String> out = new LinkedHashSet<>();
166 |     for ( String f : fieldsAry ) {
167 |       if ( ! f.trim().isEmpty() ) {
168 |         out.add( f.trim() );
169 |       }
170 |     }
171 |     return out;
172 |   }
173 | 
174 |   public static String escapeSpaces( String inStr ) {
175 |     if ( null==inStr ) {
176 |       return null;
177 |     }
178 |     return inStr.replaceAll( "[ ]", "\\\\ " );
179 |   }
180 |   public static String escapeColons( String inStr ) {
181 |     if ( null==inStr ) {
182 |       return null;
183 |     }
184 |     return inStr.replaceAll( "[:]", "\\\\:" );
185 |   }
186 | 
187 |   /**
188 |    * Based on code from:
189 |    * http://stackoverflow.com/questions/1247772 and
190 |    * http://stackoverflow.com/a/17369948/295802
191 |    * 
192 |    * Converts a standard POSIX Shell globbing pattern into a regular expression
193 |    * pattern. The result can be used with the standard {@link java.util.regex} API to
194 |    * recognize strings which match the glob pattern.
195 |    * <p/>
196 |    * See also, the POSIX Shell language:
197 |    * http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_13_01
198 |    * 
199 |    * @param pattern A glob pattern.
200 |    * @return A regex pattern to recognize the given glob pattern.
201 |    */
202 |   public static final String convertGlobToRegex(String pattern) {
203 |     StringBuilder sb = new StringBuilder(pattern.length() * 2);
204 |     int inGroup = 0;
205 |     int inClass = 0;
206 |     int firstIndexInClass = -1;
207 |     char[] arr = pattern.toCharArray();
208 |     for (int i = 0; i < arr.length; i++) {
209 |       char ch = arr[i];
210 |       switch (ch) {
211 |       case '\\':
212 |         if (++i >= arr.length) {
213 |           sb.append('\\');
214 |         } else {
215 |           char next = arr[i];
216 |           switch (next) {
217 |           case ',':
218 |             // escape not needed
219 |             break;
220 |           case 'Q':
221 |           case 'E':
222 |             // extra escape needed
223 |             sb.append('\\');
224 |           default:
225 |             sb.append('\\');
226 |           }
227 |           sb.append(next);
228 |         }
229 |         break;
230 |       case '*':
231 |         if (inClass == 0)
232 |           sb.append(".*");
233 |         else
234 |           sb.append('*');
235 |         break;
236 |       case '?':
237 |         if (inClass == 0)
238 |           sb.append('.');
239 |         else
240 |           sb.append('?');
241 |         break;
242 |       case '[':
243 |         inClass++;
244 |         firstIndexInClass = i + 1;
245 |         sb.append('[');
246 |         break;
247 |       case ']':
248 |         inClass--;
249 |         sb.append(']');
250 |         break;
251 |       case '.':
252 |       case '(':
253 |       case ')':
254 |       case '+':
255 |       case '|':
256 |       case '^':
257 |       case '$':
258 |       case '@':
259 |       case '%':
260 |         if (inClass == 0 || (firstIndexInClass == i && ch == '^'))
261 |           sb.append('\\');
262 |         sb.append(ch);
263 |         break;
264 |       case '!':
265 |         if (firstIndexInClass == i)
266 |           sb.append('^');
267 |         else
268 |           sb.append('!');
269 |         break;
270 |       case '{':
271 |         inGroup++;
272 |         sb.append('(');
273 |         break;
274 |       case '}':
275 |         inGroup--;
276 |         sb.append(')');
277 |         break;
278 |       case ',':
279 |         if (inGroup > 0)
280 |           sb.append('|');
281 |         else
282 |           sb.append(',');
283 |         break;
284 |       default:
285 |         sb.append(ch);
286 |       }
287 |     }
288 |     return sb.toString();
289 |   }
290 | 
291 |   // TODO: could also do list of matches with m.reset(myNewString), might be slightly faster
292 |   public static boolean checkPatternsInList( Collection<Pattern> patterns, String targetString ) {
293 |     for ( Pattern p : patterns ) {
294 |       Matcher m = p.matcher( targetString );
295 |       if ( m.matches() ) {
296 |         return true;
297 |       }
298 |     }
299 |     return false;
300 |   }
301 | 
302 | 
303 | }
304 | 


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/util/TupleEntropy.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.util;
 2 | 
 3 | import java.util.LinkedHashMap;
 4 | import java.util.Map;
 5 | import java.util.Map.Entry;
 6 | 
 7 | public class TupleEntropy {
 8 |   static boolean debug = false;
 9 |   public static double calcTupleEntropyForAllLengths( String word ) {
10 |     double outEntropy = 0.0;
11 |     for ( int i=1; i<=word.length(); i++ ) {
12 |       double newEntropy = calcTupleEntropyForLength( word, i );
13 |       outEntropy += newEntropy;
14 |     }
15 |     return outEntropy;
16 |   }
17 |   public static double calcTupleEntropyForLength( String word, int len ) {
18 |     Map<String,Double> tupleStats = calcTuplesForLen( word, len );
19 |     double outEntropy = calcEntropyForCounts( tupleStats );
20 |     if(debug) System.out.println( "\tTuple Len: " + len + " has " + tupleStats.keySet().size() + " / " + StatsUtils.sumList_Doubles(tupleStats.values())  + " unique/total" );
21 |     if(debug) System.out.println( "\t\tTuples: " + tupleStats );
22 |     if(debug) System.out.println( "\t\tEntropy = " + outEntropy );
23 |     return outEntropy;
24 |   }
25 |   public static double calcEntropyForCounts( Map<String,Double> inMap ) {
26 |     if ( null==inMap || inMap.isEmpty() ) {
27 |       return 0.0;
28 |     }
29 |     double sum = StatsUtils.sumList_Doubles( inMap.values() );
30 |     if ( sum <= 0.0 ) {
31 |       return 0.0;
32 |     }
33 |     double outEntropy = 0.0;
34 |     for ( Entry<String, Double> entry : inMap.entrySet() ) {
35 |       String word = entry.getKey();
36 |       double count = entry.getValue();
37 |       double prob = count / sum;
38 |       if ( prob > 0.0 ) {
39 |         double newEntropy = -1.0 * prob * Math.log( prob );
40 |         outEntropy += newEntropy;
41 |       }
42 |     }
43 |     return outEntropy;
44 |   }
45 |   public static Map<String,Double> calcTuplesForLen( String word, int len ) {
46 |     Map<String,Double> out = new LinkedHashMap<>();
47 |     if ( len > 0 && word.length() >= len ) {
48 |       for ( int i=0; i <= word.length() - len; i++ ) {
49 |         String tupe = word.substring( i, i + len );
50 |         double oldCount = 0.0;
51 |         if ( out.containsKey(tupe) ) {
52 |           oldCount = out.get( tupe );
53 |         }
54 |         out.put( tupe, oldCount + 1.0 );
55 |       }
56 |     }
57 |     return out;
58 |   }
59 |   public static void main( String[] argv ) {
60 |     for ( String word : argv ) {
61 |       if(debug) System.out.println( "Word: \"" + word + "\"" );
62 |       double entropy = calcTupleEntropyForAllLengths( word );
63 |       if(debug) System.out.println( "\ttotal for word \"" + word + "\": " + entropy );
64 |       if ( ! debug ) {
65 |         System.out.println( "" + word + "\t" + word.length() + "\t" + entropy );
66 |       }
67 |     }
68 |   }
69 | }


--------------------------------------------------------------------------------
/src/main/java/com/lucidworks/dq/zk_experiment/ZkSmartClient.java:
--------------------------------------------------------------------------------
 1 | package com.lucidworks.dq.zk_experiment;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Set;
 5 | 
 6 | import org.apache.solr.client.solrj.SolrQuery;
 7 | import org.apache.solr.client.solrj.SolrServer;
 8 | import org.apache.solr.client.solrj.SolrServerException;
 9 | import org.apache.solr.client.solrj.impl.CloudSolrServer;
10 | import org.apache.solr.client.solrj.response.QueryResponse;
11 | import org.apache.solr.common.SolrDocument;
12 | import org.apache.solr.common.SolrInputDocument;
13 | 
14 | /*
15 |  * Demonstrate a ZooKeeper aware "Smart Client" that can automatically handle server issues
16 |  */
17 | public class ZkSmartClient {
18 |   static final String COLLECTION = "collection1";
19 |   static final String ID_FIELD = "id";
20 | 
21 |   // BEFORE: Would normally use this w/ HttpSolrServer
22 |   static final String SOLR_URL = "http://localhost:8983/solr/" + COLLECTION;
23 | 
24 |   // AFTER: Instead we use this w/ CloudSolrServer
25 |   // These are ZooKeeper instances, could map to 1 or 100 Solr servers
26 |   static final String ZK_ENSEMBLE = "localhost:2181,localhost:2182,localhost:2183";
27 |  
28 |   static SolrServer openServer() {
29 | 
30 |     // BEFORE: Normally we'd use this:
31 |     // HttpSolrServer extends SolrServer
32 |     // HttpSolrServer server = new HttpSolrServer( serverUrl );
33 |     
34 |     // AFTER: Instead we use this:
35 |     // CloudSolrServer extends SolrServer
36 |     CloudSolrServer server = new CloudSolrServer( ZK_ENSEMBLE );
37 |     // .setDefaultColl not defined for base SolrServer type
38 |     server.setDefaultCollection( COLLECTION );
39 | 
40 |     return server;
41 |   }
42 | 
43 |   static void addDoc( SolrServer server, int id ) throws SolrServerException, IOException {
44 |     SolrInputDocument doc = new SolrInputDocument();
45 |     doc.addField( ID_FIELD, "" + id );
46 |     doc.addField( "name", "Test Doc " + id );
47 |     server.add(doc);
48 |     // Normally wouldn't do this, but OK for small test
49 |     server.commit();  
50 |     System.out.println( "Added doc " + id );
51 |   }
52 |   
53 |   static void testSearch( SolrServer server ) throws SolrServerException {
54 |     SolrQuery query = new SolrQuery( "*:*" );
55 |     query.addField( ID_FIELD );
56 |     QueryResponse res = server.query( query );
57 |     System.out.println( "Sample doc IDs:" );
58 |     // gets max of 10 docs by default
59 |     for ( SolrDocument doc : res.getResults() ) {
60 |       String id = (String) doc.get( ID_FIELD );
61 |       System.out.println( id );
62 |     }
63 |   }
64 | 
65 |   public static void main(String[] args) throws SolrServerException, IOException {
66 |     SolrServer server = openServer();
67 |     addDoc( server, 4 );
68 |     testSearch( server );
69 |   }
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/src/main/resources/DQ-Prototype-and-SolrJ.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucidworks/data-quality/6fd557d8757d5e956082a51f669f88bf7c226d80/src/main/resources/DQ-Prototype-and-SolrJ.key


--------------------------------------------------------------------------------
/src/main/resources/DQ-Prototype-and-SolrJ.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucidworks/data-quality/6fd557d8757d5e956082a51f669f88bf7c226d80/src/main/resources/DQ-Prototype-and-SolrJ.pdf


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/README.txt:
--------------------------------------------------------------------------------
1 | These aren't the exact output of current reports:
2 | * Comments added
3 | * Reports broken into separate parts in some cases
4 | * Slight wording/formatting differences from current java code
5 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/dates-curve-fitting.txt:
--------------------------------------------------------------------------------
  1 | Date Histogram and Exponential Growth Curve Fitting
  2 | 
  3 | Report:
  4 | * Guesses your date fields
  5 | * Generates a Histogram by year
  6 | * Also tries to fit an exponential curve to your Dates' growth
  7 |     (via least squares on natual log of counts)
  8 | 
  9 | In this example report, the fitted curves are
 10 | flatter than the data for 2 reasons:
 11 | 1: Has dates in distant past
 12 |     (late 1800's, etc, below display threshhold)
 13 | 2: Most recent year was when data was captured, midyear,
 14 |     so value shortfall
 15 | 
 16 | ------------
 17 | 
 18 | Solr = http://localhost:8983/solr/demo_shard1_replica1
 19 | 
 20 | Stats, Strs: Start/Stop: 1900-01-01T00:00:00Z / 2025-12-31T00:00:00Z
 21 |     Years with < .5 hash marks aren't displayed
 22 |     Not many dates in early 1900's nor in the future
 23 |     
 24 | Date Field: releaseDate
 25 |     2012-01-01: =====#=====================
 26 |     2011-01-01: =====#=====================================
 27 |     2010-01-01: ====#=================================
 28 |     2009-01-01: ===#======================================
 29 |     2008-01-01: ===#========================================================
 30 |     2007-01-01: ==#===================================================
 31 |     2006-01-01: ==#====================================================
 32 |     2005-01-01: ==#=====================================================
 33 |     2004-01-01: =#===================================================
 34 |     2003-01-01: =#============================================
 35 |     2002-01-01: =#=====================================
 36 |     2001-01-01: =#====================================
 37 |     2000-01-01: #===========================================
 38 |     1999-01-01: #===============================
 39 |     1998-01-01: #=============================
 40 |     1997-01-01: #=====================
 41 |     1996-01-01: #======================
 42 |     1995-01-01: #=========================
 43 |     1994-01-01: #====================
 44 |     1993-01-01: #============
 45 |     1992-01-01: ==========
 46 |     1991-01-01: =========
 47 |     1990-01-01: ===============
 48 |     1989-01-01: ==
 49 | 
 50 | Stats, Strs: Start/Stop: 1884-01-01T00:00:00Z / 2012-07-29T00:00:00Z
 51 | Date Field: startDate
 52 |     2012-01-01: ==========#===
 53 |     2011-01-01: =========#=============================================
 54 |     2010-01-01: ========#==========================================
 55 |     2009-01-01: =======#====================================
 56 |     2008-01-01: ======#===============================================
 57 |     2007-01-01: =====#======================================================
 58 |     2006-01-01: ====#================================================
 59 |     2005-01-01: ====#=================================================
 60 |     2004-01-01: ===#=====================================================
 61 |     2003-01-01: ===#===============================================
 62 |     2002-01-01: ==#======================================
 63 |     2001-01-01: ==#=================================
 64 |     2000-01-01: =#====================================
 65 |     1999-01-01: =#=====================================
 66 |     1998-01-01: =#===========================
 67 |     1997-01-01: =#=======================
 68 |     1996-01-01: #===================
 69 |     1995-01-01: #========================
 70 |     1994-01-01: #=====================
 71 |     1993-01-01: #================
 72 |     1992-01-01: #==========
 73 |     1991-01-01: #========
 74 |     1990-01-01: #==============
 75 |     1989-01-01: #==
 76 |     1988-01-01: #
 77 | 
 78 | Calculations output, debugging, etc.................
 79 | 
 80 | leastSquares_Line: x[55] = [-2.2089888E12, -1.230768E12, -1.1045376E12, -6.31152E11, -5.049216E11, -4.733856E11, -4.418496E11, -1.577664E11, -9.46944E10, -6.31584E10, -3.1536E10, 0.0, 6.3072E10, 1.262304E11, 1.577664E11, 1.893024E11, 2.209248E11, 2.524608E11, 2.839968E11, 3.155328E11, 3.471552E11, 3.786912E11, 4.102272E11, 4.417632E11, 4.733856E11, 5.049216E11, 5.364576E11, 5.679936E11, 5.99616E11, 6.31152E11, 6.62688E11, 6.94224E11, 7.258464E11, 7.573824E11, 7.889184E11, 8.204544E11, 8.520768E11, 8.836128E11, 9.151488E11, 9.466848E11, 9.783072E11, 1.0098432E12, 1.0413792E12, 1.0729152E12, 1.1045376E12, 1.1360736E12, 1.1676096E12, 1.1991456E12, 1.230768E12, 1.262304E12, 1.29384E12, 1.325376E12, 1.3569984E12, 1.5778368E12, 1.7356896E12]
 81 | leastSquares_Line: y[55] = [2.6390573296152584, 2.0794415416798357, 0.0, 1.0986122886681098, 0.0, 0.6931471805599453, 0.6931471805599453, 0.0, 0.0, 0.0, 0.6931471805599453, 1.3862943611198906, 0.0, 0.0, 1.0986122886681098, 0.0, 1.0986122886681098, 1.6094379124341003, 1.0986122886681098, 5.288267030694535, 1.0986122886681098, 2.0794415416798357, 2.4849066497880004, 3.6635616461296463, 3.6888794541139363, 4.574710978503383, 5.220355825078324, 6.1224928095143865, 8.119993827725105, 9.971333099431195, 9.512516890578416, 9.616405300156314, 9.850666776352545, 10.339805124127057, 10.555656476367515, 10.411358816475682, 10.388964598613677, 10.683041760836812, 10.735679026718607, 11.068215243411098, 10.914051563182127, 10.95166582665281, 11.113566675293825, 11.244038976969438, 11.3011050641372, 11.297750083305196, 11.263489271701095, 11.376509917165842, 11.00829726411219, 10.910258637538657, 11.04809244505471, 10.59383080576334, 3.258096538021482, 1.0986122886681098, 3.1780538303479458]
 82 | leastSquares_Line: sumX = 2.6097552E13
 83 | leastSquares_Line: sumY = 310.2204061940795
 84 | leastSquares_Line: sumXY = 2.6660475019541412E14
 85 | leastSquares_Line: sumSquaredX = 4.153851514838016E25
 86 | leastSquares_Line: meanX = 4.745009454545455E11
 87 | leastSquares_Line: meanY = 5.6403710217105365
 88 | leastSquares_Line: n = 55.0
 89 | leastSquares_Line: m_mumerator = 1.1940487415703028E14
 90 | leastSquares_Line: m_denominator = 2.9155202050330995E25
 91 | leastSquares_Line: returning [m, b] = [4.095491224890163E-12, 3.6970565633993595]
 92 | leastSquares_Exponential: returning [A, k] = [40.328425326295104, 4.095491224890163E-12]
 93 | leastSquares_Line: x[55] = [-2.7139104E12, -2.2405248E12, -1.230768E12, -1.1676096E12, -1.1045376E12, -6.31152E11, -5.364576E11, -5.049216E11, -4.733856E11, -1.577664E11, -1.262304E11, -9.46944E10, -6.31584E10, -3.1536E10, 0.0, 6.3072E10, 1.262304E11, 1.577664E11, 1.893024E11, 2.209248E11, 2.524608E11, 2.839968E11, 3.155328E11, 3.471552E11, 3.786912E11, 4.102272E11, 4.417632E11, 4.733856E11, 5.049216E11, 5.364576E11, 5.679936E11, 5.99616E11, 6.31152E11, 6.62688E11, 6.94224E11, 7.258464E11, 7.573824E11, 7.889184E11, 8.204544E11, 8.520768E11, 8.836128E11, 9.151488E11, 9.466848E11, 9.783072E11, 1.0098432E12, 1.0413792E12, 1.0729152E12, 1.1045376E12, 1.1360736E12, 1.1676096E12, 1.1991456E12, 1.230768E12, 1.262304E12, 1.29384E12, 1.325376E12]
 94 | leastSquares_Line: y[55] = [2.302585092994046, 2.6390573296152584, 2.0794415416798357, 1.3862943611198906, 0.0, 0.6931471805599453, 0.0, 0.6931471805599453, 1.9459101490553132, 0.0, 1.791759469228055, 0.0, 0.6931471805599453, 1.0986122886681098, 0.6931471805599453, 0.0, 1.0986122886681098, 0.6931471805599453, 0.0, 1.3862943611198906, 1.6094379124341003, 5.267858159063328, 1.791759469228055, 0.0, 2.3978952727983707, 3.4657359027997265, 3.7376696182833684, 4.07753744390572, 4.882801922586371, 5.420534999272286, 6.8966943316227125, 8.576781982827894, 10.054361440970256, 9.581145019820722, 9.7107519573933, 10.146002265529594, 10.431995911154427, 10.549202396661185, 10.33471786853032, 10.579234218415905, 10.713750879764753, 11.014785701011421, 10.989656070979896, 10.936316100658235, 11.045366844253579, 11.280917742717547, 11.376395294789138, 11.326547692846342, 11.319595537796637, 11.43589322129926, 11.336677426964654, 11.133391226885413, 11.264105066866035, 11.34294346177025, 9.984053200375696]
 95 | leastSquares_Line: sumX = 1.72931328E13
 96 | leastSquares_Line: sumY = 315.2068163472246
 97 | leastSquares_Line: sumXY = 2.494364214301782E14
 98 | leastSquares_Line: sumSquaredX = 4.317231463538688E25
 99 | leastSquares_Line: meanX = 3.1442059636363635E11
100 | leastSquares_Line: meanY = 5.731033024494993
101 | leastSquares_Line: n = 55.0
102 | leastSquares_Line: m_mumerator = 1.5032890625640062E14
103 | leastSquares_Line: m_denominator = 3.7734997507415324E25
104 | leastSquares_Line: returning [m, b] = [3.983805914571994E-12, 4.478442393038285]
105 | leastSquares_Exponential: returning [A, k] = [88.09734471448613, 3.983805914571994E-12]
106 | ----------- http://localhost:8983/solr/demo_shard1_replica1 -----------
107 | Total Active Docs: 1,275,077
108 | 
109 | All Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight]
110 | 
111 | 
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/llr-larger-sample.txt:
--------------------------------------------------------------------------------
 1 | Compare words between two sources.
 2 | * Larger absolute number means more important change
 3 | * sign (+/-) indicates direction of change
 4 | * Note: signs may be backwards, still confirming
 5 | 
 6 | Notice the terms with the highest absolute score are the ones
 7 | that were added, eg: "acme", "cardboard", "box", etc.
 8 | 
 9 | ----------- A -> B -----------
10 | 
11 | Corpus A unique / total words: 398 / 579.0
12 | Corpus B unique / total words: 385 / 593.0
13 | Combined unique / total words: 418 / 1172.0
14 | Number of p log(p) calculations: 0
15 | 
16 | Term Changes, first 5 entries:
17 |     acme: -4.09515240975383
18 |     any: -4.09515240975383
19 |     box: -4.09515240975383
20 |     cardboard: -4.09515240975383
21 |     fits: -4.09515240975383
22 | Term Changes, last 5 entries:
23 |     silentseek: 1.4112036109151607
24 |     sp2514n: 1.4112036109151607
25 |     spinpoint: 1.4112036109151607
26 |     ultra: 1.4112036109151607
27 |     cache: 2.824159489031562
28 |     hard: 2.824159489031562
29 | 
30 | 
31 | Data:
32 | 
33 | Corpus A is stock Solr with all exampledocs XML files submitted.
34 | 
35 | Corpus B is a slightly modified version with a few docs added.
36 | For example, has:
37 | 
38 | new.xml
39 | -------
40 | <add><doc>
41 |   <field name="id">NEW111</field>
42 |   <field name="name">New Sample Product</field>
43 |   <field name="manu">Acme, Inc.</field>
44 |   <!-- Join -->
45 |   <field name="manu_id_s">acme</field>
46 |   <field name="cat">electronics</field>
47 |   <field name="cat">gadget</field>
48 |   <field name="features">Rocket powered, sugar-free, fits in any tackle box!</field>
49 |   <field name="includes">cardboard box</field>
50 |   <field name="weight">10.5</field>
51 |   <field name="price">19.95</field>
52 |   <field name="popularity">101</field>
53 |   <field name="inStock">true</field>
54 |   <!-- Buffalo store -->
55 |   <field name="store">43.17614,-90.57341</field>
56 | </doc></add>
57 | 
58 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/llr-tiny-sample.txt:
--------------------------------------------------------------------------------
 1 | Compare words between two sources.
 2 | * Larger absolute number means more important change
 3 | * sign (+/-) indicates direction of change
 4 | 
 5 | 
 6 | ----------- A -> B -----------
 7 | 
 8 | Corpus A unique / total words: 4 / 195.0
 9 | Corpus B unique / total words: 4 / 195.0
10 | Combined unique / total words: 5 / 390.0
11 | 
12 | All Term Changes:
13 |     candy: -60.03320678316351
14 |     bananas: -0.46192170199964266
15 |     food: 0.0
16 |     apples: 0.6291706616789554
17 |     carrots: 60.03320678316349
18 | 
19 | 
20 | Inputs:
21 |   Map<String,Long> corpusA = new LinkedHashMap<String,Long>() {{
22 |     put( "apples",   25L );
23 |     put( "bananas",  30L );
24 |     put( "carrots",  40L );
25 |     put( "food",    100L );
26 |   }};
27 |   Map<String,Long> corpusB = new LinkedHashMap<String,Long>() {{
28 |     put( "apples",   20L ); // down by 5
29 |     put( "bananas",  35L ); // up by 5
30 |     put( "candy",    40L ); // carrots -> candy!
31 |     put( "food",    100L ); // unchanged, and total unchanged
32 |   }};
33 | 
34 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/populated-fields-diff.txt:
--------------------------------------------------------------------------------
 1 | Index A = http://localhost:8984/solr/collection1
 2 | Index B = http://localhost:8985/solr/collection1
 3 | 
 4 | A: Total Active Docs: 32
 5 | B: Total Active Docs: 33
 6 | 
 7 | 
 8 | All Fields:
 9 | 
10 |     In both = '[_root_, _version_, author, cat, category, comments, content, content_type, description, features, id, inStock, includes, keywords, last_modified, links, manu, manu_exact, name, payloads, popularity, price, resourcename, sku, store, subject, text, text_rev, title, url, weight, address_s, compName_s, incubationdate_dt, manu_id_s, manufacturedate_dt, price_c, price_c____amount_raw, price_c____currency, store_0_coordinate, store_1_coordinate]'
11 | 
12 |     B only = '[field_a_en, field_b_en]'
13 | 
14 | Populated at 100% in Both A and B: [_version_, id]
15 | 
16 | No Indexed Values / 0% in Both A and B: [_root_, author, category, comments, content, content_type, description, keywords, last_modified, links, resourcename, sku, store, subject, text_rev, title, url]
17 | 
18 | Partially Populated Fields and Percentages, A / B:
19 |     cat: 20 (62.5%) / 21 (63.64%)
20 |     features: 20 (62.5%) / 21 (63.64%)
21 |     inStock: 21 (65.62%) / 20 (60.61%)
22 |     includes: 3 (9.38%) / 6 (18.18%)
23 |     manu: 20 (62.5%) / 21 (63.64%)
24 |     manu_exact: 20 (62.5%) / 21 (63.64%)
25 |     name: 21 (65.62%) / 22 (66.67%)
26 |     payloads: 3 (9.38%) / 3 (9.09%)
27 |     popularity: 15 (46.88%) / 14 (42.42%)
28 |     price: 16 (50%) / 15 (45.45%)
29 |     text: 21 (65.62%) / 22 (66.67%)
30 |     weight: 9 (28.12%) / 10 (30.3%)
31 |     address_s: 11 (34.38%) / 11 (33.33%)
32 |     compName_s: 11 (34.38%) / 11 (33.33%)
33 |     incubationdate_dt: 1 (3.12%) / 1 (3.03%)
34 |     manu_id_s: 18 (56.25%) / 19 (57.58%)
35 |     manufacturedate_dt: 11 (34.38%) / 9 (27.27%)
36 |     price_c: 20 (62.5%) / 19 (57.58%)
37 |     price_c____amount_raw: 20 (62.5%) / 19 (57.58%)
38 |     price_c____currency: 20 (62.5%) / 19 (57.58%)
39 |     store_0_coordinate: 14 (43.75%) / 13 (39.39%)
40 |     store_1_coordinate: 14 (43.75%) / 13 (39.39%)
41 |     field_a_en: (not in A) / 1 (3.03%)
42 |     field_b_en: (not in A) / 1 (3.03%)
43 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/populated-fields-single.txt:
--------------------------------------------------------------------------------
 1 | Notes:
 2 | * Fields include both declared and dynamic fields.
 3 | * This is based on fields with indexed values (vs. stored values)
 4 | * Stats do NOT include deleted docs.
 5 | 
 6 | - - - actual report - - -
 7 | 
 8 | Total Active Docs: 1,275,077
 9 | 
10 | All Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight]
11 | 
12 | Populated at 100%: [_version_, id, regularPrice, salePrice, store_id, text, type]
13 | 
14 | No Indexed Values / 0%: [_root_, author, cat, category, categoryPath, comments, content, content_type, inStock, includes, keywords, last_modified, links, manu, manu_exact, payloads, popularity, price, resourcename, shippingWeight, sku, store, subject, text_rev, title, url]
15 | 
16 | Partially Populated Fields / Percentages:
17 |     accessories: 11,460 (0.9%)
18 |     albumLabel: 876,821 (68.77%)
19 |     albumTitle: 876,845 (68.77%)
20 |     artistName: 871,477 (68.35%)
21 |     bundledIn: 7,148 (0.56%)
22 |     cast: 132,231 (10.37%)
23 |     categoryIds: 1,262,671 (99.03%)
24 |     categoryNames: 1,262,671 (99.03%)
25 |     class: 1,258,757 (98.72%)
26 |     color: 47,567 (3.73%)
27 |     condition: 103,036 (8.08%)
28 |     crew: 118,603 (9.3%)
29 |     customerReviewAverage: 67,489 (5.29%)
30 |     customerReviewCount: 67,489 (5.29%)
31 |     department: 1,258,757 (98.72%)
32 |     depthCategoryIds: 1,262,671 (99.03%)
33 |     depthCategoryNames: 1,262,671 (99.03%)
34 |     description: 7,499 (0.59%)
35 |     details: 101,235 (7.94%)
36 |     features: 116,881 (9.17%)
37 |     format: 1,147,134 (89.97%)
38 |     frequentlyPurchasedWith: 23,950 (1.88%)
39 |     genre: 1,133,752 (88.92%)
40 |     hardGoodType: 103,036 (8.08%)
41 |     image: 1,273,774 (99.9%)
42 |     lengthInMinutes: 197,204 (15.47%)
43 |     longDescription: 136,234 (10.68%)
44 |     manufacturer: 997,494 (78.23%)
45 |     mpaaRating: 123,899 (9.72%)
46 |     name: 1,274,453 (99.95%)
47 |     plot: 204,358 (16.03%)
48 |     product_id: 54,363 (4.26%)
49 |     relatedProducts: 36,994 (2.9%)
50 |     releaseDate: 1,162,200 (91.15%)
51 |     salesRankLongTerm: 281,712 (22.09%)
52 |     salesRankMediumTerm: 131,228 (10.29%)
53 |     salesRankShortTerm: 112,483 (8.82%)
54 |     shortDescription: 120,163 (9.42%)
55 |     softwareGrade: 417 (0.03%)
56 |     startDate: 1,273,615 (99.89%)
57 |     studio: 256,401 (20.11%)
58 |     subclass: 1,258,757 (98.72%)
59 |     weight: 67,072 (5.26%)
60 | 
61 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/schema-info-diff.txt:
--------------------------------------------------------------------------------
 1 | Notes:
 2 | 
 3 | Schema diffs:
 4 | * It can even compare schemas between running systems (via REST) and local XML files.
 5 |   (though some fields blank depending on source)
 6 | * The sample below is the default 4.6.1 schema vs. Apollo demo (plus my local changes)
 7 | * A few lists are order-dependant
 8 |   TODO: turns out this may not be needed, to verify.
 9 | 
10 | (shows individual reports, then the diff report)
11 | 
12 | ========== Differences Report ==========
13 | Schema A = Default Solr 4.6.1 Schema
14 | Schema B = Apollo demo plus local changes
15 | Schema Name: Both = 'example'
16 | Schema Version: Both = '1.5'
17 | Key Field: Both = 'id'
18 | Default Operator:
19 |     A = 'null'
20 |     B = '(not-available)'
21 | Similarity Class Name:
22 |     A = 'org.apache.solr.search.similarities.DefaultSimilarityFactory'
23 |     B = '(not-available)'
24 | 
25 | Fields:
26 | 
27 |     In both = '[_version_, _root_, id, sku, name, manu, cat, features, includes, weight, price, popularity, inStock, store, title, subject, description, comments, author, keywords, category, resourcename, url, content_type, last_modified, links, content, text, text_rev, manu_exact, payloads]'
28 | 
29 |     B only = '[accessories, albumLabel, albumTitle, artistName, bundledIn, cast, categoryIds, categoryNames, categoryPath, class, color, condition, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, details, format, frequentlyPurchasedWith, genre, hardGoodType, image, lengthInMinutes, longDescription, manufacturer, mpaaRating, plot, product_id, regularPrice, relatedProducts, releaseDate, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, softwareGrade, startDate, store_id, studio, subclass, type]'
30 | 
31 | Dynamic Field Patterns:
32 | 
33 |     In both but DIFFERENT relative order:
34 |         Common, order in A = '[*_i, *_is, *_s, *_ss, *_l, *_ls, *_t, *_txt, *_en, *_b, *_bs, *_f, *_fs, *_d, *_ds, *_coordinate, *_dt, *_dts, *_p, *_ti, *_tl, *_tf, *_td, *_tdt, *_pi, *_c, ignored_*, attr_*, random_*]'
35 |         Common, order in B = '[*_coordinate, ignored_*, random_*, attr_*, *_txt, *_dts, *_tdt, *_is, *_ss, *_ls, *_en, *_bs, *_fs, *_ds, *_dt, *_ti, *_tl, *_tf, *_td, *_pi, *_i, *_s, *_l, *_t, *_b, *_f, *_d, *_p, *_c]'
36 | 
37 | Types:
38 |     In both = '[string, boolean, int, float, long, double, tint, tfloat, tlong, tdouble, date, tdate, binary, pint, plong, pfloat, pdouble, pdate, random, text_ws, text_general, text_en, text_en_splitting, text_en_splitting_tight, text_general_rev, alphaOnlySort, phonetic, payloads, lowercase, descendent_path, ancestor_path, ignored, point, location, location_rpt, currency, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr]'
39 | 
40 | Copy Field Sources:
41 |     In both = '[cat, name, manu, features, includes, price, title, author, description, keywords, content, content_type, resourcename, url]'
42 |     B only = '[id]'
43 | 
44 | Copy Field Destinations:
45 |     In both = '[text, manu_exact, price_c, author_s]'
46 | 
47 | 


--------------------------------------------------------------------------------
/src/main/resources/sample-reports/schema-info-single.txt:
--------------------------------------------------------------------------------
 1 | Notes:
 2 | * It can view schemas from running systems (via REST) or local XML files.
 3 |   (though some fields blank depending on source)
 4 | 
 5 | ========== Individual Reports ==========
 6 | 
 7 | ---------- Schama A: Default Solr 4.6.1 Schema ----------
 8 | Schema Name: example
 9 | Schema Version: 1.5
10 | Key Field: id
11 | Default Operator: null
12 | Similarity Class Name: org.apache.solr.search.similarities.DefaultSimilarityFactory
13 | Default Search Field: null
14 | 
15 | Fields: [_version_, _root_, id, sku, name, manu, cat, features, includes, weight, price, popularity, inStock, store, title, subject, description, comments, author, keywords, category, resourcename, url, content_type, last_modified, links, content, text, text_rev, manu_exact, payloads]
16 | 
17 | Dynamic field Patterns: [*_i, *_is, *_s, *_ss, *_l, *_ls, *_t, *_txt, *_en, *_b, *_bs, *_f, *_fs, *_d, *_ds, *_coordinate, *_dt, *_dts, *_p, *_ti, *_tl, *_tf, *_td, *_tdt, *_pi, *_c, ignored_*, attr_*, random_*]
18 | 
19 | Types: [string, boolean, int, float, long, double, tint, tfloat, tlong, tdouble, date, tdate, binary, pint, plong, pfloat, pdouble, pdate, random, text_ws, text_general, text_en, text_en_splitting, text_en_splitting_tight, text_general_rev, alphaOnlySort, phonetic, payloads, lowercase, descendent_path, ancestor_path, ignored, point, location, location_rpt, currency, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr]
20 | 
21 | Copy Sources: [cat, name, manu, features, includes, price, title, author, description, keywords, content, content_type, resourcename, url]
22 |     From: 'cat' To [text]
23 |     From: 'name' To [text]
24 |     From: 'manu' To [text, manu_exact]
25 |     From: 'features' To [text]
26 |     From: 'includes' To [text]
27 |     From: 'price' To [price_c]
28 |     From: 'title' To [text]
29 |     From: 'author' To [text, author_s]
30 |     From: 'description' To [text]
31 |     From: 'keywords' To [text]
32 |     From: 'content' To [text]
33 |     From: 'content_type' To [text]
34 |     From: 'resourcename' To [text]
35 |     From: 'url' To [text]
36 | Copy Destinations: [text, manu_exact, price_c, author_s]
37 |     Dest: 'text' From [cat, name, manu, features, includes, title, author, description, keywords, content, content_type, resourcename, url]
38 |     Dest: 'manu_exact' From [manu]
39 |     Dest: 'price_c' From [price]
40 |     Dest: 'author_s' From [author]
41 | 
42 | ---------- Schema B: Apollo demo plus local changes ----------
43 | Schema Name: example
44 | Schema Version: 1.5
45 | Key Field: id
46 | Default Operator: (not-available)
47 | Similarity Class Name: (not-available)
48 | Default Search Field: (not-available)
49 | 
50 | Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight]
51 | 
52 | Dynamic field Patterns: [*_coordinate, ignored_*, random_*, attr_*, *_txt, *_dts, *_tdt, *_is, *_ss, *_ls, *_en, *_bs, *_fs, *_ds, *_dt, *_ti, *_tl, *_tf, *_td, *_pi, *_i, *_s, *_l, *_t, *_b, *_f, *_d, *_p, *_c]
53 | 
54 | Types: [alphaOnlySort, ancestor_path, binary, boolean, currency, date, descendent_path, double, float, ignored, int, location, location_rpt, long, lowercase, payloads, pdate, pdouble, pfloat, phonetic, pint, plong, point, random, string, tdate, tdouble, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_en, text_en_splitting, text_en_splitting_tight, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_general, text_general_rev, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr, text_ws, tfloat, tint, tlong]
55 | 
56 | Copy Sources: [author, cat, content, content_type, description, features, id, includes, keywords, manu, name, resourcename, title, url, price]
57 |     From: 'author' To [text, author_s]
58 |     From: 'cat' To [text]
59 |     From: 'content' To [text]
60 |     From: 'content_type' To [text]
61 |     From: 'description' To [text]
62 |     From: 'features' To [text]
63 |     From: 'id' To [text]
64 |     From: 'includes' To [text]
65 |     From: 'keywords' To [text]
66 |     From: 'manu' To [manu_exact, text]
67 |     From: 'name' To [text]
68 |     From: 'resourcename' To [text]
69 |     From: 'title' To [text]
70 |     From: 'url' To [text]
71 |     From: 'price' To [price_c]
72 | Copy Destinations: [text, manu_exact, price_c, author_s]
73 |     Dest: 'text' From [author, cat, content, content_type, description, features, id, includes, keywords, manu, name, resourcename, title, url]
74 |     Dest: 'manu_exact' From [manu]
75 |     Dest: 'price_c' From [price]
76 |     Dest: 'author_s' From [author]
77 | 


--------------------------------------------------------------------------------