├── .gitignore ├── LICENSE.txt ├── README.md ├── pom.xml └── src └── main ├── java └── com │ └── lucidworks │ └── dq │ ├── data │ ├── DateChecker.java │ ├── DeleteByIds.java │ ├── DocCount.java │ ├── DumpIds.java │ ├── EmptyFieldStats.java │ ├── SolrToCsv.java │ ├── SolrToSolr.java │ ├── TermCodepointStats.java │ ├── TermStats.java │ └── TestArgs.java │ ├── diff │ ├── DiffEmptyFieldStats.java │ ├── DiffIds.java │ ├── DiffSchema.java │ └── DiffSolrConfig.java │ ├── logs │ ├── LogEntry.java │ ├── LogEntryBase.java │ ├── LogEntryFromSolr.java │ ├── LogEntryGroup.java │ ├── LogEntryGroupFromSolr.java │ ├── LogEntryReference.java │ ├── LogEntryReferenceBase.java │ ├── LogFile.java │ ├── LogFileBase.java │ ├── LogFileFromSolr.java │ ├── LogFileRepo.java │ └── LogFileRepoBase.java │ ├── schema │ ├── Schema.java │ ├── SchemaBase.java │ ├── SchemaFromLocalCore_broken.java │ ├── SchemaFromRest.java │ ├── SchemaFromRestAdHock.java │ ├── SchemaFromXml.java │ ├── SchemalessPlus.java │ ├── SolrConfig.java │ ├── SolrConfigBase.java │ └── SolrConfigFromXml.java │ ├── util │ ├── CharUtils.java │ ├── CmdLineLauncher.java │ ├── DateUtils.java │ ├── HasDescription.java │ ├── HashAndShard.java │ ├── IO_Utils.java │ ├── LLR.java │ ├── LLR.java-new │ ├── SetUtils.java │ ├── SolrUtils.java │ ├── StatsUtils.java │ ├── StringUtils.java │ └── TupleEntropy.java │ └── zk_experiment │ └── ZkSmartClient.java └── resources ├── DQ-Prototype-and-SolrJ.key ├── DQ-Prototype-and-SolrJ.pdf ├── sample-reports ├── README.txt ├── dates-curve-fitting.txt ├── llr-larger-sample.txt ├── llr-tiny-sample.txt ├── populated-fields-diff.txt ├── populated-fields-single-extended-options.txt ├── populated-fields-single.txt ├── report-terms-via-termsReqHandler.txt ├── schema-info-diff.txt ├── schema-info-single.txt ├── term-counts.txt ├── term-lengths.txt ├── unicode-format1.txt └── unicode-format2.txt ├── schema-461.xml └── schema-481.xml /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Eclipse 4 | .classpath 5 | .project 6 | .settings 7 | 8 | # Package Files # 9 | *.jar 10 | *.war 11 | *.ear 12 | /target 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 2 | 3 | You may obtain a copy of the License at: 4 | http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 7 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.lucidworks 5 | data-quality-java 6 | jar 7 | 1.0-SNAPSHOT 8 | data-quality-java 9 | http://maven.apache.org 10 | 11 | 12 | Data-Quality Checks 13 | 14 | 15 | 4.10.3 16 | 1.6.4 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | org.apache.solr 25 | solr-solrj 26 | ${solr.version} 27 | 28 | 29 | org.apache.solr 30 | solr-core 31 | ${solr.version} 32 | 33 | 34 | 35 | 36 | commons-cli 37 | commons-cli 38 | 1.2 39 | 40 | 41 | 42 | 43 | com.google.code.gson 44 | gson 45 | 2.2.4 46 | 47 | 48 | 49 | 50 | org.codehaus.jackson 51 | jackson-mapper-asl 52 | 1.6.4 53 | 54 | 55 | 56 | 59 | 60 | 72 | 73 | 74 | junit 75 | junit 76 | 3.8.1 77 | test 78 | 79 | 80 | 81 | 82 | 83 | commons-logging 84 | commons-logging 85 | 1.1.1 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | maven-compiler-plugin 97 | 2.3.2 98 | 99 | 1.7 100 | 1.7 101 | 102 | 103 | 104 | 105 | 106 
| org.apache.maven.plugins 107 | maven-shade-plugin 108 | 109 | 2.2 110 | 111 | 112 | package 113 | 114 | shade 115 | 116 | 117 | 118 | false 119 | 120 | 121 | com.lucidworks.dq.util.CmdLineLauncher 122 | 123 | 124 | 125 | 126 | 127 | 128 | *:* 129 | 130 | META-INF/*.SF 131 | META-INF/*.DSA 132 | META-INF/*.RSA 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/data/DeleteByIds.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.data; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.io.PrintWriter; 8 | import java.io.StringWriter; 9 | import java.nio.charset.Charset; 10 | import java.nio.charset.CharsetDecoder; 11 | import java.nio.charset.CodingErrorAction; 12 | import java.text.MessageFormat; 13 | import java.text.NumberFormat; 14 | import java.util.Arrays; 15 | import java.util.Collection; 16 | import java.util.LinkedHashMap; 17 | import java.util.LinkedHashSet; 18 | import java.util.LinkedList; 19 | import java.util.List; 20 | import java.util.Map; 21 | import java.util.Map.Entry; 22 | import java.util.Set; 23 | 24 | import org.apache.commons.cli.CommandLine; 25 | import org.apache.commons.cli.CommandLineParser; 26 | import org.apache.commons.cli.HelpFormatter; 27 | import org.apache.commons.cli.OptionBuilder; 28 | import org.apache.commons.cli.Options; 29 | import org.apache.commons.cli.ParseException; 30 | import org.apache.commons.cli.PosixParser; 31 | import org.apache.solr.client.solrj.SolrServerException; 32 | import org.apache.solr.client.solrj.impl.HttpSolrServer; 33 | 34 | import com.lucidworks.dq.util.HasDescription; 35 | import com.lucidworks.dq.util.SetUtils; 36 | import com.lucidworks.dq.util.SolrUtils; 37 | 38 | public class 
DeleteByIds /*implements HasDescription*/ { 39 | 40 | static String HELP_WHAT_IS_IT = "Delete documents by their ID, either passed on the command line, or from a file, or from standard in / stdin."; 41 | static String HELP_USAGE = "DeleteByIds -u http://localhost:8983/collection1 --ids 1234 5678 ... or --input_file ids_to_delete.txt"; 42 | 43 | public static String getShortDescription() { 44 | return HELP_WHAT_IS_IT; 45 | } 46 | 47 | static int DEFAULT_BATCH_SIZE = 1000; 48 | 49 | static Options options; 50 | 51 | // We use List instead of Set because that's what SolrJ expects in deleteById 52 | static List readIdsFromFile( String targetFile, CharsetDecoder deccoder ) throws IOException { 53 | List ids = new LinkedList(); 54 | BufferedReader in = null; 55 | if( null!=targetFile && ! targetFile.equals("-") ) { 56 | in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder)); 57 | } else { 58 | in = new BufferedReader(new InputStreamReader(System.in, deccoder)); 59 | } 60 | String line; 61 | while ((line = in.readLine()) != null) { 62 | // skip completely blank lines, but doesn't do any trimming 63 | if ( line.length()<1 ) { 64 | continue; 65 | } 66 | ids.add( line ); 67 | } 68 | in.close(); 69 | return ids; 70 | } 71 | 72 | static void helpAndExit() { 73 | helpAndExit( null, 1 ); 74 | } 75 | static void helpAndExit( String optionalError, int errorCode ) { 76 | HelpFormatter formatter = new HelpFormatter(); 77 | if ( null==optionalError ) { 78 | System.err.println( HELP_WHAT_IS_IT ); 79 | } 80 | else { 81 | // log.error( optionalError ); 82 | System.err.println( optionalError ); 83 | } 84 | // stdout 85 | //formatter.printHelp( HELP_USAGE, options, true ); 86 | // stderr 87 | PrintWriter pw = new PrintWriter(System.err); 88 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true ); 89 | pw.flush(); 90 | System.exit( errorCode ); 91 | } 92 | 93 | public static void main( String [] argv ) throws Exception { 94 | 95 | 
options = new Options(); 96 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" ); 97 | options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" ); 98 | options.addOption( "p", "port", true, "Port for Solr, default=8983" ); 99 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" ); 100 | options.addOption( "f", "input_file", true, "File to read IDs from, one ID per line (skips 0 length lines, not counting newlines) (Use \"-\" for stdout / standard out)" ); 101 | options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" ); 102 | options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" ); 103 | 104 | options.addOption( OptionBuilder.withLongOpt( "batch_size" ) 105 | .withDescription( "Batch size, 1=doc-by-doc, 0=all-at-once (be careful memory-wise), default="+DEFAULT_BATCH_SIZE ) 106 | .hasArg() 107 | .withType( Number.class ) // NOT Long.class 108 | .create( "b" ) 109 | ); 110 | 111 | options.addOption( OptionBuilder.withLongOpt( "ids" ) 112 | .withDescription( "Pass one or more IDs on the command line" ) 113 | .hasArgs() // PLURAL! 114 | .create( "i" ) 115 | ); 116 | 117 | if ( argv.length < 1 ) { 118 | helpAndExit( "Must specifify at least url or host", 1 ); 119 | } 120 | CommandLine cmd = null; 121 | try { 122 | CommandLineParser parser = new PosixParser(); 123 | cmd = parser.parse( options, argv ); 124 | } 125 | catch( ParseException exp ) { 126 | helpAndExit( "Parsing command line failed. 
Reason: " + exp.getMessage(), 2 ); 127 | } 128 | String fullUrl = cmd.getOptionValue( "url" ); 129 | String host = cmd.getOptionValue( "host" ); 130 | String port = cmd.getOptionValue( "port" ); 131 | String coll = cmd.getOptionValue( "collection" ); 132 | if ( null==fullUrl && null==host ) { 133 | helpAndExit( "Must specifify at least url or host (b)", 3 ); 134 | } 135 | if ( null!=fullUrl && null!=host ) { 136 | helpAndExit( "Must not specifify both url and host", 4 ); 137 | } 138 | // Init 139 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL ); 140 | HttpSolrServer solr; 141 | if ( null!=fullUrl ) { 142 | solr = SolrUtils.getServer( fullUrl ); 143 | } 144 | else { 145 | // Utils handle null values 146 | solr = SolrUtils.getServer( host, port, coll ); 147 | } 148 | 149 | int batchSize = DEFAULT_BATCH_SIZE; 150 | Long batchObj = (Long) cmd.getParsedOptionValue( "batch_size" ); 151 | if ( null!=batchObj ) { 152 | if ( batchObj.longValue() < 0L ) { 153 | helpAndExit( "batch_size must be >= 0", 5 ); 154 | } 155 | batchSize = batchObj.intValue(); 156 | } 157 | 158 | String encodingStr = cmd.getOptionValue( "encoding" ); 159 | // Didn't set encoding 160 | if ( null==encodingStr || encodingStr.trim().length()<1 ) { 161 | encodingStr = "UTF-8"; 162 | } 163 | // Did set encoding 164 | else { 165 | // But didn't set input file 166 | if ( null == cmd.getOptionValue( "input_file" ) ) { 167 | helpAndExit( "Encoding only applicable when reading from input file or standard in / stdiin; operating system handles command line argument encoding", 6 ); 168 | } 169 | } 170 | boolean strictEncoding = true; 171 | if(cmd.hasOption("loose_encoding")) { 172 | strictEncoding = false; 173 | if ( null == cmd.getOptionValue( "input_file" ) ) { 174 | helpAndExit( "loose_encoding only applicable when reading from input file or standard in / stdiin; operating system handles command line argument encoding", 7 ); 175 | } 176 | } 177 | // Setup IO encoding 178 | Charset charset = 
Charset.forName( encodingStr ); 179 | // Input uses Decoder 180 | CharsetDecoder decoder = charset.newDecoder(); 181 | if ( strictEncoding ) { 182 | decoder.onMalformedInput( CodingErrorAction.REPORT ); 183 | } 184 | 185 | String inputFile = cmd.getOptionValue( "input_file" ); 186 | 187 | String [] cmdLineIds = cmd.getOptionValues( "ids" ); 188 | 189 | if ( null==inputFile && null==cmdLineIds ) { 190 | helpAndExit( "Must use at least one of --input_file or --ids ..., OK to use both. For standard in / stdin use --input_file -", 8 ); 191 | } 192 | 193 | // We use List instead of Set because that's what SolrJ expects in deleteById 194 | List ids = new LinkedList(); 195 | if ( null!=inputFile ) { 196 | ids = readIdsFromFile( inputFile, decoder ); 197 | } 198 | if ( null!=cmdLineIds ) { 199 | ids.addAll( Arrays.asList( cmdLineIds ) ); 200 | } 201 | 202 | if ( batchSize < 1 ) { 203 | solr.deleteById(ids); 204 | } 205 | else if ( batchSize == 1 ) { 206 | for ( String id : ids ) { 207 | solr.deleteById( id ); 208 | } 209 | } 210 | else { 211 | for ( int start = 0; start < ids.size(); start += batchSize ) { 212 | int end = start + batchSize; 213 | if ( end > ids.size() ) { 214 | end = ids.size(); 215 | } 216 | List sublist = ids.subList( start, end ); 217 | solr.deleteById( sublist ); 218 | } 219 | } 220 | // Wait for disk commit and new searcher to fire up 221 | // TODO: maybe have other commit options, although this is probably the safest 222 | solr.commit( true, true ); 223 | 224 | } 225 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/data/DocCount.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.data; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.StringWriter; 5 | import java.text.MessageFormat; 6 | import java.text.NumberFormat; 7 | import java.util.Collection; 8 | import java.util.LinkedHashMap; 9 | import 
java.util.LinkedHashSet; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.Set; 13 | 14 | import org.apache.commons.cli.CommandLine; 15 | import org.apache.commons.cli.CommandLineParser; 16 | import org.apache.commons.cli.HelpFormatter; 17 | import org.apache.commons.cli.Options; 18 | import org.apache.commons.cli.ParseException; 19 | import org.apache.commons.cli.PosixParser; 20 | import org.apache.solr.client.solrj.SolrServerException; 21 | import org.apache.solr.client.solrj.impl.HttpSolrServer; 22 | 23 | import com.lucidworks.dq.util.HasDescription; 24 | import com.lucidworks.dq.util.SetUtils; 25 | import com.lucidworks.dq.util.SolrUtils; 26 | 27 | public class DocCount /*implements HasDescription*/ { 28 | 29 | static String HELP_WHAT_IS_IT = "Count of active documents in a collection to standard out / stdout."; 30 | static String HELP_USAGE = "DocCount -u http://localhost:8983 (output sent to stdout)"; 31 | 32 | public static String getShortDescription() { 33 | return HELP_WHAT_IS_IT; 34 | } 35 | 36 | static Options options; 37 | 38 | HttpSolrServer solrServer; 39 | 40 | static void helpAndExit() { 41 | helpAndExit( null, 1 ); 42 | } 43 | static void helpAndExit( String optionalError, int errorCode ) { 44 | HelpFormatter formatter = new HelpFormatter(); 45 | if ( null==optionalError ) { 46 | System.err.println( HELP_WHAT_IS_IT ); 47 | } 48 | else { 49 | // log.error( optionalError ); 50 | System.err.println( optionalError ); 51 | } 52 | // stdout 53 | //formatter.printHelp( HELP_USAGE, options, true ); 54 | // stderr 55 | PrintWriter pw = new PrintWriter(System.err); 56 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true ); 57 | pw.flush(); 58 | System.exit( errorCode ); 59 | } 60 | 61 | public static void main( String [] argv ) throws Exception { 62 | 63 | options = new Options(); 64 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" ); 65 | options.addOption( 
"h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" ); 66 | options.addOption( "p", "port", true, "Port for Solr, default=8983" ); 67 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" ); 68 | if ( argv.length < 1 ) { 69 | helpAndExit( "Must specifify at least url or host", 1 ); 70 | } 71 | CommandLine cmd = null; 72 | try { 73 | CommandLineParser parser = new PosixParser(); 74 | cmd = parser.parse( options, argv ); 75 | } 76 | catch( ParseException exp ) { 77 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 ); 78 | } 79 | String fullUrl = cmd.getOptionValue( "url" ); 80 | String host = cmd.getOptionValue( "host" ); 81 | String port = cmd.getOptionValue( "port" ); 82 | String coll = cmd.getOptionValue( "collection" ); 83 | if ( null==fullUrl && null==host ) { 84 | helpAndExit( "Must specifify at least url or host (b)", 3 ); 85 | } 86 | if ( null!=fullUrl && null!=host ) { 87 | helpAndExit( "Must not specifify both url and host", 4 ); 88 | } 89 | // Init 90 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL ); 91 | HttpSolrServer solr; 92 | if ( null!=fullUrl ) { 93 | solr = SolrUtils.getServer( fullUrl ); 94 | } 95 | else { 96 | // Utils handle null values 97 | solr = SolrUtils.getServer( host, port, coll ); 98 | } 99 | 100 | long count = SolrUtils.getTotalDocCount( solr ); 101 | System.out.println( count ); 102 | 103 | } 104 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/data/DumpIds.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.data; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.StringWriter; 5 | import java.text.MessageFormat; 6 | import java.text.NumberFormat; 7 | import java.util.Collection; 8 | import java.util.LinkedHashMap; 9 | import java.util.LinkedHashSet; 10 | import 
java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.Set; 13 | 14 | import org.apache.commons.cli.CommandLine; 15 | import org.apache.commons.cli.CommandLineParser; 16 | import org.apache.commons.cli.HelpFormatter; 17 | import org.apache.commons.cli.Options; 18 | import org.apache.commons.cli.ParseException; 19 | import org.apache.commons.cli.PosixParser; 20 | import org.apache.solr.client.solrj.SolrServerException; 21 | import org.apache.solr.client.solrj.impl.HttpSolrServer; 22 | 23 | import com.lucidworks.dq.util.HasDescription; 24 | import com.lucidworks.dq.util.SetUtils; 25 | import com.lucidworks.dq.util.SolrUtils; 26 | 27 | public class DumpIds /*implements HasDescription*/ { 28 | 29 | static String HELP_WHAT_IS_IT = "Dump all the IDs from a collection to standard out / stdout."; 30 | static String HELP_USAGE = "DumpIds -u http://localhost:8983 (output sent to stdout)"; 31 | // final static Logger log = LoggerFactory.getLogger( FieldStats.class ); 32 | 33 | public static String getShortDescription() { 34 | return HELP_WHAT_IS_IT; 35 | } 36 | 37 | static Options options; 38 | 39 | HttpSolrServer solrServer; 40 | 41 | // TODO: refactor to allow options to be settable after constructor is run 42 | public DumpIds( HttpSolrServer server ) throws SolrServerException { 43 | this.solrServer = server; 44 | } 45 | public HttpSolrServer getSolrServer() { 46 | return this.solrServer; 47 | } 48 | 49 | void dumpIds() throws SolrServerException { 50 | Set ids = SolrUtils.getAllIds( getSolrServer() ); 51 | for ( String id : ids ) { 52 | System.out.println( id ); 53 | } 54 | } 55 | 56 | static void helpAndExit() { 57 | helpAndExit( null, 1 ); 58 | } 59 | static void helpAndExit( String optionalError, int errorCode ) { 60 | HelpFormatter formatter = new HelpFormatter(); 61 | if ( null==optionalError ) { 62 | // log.info( HELP_WHAT_IS_IT ); 63 | System.out.println( HELP_WHAT_IS_IT ); 64 | } 65 | else { 66 | // log.error( optionalError ); 67 | 
System.err.println( optionalError ); 68 | } 69 | formatter.printHelp( HELP_USAGE, options, true ); 70 | System.exit( errorCode ); 71 | } 72 | 73 | public static void main( String [] argv ) throws Exception { 74 | 75 | options = new Options(); 76 | options.addOption( "u", "url", true, "URL for Solr, OR set host, port and possibly collection" ); 77 | options.addOption( "h", "host", true, "IP address for Solr, default=localhost but still required of no other args passed" ); 78 | options.addOption( "p", "port", true, "Port for Solr, default=8983" ); 79 | options.addOption( "c", "collection", true, "Collection/Core for Solr, Eg: collection1" ); 80 | if ( argv.length < 1 ) { 81 | helpAndExit( "Must specifify at least url or host", 1 ); 82 | } 83 | CommandLine cmd = null; 84 | try { 85 | CommandLineParser parser = new PosixParser(); 86 | cmd = parser.parse( options, argv ); 87 | } 88 | catch( ParseException exp ) { 89 | helpAndExit( "Parsing command line failed. Reason: " + exp.getMessage(), 2 ); 90 | } 91 | String fullUrl = cmd.getOptionValue( "url" ); 92 | String host = cmd.getOptionValue( "host" ); 93 | String port = cmd.getOptionValue( "port" ); 94 | String coll = cmd.getOptionValue( "collection" ); 95 | if ( null==fullUrl && null==host ) { 96 | helpAndExit( "Must specifify at least url or host (b)", 3 ); 97 | } 98 | if ( null!=fullUrl && null!=host ) { 99 | helpAndExit( "Must not specifify both url and host", 4 ); 100 | } 101 | // Init 102 | // HttpSolrServer solr = SolrUtils.getServer( HOST, PORT, COLL ); 103 | HttpSolrServer solr; 104 | if ( null!=fullUrl ) { 105 | solr = SolrUtils.getServer( fullUrl ); 106 | } 107 | else { 108 | // Utils handle null values 109 | solr = SolrUtils.getServer( host, port, coll ); 110 | } 111 | 112 | // System.out.println( "Solr = " + solr.getBaseURL() ); 113 | // EmptyFieldStats fs = new EmptyFieldStats( solr ); 114 | DumpIds di = new DumpIds( solr ); 115 | di.dumpIds(); 116 | 117 | } 118 | } 
-------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/diff/DiffEmptyFieldStats.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.diff; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.StringWriter; 5 | import java.text.MessageFormat; 6 | import java.text.NumberFormat; 7 | import java.util.LinkedHashSet; 8 | import java.util.Set; 9 | 10 | import org.apache.solr.client.solrj.impl.HttpSolrServer; 11 | 12 | import com.lucidworks.dq.data.EmptyFieldStats; 13 | import com.lucidworks.dq.schema.Schema; 14 | import com.lucidworks.dq.schema.SchemaFromRest; 15 | import com.lucidworks.dq.schema.SchemaFromXml; 16 | import com.lucidworks.dq.util.HasDescription; 17 | import com.lucidworks.dq.util.SetUtils; 18 | import com.lucidworks.dq.util.SolrUtils; 19 | 20 | import org.apache.commons.cli.CommandLine; 21 | import org.apache.commons.cli.CommandLineParser; 22 | import org.apache.commons.cli.HelpFormatter; 23 | import org.apache.commons.cli.Options; 24 | import org.apache.commons.cli.ParseException; 25 | import org.apache.commons.cli.PosixParser; 26 | 27 | public class DiffEmptyFieldStats /*implements HasDescription*/ { 28 | static String HELP_WHAT_IS_IT = "Compare fields that aren't fully populated between two cores/collections."; 29 | static String HELP_USAGE = "DiffEmptyFieldStats"; 30 | // final static Logger log = LoggerFactory.getLogger( TermStats.class ); 31 | 32 | public static String getShortDescription() { 33 | return HELP_WHAT_IS_IT; 34 | } 35 | 36 | static Options options; 37 | 38 | public static String generateReport( EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB, String labelA, String labelB ) throws Exception { 39 | StringWriter sw = new StringWriter(); 40 | PrintWriter out = new PrintWriter(sw); 41 | 42 | out.println( "========== Differences Report ==========" ); 43 | out.println( "Schema A = " + labelA ); 44 | out.println( 
"Schema B = " + labelB ); 45 | 46 | out.println(); 47 | addSimpleStatToReport( out, "A: Total Active Docs", fieldStatsA.getTotalDocCount() ); 48 | addSimpleStatToReport( out, "B: Total Active Docs", fieldStatsB.getTotalDocCount() ); 49 | 50 | out.println(); 51 | Set fieldsA = fieldStatsA.getAllFieldNames(); 52 | Set fieldsB = fieldStatsB.getAllFieldNames(); 53 | addSetComparisonToReport( out, fieldsA, fieldsB, "All Fields" ); 54 | 55 | out.println(); 56 | addAllFieldStatsToReport( out, fieldStatsA, fieldStatsB ); 57 | 58 | 59 | // // Simple Values 60 | // // ------------- 61 | // // Name 62 | // String nameA = schemaA.getSchemaName(); 63 | // String nameB = schemaB.getSchemaName(); 64 | // addStringComparisionToReport( out, nameA, nameB, "Schema Name" ); 65 | // // Version 66 | // float versA = schemaA.getSchemaVersion(); 67 | // float versB = schemaB.getSchemaVersion(); 68 | // out.print( "Schema Version: " ); 69 | // if ( versA == versB ) { 70 | // out.println( "Both = '" + versA + "'" ); 71 | // } 72 | // else { 73 | // out.println( "\tA = '" + versA + "'" ); 74 | // out.println( "\tB = '" + versB + "'" ); 75 | // } 76 | 77 | // // Complex Values 78 | // // -------------- 79 | // // Fields 80 | // Set fieldsA = schemaA.getAllSchemaFieldNames(); 81 | // Set fieldsB = schemaB.getAllSchemaFieldNames(); 82 | // addSetComparisonToReport( out, fieldsA, fieldsB, "Fields" ); 83 | // // Dynamic Field Patterns 84 | // // TODO: Verify that order is being preserved through the entire process 85 | // Set patternsA = schemaA.getAllDynamicFieldPatterns(); 86 | // Set patternsB = schemaB.getAllDynamicFieldPatterns(); 87 | // addSetComparisonToReport( out, patternsA, patternsB, "Dynamic-Field Patterns", true ); 88 | 89 | String outStr = sw.toString(); 90 | return outStr; 91 | } 92 | 93 | static void addAllFieldStatsToReport( PrintWriter out, EmptyFieldStats fieldStatsA, EmptyFieldStats fieldStatsB ) { 94 | Set fieldsA = fieldStatsA.getAllFieldNames(); 95 | Set fieldsB = 
fieldStatsB.getAllFieldNames(); 96 | Set allFields = SetUtils.union_nonDestructive( fieldsA, fieldsB ); 97 | 98 | // Fully Populated 99 | Set fullFieldsA = fieldStatsA.getFullyPopulatedIndexedFields(); 100 | Set fullFieldsB = fieldStatsB.getFullyPopulatedIndexedFields(); 101 | // Subset 102 | Set fullFieldsBoth = SetUtils.intersection_nonDestructive( fullFieldsA, fullFieldsB ); 103 | 104 | // Empty 105 | Set emptyFieldsA = fieldStatsA.getFieldsWithNoIndexedValues(); 106 | Set emptyFieldsB = fieldStatsB.getFieldsWithNoIndexedValues(); 107 | // Subset 108 | Set emptyFieldsBoth = SetUtils.intersection_nonDestructive( emptyFieldsA, emptyFieldsB ); 109 | 110 | // All Other Fields 111 | // We can only summarize the subsets of completely full and completely empty fields in both collections 112 | // All other fields need to be listed in the detailed report 113 | Set detailFields = new LinkedHashSet<>(); 114 | detailFields.addAll( allFields ); 115 | detailFields.removeAll( fullFieldsBoth ); 116 | detailFields.removeAll( emptyFieldsBoth ); 117 | 118 | out.println( "Populated at 100% in Both A and B: " + fullFieldsBoth ); 119 | out.println(); 120 | out.println( "No Indexed Values / 0% in Both A and B: " + emptyFieldsBoth ); 121 | out.println(); 122 | 123 | out.println( "Partially Populated Fields and Percentages, A / B:" ); 124 | for ( String name : detailFields ) { 125 | Long countA = null; 126 | if ( fieldStatsA.getIndexedValueCounts().containsKey(name) ) { 127 | countA = fieldStatsA.getIndexedValueCounts().get(name); 128 | } 129 | Double percentA = null; 130 | if ( fieldStatsA.getIndexedValuePercentages().containsKey(name) ) { 131 | percentA = fieldStatsA.getIndexedValuePercentages().get( name ); 132 | } 133 | Long countB = null; 134 | if ( fieldStatsB.getIndexedValueCounts().containsKey(name) ) { 135 | countB = fieldStatsB.getIndexedValueCounts().get(name); 136 | } 137 | Double percentB = null; 138 | if ( fieldStatsB.getIndexedValuePercentages().containsKey(name) ) { 139 
| percentB = fieldStatsB.getIndexedValuePercentages().get( name ); 140 | } 141 | addStatsPairAndPercentToReport( out, name, countA, countB, percentA, percentB, "\t" ); 142 | } 143 | } 144 | 145 | static void addSimpleStatToReport( PrintWriter out, String label, long stat ) { 146 | String statStr = NumberFormat.getNumberInstance().format( stat ); 147 | out.println( "" + label + ": " + statStr ); 148 | } 149 | 150 | static void addStringComparisionToReport( PrintWriter out, String thingA, String thingB, String attrLabel ) { 151 | out.print( attrLabel + ":" ); 152 | if ( null!=thingA && null!=thingB && thingA.equals(thingB) ) { 153 | out.println( " Both = '" + thingA + "'" ); 154 | } 155 | else { 156 | out.println(); 157 | out.println( "\tA = '" + thingA + "'" ); 158 | out.println( "\tB = '" + thingB + "'" ); 159 | } 160 | } 161 | 162 | static void addStatsPairAndPercentToReport( PrintWriter out, String label, Long statA, Long statB, Double percA, Double percB, String optIndent ) { 163 | if ( null!=optIndent ) { 164 | out.print( optIndent ); 165 | } 166 | String statStrA = null!=statA ? NumberFormat.getNumberInstance().format( statA ) : "(not in A)"; 167 | String statStrB = null!=statB ? NumberFormat.getNumberInstance().format( statB ) : "(not in B)"; 168 | String percStrA = null!=percA ? " (" + MessageFormat.format( "{0,number,#.##%}" + ")", percA ) : ""; 169 | String percStrB = null!=percB ? 
" (" + MessageFormat.format( "{0,number,#.##%}" + ")", percB ) : ""; 170 | out.println( "" + label + ": " + statStrA + percStrA + " / " + statStrB + percStrB ); 171 | } 172 | 173 | 174 | static void addSetComparisonToReport( PrintWriter out, Set setA, Set setB, String attrLabel ) { 175 | addSetComparisonToReport( out, setA, setB, attrLabel, false ); 176 | } 177 | static void addSetComparisonToReport( PrintWriter out, Set setA, Set setB, String attrLabel, boolean checkOrder ) { 178 | Set inBoth = SetUtils.intersection_nonDestructive( setA, setB ); 179 | Set inAOnly = SetUtils.inAOnly_nonDestructive( setA, setB ); 180 | Set inBOnly = SetUtils.inBOnly_nonDestructive( setA, setB ); 181 | out.println(); 182 | out.print( attrLabel + ":" ); 183 | if ( inBoth.isEmpty() && inAOnly.isEmpty() && inBOnly.isEmpty() ) { 184 | out.println( " None!" ); 185 | } 186 | else { 187 | out.println(); 188 | if ( ! inBoth.isEmpty() ) { 189 | if ( ! checkOrder ) { 190 | out.println( "\tIn both = '" + inBoth + "'" ); 191 | } 192 | else { 193 | // Note: Sets don't normally perserve order but I've been careful 194 | // to use LinkedHashSet and LinkedHashMap, which DO 195 | Set commonA = SetUtils.intersection_nonDestructive( setA, setB ); 196 | Set commonB = SetUtils.intersection_nonDestructive( setB, setA ); 197 | boolean inSameOrder = SetUtils.sameAndInSameOrder( commonA, commonB ); 198 | if ( inSameOrder ) { 199 | out.println( "\tIn both and SAME relative order = '" + inBoth + "'" ); 200 | } 201 | else { 202 | out.println( "\tIn both but DIFFERENT relative order:" ); 203 | out.println( "\t\tCommon, order in A = '" + commonA + "'" ); 204 | out.println( "\t\tCommon, order in B = '" + commonB + "'" ); 205 | } 206 | } 207 | } 208 | if ( ! inAOnly.isEmpty() ) { 209 | out.println( "\tA only = '" + inAOnly + "'" ); 210 | } 211 | if ( ! 
inBOnly.isEmpty() ) { 212 | out.println( "\tB only = '" + inBOnly + "'" ); 213 | } 214 | } 215 | } 216 | 217 | static void helpAndExit() { 218 | helpAndExit( null, 1 ); 219 | } 220 | static void helpAndExit( String optionalError, int errorCode ) { 221 | HelpFormatter formatter = new HelpFormatter(); 222 | if ( null==optionalError ) { 223 | // log.info( HELP_WHAT_IS_IT ); 224 | System.out.println( HELP_WHAT_IS_IT ); 225 | } 226 | else { 227 | // log.error( optionalError ); 228 | System.err.println( optionalError ); 229 | } 230 | formatter.printHelp( HELP_USAGE, options, true ); 231 | System.exit( errorCode ); 232 | } 233 | 234 | public static void main( String[] argv ) throws Exception { 235 | options = new Options(); 236 | options.addOption( "u", "url_a", true, "URL for first Solr, OR set host, port and possibly collection" ); 237 | options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" ); 238 | options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" ); 239 | options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" ); 240 | options.addOption( "U", "url_b", true, "URL for second Solr, OR set host, port and possibly collection" ); 241 | options.addOption( "H", "host_b", true, "IP address for second Solr, default=localhost" ); 242 | options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" ); 243 | options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" ); 244 | 245 | if ( argv.length < 1 ) { 246 | helpAndExit(); 247 | } 248 | CommandLine cmd = null; 249 | try { 250 | CommandLineParser parser = new PosixParser(); 251 | // CommandLineParser parser = new DefaultParser(); 252 | cmd = parser.parse( options, argv ); 253 | } 254 | catch( ParseException exp ) { 255 | helpAndExit( "Parsing command line failed. 
Reason: " + exp.getMessage(), 2 ); 256 | } 257 | // Already using -h for host, don't really need help, just run with no options 258 | //if ( cmd.hasOption("help") ) { 259 | // helpAndExit(); 260 | //} 261 | 262 | String fullUrlA = cmd.getOptionValue( "url_a" ); 263 | String hostA = cmd.getOptionValue( "host_a" ); 264 | String portA = cmd.getOptionValue( "port_a" ); 265 | String collA = cmd.getOptionValue( "collection_a" ); 266 | if ( null==fullUrlA && null==hostA ) { 267 | helpAndExit( "Must specifify at least url or host for first Solr", 3 ); 268 | } 269 | if ( null!=fullUrlA && null!=hostA ) { 270 | helpAndExit( "Must not specifify both url and host for first Solr", 4 ); 271 | } 272 | 273 | String fullUrlB = cmd.getOptionValue( "url_b" ); 274 | String hostB = cmd.getOptionValue( "host_b" ); 275 | String portB = cmd.getOptionValue( "port_b" ); 276 | String collB = cmd.getOptionValue( "collection_b" ); 277 | if ( null==fullUrlB && null==hostB ) { 278 | helpAndExit( "Must specifify at least url or host for second Solr", 3 ); 279 | } 280 | if ( null!=fullUrlB && null!=hostB ) { 281 | helpAndExit( "Must not specifify both url and host for second Solr", 4 ); 282 | } 283 | 284 | // Init 285 | // HttpSolrServer solrA = SolrUtils.getServer( HOST1, PORT1, COLL1 ); 286 | HttpSolrServer solrA; 287 | if ( null!=fullUrlA ) { 288 | solrA = SolrUtils.getServer( fullUrlA ); 289 | } 290 | else { 291 | // Utils handle null values 292 | solrA = SolrUtils.getServer( hostA, portA, collA ); 293 | } 294 | System.out.println( "First Solr / Solr A = " + solrA.getBaseURL() ); 295 | // HttpSolrServer solrB = SolrUtils.getServer( HOST2, PORT2, COLL2 ); 296 | HttpSolrServer solrB; 297 | if ( null!=fullUrlB ) { 298 | solrB = SolrUtils.getServer( fullUrlB ); 299 | } 300 | else { 301 | // Utils handle null values 302 | solrB = SolrUtils.getServer( hostB, portB, collB ); 303 | } 304 | System.out.println( "Second Solr / Solr B = " + solrB.getBaseURL() ); 305 | 306 | String labelA = 
solrA.getBaseURL(); 307 | EmptyFieldStats fieldsStatsA = new EmptyFieldStats( solrA ); 308 | String reportA = fieldsStatsA.generateReport( labelA ); 309 | 310 | String labelB = solrB.getBaseURL(); 311 | EmptyFieldStats fieldsStatsB = new EmptyFieldStats( solrB ); 312 | String reportB = fieldsStatsB.generateReport( labelB ); 313 | 314 | System.out.println( "========== Individual Reports ==========" ); 315 | System.out.println(); 316 | System.out.println( "---------- A: " + labelA + " ----------" ); 317 | System.out.println( reportA ); 318 | System.out.println( "---------- B: " + labelB + " ----------" ); 319 | System.out.println( reportB ); 320 | 321 | String report = generateReport( fieldsStatsA, fieldsStatsB, labelA, labelB ); 322 | System.out.println( report ); 323 | } 324 | 325 | 326 | static String HOST0 = "localhost"; 327 | static String PORT0 = "8983"; 328 | static String COLL0 = "demo_shard1_replica1"; 329 | static String URL0 = "http://" + HOST0 + ":" + PORT0 + "/solr/" + COLL0; 330 | // + "/select?q=*:*&rows=" + ROWS + "&fl=id&wt=json&indent=on" 331 | 332 | static String HOST1 = "localhost"; 333 | static String PORT1 = "8984"; // "8983"; 334 | static String COLL1 = "collection1"; 335 | static String URL1 = "http://" + HOST1 + ":" + PORT1 + "/solr/" + COLL1; 336 | 337 | static String HOST2 = "localhost"; 338 | static String PORT2 = "8985"; // "8983"; 339 | static String COLL2 = "collection1"; 340 | static String URL2 = "http://" + HOST1 + ":" + PORT2 + "/solr/" + COLL2; 341 | 342 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/diff/DiffIds.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.diff; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import 
java.io.OutputStreamWriter; 10 | import java.io.PrintStream; 11 | import java.io.PrintWriter; 12 | import java.nio.charset.Charset; 13 | import java.nio.charset.CharsetDecoder; 14 | import java.nio.charset.CharsetEncoder; 15 | import java.nio.charset.CodingErrorAction; 16 | import java.util.LinkedHashSet; 17 | import java.util.Set; 18 | 19 | import org.apache.commons.cli.CommandLine; 20 | import org.apache.commons.cli.CommandLineParser; 21 | import org.apache.commons.cli.HelpFormatter; 22 | import org.apache.commons.cli.Options; 23 | import org.apache.commons.cli.ParseException; 24 | import org.apache.commons.cli.PosixParser; 25 | import org.apache.solr.client.solrj.SolrQuery; 26 | import org.apache.solr.client.solrj.SolrServerException; 27 | import org.apache.solr.client.solrj.impl.HttpSolrServer; 28 | import org.apache.solr.client.solrj.response.QueryResponse; 29 | import org.apache.solr.common.SolrDocument; 30 | 31 | import com.lucidworks.dq.util.HasDescription; 32 | import com.lucidworks.dq.util.SetUtils; 33 | import com.lucidworks.dq.util.SolrUtils; 34 | 35 | public class DiffIds /*implements HasDescription*/ { 36 | static String HELP_WHAT_IS_IT = "Compare IDs between two cores/collections."; 37 | static String HELP_USAGE = "DiffIds"; 38 | // final static Logger log = LoggerFactory.getLogger( TermStats.class ); 39 | 40 | static String MODE_REPORT = "full_report"; 41 | static String MODE_A_ONLY = "a_only"; 42 | static String MODE_B_ONLY = "b_only"; 43 | static String MODE_INTERSECT = "intersect"; 44 | static String MODE_UNION = "union"; 45 | static String DEFAULT_MODE = MODE_REPORT; 46 | static Set VALID_MODES = new LinkedHashSet() {{ 47 | add( MODE_REPORT ); 48 | add( MODE_A_ONLY ); 49 | add( MODE_B_ONLY ); 50 | add( MODE_INTERSECT ); 51 | add( MODE_UNION ); 52 | }}; 53 | 54 | public static String getShortDescription() { 55 | return HELP_WHAT_IS_IT; 56 | } 57 | 58 | public static String NL = System.getProperty("line.separator"); 59 | 60 | // command line 
options 61 | static Options options; 62 | 63 | static Set readIdsFromFile( File targetFile, CharsetDecoder deccoder ) throws IOException { 64 | Set ids = new LinkedHashSet(); 65 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), deccoder)); 66 | String line; 67 | while ((line = in.readLine()) != null) { 68 | // skip completely blank lines, but doesn't do any trimming 69 | if ( line.length()<1 ) { 70 | continue; 71 | } 72 | ids.add( line ); 73 | } 74 | in.close(); 75 | return ids; 76 | } 77 | 78 | static void helpAndExit() { 79 | helpAndExit( null, 1 ); 80 | } 81 | static void helpAndExit( String optionalError, int errorCode ) { 82 | HelpFormatter formatter = new HelpFormatter(); 83 | if ( null==optionalError ) { 84 | // log.info( HELP_WHAT_IS_IT ); 85 | System.err.println( HELP_WHAT_IS_IT ); 86 | } 87 | else { 88 | // log.error( optionalError ); 89 | System.err.println( optionalError ); 90 | } 91 | // stdout 92 | //formatter.printHelp( HELP_USAGE, options, true ); 93 | // stderr 94 | PrintWriter pw = new PrintWriter(System.err); 95 | formatter.printHelp( pw, 78, HELP_USAGE, null, options, 1, 1, null, true ); 96 | pw.flush(); 97 | System.exit( errorCode ); 98 | } 99 | 100 | public static void main( String[] argv ) throws SolrServerException, IOException { 101 | 102 | options = new Options(); 103 | options.addOption( "u", "url_a", true, "URL for first Solr, Eg http://localhost:8983/solr/collection1, OR set host, port and possibly collection" ); 104 | options.addOption( "h", "host_a", true, "IP address for first Solr, default=localhost" ); 105 | options.addOption( "p", "port_a", true, "Port for first Solr, default=8983" ); 106 | options.addOption( "c", "collection_a", true, "Collection/Core for first Solr, Eg: collection1" ); 107 | 108 | options.addOption( "U", "url_b", true, "URL for second Solr, Eg http://localhost:8983/solr/collection2, OR set host, port and possibly collection" ); 109 | options.addOption( "H", "host_b", 
true, "IP address for second Solr, default=localhost" ); 110 | options.addOption( "P", "port_b", true, "Port for second Solr, default=8983" ); 111 | options.addOption( "C", "collection_b", true, "Collection/Core for second Solr, Eg: collection1" ); 112 | 113 | options.addOption( "f", "file_a", true, "Read IDs for A from a text file, one ID per line (skips 0 length lines, not counting newlines)" ); 114 | options.addOption( "F", "file_b", true, "Read IDs for B from a text file, one ID per line (skips 0 length lines, not counting newlines)" ); 115 | 116 | options.addOption( "o", "output_file", true, "Output file to create for the full report or ID list (default or \"-\" is stdout / standard out)" ); 117 | options.addOption( "e", "encoding", true, "Character Encoding for reading and writing files (default is UTF-8, which enables cross-platform comparisons)" ); 118 | options.addOption( "l", "loose_encoding", false, "Disable strict character encoding so that problems don't throw Exceptions (NOT recommended)" ); 119 | 120 | options.addOption( "m", "mode", true, 121 | "What to output:" 122 | + " \"" + MODE_REPORT + "\" means fully formatted report (default)" 123 | + ", \"" + MODE_A_ONLY + "\" bare list of IDs only in A (one per line)" 124 | + ", \"" + MODE_B_ONLY + "\" IDs only in B" 125 | + ", \"" + MODE_INTERSECT + "\" IDs preent in BOTH A AND B" 126 | + ", \"" + MODE_UNION + "\" IDs in A or B or in both (combines all IDs from both, but each ID will only appear once)" 127 | ); 128 | if ( argv.length < 1 ) { 129 | helpAndExit(); 130 | } 131 | CommandLine cmd = null; 132 | try { 133 | CommandLineParser parser = new PosixParser(); 134 | // CommandLineParser parser = new DefaultParser(); 135 | cmd = parser.parse( options, argv ); 136 | } 137 | catch( ParseException exp ) { 138 | helpAndExit( "Parsing command line failed. 
Reason: " + exp.getMessage(), 2 ); 139 | } 140 | // Already using -h for host, don't really need help, just run with no options 141 | //if ( cmd.hasOption("help") ) { 142 | // helpAndExit(); 143 | //} 144 | 145 | String fullUrlA = cmd.getOptionValue( "url_a" ); 146 | String hostA = cmd.getOptionValue( "host_a" ); 147 | String portA = cmd.getOptionValue( "port_a" ); 148 | String collA = cmd.getOptionValue( "collection_a" ); 149 | String fileA = cmd.getOptionValue( "file_a" ); 150 | int optsA = 0; 151 | optsA += (null!=fullUrlA) ? 1 : 0; 152 | optsA += (null!=hostA) ? 1 : 0; 153 | optsA += (null!=fileA) ? 1 : 0; 154 | if ( optsA < 1 ) { 155 | helpAndExit( "Must specifify at least url or host or ids file for first Solr instance", 3 ); 156 | } 157 | if ( optsA > 1 ) { 158 | helpAndExit( "Can only specifify one of url, host or ids file for first Solr instance", 4 ); 159 | } 160 | 161 | String fullUrlB = cmd.getOptionValue( "url_b" ); 162 | String hostB = cmd.getOptionValue( "host_b" ); 163 | String portB = cmd.getOptionValue( "port_b" ); 164 | String collB = cmd.getOptionValue( "collection_b" ); 165 | String fileB = cmd.getOptionValue( "file_b" ); 166 | int optsB = 0; 167 | optsB += (null!=fullUrlB) ? 1 : 0; 168 | optsB += (null!=hostB) ? 1 : 0; 169 | optsB += (null!=fileB) ? 1 : 0; 170 | if ( optsB < 1 ) { 171 | helpAndExit( "Must specifify at least url or host or ids file for second Solr instance", 3 ); 172 | } 173 | if ( optsB > 1 ) { 174 | helpAndExit( "Can only specifify one of url, host or ids file for second Solr instance", 4 ); 175 | } 176 | 177 | // VALID_MODES 178 | String mode = cmd.getOptionValue( "mode" ); 179 | if ( null!=mode ) { 180 | mode = mode.toLowerCase().trim(); 181 | if ( ! 
VALID_MODES.contains(mode) ) { 182 | helpAndExit( "Invalid mode, must be one of: " + VALID_MODES, 5 ); 183 | } 184 | } 185 | boolean isNormalReport = (null==mode) || mode.equals( MODE_REPORT ); 186 | 187 | // File IO 188 | String outputFile = cmd.getOptionValue( "output_file" ); 189 | String encodingStr = cmd.getOptionValue( "encoding" ); 190 | if ( null==encodingStr || encodingStr.trim().length()<1 ) { 191 | encodingStr = "UTF-8"; 192 | } 193 | boolean strictEncoding = true; 194 | if(cmd.hasOption("loose_encoding")) { 195 | strictEncoding = false; 196 | } 197 | 198 | // Setup IO encoding 199 | Charset charset = Charset.forName( encodingStr ); 200 | // Input uses Decoder 201 | CharsetDecoder decoder = charset.newDecoder(); 202 | // Output uses Encoder 203 | CharsetEncoder encoder = charset.newEncoder(); 204 | if ( strictEncoding ) { 205 | decoder.onMalformedInput( CodingErrorAction.REPORT ); 206 | encoder.onMalformedInput( CodingErrorAction.REPORT ); 207 | } 208 | 209 | PrintWriter out = null; 210 | if( null!=outputFile && ! 
outputFile.equals("-") ) { 211 | out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoder), true); 212 | } else { 213 | out = new PrintWriter(new OutputStreamWriter(System.out, encoder), true); 214 | } 215 | 216 | // Init 217 | // HttpSolrServer solrA = new HttpSolrServer( URL1 ); 218 | HttpSolrServer solrA = null; 219 | if ( null==fileA ) { 220 | if ( null!=fullUrlA ) { 221 | solrA = SolrUtils.getServer( fullUrlA ); 222 | } 223 | else { 224 | // Utils handle null values 225 | solrA = SolrUtils.getServer( hostA, portA, collA ); 226 | } 227 | if(isNormalReport) out.println( "First Solr / Solr A = " + solrA.getBaseURL() ); 228 | } 229 | else { 230 | if(isNormalReport) out.println( "First Solr / Solr A read from file = " + fileA ); 231 | } 232 | 233 | // HttpSolrServer solrB = new HttpSolrServer( URL2 ); 234 | HttpSolrServer solrB = null; 235 | if ( null==fileB ) { 236 | if ( null!=fullUrlB ) { 237 | solrB = SolrUtils.getServer( fullUrlB ); 238 | } 239 | else { 240 | // Utils handle null values 241 | solrB = SolrUtils.getServer( hostB, portB, collB ); 242 | } 243 | if(isNormalReport) out.println( "Second Solr / Solr B = " + solrB.getBaseURL() ); 244 | } 245 | else { 246 | if(isNormalReport) out.println( "Second Solr / Solr B read from file = " + fileB ); 247 | } 248 | 249 | Set idsA = (null!=solrA) ? SolrUtils.getAllIds( solrA ) : readIdsFromFile( new File(fileA), decoder ); 250 | Set idsB = (null!=solrB) ? 
SolrUtils.getAllIds( solrB ) : readIdsFromFile( new File(fileB), decoder ); 251 | 252 | if ( isNormalReport ) { 253 | // Use non-destructive here since we use the lists more than once 254 | Set aOnly = SetUtils.inAOnly_nonDestructive(idsA, idsB); 255 | Set bOnly = SetUtils.inBOnly_nonDestructive(idsA, idsB); 256 | out.println( "A-only: " + aOnly ); 257 | out.println( "B-only: " + bOnly ); 258 | } 259 | else { 260 | Set ids = null; 261 | if ( mode.equals(MODE_A_ONLY) ) { 262 | // destructive OK here since we're just doing 1 calculation 263 | ids = SetUtils.inAOnly_destructive( idsA, idsB ); 264 | } 265 | else if ( mode.equals(MODE_B_ONLY) ) { 266 | ids = SetUtils.inBOnly_destructive( idsA, idsB ); 267 | } 268 | else if ( mode.equals(MODE_INTERSECT) ) { 269 | ids = SetUtils.intersection_destructive( idsA, idsB ); 270 | } 271 | else if ( mode.equals(MODE_UNION) ) { 272 | ids = SetUtils.union_destructive( idsA, idsB ); 273 | } 274 | else { 275 | // This should never happen. 276 | // If it ever does, maybe somebody added to VALID_MODES but didn't add a case here 277 | throw new IllegalStateException( "Unknown mode \"" + mode + "\", check VALID_MODES" ); 278 | } 279 | 280 | // Print the results 281 | for ( String id : ids ) { 282 | out.println( id ); 283 | } 284 | } 285 | out.close(); 286 | } 287 | 288 | } 289 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntry.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.Collection; 4 | 5 | /* 6 | * Log entries can have structure. 7 | * Sometimes the structure isn't known when log entries are first ingested, they may come in as raw strings. 8 | * The idea is that a log entry could be fed into a process and then a more specific log entry comes out. 9 | * This process could be repeated for even more specific or normalized entries. 
10 | * Ideally more evolved log entries can have the option of still referring back to their parent entries 11 | * for auditing or so that rules can be rerun. 12 | * Another issue is that some series of lines in a log file constitute a higher level log entry. 13 | * Some of the strecture might be fixed text, whereas other items might be parameterizable. 14 | * Eg: 15 | * &name=dave 16 | * &name=mark 17 | * &name=satish 18 | * -> "name" is a fixed identifier, whereas values can vary. 19 | * 20 | * My post on Stack Overflow: 21 | * http://stackoverflow.com/questions/26518770/advanced-requirements-for-log-file-utilities-am-i-reinventing-the-wheel 22 | */ 23 | interface LogEntry { 24 | 25 | String getRawText(); 26 | 27 | Collection getReferences(); 28 | // TODO: should setters be defined in Interface? 29 | // void addReference( LogEntryReference ref ); 30 | 31 | // getDate 32 | // getPath 33 | // getHandler 34 | // getParamsString 35 | // getParent 36 | // getChildren 37 | // getEntities 38 | // getEventLevel // Info, warn, error, default 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntryBase.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collection; 5 | 6 | public class LogEntryBase implements LogEntry { 7 | 8 | String rawText; 9 | Collection references = new ArrayList<>(); 10 | 11 | LogEntryBase( String rawText ) { 12 | this.rawText = rawText; 13 | } 14 | 15 | @Override 16 | public String getRawText() { 17 | return rawText; 18 | } 19 | public void setRawText( String rawText ) { 20 | this.rawText = rawText; 21 | } 22 | 23 | public static LogEntry logEntryFromString( String rawText ) { 24 | return new LogEntryBase( rawText ); 25 | } 26 | 27 | 28 | @Override 29 | public Collection getReferences() { 30 | return references; 31 | } 32 | 33 | // 
@Override 34 | public void addReference(LogEntryReference ref) { 35 | references.add( ref ); 36 | } 37 | 38 | /* 39 | * Throw exception so that derived classes are allowed to do so 40 | */ 41 | public static void main(String[] args) throws Exception { 42 | for ( int i=0; i> parsedParamValues; 72 | 73 | // factory method 74 | public static LogEntry solrLogEntryFromBaseEntryOrNull( LogEntry entry ) { 75 | LogEntryFromSolr newEntry = new LogEntryFromSolr( entry ); 76 | if ( newEntry.isSolrPattern() ) { 77 | return newEntry; 78 | } 79 | else { 80 | return null; 81 | } 82 | } 83 | 84 | LogEntryFromSolr( LogEntry entry ) { 85 | this( entry.getRawText() ); 86 | this.earlierEntry = entry; 87 | init( entry.getRawText() ); 88 | } 89 | LogEntryFromSolr(String rawText) { 90 | super( rawText ); 91 | init( rawText ); 92 | } 93 | // need init broken out so constructor1 can store earlierEntry before calling this 94 | void init( String rawText ) { 95 | this.originalText = rawText; 96 | paramsPattern = Pattern.compile( PARAMS_PATTERN_STR ); 97 | paramsMatcher = paramsPattern.matcher( rawText ); 98 | if ( paramsMatcher.find() ) { 99 | String matchStr = paramsMatcher.group(); 100 | setRawText( matchStr ); 101 | int overallStart = paramsMatcher.start(); 102 | int overallEnd = paramsMatcher.end(); 103 | 104 | int group = 1; 105 | paramsString = paramsMatcher.group( group ); 106 | paramsStart = paramsMatcher.start( group ); 107 | paramsEnd = paramsMatcher.end( group ); 108 | // Make relative to overall pattern match 109 | paramsStart -= overallStart; 110 | // paramsEnd = overallEnd - paramsEnd; 111 | // Relative-to-end might not work in streaming apps since we wouldn't know where the end is 112 | paramsEnd -= overallStart; 113 | 114 | 115 | // TODO: look for other things like the handler, matches and qtime 116 | 117 | // Hookup references *if* we were created from an earlier log entry 118 | if ( null != this.earlierEntry ) { 119 | LogEntryReference ref = new LogEntryReferenceBase( 
this.earlierEntry, this, "LogEntryFromSolr" ); 120 | // ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( paramsStart, paramsEnd ); 121 | ((LogEntryReferenceBase) ref).setRelativeRegionOfInterest( overallStart, overallEnd ); 122 | } 123 | 124 | doSimpleFieldParsing(); 125 | 126 | isSolrPattern = true; 127 | } 128 | } 129 | 130 | public String makeParamNamesKey() { 131 | return StringUtils.join( getParsedSolrParams().keySet(), "|" ); 132 | } 133 | public Set getParamNames() { 134 | return getParsedSolrParams().keySet(); 135 | } 136 | public Collection getParamValues( String paramName ) { 137 | return getParsedSolrParams().get( paramName ); 138 | } 139 | 140 | public static Map tabulateQueryArgCombos( Collection entries ) { 141 | Map counts = new HashMap<>(); 142 | for ( LogEntryFromSolr e : entries ) { 143 | String key = e.makeParamNamesKey(); 144 | SetUtils.incrementMapCounter( counts, key ); 145 | } 146 | return counts; 147 | } 148 | // { composite-parameter-key -> { each-parameter-name-> { unique-value: count } } } 149 | public static Map>> tabulateQueryArgCombosAndValues( Collection entries ) { 150 | // Level 1: by Composite Key 151 | Map>> nestedCounts = new HashMap<>(); 152 | // Foreach Raw Entry 153 | for ( LogEntryFromSolr e : entries ) { 154 | 155 | String overallKey = e.makeParamNamesKey(); 156 | // Level 2: by Parameter Name 157 | Map> paramsAndValues = null; 158 | if ( nestedCounts.containsKey(overallKey) ) { 159 | paramsAndValues = nestedCounts.get(overallKey); 160 | } 161 | else { 162 | paramsAndValues = new TreeMap<>(); // LinkedHashMap<>(); 163 | nestedCounts.put( overallKey, paramsAndValues ); 164 | } 165 | 166 | Set paramNames = e.getParamNames(); 167 | // Foreach Parameter Name 168 | for ( String name : paramNames ) { 169 | // Level 3: by Value 170 | Map tabulatedValues = null; 171 | if ( paramsAndValues.containsKey(name) ) { 172 | tabulatedValues = paramsAndValues.get(name); 173 | } 174 | else { 175 | tabulatedValues = new 
LinkedHashMap<>(); 176 | paramsAndValues.put( name, tabulatedValues ); 177 | } 178 | Collection rawValues = e.getParamValues( name ); 179 | for ( String rv : rawValues ) { 180 | Long count = 0L; 181 | if ( tabulatedValues.containsKey(rv) ) { 182 | count = tabulatedValues.get(rv); 183 | } 184 | count += 1L; 185 | tabulatedValues.put( rv, count ); 186 | } 187 | 188 | } // End Foreach Parameter Name 189 | 190 | } // End Foreach Raw Entry 191 | 192 | return nestedCounts; 193 | } 194 | 195 | void doSimpleFieldParsing() { 196 | parseHandlerName(); 197 | parseCollectionName(); 198 | parseHits(); 199 | parseStatus(); 200 | parseQTime(); 201 | } 202 | void parseHandlerName() { 203 | handlerName = StringUtils.parseAndCatchGroupAsStringOrNull( HANDLER_PATTERN_STR, getOriginalText(), 1 ); 204 | } 205 | void parseCollectionName() { 206 | collectionName = StringUtils.parseAndCatchGroupAsStringOrNull( COLLECTION_PATTERN_STR, getOriginalText(), 1 ); 207 | } 208 | void parseHits() { 209 | hits = StringUtils.parseAndCatchGroupAsLongOrNull( HITS_PATTERN_STR, getOriginalText(), 1 ); 210 | } 211 | void parseStatus() { 212 | status = StringUtils.parseAndCatchGroupAsLongOrNull( STATUS_PATTERN_STR, getOriginalText(), 1 ); 213 | } 214 | void parseQTime() { 215 | qTime = StringUtils.parseAndCatchGroupAsLongOrNull( QTIME_PATTERN_STR, getOriginalText(), 1 ); 216 | } 217 | 218 | // Not thread safe, but OK for now, for single thread utility 219 | public Map> getParsedSolrParams() { 220 | if ( null==parsedParamValues ) { 221 | parsedParamValues = StringUtils.parseCgiParameters( getParamsString() ); 222 | } 223 | return parsedParamValues; 224 | } 225 | 226 | public boolean isSolrPattern() { 227 | return isSolrPattern; 228 | } 229 | 230 | public String getParamsString() { 231 | return paramsString; 232 | } 233 | 234 | String getOriginalText() { 235 | return originalText; 236 | } 237 | 238 | String getHandlerName() { 239 | return handlerName; 240 | } 241 | String getCollectionName() { 242 | return 
collectionName; 243 | } 244 | // Don't really need Longs here, but it's what utility returns 245 | /* 246 | * get number of Matches 247 | */ 248 | Long getHits() { 249 | return hits; 250 | } 251 | /* 252 | * Similar to HTTP Numeric Status Code 253 | * Eg: 200, 500, etc. 254 | */ 255 | Long getStatus() { 256 | return status; 257 | } 258 | /* 259 | * Query time in milliseconds 260 | * may not include transmission time of payload to requesting client 261 | */ 262 | Long getQTime() { 263 | return qTime; 264 | } 265 | 266 | public static void main(String[] args) throws IOException { 267 | for ( int i=0; i logs = repo.findLogFiles(); 272 | for ( LogFile lf : logs ) { 273 | lf.read(); 274 | Collection rawEntries = lf.getEntries(); 275 | Collection solrEntries = new ArrayList<>(); 276 | for ( LogEntry rawEntry : rawEntries ) { 277 | // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry ); 278 | LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry ); 279 | // if ( solrEntry.isSolrPattern() ) 280 | if ( null != solrEntry ) 281 | { 282 | solrEntries.add( (LogEntryFromSolr) solrEntry ); 283 | } 284 | } 285 | 286 | // Tabulate 287 | Map queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries ); 288 | // composite-parameter-key -> each-parameter-name-> unique-value -> count 289 | Map>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries ); 290 | queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts ); 291 | queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts ); 292 | 293 | // Report 294 | for ( Entry e1 : queryTypeCounts.entrySet() ) { 295 | String queryType = e1.getKey(); 296 | Long queryTypeCount = e1.getValue(); 297 | System.out.println( "" + queryTypeCount + " " + queryType ); 298 | Map> statsForQueryType = detailedStats.get( queryType ); 299 | for ( Entry> e2 : statsForQueryType.entrySet() ) { 300 | String paramName = e2.getKey(); 301 | System.out.println( "\t" + paramName + 
":" ); 302 | Map paramValues = e2.getValue(); 303 | paramValues = SetUtils.sortMapByValues( paramValues ); 304 | paramValues = SetUtils.reverseMapEntryKeyOrder( paramValues ); 305 | for ( Entry e3 : paramValues.entrySet() ) { 306 | String value = e3.getKey(); 307 | Long valueCount = e3.getValue(); 308 | System.out.println( "\t\t" + valueCount + " " + value ); 309 | } 310 | } 311 | } 312 | } 313 | // System.out.println( repo ); 314 | } 315 | 316 | } 317 | 318 | } 319 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntryGroup.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.Collection; 4 | 5 | /* 6 | * TODO: Do we really need this? 7 | * Pro: good abstraction, might developer additional features 8 | * Con: converting back and forth between this and Collection 9 | */ 10 | public interface LogEntryGroup /*extends Collection*/ { 11 | Collection getEntries(); 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntryGroupFromSolr.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.Collection; 4 | 5 | public class LogEntryGroupFromSolr implements LogEntryGroup { 6 | 7 | @Override 8 | public Collection getEntries() { 9 | // TODO Auto-generated method stub 10 | return null; 11 | } 12 | 13 | public static void main(String[] args) { 14 | // TODO Auto-generated method stub 15 | 16 | } 17 | 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntryReference.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.Collection; 4 | 5 | public interface 
LogEntryReference { 6 | Collection getEarlierEntries(); 7 | Collection getLaterEntries(); 8 | //void addEarlierEntry( LogEntry entry ); 9 | //void addLaterEntry( LogEntry entry ); 10 | 11 | String getComment(); 12 | //void setComment( String comment ); 13 | 14 | int getRelativeStart(); 15 | int getRelativeEnd(); 16 | //void setRelativeRegionOfInterest( int fromStart, int fromEnd ); 17 | //void setRelativeStart( int fromStart ); 18 | //void setRelativeEnd( int fromEnd ); 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogEntryReferenceBase.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collection; 5 | 6 | public class LogEntryReferenceBase implements LogEntryReference { 7 | 8 | String comment; 9 | // LogEntryGroup is approx Collection 10 | Collection earlierEntries = new ArrayList<>(); 11 | Collection laterEntries = new ArrayList<>(); 12 | 13 | int relativeRegionOfInterestStart; 14 | int relativeRegionOfInterestEnd; 15 | 16 | public LogEntryReferenceBase() { } 17 | 18 | public LogEntryReferenceBase( LogEntry earlierEntry, LogEntry laterEntry, String comment ) { 19 | this(); 20 | // Link to log entries 21 | addEarlierEntry( earlierEntry ); 22 | addLaterEntry( laterEntry ); 23 | // Link log entries back to us 24 | ( (LogEntryBase)earlierEntry ).addReference( this ); 25 | ( (LogEntryBase)laterEntry ).addReference( this ); 26 | setComment( comment ); 27 | } 28 | 29 | @Override 30 | public Collection getEarlierEntries() { 31 | return earlierEntries; 32 | } 33 | public void addEarlierEntry( LogEntry entry ) { 34 | earlierEntries.add( entry ); 35 | } 36 | 37 | @Override 38 | public Collection getLaterEntries() { 39 | return laterEntries; 40 | } 41 | public void addLaterEntry( LogEntry entry ) { 42 | laterEntries.add( entry ); 43 | } 44 | 45 | @Override 46 | 
public String getComment() { 47 | return comment; 48 | } 49 | public void setComment( String comment ) { 50 | this.comment = comment; 51 | } 52 | 53 | @Override 54 | public int getRelativeStart() { 55 | return relativeRegionOfInterestStart; 56 | } 57 | @Override 58 | public int getRelativeEnd() { 59 | return relativeRegionOfInterestEnd; 60 | } 61 | //@Override 62 | public void setRelativeRegionOfInterest( int fromStart, int fromEnd ) { 63 | relativeRegionOfInterestStart = fromStart; 64 | relativeRegionOfInterestEnd = fromEnd; 65 | } 66 | //@Override 67 | public void setRelativeStart( int fromStart ) { 68 | this.relativeRegionOfInterestStart = fromStart; 69 | } 70 | //@Override 71 | public void setRelativeEnd( int fromEnd ) { 72 | this.relativeRegionOfInterestEnd = fromEnd; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogFile.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.io.IOException; 4 | import java.util.Collection; 5 | 6 | public interface LogFile extends LogEntryGroup { 7 | 8 | void read() throws IOException; 9 | 10 | // Inherits getEntries() from super 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogFileBase.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.io.UnsupportedEncodingException; 10 | import java.util.ArrayList; 11 | import java.util.Collection; 12 | 13 | public class LogFileBase implements LogFile { 14 | 15 | // TODO: could leave this NULL until they've called .process() ? 
16 | Collection entries = new ArrayList<>(); 17 | File sourceFile; 18 | 19 | // Public "factory" methods 20 | public static LogFile logFileFromDiskFile( File inFile ) throws IOException { 21 | return new LogFileBase( inFile ); 22 | } 23 | public static LogFile logFileFromDiskFile( String fileName ) throws IOException { 24 | return new LogFileBase( new File(fileName) ); 25 | } 26 | 27 | LogFileBase( File sourceFile ) { 28 | this.sourceFile = sourceFile; 29 | } 30 | 31 | // Break out processing logic out from constructor 32 | // in case we want to defer it 33 | @Override 34 | public void read() throws IOException { 35 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(sourceFile), "UTF-8")); 36 | while( true ) { 37 | String line = in.readLine(); 38 | if ( null==line ) { 39 | break; 40 | } 41 | LogEntry entry = LogEntryBase.logEntryFromString( line ); 42 | entries.add( entry ); 43 | } 44 | in.close(); 45 | } 46 | 47 | @Override 48 | public Collection getEntries() { 49 | return entries; 50 | } 51 | 52 | 53 | public static void main(String[] args) throws IOException { 54 | for ( int i=0; i findLogFiles( File startingDirOrFile ); 10 | //Collection findLogFiles( Collection startingDirOrFiles ); 11 | Collection findLogFiles(); 12 | 13 | // TODO: maybe Log *File* Repo is a filesystem impl of a more generic Log Unit Source Repo 14 | // TODO: although we really do need setters, should they be defined in the interface? 
15 | String getIncludePattern(); 16 | void setIncludePattern( String pattern ); 17 | boolean getIncludeCompressedFiles(); 18 | void setIncludeCompressedFiles( boolean flag ); 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/logs/LogFileRepoBase.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.logs; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Collection; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | import java.util.Queue; 10 | import java.util.concurrent.ConcurrentLinkedQueue; 11 | 12 | import com.lucidworks.dq.util.SetUtils; 13 | 14 | public class LogFileRepoBase implements LogFileRepo { 15 | 16 | Collection myQueue = new ConcurrentLinkedQueue<>(); 17 | 18 | File startingDirOrFile; 19 | 20 | // Regex, Optional 21 | String includePattern; 22 | 23 | boolean shouldIncludeCompressedFiles; 24 | 25 | public LogFileRepoBase( String startingDirOrFile ) { 26 | this( new File(startingDirOrFile) ); 27 | } 28 | public LogFileRepoBase( File startingDirOrFile ) { 29 | this.startingDirOrFile = startingDirOrFile; 30 | } 31 | 32 | @Override 33 | public Collection findLogFiles() { 34 | traverse( myQueue, startingDirOrFile ); 35 | Collection outList = new ArrayList<>(); 36 | for ( File f : myQueue ) { 37 | LogFile lf = new LogFileBase( f ); 38 | outList.add( lf ); 39 | } 40 | return outList; 41 | } 42 | 43 | @Override 44 | public void setIncludePattern(String pattern) { 45 | this.includePattern = pattern; 46 | } 47 | @Override 48 | public String getIncludePattern() { 49 | return includePattern; 50 | } 51 | 52 | @Override 53 | public void setIncludeCompressedFiles(boolean flag) { 54 | this.shouldIncludeCompressedFiles = flag; 55 | } 56 | @Override 57 | public boolean getIncludeCompressedFiles() { 58 | return shouldIncludeCompressedFiles; 59 | } 60 | 61 | 
//Lookup all the files 62 | //traverse( myQueue, "someDirName", null ); 63 | //Or simpler 64 | //Collection files = LinkedHashSet(); 65 | //traverse( files, "someDirName", null ); 66 | 67 | //TODO: would be better to pass in method to call 68 | void traverse( Collectionqueue, String startDir ) { 69 | traverse( queue, new File(startDir) ); 70 | } 71 | void traverse( Collectionqueue, File candidate ) { 72 | if( candidate.isFile() ) { 73 | if ( null==getIncludePattern() || candidate.toString().matches(getIncludePattern()) ) { 74 | queue.add( candidate ); 75 | } 76 | } 77 | // Else probably a directory 78 | else if ( candidate.isDirectory() ) { 79 | File [] entries = candidate.listFiles(); 80 | for ( File f : entries ) { 81 | traverse( queue, f ); 82 | } 83 | } 84 | else { 85 | System.out.println( "ERROR: Neither file nor directory: " + candidate ); 86 | } 87 | } 88 | 89 | public static void main(String[] args) throws IOException { 90 | // Moved to LogEntryFromSolr main 91 | 92 | // for ( int i=0; i logs = repo.findLogFiles(); 95 | // for ( LogFile lf : logs ) { 96 | // lf.read(); 97 | // Collection rawEntries = lf.getEntries(); 98 | // Collection solrEntries = new ArrayList<>(); 99 | // for ( LogEntry rawEntry : rawEntries ) { 100 | // // LogEntryFromSolr solrEntry = new LogEntryFromSolr( rawEntry ); 101 | // LogEntry solrEntry = LogEntryFromSolr.solrLogEntryFromBaseEntryOrNull( rawEntry ); 102 | // // if ( solrEntry.isSolrPattern() ) 103 | // if ( null != solrEntry ) 104 | // { 105 | // solrEntries.add( (LogEntryFromSolr) solrEntry ); 106 | // } 107 | // } 108 | // Map queryTypeCounts = LogEntryFromSolr.tabulateQueryArgCombos( solrEntries ); 109 | // // composite-parameter-key -> each-parameter-name-> unique-value -> count 110 | // Map>> detailedStats = LogEntryFromSolr.tabulateQueryArgCombosAndValues( solrEntries ); 111 | // queryTypeCounts = SetUtils.sortMapByValues( queryTypeCounts ); 112 | // queryTypeCounts = SetUtils.reverseMapEntryKeyOrder( queryTypeCounts ); 
package com.lucidworks.dq.schema;

import java.util.Map;
import java.util.Set;

/**
 * Read-only view of a Solr schema, independent of how it was obtained
 * (REST, local XML, local core).  Implementations: SchemaFromRest,
 * SchemaFromXml, etc.; SchemaBase supplies the shared report generator.
 */
public interface Schema {

  // TODO: move throws Exception down to implementation level
  // and errors buffer

  /** Schema "version" attribute, e.g. 1.5. */
  public float getSchemaVersion() throws Exception;

  /** Schema "name" attribute. */
  public String getSchemaName() throws Exception;

  /** Name of the field declared as the uniqueKey. */
  public String getUniqueKeyFieldName() throws Exception;

  /** Fully-qualified class name of the similarity implementation. */
  public String getSimilarityModelClassName() throws Exception;

  /** Default query operator (AND/OR), where the schema declares one. */
  public String getDefaultOperator() throws Exception;

  /** Default search field name, where the schema declares one. */
  public String getDefaultSearchField() throws Exception;

  /** Type name -> declared field names and dynamic-field patterns of that type. */
  public Map<String,Set<String>> getAllDeclaredAndDynamicFieldsByType() throws Exception;

  /** Explicitly declared field names. */
  public Set<String> getAllSchemaFieldNames() throws Exception;

  /** Patterns (e.g. "*_s") of all dynamicField declarations. */
  public Set<String> getAllDynamicFieldPatterns() throws Exception;

  /** Names of all declared field types. */
  public Set<String> getAllFieldTypeNames() throws Exception;

  /** Source field names appearing in copyField declarations. */
  public Set<String> getAllCopyFieldSourceNames() throws Exception;

  /** Destination field names appearing in copyField declarations. */
  public Set<String> getAllCopyFieldDestinationNames() throws Exception;

  /** copyField destinations for the given source field. */
  public Set<String> getCopyFieldDestinationsForSource(String sourceName) throws Exception;

  /** copyField sources that feed the given destination field. */
  public Set<String> getCopyFieldSourcesForDestination(String destName) throws Exception;

  /** Human-readable multi-line summary of the whole schema. */
  public String generateReport() throws Exception;

}
getAllFieldTypeNames(); 41 | out.println(); 42 | out.println( "Types: " + typeNames ); 43 | 44 | Map> typesAndNames = getAllDeclaredAndDynamicFieldsByType(); 45 | out.println(); 46 | out.println( "Type -> Fields: (declared and dynamic patterns)" ); 47 | out.println( "\t(" + typesAndNames.size() + " types)" ); 48 | for ( String type : typesAndNames.keySet() ) { 49 | out.println( "\t" + type + ":" ); 50 | Set typeFields = typesAndNames.get( type ); 51 | out.println( "\t\t(" + typeFields.size() + " fields)" ); 52 | for ( String field : typeFields ) { 53 | out.println( "\t\t" + field ); 54 | } 55 | } 56 | 57 | 58 | Set sourceNames = getAllCopyFieldSourceNames(); 59 | out.println(); 60 | out.println( "Copy Sources: " + sourceNames ); 61 | for ( String source : sourceNames ) { 62 | Set tmpDests = getCopyFieldDestinationsForSource(source); 63 | out.println( "\tFrom: '"+ source + "' To " + tmpDests ); 64 | } 65 | 66 | Set destNames = getAllCopyFieldDestinationNames(); 67 | out.println(); 68 | out.println( "Copy Destinations: " + destNames ); 69 | for ( String dest : destNames ) { 70 | Set tmpSrcs = getCopyFieldSourcesForDestination( dest ); 71 | out.println( "\tDest: '"+ dest + "' From " + tmpSrcs ); 72 | } 73 | 74 | String outStr = sw.toString(); 75 | return outStr; 76 | } 77 | 78 | static void utilTabulateFieldTypeAndName( Map> map, String type, String name ) { 79 | if ( map.containsKey(type) ) { 80 | map.get(type).add( name ); 81 | } 82 | else { 83 | Set vector = new LinkedHashSet<>(); 84 | vector.add( name ); 85 | map.put( type, vector ); 86 | } 87 | } 88 | 89 | @Override 90 | public abstract float getSchemaVersion() throws Exception; 91 | @Override 92 | public abstract String getSchemaName() throws Exception; 93 | @Override 94 | public abstract String getUniqueKeyFieldName() throws Exception; 95 | @Override 96 | public abstract String getSimilarityModelClassName() throws Exception; 97 | @Override 98 | public abstract String getDefaultOperator() throws Exception; 99 | 
@Override 100 | public abstract String getDefaultSearchField() throws Exception; 101 | @Override 102 | public abstract Set getAllSchemaFieldNames() throws Exception; 103 | @Override 104 | public abstract Set getAllDynamicFieldPatterns() throws Exception; 105 | @Override 106 | public abstract Set getAllFieldTypeNames() throws Exception; 107 | @Override 108 | public abstract Set getAllCopyFieldSourceNames() throws Exception; 109 | @Override 110 | public abstract Set getAllCopyFieldDestinationNames() throws Exception; 111 | @Override 112 | public abstract Set getCopyFieldDestinationsForSource(String sourceName) throws Exception; 113 | @Override 114 | public abstract Set getCopyFieldSourcesForDestination(String destName) throws Exception; 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/schema/SchemaFromLocalCore_broken.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.schema; 2 | 3 | import java.util.LinkedHashMap; 4 | import java.util.LinkedHashSet; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Map.Entry; 8 | import java.util.Properties; 9 | import java.util.Set; 10 | 11 | import org.apache.solr.common.util.NamedList; 12 | import org.apache.solr.core.ConfigSolr; 13 | import org.apache.solr.core.ConfigSolrXmlOld; 14 | import org.apache.solr.core.CoreContainer; 15 | import org.apache.solr.core.SolrCore; 16 | import org.apache.solr.core.SolrResourceLoader; 17 | import org.apache.solr.request.LocalSolrQueryRequest; 18 | import org.apache.solr.request.SolrQueryRequest; 19 | import org.apache.solr.schema.CopyField; 20 | import org.apache.solr.schema.FieldType; 21 | import org.apache.solr.schema.IndexSchema; 22 | import org.apache.solr.schema.IndexSchema.DynamicField; 23 | import org.apache.solr.schema.SchemaField; 24 | 25 | public class SchemaFromLocalCore_broken extends SchemaBase implements 
Schema {

  // Hard-coded developer paths, used only by the main() smoke test.
  static String PATH1 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example";
  static String PATH2 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr";
  static String PATH3 = "/Users/mbennett/data/dev/solr-lucene-461-src/solr/example/solr/collection1";

  // Parsed schema of the locally opened core; every getter reads from this.
  private IndexSchema schema;

  /**
   * Opens a local Solr core directly from disk and grabs its IndexSchema.
   *
   * NOTE(review): the class name marks this path as known-broken; it is
   * kept for reference only — see the TODO below before relying on it.
   *
   * @param path        solr home (or core) directory on the local filesystem
   * @param optCoreName core to open, or null for Solr's default core name
   */
  public SchemaFromLocalCore_broken( String path, String optCoreName ) {
    // TODO: currently broken, trouble finding info online, postponing for now
    SolrResourceLoader loader = new SolrResourceLoader( path );
    String confDir = loader.getConfigDir();
    String dataDir = loader.getDataDir();
    String instanceDir = loader.getInstanceDir();
    Properties props = loader.getCoreProperties();
    // Debug dump of what the resource loader resolved
    System.out.println( "path = " + path );
    System.out.println( "confDir = " + confDir );
    System.out.println( "dataDir = " + dataDir );
    System.out.println( "instanceDir = " + instanceDir );
    System.out.println( "props = " + props );
    ConfigSolr config = ConfigSolr.fromSolrHome( loader, path );
    CoreContainer container = new CoreContainer( loader, config );
    if ( container.getCores().isEmpty() ) {
      throw new IllegalArgumentException( "No cores found at " + path );
    }
    String coreName = optCoreName!=null ? optCoreName : ConfigSolrXmlOld.DEFAULT_DEFAULT_CORE_NAME;
    SolrCore core = container.getCore( coreName );
    if ( null==core ) {
      throw new IllegalArgumentException( "Unable to find core \"" + coreName + "\" at " + path );
    }
    // SolrQueryRequest req = new LocalSolrQueryRequest( core, "*:*", null, 0, 0, null );
    NamedList args = new NamedList();
    SolrQueryRequest req = new LocalSolrQueryRequest( core, args );
    schema = req.getSchema();
  };

  /** Schema "version" attribute. */
  public float getSchemaVersion() throws Exception {
    return schema.getVersion();
  }

  /** Schema "name" attribute. */
  public String getSchemaName() throws Exception {
    return schema.getSchemaName();
  }

  /** Name of the uniqueKey field. */
  public String getUniqueKeyFieldName() throws Exception {
    return schema.getUniqueKeyField().getName();
  }

  /** Fully-qualified class of the similarity implementation. */
  public String getSimilarityModelClassName() throws Exception {
    return schema.getSimilarity().getClass().getName();
  }

  // TODO: not sure where this comes from
  // NOTE(review): stubbed — always returns null until the TODO is resolved.
  public String getDefaultOperator() throws Exception {
    return null;
  }

  /** Default search field name, as reported by the IndexSchema. */
  public String getDefaultSearchField() throws Exception {
    return schema.getDefaultSearchFieldName();
  }

  // NOTE(review): stub — returns an empty map, unlike sibling Schema impls;
  // callers of generateReport() will see zero types from this class.
  public Map<String,Set<String>> getAllDeclaredAndDynamicFieldsByType() {
    Map<String,Set<String>> out = new LinkedHashMap<>();
    return out;
    //return null;
  }

  /** Explicitly declared field names. */
  public Set<String> getAllSchemaFieldNames() throws Exception {
    Map<String,SchemaField> fields = schema.getFields();
    return fields.keySet();
    // return new LinkedHashSet<>( fields.keySet() );
  }

  /** Patterns (e.g. "*_s") of all dynamicField declarations. */
  public Set<String> getAllDynamicFieldPatterns() throws Exception {
    DynamicField[] dynFields = schema.getDynamicFields();
    Set<String> out = new LinkedHashSet<>();
    for ( DynamicField df : dynFields ) {
      out.add( df.getRegex() );
    }
    return out;
  }

  /** Names of all declared fieldTypes. */
  public Set<String> getAllFieldTypeNames() throws Exception {
    Map<String,FieldType> types = schema.getFieldTypes();
    return types.keySet();
  }
getAllCopyFieldSourceNames() throws Exception { 113 | Map> copyMap = schema.getCopyFieldsMap(); 114 | return copyMap.keySet(); 115 | } 116 | 117 | public Set getAllCopyFieldDestinationNames() throws Exception { 118 | Set out = new LinkedHashSet<>(); 119 | Map> copyMap = schema.getCopyFieldsMap(); 120 | for ( Entry> copyEntry : copyMap.entrySet() ) { 121 | // String srcFieldName = copyEntry.getKey(); 122 | List copyList = copyEntry.getValue(); 123 | for ( CopyField cf : copyList ) { 124 | SchemaField destField = cf.getDestination(); 125 | out.add( destField.getName() ); 126 | } 127 | } 128 | return out; 129 | } 130 | 131 | public Set getCopyFieldDestinationsForSource(String sourceName) throws Exception { 132 | Set out = new LinkedHashSet<>(); 133 | List copyList = schema.getCopyFieldsList( sourceName ); 134 | if ( null==copyList || copyList.isEmpty() ) { 135 | return out; 136 | } 137 | for ( CopyField cf : copyList ) { 138 | SchemaField destField = cf.getDestination(); 139 | out.add( destField.getName() ); 140 | } 141 | return out; 142 | } 143 | 144 | public Set getCopyFieldSourcesForDestination(String targetDestName) throws Exception { 145 | Set out = new LinkedHashSet<>(); 146 | Map> copyMap = schema.getCopyFieldsMap(); 147 | for ( Entry> copyEntry : copyMap.entrySet() ) { 148 | String srcFieldName = copyEntry.getKey(); 149 | List copyList = copyEntry.getValue(); 150 | for ( CopyField cf : copyList ) { 151 | SchemaField destField = cf.getDestination(); 152 | String destFieldName = destField.getName(); 153 | if ( destFieldName.equals(targetDestName) ) { 154 | out.add( srcFieldName ); 155 | } 156 | } 157 | } 158 | return out; 159 | } 160 | 161 | // public String generateReport() throws Exception; 162 | 163 | public static void main( String[] argv ) throws Exception { 164 | Schema schema = new SchemaFromLocalCore_broken( PATH3, null ); 165 | schema.generateReport(); 166 | } 167 | } -------------------------------------------------------------------------------- 
package com.lucidworks.dq.schema;

import java.util.Collection;

import javax.xml.xpath.XPathExpressionException;

/** Read-only view of a solrconfig.xml, independent of how it was loaded. */
public interface SolrConfig {

  /** Human-readable multi-line summary of the config. */
  public String generateReport() throws Exception;

  // Can't return float, could be const or config
  public String getLuceneMatchVersion() throws Exception;

  // Can't return bool, could be const or config
  public String getAbortOnConfigurationError() throws Exception;

  public Collection<String> getRequestHandlers() throws Exception;

}

// --- SolrConfigBase.java ---
package com.lucidworks.dq.schema;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Collection;

public abstract class SolrConfigBase implements SolrConfig {

  /**
   * Renders a summary of this config: the singular settings first, then
   * the request-handler list, one per line.  Built entirely in memory and
   * returned as a single string.
   */
  @Override
  public String generateReport() throws Exception {
    StringWriter buffer = new StringWriter();
    PrintWriter report = new PrintWriter(buffer);

    // Singular Values
    report.println( "Lucene Match Version = " + getLuceneMatchVersion() );
    report.println( "Abort on config error = " + getAbortOnConfigurationError() );

    // Complex Values
    report.println();
    report.println( "Request Handlers and Classes:" );
    for ( String handler : getRequestHandlers() ) {
      report.println( "\t" + handler );
    }

    return buffer.toString();
  }

}
package com.lucidworks.dq.schema;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.w3c.dom.Node;

/** SolrConfig implementation that parses a solrconfig.xml DOM via XPath. */
public class SolrConfigFromXml extends SolrConfigBase implements SolrConfig {
  // get from resources folder (default config when no path/URL is supplied)
  static String CONFIG_FILE_NAME = "solrconfig-480.xml";

  // Parsed DOM; populated once by init(InputStream) and read by the getters.
  Document document;
  XPathFactory xpathFactory = XPathFactory.newInstance();
  // NOTE(review): prefix/name look unused in the visible code — possibly
  // kept for parity with Solr's Config.java; confirm before removing.
  private final String prefix = null;
  private final String name = "";

  // Note: Some of this code was copied from:
  // * Solr's IndexSchema.java
  // * Solr's Config.java


  /** Loads the bundled default config (CONFIG_FILE_NAME) off the classpath. */
  public SolrConfigFromXml() throws ParserConfigurationException, IOException, SAXException {
    // this( SCHEMA_FILE_NAME );
    //URL schemaPath = this.getClass().getResource( CONFIG_FILE_NAME );
    //init( schemaPath );
    init( (URL) null );
  }
  /** Parses the given solrconfig.xml from a file on disk. */
  public SolrConfigFromXml( File schemaPath ) throws ParserConfigurationException, SAXException, IOException {
    // URI uri = schemaPath.toURI();
    // URL url = uri.toURL();
    // init( url );
    InputStream is = new FileInputStream( schemaPath );
    init( is );
  }
  /** Parses solrconfig.xml fetched from the given URL. */
  public SolrConfigFromXml( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
    init( schemaPath );
  }
  // Null URL falls back to the bundled default config on the classpath.
  void init( URL schemaPath ) throws ParserConfigurationException, IOException, SAXException {
    if ( null==schemaPath ) {
      schemaPath = this.getClass().getClassLoader().getResource( CONFIG_FILE_NAME );
    }
    InputStream is = schemaPath.openConnection().getInputStream();
    init( is );
  }
  // Central parse: builds the DOM that every XPath getter reads.
  // NOTE(review): the input stream is not closed here; consider
  // try-with-resources if this is revisited.
  void init( InputStream in ) throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = factory.newDocumentBuilder();
    this.document = builder.parse( in );
    xpathFactory = XPathFactory.newInstance();
  }

  // Can't return float, could be const or config
  /* (non-Javadoc)
   * @see com.lucidworks.dq.schema.SolrConfig#getLuceneMatchVersion()
   */
  // Returns the element's text, or null when the element is absent.
  // CONFIG / LUCENE_VERSION / stepsToPath are declared elsewhere in this file.
  @Override
  public String getLuceneMatchVersion() throws Exception {
    XPath xpath = xpathFactory.newXPath();
    // "/config/luceneMatchVersion"
    String expression = stepsToPath(CONFIG, LUCENE_VERSION);
    // float version = getFloat(expression, 0.0f);
    Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
    String payload = null;
    if ( null!=nd ) {
      // payload = nd.getNodeValue();
      payload = nd.getTextContent();
    }
    return payload;
  }

  // Can't return bool, could be const or config
  /* (non-Javadoc)
   * @see com.lucidworks.dq.schema.SolrConfig#getAbortOnConfigurationError()
   */
  // Returns the element's text, or null when the element is absent.
  @Override
  public String getAbortOnConfigurationError() throws Exception {
    XPath xpath = xpathFactory.newXPath();
    // "/config/abortOnConfigurationError"
    String expression = stepsToPath(CONFIG, ABORT);
    Node nd = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
    String payload = null;
    if ( null!=nd ) {
      payload = nd.getTextContent();
    }
    return payload;
  }
payload; 105 | } 106 | 107 | // TODO: getLibs: 108 | // TODO: getDataDir: ${solr.data.dir:} 109 | // TODO: getDirectoryFactory: 110 | // TODO: getIndexConfig (nested!): 111 | // TODO: 112 | // TODO: 113 | // TODO: nested 114 | // TODO: Nested: 115 | // TODO: Nested: 116 | // * TODO: Request Handlers, Nested! 117 | // 118 | // 119 | // 120 | // 121 | // 122 | // 123 | // 124 | // 125 | // Parts copied from Solr's IndexSchema .loadFields 126 | /* (non-Javadoc) 127 | * @see com.lucidworks.dq.schema.SolrConfig#getRequestHandlers() 128 | */ 129 | @Override 130 | public Collection getRequestHandlers() throws XPathExpressionException { 131 | Collection out = new ArrayList<>(); 132 | XPath xpath = xpathFactory.newXPath(); 133 | // /schema/fields/field | /schema/fields/dynamicField 134 | // | /schema/field | /schema/dynamicField 135 | // Note: could remove OR and eliminate node name check, but this is closer to Solr code 136 | String expression = stepsToPath(CONFIG, HANDLER); 137 | NodeList nodes = (NodeList)xpath.evaluate(expression, document, XPathConstants.NODESET); 138 | for (int i=0; i TYPES = new HashMap() {{ 20 | // put( 1, "R / DIRECTIONALITY_RIGHT_TO_LEFT" ); 21 | // put( 2, "AL / DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC" ); 22 | // put( 11, "S / DIRECTIONALITY_SEGMENT_SEPARATOR" ); 23 | // put( 12, "WS / DIRECTIONALITY_WHITESPACE" ); 24 | put( 1, "Lu_UPPERCASE_LETTER" ); 25 | put( 2, "Ll_LOWERCASE_LETTER" ); 26 | put( 3, "Lt_TITLECASE_LETTER" ); 27 | put( 4, "Lm_MODIFIER_LETTER" ); 28 | put( 5, "Lo_OTHER_LETTER" ); 29 | put( 6, "Mn_NON_SPACING_MARK" ); 30 | put( 7, "Me_ENCLOSING_MARK" ); 31 | put( 8 , "Mc_COMBINING_SPACING_MARK" ); 32 | put( 9, "Nd_DECIMAL_DIGIT_NUMBER" ); 33 | put( 11, "No_OTHER_NUMBER" ); 34 | put( 12, "Zs_SPACE_SEPARATOR" ); 35 | put( 13, "Zl_LINE_SEPARATOR" ); 36 | put( 14, "Zp_PARAGRAPH_SEPARATOR" ); 37 | put( 15, "Cc_CONTROL" ); 38 | put( 16, "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 39 | // 17? 
40 | put( 18, "Co_PRIVATE_USE" ); 41 | put( 19, "Cs_SURROGATE" ); 42 | put( 20, "Pd_DASH_PUNCTUATION" ); 43 | put( 21, "Ps_START_PUNCTUATION" ); 44 | put( 22, "Pe_END_PUNCTUATION" ); 45 | put( 23, "Pc_CONNECTOR_PUNCTUATION" ); 46 | put( 24, "Po_OTHER_PUNCTUATION" ); 47 | put( 25, "Sm_MATH_SYMBOL" ); 48 | put( 26, "Sc_CURRENCY_SYMBOL" ); 49 | put( 27, "Sk_MODIFIER_SYMBOL" ); 50 | put( 28, "So_OTHER_SYMBOL" ); 51 | put( 29, "Pi_INITIAL_QUOTE_PUNCTUATION" ); 52 | put( 30, "Pf_FINAL_QUOTE_PUNCTUATION" ); 53 | }}; 54 | 55 | static final Map ALIASES_SHORT_TO_LONG = new HashMap() {{ 56 | // Custom 57 | put( "Qm", QUESTION_MARK_NAME ); 58 | 59 | // Script 60 | put( "Com", "COMMON" ); 61 | put( "Lat", "LATIN" ); 62 | 63 | // Block 64 | put( "Basic", "BASIC_LATIN" ); 65 | put( "L1Sup", "LATIN_1_SUPPLEMENT" ); 66 | put( "GenPunct", "GENERAL_PUNCTUATION" ); 67 | put( "LetterSym", "LETTERLIKE_SYMBOLS" ); 68 | 69 | // Types 70 | put( "UPPER", "Lu_UPPERCASE_LETTER" ); 71 | put( "lower", "Ll_LOWERCASE_LETTER" ); 72 | put( "Title", "Lt_TITLECASE_LETTER" ); 73 | put( "ModL", "Lm_MODIFIER_LETTER" ); 74 | put( "OtherL", "Lo_OTHER_LETTER" ); 75 | put( "NonSpc", "Mn_NON_SPACING_MARK" ); 76 | put( "Encl", "Me_ENCLOSING_MARK" ); 77 | put( "Combining" , "Mc_COMBINING_SPACING_MARK" ); 78 | put( "Digit", "Nd_DECIMAL_DIGIT_NUMBER" ); 79 | put( "OtherNum", "No_OTHER_NUMBER" ); 80 | put( "Space", "Zs_SPACE_SEPARATOR" ); 81 | put( "Line", "Zl_LINE_SEPARATOR" ); 82 | put( "Para", "Zp_PARAGRAPH_SEPARATOR" ); 83 | put( "Ctrl", "Cc_CONTROL" ); 84 | put( "Fmt", "Cf_FORMAT" ); // or SIZE or DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 85 | // 17? 
    put( "Priv", "Co_PRIVATE_USE" );
    put( "Sur", "Cs_SURROGATE" );
    put( "Dash", "Pd_DASH_PUNCTUATION" );
    put( "Start", "Ps_START_PUNCTUATION" );
    put( "End", "Pe_END_PUNCTUATION" );
    put( "Conn", "Pc_CONNECTOR_PUNCTUATION" );
    put( "OtherP", "Po_OTHER_PUNCTUATION" );
    put( "Math", "Sm_MATH_SYMBOL" );
    put( "Currency", "Sc_CURRENCY_SYMBOL" );
    put( "ModSym", "Sk_MODIFIER_SYMBOL" );
    put( "OtherSym", "So_OTHER_SYMBOL" );
    put( "StartQ", "Pi_INITIAL_QUOTE_PUNCTUATION" );
    put( "EndQ", "Pf_FINAL_QUOTE_PUNCTUATION" );
  }};

  // Reverse index of ALIASES_SHORT_TO_LONG, built once at class load:
  // long/canonical name -> short alias.
  static final Map<String,String> ALIASES_LONG_TO_SHORT = new HashMap<String,String>();
  static {
    for ( Entry<String,String> entry : ALIASES_SHORT_TO_LONG.entrySet() ) {
      String shortName = entry.getKey();
      String longName = entry.getValue();
      ALIASES_LONG_TO_SHORT.put( longName, shortName );
    }
  }

  // Compound Aliases: collapse common "script-block-type" triples into a
  // single friendly token for reports.
  // Note: reversed order of initialization here — long -> short is the
  // hand-maintained table, and the short -> long index is derived from it.
  static final Map<String,String> COMPOUND_ALIASES_LONG_TO_SHORT = new HashMap<String,String>() {{
    put( "Com-Basic-Space", "space" );
    put( "Lat-Basic-UPPER", "UPPER" );
    put( "Lat-Basic-lower", "lower" );
    put( "Com-Basic-Conn", "Connector" );
    put( "Com-Basic-Currency", "Currency" );
    put( "Com-Basic-Digit", "Digit" );
    put( "Com-Basic-OtherP", "OtherPunct" );
    put( "Com-L1Sup-OtherSym", "OtherSym" );
    put( "Com-Basic-Start", "Start" );
    put( "Com-Basic-End", "Stop" );
    put( "Com-Basic-Math", "Math" );
    put( "Com-Basic-Dash", "Dash1" );
    put( "Com-GenPunct-Dash", "Dash2" );
    put( "Com-LetterSym-OtherSym", "LetterSymbol" );
    put( "Com-Basic-Qm", "QuestionMark" ); // add suffix 1 when needed
  }};
  // Reverse index, built once at class load: short token -> compound long form.
  static final Map<String,String> COMPOUND_ALIASES_SHORT_TO_LONG = new HashMap<String,String>();
  static {
    for ( Entry<String,String> entry : COMPOUND_ALIASES_LONG_TO_SHORT.entrySet() ) {
      String longName = entry.getKey();
      String shortName = entry.getValue();
      COMPOUND_ALIASES_SHORT_TO_LONG.put( shortName, longName );
    }
  }
COMPOUND_ALIASES_SHORT_TO_LONG.put( shortName, longName ); 135 | } 136 | } 137 | 138 | static String generateReport() { 139 | return generateReportForRange( 0, 255 ); 140 | } 141 | static String generateReportForRange( int min, int max ) { 142 | StringWriter sw = new StringWriter(); 143 | PrintWriter out = new PrintWriter(sw); 144 | 145 | for ( int i=min; i<=max; i++ ) { 146 | addCharInfoToReport( out, i ); 147 | } 148 | 149 | String outStr = sw.toString(); 150 | return outStr; 151 | } 152 | static String generateReportForPoints( int ... codePoints ) { 153 | StringWriter sw = new StringWriter(); 154 | PrintWriter out = new PrintWriter(sw); 155 | 156 | for ( int i : codePoints ) { 157 | addCharInfoToReport( out, i ); 158 | } 159 | 160 | String outStr = sw.toString(); 161 | return outStr; 162 | } 163 | static void addCharInfoToReport( PrintWriter out, int codePoint ) { 164 | out.print( "" + codePoint ); 165 | out.print( ", " ); 166 | out.print( String.format("%X", codePoint) ); 167 | out.print( ": " ); 168 | if ( codePoint >= 32 ) { 169 | Character c = new Character( (char)codePoint ); 170 | if ( ! 
Character.isSupplementaryCodePoint( codePoint ) ) { 171 | out.print( " c='"+c+"'" ); 172 | } 173 | // Extended / Supplmental Unicode 174 | else { 175 | // also StringBuffer appendCodePoint(int cp) 176 | char[] chars = Character.toChars( codePoint ); 177 | out.print( " c='" ); 178 | for ( char cS : chars ) { 179 | out.print( cS ); 180 | } 181 | out.print( "'" ); 182 | } 183 | } 184 | boolean isDef = Character.isDefined( codePoint ); 185 | out.print( " isDef="+isDef ); 186 | boolean isValid = Character.isValidCodePoint( codePoint ); 187 | out.print( " isValid="+isValid ); 188 | boolean isCtrl = Character.isISOControl( codePoint ); 189 | out.print( " isCtrl="+isCtrl ); 190 | boolean isBmp = Character.isBmpCodePoint( codePoint ); 191 | out.print( " isBmp="+isBmp ); 192 | boolean isSupp = Character.isSupplementaryCodePoint( codePoint ); 193 | out.print( " isSupp="+isSupp ); 194 | boolean isAlpha = Character.isAlphabetic( codePoint ); 195 | out.print( " isAlpha="+isAlpha ); 196 | boolean isLetter = Character.isLetter( codePoint ); 197 | out.print( " isLetter="+isLetter ); 198 | boolean isDigit = Character.isDigit( codePoint ); 199 | out.print( " isDigit="+isDigit ); 200 | int type = Character.getType( codePoint ); 201 | String typeStr = "" + type; 202 | if ( TYPES.containsKey(type) ) { 203 | typeStr += " " + TYPES.get(type); 204 | } 205 | else { 206 | typeStr += " (no-TYPES-entry)"; 207 | } 208 | out.print( " type="+typeStr ); 209 | String block = null; 210 | String script = null; 211 | try { 212 | block = UnicodeBlock.of( codePoint ).toString(); 213 | script = UnicodeScript.of( codePoint ).toString(); 214 | } 215 | catch( Exception e ) { } 216 | out.print( " script="+script ); 217 | out.print( " block="+block ); 218 | String name = Character.getName( codePoint ); 219 | out.print( " name="+name ); 220 | out.println(); 221 | } 222 | 223 | public static String getScriptName_LongForm( int codePoint ) { 224 | String script = "Unknown_Unicode_Script"; 225 | try { 226 | script 
= UnicodeScript.of( codePoint ).toString(); 227 | } 228 | catch( Exception e ) { } 229 | return script; 230 | } 231 | public static String getScriptName_ShortForm( int codePoint ) { 232 | String longName = getScriptName_LongForm( codePoint ); 233 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) { 234 | return ALIASES_LONG_TO_SHORT.get(longName); 235 | } 236 | else { 237 | return longName; 238 | } 239 | } 240 | public static String getBlockName_LongForm( int codePoint ) { 241 | String block = "Unknown_Unicode_Block"; 242 | try { 243 | block = UnicodeBlock.of( codePoint ).toString(); 244 | } 245 | catch( Exception e ) { } 246 | return block; 247 | } 248 | public static String getBlockName_ShortForm( int codePoint ) { 249 | String longName = getBlockName_LongForm( codePoint ); 250 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) { 251 | return ALIASES_LONG_TO_SHORT.get(longName); 252 | } 253 | else { 254 | return longName; 255 | } 256 | } 257 | public static String getTypeName_LongForm( int codePoint ) { 258 | int type = Character.getType( codePoint ); 259 | String typeStr = ""; 260 | if ( codePoint == QUESTION_MARK_CODEPOINT ) { 261 | typeStr = QUESTION_MARK_NAME; 262 | } 263 | else if ( TYPES.containsKey(type) ) { 264 | typeStr = TYPES.get(type); 265 | } 266 | else { 267 | typeStr = "" + type + "_No_TYPES_Entry"; 268 | } 269 | return typeStr; 270 | } 271 | public static String getTypeName_ShortForm( int codePoint ) { 272 | String longName = getTypeName_LongForm( codePoint ); 273 | if ( ALIASES_LONG_TO_SHORT.containsKey(longName) ) { 274 | return ALIASES_LONG_TO_SHORT.get(longName); 275 | } 276 | else { 277 | return longName; 278 | } 279 | } 280 | // returns "script-block-type" 281 | public static String getCompoundClassifier_LongForm( int codePoint ) { 282 | return getScriptName_LongForm(codePoint) 283 | + "-" + getBlockName_LongForm(codePoint) 284 | + "-" + getTypeName_LongForm(codePoint) 285 | ; 286 | } 287 | public static String 
getCompoundClassifier_ShortForm( int codePoint ) { 288 | String candidate = getScriptName_ShortForm(codePoint) 289 | + "-" + getBlockName_ShortForm(codePoint) 290 | + "-" + getTypeName_ShortForm(codePoint) 291 | ; 292 | if ( COMPOUND_ALIASES_LONG_TO_SHORT.containsKey(candidate) ) { 293 | return COMPOUND_ALIASES_LONG_TO_SHORT.get( candidate ); 294 | } 295 | else { 296 | return candidate; 297 | } 298 | } 299 | 300 | public static Map classifyString_LongForm( String inStr ) { 301 | return classifyString_LongForm( inStr, null ); 302 | } 303 | public static Map classifyString_LongForm( String inStr, Map stats ) { 304 | // Automatically sorts by key-order 305 | if ( null==stats ) { 306 | // In order by key, easier for overall tabulation 307 | stats = new TreeMap<>(); 308 | } 309 | if ( null==inStr || inStr.isEmpty() ) { 310 | return stats; 311 | } 312 | // Special looping to allow for Supplementary Unicode Characters (> 65k) 313 | int length = inStr.length(); 314 | for (int offset = 0; offset < length; ) { 315 | int codePoint = inStr.codePointAt( offset ); 316 | String charKey = getCompoundClassifier_LongForm( codePoint ); 317 | // Tabulate 318 | long count = 0L; 319 | if ( stats.containsKey(charKey) ) { 320 | count = stats.get( charKey ); 321 | } 322 | count++; 323 | stats.put( charKey, count ); 324 | // Advance 325 | offset += Character.charCount( codePoint ); 326 | } 327 | return stats; 328 | } 329 | public static Map classifyString_ShortForm( String inStr ) { 330 | return classifyString_ShortForm( inStr, null ); 331 | } 332 | // TODO: code very similar to LongForm, combine 333 | public static Map classifyString_ShortForm( String inStr, Map stats ) { 334 | // Automatically sorts by key-order 335 | if ( null==stats ) { 336 | // In order by key, easier for overall tabulation 337 | stats = new TreeMap<>(); 338 | } 339 | if ( null==inStr || inStr.isEmpty() ) { 340 | return stats; 341 | } 342 | // Special looping to allow for Supplementary Unicode Characters (> 65k) 343 | 
int length = inStr.length(); 344 | for (int offset = 0; offset < length; ) { 345 | int codePoint = inStr.codePointAt( offset ); 346 | String charKey = getCompoundClassifier_ShortForm( codePoint ); 347 | // Tabulate 348 | long count = 0L; 349 | if ( stats.containsKey(charKey) ) { 350 | count = stats.get( charKey ); 351 | } 352 | count++; 353 | stats.put( charKey, count ); 354 | // Advance 355 | offset += Character.charCount( codePoint ); 356 | } 357 | return stats; 358 | } 359 | 360 | public static void main( String [] argv ) { 361 | // U+306E, dec:12398 362 | System.out.println( "Japanese \"no\": '\u306e'" ); 363 | // U+4e00 19968, U+4e8c 20108, U+4e09 19977 364 | System.out.println( "Chinese 1 2 3: '\u4e00\u4e8c\u4e09'" ); 365 | // U+1D11E, dec:119070 366 | System.out.println( "Extended: Musical G-clef: '\uD834\uDD1E'" ); 367 | // U+1F37A, dec:127866 368 | System.out.println( "Extended: Beer Mug: '\uD83C\uDF7A'" ); 369 | 370 | // String report = generateReportForRange( 0, 255 ); 371 | String report = generateReportForPoints( 12398, 19968, 20108, 19977, 119070, 127866 ); 372 | System.out.print( report ); 373 | } 374 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.util; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.lang.reflect.Method; 6 | import java.util.LinkedHashMap; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | 10 | public class CmdLineLauncher { 11 | // TODO: currently using static init but 12 | // fefactoring would require that all classes use lightweight null constructor 13 | // static final Map> CLASSES = new LinkedHashMap>() 14 | static final Map> CLASSES = new LinkedHashMap>() 15 | {{ 16 | put( "empty_fields", com.lucidworks.dq.data.EmptyFieldStats.class ); 17 | put( 
"term_stats", com.lucidworks.dq.data.TermStats.class ); 18 | put( "code_points", com.lucidworks.dq.data.TermCodepointStats.class ); 19 | put( "date_checker", com.lucidworks.dq.data.DateChecker.class ); 20 | put( "diff_empty_fields", com.lucidworks.dq.diff.DiffEmptyFieldStats.class ); 21 | put( "diff_ids", com.lucidworks.dq.diff.DiffIds.class ); 22 | put( "diff_schema", com.lucidworks.dq.diff.DiffSchema.class ); 23 | put( "diff_config", com.lucidworks.dq.diff.DiffSolrConfig.class ); 24 | put( "doc_count", com.lucidworks.dq.data.DocCount.class ); 25 | put( "dump_ids", com.lucidworks.dq.data.DumpIds.class ); 26 | put( "delete_by_ids", com.lucidworks.dq.data.DeleteByIds.class ); 27 | put( "solr_to_solr", com.lucidworks.dq.data.SolrToSolr.class ); 28 | put( "solr_to_csv", com.lucidworks.dq.data.SolrToCsv.class ); 29 | put( "hash_and_shard", com.lucidworks.dq.util.HashAndShard.class ); 30 | }}; 31 | public static void main( String[] argv ) { 32 | if( argv.length < 1 ) { 33 | System.out.println( "Pass a command name on the command line to see help for that class:" ); 34 | // for( Entry> entry : CLASSES.entrySet() ) 35 | for( Entry> entry : CLASSES.entrySet() ) 36 | { 37 | String cmdName = entry.getKey(); 38 | // Class clazz = entry.getValue(); 39 | Class clazz = entry.getValue(); 40 | 41 | String desc = null; 42 | try { 43 | Method descMeth = clazz.getMethod( "getShortDescription" ); 44 | desc = (String) descMeth.invoke( null, (Object[]) null ); 45 | // Field f = clazz.getDeclaredField( "HELP_WHAT_IS_IT" ); 46 | // desc = (String) f.get(null); 47 | } catch (SecurityException | IllegalArgumentException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) { 48 | // TODO Auto-generated catch block 49 | e.printStackTrace(); 50 | } 51 | 52 | // System.out.println( cmdName + ": " + desc ); 53 | System.out.printf( "%20s: %s\n", cmdName, desc ); 54 | } 55 | } 56 | // Has a command name 57 | else { 58 | String cmdName = argv[ 0 ]; 59 | if ( 
CLASSES.containsKey(cmdName) ) { 60 | // Copy over all the first arg 61 | String [] argv2 = new String[ argv.length - 1 ]; 62 | for ( int i=1; i clazz = CLASSES.get(cmdName); 66 | try { 67 | Method main = clazz.getMethod( "main", String[].class ); 68 | // main.invoke( null, argv2 ); 69 | // main.invoke( null, (Object[]) argv2 ); 70 | main.invoke( null, (Object) argv2 ); 71 | } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) { 72 | // TODO Auto-generated catch block 73 | e.printStackTrace(); 74 | System.exit(2); 75 | } 76 | } 77 | else { 78 | System.err.println( "Command \"" + cmdName + "\" not found in " + CLASSES.keySet() ); 79 | System.exit(2); 80 | } 81 | } 82 | } 83 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/util/DateUtils.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.util; 2 | 3 | import java.text.DateFormat; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.ArrayList; 7 | import java.util.Collection; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.TimeZone; 11 | 12 | public class DateUtils { 13 | 14 | public static final String JAVA_FORMAT = "EEE MMM dd HH:mm:ss z yyyy"; 15 | public static final String ZULU_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'"; 16 | // public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.S"; 17 | public static final String COMPACT_LOG_FORMAT = "yyyy-MM-dd_HH:mm:ss.SSS"; 18 | 19 | public static String getLocalTimestamp( Date inDate ) { 20 | DateFormat compactFormatter = new SimpleDateFormat( COMPACT_LOG_FORMAT ); 21 | // NOT setting timezone 22 | return compactFormatter.format( inDate ); 23 | } 24 | public static String getLocalTimestamp() { 25 | return getLocalTimestamp( new Date() ); 26 | } 27 | public static String 
getLocalTimestamp( long ms ) { 28 | return getLocalTimestamp( new Date(ms) ); 29 | } 30 | public static String javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException { 31 | java.util.Date dateObj = javaDefault2Date_str2date( inDate ); 32 | String outDateStr = date2SolrXmlZulu_date2str( dateObj ); 33 | return outDateStr; 34 | } 35 | public static String solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException { 36 | java.util.Date dateObj = solrXmlZulu2Date_str2date( inDate ); 37 | String outDateStr = date2JavaDefault_date2str( dateObj ); 38 | return outDateStr; 39 | } 40 | public static String _javaDefault2SolrXmlZulu_str2str( String inDate ) throws ParseException { 41 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT ); 42 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT ); 43 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") ); 44 | java.util.Date tmpDate = javaFormatter.parse( inDate ); 45 | String outDate = zuluFormatter.format( tmpDate ); 46 | return outDate; 47 | } 48 | public static String _solrXmlZulu2JavaDefault_str2str( String inDate ) throws ParseException { 49 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT ); 50 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") ); 51 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT ); 52 | java.util.Date tmpDate = zuluFormatter.parse( inDate ); 53 | String outDate = javaFormatter.format( tmpDate ); 54 | return outDate; 55 | } 56 | 57 | public static String date2SolrXmlZulu_date2str( java.util.Date inDate ) throws ParseException { 58 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT ); 59 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") ); 60 | String outDate = zuluFormatter.format( inDate ); 61 | return outDate; 62 | } 63 | public static String date2JavaDefault_date2str( java.util.Date inDate ) throws ParseException { 64 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT ); 65 | String 
outDate = javaFormatter.format( inDate ); 66 | return outDate; 67 | } 68 | 69 | public static java.util.Date javaDefault2Date_str2date( String inDate ) throws ParseException { 70 | DateFormat javaFormatter = new SimpleDateFormat( JAVA_FORMAT ); 71 | java.util.Date outDate = javaFormatter.parse( inDate ); 72 | return outDate; 73 | } 74 | public static java.util.Date solrXmlZulu2Date_str2date( String inDate ) throws ParseException { 75 | DateFormat zuluFormatter = new SimpleDateFormat( ZULU_FORMAT ); 76 | zuluFormatter.setTimeZone( TimeZone.getTimeZone("GMT") ); 77 | java.util.Date outDate = zuluFormatter.parse( inDate ); 78 | return outDate; 79 | } 80 | 81 | public static List dates2Doubles( Collection dates ) { 82 | List out = new ArrayList<>(); 83 | for ( Date d : dates ) { 84 | out.add( new Double( d.getTime() ) ); 85 | } 86 | return out; 87 | } 88 | public static Double date2Double( Date d ) { 89 | return new Double( d.getTime() ).doubleValue(); 90 | } 91 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/util/HasDescription.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.util; 2 | 3 | // TODO: future... 
/**
 * Implemented by command-line tool classes that can describe themselves
 * in a single line (used by util.CmdLineLauncher's help listing).
 *
 * TODO: future... refactor so tools expose this via lightweight
 * null constructors instead of static reflection.
 */
public interface HasDescription {
  String getShortDescription();
}
44 | // With default bits==16, one would need to create more than 4000 shards before this 45 | // becomes false by default. 46 | int mask = 0x0000ffff; 47 | boolean round = rangeStep >= (1 << bits) * 16; 48 | 49 | while (end < max) { 50 | targetEnd = targetStart + rangeStep; 51 | end = targetEnd; 52 | 53 | if (round && ((end & mask) != mask)) { 54 | // round up or down? 55 | int increment = 1 << bits; // 0x00010000 56 | long roundDown = (end | mask) - increment; 57 | long roundUp = (end | mask) + increment; 58 | if (end - roundDown < roundUp - end && roundDown > start) { 59 | end = roundDown; 60 | } else { 61 | end = roundUp; 62 | } 63 | } 64 | 65 | // make last range always end exactly on MAX_VALUE 66 | if (ranges.size() == partitions - 1) { 67 | end = max; 68 | } 69 | ranges.add(new Range((int) start, (int) end)); 70 | start = end + 1L; 71 | targetStart = targetEnd + 1L; 72 | } 73 | 74 | return ranges; 75 | } 76 | 77 | static void printRanges( List ranges, Integer hash ) { 78 | int shardCounter = 0; 79 | for ( Range r : ranges ) { 80 | shardCounter++; 81 | System.out.println( "Shard # " + shardCounter ); 82 | System.out.println( "\tRange: " 83 | + String.format("0x%8s", Integer.toHexString(r.min)).replace(' ', '0') 84 | + " to " 85 | + String.format("0x%8s", Integer.toHexString(r.max)).replace(' ', '0') 86 | ); 87 | if ( null!=hash ) { 88 | if ( hash >= r.min && hash <= r.max ) { 89 | System.out.println( "\tcontains " 90 | + String.format("0x%8s", Integer.toHexString(hash)).replace(' ', '0') 91 | ); 92 | } 93 | } 94 | } 95 | } 96 | static int findShardForHash( List ranges, Integer hash ) { 97 | int shardCounter = 0; 98 | for ( Range r : ranges ) { 99 | shardCounter++; 100 | if ( hash >= r.min && hash <= r.max ) { 101 | return shardCounter; 102 | } 103 | } 104 | return -1; 105 | } 106 | 107 | public static void main(String[] args) { 108 | if ( args.length < 1 || args.length > 3 ) { 109 | System.err.println( "Error: syntax: " + HELP_USAGE ); 110 | System.exit(1); 
111 | } 112 | String docId = args[0]; 113 | if ( docId.length() < 1 ) { 114 | System.err.println( "Error: empty docId" ); 115 | System.exit(2); 116 | } 117 | String numShardsStr = args.length >= 2 ? args[1] : null; 118 | String quietStr = args.length >= 3 ? args[2] : null; 119 | boolean quiet = null!=quietStr && quietStr.equalsIgnoreCase("-q"); 120 | 121 | int signedHash = Hash.murmurhash3_x86_32( docId, 0, docId.length(), 0 ); 122 | long unsignedHash = signedHash & 0x00000000ffffffffL; 123 | if ( ! quiet ) { 124 | System.out.println( "docId: \"" + docId + '"' ); 125 | System.out.println( "32-bit Hash (signed decimal int): " + signedHash ); 126 | System.out.println( "32-bit Hash (unsigned dec int): " + unsignedHash ); 127 | System.out.println( "32-bit Hash (hex): " + String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') ); 128 | System.out.println( "32-bit Hash (binary): " + String.format("%32s", Integer.toBinaryString(signedHash)).replace(' ', '0') ); 129 | } 130 | else { 131 | System.out.print( docId + " " ); 132 | System.out.print( String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') ); 133 | } 134 | 135 | if ( null != numShardsStr ) { 136 | Integer numShards = null; 137 | try { 138 | numShards = Integer.decode( numShardsStr ); 139 | } 140 | catch( NumberFormatException e ) { 141 | System.err.println( "Error parsing numberOfShards: " + e ); 142 | System.exit(3); 143 | } 144 | if ( numShards <= 0 ) { 145 | System.err.println( "Error: numberOfShards must be > 0; got " + numShards ); 146 | System.exit(4); 147 | } 148 | // WRONG! 149 | // long shardNumber = (unsignedHash % numShards) + 1; 150 | // System.out.println( "Route to Shard (base-ONE): " + shardNumber ); 151 | 152 | List ranges = partitionRange( numShards ); 153 | 154 | if ( ! 
quiet ) { 155 | System.out.println( "Number of Shards: " + numShards ); 156 | 157 | printRanges( ranges, signedHash ); 158 | } 159 | else { 160 | int targetShard = findShardForHash( ranges, signedHash ); 161 | System.out.print( " " + targetShard ); 162 | } 163 | } 164 | if ( quiet ) { 165 | System.out.println(); 166 | } 167 | 168 | } 169 | 170 | } 171 | -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/util/IO_Utils.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.util; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.net.URI; 6 | import java.net.URISyntaxException; 7 | import java.nio.file.CopyOption; 8 | import java.nio.file.FileSystem; 9 | import java.nio.file.FileSystems; 10 | import java.nio.file.FileVisitResult; 11 | import java.nio.file.Files; 12 | import java.nio.file.Path; 13 | import java.nio.file.Paths; 14 | import java.nio.file.SimpleFileVisitor; 15 | import java.nio.file.StandardCopyOption; 16 | import java.nio.file.attribute.BasicFileAttributes; 17 | import java.util.Collections; 18 | 19 | 20 | public class IO_Utils { 21 | 22 | public static File materializeSolrHomeIntoTemp() throws IOException, URISyntaxException { 23 | String prefix = "solr_dq_utils_"; 24 | String topName = "solr_home"; 25 | //String magicName = "configsets"; 26 | Path baseTempDir = Files.createTempDirectory( prefix ); 27 | // File destinationDir = new File( baseTempDir.toFile(), magicName ); 28 | File destinationDir = new File( baseTempDir.toFile(), topName ); 29 | if ( ! 
public class IO_Utils {

  /**
   * Copy the bundled "solr_home" resource tree into a fresh temp directory
   * and return the created solr_home directory.
   *
   * @throws IOException if the destination directory cannot be created or copied into
   * @throws URISyntaxException if our own classpath resource URL is malformed
   */
  public static File materializeSolrHomeIntoTemp() throws IOException, URISyntaxException {
    String prefix = "solr_dq_utils_";
    String topName = "solr_home";
    Path baseTempDir = Files.createTempDirectory( prefix );
    File destinationDir = new File( baseTempDir.toFile(), topName );
    if ( ! destinationDir.mkdirs() ) {
      throw new IOException( "Unable to create path \"" + destinationDir + "\"" );
    }
    IO_Utils iou = new IO_Utils();
    // "/" alone would drag in every class from every combined jar,
    // so copy only the /solr_home subtree
    String sourcePathWithinJar = "/" + topName;
    iou.copyFromJar( sourcePathWithinJar, Paths.get( destinationDir.toString() ) );
    return destinationDir;
  }

  // Parts taken from:
  // * http://stackoverflow.com/a/24316335/295802
  // * http://codingjunkie.net/java-7-copy-move/
  // Usage: copyFromJar("/path/to/the/template/in/jar", Paths.get("/tmp/from-jar"))
  /**
   * Recursively copy a classpath resource tree to the filesystem, working
   * both when running from a packaged jar and from an exploded classes dir.
   */
  public void copyFromJar( String source, final Path target ) throws URISyntaxException, IOException {
    // Where are our own classes loaded from?
    // Interactive:  "file:/.../target/classes/"  (plus this package path)
    // Packaged jar: "jar:file:/.../foo.jar!/com/lucidworks/dq/util/"
    URI resource = getClass().getResource("").toURI();

    // jar:file: - running from packaged jar; walk the zip filesystem
    if ( resource.toString().startsWith( "jar:file:" ) ) {
      // try-with-resources: the original never closed this FileSystem (leak)
      try ( FileSystem fileSystem = FileSystems.newFileSystem(
              resource,
              Collections.<String, Object>emptyMap() ) ) {
        final Path jarPath = fileSystem.getPath( source );
        copyTree( jarPath, target );
      }
    }
    // file: - running from Eclipse or other non-packaged runner
    else if ( resource.toString().startsWith( "file:" ) ) {
      // Our resource must be resolved from the classpath ROOT ("/"),
      // not this specific package directory
      URI resource2 = getClass().getResource("/").toURI();
      File base = new File( resource2.getPath() );
      File srcDir = new File( base, source );
      copyTree( srcDir.toPath(), target );
    }
    else {
      throw new IllegalArgumentException( "Don't know how to handle " + resource );
    }
  }

  // Recursive tree copy.  Was duplicated inline for the jar: and file:
  // branches (both carried "looks similar ... maybe combine" TODOs).
  // relativize(...).toString() bridges paths from different FileSystem
  // providers (zip fs source -> default fs target).
  private static void copyTree( final Path fromPath, final Path toPath ) throws IOException {
    Files.walkFileTree( fromPath, new SimpleFileVisitor<Path>() {

      @Override
      public FileVisitResult preVisitDirectory( Path dir, BasicFileAttributes attrs ) throws IOException {
        Path targetPath = toPath.resolve( fromPath.relativize( dir ).toString() );
        // createDirectories: no-op when the directory already exists
        Files.createDirectories( targetPath );
        return FileVisitResult.CONTINUE;
      }

      @Override
      public FileVisitResult visitFile( Path file, BasicFileAttributes attrs ) throws IOException {
        Files.copy( file, toPath.resolve( fromPath.relativize( file ).toString() ), StandardCopyOption.REPLACE_EXISTING );
        return FileVisitResult.CONTINUE;
      }

    } );
  }

  /** Manual smoke test: materialize solr_home and print where it landed. */
  public static void main(String[] args) throws URISyntaxException, IOException {
    File configSetsDir = materializeSolrHomeIntoTemp();
    System.out.println( "ConfigSets = " + configSetsDir );
  }
}
21 | // long minWordsThreshold = 0L; 22 | 23 | // Column Totals 24 | double sumA = 0.0; 25 | double sumB = 0.0; 26 | // K Total 27 | double grandTotal; 28 | // Row Totals 29 | Map rowTotals = new LinkedHashMap<>(); 30 | 31 | // Set allWordsAboveThreshold = new TreeSet<>(); 32 | Set allWords = new TreeSet<>(); 33 | 34 | Map scoresByWord = new TreeMap<>(); 35 | Map sortedScoresByWord = new TreeMap<>(); 36 | 37 | // Peformance Stat 38 | long plogp_counter = 0L; 39 | 40 | public LLR( Map wordsA, Map wordsB /*, Long optThreshold*/ ) { 41 | this.wordsA = wordsA; 42 | this.wordsB = wordsB; 43 | //if ( null!=optThreshold && optThreshold.longValue() > 0L ) { 44 | // this.minWordsThreshold = optThreshold.longValue(); 45 | //} 46 | doInitialCalculations(); 47 | calcAllWords(); 48 | sortWords(); 49 | } 50 | 51 | public void doInitialCalculations() { 52 | 53 | // Column Totals 54 | // ------------- 55 | // sumA = sumWithThreshold( wordsA.values() ); 56 | // sumB = sumWithThreshold( wordsB.values() ); 57 | sumA = new Double( StatsUtils.sumList_Longs(wordsA.values()) ).doubleValue(); 58 | sumB = new Double( StatsUtils.sumList_Longs(wordsB.values()) ).doubleValue(); 59 | if ( sumA<=0.0 || sumB<=0.0 ) { 60 | throw new IllegalArgumentException( "Must have non-zero word counts: A=" + sumA + ", B=" + sumB ); 61 | } 62 | 63 | // K Total 64 | grandTotal = sumA + sumB; 65 | 66 | // Row Totals 67 | // ---------- 68 | allWords.addAll( wordsA.keySet() ); 69 | allWords.addAll( wordsB.keySet() ); 70 | for ( String word : allWords ) { 71 | Long countA = wordsA.containsKey(word) ? wordsA.get(word) : 0L; 72 | Long countB = wordsB.containsKey(word) ? 
wordsB.get(word) : 0L; 73 | rowTotals.put( word, new Double(countA + countB) ); 74 | } 75 | 76 | } 77 | 78 | public void calcAllWords() { 79 | for ( String word : allWords ) { 80 | // double g2 = calcG2_viaDunning( word ); 81 | double g2 = calcG2_viaTraditional( word ); 82 | scoresByWord.put( word, g2 ); 83 | } 84 | } 85 | 86 | 87 | // TODO: G2 is the same as -2 log lambda ? 88 | // http://scg.unibe.ch/archive/papers/Kuhn09aLogLikelihoodRatio.pdf 89 | // Before Sign: 90 | // food: 0.0 91 | // bananas: 0.46192170199964266 92 | // apples: 0.6291706616789554 93 | // carrots: 60.03320678316349 94 | // candy: 60.03320678316351 95 | // After Sign: 96 | // candy: -60.03320678316351 97 | // bananas: -0.46192170199964266 98 | // food: 0.0 99 | // apples: 0.6291706616789554 100 | // carrots: 60.03320678316349 101 | double calcG2_viaTraditional( String word ) { 102 | boolean debug = false; 103 | if(debug) System.out.println( "\n=== Calculating G2 via Traditional formula for \"" + word + "\" ===" ); 104 | // Simple terms 105 | double k1 = wordsA.containsKey(word) ? wordsA.get(word) : 0L; 106 | double k2 = wordsB.containsKey(word) ? 
wordsB.get(word) : 0L; 107 | double n1 = sumA; 108 | double n2 = sumB; 109 | double p1 = k1 / n1; 110 | double p2 = k2 / n2; 111 | if(debug) System.out.println( "Corpus A: k1, n1, p1: " + k1 + ", " + n1 + ", " + p1 ); 112 | if(debug) System.out.println( "Corpus B: k2, n2, p2: " + k2 + ", " + n2 + ", " + p2 ); 113 | double p = (k1 + k2) / (n1 + n2); // rowCount / grandTotal 114 | if(debug) System.out.println( "Combined: k1+2, n1+2, p1+2: " + (k1+k2) + ", " + (n1+n2) + ", " + p ); 115 | // Factors 116 | double factorA = Math.log( L(p1,k1,n1) ); 117 | double factorB = Math.log( L(p2,k2,n2) ); 118 | double factorC = Math.log( L(p,k1,n1) ); 119 | double factorD = Math.log( L(p,k2,n2) ); 120 | double sign = sign( p1, p2 ); 121 | // Result 122 | double out = sign * 2.0 * ( factorA + factorB - factorC - factorD ); 123 | if(debug) System.out.println( "out = +/-sign * 2.0 * ( factorA + factorB - factorC - factorD )" ); 124 | if(debug) System.out.println( "Sign and Factors A, B, C, D: " + sign + ", " + factorA + ", " + factorB + ", " + factorC + ", " + factorD ); 125 | if(debug) System.out.println( "out = " + out ); 126 | return out; 127 | } 128 | 129 | // TODO: this is Binomial Likelihood ? 
  // TODO: this is Binomial Likelihood ?
  // k = word count
  // n = total words in corpus (non-unique)
  // p = k/n, BUT might use different k and n
  // Binomial likelihood p^k * (1-p)^(n-k).
  // NOTE(review): underflows to 0.0 for large k/n, making log(L) -Infinity.
  static double L( double p, double k, double n ) {
    double part1 = Math.pow( p, k );
    double part2 = Math.pow( (1.0-p), (n-k) );
    return part1 * part2;
  }

  // TODO: confirm meaning of +/-
  // plus = heavier in first collection
  // minus = heavier in second collection
  // Returns +1.0 when p1 >= p2, else -1.0 (ties count as positive).
  static double sign( double p1, double p2 ) {
    if ( p1 - p2 >= 0.0 ) {
      return 1.0;
    }
    else {
      return -1.0;
    }
  }

  // Each word is done individually, across both collections.
  // G2 via Dunning's entropy formulation over the 2x2 contingency table
  // (word vs not-word, corpus A vs corpus B).  Always non-negative;
  // unlike calcG2_viaTraditional there is no +/- direction sign.
  // Sample values from the test corpora:
  // food: 1.7319479184152442E-13
  // bananas: 0.4619217019995059
  // apples: 0.6291706616789394
  // candy: 60.03320678316341
  // carrots: 60.03320678316341
  double calcG2_viaDunning( String word ) {
    boolean debug = false;
    if(debug) System.out.println( "\n=== Calculating G2 via Dunning Entropy formula for \"" + word + "\" ===" );
    // Calc H_rowSums: entropy of the row margins (this word vs all others)
    // ---------------
    double row1Total = rowTotals.get(word);
    double row2Total = grandTotal - row1Total;
    if(debug) System.out.println( "Row Totals: " + row1Total + " " + row2Total );
    // plnp = probability * log (probability), log = natural log
    double plogpRow1 = 0.0;
    if ( row1Total > 0.0 ) {
      double prob = row1Total / grandTotal;
      plogpRow1 = prob * Math.log(prob);
      plogp_counter++;
    }
    double plogpRow2 = 0.0;
    if ( row2Total > 0.0 ) {
      double prob = row2Total / grandTotal;
      plogpRow2 = prob * Math.log(prob);
      plogp_counter++;
    }
    double H_rowSums = -1.0 * ( plogpRow1 + plogpRow2 );
    if(debug) System.out.println( "Row plogp 1 & 2 and H_rowSums: " + plogpRow1 + " " + plogpRow2 + " " + H_rowSums );

    // Calc H_colSums: entropy of the column margins (corpus A vs corpus B)
    // --------------
    // We checked column sums earlier (doInitialCalculations throws on zero)
    double probCol1 = sumA / grandTotal;
    double plogpCol1 = probCol1 * Math.log( probCol1 );
    plogp_counter++;
    double probCol2 = sumB / grandTotal;
    double plogpCol2 = probCol2 * Math.log( probCol2 );
    plogp_counter++;
    double H_colSums = -1.0 * ( plogpCol1 + plogpCol2 );
    if(debug) System.out.println( "Column plogp 1 & 2 and H_colSums: " + plogpCol1 + " " + plogpCol2 + " " + H_colSums );

    // Calc H_k: entropy of the four contingency-table cells
    // -----------
    // column 1 counts
    double k_11 = wordsA.containsKey(word) ? wordsA.get(word) : 0L;
    double k_21 = sumA - k_11; // all other counts
    // column 2 counts
    double k_12 = wordsB.containsKey(word) ? wordsB.get(word) : 0L;
    double k_22 = sumB - k_12; // all other counts
    if(debug) System.out.println( "K counts:\n\t" + k_11 + " " + k_12 + "\n\t" + k_21 + " " + k_22 );
    // probabilities
    double prob_11 = k_11 / grandTotal;
    double prob_21 = k_21 / grandTotal;
    double prob_12 = k_12 / grandTotal;
    double prob_22 = k_22 / grandTotal;
    // p log( p )
    // method has its own counter
    double plogp_11 = plogp( prob_11 );
    double plogp_21 = plogp( prob_21 );
    double plogp_12 = plogp( prob_12 );
    double plogp_22 = plogp( prob_22 );
    // finally H_k
    double H_k = -1.0 * ( plogp_11 + plogp_21 + plogp_12 + plogp_22 );
    if(debug) System.out.println( "K plogp:\n\t" + plogp_11 + " " + plogp_12 + "\n\t" + plogp_21 + " " + plogp_22 );
    if(debug) System.out.println( "H_k = " + H_k );

    // Dunning's formula
    // http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
    // double G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums );
    // if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_k - H_rowSums - H_colSums )" );
    // if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_k + " - " + H_rowSums + " - " + H_colSums + " )" );

    // Revised, see http://math.stackexchange.com/questions/693114/wrong-result-from-llr-using-dunning-entropy-method
    double G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k );
    if(debug) System.out.println( "G2 = 2.0 * grandTotal * ( H_rowSums + H_colSums - H_k )" );
    if(debug) System.out.println( "2 * " + grandTotal + " * ( " + H_rowSums + " + " + H_colSums + " - " + H_k + " )" );

    return G2;
  }

  // Calculates p * log( p )
  // natural log
  // but returns 0.0 if p is 0 (the 0*log(0)==0 entropy convention)
  // Side effect: increments plogp_counter for each real evaluation.
  // TODO: maybe some implementations just add 1 to all counts?
  double plogp( double prob ) {
    if ( prob > 0.0 ) {
      plogp_counter++;
      return prob * Math.log( prob );
    }
    else {
      return 0.0;
    }
  }

  // Populate sortedScoresByWord: same entries as scoresByWord but ordered
  // by score value (SetUtils.sortMapByValues) instead of by word.
  void sortWords() {
    // Map scoresByWord = new TreeMap<>();
    // Map sortedScoresByWord = new TreeMap<>();
    sortedScoresByWord = SetUtils.sortMapByValues( scoresByWord );
  }
countMap.get(word) : 0L; 278 | // double prob = (double) count / grandTotal; 279 | // return prob; 280 | // } 281 | 282 | // double sumWithThreshold( Collection counts ) { 283 | // double out = 0.0; 284 | // for ( Long c : counts ) { 285 | // if ( c >= minWordsThreshold ) { 286 | // out += c; 287 | // } 288 | // } 289 | // return out; 290 | // } 291 | 292 | public String generateReport( String optLabel ) { 293 | StringWriter sw = new StringWriter(); 294 | PrintWriter out = new PrintWriter(sw); 295 | 296 | int sampleSize = 5; 297 | 298 | if ( null!=optLabel ) { 299 | out.println( "----------- " + optLabel + " -----------" ); 300 | } 301 | 302 | out.println(); 303 | out.println( "Corpus A unique / total words: " + wordsA.size() + " / " + sumA ); 304 | out.println( "Corpus B unique / total words: " + wordsB.size() + " / " + sumB ); 305 | out.println( "Combined unique / total words: " + allWords.size() + " / " + grandTotal ); 306 | out.println( "Number of p log(p) calculations: " + plogp_counter ); 307 | out.println(); 308 | 309 | if ( sortedScoresByWord.size() <= 2 * sampleSize + 1 ) { 310 | addTermsSliceToReport( out, "All Term Changes", sortedScoresByWord ); 311 | } 312 | else { 313 | Map firstTerms = SetUtils.mapHead( sortedScoresByWord, sampleSize ); 314 | addTermsSliceToReport( out, "Term Changes, first " + sampleSize + " entries", firstTerms ); 315 | Map lastTerms = SetUtils.mapTail( sortedScoresByWord, sampleSize ); 316 | addTermsSliceToReport( out, "Term Changes, last " + sampleSize + " entries", lastTerms ); 317 | } 318 | 319 | String outStr = sw.toString(); 320 | return outStr; 321 | } 322 | void addTermsSliceToReport( PrintWriter out, String label, Map terms ) { 323 | out.println( "" + label + ":" ); 324 | for ( Entry wordEntry : terms.entrySet() ) { 325 | String word = wordEntry.getKey(); 326 | double g2 = wordEntry.getValue(); 327 | out.println( "\t" + word + ": " + g2 ); 328 | } 329 | } 330 | 331 | public static void main( String[] argv ) throws 
SolrServerException { 332 | // Map corpusA = new LinkedHashMap() {{ 333 | // // 100k docs total 334 | // put( "blog", 25L ); // test word 335 | // put( "computer", 3200L ); // other words 336 | // put( "internet", 96775L ); // other words 337 | // }}; 338 | // Map corpusB = new LinkedHashMap() {{ 339 | // // 200k docs total 340 | // put( "blog", 2500L ); // test word 341 | // put( "computer", 6000L ); // other words 342 | // put( "internet", 191500L ); // other words 343 | // }}; 344 | 345 | // // Example posted online 346 | // Map corpusA = new LinkedHashMap() {{ 347 | // // 100k docs total 348 | // put( "spam", 40000L ); // test word 349 | // put( "other words", 60000L ); // other words 350 | // }}; 351 | // Map corpusB = new LinkedHashMap() {{ 352 | // // 200k docs total 353 | // put( "spam", 120000L ); // test word 354 | // put( "other words", 80000L ); // other words 355 | // }}; 356 | 357 | // Map corpusA = new LinkedHashMap() {{ 358 | // put( "apples", 25L ); 359 | // put( "bananas", 30L ); 360 | // put( "carrots", 40L ); 361 | // put( "food", 100L ); 362 | // }}; 363 | // Map corpusB = new LinkedHashMap() {{ 364 | // put( "apples", 20L ); // down by 5 365 | // put( "bananas", 35L ); // up by 5 366 | // put( "candy", 40L ); // carrots -> candy! 
367 | // put( "food", 100L ); // unchanged, and total unchanged 368 | // }}; 369 | 370 | 371 | HttpSolrServer solrA = SolrUtils.getServer( "localhost", 8984 ); 372 | HttpSolrServer solrB = SolrUtils.getServer( "localhost", 8985 ); 373 | String fieldName = "text"; 374 | // Set corpusA = SolrUtils.getTermsForField_ViaTermsRequest( solrA, fieldName ); 375 | // Set corpusB = SolrUtils.getTermsForField_ViaTermsRequest( solrB, fieldName ); 376 | Map corpusA = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrA, fieldName ); 377 | Map corpusB = SolrUtils.getAllTermsAndCountsForField_ViaTermsRequest( solrB, fieldName ); 378 | 379 | LLR llr = new LLR( corpusA, corpusB ); 380 | String report = llr.generateReport( "A -> B" ); 381 | System.out.print( report ); 382 | 383 | } 384 | } -------------------------------------------------------------------------------- /src/main/java/com/lucidworks/dq/util/SetUtils.java: -------------------------------------------------------------------------------- 1 | package com.lucidworks.dq.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collection; 5 | import java.util.Date; 6 | import java.util.Iterator; 7 | import java.util.LinkedHashMap; 8 | import java.util.LinkedHashSet; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | import java.util.TreeSet; 15 | 16 | public class SetUtils { 17 | 18 | public static void incrementMapCounter( Map tabulationMap, String key ) { 19 | Long value = 0L; 20 | if ( tabulationMap.containsKey(key) ) { 21 | value = tabulationMap.get(key); 22 | } 23 | value += 1L; 24 | tabulationMap.put( key, value ); 25 | } 26 | 27 | /*** 28 | public static void incrementMapCounter( Map tabulationMap, Object key ) { 29 | incrementMapCounter( tabulationMap, key, 1 ); 30 | } 31 | public static void incrementMapCounter( Map tabulationMap, Object key, Number increment ) { 32 | Number value = 0; 33 | if ( 
tabulationMap.containsKey(key) ) { 34 | value = tabulationMap.get(key); 35 | } 36 | // value += increment; 37 | // value = value + increment; 38 | value = value.doubleValue() + increment.doubleValue(); 39 | tabulationMap.put( key, value ); 40 | } 41 | ***/ 42 | 43 | /*** 44 | // posted to http://stackoverflow.com/questions/26551403/method-call-doesnt-match-method-signature-even-though-method-is-using-more-gene 45 | public static void incrementMapCounter( Map tabulationMap, Object key ) { 46 | Number value = 0; 47 | if ( tabulationMap.containsKey(key) ) { 48 | value = tabulationMap.get(key); 49 | } 50 | value = value.doubleValue() + new Double(1); 51 | tabulationMap.put( key, value ); 52 | } 53 | ***/ 54 | /*** 55 | // public static void incrementMapCounter( Map tabulationMap, Object key ) 56 | public static void incrementMapCounter( Map tabulationMap, K key ) 57 | // public static void incrementMapCounter( Map tabulationMap, key ) 58 | { 59 | Number value = 0; 60 | if ( tabulationMap.containsKey(key) ) { 61 | value = tabulationMap.get(key); 62 | } 63 | value = value.doubleValue() + new Double(1); 64 | tabulationMap.put( key, value ); 65 | } 66 | ***/ 67 | 68 | /** 69 | * @deprecated use {@link StringUtils#join(Collection)} instead. 70 | */ 71 | @Deprecated 72 | public static String join( Collection strings ) { 73 | return StringUtils.join( strings ); 74 | } 75 | /** 76 | * @deprecated use {@link StringUtils#join(Collection, String)} instead. 77 | */ 78 | @Deprecated 79 | public static String join( Collection strings, String delimiter ) { 80 | return StringUtils.join( strings, delimiter ); 81 | } 82 | /** 83 | * @deprecated use {@link StringUtils#splitCsv(String)} instead. 
84 | */ 85 | @Deprecated 86 | public static Set splitCsv( String inStr ) { 87 | return StringUtils.splitCsv( inStr ); 88 | } 89 | 90 | // Assumes always using LinkedHashMap which keep things in predictable insertion order 91 | public static Map reverseMapEntryKeyOrder( Map inEntries ) { 92 | List keys = new ArrayList<>( inEntries.keySet() ); 93 | List values = new ArrayList<>( inEntries.values() ); 94 | if ( keys.size() != values.size() ) { 95 | throw new IllegalStateException( "Number of of keys (" + keys.size() + ") != number of values (" + values.size() ); 96 | } 97 | Map out = new LinkedHashMap<>(); 98 | for ( int i=keys.size()-1; i>=0; i-- ) { 99 | out.put( keys.get(i), values.get(i) ); 100 | } 101 | return out; 102 | } 103 | 104 | public static Map mapHead( Map inEntries, int n ) { 105 | if ( n < 1 ) { 106 | throw new IllegalStateException( "Number of desired entries must be > 0, but n = " + n ); 107 | } 108 | // TODO: safe to do this? 109 | //if ( n >= inEntries.size() ) { 110 | // return inEntries; 111 | //} 112 | Map out = new LinkedHashMap<>(); 113 | int counter = 0; 114 | for ( Entry entry : inEntries.entrySet() ) { 115 | out.put( entry.getKey(), entry.getValue() ); 116 | counter++; 117 | if ( counter >= n ) { 118 | break; 119 | } 120 | } 121 | // for ( int i=1; i<=n; i++ ) 122 | return out; 123 | } 124 | public static Map mapTail( Map inEntries, int n ) { 125 | if ( n < 1 ) { 126 | throw new IllegalStateException( "Number of desired entries must be > 0, but n = " + n ); 127 | } 128 | List keys = new ArrayList<>( inEntries.keySet() ); 129 | List values = new ArrayList<>( inEntries.values() ); 130 | if ( keys.size() != values.size() ) { 131 | throw new IllegalStateException( "Number of of keys (" + keys.size() + ") != number of values (" + values.size() ); 132 | } 133 | Map out = new LinkedHashMap<>(); 134 | int start = inEntries.size() - n - 1; 135 | if ( start<0 ) start = 0; 136 | for ( int i=start; i Map sortMapByValues( Map inMap ) { 143 | // 
Inverting also sorts because we use TreeMap 144 | Map> invertedMap = invertMapAndSort( inMap ); 145 | // This preserves the new order 146 | Map out = uninvertMap( invertedMap ); 147 | return out; 148 | } 149 | // using tree map for output, so automatically sorted 150 | public static Map> invertMapAndSort( Map inMap ) { 151 | Map> out = new TreeMap<>(); 152 | for ( Entry entry : inMap.entrySet() ) { 153 | K key = entry.getKey(); 154 | V value = entry.getValue(); 155 | if ( out.containsKey(value) ) { 156 | Set vector = out.get(value); 157 | vector.add( key ); 158 | } 159 | else { 160 | Set vector = new TreeSet<>(); 161 | vector.add( key ); 162 | out.put( value, vector ); 163 | } 164 | } 165 | return out; 166 | } 167 | // Preserve insertion order 168 | public static Map uninvertMap( Map> inMap ) { 169 | Map out = new LinkedHashMap<>(); 170 | for ( Entry> entry : inMap.entrySet() ) { 171 | V value = entry.getKey(); 172 | Set keys = entry.getValue(); 173 | for ( K k : keys ) { 174 | if ( out.containsKey(k) ) { 175 | throw new IllegalArgumentException( "Duplicate entries for supposed unique key " + k ); 176 | } 177 | out.put( k, value ); 178 | } 179 | } 180 | return out; 181 | } 182 | 183 | public static boolean sameAndInSameOrder( Set idsA, Set idsB ) { 184 | // Bunch of edge cases 185 | // TODO: maybe move edge cases to same set 186 | // TODO: other methods don't do null checking.... 187 | if ( null==idsA && null==idsB ) { 188 | return true; 189 | } 190 | if ( null==idsA ) { 191 | return null==idsB || idsB.isEmpty(); 192 | } 193 | if ( null==idsB ) { 194 | return null==idsA || idsA.isEmpty(); 195 | } 196 | if ( idsA.isEmpty() && idsB.isEmpty() ) { 197 | return true; 198 | } 199 | if ( idsA.size() != idsB.size() ) { 200 | return false; 201 | } 202 | Set onlyA = inAOnly_nonDestructive( idsA, idsB ); 203 | Set onlyB = inBOnly_nonDestructive( idsA, idsB ); 204 | if ( ! onlyA.isEmpty() || ! 
onlyB.isEmpty() ) { 205 | return false; 206 | } 207 | // OK, walk them together 208 | // And we've checked the sizes 209 | Iterator itA = idsA.iterator(); 210 | Iterator itB = idsB.iterator(); 211 | 212 | // Note: 213 | // The while and if checks look redundant 214 | // but they handle the very unlikely edge case 215 | // where one list is added to while we're looping 216 | // and gets longer - that means FALSE 217 | // but if loop just ended we'd accidently return true 218 | while ( itA.hasNext() || itB.hasNext() ) { 219 | if ( ! itA.hasNext() || ! itB.hasNext() ) { 220 | return false; 221 | } 222 | String itemA = itA.next(); 223 | String itemB = itB.next(); 224 | if ( ! itemA.equals(itemB) ) { 225 | return false; 226 | } 227 | } 228 | // All tests have passed 229 | return true; 230 | } 231 | 232 | // TODO: refactor to handle anything implementing Collection 233 | public static boolean sameAndInSameOrder( Collection idsA, Collection idsB ) { 234 | // Bunch of edge cases 235 | // TODO: maybe move edge cases to same set 236 | // TODO: other methods don't do null checking.... 237 | if ( null==idsA && null==idsB ) { 238 | return true; 239 | } 240 | if ( null==idsA ) { 241 | return null==idsB || idsB.isEmpty(); 242 | } 243 | if ( null==idsB ) { 244 | return null==idsA || idsA.isEmpty(); 245 | } 246 | if ( idsA.isEmpty() && idsB.isEmpty() ) { 247 | return true; 248 | } 249 | if ( idsA.size() != idsB.size() ) { 250 | return false; 251 | } 252 | Collection onlyA = inAOnly_nonDestructive( idsA, idsB ); 253 | Collection onlyB = inBOnly_nonDestructive( idsA, idsB ); 254 | if ( ! onlyA.isEmpty() || ! 
onlyB.isEmpty() ) { 255 | return false; 256 | } 257 | // OK, walk them together 258 | // And we've checked the sizes 259 | Iterator itA = idsA.iterator(); 260 | Iterator itB = idsB.iterator(); 261 | 262 | // Note: 263 | // The while and if checks look redundant 264 | // but they handle the very unlikely edge case 265 | // where one list is added to while we're looping 266 | // and gets longer - that means FALSE 267 | // but if loop just ended we'd accidently return true 268 | while ( itA.hasNext() || itB.hasNext() ) { 269 | if ( ! itA.hasNext() || ! itB.hasNext() ) { 270 | return false; 271 | } 272 | String itemA = itA.next(); 273 | String itemB = itB.next(); 274 | if ( ! itemA.equals(itemB) ) { 275 | return false; 276 | } 277 | } 278 | // All tests have passed 279 | return true; 280 | } 281 | 282 | // Non-Destructive 283 | 284 | public static Set inAOnly_nonDestructive( Set idsA, Set idsB ) { 285 | Set out = new LinkedHashSet<>(); 286 | out.addAll( idsA ); 287 | out.removeAll( idsB ); 288 | return out; 289 | } 290 | // TODO: redo so it takes anything derived from Collection 291 | public static Collection inAOnly_nonDestructive( Collection idsA, Collection idsB ) { 292 | Set out = new LinkedHashSet<>(); 293 | out.addAll( idsA ); 294 | out.removeAll( idsB ); 295 | return out; 296 | } 297 | public static Set inBOnly_nonDestructive( Set idsA, Set idsB ) { 298 | return inAOnly_nonDestructive( idsB, idsA ); 299 | } 300 | // TODO: redo so it takes anything derived from Collection 301 | public static Collection inBOnly_nonDestructive( Collection idsA, Collection idsB ) { 302 | return inAOnly_nonDestructive( idsB, idsA ); 303 | } 304 | public static Set intersection_nonDestructive( Set idsA, Set idsB ) { 305 | Set out = new LinkedHashSet<>(); 306 | out.addAll( idsA ); 307 | out.retainAll( idsB ); 308 | return out; 309 | } 310 | public static Collection intersection_nonDestructive( Collection idsA, Collection idsB ) { 311 | Set out = new LinkedHashSet<>(); 312 | out.addAll( 
/**
 * Simple descriptive statistics and least-squares curve fitting helpers.
 *
 * Conventions:
 *  - sum/min/max methods skip null elements (null-skipping in min/max was
 *    added for consistency with the sum methods, which already did it)
 *  - min/max over an empty collection return the type's MAX/MIN sentinel
 *  - averages over null/empty input return Double.NaN
 *  - standard deviations are full-population (divide by N)
 */
public class StatsUtils {

  // Quick Stats
  // For ints, we still return long as a sum

  /** Sum of all non-null values; 0 for an empty collection. */
  public static long sumList_Ints( Collection<Integer> in ) {
    long out = 0L;
    for ( Integer i : in ) {
      if ( null != i ) {
        out += i;
      }
    }
    return out;
  }
  /** Sum of all non-null values; 0 for an empty collection. */
  public static long sumList_Longs( Collection<Long> in ) {
    long out = 0L;
    for ( Long i : in ) {
      if ( null != i ) {
        out += i;
      }
    }
    return out;
  }
  /** Sum of all non-null values; 0.0 for an empty collection. */
  public static double sumList_Doubles( Collection<Double> in ) {
    double out = 0.0;
    for ( Double i : in ) {
      if ( null != i ) {
        out += i;
      }
    }
    return out;
  }
  /** Smallest non-null value; Integer.MAX_VALUE for an empty collection. */
  public static int minList_Ints( Collection<Integer> in ) {
    int out = Integer.MAX_VALUE;
    for ( Integer i : in ) {
      // null check added: a null element used to throw NPE on unboxing
      if ( null != i && i < out ) {
        out = i;
      }
    }
    return out;
  }
  /** Smallest non-null value; Long.MAX_VALUE for an empty collection. */
  public static long minList_Longs( Collection<Long> in ) {
    long out = Long.MAX_VALUE;
    for ( Long i : in ) {
      if ( null != i && i < out ) {
        out = i;
      }
    }
    return out;
  }
  /** Largest non-null value; Integer.MIN_VALUE for an empty collection. */
  public static int maxList_Ints( Collection<Integer> in ) {
    int out = Integer.MIN_VALUE;
    for ( Integer i : in ) {
      if ( null != i && i > out ) {
        out = i;
      }
    }
    return out;
  }
  /** Largest non-null value; Long.MIN_VALUE for an empty collection. */
  public static long maxList_Longs( Collection<Long> in ) {
    long out = Long.MIN_VALUE;
    for ( Long i : in ) {
      if ( null != i && i > out ) {
        out = i;
      }
    }
    return out;
  }
  /** Mean of the values, or NaN for null/empty input. */
  public static double averageList_Ints( Collection<Integer> in ) {
    if ( null==in || in.isEmpty() ) {
      return Double.NaN;
    }
    long sum = sumList_Ints( in );
    return (double) sum / (double) in.size();
  }
  /** Mean of the values, or NaN for null/empty input. */
  public static double averageList_Longs( Collection<Long> in ) {
    if ( null==in || in.isEmpty() ) {
      return Double.NaN;
    }
    long sum = sumList_Longs( in );
    return (double) sum / (double) in.size();
  }
  /** Mean of the values, or NaN for null/empty input. */
  public static double averageList_Doubles( Collection<Double> in ) {
    if ( null==in || in.isEmpty() ) {
      return Double.NaN;
    }
    double sum = sumList_Doubles( in );
    return sum / (double) in.size();
  }

  // TODO: assumes full-population std, could add flag for sample, N-1 logic
  /** Population standard deviation; 0.0 for null/empty input. */
  public static double standardDeviationList_Ints( Collection<Integer> in ) {
    if ( null==in || in.isEmpty() ) {
      return 0.0;
    }
    double avg = averageList_Ints( in );
    double sumOfDeltaSquared = 0.0;
    for ( int i : in ) {
      // Order doesn't matter since we square it
      double delta = avg - (double) i;
      sumOfDeltaSquared += delta * delta;
    }
    return Math.sqrt( sumOfDeltaSquared / (double) in.size() );
  }
  /** Population standard deviation; 0.0 for null/empty input. */
  public static double standardDeviationList_Longs( Collection<Long> in ) {
    if ( null==in || in.isEmpty() ) {
      return 0.0;
    }
    double avg = averageList_Longs( in );
    double sumOfDeltaSquared = 0.0;
    for ( long i : in ) {
      // Order doesn't matter since we square it
      double delta = avg - (double) i;
      sumOfDeltaSquared += delta * delta;
    }
    return Math.sqrt( sumOfDeltaSquared / (double) in.size() );
  }

  /** Converts each Long to a Double, preserving order. */
  public static List<Double> longs2Doubles( Collection<Long> longs ) {
    // dropped an unused duplicate boxing of each element; presize the list
    List<Double> out = new ArrayList<>( longs.size() );
    for ( Long l : longs ) {
      out.add( l.doubleValue() );
    }
    return out;
  }

  // http://math.stackexchange.com/questions/350754/fitting-exponential-curve-to-data
  // Returns [A,k] for y = A e^kx
  /** Fits y = A*e^(kx) by a linear fit on ln(y); points with y <= 0 are skipped. */
  public static double [] leastSquares_Exponential( List<Double> xList, List<Double> yList ) {
    List<Double> xList2 = new ArrayList<>();
    List<Double> yList2 = new ArrayList<>();
    // Skip zeros!  ln() is undefined at 0, so only y > 0 points participate
    for ( int i=0; i<yList.size(); i++ ) {
      Double y = yList.get( i );
      if ( null != y && y > 0.0 ) {
        Double x = xList.get( i );
        double y2 = Math.log( y );
        xList2.add( x );
        yList2.add( y2 );
      }
    }
    double [] line = leastSquares_Line( xList2, yList2 );
    double m = line[0];
    double b = line[1];
    double A = Math.exp( b );
    double k = m;
    double out[] = new double[2];
    out[0] = A;
    out[1] = k;
    System.out.println( "leastSquares_Exponential: returning [A, k] = [" + A + ", " + k + "]" );
    return out;
  }

  // Returns [m, b] for y = mx+b
  // http://hotmath.com/hotmath_help/topics/line-of-best-fit.html
  // NOTE(review): the loop body and slope/intercept computation were
  // reconstructed from this method's own debug output (sumX/sumY/sumXY/
  // sumSquaredX, m_mumerator/m_denominator, meanX/meanY); confirm against
  // the original source. The "m_mumerator" label typo is kept so the debug
  // output stays byte-identical to existing sample reports.
  /** Ordinary least-squares line fit; returns [0, 0] for empty input. */
  public static double [] leastSquares_Line( List<Double> xList, List<Double> yList ) {
    double m = 0;
    double b = 0;
    if ( xList.size() != yList.size() ) {
      throw new IllegalStateException( "Number of x values (" + xList.size() + ") != number of y (" + yList.size() + ")" );
    }
    if ( xList.size() > 0 ) {
      double sumX = 0;
      double sumY = 0;
      double sumXY = 0;
      double sumSquaredX = 0;
      for ( int i=0; i<xList.size(); i++ ) {
        double x = xList.get( i );
        double y = yList.get( i );
        sumX += x;
        sumY += y;
        sumXY += x * y;
        sumSquaredX += x * x;
      }
      double n = (double) xList.size();
      double meanX = sumX / n;
      double meanY = sumY / n;
      // Equivalent to Sum((x-meanX)(y-meanY)) / Sum((x-meanX)^2)
      double m_numerator = sumXY - n * meanX * meanY;
      double m_denominator = sumSquaredX - n * meanX * meanX;
      System.out.println( "leastSquares_Line: sumX = " + sumX );
      System.out.println( "leastSquares_Line: sumY = " + sumY );
      System.out.println( "leastSquares_Line: sumXY = " + sumXY );
      System.out.println( "leastSquares_Line: sumSquaredX = " + sumSquaredX );
      System.out.println( "leastSquares_Line: meanX = " + meanX );
      System.out.println( "leastSquares_Line: meanY = " + meanY );
      System.out.println( "leastSquares_Line: n = " + n );
      System.out.println( "leastSquares_Line: m_mumerator = " + m_numerator );
      System.out.println( "leastSquares_Line: m_denominator = " + m_denominator );
      m = m_numerator / m_denominator;
      b = meanY - m * meanX;
      System.out.println( "leastSquares_Line: returning [m, b] = [" + m + ", " + b + "]" );
    }
    double [] out = new double[2];
    out[0] = m;
    out[1] = b;
    return out;
  }

}
(usually just &) 71 | String [] args = rawText.split( "[?&]" ); 72 | for ( int i=0; i= arr.length) { 213 | sb.append('\\'); 214 | } else { 215 | char next = arr[i]; 216 | switch (next) { 217 | case ',': 218 | // escape not needed 219 | break; 220 | case 'Q': 221 | case 'E': 222 | // extra escape needed 223 | sb.append('\\'); 224 | default: 225 | sb.append('\\'); 226 | } 227 | sb.append(next); 228 | } 229 | break; 230 | case '*': 231 | if (inClass == 0) 232 | sb.append(".*"); 233 | else 234 | sb.append('*'); 235 | break; 236 | case '?': 237 | if (inClass == 0) 238 | sb.append('.'); 239 | else 240 | sb.append('?'); 241 | break; 242 | case '[': 243 | inClass++; 244 | firstIndexInClass = i + 1; 245 | sb.append('['); 246 | break; 247 | case ']': 248 | inClass--; 249 | sb.append(']'); 250 | break; 251 | case '.': 252 | case '(': 253 | case ')': 254 | case '+': 255 | case '|': 256 | case '^': 257 | case '$': 258 | case '@': 259 | case '%': 260 | if (inClass == 0 || (firstIndexInClass == i && ch == '^')) 261 | sb.append('\\'); 262 | sb.append(ch); 263 | break; 264 | case '!': 265 | if (firstIndexInClass == i) 266 | sb.append('^'); 267 | else 268 | sb.append('!'); 269 | break; 270 | case '{': 271 | inGroup++; 272 | sb.append('('); 273 | break; 274 | case '}': 275 | inGroup--; 276 | sb.append(')'); 277 | break; 278 | case ',': 279 | if (inGroup > 0) 280 | sb.append('|'); 281 | else 282 | sb.append(','); 283 | break; 284 | default: 285 | sb.append(ch); 286 | } 287 | } 288 | return sb.toString(); 289 | } 290 | 291 | // TODO: could also do list of matches with m.reset(myNewString), might be slightly faster 292 | public static boolean checkPatternsInList( Collection patterns, String targetString ) { 293 | for ( Pattern p : patterns ) { 294 | Matcher m = p.matcher( targetString ); 295 | if ( m.matches() ) { 296 | return true; 297 | } 298 | } 299 | return false; 300 | } 301 | 302 | 303 | } 304 | -------------------------------------------------------------------------------- 
/**
 * Character n-gram ("tuple") entropy for a single word: for each tuple
 * length 1..word.length(), the Shannon entropy (natural log) of the tuple
 * frequency distribution is computed, and the per-length entropies are
 * summed into one overall score. Higher scores mean less repetitive words.
 */
public class TupleEntropy {
  static boolean debug = false;

  /** Sums calcTupleEntropyForLength over every tuple length 1..word.length(). */
  public static double calcTupleEntropyForAllLengths( String word ) {
    double total = 0.0;
    for ( int len = 1; len <= word.length(); len++ ) {
      total += calcTupleEntropyForLength( word, len );
    }
    return total;
  }

  /** Entropy of the distribution of tuples of exactly the given length. */
  public static double calcTupleEntropyForLength( String word, int len ) {
    Map<String, Double> tupleStats = calcTuplesForLen( word, len );
    double outEntropy = calcEntropyForCounts( tupleStats );
    if(debug) System.out.println( "\tTuple Len: " + len + " has " + tupleStats.keySet().size() + " / " + sumCounts(tupleStats) + " unique/total" );
    if(debug) System.out.println( "\t\tTuples: " + tupleStats );
    if(debug) System.out.println( "\t\tEntropy = " + outEntropy );
    return outEntropy;
  }

  /**
   * Shannon entropy (natural log) of a map of counts; 0.0 for null/empty
   * input or when the counts sum to zero or less.
   */
  public static double calcEntropyForCounts( Map<String, Double> inMap ) {
    if ( null == inMap || inMap.isEmpty() ) {
      return 0.0;
    }
    double sum = sumCounts( inMap );
    if ( sum <= 0.0 ) {
      return 0.0;
    }
    double outEntropy = 0.0;
    for ( Entry<String, Double> entry : inMap.entrySet() ) {
      double prob = entry.getValue() / sum;
      if ( prob > 0.0 ) {
        outEntropy -= prob * Math.log( prob );
      }
    }
    return outEntropy;
  }

  /** Counts each distinct substring of the given length; insertion-ordered. */
  public static Map<String, Double> calcTuplesForLen( String word, int len ) {
    Map<String, Double> out = new LinkedHashMap<>();
    if ( len > 0 && word.length() >= len ) {
      for ( int i = 0; i + len <= word.length(); i++ ) {
        String tuple = word.substring( i, i + len );
        Double prior = out.get( tuple );
        out.put( tuple, ( null == prior ) ? 1.0 : prior + 1.0 );
      }
    }
    return out;
  }

  // Inlined replacement for StatsUtils.sumList_Doubles so this class has no
  // project-internal dependencies; skips null values just like the original.
  private static double sumCounts( Map<String, Double> counts ) {
    double sum = 0.0;
    for ( Double v : counts.values() ) {
      if ( null != v ) {
        sum += v;
      }
    }
    return sum;
  }

  /** Prints word <TAB> length <TAB> entropy for each command-line argument. */
  public static void main( String[] argv ) {
    for ( String word : argv ) {
      if(debug) System.out.println( "Word: \"" + word + "\"" );
      double entropy = calcTupleEntropyForAllLengths( word );
      if(debug) System.out.println( "\ttotal for word \"" + word + "\": " + entropy );
      if ( ! debug ) {
        System.out.println( "" + word + "\t" + word.length() + "\t" + entropy );
      }
    }
  }
}
BEFORE: Normally we'd use this: 31 | // HttpSolrServer extends SolrServer 32 | // HttpSolrServer server = new HttpSolrServer( serverUrl ); 33 | 34 | // AFTER: Instead we use this: 35 | // CloudSolrServer extends SolrServer 36 | CloudSolrServer server = new CloudSolrServer( ZK_ENSEMBLE ); 37 | // .setDefaultColl not defined for base SolrServer type 38 | server.setDefaultCollection( COLLECTION ); 39 | 40 | return server; 41 | } 42 | 43 | static void addDoc( SolrServer server, int id ) throws SolrServerException, IOException { 44 | SolrInputDocument doc = new SolrInputDocument(); 45 | doc.addField( ID_FIELD, "" + id ); 46 | doc.addField( "name", "Test Doc " + id ); 47 | server.add(doc); 48 | // Normally wouldn't do this, but OK for small test 49 | server.commit(); 50 | System.out.println( "Added doc " + id ); 51 | } 52 | 53 | static void testSearch( SolrServer server ) throws SolrServerException { 54 | SolrQuery query = new SolrQuery( "*:*" ); 55 | query.addField( ID_FIELD ); 56 | QueryResponse res = server.query( query ); 57 | System.out.println( "Sample doc IDs:" ); 58 | // gets max of 10 docs by default 59 | for ( SolrDocument doc : res.getResults() ) { 60 | String id = (String) doc.get( ID_FIELD ); 61 | System.out.println( id ); 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws SolrServerException, IOException { 66 | SolrServer server = openServer(); 67 | addDoc( server, 4 ); 68 | testSearch( server ); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/resources/DQ-Prototype-and-SolrJ.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucidworks/data-quality/6fd557d8757d5e956082a51f669f88bf7c226d80/src/main/resources/DQ-Prototype-and-SolrJ.key -------------------------------------------------------------------------------- /src/main/resources/DQ-Prototype-and-SolrJ.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucidworks/data-quality/6fd557d8757d5e956082a51f669f88bf7c226d80/src/main/resources/DQ-Prototype-and-SolrJ.pdf -------------------------------------------------------------------------------- /src/main/resources/sample-reports/README.txt: -------------------------------------------------------------------------------- 1 | These aren't the exact output of current reports: 2 | * Comments added 3 | * Reports broken into separate parts in some cases 4 | * Slight wording/formatting differences from current java code 5 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/dates-curve-fitting.txt: -------------------------------------------------------------------------------- 1 | Date Histogram and Exponential Growth Curve Fitting 2 | 3 | Report: 4 | * Guesses your date fields 5 | * Generates a Histogram by year 6 | * Also tries to fit an exponential curve to your Dates' growth 7 | (via least squares on natual log of counts) 8 | 9 | In this example report, the fitted curves are 10 | flatter than the data for 2 reasons: 11 | 1: Has dates in distant past 12 | (late 1800's, etc, below display threshhold) 13 | 2: Most recent year was when data was captured, midyear, 14 | so value shortfall 15 | 16 | ------------ 17 | 18 | Solr = http://localhost:8983/solr/demo_shard1_replica1 19 | 20 | Stats, Strs: Start/Stop: 1900-01-01T00:00:00Z / 2025-12-31T00:00:00Z 21 | Years with < .5 hash marks aren't displayed 22 | Not many dates in early 1900's nor in the future 23 | 24 | Date Field: releaseDate 25 | 2012-01-01: =====#===================== 26 | 2011-01-01: =====#===================================== 27 | 2010-01-01: ====#================================= 28 | 2009-01-01: ===#====================================== 29 | 2008-01-01: ===#======================================================== 30 | 2007-01-01: 
==#=================================================== 31 | 2006-01-01: ==#==================================================== 32 | 2005-01-01: ==#===================================================== 33 | 2004-01-01: =#=================================================== 34 | 2003-01-01: =#============================================ 35 | 2002-01-01: =#===================================== 36 | 2001-01-01: =#==================================== 37 | 2000-01-01: #=========================================== 38 | 1999-01-01: #=============================== 39 | 1998-01-01: #============================= 40 | 1997-01-01: #===================== 41 | 1996-01-01: #====================== 42 | 1995-01-01: #========================= 43 | 1994-01-01: #==================== 44 | 1993-01-01: #============ 45 | 1992-01-01: ========== 46 | 1991-01-01: ========= 47 | 1990-01-01: =============== 48 | 1989-01-01: == 49 | 50 | Stats, Strs: Start/Stop: 1884-01-01T00:00:00Z / 2012-07-29T00:00:00Z 51 | Date Field: startDate 52 | 2012-01-01: ==========#=== 53 | 2011-01-01: =========#============================================= 54 | 2010-01-01: ========#========================================== 55 | 2009-01-01: =======#==================================== 56 | 2008-01-01: ======#=============================================== 57 | 2007-01-01: =====#====================================================== 58 | 2006-01-01: ====#================================================ 59 | 2005-01-01: ====#================================================= 60 | 2004-01-01: ===#===================================================== 61 | 2003-01-01: ===#=============================================== 62 | 2002-01-01: ==#====================================== 63 | 2001-01-01: ==#================================= 64 | 2000-01-01: =#==================================== 65 | 1999-01-01: =#===================================== 66 | 1998-01-01: =#=========================== 67 | 1997-01-01: 
=#======================= 68 | 1996-01-01: #=================== 69 | 1995-01-01: #======================== 70 | 1994-01-01: #===================== 71 | 1993-01-01: #================ 72 | 1992-01-01: #========== 73 | 1991-01-01: #======== 74 | 1990-01-01: #============== 75 | 1989-01-01: #== 76 | 1988-01-01: # 77 | 78 | Calculations output, debugging, etc................. 79 | 80 | leastSquares_Line: x[55] = [-2.2089888E12, -1.230768E12, -1.1045376E12, -6.31152E11, -5.049216E11, -4.733856E11, -4.418496E11, -1.577664E11, -9.46944E10, -6.31584E10, -3.1536E10, 0.0, 6.3072E10, 1.262304E11, 1.577664E11, 1.893024E11, 2.209248E11, 2.524608E11, 2.839968E11, 3.155328E11, 3.471552E11, 3.786912E11, 4.102272E11, 4.417632E11, 4.733856E11, 5.049216E11, 5.364576E11, 5.679936E11, 5.99616E11, 6.31152E11, 6.62688E11, 6.94224E11, 7.258464E11, 7.573824E11, 7.889184E11, 8.204544E11, 8.520768E11, 8.836128E11, 9.151488E11, 9.466848E11, 9.783072E11, 1.0098432E12, 1.0413792E12, 1.0729152E12, 1.1045376E12, 1.1360736E12, 1.1676096E12, 1.1991456E12, 1.230768E12, 1.262304E12, 1.29384E12, 1.325376E12, 1.3569984E12, 1.5778368E12, 1.7356896E12] 81 | leastSquares_Line: y[55] = [2.6390573296152584, 2.0794415416798357, 0.0, 1.0986122886681098, 0.0, 0.6931471805599453, 0.6931471805599453, 0.0, 0.0, 0.0, 0.6931471805599453, 1.3862943611198906, 0.0, 0.0, 1.0986122886681098, 0.0, 1.0986122886681098, 1.6094379124341003, 1.0986122886681098, 5.288267030694535, 1.0986122886681098, 2.0794415416798357, 2.4849066497880004, 3.6635616461296463, 3.6888794541139363, 4.574710978503383, 5.220355825078324, 6.1224928095143865, 8.119993827725105, 9.971333099431195, 9.512516890578416, 9.616405300156314, 9.850666776352545, 10.339805124127057, 10.555656476367515, 10.411358816475682, 10.388964598613677, 10.683041760836812, 10.735679026718607, 11.068215243411098, 10.914051563182127, 10.95166582665281, 11.113566675293825, 11.244038976969438, 11.3011050641372, 11.297750083305196, 11.263489271701095, 11.376509917165842, 
11.00829726411219, 10.910258637538657, 11.04809244505471, 10.59383080576334, 3.258096538021482, 1.0986122886681098, 3.1780538303479458] 82 | leastSquares_Line: sumX = 2.6097552E13 83 | leastSquares_Line: sumY = 310.2204061940795 84 | leastSquares_Line: sumXY = 2.6660475019541412E14 85 | leastSquares_Line: sumSquaredX = 4.153851514838016E25 86 | leastSquares_Line: meanX = 4.745009454545455E11 87 | leastSquares_Line: meanY = 5.6403710217105365 88 | leastSquares_Line: n = 55.0 89 | leastSquares_Line: m_mumerator = 1.1940487415703028E14 90 | leastSquares_Line: m_denominator = 2.9155202050330995E25 91 | leastSquares_Line: returning [m, b] = [4.095491224890163E-12, 3.6970565633993595] 92 | leastSquares_Exponential: returning [A, k] = [40.328425326295104, 4.095491224890163E-12] 93 | leastSquares_Line: x[55] = [-2.7139104E12, -2.2405248E12, -1.230768E12, -1.1676096E12, -1.1045376E12, -6.31152E11, -5.364576E11, -5.049216E11, -4.733856E11, -1.577664E11, -1.262304E11, -9.46944E10, -6.31584E10, -3.1536E10, 0.0, 6.3072E10, 1.262304E11, 1.577664E11, 1.893024E11, 2.209248E11, 2.524608E11, 2.839968E11, 3.155328E11, 3.471552E11, 3.786912E11, 4.102272E11, 4.417632E11, 4.733856E11, 5.049216E11, 5.364576E11, 5.679936E11, 5.99616E11, 6.31152E11, 6.62688E11, 6.94224E11, 7.258464E11, 7.573824E11, 7.889184E11, 8.204544E11, 8.520768E11, 8.836128E11, 9.151488E11, 9.466848E11, 9.783072E11, 1.0098432E12, 1.0413792E12, 1.0729152E12, 1.1045376E12, 1.1360736E12, 1.1676096E12, 1.1991456E12, 1.230768E12, 1.262304E12, 1.29384E12, 1.325376E12] 94 | leastSquares_Line: y[55] = [2.302585092994046, 2.6390573296152584, 2.0794415416798357, 1.3862943611198906, 0.0, 0.6931471805599453, 0.0, 0.6931471805599453, 1.9459101490553132, 0.0, 1.791759469228055, 0.0, 0.6931471805599453, 1.0986122886681098, 0.6931471805599453, 0.0, 1.0986122886681098, 0.6931471805599453, 0.0, 1.3862943611198906, 1.6094379124341003, 5.267858159063328, 1.791759469228055, 0.0, 2.3978952727983707, 3.4657359027997265, 3.7376696182833684, 
4.07753744390572, 4.882801922586371, 5.420534999272286, 6.8966943316227125, 8.576781982827894, 10.054361440970256, 9.581145019820722, 9.7107519573933, 10.146002265529594, 10.431995911154427, 10.549202396661185, 10.33471786853032, 10.579234218415905, 10.713750879764753, 11.014785701011421, 10.989656070979896, 10.936316100658235, 11.045366844253579, 11.280917742717547, 11.376395294789138, 11.326547692846342, 11.319595537796637, 11.43589322129926, 11.336677426964654, 11.133391226885413, 11.264105066866035, 11.34294346177025, 9.984053200375696] 95 | leastSquares_Line: sumX = 1.72931328E13 96 | leastSquares_Line: sumY = 315.2068163472246 97 | leastSquares_Line: sumXY = 2.494364214301782E14 98 | leastSquares_Line: sumSquaredX = 4.317231463538688E25 99 | leastSquares_Line: meanX = 3.1442059636363635E11 100 | leastSquares_Line: meanY = 5.731033024494993 101 | leastSquares_Line: n = 55.0 102 | leastSquares_Line: m_mumerator = 1.5032890625640062E14 103 | leastSquares_Line: m_denominator = 3.7734997507415324E25 104 | leastSquares_Line: returning [m, b] = [3.983805914571994E-12, 4.478442393038285] 105 | leastSquares_Exponential: returning [A, k] = [88.09734471448613, 3.983805914571994E-12] 106 | ----------- http://localhost:8983/solr/demo_shard1_replica1 ----------- 107 | Total Active Docs: 1,275,077 108 | 109 | All Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, 
salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight] 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/llr-larger-sample.txt: -------------------------------------------------------------------------------- 1 | Compare words between two sources. 2 | * Larger absolute number means more important change 3 | * sign (+/-) indicates direction of change 4 | * Note: signs may be backwards, still confirming 5 | 6 | Notice the terms with the highest absolute score are the ones 7 | that were added, eg: "acme", "cardboard", "box", etc. 8 | 9 | ----------- A -> B ----------- 10 | 11 | Corpus A unique / total words: 398 / 579.0 12 | Corpus B unique / total words: 385 / 593.0 13 | Combined unique / total words: 418 / 1172.0 14 | Number of p log(p) calculations: 0 15 | 16 | Term Changes, first 5 entries: 17 | acme: -4.09515240975383 18 | any: -4.09515240975383 19 | box: -4.09515240975383 20 | cardboard: -4.09515240975383 21 | fits: -4.09515240975383 22 | Term Changes, last 5 entries: 23 | silentseek: 1.4112036109151607 24 | sp2514n: 1.4112036109151607 25 | spinpoint: 1.4112036109151607 26 | ultra: 1.4112036109151607 27 | cache: 2.824159489031562 28 | hard: 2.824159489031562 29 | 30 | 31 | Data: 32 | 33 | Corpus A is stock Solr with all exampledocs XML files submitted. 34 | 35 | Corpus B is a slightly modified version with a few docs added. 36 | For example, has: 37 | 38 | new.xml 39 | ------- 40 | 41 | NEW111 42 | New Sample Product 43 | Acme, Inc. 44 | 45 | acme 46 | electronics 47 | gadget 48 | Rocket powered, sugar-free, fits in any tackle box! 
49 | cardboard box 50 | 10.5 51 | 19.95 52 | 101 53 | true 54 | 55 | 43.17614,-90.57341 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/llr-tiny-sample.txt: -------------------------------------------------------------------------------- 1 | Compare words between two sources. 2 | * Larger absolute number means more important change 3 | * sign (+/-) indicates direction of change 4 | 5 | 6 | ----------- A -> B ----------- 7 | 8 | Corpus A unique / total words: 4 / 195.0 9 | Corpus B unique / total words: 4 / 195.0 10 | Combined unique / total words: 5 / 390.0 11 | 12 | All Term Changes: 13 | candy: -60.03320678316351 14 | bananas: -0.46192170199964266 15 | food: 0.0 16 | apples: 0.6291706616789554 17 | carrots: 60.03320678316349 18 | 19 | 20 | Inputs: 21 | Map corpusA = new LinkedHashMap() {{ 22 | put( "apples", 25L ); 23 | put( "bananas", 30L ); 24 | put( "carrots", 40L ); 25 | put( "food", 100L ); 26 | }}; 27 | Map corpusB = new LinkedHashMap() {{ 28 | put( "apples", 20L ); // down by 5 29 | put( "bananas", 35L ); // up by 5 30 | put( "candy", 40L ); // carrots -> candy! 
31 | put( "food", 100L ); // unchanged, and total unchanged 32 | }}; 33 | 34 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/populated-fields-diff.txt: -------------------------------------------------------------------------------- 1 | Index A = http://localhost:8984/solr/collection1 2 | Index B = http://localhost:8985/solr/collection1 3 | 4 | A: Total Active Docs: 32 5 | B: Total Active Docs: 33 6 | 7 | 8 | All Fields: 9 | 10 | In both = '[_root_, _version_, author, cat, category, comments, content, content_type, description, features, id, inStock, includes, keywords, last_modified, links, manu, manu_exact, name, payloads, popularity, price, resourcename, sku, store, subject, text, text_rev, title, url, weight, address_s, compName_s, incubationdate_dt, manu_id_s, manufacturedate_dt, price_c, price_c____amount_raw, price_c____currency, store_0_coordinate, store_1_coordinate]' 11 | 12 | B only = '[field_a_en, field_b_en]' 13 | 14 | Populated at 100% in Both A and B: [_version_, id] 15 | 16 | No Indexed Values / 0% in Both A and B: [_root_, author, category, comments, content, content_type, description, keywords, last_modified, links, resourcename, sku, store, subject, text_rev, title, url] 17 | 18 | Partially Populated Fields and Percentages, A / B: 19 | cat: 20 (62.5%) / 21 (63.64%) 20 | features: 20 (62.5%) / 21 (63.64%) 21 | inStock: 21 (65.62%) / 20 (60.61%) 22 | includes: 3 (9.38%) / 6 (18.18%) 23 | manu: 20 (62.5%) / 21 (63.64%) 24 | manu_exact: 20 (62.5%) / 21 (63.64%) 25 | name: 21 (65.62%) / 22 (66.67%) 26 | payloads: 3 (9.38%) / 3 (9.09%) 27 | popularity: 15 (46.88%) / 14 (42.42%) 28 | price: 16 (50%) / 15 (45.45%) 29 | text: 21 (65.62%) / 22 (66.67%) 30 | weight: 9 (28.12%) / 10 (30.3%) 31 | address_s: 11 (34.38%) / 11 (33.33%) 32 | compName_s: 11 (34.38%) / 11 (33.33%) 33 | incubationdate_dt: 1 (3.12%) / 1 (3.03%) 34 | manu_id_s: 18 (56.25%) / 19 (57.58%) 35 | manufacturedate_dt: 11 
(34.38%) / 9 (27.27%) 36 | price_c: 20 (62.5%) / 19 (57.58%) 37 | price_c____amount_raw: 20 (62.5%) / 19 (57.58%) 38 | price_c____currency: 20 (62.5%) / 19 (57.58%) 39 | store_0_coordinate: 14 (43.75%) / 13 (39.39%) 40 | store_1_coordinate: 14 (43.75%) / 13 (39.39%) 41 | field_a_en: (not in A) / 1 (3.03%) 42 | field_b_en: (not in A) / 1 (3.03%) 43 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/populated-fields-single.txt: -------------------------------------------------------------------------------- 1 | Notes: 2 | * Fields include both declared and dynamic fields. 3 | * This is based on fields with indexed values (vs. stored values) 4 | * Stats do NOT include deleted docs. 5 | 6 | - - - actual report - - - 7 | 8 | Total Active Docs: 1,275,077 9 | 10 | All Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight] 11 | 12 | Populated at 100%: [_version_, id, regularPrice, salePrice, store_id, text, type] 13 | 14 | No Indexed Values / 0%: [_root_, author, cat, category, categoryPath, comments, content, content_type, inStock, includes, keywords, last_modified, links, manu, manu_exact, payloads, 
popularity, price, resourcename, shippingWeight, sku, store, subject, text_rev, title, url] 15 | 16 | Partially Populated Fields / Percentages: 17 | accessories: 11,460 (0.9%) 18 | albumLabel: 876,821 (68.77%) 19 | albumTitle: 876,845 (68.77%) 20 | artistName: 871,477 (68.35%) 21 | bundledIn: 7,148 (0.56%) 22 | cast: 132,231 (10.37%) 23 | categoryIds: 1,262,671 (99.03%) 24 | categoryNames: 1,262,671 (99.03%) 25 | class: 1,258,757 (98.72%) 26 | color: 47,567 (3.73%) 27 | condition: 103,036 (8.08%) 28 | crew: 118,603 (9.3%) 29 | customerReviewAverage: 67,489 (5.29%) 30 | customerReviewCount: 67,489 (5.29%) 31 | department: 1,258,757 (98.72%) 32 | depthCategoryIds: 1,262,671 (99.03%) 33 | depthCategoryNames: 1,262,671 (99.03%) 34 | description: 7,499 (0.59%) 35 | details: 101,235 (7.94%) 36 | features: 116,881 (9.17%) 37 | format: 1,147,134 (89.97%) 38 | frequentlyPurchasedWith: 23,950 (1.88%) 39 | genre: 1,133,752 (88.92%) 40 | hardGoodType: 103,036 (8.08%) 41 | image: 1,273,774 (99.9%) 42 | lengthInMinutes: 197,204 (15.47%) 43 | longDescription: 136,234 (10.68%) 44 | manufacturer: 997,494 (78.23%) 45 | mpaaRating: 123,899 (9.72%) 46 | name: 1,274,453 (99.95%) 47 | plot: 204,358 (16.03%) 48 | product_id: 54,363 (4.26%) 49 | relatedProducts: 36,994 (2.9%) 50 | releaseDate: 1,162,200 (91.15%) 51 | salesRankLongTerm: 281,712 (22.09%) 52 | salesRankMediumTerm: 131,228 (10.29%) 53 | salesRankShortTerm: 112,483 (8.82%) 54 | shortDescription: 120,163 (9.42%) 55 | softwareGrade: 417 (0.03%) 56 | startDate: 1,273,615 (99.89%) 57 | studio: 256,401 (20.11%) 58 | subclass: 1,258,757 (98.72%) 59 | weight: 67,072 (5.26%) 60 | 61 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/schema-info-diff.txt: -------------------------------------------------------------------------------- 1 | Notes: 2 | 3 | Schema diffs: 4 | * It can even compare schemas between running systems (via REST) and local XML files. 
5 | (though some fields blank depending on source) 6 | * The sample below is the default 4.6.1 schema vs. Apollo demo (plus my local changes) 7 | * A few lists are order-dependent 8 | TODO: turns out this may not be needed, to verify. 9 | 10 | (shows individual reports, then the diff report) 11 | 12 | ========== Differences Report ========== 13 | Schema A = Default Solr 4.6.1 Schema 14 | Schema B = Apollo demo plus local changes 15 | Schema Name: Both = 'example' 16 | Schema Version: Both = '1.5' 17 | Key Field: Both = 'id' 18 | Default Operator: 19 | A = 'null' 20 | B = '(not-available)' 21 | Similarity Class Name: 22 | A = 'org.apache.solr.search.similarities.DefaultSimilarityFactory' 23 | B = '(not-available)' 24 | 25 | Fields: 26 | 27 | In both = '[_version_, _root_, id, sku, name, manu, cat, features, includes, weight, price, popularity, inStock, store, title, subject, description, comments, author, keywords, category, resourcename, url, content_type, last_modified, links, content, text, text_rev, manu_exact, payloads]' 28 | 29 | B only = '[accessories, albumLabel, albumTitle, artistName, bundledIn, cast, categoryIds, categoryNames, categoryPath, class, color, condition, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, details, format, frequentlyPurchasedWith, genre, hardGoodType, image, lengthInMinutes, longDescription, manufacturer, mpaaRating, plot, product_id, regularPrice, relatedProducts, releaseDate, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, softwareGrade, startDate, store_id, studio, subclass, type]' 30 | 31 | Dynamic Field Patterns: 32 | 33 | In both but DIFFERENT relative order: 34 | Common, order in A = '[*_i, *_is, *_s, *_ss, *_l, *_ls, *_t, *_txt, *_en, *_b, *_bs, *_f, *_fs, *_d, *_ds, *_coordinate, *_dt, *_dts, *_p, *_ti, *_tl, *_tf, *_td, *_tdt, *_pi, *_c, ignored_*, attr_*, random_*]' 35 | Common, order in B = '[*_coordinate, 
ignored_*, random_*, attr_*, *_txt, *_dts, *_tdt, *_is, *_ss, *_ls, *_en, *_bs, *_fs, *_ds, *_dt, *_ti, *_tl, *_tf, *_td, *_pi, *_i, *_s, *_l, *_t, *_b, *_f, *_d, *_p, *_c]' 36 | 37 | Types: 38 | In both = '[string, boolean, int, float, long, double, tint, tfloat, tlong, tdouble, date, tdate, binary, pint, plong, pfloat, pdouble, pdate, random, text_ws, text_general, text_en, text_en_splitting, text_en_splitting_tight, text_general_rev, alphaOnlySort, phonetic, payloads, lowercase, descendent_path, ancestor_path, ignored, point, location, location_rpt, currency, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr]' 39 | 40 | Copy Field Sources: 41 | In both = '[cat, name, manu, features, includes, price, title, author, description, keywords, content, content_type, resourcename, url]' 42 | B only = '[id]' 43 | 44 | Copy Field Destinations: 45 | In both = '[text, manu_exact, price_c, author_s]' 46 | 47 | -------------------------------------------------------------------------------- /src/main/resources/sample-reports/schema-info-single.txt: -------------------------------------------------------------------------------- 1 | Notes: 2 | * It can view schemas from running systems (via REST) or local XML files. 
3 | (though some fields blank depending on source) 4 | 5 | ========== Individual Reports ========== 6 | 7 | ---------- Schema A: Default Solr 4.6.1 Schema ---------- 8 | Schema Name: example 9 | Schema Version: 1.5 10 | Key Field: id 11 | Default Operator: null 12 | Similarity Class Name: org.apache.solr.search.similarities.DefaultSimilarityFactory 13 | Default Search Field: null 14 | 15 | Fields: [_version_, _root_, id, sku, name, manu, cat, features, includes, weight, price, popularity, inStock, store, title, subject, description, comments, author, keywords, category, resourcename, url, content_type, last_modified, links, content, text, text_rev, manu_exact, payloads] 16 | 17 | Dynamic field Patterns: [*_i, *_is, *_s, *_ss, *_l, *_ls, *_t, *_txt, *_en, *_b, *_bs, *_f, *_fs, *_d, *_ds, *_coordinate, *_dt, *_dts, *_p, *_ti, *_tl, *_tf, *_td, *_tdt, *_pi, *_c, ignored_*, attr_*, random_*] 18 | 19 | Types: [string, boolean, int, float, long, double, tint, tfloat, tlong, tdouble, date, tdate, binary, pint, plong, pfloat, pdouble, pdate, random, text_ws, text_general, text_en, text_en_splitting, text_en_splitting_tight, text_general_rev, alphaOnlySort, phonetic, payloads, lowercase, descendent_path, ancestor_path, ignored, point, location, location_rpt, currency, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr] 20 | 21 | Copy Sources: [cat, name, manu, features, includes, price, title, author, description, keywords, content, content_type, resourcename, url] 22 | From: 'cat' To [text] 23 | From: 'name' To [text] 24 | From: 'manu' To [text, manu_exact] 25 | From: 'features' To [text] 26 | From: 'includes' To [text] 27 | From: 'price' To [price_c] 28 | From: 'title' To [text] 29 | From: 'author' To [text, author_s] 30 | From: 'description' To [text] 31 | 
From: 'keywords' To [text] 32 | From: 'content' To [text] 33 | From: 'content_type' To [text] 34 | From: 'resourcename' To [text] 35 | From: 'url' To [text] 36 | Copy Destinations: [text, manu_exact, price_c, author_s] 37 | Dest: 'text' From [cat, name, manu, features, includes, title, author, description, keywords, content, content_type, resourcename, url] 38 | Dest: 'manu_exact' From [manu] 39 | Dest: 'price_c' From [price] 40 | Dest: 'author_s' From [author] 41 | 42 | ---------- Schema B: Apollo demo plus local changes ---------- 43 | Schema Name: example 44 | Schema Version: 1.5 45 | Key Field: id 46 | Default Operator: (not-available) 47 | Similarity Class Name: (not-available) 48 | Default Search Field: (not-available) 49 | 50 | Fields: [_root_, _version_, accessories, albumLabel, albumTitle, artistName, author, bundledIn, cast, cat, category, categoryIds, categoryNames, categoryPath, class, color, comments, condition, content, content_type, crew, customerReviewAverage, customerReviewCount, department, depthCategoryIds, depthCategoryNames, description, details, features, format, frequentlyPurchasedWith, genre, hardGoodType, id, image, inStock, includes, keywords, last_modified, lengthInMinutes, links, longDescription, manu, manu_exact, manufacturer, mpaaRating, name, payloads, plot, popularity, price, product_id, regularPrice, relatedProducts, releaseDate, resourcename, salePrice, salesRankLongTerm, salesRankMediumTerm, salesRankShortTerm, shippingWeight, shortDescription, sku, softwareGrade, startDate, store, store_id, studio, subclass, subject, text, text_rev, title, type, url, weight] 51 | 52 | Dynamic field Patterns: [*_coordinate, ignored_*, random_*, attr_*, *_txt, *_dts, *_tdt, *_is, *_ss, *_ls, *_en, *_bs, *_fs, *_ds, *_dt, *_ti, *_tl, *_tf, *_td, *_pi, *_i, *_s, *_l, *_t, *_b, *_f, *_d, *_p, *_c] 53 | 54 | Types: [alphaOnlySort, ancestor_path, binary, boolean, currency, date, descendent_path, double, float, ignored, int, location, location_rpt, long, 
lowercase, payloads, pdate, pdouble, pfloat, phonetic, pint, plong, point, random, string, tdate, tdouble, text_ar, text_bg, text_ca, text_cjk, text_cz, text_da, text_de, text_el, text_en, text_en_splitting, text_en_splitting_tight, text_es, text_eu, text_fa, text_fi, text_fr, text_ga, text_general, text_general_rev, text_gl, text_hi, text_hu, text_hy, text_id, text_it, text_ja, text_lv, text_nl, text_no, text_pt, text_ro, text_ru, text_sv, text_th, text_tr, text_ws, tfloat, tint, tlong] 55 | 56 | Copy Sources: [author, cat, content, content_type, description, features, id, includes, keywords, manu, name, resourcename, title, url, price] 57 | From: 'author' To [text, author_s] 58 | From: 'cat' To [text] 59 | From: 'content' To [text] 60 | From: 'content_type' To [text] 61 | From: 'description' To [text] 62 | From: 'features' To [text] 63 | From: 'id' To [text] 64 | From: 'includes' To [text] 65 | From: 'keywords' To [text] 66 | From: 'manu' To [manu_exact, text] 67 | From: 'name' To [text] 68 | From: 'resourcename' To [text] 69 | From: 'title' To [text] 70 | From: 'url' To [text] 71 | From: 'price' To [price_c] 72 | Copy Destinations: [text, manu_exact, price_c, author_s] 73 | Dest: 'text' From [author, cat, content, content_type, description, features, id, includes, keywords, manu, name, resourcename, title, url] 74 | Dest: 'manu_exact' From [manu] 75 | Dest: 'price_c' From [price] 76 | Dest: 'author_s' From [author] 77 | --------------------------------------------------------------------------------