├── GENERATEDATA.md
├── JavaGenerator
│   ├── combine_after_replace.java
│   ├── create_carsandtimes.java
│   ├── create_carstoreplace.java
│   ├── datacombine.java
│   ├── datamakeexit.java
│   ├── datarm2.java
│   ├── dataval.java
│   ├── fixtolls.java
│   ├── historical_tolls.java
│   └── replacecars.java
├── JavaValidator
│   ├── CompareFiles.java
│   ├── CreateMatchingTolls.java
│   ├── README.md
│   ├── SplitFiles.java
│   └── ValidateMTBQEven3AeroTolls.java
├── PythonOriginal
│   ├── GENERATEDATA.Original.md
│   ├── README.Original.md
│   ├── README.md
│   ├── combine.py
│   ├── datacombine.py
│   ├── datafixtype3.py
│   ├── datamakeexit.py
│   ├── datarm2.py
│   ├── dataval.py
│   ├── dups.test.mysql.py
│   ├── historical-tolls.pl
│   └── p_duplicates.py
├── README.md
└── VALIDATE.md

/GENERATEDATA.md:
--------------------------------------------------------------------------------
# How to generate data files

https://github.com/walmart/LinearGenerator is a different version of the generator that requires nothing but Java.

## Notes
2016-07-26: The toll file is automatically created and cleaned using https://github.com/walmart/LinearGenerator.

2016-07-25: See https://github.com/walmart/LinearGenerator for an updated data generator that only requires Java. The tolls file would still have to be created and cleaned separately:

`time java historical_tolls <numxways> <maxcarid> <outfile>`

`time java fixtolls <raw-toll-file> <final-data-file> <final-toll-file>`

2016-03-07: Initially a few cleansing and combination scripts were rewritten in C, which yielded tremendous speed benefits. Run-times for most required steps were halved or reduced to a third of the original time. Then the scripts were rewritten in Java (8u73) and, surprisingly, the performance was even better. All the scripts were rewritten in Java. Now a 250 expressway data set can be combined, modified, and completely prepped in less than 24 hours, potentially 12. A database is no longer necessary. Generation of raw data files also no longer requires a database. Details, scripts, and usage follow below.

For the creation of re-entrant cars, the previous 0.1 version of the scripts, which was still faster than going to a database, took ~30+ hours to create ~200K replacements from a set of ~780K "carsandtimes" for a 50 expressway dataset. The newest method will produce the same number of replacements from the same ~780K cars in seconds.

Making the same logic changes to the original Python code would have yielded orders-of-magnitude benefits in run-times as well. The Java version will likely still be a constant factor faster, by ~2 to 3x, than the Python version.

The Java code can be found in the `JavaGenerator` directory.

To create the data files, first download the data generator from http://www.cs.brandeis.edu/~linearroad/tools.html and follow the instructions below.

### Using the original generator
To get the original generator working on CentOS 6.5/6, or another modern 64-bit Linux distribution, a 32-bit compatibility pack must be installed. If an older, 32-bit version of Linux is available (e.g. 32-bit CentOS 4.8), that works as well. Or, one could try rewriting and recompiling the original mitsim program into a 64-bit version.

Tests on a 64-bit OS (CentOS in Azure) with the 32-bit compatibility pack installed and on a 32-bit CentOS 4.8 instance on a local lab machine were both successful.

The general steps for a 64-bit CentOS 6.5/6 follow:

Download the original tools and unpack into an arbitrary directory:

```
wget http://www.cs.brandeis.edu/~linearroad/files/mitsim.tar.gz
mkdir MITSIMLab
cd MITSIMLab
tar xf ../mitsim.tar.gz
```

Install the 32-bit compatibility pack (for the original MITSIM generator to work on 64-bit archs) on CentOS 6.6:
```
sudo yum -y install compat-libstdc++-296.i686
```

Install gcc and make if not already installed.
```
sudo yum -y install gcc make
```
Install the appropriate Perl modules for mitsim.
```
sudo perl -MCPAN -e "install Math::Random"
```
Let CPAN do automatic configuration: [yes] or [Enter].

### Running the original data generator script

(Again, raw file generation can be parallelized by copying the mitsim files and folders to any number of machines after modifying the mitsim files.)
To prepare mitsim for raw data creation, edit three files:
`mitsim.config`, `linear-road.pl`, and `DuplicateCars.pl`.

In `mitsim.config`: change the `directoryforoutput` to a directory of your choosing and select any `numberofexpressways` based on free disk space (1 xway ~ 1GB). The only lines necessary are `directoryforoutput` and `numberofexpressways`; the rest can be deleted.

NOTE: remove any trailing blank lines in `mitsim.config` to avoid Perl `use of uninitialized value` errors from `DuplicateCars.pl`.

In `linear-road.pl` you can control a variety of parameters, but the only one we adjust is `my $cars_per_hour`, increasing the value to 1000. `my $endtime` can also be adjusted if shorter or longer simulation times are desired.

In `DuplicateCars.pl`:
1. Remove EVERYTHING (there are quite a few lines) between the lines `close ( PROPERTIES );` AND `sub logTime {`, leaving only the lines below, and also make the following changes/additions to the respective remaining lines:
    1. Add `my $hostname = hostname;` after the `close ( PROPERTIES );`.
    2. Modify the last `rename` line to use `$hostname` with an integer suffix (to help with data organization if you're generating on multiple machines) and to rename from `cardatapoints.out$x`:
2. Add `use Sys::Hostname;` to the top of the file.
3. Optionally remove the `use DBI;` line since it is no longer needed.
```
use Sys::Hostname;
...
close ( PROPERTIES );

# Add hostname for easier file differentiation
my $hostname = hostname;

# You don't need a database to create raw files!

# Full expressway Loop (generates an extra one for half).
for( my $x=0; $x < $numberOfExpressways; $x++){
    # run linear road
    writeToLog ( $logfile, $logvar, "Linear road run number: $x");
    system ("perl linear-road.pl --dir=$dir");
    rename( $dir."/cardatapoints.out.debug" , $dir."/cardatapoints$x.out.debug" );
    rename( $dir."/$cardatapoints" , $dir."/$hostname$x" );
}

sub logTime {
...
```
NOTE: if SELinux is present it may need to be disabled: `sudo setenforce 0`

To kick off the script, run `./run mitsim.config`.

Depending on the endtime (in `linear-road.pl`) and the number of expressways chosen (in `mitsim.config`), the raw data file generator can run for hours, if not days or more. Each 3 hour, 1 expressway (with 1000 cars/hour/seg) raw file can take ~3-5 hours to generate. So, it's best to set generation running on multiple small machines and let them run unattended. Just ensure that your target directory has enough space to hold the raw data files. Currently, 25 separate small VMs (Azure A1s), each with a 60+GB data disk, were each able to create and hold 50 expressways.

The raw data files will be found under the `directoryforoutput` configured in `mitsim.config` as n files named `$hostname`n, n being 0 .. `numberofexpressways`-1.

The original script `DuplicateCars.pl` handled the process of combining the multiple raw data files along with creating re-entrants and the toll file, but it could not handle large numbers of expressways in a reasonable amount of time. The self-join query mentioned in the general introduction explains one of the bottlenecks--the progressive slowdown of the self-join query that finds re-entrants.
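For reference, the `mitsim.config` edits described earlier amount to a very small file. The sketch below uses example values for the path and the xway count, and the `key = value` syntax is an assumption -- keep whatever separator the sample config that ships with mitsim uses, and leave no trailing blank lines:

```
directoryforoutput = /datadrive/raw
numberofexpressways = 2
```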

Everything after raw file creation was re-written, along with the addition of some data cleansing steps. Python was used first, then C and Java. Java (8u73) turned out to be the fastest for these scripts.

### Creating a single combined data file
As stated in the README, datasets of arbitrary sizes can be generated on a single machine or by parallelizing the expressway generation on multiple machines. But, after generation, these must be cleaned (if desired) and combined.

**These are the scripts and commands used for cleaning raw files--run on the individual raw files. (Any number of additional steps can be added as desired.)**

```
time java dataval <infile> <outfile>
time java datarm2 <infile> <outfile>       # remove carids with only <=2 tuples
time java datamakeexit <infile> <outfile>  # make the last type 0 record an exit lane tuple
mv <outfile> <infile>.clean
```
The raw files above can be cleansed in parallel on n machines.

On each machine, or a single machine, place the raw files in a new directory and, from the directory with the Java classes, run:
```
for f in $(ls <raw-file-dir>) ; do time java dataval $f t1 ; time java datarm2 t1 t2 ; time java datamakeexit t2 t3 ; mv t3 $f.clean ; done
```
Use absolute paths.

After cleaning, move all the clean files into a new directory to merge the n "clean" files.
```
time java datacombine <dir-of-clean-files> <outfile>
```
The above command will emit the maximum carid, which is needed to create the historical tolls file.

Then create the historical toll file and the random re-entrant cars.

Create the toll file.
```
time java historical_tolls <numxways> <maxcarid> <outfile>
```
NOTE: a 'number of expressways == 3' for historical_tolls will yield xways from 0 - 2

The finding of re-entrants, which was previously the slowest step, now happens in minutes.
The first step creates the carsandtimes table, a job originally performed in a database. This version of making the carsandtimes table, finding replacements, and making replacements is much, much faster than the database-dependent original or the logic+scripts from v0.1. The overlap is set to 10 (it can be more or less) and determines the percentage of cars to use as the candidate pool for potentially re-entrant cars.

Create the carsandtimes table.
```
time java create_carsandtimes <combined-file> <overlap> <outfile>
```
Create the carstoreplace file. (This step only took minutes for a 250 expressway set.)
```
time java create_carstoreplace <infile (cars_and_times)> <outfile> <numxways>
```
Now perform the actual replacements. Again, no database is necessary, but we split into n separate xway files so we can time-order them into a single file later. The output is n xway files named like `replaced.part-N`: the outfile prefix, a dash, and an int.
```
time java replacecars <carstoreplace-file> <combined-file> <outfile-prefix>
```
Move the files to a new directory to hold the n individual xways.
```
mkdir temp
mv replaced.part* temp/ ;
```
Combine the n parts into a single, time-ordered file.
```
time java combine_after_replace temp/ <outfile>
```
Finally, clean the generated tolls to match the tuples present in the position reports.
```
time java fixtolls <raw-toll-file> <final-data-file> <final-toll-file>
```
Make sure you have enough space on your hard drives to handle all files and temp files. Each xway will generate ~1GB of position data and ~300MB of toll data. Using multiple disks is recommended for temp and final file output. E.g. for a 250 xway set: 250GB for individual clean files, 250GB for the combined clean file, 82-87GB for the raw toll file, 250GB for the split replaced parts, 250GB for the final file, and 82-87GB for the final toll file, for a total of roughly 1.5TB of free space to generate a 250 xway set without deleting intermediate files.

All the commands (after having a directory of cleaned files) can be combined into a single-line bash call as shown below.
`datadrive` and `datadrive2` are my data directories.
NOTE: I set an env variable to eventually hold the maxcarid, `cd` into the directory containing the Java class files, and use full paths for all files and directories.
```
maxcarid=0 ; cd /datadrive/java/LRDataGen/out/production/LRDataGen/ ; \
time maxcarid=$(java datacombine /datadrive/tmp_clean /datadrive2/3.combined) ; \
time java historical_tolls 3 $maxcarid /datadrive2/3.tolls.raw ; \
time java create_carsandtimes /datadrive2/3.combined 10 /datadrive2/3.carsandtimes ; \
time java create_carstoreplace /datadrive2/3.carsandtimes /datadrive2/3.carstoreplace 3 ; \
time java replacecars /datadrive2/3.carstoreplace /datadrive2/3.combined /datadrive2/3.replaced.part ; \
mkdir /datadrive2/3.temp ; \
mv /datadrive2/3.replaced.part* /datadrive2/3.temp ; \
time java combine_after_replace /datadrive2/3.temp /datadrive/3h3x.dat ; \
time java fixtolls /datadrive2/3.tolls.raw /datadrive/3h3x.dat /datadrive/3h3x.tolls.dat
```
Timings for two 3 hour 3 xway runs:
```
[datacombine]
real    2m9.941s    2m14.096s
user    2m1.897s    2m4.121s
sys     0m9.083s    0m0.301s

[historical_tolls]
real    0m29.181s   0m27.873s
user    0m18.123s   0m18.440s
sys     0m2.051s    0m2.042s

[create_carsandtimes]
real    1m12.947s   1m12.847s
user    1m12.000s   1m12.160s
sys     0m2.498s    0m2.536s

[create_carstoreplace]
real    0m1.411s    0m1.669s
user    0m2.509s    0m2.883s
sys     0m0.114s    0m0.138s

[replacecars]
real    2m8.118s    1m57.721s
user    1m43.463s   1m49.996s
sys     0m10.084s   0m8.635s

[combine_after_replace]
real    1m25.026s   1m30.243s
user    1m13.299s   1m12.807s
sys     0m11.676s   0m11.861s

[fixtolls]
real    0m58.770s   0m57.163s
user    0m52.739s   0m51.802s
sys     0m5.515s    0m4.926s

Total:
(real)  8m22.394s   8m21.612s
```

--------------------------------------------------------------------------------
/JavaGenerator/combine_after_replace.java:
--------------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;

/**
 * Created by Sung Kim on 3/4/2016.
 *
 * After using replacecars.java we have multiple files that need to be combined in time order.
 * *****************************************************************************************************************
 * This step, along with replacecars (even if there are no cars to replace) is ABSOLUTELY NECESSARY because the non-
 * database version of combining files does NOT time order the records as a database normally would if the records
 * were inserted into a database and then pulled ORDER(ed) BY time.
 * *****************************************************************************************************************
 * That means opening one writer but having N readers.
 * Each reader will write all its time values for a given time/second before moving onto the next time/second.
 *
 * Usage: java combine_after_replace <dir-of-part-files> <outfile>
 */
public class combine_after_replace {
    public static void main(String[] args) throws Exception {
        File dir = new File(args[0]);
        File outputFile = new File(args[1]);

        String line;
        String[] tokens;

        int time, currTime = 0; // We start at time 0, of course.

        // Make sure we are pulling from a directory of split files. We are not currently checking for file validity.
        // The user needs to validate no other files from files output from replacecars.java are in the folder.
        if (!dir.isDirectory()) {
            System.err.println(dir + " is not a directory");
            System.exit(1);
        }

        // A list of readers, one for each file.
        ArrayList<BufferedReader> readers = new ArrayList<>();
        for (File f : dir.listFiles()) {
            readers.add(new BufferedReader(new FileReader(f)));
        }
        // An array of 'lastTimes seen' for each reader.
        int[] lastTimes = new int[readers.size()];

        // When we transition from one sec to the next we need to keep the first line for writing on the next loop.
        String[] firstLines = new String[readers.size()];

        int index;
        PrintWriter writer = new PrintWriter(outputFile);
        while (true) {

            // The index of the reader we're currently reading from
            index = 0;

            // We just cycle through the readers, reading complete sections of a given second at a time,
            // for all the files, each round.
            for (BufferedReader reader : readers) {
                //System.out.println("Reading from: " + reader); // DEBUG/FEEDBACK

                // Write out a first line for this second, if there's one available.
                if (firstLines[index] != null) {
                    writer.println(firstLines[index]);
                    firstLines[index] = null; // Clear it so an exhausted reader doesn't re-write it every second.
                }

                // Now read for the given second from this file.
                while (true) {
                    line = reader.readLine();

                    // If we get to the end of the file, go ahead and try the next file.
                    // This is the EOF condition.
                    if (line == null) {
                        index++;
                        break; // EOF
                    }
                    tokens = line.split(",");
                    time = Integer.parseInt(tokens[1]);

                    // Transitioning to a new time/second,
                    // so save the first line,
                    // and let the next reader read.
                    if (time > lastTimes[index]) {
                        lastTimes[index] = time;
                        //System.out.println("New second for reader: " + reader.toString()); // DEBUG/FEEDBACK
                        firstLines[index++] = line;
                        // Breaks from this current while loop that is reading this current file.
                        // Goes to choosing a new BufferedReader.
                        break;
                    }

                    // This file is still on the current time/second so write the given line for the current file.
                    writer.println(line);
                    // Update the last time for this current reader/file.
                    lastTimes[index] = time;
                }
            }

            // After reading (and writing) all the files for a given second, increment the time and read all the files
            // again for the next second.
            currTime++;

            if (currTime > 10784) break; // Done.
        }
        writer.close();
    }
}

--------------------------------------------------------------------------------
/JavaGenerator/create_carsandtimes.java:
--------------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Created by Sung Kim on 3/2/2016.
 *
 * Create the carsandtimes after reading the single, clean, combined file.
 * The carsandtimes is simply each carid with its entry time, its exit time, and its xway.
 * This will be used by create_carstoreplace.java to create cars (by removing others) that re-enter the simulation on
 * a different xway, sometime later in the simulation.
 *
 * This is important as it prevents easy optimization via the assumption that a car's data can be localized to a single
 * xway-dir.
 *
 * We're only choosing cars that will _potentially_ be re-entrant based on a percentage of the total cars.
 * Thus, if the overlap factor is 10, then ~10% of the cars will be placed into the carsandtimes files.
 *
 * *********************************************************
 * This only creates the carsandtimes NOT the carstoreplace.
 * *********************************************************
 * java create_carsandtimes <infile> <overlap> <outfile>
 */
public class create_carsandtimes {

    public static void main(String[] args) throws Exception {

        File infile = new File(args[0]);
        int overlap = Integer.parseInt(args[1]);
        File outfile = new File(args[2]);

        // Hold the [carid, ["Enter"|"Exit"|"Xway":value]].
        HashMap<Integer, HashMap<String, Integer>> cars = new HashMap<>();
        // Since we're choosing only a percentage of cars to be candidate cars.
        HashSet<Integer> rejects = new HashSet<>();

        BufferedReader reader = new BufferedReader(new FileReader(infile));
        String line;
        String[] tokens;
        int carid, time, xway;
        int r;
        PrintWriter writer = new PrintWriter(outfile);

        // Read the combined file and build out the cars and times (carid [entertime, exittime, xway]).
        while ((line = reader.readLine()) != null) {
            tokens = line.split(",");
            carid = Integer.parseInt(tokens[2]);
            time = Integer.parseInt(tokens[1]);
            xway = Integer.parseInt(tokens[4]);

            // Don't bother to check the cars that are already rejected.
            if (!rejects.contains(carid)) {
                // Update each exit time with the latest, if the car is not previously rejected.
                if (cars.containsKey(carid)) cars.get(carid).put("Exit", time);
                // A new car, never before seen. Check if this car will be a candidate.
                else {
                    r = (int) (Math.random() * 1000) % 100 + 1;
                    if (r < overlap) {
                        cars.put(carid, new HashMap<String, Integer>());
                        cars.get(carid).put("Enter", time);
                        cars.get(carid).put("Exit", time);
                        cars.get(carid).put("Xway", xway);
                    } else rejects.add(carid);
                }
            }
        }

        // For all the created candidate cars, print out the carsandtimes file.
        for (int cid : cars.keySet()) {
            writer.println(cid + "," + cars.get(cid).get("Enter") + "," + cars.get(cid).get("Exit") + "," + cars.get(cid).get("Xway"));
        }
        writer.close();
    }
}

--------------------------------------------------------------------------------
/JavaGenerator/create_carstoreplace.java:
--------------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Created by Sung Kim on 3/2/2016.
 * ***************************************************************************
 * create_carsandtimes.java must be run first to create the carsandtimes file.
 * ***************************************************************************
 * Create the carstoreplace ***after reading the carsandtimes file from create_carsandtimes.java***.
 * java create_carstoreplace <infile> <outfile> <numxways>
 */
public class create_carstoreplace {

    public static final int NUM_TRIES = 100; // This is how many attempts we make for each car in carsandtimes.

    public static void main(String[] args) throws Exception {
        File infile = new File(args[0]);
        File outfile = new File(args[1]);
        int numXways = Integer.parseInt(args[2]);

        // Hold carid's, and a HashMap of the corresponding carsandtimes table row.
        HashMap<Integer, HashMap<String, Integer>> cars = new HashMap<>();
        // We don't want to reconsider cars that have already been selected to be replaced.
        HashSet<Integer> used = new HashSet<>();
        // This array simply holds all the HashMaps found in cars (which should probably be named carsandtimes).
        // This allows later for a randomly generated index to pull a random car's information from all the cars.
        ArrayList<HashMap<String, Integer>> keySetToArray = new ArrayList<>();

        BufferedReader reader = new BufferedReader(new FileReader(infile));
        PrintWriter writer = new PrintWriter(outfile);

        String line;
        String[] tokens;
        int carid, entertime, exittime, xway;
        int rTime, rCarIndex, rCar;
        int setSize;

        // Read in the carsandtimes file.
        // Key: carid. Value: a HashMap with (carid, entertime, exittime, xway).
        while ((line = reader.readLine()) != null) {
            tokens = line.split(",");
            carid = Integer.parseInt(tokens[0]);
            entertime = Integer.parseInt(tokens[1]);
            exittime = Integer.parseInt(tokens[2]);
            xway = Integer.parseInt(tokens[3]);

            cars.put(carid, new HashMap<String, Integer>());
            cars.get(carid).put("CarId", carid);
            cars.get(carid).put("Enter", entertime);
            cars.get(carid).put("Exit", exittime);
            cars.get(carid).put("Xway", xway);
        }

        // Do a non-threaded version first.
        for (int cid : cars.keySet()) {
            keySetToArray.add(cars.get(cid));
        }
        setSize = cars.keySet().size();

        // Using the keys, which are the carids, from carsandtimes, for each key:
        // randomly try NUM_TRIES times to find a candidate replacement car from the available carsandtimes.
        for (int cid : cars.keySet()) {
            // Choose a random time distance that the candidate to-be replaced needs to be from the current car.
            rTime = (int) (Math.random() * 1000) + 61;
            // Try to find a candidate.
            for (int i = 0; i < NUM_TRIES; i++) {
                // Get a random index from the cars in carsandtimes.
                rCarIndex = (int) (Math.random() * setSize);
                // Get the car at this index in keySetToArray. (Really the values behind each key in the cars Map.)
                rCar = keySetToArray.get(rCarIndex).get("CarId");
                // Check if we need to take into account separate xways.
                if (numXways > 1) {
                    // If we do need to account for > 1 xway, but the candidate car is in the same xway, try again.
                    // NOTE: compare with equals(), not ==; these are boxed Integers.
                    if (cars.get(rCar).get("Xway").equals(cars.get(cid).get("Xway"))) continue;
                }
                // If the candidate hasn't been chosen yet,
                // and the Enter time for the candidate meets the time distance criteria from the Exit time of the
                // current car, then choose this candidate to be replaced by the current car; and the current car
                // will now re-enter the simulation at a different time, xway.
                // We also place both the current car and the candidate car into the 'used' Set so neither can be
                // chosen as a candidate car for future current cars.
                if ((!used.contains(rCar)) && rCar != cid && (cars.get(rCar).get("Enter") > (cars.get(cid).get("Exit") + rTime))) {
                    used.add(rCar);
                    used.add(cid);
                    writer.println(cid + "," + rCar);
                    break;
                }
            }
        }
        writer.close();
    }
}

--------------------------------------------------------------------------------
/JavaGenerator/datacombine.java:
--------------------------------------------------------------------------------
import java.io.*;

/**
 * Created by Sung Kim on 3/2/2016.
 * Based on datacombine.py.
 *
 * A Java version to take a directory of cleaned linear road data files output from the MITSIM simulator
 * and combine them into a single large file.
 *
 * This is necessary because each linear road file stands as an independent file with its own max car and query id.
 *
 * The one thing this combination doesn't do is properly set the time for the combined file.
 * So, the combined file will have n 0...10784, 0...10784 segments.
 * This is reconciled after finding the replacement cars in 'replacecars.java' and 'combine_after_replace.java'.
 * The first of the above splits out the files into individual files per xway.
 * The latter combines them into properly time-ordered fashion in a single file.
 *
 * Usage: java datacombine <dir-of-clean-files> <outfile>
 *
 */
public class datacombine {

    // The number of fields in a line of input.
    final static int NUM_FIELDS = 15;

    public static void main(String[] args) throws Exception {

        // Simple check for valid number of input parameters.
        if (args.length != 2) {
            System.err.println("Usage: java datacombine <dir-of-clean-files> <outfile>");
            System.exit(1);
        }

        // The directory that holds the cleaned files.
        File dir = new File(args[0]);
        // The name of the new single combined outfile.
        File outfile = new File(args[1]);

        // Ensure we're working with a directory, otherwise exit.
        if (!dir.isDirectory()) {
            System.err.println(dir.getName() + " is not a directory.");
            System.exit(1);
        }

        // To write out the combined file.
        PrintWriter writer = new PrintWriter(outfile);

        BufferedReader reader;
        String line;
        String[] tokens;
        // Hold Integer versions of the String tokens we get from splitting/tokenizing the line.
        int[] itokens = new int[NUM_FIELDS];

        StringBuilder out_token = new StringBuilder();

        // Overall max's.
        int maxCarId = 0;
        int maxQId = 0;
        // Used for xway number.
        int fileCount = 0;
        // Current file max's.
        int curMaxCarId;
        int curMaxQId;

        // Open each file from the directory and read/process it.
        for (File f : dir.listFiles()) {
            // Create a reader for the current file.
            reader = new BufferedReader(new FileReader(f));
            // Reset the current maxes for the carid and queryid.
            curMaxCarId = 0;
            curMaxQId = 0;

            // Read the file and make the required adjustments.
            while ((line = reader.readLine()) != null) {
                tokens = line.split(",");

                // Convert the String tokens into an array of int's.
                int i = 0;
                for (String t : tokens) {
                    itokens[i++] = Integer.parseInt(t);
                }

                // Update current max's for carid's and queryid's if applicable.
                if (itokens[2] > curMaxCarId) curMaxCarId = itokens[2];
                if (itokens[9] > curMaxQId) curMaxQId = itokens[9];

                // Adjust the carid's, queryid's, and xways for data creations with greater than a single file/xway.
                if (fileCount > 0) {
                    itokens[2] += maxCarId;
                    if (itokens[0] != 0) itokens[9] += maxQId; // Update queryid's only for non-Type 0 notifications.
                    if (itokens[0] == 0) itokens[4] = fileCount; // Update the xway number.
                }

                // Write the newly adjusted line to the outfile.
                dataval.printITokensToFile(writer, out_token, itokens);
            }

            // Update the overall max's to be used.
            maxCarId += curMaxCarId + 1;
            maxQId += curMaxQId + 1;
            fileCount++;
        }
        writer.close();

        // Print the max carid so we can use it to build the historical tolls file.
        System.out.println(maxCarId - 1);
    }
}

--------------------------------------------------------------------------------
/JavaGenerator/datamakeexit.java:
--------------------------------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;

/**
 * Created by Sung Kim on 3/2/2016.
 * ====
 * Note: This step isn't really necessary anymore and can be removed.
 * ====
 * Based on datamakeexit.py: Ensure that all vehicles get off the xway.
 * Run after dataval.java and datarm2.java.
 * Usage: java datamakeexit <infile> <outfile>
 */
public class datamakeexit {
    // Arbitrary max carid for running 1000 cars / min / segment as set in <>.pl.
    static final int MAX_CARID = 300000;

    // Just throwing the Exception away for now.
    public static void main(String[] args) throws Exception {

        // Create the array to hold the max times for each car.
        int[] lasttimes = new int[MAX_CARID];

        String input_file_name = args[0];
        String output_file_name = args[1];

        File input_file = new File(input_file_name);
        File output_file = new File(output_file_name);

        // For reading the infile twice and writing out the modified data file.
        BufferedReader reader;
        PrintWriter writer;

        // For reading the lines.
        String line;
        String[] tokens;
        StringBuilder out_token = new StringBuilder();

        // Read the file and find the last time for each vehicle.
        reader = new BufferedReader(new FileReader(input_file));
        // We're timing how long it takes to read the file each time.
        long st = System.nanoTime();
        while ((line = reader.readLine()) != null) {
            tokens = line.split(",");
            int carid = Integer.parseInt(tokens[2]);
            int time = Integer.parseInt(tokens[1]);
            lasttimes[carid] = time;
        }
        reader.close();
        System.err.println("Time for first read: " + ((System.nanoTime() - st) / 1000000));

        // Go back to the beginning of the file and re-read,
        // when the last notification for a car is seen modify the line to make it an exiting notification.
        // We time again.
        st = System.nanoTime();
        reader = new BufferedReader(new FileReader(input_file));
        writer = new PrintWriter(output_file);
        while ((line = reader.readLine()) != null) {
            tokens = line.split(",");
            int carid = Integer.parseInt(tokens[2]);
            int time = Integer.parseInt(tokens[1]);
            // Only last appearing type 0 queries need adjustment.
64 | if (time == lasttimes[carid] && tokens[0].equals("0")) { 65 | tokens[3] = "10"; 66 | tokens[5] = "4"; 67 | } 68 | dataval.printTokensToFile(writer, out_token, tokens); 69 | } 70 | reader.close(); 71 | writer.close(); 72 | System.err.println("Time for second read: " + ((System.nanoTime() - st)/1000000)); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /JavaGenerator/datarm2.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | 6 | /** 7 | * Created by Sung Kim on 3/2/2016. 8 | * Based on datarm2.py: Remove carid's with only one or two records. 9 | * Usage: java datarm2 <input file> <output file> 10 | */ 11 | public class datarm2 { 12 | static final int MAX_CARID = 300000; 13 | 14 | public static void main(String[] args) throws Exception { 15 | long st = System.nanoTime(); 16 | 17 | String input_file_name = args[0]; 18 | String output_file_name = args[1]; 19 | 20 | // Hold carids and the number of times the carid appears in this file. 21 | int[] counts = new int[MAX_CARID]; 22 | 23 | File input_file = new File(input_file_name); 24 | File output_file = new File(output_file_name); 25 | 26 | BufferedReader reader; 27 | PrintWriter writer; 28 | 29 | String line; 30 | String[] tokens; 31 | StringBuilder out_token = new StringBuilder(); 32 | 33 | // Read the file the first time to get all the counts. 34 | reader = new BufferedReader(new FileReader(input_file)); 35 | while ((line = reader.readLine()) != null) { 36 | tokens = line.split(","); 37 | int carid = Integer.parseInt(tokens[2]); 38 | if (counts[carid] == 0) { 39 | counts[carid] = 1; 40 | } else { 41 | counts[carid]++; 42 | } 43 | } 44 | reader.close(); 45 | System.err.println("Time for first read: " + ((System.nanoTime() - st)/1000000)); 46 | 47 | // Read the file again and ignore those carid's that don't have more than two records.
48 | st = System.nanoTime(); 49 | reader = new BufferedReader(new FileReader(input_file)); 50 | writer = new PrintWriter(output_file); 51 | while ((line = reader.readLine()) != null) { 52 | tokens = line.split(","); 53 | int carid = Integer.parseInt(tokens[2]); 54 | if (counts[carid] > 2) { // Ensure this carid has > 2 records. 55 | if (!tokens[0].equals("4")) { // Ignore type 4's. 56 | if (tokens[0].equals("3")) { // Redundant if run through dataval.py, but check for day 0 type 3's. 57 | if (tokens[14].equals("0")) continue; 58 | } 59 | dataval.printTokensToFile(writer, out_token, tokens); 60 | } 61 | } 62 | } 63 | reader.close(); 64 | writer.close(); 65 | 66 | System.err.println("Time for second read: " + ((System.nanoTime() - st)/1000000)); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /JavaGenerator/dataval.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | 6 | /** 7 | * Created by Sung Kim on 3/2/2016. 8 | * Based on dataval.py: given a single raw mitsim file, performs the following: 9 | * 1) check for position reports that are not 30 secs apart, and simply report. 10 | * 2) ensure a car does not reappear after exiting. 11 | * 3) remove negative positions and segments. 12 | * 4) remove type 3 queries with a day of '0' if any. 13 | * 14 | * This script will usually be used in conjunction with other scripts in a chain. 15 | * See: https://github.com/walmart/linearroad/GENERATEDATA.md 16 | * 17 | * Usage: java dataval <raw data file> <output file> 18 | */ 19 | public class dataval { 20 | /** 21 | * A max to comfortably hold up to the max possible car id when using files generated 22 | * with 1000 cars / min / seg. 23 | */ 24 | static final int MAX_CARID = 300000; 25 | 26 | /** 27 | * We put some utility functions here.
28 | */ 29 | /** 30 | * Print an array of tokens as Strings to stdout. 31 | * @param out_token 32 | * @param tokens 33 | */ 34 | public static void printTokens(StringBuilder out_token, String[] tokens) { 35 | out_token.setLength(0); 36 | for (String token : tokens) { 37 | out_token.append(token+","); 38 | } 39 | System.out.println(out_token.deleteCharAt(out_token.length()-1)); 40 | } 41 | 42 | /** 43 | * Print an array of tokens as Strings to a PrintWriter. 44 | * @param writer 45 | * @param out_token 46 | * @param tokens 47 | */ 48 | public static void printTokensToFile(PrintWriter writer, StringBuilder out_token, String[] tokens) { 49 | out_token.setLength(0); 50 | for (String token : tokens) { 51 | out_token.append(token+","); 52 | } 53 | writer.println(out_token.deleteCharAt(out_token.length()-1)); 54 | } 55 | 56 | /** 57 | * Print an array of tokens as Integers to a PrintWriter. 58 | * @param writer 59 | * @param out_token 60 | * @param tokens 61 | */ 62 | static void printITokensToFile(PrintWriter writer, StringBuilder out_token, int[] tokens) { 63 | out_token.setLength(0); 64 | for (int token : tokens) { 65 | out_token.append(token+","); 66 | } 67 | writer.println(out_token.deleteCharAt(out_token.length()-1)); 68 | } 69 | 70 | /** 71 | * Clean the file. 72 | * @param args 73 | * @throws Exception 74 | */ 75 | public static void main(String[] args) throws Exception { 76 | // For timing. 77 | long st = System.nanoTime(); 78 | 79 | String raw_file_name = args[0]; 80 | String out_file_name = args[1]; 81 | 82 | // For feedback. 83 | System.out.println("Validating data file: " + raw_file_name); 84 | 85 | // We're using a simple array, which will end up being only half full, for performance. 86 | // Each carid easily maps into an array index and we only need to track its last time. 87 | int[] cars = new int[MAX_CARID]; 88 | // We need to track already exited cars to prevent magic re-entries.
89 | int[] exited = new int[MAX_CARID]; 90 | 91 | File raw_file = new File(raw_file_name); 92 | File out_file = new File(out_file_name); 93 | 94 | BufferedReader reader; 95 | PrintWriter writer; 96 | 97 | reader = new BufferedReader(new FileReader(raw_file)); 98 | writer = new PrintWriter(out_file); 99 | 100 | String line; 101 | StringBuilder out_token = new StringBuilder(); 102 | 103 | while ((line = reader.readLine()) != null) { 104 | String[] tokens = line.split(","); 105 | 106 | // Type 0 notifications. 107 | if (tokens[0].equals("0")) { 108 | int time = Integer.parseInt(tokens[1]); 109 | int carid = Integer.parseInt(tokens[2]); 110 | int lane = Integer.parseInt(tokens[5]); 111 | int seg = Integer.parseInt(tokens[7]); 112 | 113 | // If this car has already exited ignore this notification and move to the next one. 114 | if (exited[carid] != 0) { 115 | continue; 116 | } 117 | 118 | // Java initializes array values to 0, so if this is the first time this car has been seen 119 | // set its time to the time parsed from the current line. 120 | if (cars[carid] == 0) { 121 | cars[carid] = time; 122 | } else { // Otherwise, since this car has been seen before check if this new line has a time that's not 123 | // 30 seconds greater than the last registered time. 124 | // One of the reasons this appears to happen is when the mitsim generator creates stopped cars 125 | // or accidents. 126 | if (cars[carid] != time - 30) { 127 | // We don't do anything, we just make a note of it. 128 | System.out.println(cars[carid] + "-" + time); 129 | System.out.println("Time error for car " + carid + " at time " + time); 130 | } 131 | // Update the car with this 'last notification' time. 132 | cars[carid] = time; 133 | } 134 | 135 | // If the car is exiting, note it in the exited array. 136 | if (tokens[5].equals("4")) { 137 | exited[carid] = time; 138 | } 139 | 140 | // Fix any negative segments in the data; both the segment and the x-position.
141 | if (seg < 0) { 142 | printTokens(out_token, tokens); 143 | tokens[7] = "0"; 144 | tokens[8] = "0"; 145 | } 146 | } else if (tokens[0].equals("2")) { // Ignore Type 2 notifications. 147 | } else if (tokens[0].equals("3")) { // Ignore 0 day Type 3's. 148 | if (tokens[14].equals("0")) { // A Type 3 query should be asking for days > 0. Day 0 is really a Type 2. 149 | // So, skip if there's a day 0 Type 3. 150 | continue; 151 | } 152 | } 153 | 154 | // Write out the line with any modifications made. 155 | printTokensToFile(writer, out_token, tokens); 156 | } 157 | reader.close(); 158 | writer.close(); 159 | 160 | System.out.println("Time to run dataval.java: " + ((System.nanoTime() - st)/1000000)); 161 | } 162 | } 163 | 164 | -------------------------------------------------------------------------------- /JavaGenerator/fixtolls.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | import java.util.HashMap; 6 | 7 | /** 8 | * Created by Sung Kim on 3/4/2016. 9 | * 10 | * This is also a non-database version of cleaning the tolls file. 11 | * This is also possible because we only need the type3's from the final data file. 12 | * How many are there? 13 | * Per 3h1x => 1% ~> 20K. 14 | * Thus, 250x => 5000K => 5M (that's not bad). 15 | *

16 | * Usage: java fixtolls <raw toll file> <final data file> <output file> 17 | */ 18 | public class fixtolls { 19 | 20 | public static void main(String[] args) throws Exception { 21 | File rawTollFile = new File(args[0]); 22 | File finalDataFile = new File(args[1]); 23 | File outputFile = new File(args[2]); 24 | 25 | // Need carid, day, xway 26 | // Key: carid+"-"+day -- Value: xway 27 | HashMap<String, String> type3sInData = new HashMap<>(); 28 | String line; 29 | String key; 30 | String[] tokens; 31 | 32 | // Read through the main data file, the final one, and extract all Type 3 notifications. 33 | BufferedReader reader = new BufferedReader(new FileReader(finalDataFile)); 34 | while ((line = reader.readLine()) != null) { 35 | if (line.startsWith("3")) { 36 | tokens = line.split(","); 37 | key = tokens[2] + "-" + tokens[14]; 38 | type3sInData.put(key, tokens[4]); 39 | } 40 | } 41 | reader.close(); 42 | 43 | // Read through the newly generated toll file and if there's a matching key from the main data input file 44 | // ensure that the xways will match. Note, this seems a bit trivial but since the queries have an xway and it 45 | // forms part of the key of the Type 3 query, we need them to match. 46 | PrintWriter writer = new PrintWriter(outputFile); 47 | reader = new BufferedReader(new FileReader(rawTollFile)); 48 | while ((line = reader.readLine()) != null) { 49 | tokens = line.split(","); 50 | key = tokens[0] + "-" + tokens[1]; 51 | if (type3sInData.containsKey(key)) { 52 | tokens[2] = type3sInData.get(key); 53 | } 54 | writer.println(tokens[0] + "," + tokens[1] + "," + tokens[2] + "," + tokens[3]); 55 | } 56 | writer.close(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /JavaGenerator/historical_tolls.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.PrintWriter; 3 | 4 | /** 5 | * Created by Sung Kim on 3/2/2016.
6 | * 7 | * NOTE: num xways is the number of xways, not the 0-indexed largest xway. So, 3 files => java historical_tolls 3 8 | * for xway ids from 0-2. 9 | * 10 | * Usage: java historical_tolls <num xways> <max carid> <output file> 11 | * 12 | */ 13 | 14 | public class historical_tolls { 15 | 16 | final static int NUM_DAYS_IN_HISTORY = 70; 17 | 18 | public static void main(String[] args) throws Exception { 19 | 20 | // Simple check for valid number of input parameters 21 | if (args.length != 3) { 22 | System.out.println("Usage: java historical_tolls <num xways> <max carid> <output file>"); 23 | System.exit(1); 24 | } 25 | 26 | int maxXway = Integer.parseInt(args[0]); 27 | int maxCarId = Integer.parseInt(args[1]); 28 | File outfile = new File(args[2]); 29 | 30 | int i, day, toll, xway; 31 | maxCarId++; // we want to include the max carid, [0, maxCarId], in the result set 32 | PrintWriter writer = new PrintWriter(outfile); 33 | 34 | for (i = 0; i < maxCarId; i++) { 35 | for (day = 1; day < NUM_DAYS_IN_HISTORY; day++) { 36 | toll = (int) (Math.random() * 1000) % 90 + 10; 37 | xway = (int) (Math.random() * 1000) % maxXway; 38 | // Using a writer versus a redirected println for performance 39 | writer.println(i + "," + day + "," + xway + "," + toll); 40 | //System.out.println(i + "," + day + "," + xway + "," + toll); 41 | } 42 | } 43 | writer.close(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /JavaGenerator/replacecars.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | 8 | /** 9 | * Created by Sung Kim on 3/3/2016. 10 | * 11 | * A NON-database way to replacecars.
12 | * create_carsandtimes.java and create_carstoreplace.java can be used to create the files that the original Python 13 | * version uses to then do the actual replacements on the cars in the main data file, which would have been loaded 14 | * into a database. The database route proved too slow and added a layer of required infrastructure, so it is 15 | * better to simply do everything outside of the database, in files and scripts. 16 | * 17 | * This is performant since the simple act of loading a file into a db takes time. 18 | * 19 | * This replacecars.java also writes out each xway to a separate file to be combined later in time order by 20 | * combine_after_replace.java. This last step is actually NECESSARY as the non-database version of creating a 21 | * combined file does NOT order the cleaned, combined file. 22 | * 23 | * Usage: java replacecars <carstoreplace file> <combined data file> <output file base name> 24 | */ 25 | public class replacecars { 26 | public static void main(String[] args) throws Exception { 27 | File carsToReplaceFile = new File(args[0]); 28 | File combinedFile = new File(args[1]); 29 | String outfileBaseName = args[2]; 30 | 31 | BufferedReader reader; 32 | PrintWriter writer; 33 | 34 | String tokens[]; 35 | String line; 36 | StringBuilder out_token = new StringBuilder(); 37 | 38 | int xway, time, lastTime; 39 | 40 | reader = new BufferedReader(new FileReader(carsToReplaceFile)); 41 | // Hold the carstoreplace file in a map. 42 | // The file will have the re-entrant car at pos. 0 and the to-be-replaced car at pos. 1. 43 | HashMap<String, String> carsToReplace = new HashMap<>(); 44 | while ((line = reader.readLine()) != null) { 45 | tokens = line.split(","); 46 | // We're looking for the car to be replaced as the key, not the re-entrant car. 47 | // When we find the car to be replaced, tokens[1], we'll replace that car with the re-entrant car, 48 | // which is tokens[0].
49 | carsToReplace.put(tokens[1], tokens[0]); 50 | } 51 | reader.close(); 52 | 53 | xway = 0; 54 | time = 0; 55 | lastTime = 0; 56 | // Read the cleaned, combined (but not yet time-ordered) main data file. 57 | reader = new BufferedReader(new FileReader(combinedFile)); 58 | // Start with the initial file. 59 | writer = new PrintWriter(new File(outfileBaseName+"-"+xway)); 60 | while ((line = reader.readLine()) != null) { 61 | tokens = line.split(","); 62 | 63 | // Make the replacement of the car if it's a car that needs to be replaced (by the value, which is the 64 | // car that will now be re-entrant). 65 | if (carsToReplace.containsKey(tokens[2])) { 66 | tokens[2] = carsToReplace.get(tokens[2]); 67 | } 68 | 69 | // We get the time to be able to check for file transitions. 70 | time = Integer.parseInt(tokens[1]); 71 | // File transition in the original file, one xway to the next. 72 | // If there is only one xway, this segment will not run, which is fine. 73 | if (lastTime == 10784 && time == 0) { 74 | writer.close(); 75 | xway++; 76 | writer = new PrintWriter(new File(outfileBaseName+"-"+xway)); 77 | } 78 | lastTime = time; 79 | 80 | // The files are sequential, so all the Type 2's and 3's simply flow with the proper xway and don't have 81 | // to be accounted for here, in terms of placement. 82 | tokens[4] = Integer.toString(xway); 83 | dataval.printTokensToFile(writer, out_token, tokens); 84 | } 85 | reader.close(); 86 | writer.close(); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /JavaValidator/CompareFiles.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.util.HashMap; 5 | 6 | /** 7 | * Created by Sung Kim on 5/30/2016. 8 | * Compare the output from a product run versus the expected output from the Validator.
9 | * 10 | We check existence of an expected line of output and then check the values of that output. 11 | 12 | We don't check for extra lines in the output. 13 | 14 | Usage: java CompareFiles <validator output file> <product output file> 15 | Caveat: This version will only work up to the limits of the memory of the machine on which it's run. 16 | */ 17 | public class CompareFiles { 18 | /** 19 | * A convenience class to help us work with the data. 20 | */ 21 | private static class KVTuple { 22 | String key; 23 | String value; 24 | 25 | KVTuple(String key, String value) { 26 | this.key = key; 27 | this.value = value; 28 | } 29 | 30 | @Override 31 | public String toString() { 32 | return key + " -- " + value; 33 | } 34 | } 35 | 36 | /** 37 | * Based on the type of the input line, return the properly deconstructed key and values. 38 | * This de-multiplexes the multiplexing of all types into a single data line. 39 | * @param tokens 40 | * @return 41 | */ 42 | private static KVTuple getKVFromType(String[] tokens) { 43 | String key = null; 44 | String value = null; 45 | // [], square brackets, represent the "key" for each Type of validator and solution output.
46 | // [0, carid, time], proc_time, lav, toll 47 | // [1, time], proc_time, [xway, acc_seg, dir, carid] 48 | // [2, time], proc_time, toll_time, [qid], balance 49 | // [3, time], proc_time, [qid, toll] 50 | switch (tokens[0]) { 51 | case "0": 52 | key = tokens[0] + ";" + tokens[1] + ";" + tokens[2]; 53 | value = tokens[3] + ";" + tokens[4] + ";" + tokens[5]; 54 | break; 55 | case "1": 56 | key = tokens[0] + ";" + tokens[1] + ";" + tokens[3] + ";" + tokens[4] + ";" + tokens[5] + ";" + tokens[6]; 57 | value = tokens[2]; 58 | break; 59 | case "2": 60 | case "5": 61 | key = tokens[0] + ";" + tokens[1] + ";" + tokens[4]; 62 | value = tokens[2] + ";" + tokens[3] + ";" + tokens[5]; 63 | break; 64 | case "3": 65 | key = tokens[0] + ";" + tokens[1] + ";" + tokens[3] + ";" + tokens[4]; 66 | value = tokens[2]; 67 | break; 68 | } 69 | return new CompareFiles.KVTuple(key, value); 70 | } 71 | 72 | /** 73 | * Create a KVTuple object from a parse line of validator output and place into the validator output Map. 74 | * 75 | * @param output 76 | * @param tokens 77 | */ 78 | private static void insertType(HashMap output, String[] tokens) { 79 | CompareFiles.KVTuple kv = getKVFromType(tokens); 80 | output.put(kv.key, kv.value); 81 | } 82 | 83 | /** 84 | * Take the file created by the Validator and upload it, as the proper types, into a passed-by-reference Map. 
85 | * 86 | @param output The HashMap to hold the validator output split into key/value 87 | @param file The validator output file 88 | */ 89 | private static long loadValidatorOutput(HashMap<String, String> output, String file) { 90 | BufferedReader reader; 91 | String line; 92 | String[] tokens; 93 | long numValidatorRecords = 0; 94 | try { 95 | reader = new BufferedReader(new FileReader(new File(file))); 96 | while ((line = reader.readLine()) != null) { 97 | tokens = line.split(","); 98 | insertType(output, tokens); 99 | numValidatorRecords++; 100 | } 101 | } catch (Exception e) { 102 | System.err.println(e); 103 | } 104 | return numValidatorRecords; 105 | } 106 | 107 | /** 108 | * The bulk of the validation will occur here. 109 | * Based on the values of a given key, which tests existence (the presence of the key means the solution output 110 | * has some expected output), test the values against the expected values. 111 | * 112 | * @param vOutput 113 | * @param kv 114 | * @param type 115 | * @return 116 | */ 117 | private static long checkType(HashMap<String, String> vOutput, KVTuple kv, String type) { 118 | String vValue = vOutput.get(kv.key); 119 | //System.out.println("kv: " + kv); 120 | if (vValue == null) { 121 | System.out.println("Key " + kv.key + " does not exist in validator output."); 122 | return 0; 123 | } 124 | // If the key exists, check the processing time, the lav, and the toll 125 | String[] vValues = vValue.split(";"); 126 | String[] pValues = kv.value.split(";"); 127 | 128 | // All values start with the proc_time field 129 | if (Double.valueOf(pValues[0]) > 5000.0) 130 | System.out.println(kv.key + " has proc time > 5 seconds"); // Remember, System.currentTimeMillis() is milliseconds since Jan 1, 1970 131 | 132 | switch (type) { 133 | case "0": 134 | // [0, carid, time], proc_time, lav, toll 135 | if (Integer.parseInt(pValues[2]) != Integer.parseInt(vValues[2])) 136 | System.out.println(kv.key + " has non-matching toll of " + pValues[2] + "; " + vValues[2] + " expected."); 137 |
break; 138 | case "1": 139 | // [1, time], proc_time, [xway, acc_seg, dir, carid] 140 | // For accidents, besides existence, the only thing to check is the proc time 141 | break; 142 | case "2": 143 | case "5": 144 | // [2, time], proc_time, toll_time, [qid], balance 145 | if (Integer.parseInt(pValues[1]) != Integer.parseInt(vValues[1])) 146 | System.out.println(kv.key + " has non-matching toll time of " + pValues[1] + "; " + vValues[1] + " expected."); 147 | if (Integer.parseInt(pValues[2]) != Integer.parseInt(vValues[2])) 148 | System.out.println(kv.key + " has non-matching balance of " + pValues[2] + "; " + vValues[2] + " expected."); 149 | break; 150 | case "3": 151 | // [3, time], proc_time, [qid, toll] 152 | // For toll history, everything should match so proc_time is the only thing to check 153 | break; 154 | default: 155 | //System.err.println("Odd product output line: " + line); 156 | } 157 | return 1; 158 | } 159 | 160 | /** 161 | * Just a wrapper to checkType(). More functionality can be added, i.e. more types of checks, here. 162 | * 163 | * @param vOutput 164 | * @param line 165 | * @return 166 | */ 167 | private static long checkLine(HashMap vOutput, String line) { 168 | String[] tokens; 169 | tokens = line.split(","); 170 | return checkType(vOutput, getKVFromType(tokens), tokens[0]); 171 | } 172 | 173 | /** 174 | * Take productOutput file and find the corresponding keys (and values) in the built up validatorOutput Map. 175 | * This is the main function that starts validation after a validator-produced file is uploaded. 176 | * 177 | * @param validatorOutput The filled HashMap of expected output created by the Validator. 178 | * @param productOutputFile The fileName of the product output. 
179 | */ 180 | private static long compareProductOutput(HashMap<String, String> validatorOutput, String productOutputFile) { 181 | BufferedReader reader; 182 | String line; 183 | long recordCount = 0; 184 | try { 185 | reader = new BufferedReader(new FileReader(new File(productOutputFile))); 186 | while ((line = reader.readLine()) != null) { 187 | recordCount += checkLine(validatorOutput, line); 188 | } 189 | } catch (Exception e) { 190 | System.err.println(e); 191 | } 192 | return recordCount; 193 | } 194 | 195 | /** 196 | * Run the validator. 197 | * @param args 198 | */ 199 | public static void main(String[] args) { 200 | if (args.length != 2) { 201 | System.out.println("Usage: java CompareFiles <validator output file> <product output file>"); 202 | System.exit(1); 203 | } 204 | // Load the validator output into a hash map 205 | HashMap<String, String> validatorOutput = new HashMap<>(); 206 | // Holder for number of Product records processed 207 | long numValidatorRecords = 0; 208 | long numProductRecords = 0; 209 | // How big are output files again?
About 1/5 the size of the input, which is still rather large 210 | // Keep it simple, [] represents the keys 211 | // [0, carid, time], proc_time, lav, toll 212 | // [1, time], proc_time, [xway, acc_seg, dir, carid] 213 | // [2, time], proc_time, toll_time, [qid], balance 214 | // [3, time], proc_time, [qid, toll] 215 | numValidatorRecords = loadValidatorOutput(validatorOutput, args[0]); 216 | numProductRecords = compareProductOutput(validatorOutput, args[1]); 217 | System.out.println("Total number of records in Validator file: " + numValidatorRecords); 218 | System.out.println("Total number of records in Product file: " + numProductRecords); 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /JavaValidator/CreateMatchingTolls.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | import java.util.HashMap; 6 | 7 | /** 8 | * Created by Sung Kim on 3/11/2016. 9 | * 10 | * Create a file with only those tolls that actually show up in the data file. 11 | * This makes creating a Validation file much faster. 12 | * If there's no desire to test the ability of a data store to hold large amounts of data then using a cleaned file 13 | * to run the actual tests for candidate streaming engines would work just as well. 14 | */ 15 | 16 | public class CreateMatchingTolls { 17 | 18 | /** 19 | * Take the mainfile, the tollfile, and a desired output file and create a toll file that only has corresponding 20 | * lines in the mainfile. 
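The filtering idea in CreateMatchingTolls — index the Type 3 queries of the main data file by a composite carid-day-xway key, then keep only toll records whose key appears — can be sketched on in-memory data. The class name and sample records below are made up, and a HashSet stands in for the original's HashMap-with-null-values:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class MatchingTollsSketch {
    // Keep only toll lines (carid,day,xway,toll) whose carid-day-xway key
    // also appears in some Type 3 query line of the main data file.
    static List<String> matchingTolls(List<String> dataLines, List<String> tollLines) {
        Set<String> t3Keys = new HashSet<>();
        for (String line : dataLines) {
            String[] t = line.split(",");
            if (t[0].equals("3")) {                  // Type 3: carid=t[2], xway=t[4], day=t[14]
                t3Keys.add(t[2] + "-" + t[14] + "-" + t[4]);
            }
        }
        List<String> kept = new ArrayList<>();
        for (String line : tollLines) {
            String[] t = line.split(",");            // toll file layout: carid, day, xway, toll
            if (t3Keys.contains(t[0] + "-" + t[1] + "-" + t[2])) {
                kept.add(line);
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        // A made-up Type 3 line (carid 7, xway 1, day 12) in the 15-field layout,
        // plus a Type 0 line that the filter should ignore.
        List<String> data = Arrays.asList(
                "3,100,7,0,1,0,0,0,0,55,0,0,0,0,12",
                "0,30,7,10,1,0,0,5,100,-1,-1,-1,-1,-1,-1");
        List<String> tolls = Arrays.asList(
                "7,12,1,45",    // matches the Type 3 above -> kept
                "7,13,1,45",    // day 13 never queried -> dropped
                "8,12,1,20");   // carid 8 never queried -> dropped
        System.out.println(matchingTolls(data, tolls));
    }
}
```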
21 | * 22 | * @param mainFile 23 | * @param tollFile 24 | * @param newFile 25 | * @throws Exception 26 | */ 27 | public static void createMatchingTollsFile(String mainFile, String tollFile, String newFile) throws Exception { 28 | File datafile = new File(mainFile); 29 | File tollfile = new File(tollFile); 30 | File newtfile = new File(newFile); 31 | 32 | // The fields of interest from the line of mainfile input. 33 | // carid, day, xway -> t[2], t[14], t[4] 34 | // We could use a Set instead as we don't care about values. 35 | HashMap t3s = new HashMap<>(); 36 | 37 | // Load Type 3's from the mainfile into a Map. 38 | BufferedReader reader = new BufferedReader(new FileReader(datafile)); 39 | String line; 40 | String[] tokens; 41 | String key; 42 | while ((line = reader.readLine()) != null) { 43 | tokens = line.split(","); 44 | if (tokens[0].equals("3")) { 45 | key = tokens[2] + "-" + tokens[14] + "-" + tokens[4]; 46 | t3s.put(key, null); 47 | } 48 | } 49 | reader.close(); 50 | 51 | // Now, reading through the tollfile find the actual matching lines 52 | // and write these out to a new file. 
53 | reader = new BufferedReader(new FileReader(tollfile)); 54 | PrintWriter writer = new PrintWriter(newtfile); 55 | while ((line = reader.readLine()) != null) { 56 | tokens = line.split(","); 57 | // carid, day, xway, toll 58 | key = tokens[0] + "-" + tokens[1] + "-" + tokens[2]; 59 | if (t3s.containsKey(key)) { 60 | writer.println(line); 61 | } 62 | } 63 | reader.close(); 64 | writer.close(); 65 | } 66 | 67 | public static void main(String[] args) throws Exception { 68 | createMatchingTollsFile(args[0], args[1], args[2]); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /JavaValidator/README.md: -------------------------------------------------------------------------------- 1 | To use the Aerospike validator download the Aerospike Java driver, with dependencies, from http://www.aerospike.com/download/client/java/3.3.0/ 2 | 3 | 4 | -------------------------------------------------------------------------------- /JavaValidator/SplitFiles.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.PrintWriter; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | /** 9 | * Created by Sung Kim on 3/29/2016. 10 | * Take an input file and split into n*2 files for multi-thread processing. 11 | * The bulk of the work is to write types > 0 to the proper file. 12 | * We do this by storing the xway+dir of carids and assigning lines to the proper files accordingly. 13 | * Using the BlockingQueue version obviates this file splitting.
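The routing that SplitFiles describes — bucket each line by its xway-dir, falling back to the xway-dir last seen for that carid when a line carries -1 — can be sketched with in-memory buckets instead of files. The class name and sample lines are illustrative, and the time-batched "holding" logic of the real code is omitted here:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SplitSketch {
    // Route each line to an "xway-dir" bucket. Type > 0 lines carry xway/dir = -1,
    // so they are routed using the xway-dir last seen for that carid.
    static Map<String, List<String>> route(List<String> lines) {
        Map<String, List<String>> buckets = new HashMap<>();
        Map<String, String> carXd = new HashMap<>();       // carid -> "xway-dir"
        for (String line : lines) {
            String[] t = line.split(",");
            String car = t[2];
            String xd;
            if (t[4].equals("-1") || t[6].equals("-1")) {  // no xway/dir on this line
                xd = carXd.getOrDefault(car, "0-0");       // fall back like the original default
            } else {
                xd = t[4] + "-" + t[6];
                carXd.put(car, xd);                        // remember for later type > 0 lines
            }
            buckets.computeIfAbsent(xd, k -> new ArrayList<>()).add(line);
        }
        return buckets;
    }

    public static void main(String[] args) {
        List<String> lines = Arrays.asList(
                "0,30,7,10,2,0,1,5,100,-1,-1,-1,-1,-1,-1",   // position report: xway 2, dir 1
                "2,31,7,0,-1,0,-1,0,0,42,-1,-1,-1,-1,-1");   // Type 2 for the same car: no xway/dir
        Map<String, List<String>> buckets = route(lines);
        System.out.println(buckets.get("2-1").size());       // both lines land in bucket 2-1
    }
}
```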
14 | */ 15 | public class SplitFiles { 16 | // Have auto-detection of number of xways 17 | public static void splitFiles(String input, int numXways) throws Exception { 18 | Map<String, PrintWriter> files = new HashMap<>(); // xway-dir: writer; xd is the key 19 | Map<String, String> carsAndXwayDir = new HashMap<>(); // carid: xway-dir 20 | Map<String, String> holding = new HashMap<>(); // carid-time: line 21 | BufferedReader reader = new BufferedReader(new FileReader(input)); 22 | String line; 23 | String[] tokens; 24 | int currTime = -1; // used in conjunction with holding to appropriately write t2 and t3 25 | for (int i = 0; i < numXways; i++) { 26 | String key0 = i + "-0"; // 0-indexed 27 | String key1 = i + "-1"; 28 | System.out.println(key0); 29 | System.out.println(key1); 30 | files.put(key0, new PrintWriter(new File(key0))); 31 | files.put(key1, new PrintWriter(new File(key1))); 32 | } 33 | 34 | while ((line = reader.readLine()) != null) { 35 | tokens = line.trim().split(","); 36 | // When the time changes we'll process these in holding. 37 | if (Integer.parseInt(tokens[1]) > currTime) { 38 | // process all in holding 39 | for (String k : holding.keySet()) { 40 | // Get the carid from the carid-time key to get the xway-dir to feed into files 41 | files.get(carsAndXwayDir.get(k.split("-")[0])).printf("%s\n", holding.get(k)); // We're just getting the carid, and remember to print the lines in holding, not the current line! 42 | } 43 | holding.clear(); 44 | } 45 | currTime = Integer.parseInt(tokens[1]); 46 | // The xway[4], dir[6] 47 | String c = tokens[2]; 48 | String xd = "0-0"; // Default xway-dir 49 | if (tokens[4].equals("-1") || tokens[6].equals("-1")) { // No xway or dir, so we get the xd from a previously seen entry for this car 50 | if (carsAndXwayDir.containsKey(c)) { 51 | xd = carsAndXwayDir.get(c); 52 | } else { 53 | holding.put(c + "-" + tokens[1], line); // Since we don't know the xd for this type > 0, just store it until we find one.
And no, you won't get more than 1 type > 0 for any given second. Type 0 and possibly 2, 3, or 4. 54 | continue; 55 | } 56 | } else { 57 | xd = tokens[4] + "-" + tokens[6]; 58 | } 59 | carsAndXwayDir.put(c, xd); // we do this so we can give xways to types 2 and 3 (this should really be done during datagen/combination) 60 | files.get(xd).printf("%s\n", line); 61 | } 62 | for (String k : holding.keySet()) { 63 | // This block likely never gets called 64 | files.get(carsAndXwayDir.get(k.split("-")[0])).printf("%s\n", holding.get(k)); // We're just getting the carid 65 | } 66 | for (PrintWriter w : files.values()) { 67 | w.close(); 68 | } 69 | } 70 | 71 | public static void main(String[] args) { 72 | try { 73 | SplitFiles.splitFiles(args[0], Integer.parseInt(args[1])); 74 | } catch (Exception e) { 75 | System.err.println(e); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /JavaValidator/ValidateMTBQEven3AeroTolls.java: -------------------------------------------------------------------------------- 1 | import com.aerospike.client.AerospikeClient; 2 | import com.aerospike.client.Bin; 3 | import com.aerospike.client.Key; 4 | import com.aerospike.client.Record; 5 | import com.aerospike.client.policy.WritePolicy; 6 | 7 | import java.io.*; 8 | import java.util.*; 9 | import java.util.concurrent.LinkedBlockingDeque; 10 | 11 | /** 12 | * Created by Sung Kim on 3/8/2016. 13 | * Use multiple threads. 14 | * Use a BlockingQueue to handle issues with some threads getting ahead of others. 15 | * Using the BlockingQueue means no longer needing to create split files. 16 | * Process by the second, and send when all have finished each second. 17 | * Trim the segment maps to only hold the most current minute and the past 5 simulation minutes. 18 | * Use Aerospike to hold all tolls.
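The "process by the second" stepping is a rendezvous: each worker thread checks in after finishing the current second, and the reader resumes only once every worker has (as in LRReader.notifyDone). The counting core of that rendezvous can be sketched and exercised without real threads; the class name and counts below are illustrative:

```java
public class RendezvousSketch {
    private final int numWorkers;
    private int numNotifications = 0;
    private int restarts = 0;      // stands in for waking the reader

    RendezvousSketch(int numWorkers) {
        this.numWorkers = numWorkers;
    }

    // Called by each worker when it has finished the current simulated second.
    synchronized void notifyDone() {
        numNotifications++;
        if (numNotifications == numWorkers) {
            restarts++;            // in the real code: notify() the waiting reader
            numNotifications = 0;  // reset the count for the next second
        }
    }

    public static void main(String[] args) {
        RendezvousSketch r = new RendezvousSketch(3);
        // Simulate 3 workers each finishing 2 consecutive seconds.
        for (int second = 0; second < 2; second++) {
            for (int w = 0; w < 3; w++) {
                r.notifyDone();
            }
        }
        System.out.println(r.restarts);   // reader woken once per completed second
    }
}
```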
19 |  * Usage: time java ValidateMTBQEven3AeroTolls <input file> <num xways> <toll file>
20 |  * FYI:
21 |  * MT (MultiThreaded)
22 |  * BQ (BlockingQueue)
23 |  * Even (every thread finishes processing the current second before moving on to the next second)
24 |  * 3 (version)
25 |  * AeroTolls (use Aerospike to hold the global toll information)
26 |  * Non-Aerospike, or "other" database versions can be created by simply replacing the Aerospike portions with
27 |  * a different database of choice. A Java Map can also be used, but this limits the number of expressways for which a
28 |  * validation file can be created.
29 |  */
30 | public class ValidateMTBQEven3AeroTolls extends Thread {
31 |     public static HashMap<String, Integer> historical;
32 |
33 |     AerospikeClient client;
34 |     WritePolicy policy;
35 |
36 |     static {
37 |         historical = new HashMap<>();
38 |     }
39 |
40 |     private LinkedBlockingDeque<String> q;
41 |     private LRReader r;
42 |     private boolean paused;
43 |     private PrintWriter writer;
44 |     private PrintWriter writerDEBUG;
45 |     private long tollAssessmentCountDEBUG;
46 |     private int xway;
47 |     private int dir;
48 |     private Car currentCar;
49 |     private HashMap<Integer, Car> cars; // K, carId: Int ; V, Car(lastTime, lastSpeed, lastXway, lastLane, lastDir, lastSeg, lastPos, xPos, lastToll)
50 |     private HashMap<String, Integer> segSumSpeeds; // K, "seg-min" ; V, sumOfSpeeds: Int
51 |     private HashMap<String, Integer> segSumNumReadings; // K, "seg-min" ; V, sumNumSpeedReadings: Int
52 |     private HashMap<String, Set<Integer>> segCarIdSet; // K, "seg-min" ; V, Set(carId: Int)
53 |     private HashMap<String, List<Integer>> stopped; // K, "lane-seg-pos" ; V, List(carId: Int)
54 |     private HashMap<Integer, Accident> accidents; // K, seg: Int ; V, Accident(time, clearTime, carId1, carId2)
55 |     int type0Seen;
56 |     int type0SeenDEBUG;
57 |     int type2Seen;
58 |     int type3Seen;
59 |     int type0Processed;
60 |     int type1Processed;
61 |     int type2Processed;
62 |     int type3Processed;
63 |
64 |     public static class LRReader extends Thread {
65 |         BufferedReader reader;
66 |         Map<String, ValidateMTBQEven3AeroTolls> lrs;
67 |         int numNotifications;
68 |         boolean done;
69 |         // DEBUG
70 |         int LRRestartsDEBUG;
71 |
72 |         LRReader(BufferedReader reader, Map<String, ValidateMTBQEven3AeroTolls> lrs) {
73 |             this.reader = reader;
74 |             this.lrs = lrs;
75 |             numNotifications = 0;
76 |             done = false;
77 |             LRRestartsDEBUG = 0;
78 |         }
79 |
80 |         /**
81 |          * Each ValidateMTBQEven3... thread notifies LRReader when it is done with its set of records.
82 |          * When the number of notifications equals the number of threads, LRReader restarts.
83 |          */
84 |         public void notifyDone() {
85 |             synchronized (this) {
86 |                 numNotifications++;
87 |                 if (numNotifications == lrs.size()) {
88 |                     restart();
89 |                     LRRestartsDEBUG++;
90 |                 }
91 |             }
92 |         }
93 |
94 |         private void restart() {
95 |             synchronized (this) {
96 |                 notify();
97 |                 numNotifications = 0;
98 |             }
99 |         }
100 |
101 |         /**
102 |          * This needs to run periodically because if the processing finishes before the reader is done,
103 |          * the reader gets stuck.
104 |          */
105 |         public void checkRestart() {
106 |             for (ValidateMTBQEven3AeroTolls rt : lrs.values()) {
107 |                 if (!rt.isPaused()) {
108 |                     return;
109 |                 }
110 |             }
111 |             restart(); // We should only reach this if all processing threads are paused.
112 | } 113 | 114 | public boolean isDone() { 115 | return done; 116 | } 117 | 118 | @Override 119 | public void run() { 120 | String line; 121 | String[] tokens; 122 | int counter = 0; 123 | int currTime = 0; 124 | int lineTime = 0; 125 | Map lastSeen = new HashMap<>(); // To handle type 2 and 3 without xway and/or dir 126 | List twoThree = new ArrayList<>(); 127 | String xwayDir;// = "0-0"; 128 | int carId;// = 0; 129 | try { 130 | while ((line = reader.readLine()) != null) { 131 | tokens = line.split(","); 132 | lineTime = Integer.parseInt(tokens[1]); 133 | carId = Integer.parseInt(tokens[2]); 134 | if (currTime != lineTime) { 135 | //System.out.println(currTime + " to " + lineTime); 136 | synchronized (this) { 137 | //System.out.println("Pausing reader"); 138 | wait(); 139 | //System.out.println("Re-starting reader"); 140 | for (String l : twoThree) { 141 | String[] tokens2 = l.split(","); 142 | xwayDir = lastSeen.get(Integer.parseInt(tokens2[2])); 143 | lrs.get(xwayDir).addToQ(l); 144 | }// 145 | twoThree.clear(); 146 | } 147 | } 148 | //System.out.println(line); 149 | if (tokens[0].equals("0")) { 150 | xwayDir = tokens[4] + "-" + tokens[6]; 151 | lastSeen.put(carId, xwayDir); 152 | lrs.get(xwayDir).addToQ(line); 153 | } 154 | if (!tokens[0].equals("0")) { 155 | twoThree.add(line); 156 | //System.out.println(twoThree.size()); 157 | } 158 | currTime = lineTime; 159 | } 160 | 161 | // Clean up any 2/3's remaining 162 | for (String l : twoThree) { 163 | tokens = l.split(","); 164 | carId = Integer.parseInt(tokens[2]); 165 | xwayDir = lastSeen.get(carId); 166 | //System.out.println("Cleanup 2,3's: " + xwayDir); 167 | lrs.get(xwayDir).addToQ(l); 168 | } 169 | twoThree.clear(); 170 | } catch (InterruptedException e) { 171 | System.err.println(e); 172 | } catch (IOException e) { 173 | System.err.println(e); 174 | } 175 | done = true; 176 | for (ValidateMTBQEven3AeroTolls rt : lrs.values()) { 177 | rt.addToQ("quit"); 178 | } 179 | } 180 | } 181 | 182 | 183 | private 
class Car { 184 | int carId; 185 | int lastTime; 186 | int lastSpeed; 187 | int lastXway; 188 | int lastLane; 189 | int lastDir; 190 | int lastSeg; 191 | int lastPos; 192 | int xPos; 193 | int lastToll; 194 | 195 | Car(int carId) { 196 | this.carId = carId; 197 | lastTime = lastSpeed = lastXway = lastLane = lastDir = lastSeg = lastPos = -1; 198 | xPos = lastToll = 0; 199 | } 200 | 201 | void reset() { 202 | lastTime = lastSpeed = lastXway = lastLane = lastDir = lastSeg = lastPos = -1; 203 | xPos = lastToll = 0; 204 | } 205 | } 206 | 207 | private class Accident { 208 | int time; 209 | int clearTime; 210 | List accidentCars; 211 | 212 | Accident(int time, int carId1, int carId2) { 213 | this.time = time; 214 | this.clearTime = -1; 215 | accidentCars = new ArrayList(); 216 | accidentCars.add(carId1); 217 | accidentCars.add(carId2); 218 | } 219 | } 220 | 221 | public ValidateMTBQEven3AeroTolls(String f, int xway, int dir, LRReader r) { 222 | client = new AerospikeClient("127.0.0.1", 3000); 223 | policy = new WritePolicy(); 224 | policy.timeout = 100; 225 | 226 | q = new LinkedBlockingDeque<>(); 227 | this.r = r; 228 | paused = false; 229 | this.xway = xway; 230 | this.dir = dir; 231 | cars = new HashMap<>(); 232 | segSumSpeeds = new HashMap<>(); 233 | segSumNumReadings = new HashMap<>(); 234 | segCarIdSet = new HashMap<>(); 235 | stopped = new HashMap<>(); 236 | accidents = new HashMap<>(); 237 | try { 238 | //reader = new BufferedReader(new FileReader(f)); 239 | writer = new PrintWriter(f + "-out"); 240 | writerDEBUG = new PrintWriter(f + "-out-DEBUG"); 241 | } catch (FileNotFoundException e) { 242 | System.err.println(e); 243 | System.exit(1); 244 | } 245 | type0Seen = 0; 246 | type0SeenDEBUG = 0; 247 | tollAssessmentCountDEBUG = 0; 248 | type2Seen = 0; 249 | type3Seen = 0; 250 | type0Processed = 0; 251 | type1Processed = 0; 252 | type2Processed = 0; 253 | type3Processed = 0; 254 | } 255 | 256 | // Add an element to a given thread's queue for processing. 
257 | public void addToQ(String line) { 258 | try { 259 | q.put(line); 260 | } catch (InterruptedException e) { 261 | System.err.println(e); 262 | } 263 | } 264 | 265 | public boolean isPaused() { 266 | return paused; 267 | } 268 | 269 | @Override 270 | public void run() { 271 | System.out.println("Thread " + this.getName() + " starting"); 272 | String line; 273 | Map mt; 274 | 275 | try { 276 | while (true) { 277 | if (q.peek() == null) { 278 | paused = true; 279 | r.notifyDone(); 280 | } 281 | line = q.take(); 282 | paused = false; 283 | if (line.equals("quit")) { 284 | writer.flush(); 285 | writerDEBUG.flush(); 286 | break; // This is the 'poison pill' to kill the thread 287 | } 288 | mt = createMT(line.split(",")); 289 | int type = mt.get("type"); 290 | switch (type) { 291 | case 0: 292 | type0Seen++; 293 | t0(mt); 294 | break; 295 | case 2: 296 | type2Seen++; 297 | t2(mt); 298 | break; 299 | case 3: 300 | type3Seen++; 301 | t3(mt); 302 | break; 303 | } 304 | } 305 | } catch (InterruptedException e) { 306 | System.err.println(e); 307 | } finally { 308 | writer.close(); 309 | writerDEBUG.close(); 310 | } 311 | } 312 | 313 | /** 314 | * Create a Map of the input line 315 | * 316 | * @param tokens The tokenized line. 317 | * @return A Map of the input line, split into its constituent parts. 
318 | */ 319 | private Map createMT(String[] tokens) { 320 | Map m = new HashMap<>(); 321 | m.put("type", Integer.parseInt(tokens[0])); 322 | m.put("time", Integer.parseInt(tokens[1])); 323 | m.put("carId", Integer.parseInt(tokens[2])); 324 | m.put("speed", Integer.parseInt(tokens[3])); 325 | m.put("xway", Integer.parseInt(tokens[4])); 326 | m.put("lane", Integer.parseInt(tokens[5])); 327 | m.put("dir", Integer.parseInt(tokens[6])); 328 | m.put("seg", Integer.parseInt(tokens[7])); 329 | m.put("pos", Integer.parseInt(tokens[8])); 330 | m.put("qid", Integer.parseInt(tokens[9])); 331 | m.put("day", Integer.parseInt(tokens[14])); 332 | return m; 333 | } 334 | 335 | /** 336 | * Each xway and dir has its own Maps of segment key with speeds, number of speed readings, and a Set of carid's in that segment key. 337 | * The segment key is: (The segment number) + (The simulation time, the minute). 338 | * 339 | * @param mt The Map-ized input line 340 | * @return The segment key 341 | */ 342 | private String getOrCreateSeg(Map mt) { 343 | String segKey = mt.get("seg") + "-" + (mt.get("time") / 60 + 1); // Oh, duh, of COURSE you need the parens 344 | // Create a new record for a particular seg+min key for this xway+dir if it doesn't exist 345 | if (!segSumSpeeds.containsKey(segKey) && !segSumNumReadings.containsKey(segKey) && !segCarIdSet.containsKey(segKey)) { 346 | //segSumSpeeds.put(segKey, mt.get("speed")); 347 | segSumSpeeds.put(segKey, 0); 348 | //segSumNumReadings.put(segKey, 1); 349 | segSumNumReadings.put(segKey, 0); 350 | Set newCarIdSet = new HashSet<>(); 351 | newCarIdSet.add(mt.get("carId")); 352 | segCarIdSet.put(segKey, newCarIdSet); 353 | } 354 | return segKey; 355 | } 356 | 357 | /** 358 | * Each xway and dir has its own Map of cars. 359 | * The car key is: (The carid). 360 | * 361 | * @param mt The Map-ized input line. 362 | * @return The Car (Object) whether newly created for this xway and dir, or the currently existing one. 
363 | */ 364 | private Car getOrCreateCar(Map mt) { 365 | Car car; 366 | if (!cars.containsKey(mt.get("carId"))) { 367 | car = new Car(mt.get("carId")); 368 | cars.put(mt.get("carId"), car); 369 | } else { 370 | car = cars.get(mt.get("carId")); 371 | if (mt.get("lane") == 0 && mt.get("time") > (car.lastTime + 60)) { // Check if the currentCar is a re-entrant car. If it is reset its values. 372 | car.reset(); 373 | } 374 | } 375 | return car; 376 | } 377 | 378 | /** 379 | * Each xway and dir has its own Map of stopped cars. 380 | * The stopped key is: (The Lane) + (The Seg) + (The Position) 381 | * Take a "Stopped Key" and a Car (Object) and potentially include the Car into the "stopped" Map. 382 | * 383 | * @param stoppedKey A String (lane+"-"+seg+"-"+pos). 384 | * @param c The Car. 385 | * @return Whether a car was inserted into this stopped Map at the stoppedKey. The current rule is that only two cars (enough to track an accident) will be tracked at a given stoppedKey. 386 | */ 387 | private boolean createStoppedCar(String stoppedKey, Car c) { // Return true if a new stopped car was added to 'this.stopped' 388 | if (!stopped.containsKey(stoppedKey)) { 389 | List s = new ArrayList<>(); 390 | s.add(c.carId); 391 | stopped.put(stoppedKey, s); 392 | return true; 393 | } else { 394 | if (stopped.get(stoppedKey).size() < 2 && !stopped.get(stoppedKey).contains(c.carId)) { // Do we allow more than two cars at any stopped position? Not for now. 395 | stopped.get(stoppedKey).add(c.carId); 396 | //System.out.println(stopped.get(stoppedKey)); 397 | return true; 398 | } 399 | } 400 | return false; 401 | } 402 | 403 | /** 404 | * Each xway and dir has its own Map of accidents. 405 | * The accidents key is: (The Seg) 406 | * The seg is the only element of the key as accident notifications are sent based on the segment. 407 | * Of course the xway and dir would be part of the key if this were single-threaded. 408 | * This creates an accident if one doesn't already exist. 
409 | * The print statement is for debugging when accidents occur. 410 | * 411 | * @param stoppedKey The stoppedKey of the segment in question. 412 | * @param seg The segment. 413 | * @param time The simulation time (which is Floor(time/60) + 1) 414 | */ 415 | private void createAccident(String stoppedKey, int seg, int time) { 416 | if (stopped.get(stoppedKey).size() == 2 && !accidents.containsKey(seg)) { 417 | Accident newAccident = new Accident(time, stopped.get(stoppedKey).get(0), stopped.get(stoppedKey).get(1)); 418 | accidents.put(seg, newAccident); 419 | System.out.printf("%d,%d,%d,%d,%d\n", seg, newAccident.time, newAccident.clearTime, newAccident.accidentCars.get(0), newAccident.accidentCars.get(1)); 420 | } 421 | } 422 | 423 | /** 424 | * For potential toll calculation purposes get the number of cars seen in a segment in the last minute. 425 | * Note, this is means a car is not double-counted if it doesn't make it out of a segment within the 30 seconds of the next notification. 426 | * The key is a segment key, which is the segment number and a simulation minute. 427 | * 428 | * @param lastMinKey The seg + the previous min. 429 | * @return The number of vehicles. 430 | */ 431 | private int getNumV(String lastMinKey) { 432 | if (segCarIdSet.containsKey(lastMinKey)) { 433 | return segCarIdSet.get(lastMinKey).size(); 434 | } 435 | return 0; 436 | } 437 | 438 | /** 439 | * A simple toll calculation. 440 | * 441 | * @param numv 442 | * @return 443 | */ 444 | private int calcToll(int numv) { 445 | return (int) (2 * Math.pow(50 - numv, 2)); 446 | } 447 | 448 | /** 449 | * Take the segment and minute (last) and find that last average velocity using that seg-min's speed readings. 
450 | * 451 | * @param seg 452 | * @param min 453 | * @return 454 | */ 455 | private int getLav(int seg, int min) { 456 | int totalSpeed = 0, totalSpeedReadings = 0; 457 | String lavKey; // The last average velocity 458 | for (int i = 1; i < 6; i++) { 459 | lavKey = seg + "-" + (min - i); 460 | if (segSumSpeeds.containsKey(lavKey)) totalSpeed += segSumSpeeds.get(lavKey); 461 | if (segSumNumReadings.containsKey(lavKey)) totalSpeedReadings += segSumNumReadings.get(lavKey); 462 | } 463 | //if (totalSpeedReadings > 0) writer.printf("lav: %d %f\n", totalSpeed, ((float) totalSpeedReadings)); 464 | if (totalSpeedReadings > 0) return Math.round(totalSpeed / ((float) totalSpeedReadings)); 465 | else return 0; 466 | } 467 | 468 | /** 469 | * As cars travel from one segment to another see if they are in accident zones 470 | * 471 | * @param seg 472 | * @param min 473 | * @return 474 | */ 475 | private int inAccidentZone(int seg, int min) { 476 | int k; 477 | Accident accident; 478 | for (int i = 0; i < 5; i++) { 479 | if (dir == 0) { 480 | k = seg + i; 481 | } else { 482 | k = seg - i; 483 | } 484 | if (accidents.containsKey(k)) { 485 | accident = accidents.get(k); 486 | int accNotiThresholdMin = accident.time / 60 + 2; 487 | int accClearMin = accident.clearTime / 60 + 1; 488 | if (accident.clearTime != -1 && accNotiThresholdMin > accClearMin) continue; 489 | if ((min >= accNotiThresholdMin && accident.clearTime == -1) || 490 | (min <= accClearMin && accident.clearTime != -1)) { 491 | return k; 492 | } 493 | } 494 | } 495 | return -1; 496 | } 497 | 498 | /** 499 | * Add a toll to the global tolls Map. This, along with, the historical Map, are the only two global data stores/structures. Thus, these two are the ones that can be placed into an external store as well. However, this will only solve one of the scaling issues if we keep all other Maps in memory. Each xway-dir thread will still require memory to hold all of its individual data, including many Maps. 
500 | * 501 | * @param c 502 | * @param time 503 | */ 504 | private void assessToll(Car c, int time) { 505 | Key key = new Key("test", "myset", c.carId); 506 | Record record = client.get(policy, key); 507 | List tollList = null; 508 | if (record != null) { 509 | tollList = (List) record.getList("tolls"); 510 | } else { 511 | tollList = new ArrayList(); 512 | } 513 | //System.out.printf("Tolls for %d: %d\n", c.carId, tollList.size()); 514 | tollList.add(time); // The time 515 | tollList.add(c.lastToll); // The last toll 516 | Bin bin = new Bin("tolls", tollList); 517 | client.put(policy, key, bin); 518 | } 519 | 520 | /** 521 | * Process a type 0 input line 522 | * 523 | * @param mt 524 | */ 525 | public void t0(Map mt) { 526 | long startTime = System.currentTimeMillis(); 527 | int min = mt.get("time") / 60 + 1; 528 | String stoppedKey = mt.get("lane") + "-" + mt.get("seg") + "-" + mt.get("pos"); 529 | String segKey = getOrCreateSeg(mt); // Simply create a new seg-min combination if it doesn't exist 530 | currentCar = getOrCreateCar(mt); // Create or fetch a car 531 | if ((currentCar.lastLane == 4) && (mt.get("lane")) != 0) 532 | return; // Check this is an anomalous car, i.e. lastLane == 4 but it shows up again with a 'lane' != 0 and ignore 533 | /* SAME POSITION? */ 534 | if (currentCar.lastPos == mt.get("pos") && currentCar.lastLane == mt.get("lane")) { // This thread only operates on a single xway-dir // && currentCar.lastXway == mt.get("xway") && currentCar.lastDir == mt.get("dir")) 535 | if (currentCar.xPos == 3) { // Already seen three times at this pos+lane, so create a STOPPED car 536 | if (createStoppedCar(stoppedKey, currentCar)) { 537 | createAccident(stoppedKey, mt.get("seg"), mt.get("time")); 538 | } 539 | } 540 | currentCar.xPos++; // Update currentCar's xPos // Is this a reference to the object in the Hashmap? I think so ... 
541 | /* NEW POSITION */ 542 | } else { 543 | String prevStoppedKey = currentCar.lastLane + "-" + currentCar.lastSeg + "-" + currentCar.lastPos; 544 | if (stopped.containsKey(prevStoppedKey)) { // Remove this carId from stopped if it's there 545 | stopped.get(prevStoppedKey).remove(mt.get("carId")); 546 | } 547 | if (accidents.containsKey(currentCar.lastSeg) && accidents.get(currentCar.lastSeg).accidentCars.contains(currentCar.carId) && accidents.get(currentCar.lastSeg).clearTime == -1) { // Clear accident involving this car if any 548 | accidents.get(currentCar.lastSeg).clearTime = mt.get("time"); 549 | Accident oldAccident = accidents.get(currentCar.lastSeg); 550 | System.out.printf("%d, %d,%d,%d,%d\n", currentCar.lastSeg, oldAccident.time, oldAccident.clearTime, oldAccident.accidentCars.get(0), oldAccident.accidentCars.get(1)); 551 | } 552 | currentCar.xPos = 1; // Reset current car's number of times at this position 553 | /* NEW POSITION BUT SAME SEGMENT */ 554 | if (mt.get("seg") == currentCar.lastSeg) { // I don't know if we really need to do anything here. I guess a car could move to an exit lane. 
555 | if (mt.get("lane") == 4) { 556 | currentCar.lastLane = 4; 557 | } 558 | /* NEW POSITION NEW SEGMENT */ 559 | } else { 560 | int currToll = 0; 561 | int numv = 0; 562 | int lav = 0; 563 | if (mt.get("lane") != 4) { 564 | /* NUMV */ 565 | String lastMinKey = mt.get("seg") + "-" + (min - 1); 566 | numv = getNumV(lastMinKey); 567 | if (numv > 50) currToll = calcToll(numv); 568 | /* LAV */ 569 | lav = getLav(mt.get("seg"), min); 570 | if (lav >= 40) currToll = 0; 571 | /* ACCIDENTS */ 572 | int accSeg = inAccidentZone(mt.get("seg"), min); 573 | if (accSeg >= 0) { 574 | currToll = 0; 575 | writer.printf("1,%d,%d,%d,%d,%d,%d\n", mt.get("time"), mt.get("time") + (System.currentTimeMillis() - startTime), this.xway, accSeg, this.dir, currentCar.carId); 576 | type1Processed += 1; 577 | } 578 | writer.printf("0,%d,%d,%d,%d,%d\n", mt.get("carId"), mt.get("time"), mt.get("time") + (System.currentTimeMillis() - startTime), lav, currToll); 579 | type0Processed += 1; 580 | } 581 | //System.out.printf("%d,%d,%d\n", numv, lav, currToll); 582 | /* PREVIOUS TOLL */ 583 | if (currentCar.lastToll > 0) { 584 | assessToll(currentCar, mt.get("time")); 585 | tollAssessmentCountDEBUG++; 586 | writerDEBUG.printf("assessToll,%d,%d,%d,%d\n", mt.get("time"), mt.get("carId"), tollAssessmentCountDEBUG, currentCar.lastToll); 587 | } 588 | currentCar.lastToll = currToll; // New segment yields new toll 589 | } 590 | } 591 | // Update car and segment info. Car info should already be partially updated. 592 | currentCar.lastDir = mt.get("dir"); // Not necessary, BUT wasn't there something funky with the data where a car would jump directions or lanes? 
593 | currentCar.lastLane = mt.get("lane"); 594 | currentCar.lastPos = mt.get("pos"); 595 | currentCar.lastSeg = mt.get("seg"); 596 | currentCar.lastSpeed = mt.get("speed"); 597 | currentCar.lastTime = mt.get("time"); 598 | // currentCar.lastToll // Updated above as needed 599 | currentCar.lastXway = mt.get("xway"); // Not necessary 600 | // currentCar.xPos // Updated above as needed 601 | 602 | // Clean up segments > 6 minutes away from the current minute 603 | // We do this before the alteration of the segment hashes 604 | //String segKey = mt.get("seg") + "-" + (mt.get("time") / 60 + 1); // This is here simply as a reference to remember what constitutes the segKey. 605 | int removeMin = min - 6; 606 | segSumSpeeds.remove(mt.get("seg") + "-" + removeMin); 607 | segSumNumReadings.remove(mt.get("seg") + "-" + removeMin); 608 | segCarIdSet.remove(mt.get("seg") + "-" + removeMin); 609 | 610 | segSumSpeeds.put(segKey, segSumSpeeds.get(segKey) + mt.get("speed")); 611 | segSumNumReadings.put(segKey, segSumNumReadings.get(segKey) + 1); 612 | segCarIdSet.get(segKey).add(mt.get("carId")); 613 | //type0SeenDEBUG++; 614 | //writerDEBUG.println(type0SeenDEBUG + ":" + segKey + ":" + segSumSpeeds.get(segKey) + "," + segSumNumReadings.get(segKey)); 615 | 616 | 617 | } 618 | 619 | /** 620 | * Process a type 2 intra-day account balance query line 621 | * 622 | * @param mt 623 | */ 624 | public void t2(Map mt) { 625 | // A type 2 could feasibly yield at least two (original says three) numbers 626 | long startTime = System.currentTimeMillis(); 627 | long bal0 = 0, bal1 = 0, rt0 = mt.get("time"), rt1 = 0; 628 | List charges = null; 629 | Key key = new Key("test", "myset", mt.get("carId")); 630 | Record record = client.get(policy, key); 631 | if (record != null) { 632 | charges = (List)record.getList("tolls"); 633 | for (int i = 1; i < charges.size(); i += 2) { 634 | long t = charges.get(i); 635 | bal0 += t; 636 | } 637 | if (charges.size() > 2) { 638 | bal1 = bal0 - 
charges.get(charges.size() - 1); 639 | rt1 = rt0 - 30; 640 | } 641 | } 642 | // We either need to convert millis since epoch to something else or simply give the millis till completion as we do here 643 | writer.printf("2,%d,%d,%d,%d,%d\n", mt.get("time"), mt.get("time") + System.currentTimeMillis() - startTime, rt0, mt.get("qid"), bal0); 644 | writer.printf("5,%d,%d,%d,%d,%d\n", mt.get("time"), mt.get("time") + System.currentTimeMillis() - startTime, rt1, mt.get("qid"), bal1); 645 | writerDEBUG.printf("%d:", mt.get("carId")); 646 | if (record != null && charges != null) { 647 | for (long i : charges) { 648 | writerDEBUG.printf("%d,", i); 649 | } 650 | } 651 | writerDEBUG.printf("\n"); 652 | type2Processed++; 653 | } 654 | 655 | /** 656 | * Process a type 3 historical query line 657 | * 658 | * @param mt 659 | */ 660 | public void t3(Map mt) { 661 | long startTime = System.currentTimeMillis(); 662 | String k = mt.get("carId") + "-" + mt.get("day") + "-" + mt.get("xway"); 663 | int toll = 0; 664 | synchronized (historical) { 665 | if (historical.containsKey(k) && mt.get("day") != 0) { 666 | toll = historical.get(k); 667 | writer.printf("3,%d,%d,%d,%d\n", mt.get("time"), mt.get("time") + System.currentTimeMillis() - startTime, mt.get("qid"), toll); 668 | type3Processed++; 669 | } 670 | } 671 | } 672 | 673 | public static void main(String[] args) throws Exception { 674 | if (args.length != 3) { 675 | System.out.println("Usage: java ValidateMTBQEven...
<input file> <num xways> <toll file>
"); 676 | System.exit(1); 677 | } 678 | 679 | int totalType0Seen = 0; 680 | int totalType2Seen = 0; 681 | int totalType3Seen = 0; 682 | int totalType0Processed = 0; 683 | int totalType1Processed = 0; 684 | int totalType2Processed = 0; 685 | int totalType3Processed = 0; 686 | 687 | BufferedReader reader; 688 | String line; 689 | String[] tokens; 690 | 691 | String inputFile = args[0]; // 692 | int numXWays = Integer.parseInt(args[1]); 693 | String tollFileName = args[2]; // Never use the full size file but use the matching file created by CreateMatchingTolls 694 | 695 | // Create matching toll file 696 | String matchingTolls = "matchTollsOnly.dat"; 697 | CreateMatchingTolls.createMatchingTollsFile(inputFile, tollFileName, matchingTolls); 698 | // Load historical toll file 699 | reader = new BufferedReader(new FileReader(new File(matchingTolls))); 700 | String key; 701 | while ((line = reader.readLine()) != null) { 702 | tokens = line.split(","); 703 | // [0]carId-[1]day-[2]xway : [3]value 704 | key = tokens[0] + "-" + tokens[1] + "-" + tokens[2]; 705 | historical.put(key, Integer.parseInt(tokens[3])); 706 | } 707 | reader.close(); 708 | System.out.println("Finished loading historical files..."); 709 | 710 | // 711 | List files = new ArrayList<>(); 712 | int iNumFiles = numXWays; 713 | for (int i = 0; i < iNumFiles; i++) { 714 | files.add(i + "-0"); 715 | files.add(i + "-1"); 716 | } 717 | 718 | Map threads = new HashMap<>(); 719 | LRReader lrReader = new LRReader(new BufferedReader(new FileReader(inputFile)), threads); 720 | // Create and initialize the threads, one for each xway-dir 721 | for (int i = 0; i < files.size(); i++) { 722 | System.out.println("Do I get here?"); 723 | tokens = files.get(i).split("-"); 724 | ValidateMTBQEven3AeroTolls v = new ValidateMTBQEven3AeroTolls(files.get(i), Integer.parseInt(tokens[0]), Integer.parseInt(tokens[1]), lrReader); 725 | v.start(); 726 | threads.put(tokens[0] + "-" + tokens[1], v); 727 | } 728 | lrReader.start(); 729 | 
730 | while (!lrReader.isDone()) { 731 | sleep(100); 732 | lrReader.checkRestart(); 733 | } 734 | for (ValidateMTBQEven3AeroTolls rt : threads.values()) { 735 | rt.join(); 736 | } 737 | 738 | 739 | // We should recombine files here so we don't have to do it in a shell 740 | PrintWriter writer = new PrintWriter("out"); 741 | for (String f : files) { 742 | reader = new BufferedReader(new FileReader(f + "-out")); 743 | while ((line = reader.readLine()) != null) { 744 | writer.println(line); 745 | } 746 | reader.close(); 747 | writer.flush(); 748 | } 749 | writer.close(); 750 | 751 | //Set threadKeys = threads.keySet(); 752 | for (ValidateMTBQEven3AeroTolls vt : threads.values()) { 753 | totalType0Seen += vt.type0Seen; 754 | totalType2Seen += vt.type2Seen; 755 | totalType3Seen += vt.type3Seen; 756 | totalType0Processed += vt.type0Processed; 757 | totalType1Processed += vt.type1Processed; 758 | totalType2Processed += vt.type2Processed; 759 | totalType3Processed += vt.type3Processed; 760 | } 761 | System.out.printf("Total type 0 seen:\t\t%d\n", totalType0Seen); 762 | System.out.printf("Total type 2 seen:\t\t%d\n", totalType2Seen); 763 | System.out.printf("Total type 3 seen:\t\t%d\n", totalType3Seen); 764 | System.out.printf("Total type 0 processed:\t\t%d\n", totalType0Processed); 765 | System.out.printf("Total type 1 processed:\t\t%d\n", totalType1Processed); 766 | System.out.printf("Total type 2 processed:\t\t%d\n", totalType2Processed); 767 | System.out.printf("Total type 3 processed:\t\t%d\n", totalType3Processed); 768 | } 769 | } 770 | 771 | -------------------------------------------------------------------------------- /PythonOriginal/GENERATEDATA.Original.md: -------------------------------------------------------------------------------- 1 | # How to generate data files 2 | 3 | ## Notes 4 | 2016-02-03: New scripts have been written (but not yet posted) to reduce the time required for many of the tasks below. 
For example, the creation of re-entrant cars has gone from days to hours--and the number of re-entrant cars is also much greater. The process of cleaning raw files and creating historical tolls has also been parallelized to take advantage of multiple cores and multiple machines.
5 |
6 | For the creation of re-entrant cars, the previous method--which was still faster than going to a database--took ~30+ hours to create ~200K replacements from a set of ~780K cars with times for a 50 expressway dataset. The new method will produce ~1M replacements from a set of ~2.1M cars with times in six hours, most of which is spent simply reading the 150GB of data in a 150 expressway dataset.
7 |
8 | Also added are stripped versions of Duplicates.pl from the original mitsim generator that no longer need a database but simply generate raw expressway files.
9 |
10 | Once posted, the scripts can still be refined over time.
11 |
12 | To create the datafiles, download the data generator from http://www.cs.brandeis.edu/~linearroad/tools.html.
13 |
14 | ### Using the original generator
15 | To get the original generator working on CentOS 6.5/6, or another modern 64-bit Linux distribution, the 32-bit compatibility pack must be installed. If an older, 32-bit version of Linux is available (e.g., 32-bit CentOS 4.8) that works too. Or, you could try recompiling the mitsim program into a 64-bit version.
16 |
17 | Tests on both a 64-bit OS (CentOS in Azure) with the 32-bit compatibility pack installed and a 32-bit CentOS 4.8 install on a private machine were successful.
18 |
19 | The general steps for CentOS 6.5/6 follow:
20 |
21 | Download the original tools and unpack them into an arbitrary directory:
22 |
23 | ```
24 | wget http://www.cs.brandeis.edu/~linearroad/files/mitsim.tar.gz
25 | mkdir MITSIMLab
26 | cd MITSIMLab
27 | tar xf ../mitsim.tar.gz
28 | ```
29 |
30 | Install and set up the PostgreSQL database (these instructions may vary based on the version of PostgreSQL).
For version 8.4.0, which the default CentOS 6.5/6 repo in Azure installs:
31 |
32 | ```
33 | sudo yum -y install postgresql postgresql-server
34 | sudo service postgresql initdb
35 | sudo service postgresql start
36 | sudo su postgres
37 | psql
38 | psql> create user <username>; # this should be the same username from which the scripts will be run
39 | psql> alter role <username> with superuser login;
40 | psql> create database test;
41 | ```
42 |
43 | Install gcc and make if not already installed.
44 | ```
45 | sudo yum -y install gcc make
46 | ```
47 | Install the appropriate Perl modules for the scripts to interact with PostgreSQL.
48 | ```
49 | sudo perl -MCPAN -e "install DBI"
50 | sudo perl -MCPAN -e "install DBD::PgPP"
51 | sudo perl -MCPAN -e "install Math::Random"
52 | ```
53 | Install the 32-bit compatibility pack:
54 | ```
55 | sudo yum -y install compat-libstdc++-296.i686
56 | ```
57 | You should now have PostgreSQL set up with an appropriate user and database along with the proper Perl modules. To test database connectivity, modify the included *test.pl* file to point to the new database connection:
58 | ```
59 | DBI->connect("DBI:PgPP:dbname=test", "", "")
60 | ```
61 | and insert a `print $dbh;` statement after the connection statement to test for connectivity. If it prints something like DBI::db=HASH(0x138f1a0) the connection should be good.
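That "did it print a handle?" check can be automated when running the setup on many machines. The helper below is a hypothetical illustration (not part of the original scripts): it inspects captured output from `test.pl` for a printed DBI handle string.

```python
import re

def connection_ok(test_pl_output: str) -> bool:
    # A printed handle like "DBI::db=HASH(0x138f1a0)" means connect() returned
    # a live handle; an error message or empty output means it did not.
    return re.search(r"DBI::db=HASH\(0x[0-9a-fA-F]+\)", test_pl_output) is not None

print(connection_ok("DBI::db=HASH(0x138f1a0)"))  # True
print(connection_ok("DBI connect failed: ..."))  # False
```

This only checks the output format; it does not replace actually running `test.pl` against the database.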
62 |
63 | ### Running the script
64 | To start the data creation process, you primarily edit two files:
65 | `mitsim.config` and `linear-road.pl`
66 |
67 | Note that due to differences between PostgreSQL 8.4.0+ and 7.x.x, the latter being the version used by the original code, line 197 of `DuplicateCars.pl` should be changed from:
68 | ```
69 | $dbquery="UPDATE input SET carid=carstoreplace.carid WHERE carid=carstoreplace.cartoreplace;";
70 | ```
71 | to:
72 | ```
73 | $dbquery="UPDATE input SET carid=carstoreplace.carid FROM carstoreplace WHERE input.carid=carstoreplace.cartoreplace;";
74 | ```
75 | Note that this is not necessary if all we're generating are the raw files for later processing.
76 |
77 | In `mitsim.config`: change `directoryforoutput` to a directory of your choosing, set `databasename` to "test", set the password to `databasepassword=` (i.e., empty) if you don't have a password for the user, and select any number of expressways.
78 |
79 | NOTE: remove any trailing blank lines in `mitsim.config` to avoid `use of uninitialized value` errors.
80 |
81 | In `linear-road.pl` you can control a variety of parameters, but the only ones we've adjusted are `my $cars_per_hour`, increasing the value to 1000, and `my $endtime`, setting it to however long we want the simulation to run.
82 |
83 | To kick off the script, run `./run mitsim.config`.
84 |
85 | NOTE: if SELinux is present it may need to be disabled: `sudo setenforce 0`
86 |
87 | NOTE: the table `input` must be manually dropped or cleared between runs. This table is not automatically dropped because, if file permissions are not right, the final data can still be found in the `input` table even if it's not written out as `cardatapoints.out`. `cardatapoints.outN` are the raw files; `cardatapoints.out` is the final output after running duplications--or re-entrants, as we've called them.
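For reference, the effect of the corrected `UPDATE ... FROM` query above can be sketched in plain Python (a hypothetical illustration, not one of the repo's scripts): every carid listed as a `cartoreplace` value in `carstoreplace` is rewritten to its replacement carid.

```python
def replace_carids(input_rows, cars_to_replace):
    # input_rows: list of dicts, each carrying a 'carid' field (the 'input' table)
    # cars_to_replace: {cartoreplace: carid} (the 'carstoreplace' table)
    for row in input_rows:
        if row["carid"] in cars_to_replace:
            row["carid"] = cars_to_replace[row["carid"]]
    return input_rows

rows = [{"carid": 10}, {"carid": 11}, {"carid": 12}]
print(replace_carids(rows, {11: 99}))  # [{'carid': 10}, {'carid': 99}, {'carid': 12}]
```

The original 7.x-style query silently compared `carid` against itself under 8.4.0+, which is why the explicit `FROM carstoreplace` join is needed.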
88 | 89 | To drop the database table `input`: 90 | ``` 91 | psql -d test # use the -d flag to choose a database, otherwise psql will default to trying to connect to a database with the same name as the user 92 | psql> drop table input; 93 | ``` 94 | And also, remove the output files from the chosen output directory, moving any of the raw `cardatapoints.outN` files first if desired. 95 | 96 | For convenience, add the following lines to `DuplicateCars.pl` before the statements that create the table `input`: 97 | ``` 98 | writeToLog ( $logfile, $logvar, "Dropping input table."); 99 | $dbquery="DROP TABLE IF EXISTS input;"; 100 | $sth=$dbh->prepare("$dbquery") or die $DBI::errstr; 101 | $sth->execute; 102 | unlink glob $dir."/*"; # remove previous files from output directory 103 | ``` 104 | Depending on the endtime and number of expressways chosen, the program can run for hours, if not days or more. Each 3 hour, 1 expressway set can take ~3-5 hours to generate. 105 | 106 | The raw data is found under the `directoryforoutput` as N files named `cardatapoints.out`N, N being 0 .. `numberofexpressways`-1. 107 | 108 | The script `DuplicateCars.pl` can perform the process of combining the multiple raw data files but cannot handle a very large number of expressways in reasonable time. The self-join query mentioned in the general introduction explains why (the progressive slowdown of the self-join query that finds duplicates). The `directoryforoutput` must also be readable and writeable by the user `postgres`. 109 | 110 | In lieu of `DuplicateCars.pl` the directions below can be followed to create arbitrarily large datasets with duplicates. 111 | 112 | ### Creating a single combined data file 113 | As stated in the README, datasets of arbitrary sizes can be generated on a single machine or by parallelizing the expressway generation on multiple machines. But, after generation, these must be cleaned (if desired) and combined.
114 | 115 | **These are the scripts and commands used for cleaning raw files--run on the individual raw files. (Any number of additional steps can be added as desired.)** 116 | 117 | ``` 118 | dataval.py <raw file> <out file> 119 | datarm2.py <out file> > <out file 2> # remove carids with only <=2 tuples 120 | datamakeexit.py <out file 2> > <out file 3> # make the last type 0 record an exit lane tuple 121 | mv <out file 3> <clean dir> 122 | ``` 123 | After cleaning, merge the _n_ "clean" files. 124 | ``` 125 | datacombine.py <clean dir> <combined file> 126 | ``` 127 | Then, create the tolls and the random re-entrant cars. 128 | ``` 129 | combine.py <combined file> <data dir> <num xways> 130 | # combine.py uses: p_duplicates.py, historical-tolls.pl 131 | # Also, pre-create the following files in the <data dir> and change permissions accordingly: 132 | touch carsandtimes.csv; touch carstoreplace.csv; chmod 777 carsandtimes.csv; chmod 777 carstoreplace.csv 133 | # These steps are necessary as some databases write out files with owner read permissions only. 134 | ``` 135 | Clean the generated tolls to match the tuples present in the position reports. 136 | ``` 137 | datafixtype3.py <data dir>/my.data.out <data dir>/my.tolls.out <data dir>/my.tolls.clean 138 | ``` 139 | 140 | **Recap of scripts and order of usage:** 141 | 142 | > On each raw file: 143 | ``` 144 | dataval.py <raw file> <out file> 145 | datarm2.py <out file> > <out file 2> 146 | datamakeexit.py <out file 2> > <clean file> 147 | ``` 148 | > Using the cleaned files create a single file: 149 | ``` 150 | datacombine.py <clean dir>/ <data dir>/clean.combined 151 | ``` 152 | > On the single combined file: 153 | ``` 154 | combine.py <data dir>/clean.combined <data dir> <num xways> 155 | ``` 156 | > On the output toll file: 157 | ``` 158 | datafixtype3.py <data dir>/my.data.out <data dir>/my.tolls.out <data dir>/my.tolls.clean 159 | ``` 160 | ### Final outputs 161 | The final outputs will be: 162 | ``` 163 | <data dir>/my.data.out 164 | <data dir>/my.tolls.clean 165 | ``` 166 | The scripts `preprawdata.sh` and `prepcleandata.sh` combine all the scripts and take a directory of raw or clean files, respectively, and output the final files.
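The per-file cleaning sequence in the recap above can be sketched as a small Python driver. This is only a sketch, not the repo's actual `preprawdata.sh`/`prepcleandata.sh`; the `.val`/`.rm2`/`.clean` suffixes and the stdout-redirection targets are assumptions:

```python
import os

# Sketch of the per-file cleaning pipeline from the recap above.
# Script names come from this repo; the intermediate suffixes are assumptions.
def cleaning_plan(raw_files, clean_dir):
    """Return (argv, stdout_target) pairs for each raw file.
    A target of None means the script writes its output file itself
    (dataval.py takes an explicit output path); otherwise the script
    prints to stdout and the target names the redirection file."""
    plan = []
    for raw in raw_files:
        base = os.path.join(clean_dir, os.path.basename(raw))
        plan.append((["python", "dataval.py", raw, base + ".val"], None))
        plan.append((["python", "datarm2.py", base + ".val"], base + ".rm2"))
        plan.append((["python", "datamakeexit.py", base + ".rm2"], base + ".clean"))
    return plan
```

Each pair could then be executed with `subprocess.call(argv, stdout=open(target, 'w'))` when a target is given.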
167 | -------------------------------------------------------------------------------- /PythonOriginal/README.Original.md: -------------------------------------------------------------------------------- 1 | # linearroad 2 | Walmart version of the Linear Road streaming benchmark. 3 | 4 | ## Overview 5 | LinearRoad is a streaming data management system (SDMS) benchmark originally created in 2004. 6 | It was created at a time when such systems were relatively new. 7 | The original Linear Road benchmark paper was a joint effort between collaborators from Stanford University, Brandeis University, Massachusetts Institute of Technology, and the Oregon Health and Science University/Oregon Graduate Institute. It has since been endorsed by Stanford, Brandeis, MIT, and Brown Universities as an SDMS benchmark. 8 | 9 | All original files were downloaded from http://www.cs.brandeis.edu/~linearroad/tools.html. 10 | These original files were then modified or re-written for performance reasons, including the creation of arbitrarily large datasets in a reasonable amount of time. 11 | 12 | The spirit of the original files was followed. 13 | 14 | This is a 0.1 release. 15 | 16 | Changes will continually be made to bring the code closer to the intent of the original paper and new features will be added. 17 | 18 | ## Notes 19 | Type 4 queries are not implemented as per the original paper, nor are they implemented in subsequent implementations. We plan on implementing them in the near future. 20 | 21 | The validator and many portions of data generation have been completely rewritten in Python. The choice of Python was arbitrary. At the moment, the validator is limited by RAM. A version that leverages a NoSQL K/V store (currently Redis) to mitigate RAM issues is being developed and tested. This memory limitation puts a boundary on the number of expressways that can currently be validated.
22 | 23 | ### Data Generation 24 | Datasets of arbitrary sizes can be generated on a single machine or by parallelizing the expressway generation on multiple machines. The original mitsim (microscopic traffic simulator) program creates each expressway as a separate file. But, each file/expressway can take up to three hours or more to create. The file size for a one expressway, three hour simulation, with 1,000 cars per segment per hour is ~1GB and will contain ~20M tuples. Since each file is independent of all other files, you can parallelize the creation of these base files on as many machines or VM's as desired. 25 | 26 | Each independent file created by mitsim is expressway 0, and each has its own independent car and query id numbering running from ~0 up to some _n_. In order to combine an arbitrary number of these files into a single simulation file, the expressway number, as well as the car and query ids, must be incremented according to the number of expressways being combined. 27 | 28 | Before combining the files we run some cleaning on the original, "raw" files to create a "clean"er set before running the combination. This cleaning helps remove some noise from the data. For example, some carids in the raw files will have exited but will magically reappear without going through an entry lane. 29 | 30 | After cleaning, the initial combination process merges _n_ "clean" files, incrementing the expressway number from _0 thru n-1_ for each cleaned file. It also increments the car and query ids by a current max car id and current max query id from the previous file to avoid overlap. 31 | 32 | Then, the subsequent combination process creates the tolls and creates the random re-entrant cars by replacing a percentage of random cars by other random cars that meet the criteria of having an entry time _1000 * random.random() + 61_ greater than the exit time of another car. 33 | 34 | The percentage of cars to check for possible re-entry is 10% by default.
Note that this does not mean 10% of the cars will be re-entrant but only that 10% will be checked to see if they _can_ be re-entrant. And, this 10% is also not actually 10% of the actual number of cars, since the function used to create these possibly re-entrant cars uses _max carid_ and assumes the presence of carid's from 100 to _max carid_. The 100 is arbitrary since carids below 100 exist. But, more importantly, although carid's monotonically increase they do not do so only in increments of 1. The actual carid's present in a given expressway may be 5, 20, ..., 123, 124, 130, etc... But, the 10% generated assumes the presence of carid's 101, 102, 103, etc.... Nevertheless, a random number of carid's that represents _at most_ 10% of the actual carid's is created as _duplicatecars_. Meaning, try to duplicate these cars--which is the same as trying to make these cars re-entrant. 35 | 36 | From this larger number of potential re-entrant carid's, a table with the _enter-time_, the _leave-time_, and the _expressway_ of each carid that actually exists in the generated data is created. Then comes the phase where each carid in this new table is checked to see if a carid with an _enter-time > 1000 * random.random() + 61 + leave-time_ of another car exists. And, if more than one expressway is simulated, the carid's must be from different expressways. We are simulating a car leaving one expressway and re-entering on a different expressway at a later point in time. If only one expressway is present then we are simply simulating a car re-entering at a later point in time. The _1000 * random.random() + 61_ appears to be arbitrary. Python's random.random() returns a floating point number between 0.0 and 1.0, not including 1.0, or [0.0,1.0).
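The eligibility criterion just described can be sketched as a small predicate. This is a sketch only; rows follow the carsandtimes layout _(carid, entertime, leavetime, xway)_, and the random draw can be passed in so the check is deterministic:

```python
import random

# Sketch of the re-entrancy criterion described above.
# A row is (carid, entertime, leavetime, xway), as in carsandtimes.
def can_reenter(leaver, candidate, num_xways, r=None):
    """True if `candidate` could be a re-entrant continuation of `leaver`."""
    if r is None:
        r = random.random()          # [0.0, 1.0), as noted above
    gap = 1000 * r + 61
    if candidate[1] <= leaver[2] + gap:
        return False                 # must enter well after the other car's exit
    if num_xways > 1 and candidate[3] == leaver[3]:
        return False                 # multi-xway runs require a different xway
    return True
```

With `r = 0.0` the minimum required gap between exit and re-entry is 61 seconds.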
The original SQL version used the following query: 39 | 40 | _SELECT times.carid, times.entertime, times.leavetime, times_1.carid as carid1, times_1.entertime as entertime1, times_1.leavetime as leavetime1 41 | FROM carsandtimes as times, carsandtimes AS times_1 42 | WHERE times_1.entertime>times.leavetime+1000*random()+61 43 | LIMIT 1;_ 44 | 45 | If a match is found the two carids that match are removed from the carsandtimes table and entered into a new carstoreplace table, which simply holds two carid's per row. 46 | 47 | This query slows down tremendously as the "low-hanging fruit" is removed. For perspective: for the 50 expressway data set there are 7,851,650 unique carid's with a max carid of 13,958,137. From this max carid we get 1,395,100 potential duplicate, or re-entrant, cars per our description above. And, we get 783,265 actual carid's that exist. And, from these that actually exist we find, or create, 204,095 re-entrant cars. 48 | 49 | The issue with the original SQL statement is the size of the self-joined table. If attempted with this 50 expressway set the self-joined table would be up to 783,265 ^ 2, or 613,504,060,225 rows, from which to try and find an entry matching the WHERE clause. Again, low hanging fruit can be found relatively quickly--from negligible to under a second. But, as these easier finds are removed each additional match can run for many seconds, to minutes and tens of minutes, as up to the max rows above are potentially scanned. Even if we limit the number of re-entrant cars to 200K, and even if the performance were steady at one per second, it would take over 55 hours to create the re-entrant cars. But, as the query does slow down, and creation of even 200K would likely run into days, weeks, or more the above query was untenable. 50 | 51 | To mitigate the issue above a separate script was created that creates the re-entrant cars. 52 | 53 | This script creates the re-entrant cars in a single pass (or any number of passes). 
54 | All the actually existing cars are loaded into a list, shuffled, and that list is operated on to create a separate list of lists, or tuples, of _(carid,cartoreplace)_. 55 | The current script tries 1,000 random times to find a suitable match. If a match is found the current carid and the matching carid are removed. Since the list is modified during iteration, elements will be skipped if only one pass is used. This script is itself an improvement over an O(n^2) version which simply iterated through a copy of the list to find a suitable replacement. Ideally this script is O(n), but for almost 1M records it still had a run time of roughly 30 hours for a single pass. Some modifications to improve the run time include reducing the number of tries to 500 or maybe 100. 56 | 57 | Another option would be to stop looping the original query after an arbitrary number of replacements are found. Or, stop when queries start taking more than some arbitrary number of seconds. 58 | 59 | The tolls are simply a random table using the max carid after all the files have been combined. So, if the max carid were 100 with two expressways then the tolls table would be carid's 1 thru 100, with a row for each carid-day combination, where days run from 1 thru 69. Each historical toll row will have a random expressway of 0 or 1 and a random toll value from 0 thru 98. So, the table size will be max carid * 69. For our 50 expressway set the number of rows is 963,111,453. Note that the random expressway will not necessarily match the expressway associated with the position report tuple. This is accounted for later. 60 | 61 | The original mitsim paper from 1996 can be found here: https://its.mit.edu/sites/default/files/documents/MITSIM2.PDF.
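The historical-toll table described above can be sketched in Python. This is a sketch mirroring `historical-tolls.pl`, not the script used by the pipeline; note that Perl's `int(rand(99))` yields tolls 0 through 98:

```python
import random

# Sketch of the historical toll generation described above: one row per
# (carid, day) pair, days 1 through 69, with a random xway and toll,
# mirroring historical-tolls.pl.
def historical_toll_rows(num_xways, max_carid):
    for vid in range(1, max_carid + 1):
        for day in range(1, 70):                # days 1..69
            toll = random.randrange(99)         # 0..98, like Perl int(rand(99))
            xway = random.randrange(num_xways)  # 0..num_xways-1
            yield (vid, day, xway, toll)
```

The row count is max carid * 69; for the 50 expressway set, 13,958,137 * 69 = 963,111,453 rows, matching the figure above.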
62 | -------------------------------------------------------------------------------- /PythonOriginal/README.md: -------------------------------------------------------------------------------- 1 | This folder holds the working, originally modified Python scripts along with the original toll generation script. All of these are still valid. The newer versions will generally be much faster. 2 | -------------------------------------------------------------------------------- /PythonOriginal/combine.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, random 2 | import MySQLdb 3 | import subprocess 4 | 5 | # combine.py: Now takes a single, combined, clean file and creates the tolls and re-entrant cars. 6 | # Requires MySQL and attendant drivers. Other databases can be used as well. 7 | # This script requires the original 'historical-tolls.pl' script and the 'p_duplicates.py' script. 8 | # It turns out mysql can run the original self-join query fairly quickly up to a large number of replacements, but 9 | # 50K took 355176 secs, so a non-db based recombination appears to still be better. 10 | # Usage: python combine.py 11 | # 12 | # NOTE: Modifying the Python to NOT use a database would be faster. The Java classes to replicate as Python 13 | # scripts would be: 14 | # 1) create_carsandtimes.java, 15 | # 2) create_carstoreplace.java, 16 | # 3) replacecars.java, 17 | # and 4) combine_after_replace.java. 
18 | 19 | file = sys.argv[1] # not used, because 20 | datadir = sys.argv[2] # no trailing '/' 21 | numXWays = sys.argv[3] 22 | 23 | db = MySQLdb.connect(db="test",host="127.0.0.1",user="root") 24 | db2 = MySQLdb.connect(db="test",host="127.0.0.1",user="root") 25 | c = db.cursor() 26 | c2 = db2.cursor() 27 | overlap = 10 28 | maxCarId = None 29 | 30 | def generateRandomTable(maxCarId, overlap, db): 31 | c = db.cursor() 32 | for i in xrange(100, maxCarId): 33 | if random.random() * 100 < overlap: # the perl rand() function returns 0 < max; we use python vals from [0.0, 1.0) 34 | c.execute("INSERT INTO duplicatecars VALUES ("+str(i)+")") 35 | db.commit() 36 | c.close() 37 | 38 | # DROP ALL TABLES IF THEY EXIST 39 | print "Dropping tables..." 40 | c.execute("DROP TABLE IF EXISTS input") 41 | c.execute("DROP TABLE IF EXISTS carsandtimes") 42 | c.execute("DROP TABLE IF EXISTS carstoreplace") 43 | c.execute("DROP TABLE IF EXISTS duplicatecars") 44 | 45 | # CREATE input TABLE 46 | print "Creating tables..." 47 | c.execute("CREATE TABLE IF NOT EXISTS input ( type int, time int, carid int, speed int, xway int, lane int, dir int, seg int, pos int, qid int, m_init int, m_end int, dow int, tod int, day int, shard key(xway))") # just finding a field that won't change 48 | c.execute("CREATE INDEX inputcarid ON input (carid)") 49 | c.execute("CREATE INDEX inputcaridtime ON input (carid, time)") 50 | c.execute("CREATE INDEX inputtime ON input (time)") 51 | c.execute("CREATE INDEX inputlane ON input (lane)") 52 | c.execute("CREATE INDEX inputtype ON input (type)") 53 | 54 | # CREATE duplicatecars TABLE 55 | c.execute("CREATE TABLE duplicatecars (carid int, shard key(carid))") 56 | 57 | # CREATE carstoreplace TABLE 58 | c.execute("CREATE TABLE carstoreplace (carid int, cartoreplace int, shard key(carid))") 59 | 60 | # INSERT RECORDS 61 | print "Inserting records..." 
62 | start_time = time.time() 63 | print "Start time: " + time.strftime("%H:%M:%S") 64 | # this may be too slow 65 | # the other, and better option, is to parallel load the data and skip this step (COMMENT OUT THE DELETE TABLE input ABOVE!) 66 | # memsql loads can be very, very quick and thus a combination of memsql and mysql _may_ yield the fastest results 67 | c.execute("LOAD DATA INFILE '"+file+"' INTO TABLE input FIELDS TERMINATED BY ','") 68 | db.commit() 69 | 70 | print "Total time to load file(s) ... " + str(time.time() - start_time) + " seconds." 71 | 72 | # GET MAX CAR ID 73 | print "Getting maxCarId..." 74 | c.execute("SELECT max(carid) FROM input") 75 | r = c.fetchone() 76 | maxCarId = r[0] 77 | print "maxCarId: " + str(maxCarId) 78 | 79 | # GENERATE HISTORICAL TOLLS 80 | print "Generating historical tolls..." 81 | subprocess.call(["perl", "historical-tolls.pl", str(numXWays), str(maxCarId), "."]) 82 | subprocess.call(["mv", "historical-tolls.out", datadir+"/my.tolls.out"]) 83 | 84 | # Generate random duplicate values for potential replacement 85 | print "Creating random table..." 86 | generateRandomTable (maxCarId, overlap, db) 87 | 88 | # CREATE carsandtimes TABLE 89 | print "Creating carsandtimes..." 90 | c.execute("CREATE TABLE carsandtimes (carid int, entertime int, leavetime int, xway int, shard key(carid))") # MemSQL does NOT support CREATE TABLE ... 
AS SELECT 91 | c.execute("SELECT duplicatecars.carid, min(input.time) as entertime, max(input.time) as leavetime, xway FROM duplicatecars, input WHERE duplicatecars.carid=input.carid GROUP by duplicatecars.carid") 92 | for i in xrange(0, c.rowcount): 93 | r = c.fetchone() 94 | c2.execute("INSERT INTO carsandtimes VALUES ("+str(r[0])+","+str(r[1])+","+str(r[2])+","+str(r[3])+")") 95 | db2.commit() 96 | c.execute("CREATE INDEX carsandtimescarid ON carsandtimes (carid)") 97 | c.execute("CREATE INDEX carsandtimescaridenter ON carsandtimes (carid, entertime)") 98 | c.execute("CREATE INDEX carsandtimescaridleave ON carsandtimes (carid, leavetime)") 99 | 100 | # CREATE MATCHES FOR MULTIPLE EXITS AND RE-ENTRANTS 101 | print "Processing and creating duplicates..." 102 | c.execute("SELECT * FROM carsandtimes INTO OUTFILE '"+datadir+"/carsandtimes.csv' FIELDS TERMINATED BY ','") 103 | 104 | # RUN THE DUPLICATION SCRIPT 105 | subprocess.call(["python", "p_duplicates.py", datadir, str(numXWays)]) 106 | 107 | # Take the results of the duplication script and insert into TABLE carstoreplace 108 | print "Loading carstoreplace file into carstoreplace table." 
109 | c.execute("LOAD DATA INFILE '"+datadir+"/carstoreplace.csv' INTO TABLE carstoreplace FIELDS TERMINATED BY ','") 110 | db.commit() 111 | 112 | # Update input table with duplicates 113 | c.execute("SELECT * FROM carstoreplace") 114 | num_recs = c.rowcount 115 | print "Number of cars to replace: " + str(num_recs) 116 | for i in xrange(0, num_recs): 117 | r = c.fetchone() 118 | print "Replacing record " + str(i) + " of " + str(num_recs) + ", " + str(r[1]) + " with " + str(r[0]) 119 | c2.execute("UPDATE input SET carid="+str(r[0])+" WHERE carid="+str(r[1])); 120 | db2.commit() 121 | 122 | # Export final file 123 | print "Exporting final data file to " + datadir + "/my.data.out" 124 | c.execute("SELECT * FROM input ORDER BY time INTO OUTFILE '"+datadir+"/my.data.out' FIELDS TERMINATED BY ','") 125 | 126 | c.close() 127 | c2.close() 128 | db.close() 129 | db2.close() 130 | -------------------------------------------------------------------------------- /PythonOriginal/datacombine.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | # datacombine.py: Combine data files outside of a db. 4 | # Read the files in folder x and write out a new single file. 5 | # The time fields will not be accurate in the final file. This is why this combined file was then loaded into a 6 | # database. The database takes care of ordering the data by time. The Java version is different and requires time 7 | # ordering in the 'combine_after_replace' because the Java version does not use a database. 8 | # Aside: Python is wonderfully self-documenting. 9 | # Compare the Java versions to the Python versions and you'll see succinctness, clarity, and ease of reading in 10 | # the Python versions. 11 | # Why? There simply isn't as much code to decipher. Amazing. 12 | # The Java versions happen to be faster, much faster. 
13 | # Usage: datacombine.py <clean dir> <outfile> 14 | 15 | folder = sys.argv[1] 16 | outfile = open(sys.argv[2], 'w') 17 | 18 | if not os.path.isdir(folder): 19 | print "First argument must be a directory of cleaned data files" 20 | print "Usage: datacombine.py <clean dir> <outfile>" 21 | sys.exit(1) 22 | 23 | # We need full paths to the files. 24 | # (os.path.join below works with or without a trailing slash on the directory.) 25 | files = list(os.listdir(folder)) 26 | 27 | # Iterate through the files and get the max car id of each file to accumulate and add to carid and queryid. 28 | maxcarid = 0 29 | maxqid = 0 30 | filecount = 0 # Used to assign xways. 31 | for file in files: 32 | f = open(os.path.join(folder, file)) 33 | 34 | # Find current max carid and qid for this file 35 | curmaxcarid = 0 36 | curmaxqid = 0 37 | 38 | for line in f: 39 | t = line.strip().split(",") 40 | 41 | caridint = int(t[2]) 42 | qidint = int(t[9]) 43 | 44 | if caridint > curmaxcarid: 45 | curmaxcarid = caridint 46 | if qidint > curmaxqid: 47 | curmaxqid = qidint 48 | 49 | if filecount > 0: 50 | caridint += maxcarid 51 | t[2] = str(caridint) 52 | if t[0] != '0': # Update queryid's only for non-Type 0 notifications. 53 | qidint += maxqid 54 | t[9] = str(qidint) 55 | if t[0] == '0': # Update the xway number. 56 | t[4] = str(filecount) 57 | 58 | outfile.write(",".join(t)+"\n") 59 | 60 | maxcarid += curmaxcarid+1 61 | maxqid += curmaxqid+1 62 | 63 | f.close() 64 | filecount += 1 65 | -------------------------------------------------------------------------------- /PythonOriginal/datafixtype3.py: -------------------------------------------------------------------------------- 1 | import time, sys, MySQLdb 2 | 3 | # datafixtype3.py: 4 | # Run AFTER combine.py. 5 | # This fixes the toll file to have matching xways with the main data file. 6 | # Otherwise, the Type 3 requests from the main data file would not match the randomly generated toll file. 7 | # Writes the corrected tolls to the given output file (via SELECT ... INTO OUTFILE).
8 | # Usage: python datafixtype3.py <datafile> <histfile> <outfile> 9 | # Note: the <datafile> is no longer needed and can just be any file for now. 10 | # 11 | # NOTE: Again, this file may be better if re-written in the Java version, 'fixtolls.java.' 12 | 13 | 14 | db = MySQLdb.connect(db="test",user="root",host="127.0.0.1") 15 | c = db.cursor() 16 | db2 = MySQLdb.connect(db="test",user="root",host="127.0.0.1") 17 | c2 = db2.cursor() 18 | 19 | datfile = sys.argv[1] # arg 1 is no longer needed and can be removed; it was necessary when the output file, rather than the database, was used 20 | histfile = sys.argv[2] 21 | outfile = sys.argv[3] 22 | 23 | print "Dropping historical table" 24 | c.execute("DROP TABLE IF EXISTS histtoll") 25 | 26 | print "Creating historical table" 27 | c.execute("CREATE TABLE histtoll (carid int, day int, xway int, amt int, shard key(carid))") 28 | 29 | print "Loading historical table" 30 | st = time.time() 31 | c.execute("LOAD DATA INFILE '" + histfile + "' INTO TABLE histtoll FIELDS TERMINATED BY ','") 32 | db.commit() 33 | 34 | print "Time to load '" + histfile + "': " + str(time.time() - st) 35 | 36 | print "Use the already existing input table..."
37 | c.execute("SELECT carid, day, xway FROM input WHERE type = 3") 38 | rc = c.rowcount 39 | count = 0 40 | for i in xrange(0, rc): 41 | r = c.fetchone() 42 | c2.execute("UPDATE histtoll SET xway = " + str(r[2]) + " WHERE carid = " + str(r[0]) + " AND day = " + str(r[1])) 43 | db2.commit() 44 | count += 1 45 | 46 | print "Number of type 3 corrections: " + str(count) 47 | 48 | print "Print NEW historical tolls file" 49 | c.execute("SELECT * FROM histtoll INTO OUTFILE '" + outfile + "' FIELDS TERMINATED BY ','") 50 | db.commit() 51 | 52 | c.close() 53 | db.close() 54 | c2.close() 55 | db2.close() 56 | -------------------------------------------------------------------------------- /PythonOriginal/datamakeexit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # ==== 4 | # Note: This step isn't really necessary and can be removed. 5 | # ==== 6 | 7 | # datamakeexit.py: Ensure that all vehicles get off the xway. 8 | # Run after dataval.py and datarm2.py. 9 | # Usage: datamakeexit.py 10 | 11 | f = open(sys.argv[1]) 12 | 13 | lasttimes = {} 14 | 15 | # Read the file and find the last time for each vehicle. 16 | for line in f: 17 | t = line.strip().split(",") 18 | lasttimes[t[2]] = t[1] 19 | 20 | # Go back to the beginning of the file and re-read, 21 | # when the last notification for a car is seen modify the line to make it an exiting notification. 22 | f.seek(0) 23 | for line in f: 24 | t = line.strip().split(",") 25 | if t[1] == lasttimes[t[2]] and t[0] == '0': # Only last appearing type 0 queries need adjustment. 26 | t[3] = '10' 27 | t[5] = '4' 28 | print ",".join(t) 29 | else: 30 | print line.strip() 31 | -------------------------------------------------------------------------------- /PythonOriginal/datarm2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # datarm2.py: Remove carid's with only one or two records. 4 | # Writes to stdout. 
5 | # Usage: python datarm2.py <infile> > <outfile> 6 | 7 | f = open(sys.argv[1]) 8 | 9 | # Hold carid's and the number of times the carid appears in this file. 10 | counts = {} 11 | for line in f: 12 | t = line.strip().split(",") 13 | if t[2] not in counts: 14 | counts[t[2]] = 1 15 | else: 16 | counts[t[2]] += 1 17 | 18 | # Read the file again and ignore those carid's that don't have more than two records. 19 | f.seek(0) 20 | for line in f: 21 | t = line.strip().split(",") 22 | if t[2] in counts: 23 | if counts[t[2]] > 2: # Ensure this carid has > 2 records. 24 | if t[0] != '4': # Ignore type 4's. 25 | if t[0] == '3': # Redundant if run through dataval.py, but check for day 0 type 3's. 26 | if t[len(t)-1] == '0': 27 | continue 28 | print line.strip() 29 | -------------------------------------------------------------------------------- /PythonOriginal/dataval.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | 3 | # dataval.py: With a raw mitsim data file perform the following: 4 | # 1) Check for position reports that are not 30 secs apart, and simply report. 5 | # 2) Ensure car does not reappear after exiting. 6 | # 3) Remove negative positions and segments. 7 | # 4) Remove type 3 queries with a day of '0' if any. 8 | # Usage: dataval.py <infile> <outfile> 9 | 10 | f = open(sys.argv[1]) 11 | w = open(sys.argv[2], 'w') 12 | print "Validating data file: " + sys.argv[1] 13 | 14 | # This is the map that will hold all carid's and the last time seen. 15 | cars = {} # K: carid V: time 16 | exited = {} # K: carid V: time 17 | # Time how long it takes to perform parts of this cleanup. 18 | st = time.time() 19 | 20 | # Read through the file and fix issues. 21 | for line in f: 22 | # 'discard' isn't used.
23 | #discard = False 24 | t = line.strip().split(",") 25 | type = t[0] 26 | ctime = t[1] 27 | carid = t[2] 28 | speed = t[3] 29 | xway = t[4] 30 | lane = t[5] 31 | dir = t[6] 32 | seg = t[7] 33 | pos = t[8] 34 | day = t[14] 35 | 36 | if type == '0': 37 | if carid in exited: 38 | continue # Skip this row as it already exited. 39 | if carid not in cars: 40 | cars[carid] = ctime # Add the car and set its time. 41 | else: 42 | # 30 sec incr? 43 | if int(cars[carid]) != int(ctime)-30: 44 | print cars[carid] + " " + ctime 45 | print "Time error for car " + carid + " at time " + ctime 46 | cars[carid] = ctime 47 | if lane == '4': # Put this car in the exited dict. 48 | exited[carid] = ctime 49 | if int(seg) < 0: # Fix negative segments and positions. 50 | print t 51 | t[7] = '0' 52 | t[8] = '0' 53 | elif type == '2': # Ignore Type 2's. 54 | pass 55 | elif type == '3': # Ignore Type 3's with day 0. 56 | if day == '0': 57 | continue 58 | 59 | # 'discard' isn't used. 60 | #if not discard: 61 | w.write(",".join(t)+"\n") 62 | 63 | print "Time to run dataval.py: " + str(time.time() - st) 64 | 65 | -------------------------------------------------------------------------------- /PythonOriginal/dups.test.mysql.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, random 2 | import MySQLdb 3 | 4 | # Just a test file. 5 | # Test the self-join performance of mysql. 6 | # This simply tests the ability to find re-entrant duplicates. 7 | 8 | file = sys.argv[1] 9 | 10 | db = MySQLdb.connect(db="test",host="127.0.0.1",user="root") 11 | db2 = MySQLdb.connect(db="test",host="127.0.0.1",user="root") 12 | c = db.cursor() 13 | c2 = db2.cursor() 14 | 15 | # DROP ALL TABLES IF THEY EXIST 16 | print "Dropping tables..." 17 | c.execute("DROP TABLE IF EXISTS carsandtimes") 18 | c.execute("DROP TABLE IF EXISTS carstoreplace") 19 | 20 | # CREATE carstoreplace TABLE 21 | print "Creating tables..." 
22 | c.execute("CREATE TABLE carstoreplace (carid int, cartoreplace int, primary key(carid))") 23 | c.execute("CREATE TABLE carsandtimes (carid int, entertime int, leavetime int, xway int, primary key(carid))") # MemSQL does NOT support CREATE TABLE ... AS SELECT 24 | 25 | print "Loading data..." 26 | c.execute("LOAD DATA LOCAL INFILE '"+file+"' INTO TABLE carsandtimes FIELDS TERMINATED BY ','") 27 | db.commit() 28 | print "Creating indexes..." 29 | c.execute("CREATE INDEX carsandtimescarid ON carsandtimes (carid)") 30 | c.execute("CREATE INDEX carsandtimescaridenter ON carsandtimes (carid, entertime)") 31 | c.execute("CREATE INDEX carsandtimescaridleave ON carsandtimes (carid, leavetime)") 32 | 33 | # CREATE MATCHES FOR MULTIPLE EXITS AND RE-ENTRANTS 34 | print "Processing and creating duplicates..." 35 | sql = "SELECT times.carid, times.entertime, times.leavetime, times_1.carid as carid1, times_1.entertime as entertime1, times_1.leavetime as leavetime1" 36 | sql += " FROM carsandtimes as times, carsandtimes AS times_1" 37 | sql += " WHERE times_1.entertime>times.leavetime+1000*rand()+61" 38 | sql += " LIMIT 1" 39 | total_st = time.time() 40 | st = time.time() 41 | c.execute(sql) 42 | et = time.time() 43 | print et-st 44 | replacements = 0 45 | 46 | while c.rowcount > 0: 47 | r = c.fetchone() 48 | print r 49 | c2.execute("INSERT INTO carstoreplace VALUES("+str(r[0])+","+str(r[3])+")") 50 | db2.commit() 51 | c2.execute("DELETE FROM carsandtimes WHERE carid="+str(r[0])+" OR carid="+str(r[3])) 52 | db2.commit() 53 | replacements += 1 54 | print replacements 55 | st = time.time() 56 | c.execute(sql) 57 | et = time.time() 58 | print et-st 59 | print et-total_st 60 | 61 | c.close() 62 | c2.close() 63 | db.close() 64 | db2.close() 65 | 66 | print "Finished!" 
67 | -------------------------------------------------------------------------------- /PythonOriginal/historical-tolls.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # This was the original file used to generate historical tolls from the maxcarid in a given data set. 4 | # Note that when they say $max_xway0 they really mean the max car, or vehicle id (vid). 5 | 6 | @ARGV == 3 or die("to generate toll-history, give me # of xways, maxcarid, dir"); 7 | 8 | my $xway = $ARGV[0]; 9 | my $max_xway0 =$ARGV[1]; 10 | my $dir=$ARGV[2]; 11 | 12 | #open(OUT, ">$dir/xway$xway.historical-tolls"); 13 | # Put all historical tolls of the different expressways in 1 file 14 | open(OUT, ">$dir/historical-tolls.out"); 15 | 16 | 17 | for (my $vid = 1; $vid <= $max_xway0; ++$vid) { 18 | for (my $day = 1; $day <= 69; ++$day) { 19 | my $toll = int(rand(99)); 20 | $xway1 = int(rand ($xway)); 21 | # (vid, day, xway, tolls) 22 | print OUT "$vid,$day,$xway1,$toll\n"; 23 | } 24 | } 25 | close (OUT); 26 | -------------------------------------------------------------------------------- /PythonOriginal/p_duplicates.py: -------------------------------------------------------------------------------- 1 | import random, sys 2 | 3 | # p_duplicates.py: The self-join on carsandtimes is just TOO slow to create the random replacements. 4 | # This script should be called by combine.py 5 | # 6 | # NOTE: This is really slow too. For best results create Python versions of: 7 | # create_carsandtimes.java, create_carstoreplace.java, replacecars.java and combine_after_replace.java. 8 | 9 | dir = sys.argv[1] # Ensure there is no trailing '/' when calling from combine.py. 10 | f1 = open(dir + '/carsandtimes.csv') 11 | f2 = open(dir + '/carstoreplace.csv','w') 12 | 13 | # Do we need to account for more than one expressway (affects whether duplicates are assigned to different xways).
14 | numXWays = 1 15 | if len(sys.argv) > 2: 16 | numXWays = int(sys.argv[2]) # Cast to int; comparing the raw string against 1 below would silently misbehave. 17 | print "numXWays: " + str(numXWays) 18 | 19 | # Place token lists into a python list (carid, entertime, leavetime, seg). 20 | c1 = [] 21 | for l in f1: 22 | c1.append(l.strip().split(',')) 23 | f1.close() 24 | 25 | # Create a copy of the list. 26 | c2 = list(c1) 27 | 28 | # Shuffle both lists. 29 | random.shuffle(c1) 30 | random.shuffle(c2) 31 | 32 | replacements = [] 33 | 34 | # 4.10) Print the size of the original carsandtimes list of tuples. 35 | print "Original number of carsandtimes: " + str(len(c1)) 36 | 37 | ##################################################### 38 | # find an appropriate car to use as a re-entrant car. 39 | # car1: the current car of the first list. 40 | # cars0: the list of the current car (to remove car1 if a match is found). 41 | # cars: the list from which to find a match. 42 | # replacements: the list to hold the replacements tuples. 43 | ##################################################### 44 | def findCar(car1, cars0, cars, replacements): 45 | random_inc = 1000 * random.random() + 61 # Arbitrary increment from the original paper. 46 | 47 | # Try 1000 times to find a replacement for each car1. 48 | for i in xrange(0,1000): 49 | car2 = cars[random.randint(0,len(cars)-1)] 50 | if float(car2[1]) > float(car1[2])+random_inc: 51 | if numXWays > 1: 52 | if car2[3] == car1[3]: # Try again; the xways are the same. 53 | continue 54 | replacements.append([car1[0], car2[0]]) 55 | print len(replacements), # Print how many replacements we've found so far. 56 | # NOTE: we are modifying the lists as we're iterating over them. 57 | cars0.remove(car1) 58 | cars0.remove(car2) 59 | cars.remove(car1) 60 | cars.remove(car2) 61 | break 62 | 63 | for i in xrange(0,1): # Can choose arbitrary number of times to run loop.
64 | for c in c1: 65 | findCar(c, c1, c2, replacements) 66 | print "Length of c1: " + str(len(c1)) 67 | print "Length of c2: " + str(len(c2)) 68 | 69 | print "Number of replacements to make: " + str(len(replacements)) 70 | 71 | for t in replacements: 72 | f2.write(str(t[0])+","+str(t[1])+"\n") 73 | 74 | 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *** 2 | # NOTICE: 3 | 4 | ## This repository has been archived and is not supported. 5 | 6 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/) 7 | *** 8 | NOTICE: SUPPORT FOR THIS PROJECT HAS ENDED 9 | 10 | This project was owned and maintained by Walmart. This project has reached its end of life and Walmart no longer supports it. 11 | 12 | We will no longer be monitoring the issues for this project or reviewing pull requests. You are free to continue using this project under the license terms or forks of this project at your own risk. This project is no longer subject to Walmart's bug bounty program or other security monitoring. 13 | 14 | 15 | ## Actions you can take 16 | 17 | We recommend you take the following actions: 18 | 19 | * Review any configuration files used for build automation and make appropriate updates to remove or replace this project 20 | * Notify other members of your team and/or organization of this change 21 | * Notify your security team to help you evaluate alternative options 22 | 23 | ## Forking and transition of ownership 24 | 25 | For [security reasons](https://www.theregister.co.uk/2018/11/26/npm_repo_bitcoin_stealer/), Walmart does not transfer the ownership of our primary repos on Github or other platforms to other individuals/organizations. Further, we do not transfer ownership of packages for public package management systems.
26 | 27 | If you would like to fork this package and continue development, you should choose a new name for the project and create your own packages, build automation, etc. 28 | 29 | Please review the licensing terms of this project, which continue to be in effect even after decommission. 30 | 31 | # linearroad 32 | Walmart version of the Linear Road streaming benchmark. 33 | 34 | ## Overview 35 | LinearRoad is a streaming data management system (SDMS) benchmark originally created in 2004. 36 | It was created at a time when SDMSs were relatively new. 37 | The original Linear Road benchmark paper was a joint effort between collaborators from Stanford University, Brandeis University, Massachusetts Institute of Technology, and the Oregon Health and Science University/Oregon Graduate Institute. It has since been endorsed by Stanford, Brandeis, MIT, and Brown Universities as an SDMS benchmark. 38 | 39 | All original files were downloaded from http://www.cs.brandeis.edu/~linearroad/tools.html. 40 | These original files were then modified or re-written for performance reasons, including for quick creation of arbitrarily large datasets. 41 | 42 | 0.2 release. 43 | 44 | 45 | ## Notes 46 | Type 4 queries are not implemented, as per the original paper, nor are they implemented in subsequent implementations. Type 4 may be implemented in the future. 47 | 48 | The validator and many portions of data generation were originally re-written in Python but have since been re-written again in Java, mainly for performance reasons. The choice of Python was initially arbitrary. The original rewrite of the validator was limited by available RAM. Versions that leverage multiple nodes and/or persistent datastores have been experimented with. The current version of the Validator uses a single node, multi-threaded, with Aerospike as the datastore.
Using this setup, up to a 50-xway dataset has been validated in only 234 minutes on a single Azure DS12 node with four cores and 28 GB of RAM. 49 | 50 | ### Data Generation 51 | Datasets of arbitrary sizes can be generated on a single machine or by parallelizing the expressway generation on multiple machines. The original mitsim (microscopic traffic simulator) program creates each expressway as a separate file, and each expressway file can take three hours or more to generate. The file size for a one-xway, three-hour simulation with 1,000 cars per segment per hour is ~1GB and contains ~20M tuples. Since each file is independent of all other files, the creation of these raw files can be parallelized on many machines, or VMs. 52 | 53 | Each independent file created by mitsim is expressway 0, and each has its own independent car and query id numbering starting at ~100 and ~1, respectively. In order to combine an arbitrary number of these files into a single simulation file, the expressway number, as well as the car and query ids, must be incremented according to the number of expressways being combined. 54 | 55 | Before combining the files, some cleaning of the original, "raw" files is necessary to create a "clean"er set before running the combiner. This cleaning helps remove noise from the data. An example of noise: carids exist that exit but magically reappear without going through an entry lane. 56 | 57 | After cleaning, the initial combination process merges _n_ "clean" files, incrementing the expressway number from _0 thru n-1_ for each respective cleaned file. It also increments the car and query ids by a current _max_car_id_ and current _max_query_id_ from the previous file to avoid overlap.
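The offset logic of that combination step can be sketched in a few lines of Python. This is an illustrative Python 3 sketch, not the actual Java combiner: the field positions assumed here (2 = carid, 4 = xway, 9 = query id) follow the usual Linear Road tuple layout, and the function name and file handling are hypothetical.

```python
# Illustrative sketch of the combine step: merge n "clean" files into one,
# assigning xway 0..n-1 and offsetting car/query ids by the running maxima
# so ids never overlap across files. Field positions (2 = carid, 4 = xway,
# 9 = qid) are an assumption, not taken from the Java code.
import csv

def combine_clean_files(clean_files, out_path):
    max_car_id = 0
    max_query_id = 0
    with open(out_path, 'w', newline='') as out:
        writer = csv.writer(out)
        for xway, path in enumerate(clean_files):
            # Offsets are the maxima seen in all previously merged files.
            car_offset, query_offset = max_car_id, max_query_id
            with open(path, newline='') as f:
                for row in csv.reader(f):
                    row[4] = str(xway)  # every raw file is expressway 0
                    if int(row[2]) > 0:  # shift carid past the previous file's max
                        row[2] = str(int(row[2]) + car_offset)
                        max_car_id = max(max_car_id, int(row[2]))
                    if int(row[9]) > 0:  # shift query id the same way (-1 = none)
                        row[9] = str(int(row[9]) + query_offset)
                        max_query_id = max(max_query_id, int(row[9]))
                    writer.writerow(row)
    return max_car_id, max_query_id
```

The returned maxima are what the next stages (historical toll generation and re-entrant car creation) would key off.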
58 | 59 | Then, the subsequent combination process creates random historical tolls and creates the random re-entrant cars by replacing a percentage of random cars with other random cars that meet the criterion of having an entry time _1000 * random.random() + 61_ greater than the exit time of another car. 60 | 61 | The percentage of cars to check for possible re-entry is 10% by default. Note that this does not mean 10% of the cars will be re-entrant, but only that 10% will be checked to see if they _can_ be re-entrant. 62 | 63 | Carids monotonically increase, but not only in increments of 1. The actual carids present in a given expressway may be 102, 120, ..., 123, 124, 130, etc. 64 | 65 | From this pool of potential re-entrant carids, a table with the _enter-time_, _leave-time_, and _expressway_ of each carid is created. The carids in this new table are checked to see if another car exists with an _enter-time > 1000 * random.random() + 61 + leave-time_. If the number of simulated expressways is > 1, the carids must be from different expressways. We are simulating a car leaving one expressway and re-entering on a different expressway at a later point in time. If only one expressway is present, we are simply simulating a car re-entering at a later point in time. The _1000 * random.random() + 61_ from the original paper appears to be arbitrary. NOTE: Python's random.randint() is very slow and was the root of some Python performance issues. Even after switching from randint() to random.random(), the current Java code is at least 2x faster than the Python code. 66 | 67 | This process of making re-entrant carids was a bottleneck to creating a single file from any arbitrary number of cleaned files.
The original SQL version used the following query: 68 | 69 | _SELECT times.carid, times.entertime, times.leavetime, times_1.carid as carid1, times_1.entertime as entertime1, times_1.leavetime as leavetime1 70 | FROM carsandtimes as times, carsandtimes AS times_1 71 | WHERE times_1.entertime>times.leavetime+1000*random()+61 72 | LIMIT 1;_ 73 | 74 | If a match is found, the two carids that match are removed from the _carsandtimes_ table and entered into a new _carstoreplace_ table, which simply holds two carids per row. 75 | 76 | This query slows down tremendously as the "low-hanging fruit" is removed. For perspective: for the 50 expressway data set there are 7,851,650 unique carids with a max carid of 13,958,137. From this max carid we get 1,395,100 potential duplicate, or re-entrant, cars per our description above. Of those, 783,265 carids actually exist, and from these we find, or create, 204,095 re-entrant cars. 77 | 78 | The issue with the original SQL statement is the size of the self-joined table. If attempted with this 50 expressway set, the self-joined table would be up to 783,265 ^ 2, or 613,504,060,225, rows from which to try to find an entry matching the WHERE clause. Again, low-hanging fruit can be found relatively quickly--from negligible to under a second. But, as these easier finds are removed, each additional match can run for many seconds, to minutes and tens of minutes, as up to the max rows above are potentially scanned. Even if we limit the number of re-entrant cars to 200K, and even if the performance were steady at one per second, it would take over 55 hours to create the re-entrant cars. But, as the query does slow down, and creation of even 200K would likely run into days, weeks, or more, the above query was untenable. 79 | 80 | To mitigate the issue above, a separate script was created that creates the re-entrant cars.
The original revision was faster than the database option but still not fast, taking tens of hours. The newly written Java version performs this portion in seconds to minutes. 81 | 82 | Data preparation and generation have been completely re-written in Java and all the previous issues have been mitigated. A 250 expressway set can now be generated from clean files in under 24 hours without a database. File cleansing time has been halved to under three minutes per 1GB file from over six minutes per file. And, raw file generation no longer requires a database. 83 | 84 | The tolls are simply a random table using the max carid after all the files have been combined. So, if the max carid were 100 with two expressways, then the tolls table would hold carids 1 thru 100, with a row for each carid-day combination, where days run from 1 thru 69. Each historical toll row will have a random expressway of 0 or 1 (for the two expressway set) and a random toll value from 0 thru 99. So, the table size will be max carid * 69. For our 50 expressway set the number of rows is 963,111,453. Note that the random expressway will not match the expressway associated with the position report tuple and must be corrected. This is accounted for in the data preparation process. 85 | 86 | The original mitsim paper from 1996 can be found here: https://its.mit.edu/sites/default/files/documents/MITSIM2.PDF. 87 | -------------------------------------------------------------------------------- /VALIDATE.md: -------------------------------------------------------------------------------- 1 | # Validating Results 2 | 3 | ## Notes 4 | 5 | ## Generating the Validation File (or the expected output) 6 | The original validator was written in Python and was a collaborative effort based on some of the idiosyncrasies in the data as found by the various vendor-participants.
The original Python validator was all in-memory, using Python dictionaries, so the number of expressways that could be validated was limited by available memory. The original was also single-threaded. 7 | 8 | One technique to reduce the memory footprint for validation is to reduce the historical tolls files to only those records that actually match a query within the main data file. The newer Java version does this automatically, while also being multi-threaded. 9 | 10 | The Java version is found here: https://github.com/walmart/linearroad/tree/master/JavaValidator 11 | 12 | The current Java version uses Aerospike to hold the toll state for all cars. Initial testing showed that using Aerospike for this purpose allowed validation file creation for large sets in less time than even using Java's HashMap. 13 | 14 | The creation of expected output and the comparison to output created by any potential solution are two separate steps. 15 | 16 | To create the file of expected output: 17 | 18 | ```time java ValidateMTBQEven3AeroTolls ``` 19 | 20 | The name breaks down as: MT (Multi-Threaded), BQ (Blocking Queue), Even (wait till all threads have processed each second before proceeding to the next second), Aero (uses Aerospike), Tolls (cleans the toll file). 21 | 22 | Output will be a file named `out` in the current directory. 23 | 24 | To run a comparison of the expected output with the output of a streaming product run: 25 | 26 | ```java CompareFiles ``` 27 | 28 | The expected output is loaded into a Java Map and the product output is read line-by-line and checked against what is present in the Map. Product output that is not found in the expected output, product output values outside the expected ranges, or product output not matching the expected output are flagged. This stage of validation is also limited by available memory.
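The core comparison idea can be sketched as follows. This is a simplified Python sketch of the approach described above, not the actual CompareFiles.java; the function name is hypothetical, and the real Java version also validates value ranges rather than doing only set membership.

```python
# Simplified sketch of the CompareFiles idea: hold the expected output in
# memory and stream the product's output against it line by line, flagging
# any line that has no match in the expected set.
def compare_outputs(expected_path, actual_path):
    with open(expected_path) as f:
        expected = set(line.strip() for line in f if line.strip())
    flagged = []
    with open(actual_path) as f:
        for line in f:
            line = line.strip()
            if line and line not in expected:
                flagged.append(line)  # product output with no expected match
    return flagged
```

As with the Java version, memory bounds this step: the whole expected file must fit in the in-memory set while the product output streams past it.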
29 | 30 | Various solutions using different database backends to store state while generating the expected output were tried, but all were slower (some by orders of magnitude) than the Java + Aerospike combination. When time permits, further work may be done to increase the xway sizes that can be validated in a timely manner, "timely" being the key word. 31 | --------------------------------------------------------------------------------