├── .lein-failures ├── test ├── fixtures │ └── files │ │ ├── sample.utf-16be.txt │ │ ├── sample.utf-16le.txt │ │ ├── sample.utf-32be.txt │ │ ├── sample.utf-32le.txt │ │ ├── sample.in.txt │ │ └── sample.utf-8.txt ├── clj_etl_utils │ ├── landmark_parser_test.clj │ ├── text_test.clj │ ├── json_test.clj │ ├── sequences_test.clj │ ├── test_time.clj │ ├── test_text.clj │ ├── test_helper.clj │ ├── lang_test.clj │ ├── regex_test.clj │ └── io_test.clj └── heplers │ └── gen-test-files.rb ├── .gitignore ├── Bakefile ├── Changes ├── src └── clj_etl_utils │ ├── json │ └── schema.clj │ ├── repl.clj │ ├── collections.clj │ ├── crypto │ ├── aes.clj │ └── file.clj │ ├── math.clj │ ├── analysis.clj │ ├── scratch.clj │ ├── json.clj │ ├── linguistics.clj │ ├── regex.clj │ ├── cache_utils.clj │ ├── time.clj │ ├── sequences.clj │ ├── landmark_parser.clj │ ├── lang_utils.clj │ ├── text.clj │ ├── indexer.clj │ └── io.clj ├── .dir-locals.el ├── config └── log4j.properties ├── project.clj ├── java └── com │ └── rn │ └── codec │ └── Nysiis.java ├── README.md └── resources └── clj_etl_utils └── ref_data └── usps-abbreviations.tab /.lein-failures: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /test/fixtures/files/sample.utf-16be.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyleburton/clj-etl-utils/HEAD/test/fixtures/files/sample.utf-16be.txt -------------------------------------------------------------------------------- /test/fixtures/files/sample.utf-16le.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyleburton/clj-etl-utils/HEAD/test/fixtures/files/sample.utf-16le.txt -------------------------------------------------------------------------------- /test/fixtures/files/sample.utf-32be.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyleburton/clj-etl-utils/HEAD/test/fixtures/files/sample.utf-32be.txt -------------------------------------------------------------------------------- /test/fixtures/files/sample.utf-32le.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyleburton/clj-etl-utils/HEAD/test/fixtures/files/sample.utf-32le.txt -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml 2 | *jar 3 | lib 4 | classes 5 | .DS_Store 6 | out.txt 7 | *~ 8 | README.html 9 | autodoc/ 10 | .lein-deps-sum 11 | .lein-plugins 12 | target/ 13 | -------------------------------------------------------------------------------- /Bakefile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bake_task run-dev "Run a development Cider-NREPL" 4 | run-dev () { 5 | lein run -m clj-etl-utils.repl "$@" 6 | } 7 | 8 | bake_task lint "lint the source code" 9 | lint () { 10 | ag ' \(def ' | cat 11 | } 12 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | 1.0.91 ::! Thu Aug 21 08:16:17 EDT 2014 2 | * deprecate http lib, please consider clj-http or http-kit instead. 3 | 1.0.43 ::! Thu Apr 5 09:58:51 EDT 2012 4 | * text: added base64 encode / decode helpers 5 | * added crypto/aes.clj 6 | 7 | 1.0.17 :: Wed Nov 10 22:39:44 EST 2010 8 | * upgrade to commons-io 2.0 to use BoundedInputStream 9 | 10 | * deprecated all the BOM/Unicode functions, these are handled by the current 11 | version of commons-io 12 | -------------------------------------------------------------------------------- /src/clj_etl_utils/json/schema.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.json.schema 2 | (:use 3 | [clj-etl-utils.lang-utils :only [raise]])) 4 | 5 | (defn validate [schema json-object] 6 | {:ok true}) 7 | 8 | (defn validate! [schema json-object] 9 | (let [res (validate schema json-object)] 10 | (if-not (:ok res) 11 | (raise "Validation Errors: %s" res)) 12 | res)) 13 | 14 | (defn make-validator! [schema] 15 | (fn curryd-validator [json-object] 16 | (validate! schema json-object))) 17 | 18 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ;;; Directory Local Variables 2 | ;;; For more information see (info "(emacs) Directory Variables") 3 | 4 | ((clojure-mode 5 | (krb-clj-cider-connect-args :host "localhost" :port "4021") 6 | (krb-clj-cider-connect-fn . cider-connect) 7 | (ffip-project-root . "~/code/github.com/kyleburton/clj-etl-utils") 8 | 9 | (krb-ag-search-directory . "~/code/github.com/kyleburton/clj-etl-utils") 10 | (eval . (progn 11 | (require 'find-file-in-project) 12 | (add-to-list 'ffip-prune-patterns "*.clj-kondo"))))) 13 | -------------------------------------------------------------------------------- /src/clj_etl_utils/repl.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.repl 2 | (:require 3 | [nrepl.server :refer [start-server start-server]] 4 | [cider.nrepl :refer [cider-nrepl-handler]] 5 | [clojure.tools.logging :as log] 6 | [schema.core :as s])) 7 | 8 | (defonce nrepl-server (atom nil)) 9 | (defonce config (atom {::nrepl {::port 4021}})) 10 | 11 | (defn -main [& args] 12 | (reset! nrepl-server (start-server 13 | :port (-> @config ::nrepl ::port) 14 | :handler cider-nrepl-handler)) 15 | (log/infof "nrepl is running %s" @config) 16 | (s/set-fn-validation! true)) 17 | -------------------------------------------------------------------------------- /config/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=DEBUG, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.ConversionPattern=[%-5p] %c - %m%n 10 | 11 | log4j.appender.mail=org.apache.log4j.net.SMTPAppender 12 | log4j.appender.mail.To=kyle.burton@gmail.com,cashion@gmail.com 13 | log4j.appender.mail.From=router@algoconnectone.com 14 | log4j.appender.mail.SMTPHost=localhost 15 | log4j.appender.mail.Threshold=ERROR 16 | log4j.appender.mail.BufferSize=1 17 | log4j.appender.mail.Subject=Application Error Occurred 18 | # HTML Formatting seems to be preferred to the pattern layout for the emails... 19 | log4j.appender.mail.layout=org.apache.log4j.HTMLLayout 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/clj_etl_utils/landmark_parser_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.landmark-parser-test 2 | (:require [clj-etl-utils.landmark-parser :as lp]) 3 | (:use [clojure.test] 4 | [clj-etl-utils.test-helper])) 5 | 6 | 7 | (register-fixture :text-doc :simple-text "This is some text. There is some in the middle.\nThere is some towards the end, but it is not this.\nA few sentences in all.") 8 | 9 | (deftest test-extract 10 | (is (= "This is some text." 11 | (lp/extract 12 | (lp/make-parser (fixture :text-doc :simple-text)) 13 | [[:start nil]] 14 | [[:forward-past "."]]))) 15 | (is (= "There is some in the middle." 16 | (lp/extract 17 | (lp/make-parser (fixture :text-doc :simple-text)) 18 | [[:forward-past "."] 19 | [:forward-past " "]] 20 | [:forward-past "."]))) 21 | (is (= "A few sentences in all." 22 | (lp/extract 23 | (lp/make-parser (fixture :text-doc :simple-text)) 24 | [[:end nil] 25 | [:rewind-to "\n"]] 26 | [:end nil])))) 27 | 28 | ;; (test-extract) 29 | 30 | -------------------------------------------------------------------------------- /test/clj_etl_utils/text_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.text-test 2 | (:use 3 | [clojure.test] 4 | [clj-etl-utils.lang-utils :only [raise]]) 5 | (:require 6 | [clj-etl-utils.text :as text])) 7 | 8 | 9 | (deftest test-split-message 10 | (let [msg "This is a message that will get split."] 11 | 12 | ;; Sanity check 13 | (is (= 38 (count msg))) 14 | 15 | ;; 100-char blocks. No splitting should be done. 16 | (is (= '("This is a message that will get split.") 17 | (text/word-split msg 100))) 18 | 19 | ;; Msg is exactly 38 chars long. Test the boundary. 20 | (is (= '("This is a message that will get split.") 21 | (text/word-split msg 38))) 22 | 23 | 24 | ;; Split into two approximately 20-char blocks. 25 | ;; Should result in: 26 | ;; "This is a message" and "that will get split" 27 | (is (= '("This is a message" "that will get split.") 28 | (text/word-split msg 20))) 29 | 30 | ;; Split into approximately 10-char blocks. 31 | ;; Should result in: 32 | ;; "This is a", "message", "that will", "get split." 33 | (is (= '("This is a" "message" "that will" "get split.") 34 | (text/word-split msg 10))) 35 | 36 | )) 37 | 38 | 39 | 40 | 41 | (comment 42 | 43 | (run-tests) 44 | ) 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/clj_etl_utils/collections.clj: -------------------------------------------------------------------------------- 1 | (ns 2 | ^{:doc "Collections extensions." 3 | :author "Kyle Burton"} 4 | clj-etl-utils.collections) 5 | 6 | 7 | (defn 8 | ^{:doc "Given a map and a vector of key/default value pairs, will apply the default if the key is present but the value is nil in the original map. 9 | 10 | (fill-map-defaults {} []) 11 | => {} 12 | (fill-map-defaults {:foo 1} [[:created-at :foo]]) 13 | => {:foo 1} 14 | (fill-map-defaults {:foo 1 :created-at 3} [[:created-at :foo]]) 15 | => {:foo 1, :created-at 3} 16 | (fill-map-defaults {:foo 1 :created-at nil} [[:created-at :foo]]) 17 | => {:foo 1, :created-at :foo} 18 | 19 | " 20 | :added "1.0.0"} 21 | fill-map-defaults [m defaults] 22 | (loop [[[k v] & kvs] defaults 23 | m m] 24 | (if-not k 25 | m 26 | (recur kvs 27 | (if (contains? m k) 28 | (assoc m k (if (nil? (get m k)) 29 | v 30 | (get m k))) 31 | m))))) 32 | 33 | (comment 34 | 35 | ) 36 | 37 | 38 | (defn slice-map* [in-map & keys] 39 | (reduce 40 | (fn [m k] 41 | (if (contains? in-map k) 42 | (assoc m k (get in-map k)) 43 | m)) 44 | {} 45 | keys)) 46 | 47 | 48 | (defn slice-map [m keys] 49 | (apply slice-map* m keys)) 50 | 51 | -------------------------------------------------------------------------------- /src/clj_etl_utils/crypto/aes.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.crypto.aes 2 | (:import [javax.crypto KeyGenerator SecretKey Cipher] 3 | [javax.crypto.spec SecretKeySpec]) 4 | (:use 5 | [clj-etl-utils.text :only [decode-base64 encode-base64]] 6 | [clj-etl-utils.lang-utils :only [raise]])) 7 | 8 | 9 | 10 | (defn genkey [keygen] 11 | (do (.init keygen 128) 12 | (.getEncoded (.generateKey keygen) ))) 13 | 14 | (defn do-encrypt [rawkey plaintext] 15 | (let [cipher (Cipher/getInstance "AES")] 16 | (do (.init cipher Cipher/ENCRYPT_MODE (SecretKeySpec. rawkey "AES")) 17 | (.doFinal cipher (.getBytes plaintext))))) 18 | 19 | (defn do-decrypt [rawkey ciphertext] 20 | (let [cipher (Cipher/getInstance "AES")] 21 | (do (.init cipher Cipher/DECRYPT_MODE (SecretKeySpec. rawkey "AES")) 22 | (String. (.doFinal cipher ciphertext))))) 23 | 24 | (defn string->key [s] 25 | (if (< (count s) 16) 26 | (raise "Error: key must be >= 16 characters to be a valid AES key, you supplied s[%d]='%s'" (count s) s)) 27 | (.getBytes (.substring s 0 16) "UTF-8")) 28 | 29 | 30 | (comment 31 | "Example usage" 32 | 33 | ( def *key* (string->key "some key with some more padding")) 34 | 35 | (String. (encode-base64 (do-encrypt *key* "this is some plaintext"))) 36 | 37 | (do-decrypt *key* (do-encrypt *key* "this is some plaintext")) 38 | 39 | ) 40 | -------------------------------------------------------------------------------- /test/clj_etl_utils/json_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.json-test 2 | (:require 3 | clj-etl-utils.json 4 | [clojure.data.json :as json] 5 | [clj-time.core :as time] 6 | [clj-time.format :as tformat]) 7 | (:use clojure.test)) 8 | 9 | 10 | (deftest test-json-date-produces-iso8601-format 11 | (is 12 | (= (-> 13 | (java.util.Date. 1379264104884) 14 | json/json-str 15 | json/read-str) 16 | (-> 17 | "2013-09-15T16:55:04.884Z"))) 18 | (is 19 | (= (-> 20 | (java.sql.Timestamp. 1379264104884) 21 | json/json-str 22 | json/read-str) 23 | (-> 24 | "2013-09-15T16:55:04.884Z"))) 25 | (is 26 | (= (-> 27 | (org.joda.time.DateTime. 1379264104884) 28 | json/json-str 29 | json/read-str 30 | tformat/parse 31 | str) 32 | "2013-09-15T16:55:04.884Z")) 33 | (is 34 | (= (-> 35 | (java.time.LocalDateTime/ofInstant 36 | (java.time.Instant/ofEpochMilli 1379264104884) 37 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC"))) 38 | json/json-str 39 | json/read-str 40 | tformat/parse 41 | str) 42 | "2013-09-15T16:55:04.884Z")) 43 | (is 44 | (= (-> 45 | (java.time.ZonedDateTime/ofInstant 46 | (java.time.Instant/ofEpochMilli 1379264104884) 47 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC"))) 48 | json/json-str 49 | json/read-str 50 | tformat/parse 51 | str) 52 | "2013-09-15T16:55:04.884Z"))) 53 | -------------------------------------------------------------------------------- /test/clj_etl_utils/sequences_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.sequences-test 2 | (:require 3 | [clj-etl-utils.sequences :as sequences]) 4 | (:use 5 | [clojure.test])) 6 | 7 | 8 | (deftest test-make-stream-sampler 9 | (let [sampler (sequences/make-stream-sampler (fn [_] 0))] 10 | (is (= [:a] (sampler [:a :b :c :d] 1 1)))) 11 | (let [rand-int-fn (fn [_] 0) 12 | sampler (sequences/make-stream-sampler rand-int-fn) 13 | population [:a :b :c :d :e :f :g :h :i :j :k :l :m :n :o :p :q :r :s :t :u :v :w :x :y :z]] 14 | (is (= [:a :b :c :d :e]) (sampler population (count population) 5))) 15 | (let [data (atom 0) 16 | rand-int-fn (fn [_] (if (zero? @data) 17 | (do 18 | (reset! data 1) 19 | 0) 20 | (do 21 | (reset! data 0) 22 | 9999))) 23 | sampler (sequences/make-stream-sampler rand-int-fn) 24 | population [:a :b :c :d :e :f :g :h :i :j :k :l :m :n :o :p :q :r :s :t :u :v :w :x :y :z]] 25 | (is (= [:a :c :e :g :i]) (sampler population (count population) 5)))) 26 | 27 | 28 | (deftest test-make-reservoir-sampler 29 | (is (= [] ((sequences/make-reservoir-sampler 6) []))) 30 | (is (= [1] ((sequences/make-reservoir-sampler 6) [1]))) 31 | (is (= [0 1 2 3 4 5] ((sequences/make-reservoir-sampler 6) [0 1 2 3 4 5]))) 32 | (let [sampler (sequences/make-reservoir-sampler 2 (fn [_] 0))] 33 | (is (= [:d :b] (sampler [:a :b :c :d]))))) 34 | -------------------------------------------------------------------------------- /test/heplers/gen-test-files.rb: -------------------------------------------------------------------------------- 1 | proj_dir = File.join(File.dirname(__FILE__), '..', '..') 2 | files_dir = File.join(proj_dir,'test','fixtures','files') 3 | debug = false 4 | 5 | # 00 00 FE FF UTF-32, big-endian 6 | # FF FE 00 00 UTF-32, little-endian 7 | # FE FF UTF-16, big-endian 8 | # FF FE UTF-16, little-endian 9 | # EF BB BF UTF-8 10 | # 11 | 12 | BOMs = [ 13 | ['UTF-32BE', "\x00\x00\xFE\xFF"], 14 | ['UTF-32LE', "\xFF\xFE\x00\x00"], 15 | ['UTF-16BE', "\xFE\xFF"], 16 | ['UTF-16LE', "\xFF\xFE"], 17 | ['UTF-8', "\xEF\xBB\xBF"] 18 | ] 19 | 20 | sample_input = File.join(files_dir,'sample.in.txt') 21 | from_encoding = 'ASCII' 22 | BOMs.each do |pair| 23 | to_encoding, bom_bytes = pair 24 | puts "to_encoding=#{to_encoding} bom_bytes='#{bom_bytes}'" if debug 25 | output_file = sample_input.sub(/\.in\./, ".#{to_encoding.downcase}.") 26 | File.open(output_file,"w") do |f| 27 | f.write bom_bytes 28 | end 29 | 30 | cmd = "iconv -f #{from_encoding} -t #{to_encoding} #{sample_input} >> #{output_file}" 31 | puts cmd 32 | system cmd 33 | end 34 | 35 | system "file #{files_dir}/*.txt" 36 | 37 | # set -e 38 | # set -u 39 | # PROJ_DIR=$(dirname $0)/../.. 40 | # FILES_DIR=$PROJ_DIR/test/fixtures/files 41 | # 42 | # if ! which recode; then 43 | # echo "Error: you must install the recode utiltiy to generate the sample data files (aptitude, brew, yum, etc.)" 44 | # exit 1 45 | # fi 46 | # 47 | # cp $FILES_DIR/sample.in.txt $FILES_DIR/sample-ascii.txt 48 | # recode ascii...utf16 < $FILES_DIR/sample.in.txt > $FILES_DIR/sample-utf-16.txt 49 | # recode utf16...utf8 < $FILES_DIR/sample.in.txt > $FILES_DIR/sample-utf-8.txt 50 | -------------------------------------------------------------------------------- /test/clj_etl_utils/test_time.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.test-time 2 | (:use 3 | clojure.test) 4 | (:require 5 | [clj-etl-utils.time :as time])) 6 | 7 | 8 | (deftest date-seq 9 | (is (= 1 (count (time/date-seq 10 | (org.joda.time.DateTime. "2014-05-06T12:59:59Z") 11 | (org.joda.time.DateTime. "2014-05-06T12:59:59Z"))))) 12 | (is (= 3 (count (time/date-seq 13 | (org.joda.time.DateTime. "2014-05-06T12:59:59Z") 14 | (org.joda.time.DateTime. "2014-05-08T12:59:59Z")))))) 15 | 16 | (deftest test-mins-between-end-and-start 17 | (is (zero? (time/mins-between "13:00" "13:00"))) 18 | (is (= 1 (time/mins-between "13:00" "13:01"))) 19 | (is (= 361 (time/mins-between "13:00" "19:01"))) 20 | (is (= 360 (time/mins-between "23:00" "05:00"))) 21 | ) 22 | 23 | 24 | (deftest test-business-hours 25 | ;; 1 min before 9am 26 | (is (not (time/during-business-hours? 27 | (org.joda.time.DateTime. "2014-05-06T12:59:59Z") 28 | "09:00" "22:00" "EDT"))) 29 | ;; 9am 30 | (is (time/during-business-hours? 31 | (org.joda.time.DateTime. "2014-05-06T13:00:00Z") 32 | "09:00" "22:00" "EDT")) 33 | ;; 1 min before 10pm 34 | (is (time/during-business-hours? 35 | (org.joda.time.DateTime. "2014-05-06T01:59:59Z") 36 | "09:00" "22:00" "EDT")) 37 | ;; at 10pm 38 | (is (not 39 | (time/during-business-hours? 40 | (org.joda.time.DateTime. "2014-05-06T02:00:00Z") 41 | "09:00" "22:00" "EDT"))) 42 | (is (not (time/during-business-hours? 43 | (org.joda.time.DateTime. "2014-05-06T02:01:00Z") 44 | "09:00" "22:00" "EDT"))) 45 | (is (not (time/during-business-hours? 46 | (org.joda.time.DateTime. "2014-05-06T02:00:30Z") 47 | "09:00" "22:00" "EDT")))) 48 | -------------------------------------------------------------------------------- /test/clj_etl_utils/test_text.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.test-text 2 | (:use 3 | clojure.test) 4 | (:require 5 | [clj-etl-utils.text :as text])) 6 | 7 | 8 | (comment 9 | 10 | 11 | 12 | ) 13 | 14 | 15 | (deftest nil-and-emtpy-default 16 | (is (= nil (text/trim-and-truncate nil 100))) 17 | (is (= nil (text/trim-and-truncate "" 100))) 18 | (is (= "banana" (text/trim-and-truncate "" 100 "banana")))) 19 | 20 | (deftest blanks-are-trimmed 21 | (is (= nil (text/trim-and-truncate " " 100))) 22 | (is (= "First Name" (text/trim-and-truncate " First Name " 100)))) 23 | 24 | (deftest truncates-long-strings 25 | (is (= 26 | "First Name First Name First Name First Name First Name First Name First Name First Name Firs" 27 | (text/trim-and-truncate " First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name First Name " 100))) 28 | (is (= 29 | "First Name" 30 | (text/trim-and-truncate "First Name xxx I should be removed." 100)))) -------------------------------------------------------------------------------- /src/clj_etl_utils/math.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.math 2 | (:use 3 | [clj-etl-utils.lang-utils :only [raise]]) 4 | (:import [java.lang Math])) 5 | 6 | (defn log-b [b x] 7 | (/ (Math/log x) 8 | (Math/log b))) 9 | 10 | (defn log2 [x] 11 | (log-b 2 x)) 12 | 13 | (comment 14 | 15 | (log2 1) 16 | (log2 2) 17 | (log2 4) 18 | (log2 8) 19 | 20 | (log2 (Math/pow 10 10)) 21 | 22 | (log2 (Math/pow 2 32)) 23 | (log2 4200000000) 24 | 25 | ) 26 | 27 | (def base62-digits "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 28 | 29 | 30 | (defn base62-encode [n & [base]] 31 | (let [#^BigInteger base (BigInteger/valueOf (or base 62)) 32 | #^BigInteger n (if (isa? (class n) BigInteger) 33 | n 34 | (BigInteger/valueOf n))] 35 | (if (= -1 (.compareTo n BigInteger/ZERO)) 36 | (raise "Error: negative numbers are not supported, sorry: %s" n)) 37 | (loop [res (StringBuilder.) 38 | [number remainder] (.divideAndRemainder n base)] 39 | (if (= 1 (.compareTo number BigInteger/ZERO)) 40 | (recur 41 | (.insert res 0 (.charAt base62-digits (.intValue remainder))) 42 | (.divideAndRemainder number base)) 43 | (let [res (if (= 1 (.compareTo remainder BigInteger/ZERO)) 44 | (str (.insert res 0 (.charAt base62-digits (.intValue remainder)))) 45 | (str res))] 46 | (if (empty? res) 47 | "0" 48 | res)))))) 49 | 50 | 51 | (defn base62-decode [#^String s & [base]] 52 | (let [#^BigInteger base (BigInteger/valueOf (or base 62)) 53 | digits (count s)] 54 | (if (empty? s) 55 | BigInteger/ZERO 56 | (loop [res #^BigInteger BigInteger/ZERO 57 | idx 0] 58 | (if (>= idx digits) 59 | res 60 | (let [c (.charAt s (- digits idx 1)) 61 | digit (BigInteger/valueOf (.indexOf base62-digits (str c)))] 62 | (recur (.add res (.multiply digit (.pow base idx))) 63 | (inc idx)))))))) 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject com.github.kyleburton/clj-etl-utils "1.0.100" 2 | :description "ETL Utilities" 3 | :url "http://github.com/kyleburton/clj-etl-utils" 4 | :license {:name "Eclipse Public License - v 1.0" 5 | :url "http://www.eclipse.org/legal/epl-v10.html" 6 | :distribution :repo 7 | :comments "Same as Clojure"} 8 | :deploy-repositories [ 9 | ["releases" {:url "https://clojars.org/repo" :creds :gpg}] 10 | ["snapshots" {:url "https://clojars.org/repo" :creds :gpg}]] 11 | :java-source-path "java" 12 | :local-repo-classpath true 13 | :autodoc { 14 | :name "clj-etl-utils" 15 | :page-title "clj-etl-utils: API Documentation" 16 | :description "ETL Utilites for Clojure" 17 | :web-home "http://kyleburton.github.com/projects/clj-etl-utils/" 18 | } 19 | 20 | :global-vars {*warn-on-reflection* true} 21 | :profiles {:dev {:resource-paths ["dev-resources"] 22 | :dependencies [ 23 | [org.clojure/clojure "1.12.0"] 24 | [prismatic/schema "1.1.3"] 25 | [nrepl/nrepl "1.0.0"] 26 | [cider/cider-nrepl "0.52.1"] 27 | ]}} 28 | :dependencies [[commons-io/commons-io "2.5"] 29 | [org.clojure/tools.logging "1.3.0"] 30 | [ch.qos.logback/logback-classic "1.2.11"] 31 | [org.mindrot/jbcrypt "0.4"] 32 | [commons-codec/commons-codec "1.10"] 33 | [commons-lang/commons-lang "2.6"] 34 | [org.clojure/data.csv "0.1.4"] 35 | [org.clojure/data.json "2.5.1"] 36 | [clj-time "0.14.0"]]) 37 | -------------------------------------------------------------------------------- /test/clj_etl_utils/test_helper.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.test-helper 2 | (:use [clojure.tools.logging :as log])) 3 | 4 | (defonce fixture-registry (atom {})) 5 | 6 | (defn mm-get [m k1 k2] 7 | (and (contains? m k1) 8 | ((m k1) k2))) 9 | 10 | (defn mm-put [m k1 k2 v] 11 | (assoc m k1 12 | (assoc (m k1 {}) 13 | k2 v))) 14 | 15 | (defn mm-contains? [m k1 k2] 16 | (and (contains? m k1) 17 | (contains? (m k1) k2))) 18 | 19 | (defn clear-fixture-registry [] 20 | (reset! fixture-registry (atom {}))) 21 | 22 | (defn register-fixture [type k v] 23 | (reset! fixture-registry 24 | (mm-put @fixture-registry type k v))) 25 | 26 | ;; is this strategy good enough? 27 | (defn project-root [] 28 | (if-let [location (.getLocation ^java.security.CodeSource (.getCodeSource ^java.security.ProtectionDomain (.getProtectionDomain (.getClass ^Object project-root))))] 29 | (do 30 | (log/debug (format "project-root: from class location: %s" location)) 31 | location) 32 | (do 33 | (log/debug (format "project-root: falling back to user.dir")) 34 | (System/getProperty "user.dir")))) 35 | 36 | (defn fixture-file ^String [^String file] 37 | (format "%s/test/fixtures/files/%s" (project-root) file)) 38 | 39 | (defn fixture-file-contents [^String file] 40 | (let [fixture-path (fixture-file file)] 41 | (if (not (.exists (java.io.File. ^String fixture-path))) 42 | (throw (format "Error: no such fixture file '%s' => '%s'" file fixture-path))) 43 | (slurp fixture-path))) 44 | 45 | ;; load/access a fixture 46 | (defn fixture [type key] 47 | (cond (= :file type) (fixture-file-contents key) 48 | (mm-contains? @fixture-registry type key) (mm-get @fixture-registry type key) 49 | :else (throw (RuntimeException. (format "Error: unknown fixture type: %s / %s registry(%s)" type key (keys @fixture-registry)))))) 50 | 51 | 52 | 53 | (comment 54 | 55 | (fixture-file-contents "sample.in.txt") 56 | 57 | (let [props (System/getProperties) 58 | keys (.stringPropertyNames props)] 59 | (doseq [key keys] 60 | (printf "%s: %s\n" key (.getProperty props key)))) 61 | 62 | ;; getClass().getProtectionDomain().getCodeSource().getLocation() 63 | 64 | 65 | (foo) 66 | 67 | 68 | ) 69 | -------------------------------------------------------------------------------- /test/clj_etl_utils/lang_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.lang-test 2 | (:require [clj-etl-utils.lang-utils :as lang-utils]) 3 | (:use [clojure.test] 4 | [clj-etl-utils.lang-utils :only [nth-let]])) 5 | 6 | (deftest test-make-periodic-invoker 7 | (let [stat (atom []) 8 | trigger (lang-utils/make-periodic-invoker 10 9 | (fn [action count val] 10 | (swap! stat conj val)))] 11 | (dotimes [ii 100] 12 | (trigger ii)) 13 | (is (= 10 (count @stat))))) 14 | 15 | ;; (test-make-periodic-invoker) 16 | 17 | (comment 18 | (let [stat (atom []) 19 | trigger (lang-utils/make-periodic-invoker 10 20 | (fn [count val] 21 | (swap! stat conj [count val])))] 22 | (dotimes [ii 100] 23 | (trigger :hit ii)) 24 | @stat) 25 | 26 | 27 | ( def *timer* (lang-utils/make-periodic-invoker 28 | 10 29 | (fn [val & args] 30 | (printf "triggered: val=%s args=%s\n" val args)))) 31 | 32 | (dotimes [ii 10] (*timer*)) 33 | (*timer* :final) 34 | (*timer* :invoke 1 2 3) 35 | (*timer* :state) 36 | (*timer* :set 19) 37 | (*timer* :reset) 38 | 39 | (macroexpand-1 40 | '(lang-utils/with-hit-timer [timer 10] 41 | (dotimes [ii 100] 42 | (timer)))) 43 | 44 | 45 | (lang-utils/with-hit-timer [timer 10] 46 | (dotimes [ii 109] 47 | (timer))) 48 | 49 | 50 | (let [total 1000 51 | period 100 52 | progress (lang-utils/make-periodic-invoker 53 | period 54 | (fn [val & [is-done]] 55 | (if (= is-done :done) 56 | (printf "All Done! %d\n" val) 57 | (printf "So far we did %d, we are %3.2f%% complete.\n" val (* 100.0 (/ val 1.0 total))))))] 58 | (dotimes [ii total] 59 | (progress)) 60 | (progress :final :done)) 61 | 62 | ) 63 | 64 | (deftest test-nth-let 65 | (let [rec (vec (.split "abcdefghijklmnopqrstuvwxyz" ""))] 66 | (nth-let [rec 67 | lstart 0 68 | lmiddle 12 69 | llast 25] 70 | (is (= "a" lstart)) 71 | (is (= "m" lmiddle)) 72 | (is (= "z" llast))))) 73 | -------------------------------------------------------------------------------- /src/clj_etl_utils/analysis.clj: -------------------------------------------------------------------------------- 1 | (ns ^{:doc "Data and Text file analysis functions. These functions 2 | work with delimited and fixed width files, such as database dumps 3 | log data and other exports." 4 | :author "Kyle Burton"} 5 | clj-etl-utils.analysis) 6 | 7 | (defn ^{:doc "Given a sequence of records (a sequence of vectors) 8 | this function will track the maximum length string seen in each column 9 | of the records of the sequence." 10 | :added "1.0.0"} 11 | max-col-lens [rec-seq] 12 | (letfn [(track-counts 13 | [m rec] 14 | (reduce (fn [m idx] 15 | (assoc m idx 16 | (max (count (rec idx)) 17 | (get m idx -1)))) 18 | m 19 | (range 0 (count rec))))] 20 | (reduce track-counts {} rec-seq))) 21 | 22 | (comment 23 | 24 | ( def *example-data* "field1\tfield2\tfield3 25 | This\ttaht\tother 26 | \t\t 27 | some more stuff\tand yet more\tfinal field 28 | the quick brown\t fox jumped over\t the lazy\t toad 29 | \tguns\t germs\t steel") 30 | 31 | ( def *example-recs* (map (fn [l] (vec (.split l "\t"))) (.split *example-data* "\\n"))) 32 | 33 | (max-col-lens *example-recs*) 34 | 35 | ) 36 | 37 | (defn 38 | ^{:doc "Given a counts map (see: max-col-lens), and a vector of the header 39 | names, this function replaces the numeric column indicies in the map returned by max-col-lens into their names based on the fields vector." 40 | :added "1.0.0"} 41 | translate-to-header-names-vec [counts-map fields] 42 | (map (fn [idx] 43 | [(fields idx) 44 | (counts-map idx)]) 45 | (range 0 (count fields)))) 46 | 47 | (comment 48 | 49 | (translate-to-header-names-vec (max-col-lens (drop 1 *example-recs*)) 50 | (first *example-recs*)) 51 | 52 | ) 53 | 54 | (defn 55 | ^{:doc "Takes a sequences, returns a seq of column info, one per column. 56 | The column info will contain the following: 57 | 58 | {:name field-name 59 | :type field-type 60 | :length max-size} 61 | 62 | Where the field-name is defined by the first record in the stream - by 63 | assuming this first record is the column names (a header row). 64 | 65 | field-type is the type detectec by the analyzer (currently hard-coded 66 | to 'character varying'). 67 | 68 | max-size is the maximum detected width of data values from the 69 | remainder of the stream (not the header)." 70 | :added "1.0.0"} 71 | analyze-column-data [[hdr & recs]] 72 | (for [[field-name max-size] 73 | (translate-to-header-names-vec 74 | (max-col-lens recs) 75 | hdr)] 76 | {:name field-name 77 | :type "character varying" 78 | :length max-size})) 79 | 80 | (comment 81 | 82 | 83 | (analyze-column-data *example-recs*) 84 | 85 | ) 86 | -------------------------------------------------------------------------------- /src/clj_etl_utils/scratch.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.scratch 2 | (:require 3 | [clj-etl-utils.indexer :as indexer] 4 | [clojure.data.csv :as csv] 5 | [clojure.java.sh :as sh] 6 | [clojure.string :as string] 7 | [clj-etl-utils.indexer :as indexer] 8 | [clj-etl-utils.lang-utils :refer [raise]]) 9 | (:import 10 | [java.io RandomAccessFile FileInputStream InputStreamReader BufferedReader] 11 | [org.apache.commons.io.input BoundedInputStream])) 12 | 13 | (comment 14 | 15 | (process-candidte-clusters 16 | (vals sources) 17 | :city 18 | (fn [term cluster] 19 | (doseq [src-results cluster] 20 | (println (format "%s: %s : %d %s" term (:source src-results) (count (:recs src-results)) (:recs src-results)))))) 21 | (make-candidate-keyfile 22 | (vals sources) 23 | :city 24 | 25 | "candidates.tab") 26 | 27 | ( def sources 28 | {:boutell 29 | {:name :boutell 30 | :config {:file "tmp/zipcode.csv" 31 | :has-header true 32 | :indexes {:city {:name :city 33 | :fn (fn [l] 34 | (let [rec (csv-parse l)] 35 | ;; 4th col is City 36 | (when-not (empty? rec) 37 | [(.toLowerCase (nth rec 1))])))} 38 | :state {:name :state 39 | :fn (fn [l] 40 | (let [rec (csv-parse l)] 41 | ;; 4th col is City 42 | (when-not (empty? rec) 43 | [(.toLowerCase (nth rec 2))])))}}}} 44 | :free-zipcode-database 45 | {:name :free-zipcode-database 46 | :config {:file "tmp/free-zipcode-database.csv" 47 | :has-header true 48 | :indexes {:city {:name :city 49 | :fn (fn [l] 50 | (let [rec (csv-parse l)] 51 | [(.toLowerCase (nth rec 3))]))} 52 | :state 53 | {:name :state 54 | :fn (fn [l] 55 | (let [rec (csv-parse l)] 56 | [(.toLowerCase (nth rec 4))]))}}}}}) 57 | 58 | (time 59 | (do 60 | (ensure-indexes (sources :boutell)) 61 | (ensure-indexes (sources :free-zipcode-database)))) 62 | 63 | 64 | (vec 65 | (index-search-file 66 | "tmp/free-zipcode-database.csv" 67 | "tmp/.free-zipcode-database.csv.city-idx" 68 | "philade" 69 | (fn [a b] 70 | (.startsWith 71 | (.toLowerCase a) 72 | (.toLowerCase b))))) 73 | 74 | 75 | (vec 76 | (index-search-file 77 | "tmp/zipcode.csv" 78 | "tmp/.zipcode.csv.city-idx" 79 | "philade" 80 | (fn [a b] 81 | (.startsWith 82 | (.toLowerCase a) 83 | (.toLowerCase b))))) 84 | 85 | ) 86 | -------------------------------------------------------------------------------- /src/clj_etl_utils/crypto/file.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.crypto.file 2 | (:import [javax.crypto KeyGenerator SecretKey SecretKeyFactory Cipher CipherOutputStream CipherInputStream] 3 | [javax.crypto.spec SecretKeySpec PBEKeySpec IvParameterSpec] 4 | [java.io File FileOutputStream DataInputStream FileInputStream] 5 | [org.apache.commons.codec.binary Base64] 6 | [org.apache.commons.io IOUtils]) 7 | (:use 8 | [clj-etl-utils.lang-utils :only [raise aprog1]])) 9 | 10 | (defn rand-salt [size] 11 | (aprog1 12 | (byte-array size) 13 | (.nextBytes (java.security.SecureRandom.) it))) 14 | 15 | ;; Defaults 16 | (def pbe-iteration-count 65536) 17 | (def pbe-key-length 256) 18 | (def cipher-algorithm "AES/CBC/PKCS5Padding") 19 | (def key-factory-algorithm "PBKDF2WithHmacSHA1") 20 | (def key-encoding "AES") 21 | 22 | (defn make-secret-key [password] 23 | (let [key-factory (SecretKeyFactory/getInstance key-factory-algorithm) 24 | key-spec (PBEKeySpec. (.toCharArray password) 25 | (rand-salt 20) 26 | pbe-iteration-count 27 | pbe-key-length)] 28 | (SecretKeySpec. (.getEncoded (.generateSecret key-factory key-spec)) key-encoding))) 29 | 30 | (defn get-init-vec-from-cipher [cipher] 31 | (-> cipher 32 | (.getParameters) 33 | (.getParameterSpec IvParameterSpec) 34 | (.getIV))) 35 | 36 | (defn make-cipher 37 | ([mode secret-key] 38 | (doto (Cipher/getInstance cipher-algorithm) 39 | (.init mode secret-key))) 40 | ([mode secret-key init-vec] 41 | (doto (Cipher/getInstance cipher-algorithm) 42 | (.init mode secret-key init-vec)))) 43 | 44 | ;; Adapted from: http://stackoverflow.com/questions/992019/java-256-bit-aes-password-based-encryption 45 | (defn file-encrypt [infile outfile password] 46 | (let [secret-key (make-secret-key password) 47 | cipher (make-cipher Cipher/ENCRYPT_MODE secret-key) 48 | init-vec (get-init-vec-from-cipher cipher)] 49 | (with-open [istream (java.io.FileInputStream. infile) 50 | ostream (CipherOutputStream. (java.io.FileOutputStream. outfile) cipher)] 51 | (IOUtils/copy istream ostream)) 52 | {:skey (Base64/encodeBase64String (.getEncoded secret-key)) 53 | :ivec (Base64/encodeBase64String init-vec)})) 54 | 55 | 56 | ;; Adapted from: http://stackoverflow.com/questions/992019/java-256-bit-aes-password-based-encryption 57 | (defn file-decrypt [infile outfile secret-key init-vec] 58 | (let [skey (if (Base64/isBase64 secret-key) 59 | (Base64/decodeBase64 secret-key) 60 | secret-key) 61 | ivec (if (Base64/isBase64 init-vec) 62 | (Base64/decodeBase64 init-vec) 63 | init-vec) 64 | cipher (make-cipher Cipher/DECRYPT_MODE (SecretKeySpec. skey key-encoding) (IvParameterSpec. ivec))] 65 | (with-open [istream (CipherInputStream. (FileInputStream. infile) cipher) 66 | ostream (java.io.FileOutputStream. outfile)] 67 | (IOUtils/copy istream ostream)))) 68 | 69 | 70 | (comment 71 | ( def crypt-info (file-encrypt "/home/superg/foo.txt" "/home/superg/foo.enc")) 72 | 73 | (file-decrypt "/home/superg/foo.enc" "/home/superg/foo.dec" (:skey crypt-info) (:ivec crypt-info)) 74 | ) 75 | -------------------------------------------------------------------------------- /src/clj_etl_utils/json.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.json 2 | (:import 3 | [java.sql Timestamp] 4 | [java.io Writer] 5 | [org.joda.time DateTime] 6 | [org.joda.time.format ISODateTimeFormat]) 7 | (:require 8 | [clojure.data.json :as json])) 9 | 10 | (defn -write-quoted-string [^String s #^Writer out options] 11 | (.write out "\"") 12 | (.write out s) 13 | (.write out "\"")) 14 | 15 | (defn -write-as-string [obj #^Writer out options] 16 | (-write-quoted-string (str obj) out options)) 17 | 18 | (def ^java.text.SimpleDateFormat java-util-time-iso8601-formatter 19 | (let [tz (java.util.TimeZone/getTimeZone "UTC") 20 | df (java.text.SimpleDateFormat. "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")] 21 | (.setTimeZone df tz) 22 | df)) 23 | 24 | (defn -write-java-sql-timestamp [^java.sql.Timestamp x #^Writer out options] 25 | (-write-as-string (.format java-util-time-iso8601-formatter x) out options)) 26 | (extend java.sql.Timestamp json/JSONWriter 27 | {:-write -write-java-sql-timestamp}) 28 | 29 | (extend org.joda.time.DateTime json/JSONWriter 30 | {:-write -write-as-string}) 31 | 32 | (defn -write-java-util-date [^java.util.Date x #^Writer out options] 33 | (-write-as-string (.format java-util-time-iso8601-formatter x) out options)) 34 | (extend java.util.Date json/JSONWriter 35 | {:-write -write-java-util-date}) 36 | 37 | (extend clojure.lang.Fn json/JSONWriter 38 | {:-write -write-as-string}) 39 | 40 | (defn java-time-local-date-time-to-iso8601-str [^java.time.LocalDateTime dtime] 41 | (str (.format dtime java.time.format.DateTimeFormatter/ISO_DATE_TIME) "Z")) 42 | 43 | (comment 44 | 45 | (java-time-local-date-time-to-iso8601-str 46 | (java.time.LocalDateTime/ofInstant 47 | (java.time.Instant/ofEpochMilli 1379264104884) 48 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC")))) 49 | 50 | ) 51 | 52 | (defn -write-java-time-local-date-time [^java.time.LocalDateTime x #^Writer out options] 53 | (-write-quoted-string 54 | (java-time-local-date-time-to-iso8601-str x) 55 | out 56 | options)) 57 | (extend 58 | java.time.LocalDateTime json/JSONWriter 59 | {:-write -write-java-time-local-date-time}) 60 | 61 | (defn java-time-zoned-date-time-to-iso8601-str [^java.time.ZonedDateTime dtime] 62 | (.format (.toOffsetDateTime dtime) java.time.format.DateTimeFormatter/ISO_ZONED_DATE_TIME)) 63 | 64 | (comment 65 | (java-time-zoned-date-time-to-iso8601-str 66 | (java.time.ZonedDateTime/ofInstant 67 | (java.time.Instant/ofEpochMilli 1379264104884) 68 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC")))) 69 | ) 70 | 71 | (defn -write-java-time-zoned-date-time [^java.time.ZonedDateTime x #^Writer out options] 72 | (-write-quoted-string 73 | (java-time-zoned-date-time-to-iso8601-str x) 74 | out 75 | options)) 76 | (extend java.time.ZonedDateTime json/JSONWriter 77 | {:-write -write-java-time-zoned-date-time}) 78 | 79 | (comment 80 | (org.joda.time.DateTime. (java.util.Date.)) 81 | (clojure.data.json/json-str (java.util.Date.)) 82 | (clojure.data.json/json-str (java.sql.Timestamp. (.getTime (java.util.Date.)))) 83 | java.time.DateTime 84 | 85 | (clojure.data.json/json-str 86 | (java.time.LocalDateTime/ofInstant 87 | (java.time.Instant/ofEpochMilli 1379264104884) 88 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC")))) 89 | 90 | (clojure.data.json/json-str 91 | (java.time.ZonedDateTime/ofInstant 92 | (java.time.Instant/ofEpochMilli 1379264104884) 93 | (.toZoneId (java.util.TimeZone/getTimeZone "UTC")))) 94 | 95 | (.print 96 | (ISODateTimeFormat/dateTime) 97 | (org.joda.time.DateTime. "2013-09-12")) 98 | ) 99 | -------------------------------------------------------------------------------- /test/fixtures/files/sample.in.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis quis nibh in urna 2 | viverra semper in id elit. Etiam massa orci, vestibulum id porttitor in, 3 | convallis quis ligula. Praesent egestas consectetur tincidunt. Nam vehicula 4 | consequat congue. Ut sed nibh risus. Integer pellentesque, libero eget dictum 5 | vestibulum, ligula sapien fermentum neque, non dictum ipsum felis sit amet 6 | lorem. Integer gravida varius lorem, et euismod justo egestas a. Mauris 7 | vehicula magna sit amet risus consectetur nec interdum libero interdum. 8 | Pellentesque nec elit a tortor lobortis venenatis. Aliquam semper sapien nec 9 | neque vehicula viverra. Aenean at nunc a quam venenatis euismod quis nec ipsum. 10 | Duis ornare magna vitae ipsum eleifend ullamcorper. Cras vel ipsum in erat 11 | scelerisque cursus. In a ante ac arcu ornare tincidunt mattis a metus. Duis 12 | scelerisque lorem id enim dapibus bibendum. 13 | 14 | Morbi cursus, nisi quis tincidunt lobortis, massa sem sagittis eros, ac pretium 15 | arcu neque at nisl. Sed ligula erat, pulvinar in commodo in, lobortis ut est. 16 | Sed molestie fermentum mauris sed tristique. Cum sociis natoque penatibus et 17 | magnis dis parturient montes, nascetur ridiculus mus. Duis porttitor, sem a 18 | hendrerit ultricies, sem odio ultrices elit, vel luctus felis ante quis felis. 19 | In felis felis, viverra et ullamcorper non, imperdiet vel purus. Sed in dapibus 20 | urna. Vivamus molestie pretium mauris non semper. Donec convallis tortor vitae 21 | diam condimentum quis condimentum odio sagittis. Nulla dapibus massa eros, 22 | vitae aliquam odio. Etiam eu tellus sit amet ipsum posuere suscipit in et dui. 23 | 24 | Phasellus aliquet, mi sollicitudin egestas vestibulum, nisl dolor condimentum 25 | leo, quis ornare ante nunc a enim. Vestibulum at neque nibh, in placerat est. 26 | Duis gravida quam id magna accumsan viverra. Maecenas sed orci nunc, non 27 | iaculis massa. Nulla id augue mi, mattis tristique nisi. Nullam iaculis tortor 28 | sed leo mollis vel ullamcorper metus ultricies. Suspendisse potenti. In feugiat 29 | nibh magna. Suspendisse porta ante eu justo pulvinar adipiscing in ut arcu. 30 | Class aptent taciti sociosqu ad litora torquent per conubia nostra, per 31 | inceptos himenaeos. Curabitur vel nulla dolor. Praesent viverra, nisi vel 32 | fermentum sagittis, velit nisl bibendum dui, vitae gravida quam neque id nisi. 33 | Aliquam non nisl eget dolor auctor porttitor eget volutpat quam. Sed molestie 34 | eleifend nunc ac faucibus. 35 | 36 | Morbi lorem orci, pharetra ac ultrices sit amet, tempor a metus. Nullam nec mi 37 | ac ligula suscipit rhoncus non a ante. Pellentesque nec lacinia libero. Sed 38 | dictum dui a est condimentum et aliquet lectus commodo. Fusce id erat lacus. 39 | Aliquam nec lacinia est. Vestibulum cursus urna eget lacus vehicula porta. 40 | Class aptent taciti sociosqu ad litora torquent per conubia nostra, per 41 | inceptos himenaeos. Sed pellentesque, est eu ornare dapibus, est orci aliquet 42 | justo, nec blandit nibh mi nec lectus. Aenean augue nunc, laoreet at malesuada 43 | et, hendrerit eget ligula. 44 | 45 | Morbi congue placerat tortor, id dapibus tellus ultrices ac. Ut adipiscing 46 | metus eget arcu faucibus iaculis. Vestibulum sagittis dui a ante convallis 47 | posuere. Nam libero justo, dictum nec blandit sed, consectetur id mauris. 48 | Phasellus dapibus consectetur elementum. Ut bibendum lacus eros. Morbi quis 49 | tortor odio. Sed non lorem at sem mattis mollis. Cum sociis natoque penatibus 50 | et magnis dis parturient montes, nascetur ridiculus mus. Nullam posuere ipsum 51 | vel elit tincidunt ullamcorper. Duis nulla libero, porttitor a ultrices ac, 52 | semper at lectus. Cum sociis natoque penatibus et magnis dis parturient montes, 53 | nascetur ridiculus mus. Morbi vitae sapien mauris. Integer porta interdum 54 | iaculis. Praesent fringilla leo et turpis accumsan sodales. Nam facilisis 55 | bibendum ipsum, sed blandit est consectetur feugiat. 56 | -------------------------------------------------------------------------------- /test/fixtures/files/sample.utf-8.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis quis nibh in urna 2 | viverra semper in id elit. Etiam massa orci, vestibulum id porttitor in, 3 | convallis quis ligula. Praesent egestas consectetur tincidunt. Nam vehicula 4 | consequat congue. Ut sed nibh risus. Integer pellentesque, libero eget dictum 5 | vestibulum, ligula sapien fermentum neque, non dictum ipsum felis sit amet 6 | lorem. Integer gravida varius lorem, et euismod justo egestas a. Mauris 7 | vehicula magna sit amet risus consectetur nec interdum libero interdum. 8 | Pellentesque nec elit a tortor lobortis venenatis. Aliquam semper sapien nec 9 | neque vehicula viverra. Aenean at nunc a quam venenatis euismod quis nec ipsum. 10 | Duis ornare magna vitae ipsum eleifend ullamcorper. Cras vel ipsum in erat 11 | scelerisque cursus. In a ante ac arcu ornare tincidunt mattis a metus. Duis 12 | scelerisque lorem id enim dapibus bibendum. 13 | 14 | Morbi cursus, nisi quis tincidunt lobortis, massa sem sagittis eros, ac pretium 15 | arcu neque at nisl. Sed ligula erat, pulvinar in commodo in, lobortis ut est. 16 | Sed molestie fermentum mauris sed tristique. Cum sociis natoque penatibus et 17 | magnis dis parturient montes, nascetur ridiculus mus. Duis porttitor, sem a 18 | hendrerit ultricies, sem odio ultrices elit, vel luctus felis ante quis felis. 19 | In felis felis, viverra et ullamcorper non, imperdiet vel purus. Sed in dapibus 20 | urna. Vivamus molestie pretium mauris non semper. Donec convallis tortor vitae 21 | diam condimentum quis condimentum odio sagittis. Nulla dapibus massa eros, 22 | vitae aliquam odio. Etiam eu tellus sit amet ipsum posuere suscipit in et dui. 23 | 24 | Phasellus aliquet, mi sollicitudin egestas vestibulum, nisl dolor condimentum 25 | leo, quis ornare ante nunc a enim. Vestibulum at neque nibh, in placerat est. 26 | Duis gravida quam id magna accumsan viverra. Maecenas sed orci nunc, non 27 | iaculis massa. Nulla id augue mi, mattis tristique nisi. Nullam iaculis tortor 28 | sed leo mollis vel ullamcorper metus ultricies. Suspendisse potenti. In feugiat 29 | nibh magna. Suspendisse porta ante eu justo pulvinar adipiscing in ut arcu. 30 | Class aptent taciti sociosqu ad litora torquent per conubia nostra, per 31 | inceptos himenaeos. Curabitur vel nulla dolor. Praesent viverra, nisi vel 32 | fermentum sagittis, velit nisl bibendum dui, vitae gravida quam neque id nisi. 33 | Aliquam non nisl eget dolor auctor porttitor eget volutpat quam. Sed molestie 34 | eleifend nunc ac faucibus. 35 | 36 | Morbi lorem orci, pharetra ac ultrices sit amet, tempor a metus. Nullam nec mi 37 | ac ligula suscipit rhoncus non a ante. Pellentesque nec lacinia libero. Sed 38 | dictum dui a est condimentum et aliquet lectus commodo. Fusce id erat lacus. 39 | Aliquam nec lacinia est. Vestibulum cursus urna eget lacus vehicula porta. 40 | Class aptent taciti sociosqu ad litora torquent per conubia nostra, per 41 | inceptos himenaeos. Sed pellentesque, est eu ornare dapibus, est orci aliquet 42 | justo, nec blandit nibh mi nec lectus. Aenean augue nunc, laoreet at malesuada 43 | et, hendrerit eget ligula. 44 | 45 | Morbi congue placerat tortor, id dapibus tellus ultrices ac. Ut adipiscing 46 | metus eget arcu faucibus iaculis. Vestibulum sagittis dui a ante convallis 47 | posuere. Nam libero justo, dictum nec blandit sed, consectetur id mauris. 48 | Phasellus dapibus consectetur elementum. Ut bibendum lacus eros. Morbi quis 49 | tortor odio. Sed non lorem at sem mattis mollis. Cum sociis natoque penatibus 50 | et magnis dis parturient montes, nascetur ridiculus mus. Nullam posuere ipsum 51 | vel elit tincidunt ullamcorper. Duis nulla libero, porttitor a ultrices ac, 52 | semper at lectus. Cum sociis natoque penatibus et magnis dis parturient montes, 53 | nascetur ridiculus mus. Morbi vitae sapien mauris. Integer porta interdum 54 | iaculis. Praesent fringilla leo et turpis accumsan sodales. Nam facilisis 55 | bibendum ipsum, sed blandit est consectetur feugiat. 56 | -------------------------------------------------------------------------------- /test/clj_etl_utils/regex_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.regex-test 2 | (:require [clj-etl-utils.regex :as rx]) 3 | (:use [clojure.test] 4 | [clj-etl-utils.test-helper])) 5 | 6 | 7 | (defn us-state-match-found [data] 8 | (let [regex ^java.util.regex.Pattern (rx/std-regex :geographic :usa :state)] 9 | (.matches (.matcher regex data)))) 10 | 11 | 12 | (deftest test-us-state-regex 13 | (is (not (us-state-match-found "xx"))) 14 | (is (us-state-match-found "NJ")) 15 | (is (us-state-match-found "nJ")) 16 | (is (not (us-state-match-found ""))) 17 | (is (us-state-match-found "pa")) 18 | (is (us-state-match-found "NJ")) 19 | (is (us-state-match-found "AE")) 20 | (is (not (us-state-match-found "not")))) 21 | 22 | ;; (test-us-state-regex) 23 | 24 | (def sample-postal-codes 25 | {:can 26 | ["R2K 3X4" 27 | "R6W 4B7" 28 | "L1T 2W2" 29 | "M2R 1L7" 30 | "R5G 0M3" 31 | "R2V4J1" 32 | "H9S2R9" 33 | "H1T 2R6" 34 | "H1T 2R6" 35 | "N0G 2H0" 36 | "H1P 1K5" 37 | "L1V 6L4" 38 | "T2H 1K2" 39 | "L8H 3B2" 40 | "H1T 2R6" 41 | "L8E0C7" 42 | "J7R 6W2" 43 | "J3H 0A9" 44 | "R0G 2B0" 45 | "L4Z 2X2" 46 | "N7T 8B2" 47 | "V2X 2R4" 48 | "T6K 3P8" 49 | "V0G 1M0" 50 | "N8H 3P2" 51 | "G6E 1A2" 52 | "J3X 2B8"] 53 | :usa 54 | ["93306" 55 | "94568" 56 | "26547" 57 | "12440" 58 | "43333" 59 | "16692" 60 | "98392" 61 | "26038" 62 | "90815" 63 | "67484" 64 | "63366" 65 | "60946" 66 | "61048" 67 | "24950" 68 | "07036" 69 | "65764" 70 | "12538" 71 | "30296" 72 | "41601" 73 | "75228" 74 | "26267" 75 | "84320" 76 | "61743" 77 | "37402" 78 | "94523" 79 | "20164 1131" 80 | "20164-1131" 81 | "48044 2335" 82 | "48044-2335" 83 | "37214 2928" 84 | "37214-2928" 85 | "48340 1205" 86 | "48340-1205" 87 | "18054 2531" 88 | "18054-2531" 89 | "30224 5330" 90 | "30224-5330" 91 | "49426 7317" 92 | "49426-7317" 93 | "30340 4294" 94 | "30340-4294" 95 | "08201 2813" 96 | "08201-2813"]}) 97 | 98 | (deftest test-usa-zip-matcher 99 | (doseq [zip5 (:usa sample-postal-codes)] 100 | (is (re-matches (rx/std-regex :geographic :usa :zip) 101 | zip5)))) 102 | 103 | (deftest test-can-postal-code-matcher 104 | (doseq [postal-code (:can sample-postal-codes)] 105 | (is (re-matches (rx/std-regex :geographic :can :postal-code) 106 | postal-code)))) 107 | 108 | (deftest test-north-america-postal-code-matcher 109 | (let [regex (rx/std-regex-compose [:geographic :usa :postal-code] 110 | [:geographic :can :postal-code])] 111 | (doseq [postal-code (concat 112 | (:can sample-postal-codes) 113 | (:usa sample-postal-codes))] 114 | (is (re-matches regex postal-code))))) 115 | 116 | (deftest test-north-america-postal-code-matcher? 117 | (let [regex (rx/std-regex-compose [:geographic :usa :postal-code?] 118 | [:geographic :can :postal-code?])] 119 | (doseq [postal-code (concat 120 | (:can sample-postal-codes) 121 | (:usa sample-postal-codes))] 122 | (is (re-matches regex postal-code))))) 123 | 124 | (deftest test-north-america-postal-code-matcher? 125 | (let [regex (rx/std-regex-compose [:geographic :usa :postal-code?] 126 | [:geographic :can :postal-code?])] 127 | (doseq [postal-code (concat 128 | (:can sample-postal-codes) 129 | (:usa sample-postal-codes))] 130 | (is (re-matches regex postal-code))))) 131 | 132 | (comment 133 | 134 | (apply rx/std-regex [:geographic :can :postal-code]) 135 | 136 | (test-usa-zip-matcher) 137 | (test-can-postal-code-matcher) 138 | (test-north-america-postal-code-matcher) 139 | 140 | ) 141 | -------------------------------------------------------------------------------- /src/clj_etl_utils/linguistics.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.linguistics 2 | (:require 3 | [clj-etl-utils.io :as io] 4 | [clj-etl-utils.lang-utils :refer [raise]]) 5 | (:import [com.rn.codec Nysiis] 6 | [org.apache.commons.codec.language DoubleMetaphone] 7 | [org.apache.commons.codec.language Soundex])) 8 | 9 | 10 | 11 | ;; see: http://norvig.com/spell-correct.html 12 | (def *dict-file* "/usr/share/dict/words") 13 | 14 | (defn load-dictionary [file] 15 | (reduce 16 | (fn [s l] 17 | (conj s (.toLowerCase l))) 18 | #{} 19 | (filter 20 | #(not (empty? %1)) (io/line-read-lines file)))) 21 | 22 | 23 | (def *dict* (atom nil)) 24 | 25 | (defn in-dictionary? [word] 26 | (if-not @*dict* 27 | (reset! *dict* (load-dictionary *dict-file*))) 28 | (not (nil? (get @*dict* (.toLowerCase word))))) 29 | 30 | (def *alphabet* (vec (drop 1 (.split "abcdefghijklmnopqrstuvwxyz" "")))) 31 | 32 | (defn edist1 [word] 33 | (let [splits (for [idx (range 0 (inc (count word)))] 34 | [(.substring word 0 idx) 35 | (.substring word idx)]) 36 | deletes (for [[a b] splits :when (not (empty? b))] 37 | (str a (.substring b 1))) 38 | transposes (for [[a b] splits :when (> (count b) 1)] 39 | (str a 40 | (.substring b 1 2) 41 | (.substring b 0 1) 42 | (.substring b 2))) 43 | replaces (for [[a b] splits :when (not (empty? b)) 44 | c *alphabet*] 45 | (str a c (.substring b 1))) 46 | inserts (for [[a b] splits 47 | c *alphabet*] 48 | (str a c b))] 49 | (set (concat deletes transposes replaces inserts)))) 50 | 51 | 52 | 53 | (defn edist2 [word] 54 | (for [d1 (edist1 word) 55 | d2 (edist1 d1)] 56 | d2)) 57 | 58 | (def double-metaphone 59 | (let [encoder (DoubleMetaphone.)] 60 | (fn [a] 61 | (.encode encoder a)))) 62 | 63 | (def double-metaphone-match? 64 | (let [encoder (DoubleMetaphone.)] 65 | (fn [a b] 66 | (.isDoubleMetaphoneEqual encoder a b)))) 67 | 68 | (def nysiis 69 | (let [encoder (Nysiis.)] 70 | (fn [a] 71 | (.encode encoder a)))) 72 | 73 | (def nysiis-match? 74 | (let [encoder (Nysiis.)] 75 | (fn [a b] 76 | (= 77 | (.encode encoder a) 78 | (.encode encoder b))))) 79 | 80 | (def soundex 81 | (let [encoder (Soundex.)] 82 | (fn [a] 83 | (.encode encoder a)))) 84 | 85 | (def soundex-match? 86 | (let [encoder (Soundex.)] 87 | (fn [a b] 88 | (= (.encode encoder a) 89 | (.encode encoder b))))) 90 | 91 | (defn filter-for-phonetic-equivalence [word permutations] 92 | (let [dmeta (DoubleMetaphone.) 93 | sdex (Soundex.) 94 | target-sdex (.encode sdex word)] 95 | (filter (fn [permutation] 96 | (and (.isDoubleMetaphoneEqual dmeta word permutation) 97 | (Nysiis/isEncodeEqual word permutation) 98 | (= (.encode sdex permutation) target-sdex ))) 99 | permutations))) 100 | 101 | 102 | (defn permutations [word] 103 | (set (concat [word] 104 | (filter-for-phonetic-equivalence word (edist1 word)) 105 | (filter-for-phonetic-equivalence word (edist2 word))))) 106 | 107 | (defn permutations-with-encodings [word] 108 | (for [permutation (vec (permutations word))] 109 | [word 110 | (soundex word) 111 | (double-metaphone word) 112 | (nysiis word) 113 | permutation 114 | (soundex permutation) 115 | (double-metaphone permutation) 116 | (nysiis permutation)])) 117 | 118 | (defn permutations-in-dict-with-encodings [word] 119 | (for [permutation (filter in-dictionary? (concat (edist1 word) 120 | (edist2 word)))] 121 | [word 122 | (soundex word) 123 | (double-metaphone word) 124 | (nysiis word) 125 | permutation 126 | (soundex permutation) 127 | (double-metaphone permutation) 128 | (nysiis permutation)])) 129 | 130 | -------------------------------------------------------------------------------- /test/clj_etl_utils/io_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.io-test 2 | (:require [clj-etl-utils.io :as io]) 3 | (:use [clojure.test] 4 | [clj-etl-utils.test-helper])) 5 | 6 | (deftest test-first-n-bytes-available 7 | (let [rdr (java.io.StringReader. "")] 8 | (is (= [] (io/first-n-bytes-available rdr 0))) 9 | (is (= [] (io/first-n-bytes-available rdr 1))) 10 | (is (= [] (io/first-n-bytes-available rdr 2))) 11 | (is (= [] (io/first-n-bytes-available rdr 3)))) 12 | (let [rdr (java.io.StringReader. "a")] 13 | (is (= [] (io/first-n-bytes-available rdr 0))) 14 | (is (= [97] (io/first-n-bytes-available rdr 1))) 15 | (is (= [97] (io/first-n-bytes-available rdr 2))) 16 | (is (= [97] (io/first-n-bytes-available rdr 3)))) 17 | (let [rdr (java.io.StringReader. "aaaaaaaaaaa")] 18 | (is (= [] (io/first-n-bytes-available rdr 0))) 19 | (is (= [97] (io/first-n-bytes-available rdr 1))) 20 | (is (= [97 97] (io/first-n-bytes-available rdr 2))) 21 | (is (= [97 97 97] (io/first-n-bytes-available rdr 3))) 22 | (is (= [97 97 97 97] (io/first-n-bytes-available rdr 4)))) 23 | (with-open [rdr (java.io.FileReader. (fixture-file "sample.in.txt"))] 24 | (is (= [76 111 114 101] (io/first-n-bytes-available rdr 4))))) 25 | 26 | ;; (test-first-n-bytes-available) 27 | 28 | (deftest test-byte-marker-matches? 29 | (is (not (io/byte-marker-matches? [] []))) 30 | (is (not (io/byte-marker-matches? [1] [0]))) 31 | (is (io/byte-marker-matches? [1] [1])) 32 | (is (io/byte-marker-matches? [1 2] [1 2 3]))) 33 | 34 | ;; (test-byte-marker-matches?) 35 | 36 | ;; (deftest test-detect-file-encoding-via-bom 37 | ;; (is 38 | ;; (= "ISO-8859-1" 39 | ;; (:encoding (io/detect-file-encoding-via-bom 40 | ;; (fixture-file "sample.in.txt"))))) 41 | ;; (is 42 | ;; (= "US-ASCII" 43 | ;; (:encoding (io/detect-file-encoding-via-bom 44 | ;; (fixture-file "sample.in.txt") io/*us-ascii*)))) 45 | ;; (is 46 | ;; (= "UTF-8" 47 | ;; (:encoding (io/detect-file-encoding-via-bom 48 | ;; (fixture-file "sample.utf-8.txt"))))) 49 | ;; (is 50 | ;; (= "UTF-8" 51 | ;; (:encoding (io/detect-file-encoding-via-bom 52 | ;; (fixture-file "sample.utf-8.txt") io/*us-ascii*)))) 53 | ;; (is 54 | ;; (= "UTF-16LE" 55 | ;; (:encoding (io/detect-file-encoding-via-bom 56 | ;; (fixture-file "sample.utf-16le.txt"))))) 57 | ;; (is 58 | ;; (= "UTF-16BE" 59 | ;; (:encoding (io/detect-file-encoding-via-bom 60 | ;; (fixture-file "sample.utf-16be.txt"))))) 61 | ;; (is 62 | ;; (= "UTF-32LE" 63 | ;; (:encoding (io/detect-file-encoding-via-bom 64 | ;; (fixture-file "sample.utf-32le.txt"))))) 65 | ;; (is 66 | ;; (= "UTF-32BE" 67 | ;; (:encoding (io/detect-file-encoding-via-bom 68 | ;; (fixture-file "sample.utf-32be.txt")))))) 69 | 70 | ;; ;; (io/detect-file-encoding-via-bom (fixture-file "sample.utf-32be.txt")) 71 | 72 | ;; ;; (test-detect-file-encoding-via-bom) 73 | 74 | 75 | ;; ;; (prn (java.nio.charset.Charset/availableCharsets)) 76 | 77 | ;; ;; ensure it recognizes the BOM correctly and that it reads off the 78 | ;; ;; BOM if present in all cases, the first few bytes should be: "Lorem" 79 | ;; (deftest test-unicode-input-stream 80 | ;; (let [inp (io/unicode-input-stream (fixture-file "sample.utf-32be.txt"))] 81 | ;; (is (= "UTF-32BE" (.getEncoding inp))) 82 | ;; ;; TODO: assert we've read the first word w/no BOM 83 | ;; ; (is (= "Lorem" (.something inp)) 84 | ;; ) 85 | 86 | 87 | ;; ;; Is UnicodeLittleUnmarked an alternate name for 16le? 88 | ;; (let [inp (io/unicode-input-stream (fixture-file "sample.utf-16le.txt"))] 89 | ;; (is (= "UnicodeLittleUnmarked" (.getEncoding inp)))) 90 | 91 | ;; ;; The stream drops the hyphen, this is otherwise correct 92 | ;; (let [inp (io/unicode-input-stream (fixture-file "sample.utf-8.txt"))] 93 | ;; (is (= "UTF8" (.getEncoding inp))))) 94 | 95 | ;; (test-unicode-input-stream) 96 | 97 | (deftest test-read-fixed-length-string 98 | (is (= "" (io/read-fixed-length-string (io/string-input-stream "foof") 0))) 99 | (is (= "f" (io/read-fixed-length-string (io/string-input-stream "foof") 1))) 100 | (is (= "foof" (io/read-fixed-length-string (io/string-input-stream "foof") 99)))) 101 | 102 | ;; (test-read-fixed-length-string) 103 | 104 | ;; TODO: how do you declare a pending with clojure.test? 105 | ;; (deftest test-drain-line-reader 106 | ;; (is (= "no implemented" ""))) -------------------------------------------------------------------------------- /src/clj_etl_utils/regex.clj: -------------------------------------------------------------------------------- 1 | (ns ^{:doc "Collection of commonly used regular expressions." 2 | :author "Kyle Burton"} clj-etl-utils.regex 3 | (:import 4 | [java.util.regex Pattern Matcher]) 5 | (:require 6 | [clojure.string :as str] 7 | [clj-etl-utils.ref-data :as ref-data])) 8 | 9 | ;; regexes, initial set pulled from Regex::Common CPAN module 10 | (def common-regexes 11 | {:numeric 12 | {:real #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|)))" 13 | :int #"(?xism:(?:(?:[+-]?)(?:[0123456789]+)))" 14 | :dec #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|)))" 15 | :decimal #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)))" 16 | :hex #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789ABCDEF]|[.])(?:[0123456789ABCDEF]*)(?:(?:[.])(?:[0123456789ABCDEF]{0,}))?)(?:(?:[G])(?:(?:[+-]?)(?:[0123456789ABCDEF]+))|)))" 17 | :oct #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[01234567]|[.])(?:[01234567]*)(?:(?:[.])(?:[01234567]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[01234567]+))|)))" 18 | :bin #"(?xism:(?:(?i)(?:[+-]?)(?:(?=[01]|[.])(?:[01]*)(?:(?:[.])(?:[01]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[01]+))|)))" 19 | :roman #"(?xism:(?xi)(?=[MDCLXVI]) 20 | (?:M{0,3} 21 | (D?C{0,3}|CD|CM)? 22 | (L?X{0,3}|XL|XC)? 23 | (V?I{0,3}|IV|IX)?))"} 24 | :geographic 25 | {:iso 26 | {:country-3 (Pattern/compile 27 | (format 28 | "(?xism:%s)" 29 | (str/join 30 | "|" 31 | (map first ref-data/iso-3-country-codes)))) 32 | 33 | :country-2 (Pattern/compile 34 | (format 35 | "(?xism:%s)" 36 | (str/join 37 | "|" 38 | (map first ref-data/iso-2-country-codes))))} 39 | :usa 40 | {:zip #"^\d{5}[-\s]?(?:\d{4})?$" 41 | :zip? #"\d{5}[-\s]?(?:\d{4})?" 42 | ;; NB: same as zip, just using a consistent name 43 | :postal-code #"^\d{5}[-\s]?(?:\d{4})?$" 44 | :postal-code? #"\d{5}[-\s]?(?:\d{4})?" 45 | :state (Pattern/compile (format "(?xism:%s)" (str/join "|" (map first ref-data/us-states)))) 46 | :state-name (Pattern/compile (format "(?xism:%s)" (str/join "|" (map second ref-data/us-states)))) 47 | :airport-code (Pattern/compile (format "(?xism:%s)" (str/join "|" (map #(nth % 2) ref-data/us-airport-codes)))) 48 | :area-code (Pattern/compile (format "(?xism:%s)" (str/join "|" ref-data/us-area-codes))) 49 | :phone #"(?:1[- ]?)?\(?[2-9]\d{2}\)?[-\. ]?\d{3}[-\. ]?\d{4}(?:\s*(?:e|ex|ext|x|xtn|extension)?\s*\d*)"} 50 | 51 | :can 52 | {:postal-code #"^(?i:[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ]\s?\d[ABCEGHJKLMNPRSTVWXYZ]\d)$" 53 | :postal-code? #"(?i:[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ]\s?\d[ABCEGHJKLMNPRSTVWXYZ]\d)"}} 54 | 55 | :internet 56 | {:ipv4 #"(?xism:(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})))" 57 | :mac #"(?xism:(?:(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2})))" 58 | :net-domain #"(?xism:(?: |(?:[A-Za-z](?:(?:[-A-Za-z0-9]){0,61}[A-Za-z0-9])?(?:\.[A-Za-z](?:(?:[-A-Za-z0-9]){0,61}[A-Za-z0-9])?)*)))"} 59 | :general 60 | {:word #"(?:[\w-]+)" 61 | :punctuation #"(?:[\.,\?/'\";:\\`~!\(\)]+)"}}) 62 | 63 | 64 | (comment 65 | (:zip common-regexes) 66 | 67 | ) 68 | 69 | (defn std-regex [& path] 70 | (get-in common-regexes path)) 71 | 72 | (defn std-regex-compose [& paths] 73 | (Pattern/compile (format "(?:%s)" 74 | (str/join 75 | "|" 76 | (map (fn [path] 77 | (format "(?:%s)" (apply std-regex path))) 78 | paths))))) 79 | 80 | (comment 81 | 82 | (std-regex :geographic :usa :postal-code?) 83 | (std-regex-compose [:geographic :usa :postal-code?] 84 | [:geographic :can :postal-code?]) 85 | 86 | (re-matches (std-regex-compose [:geographic :usa :postal-code?] 87 | [:geographic :can :postal-code?]) "m4w1j5") 88 | ) 89 | 90 | (defn all-groups 91 | "Extracts all the groups from a java.util.regex.Matcher into a seq." 92 | [^java.util.regex.Matcher m] 93 | (for [grp (range 1 (+ 1 (.groupCount m)))] 94 | (.group m (int grp)))) 95 | 96 | 97 | (defn re-find-all 98 | "Retreive all of the matches for a regex in a given string." 99 | [re str] 100 | (doall 101 | (loop [m (re-matcher (if (isa? (class re) String) (re-pattern re) re) str) 102 | res []] 103 | (if (.find m) 104 | (recur m (conj res (vec (all-groups m)))) 105 | res)))) 106 | 107 | (defn re-find-first 108 | "Retreive the first set of match groups for a regex in a given string." 109 | [re str] 110 | (first 111 | (doall 112 | (loop [m (re-matcher (if (isa? (class re) String) (re-pattern re) re) str) 113 | res []] 114 | (if (.find m) 115 | (recur m (conj res (vec (all-groups m)))) 116 | res))))) 117 | -------------------------------------------------------------------------------- /src/clj_etl_utils/cache_utils.clj: -------------------------------------------------------------------------------- 1 | ;; # Caching Utilities 2 | ;; 3 | ;; Basic memoization produces a function with a cache wrapped around it. 4 | ;; This type of cache will have unbounded growth. The functions 5 | ;; in this module are for maintaining similar behavior while allowing 6 | ;; caches to have various flushing policies associated with them. 7 | ;; 8 | (ns clj-etl-utils.cache-utils 9 | (:require 10 | [clojure.tools.logging :as log]) 11 | (:use 12 | [clj-etl-utils.lang-utils :only [raise aprog1]]) 13 | (:import 14 | [org.joda.time DateTime])) 15 | 16 | 17 | ;; ## Cache Registry 18 | ;; 19 | ;; The intent is for static caches to be registered so they can be 20 | ;; managed by code outside of the memoize wrapper. 21 | ;; 22 | ;; Best practice for naming caches is the namespace and the wrapped 23 | ;; function name. Eg: 24 | ;; 25 | ;; `(register-cache :clj-etl-utils.cache-utils.my-function #{:standard} (atom {}))` 26 | ;; 27 | (defonce cache-registry (atom {})) 28 | 29 | ;; ### Cache Tags 30 | ;; 31 | ;; Allows caches to be operated on based on a 'type'. 32 | ;; The first supported type for this module is :standard 33 | ;; which represents a standard memoize based cache. 34 | 35 | (defn register-cache [name #^java.util.Set tags cache-ref & [cache-reset-fn]] 36 | (swap! cache-registry assoc name {:name name :tags tags :cache cache-ref :reset-fn (or cache-reset-fn 37 | (fn [entry] (reset! (:cache entry) {})))})) 38 | 39 | (defn lookup-cache-by-name [name] 40 | (get @cache-registry name)) 41 | 42 | (defn purge-cache-named [n] 43 | (reset! (:cache (lookup-cache-by-name n)) 44 | {})) 45 | 46 | (defn lookup-caches-by-tag [tag] 47 | (filter (fn [entry] 48 | (contains? (:tags entry) tag)) 49 | (vals @cache-registry))) 50 | 51 | (defn purge-caches-with-tag [tag] 52 | (doseq [entry (lookup-caches-by-tag tag)] 53 | ((:reset-fn entry) entry))) 54 | 55 | (defn purge-standard-caches [] 56 | (purge-caches-with-tag :standard)) 57 | 58 | (comment 59 | 60 | (map :cache (lookup-caches-by-tag :standard)) 61 | ) 62 | 63 | (defn wrap-standard-cache [name tags the-fn args-ser-fn] 64 | (let [cache (atom {})] 65 | (register-cache name tags cache) 66 | (fn [& args] 67 | (let [k (args-ser-fn args) 68 | cmap @cache] 69 | (if (contains? cmap k) 70 | (get cmap k) 71 | (aprog1 72 | (apply the-fn args) 73 | (swap! cache assoc k it))))))) 74 | 75 | (defn simple-cache [name the-fn] 76 | (wrap-standard-cache name #{:standard} the-fn identity)) 77 | 78 | (defmacro def-simple-cached [name arg-spec & body] 79 | `(def ~name 80 | (simple-cache ~(keyword (str *ns* "." name)) 81 | (fn ~arg-spec 82 | ~@body)))) 83 | 84 | (defn wrap-countdown-cache [name tags the-fn config] 85 | (let [cache (atom {}) 86 | args-ser-fn (:args-ser-fn config) 87 | max-hits (:max-hits config 100) 88 | nhits (java.util.concurrent.atomic.AtomicLong. 0)] 89 | (register-cache name tags cache) 90 | (fn [& args] 91 | (let [k (args-ser-fn args) 92 | cmap @cache] 93 | (when (>= (.incrementAndGet nhits) max-hits) 94 | (.set nhits 0) 95 | (reset! cache {})) 96 | (if (contains? cmap k) 97 | (get cmap k) 98 | (aprog1 99 | (apply the-fn args) 100 | (swap! cache assoc k it))))))) 101 | 102 | (defmacro def-countdown-cached [name max-hits arg-spec & body] 103 | `(def ~name 104 | (wrap-countdown-cache 105 | ~(keyword (str *ns* "." name)) 106 | #{:countdown} 107 | (fn ~arg-spec 108 | ~@body) 109 | {:max-hits ~max-hits 110 | :args-ser-fn identity}))) 111 | 112 | (defn wrap-timeout-cache [name tags the-fn config] 113 | (let [cache (atom {}) 114 | args-ser-fn (:args-ser-fn config) 115 | duration (long (:duration config (* 1000 60 60))) 116 | exp-time (atom (.plusMillis (DateTime.) duration))] 117 | (register-cache name tags cache) 118 | (fn [& args] 119 | (let [k (args-ser-fn args) 120 | cmap @cache] 121 | (when (.isBeforeNow ^DateTime @exp-time) 122 | (reset! exp-time (.plusMillis (DateTime.) duration)) 123 | (reset! cache {})) 124 | (if (contains? cmap k) 125 | (get cmap k) 126 | (aprog1 127 | (apply the-fn args) 128 | (swap! cache assoc k it))))))) 129 | 130 | 131 | (defmacro def-timeout-cached [name duration arg-spec & body] 132 | `(def ~name 133 | (wrap-timeout-cache 134 | ~(keyword (str *ns* "." name)) 135 | #{:timeout} 136 | (fn ~arg-spec 137 | ~@body) 138 | {:duration ~duration 139 | :args-ser-fn identity}))) 140 | 141 | (defn timeout-with-fallback-cache [name tags timeout-ms the-fn] 142 | (let [cache (atom {}) 143 | now-ms (fn [] (.getTime (java.util.Date.))) 144 | store-in-cache! (fn store-in-cache [cache-key res] 145 | (swap! cache assoc cache-key {:res res :time (now-ms)}) 146 | res) 147 | in-cache-and-not-expired? (fn in-cache-and-not-expired [cache-key] 148 | (if-let [entry (get @cache cache-key)] 149 | (< (- (now-ms) (:time entry)) timeout-ms) 150 | false))] 151 | (register-cache name tags cache) 152 | (fn timeout-with-fallback-cache-inner [& args] 153 | (cond 154 | (not (contains? @cache args)) 155 | (store-in-cache! args (apply the-fn args)) 156 | 157 | (in-cache-and-not-expired? args) 158 | (:res (get @cache args)) 159 | 160 | :in-cache-but-expired 161 | (try 162 | (log/infof "in-cache-and-not-expired: refetching: %s" args) 163 | (store-in-cache! args (apply the-fn args)) 164 | (catch Exception ex 165 | (log/errorf ex "Error executing wrapped function! (will return old cached value) %s" ex) 166 | (:res (get @cache args)))))))) 167 | 168 | (defmacro def-timeout-with-fallback-cache [fn-name timeout-ms args-spec & body] 169 | `(def ~fn-name 170 | (timeout-with-fallback-cache ~(keyword (str *ns* "." fn-name)) #{:timeout :fallback} ~timeout-ms 171 | (fn ~args-spec 172 | ~@body)))) 173 | 174 | 175 | 176 | (defn invalidate-standard-cache [cache-name args] 177 | (let [cache 178 | (-> 179 | @cache-registry 180 | (get cache-name) 181 | :cache)] 182 | (swap! cache 183 | dissoc args))) 184 | -------------------------------------------------------------------------------- /src/clj_etl_utils/time.clj: -------------------------------------------------------------------------------- 1 | (ns clj-etl-utils.time 2 | (:require 3 | [clj-etl-utils.lang-utils :refer [raise]] 4 | [clj-time.core :as time] 5 | [clj-time.format :as tformat]) 6 | (:import 7 | [org.joda.time LocalDate DateTime Days DateTimeConstants DateTimeZone Minutes])) 8 | 9 | (defn date-seq 10 | "Sequence of timestamps, each one day ahead of the previous, inclusive of the end-time. 11 | 12 | (time/date-seq 13 | (org.joda.time.DateTime. \"2014-05-06T12:59:59Z\") 14 | (org.joda.time.DateTime. \"2014-05-08T12:59:59Z\")) 15 | => 16 | (# 17 | # 18 | #) 19 | 20 | " 21 | [^DateTime start-date ^DateTime end-date] 22 | (take-while 23 | #(or (.isBefore ^DateTime %1 end-date) 24 | (.isEqual ^DateTime %1 end-date)) 25 | (iterate (fn [^DateTime d] 26 | (.plusDays d 1)) 27 | start-date))) 28 | 29 | (defn same-day? 30 | "Test if the two DateTime's represent the same day. 31 | 32 | (same-day? 33 | (org.joda.time.DateTime. \"2014-05-06T12:59:59Z\") 34 | (org.joda.time.DateTime. \"2014-05-06T12:59:59Z\")) 35 | => true 36 | 37 | " 38 | [^DateTime t1 ^DateTime t2] 39 | (let [d1 (.toLocalDate t1) 40 | d2 (.toLocalDate t2)] 41 | (.isEqual d1 d2))) 42 | 43 | (def days-of-week-long 44 | {DateTimeConstants/SUNDAY "Sunday" 45 | DateTimeConstants/MONDAY "Monday" 46 | DateTimeConstants/TUESDAY "Tuesday" 47 | DateTimeConstants/WEDNESDAY "Wednesday" 48 | DateTimeConstants/THURSDAY "Thursday" 49 | DateTimeConstants/FRIDAY "Friday" 50 | DateTimeConstants/SATURDAY "Saturday"}) 51 | 52 | (defn day-of-week-long [^DateTime dt] 53 | (get days-of-week-long (.getDayOfWeek dt))) 54 | 55 | (def days-of-week-abbr 56 | {DateTimeConstants/SUNDAY "Sun" 57 | DateTimeConstants/MONDAY "Mon" 58 | DateTimeConstants/TUESDAY "Tue" 59 | DateTimeConstants/WEDNESDAY "Wed" 60 | DateTimeConstants/THURSDAY "Thu" 61 | DateTimeConstants/FRIDAY "Fri" 62 | DateTimeConstants/SATURDAY "Sat"}) 63 | 64 | (defn day-of-week-abbr [^DateTime dt] 65 | (get days-of-week-abbr (.getDayOfWeek dt))) 66 | 67 | (def days-of-week-short 68 | {DateTimeConstants/SUNDAY "S" 69 | DateTimeConstants/MONDAY "M" 70 | DateTimeConstants/TUESDAY "T" 71 | DateTimeConstants/WEDNESDAY "W" 72 | DateTimeConstants/THURSDAY "Th" 73 | DateTimeConstants/FRIDAY "F" 74 | DateTimeConstants/SATURDAY "S"}) 75 | 76 | 77 | (defonce hour-minute-time-formatter (tformat/formatter "HH:mm")) 78 | 79 | (defn make-time-zone-for-id 80 | "Helper that translates coloquial time zones (eg: EDT and PDT) to the official zone. Returns a Joda DateTimeZone." 81 | [^String id] 82 | (let [id (and id (.toUpperCase id))] 83 | (cond 84 | (nil? id) 85 | (DateTimeZone/forID "EST5EDT") 86 | 87 | (= "EDT" id) 88 | (DateTimeZone/forID "EST5EDT") 89 | 90 | (= "PDT" id) 91 | (DateTimeZone/forID "PST8PDT") 92 | 93 | (= "PST" id) 94 | (DateTimeZone/forID "America/Los_Angeles") 95 | 96 | :otherwise 97 | (DateTimeZone/forID id)))) 98 | 99 | (defn mins-between [^String start-hour-min ^String end-hour-min] 100 | (let [^DateTime stime (tformat/parse hour-minute-time-formatter start-hour-min) 101 | ^DateTime etime (tformat/parse hour-minute-time-formatter end-hour-min) 102 | end-is-before-start? (.isBefore etime stime) 103 | ^DateTime etime (if end-is-before-start? 104 | (time/plus etime (time/minutes (* 24 60))) 105 | etime) 106 | shour (.getHourOfDay stime) 107 | smin (.getMinuteOfHour stime) 108 | ehour (.getHourOfDay etime) 109 | emin (.getMinuteOfHour etime) 110 | diff (Minutes/minutesBetween stime etime)] 111 | (.getMinutes diff))) 112 | 113 | (defn translate-time-of-day-to-utc-time-stamp 114 | " 115 | (translate-time-of-day-to-utc-time-of-day \"09:00\" \"EDT\") 116 | => ^DateTime timestamp 117 | " 118 | [^String hour-of-day ^org.joda.time.DateTime tstamp ^String tz] 119 | (let [^DateTime htime (tformat/parse hour-minute-time-formatter hour-of-day) 120 | tstamp (time/to-time-zone tstamp (make-time-zone-for-id tz)) 121 | tstamp (.withTime 122 | tstamp 123 | (.getHourOfDay htime) 124 | (.getMinuteOfHour htime) 125 | 0 0) 126 | tstamp (time/to-time-zone tstamp (DateTimeZone/forID "UTC"))] 127 | tstamp)) 128 | 129 | (comment 130 | 131 | (hour-span-to-time-stamps current-time start-hour end-hour tzone) 132 | 133 | (translate-time-of-day-to-utc-time-stamp hour-of-day tstamp tz) 134 | ) 135 | 136 | (defn hour-span-to-time-stamps 137 | " (hour-span-to-time-stamps tstamp \"09:00\" \"17:00\" \"EDT\") " 138 | [^DateTime current-time ^String start-hour ^String end-hour ^String tzone] 139 | (let [start-tstamp (translate-time-of-day-to-utc-time-stamp start-hour current-time tzone) 140 | mins (mins-between start-hour end-hour) 141 | end-tstamp (time/plus start-tstamp (time/minutes mins))] 142 | [start-tstamp end-tstamp])) 143 | 144 | (defn minutes-into-day [^org.joda.time.DateTime dt] 145 | (+ 146 | (* 60 (.getHourOfDay dt)) 147 | (.getMinuteOfHour dt))) 148 | 149 | (def int->day-of-week-keyword 150 | {1 :monday 151 | 2 :tuesday 152 | 3 :wednesday 153 | 4 :thursday 154 | 5 :friday 155 | 6 :saturday 156 | 7 :sunday}) 157 | 158 | (defn joda-time->day-of-week 159 | [^org.joda.time.DateTime joda-time] 160 | (let [day-of-week-int (.getDayOfWeek joda-time)] 161 | (get int->day-of-week-keyword day-of-week-int))) 162 | 163 | (defn during-business-hours? 164 | " 165 | (during-business-hours? 166 | (time/now) 167 | \"09:00\" 168 | \"17:00\" 169 | \"EDT\") 170 | 171 | " 172 | [^org.joda.time.DateTime current-time ^String start-hour-min ^String end-hour-min ^String tz] 173 | (let [current-time (time/to-time-zone current-time (DateTimeZone/forID "UTC")) 174 | [bus-start bus-end] (hour-span-to-time-stamps 175 | current-time 176 | start-hour-min 177 | end-hour-min 178 | tz) 179 | current-mins (minutes-into-day current-time) 180 | bus-start-mins (minutes-into-day bus-start) 181 | bus-end-mins (minutes-into-day bus-end)] 182 | (if (< bus-end-mins bus-start-mins) 183 | (or (>= current-mins bus-start-mins) 184 | (< current-mins bus-end-mins)) 185 | (and 186 | (<= bus-start-mins current-mins) 187 | (< current-mins bus-end-mins))))) 188 | -------------------------------------------------------------------------------- /java/com/rn/codec/Nysiis.java: -------------------------------------------------------------------------------- 1 | package com.rn.codec; 2 | 3 | import org.apache.commons.codec.Encoder; 4 | 5 | /** 6 | * A class to generate phonetic codings based on the New York State 7 | * Identification and Intelligence System algorithm. This module is based on 8 | * the code from the Perl module available from CPAN, which derives from an 9 | * implementation by Ben Kennedy. 10 | * 11 | * @see http://www.nist.gov/dads/HTML/nysiis.html 12 | * @see http://search.cpan.org/search?query=nysiis&mode=all 13 | * 14 | * @see Atack, J., and F. Bateman. 1992 . 15 | * "Matchmaker, matchmaker, make me a match" : a general 16 | * computer-based matching program for historical researc. 17 | * Historical Methods 25: 53-65. 18 | * 19 | * @author Kyle R. Burton 20 | */ 21 | public final class Nysiis implements Encoder { 22 | 23 | /** Enable/disable internal debugging. */ 24 | private boolean debug = false; 25 | 26 | /** The name to be encoded. */ 27 | private StringBuffer word = null; 28 | 29 | /** 30 | * Static version of encode. This method was originaly created to allow this 31 | * encoder to be used as a Java Stored Procedure in Oracle. 32 | * @param word the data to encode. 33 | * @return the encoded string. 34 | */ 35 | public static String sencode( String word ) { 36 | Nysiis ny = new Nysiis(); 37 | return ny.encode(word); 38 | } 39 | 40 | /** 41 | * Encode the given string using the Nysiis phonetic encoding algorithm. 42 | * @param String originalWord 43 | * @return String - the encoded word 44 | */ 45 | public String encode( String originalWord ) { 46 | 47 | if( originalWord != null && 48 | originalWord.length() > 0 ) { 49 | word = new StringBuffer( originalWord.toUpperCase() ); 50 | } else { 51 | return ""; 52 | } 53 | char first; 54 | 55 | // strip any trailing S or Zs 56 | while(word.toString().endsWith("S") || word.toString().endsWith("Z")) { 57 | word.deleteCharAt( word.length() - 1 ); 58 | } 59 | 60 | replaceFront( "MAC", "MC" ); 61 | replaceFront( "PF", "F" ); 62 | replaceEnd( "IX", "IC" ); 63 | replaceEnd( "EX", "EC" ); 64 | 65 | replaceEnd( "YE", "Y" ); 66 | replaceEnd( "EE", "Y" ); 67 | replaceEnd( "IE", "Y" ); 68 | 69 | replaceEnd( "DT", "D" ); 70 | replaceEnd( "RT", "D" ); 71 | replaceEnd( "RD", "D" ); 72 | 73 | 74 | replaceEnd( "NT", "N" ); 75 | replaceEnd( "ND", "N" ); 76 | 77 | // .EV => .EF 78 | replaceAll( "EV", "EF", 1 ); 79 | 80 | first = word.charAt(0); 81 | 82 | 83 | // replace all vowels with 'A' 84 | // word = replaceAll( word, "A", "A" ); 85 | replaceAll( "E", "A" ); 86 | replaceAll( "I", "A" ); 87 | replaceAll( "O", "A" ); 88 | replaceAll( "U", "A" ); 89 | 90 | // remove any 'W' that follows a vowel 91 | replaceAll( "AW", "A" ); 92 | 93 | replaceAll( "GHT", "GT" ); 94 | replaceAll( "DG", "G" ); 95 | replaceAll( "PH", "F" ); 96 | 97 | replaceAll( "AH", "A", 1 ); 98 | replaceAll( "HA", "A", 1 ); 99 | 100 | replaceAll( "KN", "N" ); 101 | replaceAll( "K", "C" ); 102 | 103 | replaceAll( "M", "N", 1 ); 104 | replaceAll( "Q", "G", 1 ); 105 | 106 | replaceAll( "SH", "S" ); 107 | replaceAll( "SCH", "S" ); 108 | 109 | replaceAll( "YW", "Y" ); 110 | 111 | replaceAll( "Y", "A", 1, word.length() - 2 ); 112 | 113 | replaceAll( "WR", "R" ); 114 | 115 | replaceAll( "Z", "S", 1 ); 116 | 117 | replaceEnd( "AY", "Y" ); 118 | 119 | while(word.toString().endsWith("A")) { 120 | word.deleteCharAt( word.length() - 1 ); 121 | } 122 | 123 | reduceDuplicates(); 124 | 125 | if( ( 'A' == first 126 | || 'E' == first 127 | || 'I' == first 128 | || 'O' == first 129 | || 'U' == first ) && word.length() > 0 ){ 130 | word.deleteCharAt(0); 131 | word.insert(0,first); 132 | } 133 | 134 | return word.toString(); 135 | } 136 | 137 | /** 138 | * Traverse the string reducing duplicated characters. 139 | */ 140 | private void reduceDuplicates() { 141 | char lastChar; 142 | StringBuffer newWord = new StringBuffer(); 143 | 144 | if(0 == word.length()) { 145 | return; 146 | } 147 | 148 | lastChar = word.charAt(0); 149 | newWord.append(lastChar); 150 | for(int i = 1; i < word.length(); ++i) { 151 | if(lastChar != word.charAt(i)) { 152 | newWord.append(word.charAt(i)); 153 | } 154 | lastChar = word.charAt(i); 155 | } 156 | 157 | log("reduceDuplicates: " + word); 158 | 159 | word = newWord; 160 | } 161 | 162 | /** 163 | * Replace all occurances of the given pattern in the string to be encoded 164 | * with the given replacement. 165 | * @param find the sequence to locate 166 | * @param repl the string to replace it with 167 | */ 168 | private void replaceAll( String find, 169 | String repl ) { 170 | replaceAll(find,repl,0,-1); 171 | } 172 | 173 | /** 174 | * Replace all occurances of the given pattern in the string to be encoded 175 | * with the given replacement, beginning at the given staring position. 176 | * @param find the sequence to locate 177 | * @param repl the string to replace it with 178 | * @param startPos the position to begin at 179 | */ 180 | private void replaceAll( String find, 181 | String repl, 182 | int startPos ) { 183 | replaceAll(find,repl,startPos,-1); 184 | } 185 | 186 | /** 187 | * Replace all occurances of the given pattern in the string to be encoded 188 | * with the given replacement, beginning at the given staring position up to 189 | * the given end position. 190 | * @param find the sequence to locate 191 | * @param repl the string to replace it with 192 | * @param startPos the position to begin at 193 | * @param endPos the position to stop at 194 | */ 195 | private void replaceAll( String find, 196 | String repl, 197 | int startPos, 198 | int endPos ) { 199 | int pos = word.toString().indexOf(find,startPos); 200 | 201 | /* 202 | log("Nysiis.replaceAll(): " 203 | + "pos: " + pos + " " 204 | + "word: " + word + " " 205 | + "find: " + find + " " 206 | + "repl: " + repl + " " 207 | + "startPos: " + startPos + " " 208 | + "endPos: " + endPos + " " 209 | ); 210 | */ 211 | 212 | if(-1 == endPos) { 213 | endPos = word.length() - 1; 214 | } 215 | 216 | while(-1 != pos) { 217 | if(-1 != endPos && pos > endPos) { 218 | log("stopping pos > endPos: " + pos + ":" + endPos); 219 | break; 220 | } 221 | // log("word[" + word.length() + "]: " + word); 222 | // log("deleting at: " + pos + ", " + (find.length() - 1)); 223 | 224 | word.delete( pos, pos + find.length() ); 225 | // log("del[" + word.length() + "]: " + word); 226 | 227 | word.insert( pos, repl ); 228 | // log("ins[" + word.length() + "]: " + word); 229 | 230 | pos = word.toString().indexOf(find); 231 | // log("new pos[" + word.length() + "]: " + pos); 232 | log("replaceAll[" + find + "," + repl + "]: " + word); 233 | } 234 | 235 | } 236 | 237 | /** 238 | * If the encoded string begins with the given find string, replace it. 239 | * @param find the prefix to test for 240 | * @param repl the replacement to substitue 241 | */ 242 | private void replaceFront( String find, 243 | String repl ) { 244 | if(word.toString().startsWith(find)) { 245 | word.delete( 0, find.length() ); 246 | word.insert( 0, repl ); 247 | log("replaceFront[" + find + "]: " + word); 248 | } 249 | } 250 | 251 | /** 252 | * If the encoded string ends with the given find string, replace it. 253 | * @param find the suffix to test for 254 | * @param repl the replacement to substitue 255 | */ 256 | private void replaceEnd( String find, 257 | String repl ) { 258 | if(word.toString().endsWith(find)) { 259 | word.delete( word.length() - find.length(), word.length() ); 260 | word.append(repl); 261 | log("replaceEnd[" + find + "]: " + word); 262 | } 263 | } 264 | 265 | /** 266 | * Logging statement controlled by the debug member. 267 | * @param msg the message to optionaly log. 268 | */ 269 | private void log( String msg ) { 270 | if(!debug) { return; } 271 | System.out.println(msg); 272 | System.out.flush(); 273 | } 274 | 275 | /** 276 | * Check if the two strings encode to the same primary or alternate encodings 277 | * using the Nysiis algorithm. 278 | * @param s1 279 | * @param s2 280 | * @return true/false 281 | */ 282 | public static boolean isEncodeEqual( String s1, String s2 ) { 283 | return sencode( s1 ).equals( sencode( s2 ) ); 284 | } 285 | 286 | public Object encode (Object thing ) { 287 | return encode((String) thing); 288 | } 289 | } 290 | 291 | 292 | 293 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clj-etl-utils 2 | 3 | ETL Utilities for Clojure. This library began with functions that worked with data on disk, such as database dumps and log files, at least that was the original purpose of the library, it has since grown to include other utilities. 4 | 5 | NB: this library was created in 2010, since then many of the functions and utilities here are availabile in other libraries (eg: camel-snake-kebab). 6 | 7 | ## Modules 8 | 9 | ### `clj-etl-utils.io` 10 | 11 | IO and File utilities. 12 | 13 | #### string-reader, string-input-stream 14 | 15 | Returns a Reader or an InputStream, respectively, that will read from the given string. 16 | 17 | #### read-fixed-length-string 18 | 19 | Reads a fixed-length string. 20 | 21 | #### chmod 22 | 23 | Changes the permissions on a file by shelling out to the `chmod` command. 24 | 25 | #### mkdir 26 | 27 | Creates the given directory, just returning true if the given directory already exists (as opposed to throwing an exception). 28 | 29 | #### exists? 30 | 31 | Tests if a file exists. 32 | 33 | #### symlink 34 | 35 | Establishes a symlink for a file. 36 | 37 | #### freeze, thaw 38 | 39 | freeze invokes the java serialization and returns a byte array. Thaw does the opposite: takes a byte array and deserializes it. 40 | 41 | #### object->file 42 | 43 | Uses Java serialization to write an object to the given file, truncating if it exists. 44 | 45 | #### file->object 46 | 47 | Deserializes a serialized object from a file. 48 | 49 | #### ensure-directory 50 | 51 | Ensures a directory path exists (recursively), doing nothing if it already exists. 52 | 53 | #### string-gzip 54 | 55 | Compress a string, returning the bytes. 56 | 57 | #### byte-partitions-at-line-boundaries 58 | 59 | This can be used in divide and conquer scenarios where you want to process different segments of a single file in parallel. It takes an input file name and a desired block size. Block boundaries will be close to the desired size - the size is used as a seek position, any line remnant present at that position is read, such that a given block will end cleanly at a line boundary. 60 | 61 | #### random-access-file-line-seq-with-limit 62 | 63 | Returns a lazy sequence of lines from a RandomAccessFile up to a given limit. If a line spans the limit, the entire line will be returned, so that a valid line is always returned. 64 | 65 | #### read-lines-from-file-segment 66 | 67 | Returns a sequence of lines from the file across the given starting and ending positions. 68 | 69 | ### `clj-etl-utils.landmark_parser` 70 | 71 | Semi-structured Text parsing library. The library uses a command set to extract portions of a text based document. Commands are instructions such as: start, end forward-char, backward-char, forward-past, rewind-to and so on. The landmarks can be either literal or logical (regular expressions, 'types', etc). This extractor can often succeed in situations where there is mixed media (javascript embedded in html), or inconsisten structure (the outputs of pdf to text). It should be able to operate on a document as long as there are identifiable sets of landmarks. 72 | 73 | ```clojure 74 | ;; this example extracts the last line from the document (string) 75 | (lp/extract 76 | (lp/make-parser "This is some text. There is some in the middle.\nThere is some towards the end, but it is not this.\nA few sentences in all.") 77 | [[:end nil] 78 | [:rewind-to "\n"]] 79 | [:end nil]) 80 | ``` 81 | 82 | ### `clj-etl-utils.lang-utils` 83 | 84 | `lang/make-periodic-invoker` can be used to easily create 'progress' indicators or bars 85 | 86 | #### Example 87 | 88 | ```clojure 89 | (let [total 1000 90 | period 100 91 | started-at-ms (.getTime (java.util.Date.)) 92 | progress (make-periodic-invoker 93 | period 94 | (fn [status counter] 95 | (let [elapsed-ms (- (.getTime (java.util.Date.)) started-at-ms) 96 | elapsed-secs (/ elapsed-ms 1000) 97 | num-remaining (- total counter) 98 | rate-per-sec (/ counter elapsed-secs) 99 | eta-secs (/ num-remaining rate-per-sec)] 100 | (cond 101 | (= status :final) 102 | (printf "All Done! %d processed in %d seconds at %3.2f/s\n" counter (long elapsed-secs) (double rate-per-sec)) 103 | 104 | :else 105 | (printf "So far we've completed %d of %d, we are %3.2f%% complete at %3.2f per second, we should be done in %d seconds.\n" 106 | counter 107 | total 108 | (double (* 100.0 (/ counter total 1.0))) 109 | (double rate-per-sec) 110 | (long eta-secs))))))] 111 | (dotimes [ii total] 112 | ;; do some work / processing here 113 | (Thread/sleep ^long (rand-nth [0 1 2 3 4 5])) 114 | (progress)) 115 | (progress :final) 116 | :done) 117 | ``` 118 | 119 | Produces the following output: 120 | 121 | ```text 122 | So far we've completed 100 of 1000, we are 10.00% complete at 273.22 per second, we should be done in 3 seconds. 123 | So far we've completed 200 of 1000, we are 20.00% complete at 294.55 per second, we should be done in 2 seconds. 124 | So far we've completed 300 of 1000, we are 30.00% complete at 300.30 per second, we should be done in 2 seconds. 125 | So far we've completed 400 of 1000, we are 40.00% complete at 298.73 per second, we should be done in 2 seconds. 126 | So far we've completed 500 of 1000, we are 50.00% complete at 305.06 per second, we should be done in 1 seconds. 127 | So far we've completed 600 of 1000, we are 60.00% complete at 308.01 per second, we should be done in 1 seconds. 128 | So far we've completed 700 of 1000, we are 70.00% complete at 306.35 per second, we should be done in 0 seconds. 129 | So far we've completed 800 of 1000, we are 80.00% complete at 308.76 per second, we should be done in 0 seconds. 130 | So far we've completed 900 of 1000, we are 90.00% complete at 303.85 per second, we should be done in 0 seconds. 131 | So far we've completed 1000 of 1000, we are 100.00% complete at 303.67 per second, we should be done in 0 seconds. 132 | All Done! 1000 processed in 3 seconds at 303.58/s 133 | ``` 134 | 135 | ### `clj-etl-utils.ref_data` 136 | 137 | Static reference data and lookup tables 138 | 139 | * USA State Names and Abbreviations 140 | * USA Airport Codes 141 | * USA Phone Number Area Codes 142 | * ISO Country Codes 143 | 144 | ### `clj-etl-utils.regex` 145 | 146 | Ready made Regular expressions to match items like numbers, iso country codes, USA postal codes, USA state names, USA phone numbers, ipv4 adresses, etc. 147 | 148 | ### `clj-etl-utils.sequences` 149 | 150 | Sequence utilities, eg: `make-stream-sampler`, `make-reservoir-sampler`, `reservoir-sample-seq`. 151 | 152 | ### `clj-etl-utils.text` 153 | 154 | Text manipulation helper functions, eg: `md5->string`, `string->md5`, `sha1->string`, `string->sha1`, `substr` that supports negative indexes to read from the right hand side, `human-readable-byte-count`, `canonical-phone-number`, `encode-base64`, `decode-base64` 155 | 156 | ### `clj-etl-utils.indexer` 157 | 158 | Module for working with line-oriented data files in-situ on disk. These tools allow you to create (somewhat) arbitrary indexes into a file and efficiently find lines based on the indexed values. 159 | 160 | #### Example 161 | 162 | Given the tab delimited file `file.txt`: 163 | 164 | ``` 165 | 99 line with larger key 166 | 1 is is the second line 167 | 2 this is a line 168 | 3 this is another line 169 | 99 duplicated line for key 170 | ``` 171 | 172 | We can create an index on the `id` column id: 173 | 174 | ```clojure 175 | (index-file! "file.txt" ".file.txt.id-idx" #(first (.split % "\t" 2))) 176 | ``` 177 | 178 | That index can then be used to read groups of records from the file with the same key values: 179 | 180 | ```clojure 181 | (record-blocks-via-index "file.txt" ".file.txt.id-idx") 182 | ``` 183 | 184 | 185 | ```clojure 186 | ( [ "1\tis is the second line" ] 187 | [ "2\tthis is a line" ] 188 | [ "3\tthis is another line" ] 189 | [ "99\tline with larger key" 190 | "99\tduplicated line for key" ] ) 191 | ``` 192 | 193 | ## Installation 194 | 195 | `clj-etl-utils` is available via Clojars: 196 | 197 | https://clojars.org/com.github.kyleburton/clj-etl-utils 198 | 199 | ## References 200 | 201 | UTF and BOM 202 | 203 | http://unicode.org/faq/utf\_bom.html 204 | 205 | ## Random Sampling 206 | 207 | [How to pick a random sample from a list](http://www.javamex.com/tutorials/random\_numbers/random\_sample.shtml) 208 | 209 | ## Reference Data 210 | 211 | ### US Zip5 Codes 212 | 213 | [Fun with Zip Codes](http://www.mattcutts.com/blog/fun-with-zip-codes/) 214 | 215 | [US Census Tigerline Data: Zip Codes](http://www.census.gov/tiger/tms/gazetteer/zips.txt) 216 | 217 | 218 | ## License 219 | 220 | This code is covered under the same as Clojure. 221 | 222 | # Authors 223 | 224 | Kyle Burton 225 | 226 | Paul Santa Clara 227 | 228 | Tim Visher 229 | -------------------------------------------------------------------------------- /src/clj_etl_utils/sequences.clj: -------------------------------------------------------------------------------- 1 | (ns 2 | ^{:doc "Sequences helpers and extension functions." 3 | :author "Kyle Burton"} 4 | clj-etl-utils.sequences) 5 | 6 | (defn make-stream-sampler 7 | "(make-stream-sampler rand-int) 8 | 9 | Creates a stream sampling function from a given random integer source (defaults to rand-int). 10 | The returned sampling function takes the following arguments: 11 | 12 | - population sequnce 13 | - total-population-size 14 | - remaining-samples-needed 15 | - update-notification-callback [optional] 16 | " 17 | 18 | ([] 19 | (make-stream-sampler rand-int)) 20 | ([rand-int-fn] 21 | (fn sampler-fn [[item & population :as population-seq] 22 | population-size 23 | remaining-samples-needed 24 | & [update-fn]] 25 | (if (or (zero? remaining-samples-needed) (empty? population-seq)) 26 | nil 27 | (if (< (rand-int-fn population-size) remaining-samples-needed) 28 | (do 29 | (when update-fn 30 | (update-fn)) 31 | (lazy-cat 32 | [item] 33 | (sampler-fn population 34 | (dec population-size) 35 | (dec remaining-samples-needed) 36 | update-fn))) 37 | (recur population 38 | (dec population-size) 39 | remaining-samples-needed 40 | update-fn)))))) 41 | 42 | ;; TODO: factor out the random, allow it to be passed in 43 | (def 44 | #^{:doc "random-sample-seq 45 | (population-seq population-size num-samples-needed & [update-fn]) 46 | 47 | Filters a sequence taking a random sample of the elements from the 48 | population sequence. The random sample will be evenly distributed 49 | over the given population-size. The sample will terminate when the 50 | sequence runs out or the requested sample size has been reached. NB: 51 | Given the probabalistic nature of the random sampling process the 52 | sample size may not been precisely met. If an update-fn is supplied, 53 | it will be invoked every time an element is selected by the random 54 | sampling process."} 55 | random-sample-seq (make-stream-sampler)) 56 | 57 | 58 | (comment 59 | 60 | (= 1 (count (random-sample-seq 61 | (take 10 (iterate inc 1)) 62 | 10 63 | 1 64 | (fn [] (printf "foof\n"))))) 65 | 66 | (let [x (fn thing [a b & [c]] 67 | [a b c])] 68 | (x 1 2 (fn [] 2))) 69 | 70 | (sort (apply concat (for [ii (range 0 10)] 71 | (random-sample-seq 72 | (take 100 (iterate inc 1)) 73 | 100 74 | 10)))) 75 | 76 | ) 77 | 78 | (defn make-reservoir-sampler 79 | "(make-reservoir-sampler reservoir-size 80 | (make-reservoir-sampler reservoir-size rand-int-fn) 81 | 82 | Returns a function that will take a reservoir sample from a sequence (see: https://en.wikipedia.org/wiki/Reservoir_sampling)." 83 | ([reservoir-size] 84 | (make-reservoir-sampler reservoir-size rand-int)) 85 | ([reservoir-size rand-int-fn] 86 | (fn reservoir-sampler [elements] 87 | ;; fill the reservoir w/the first reservoir-size elements 88 | (loop [elements elements 89 | reservoir [] 90 | ii 0] 91 | (cond 92 | ;; completed? 93 | (empty? elements) 94 | reservoir 95 | 96 | ;; need to fill the reservoir? 97 | (< (count reservoir) reservoir-size) 98 | (recur 99 | (rest elements) 100 | (conj reservoir (first elements)) 101 | (inc ii)) 102 | 103 | :attempt-sampling 104 | (let [jj (rand-int-fn (inc ii))] 105 | (if (< jj reservoir-size) 106 | (recur 107 | (rest elements) 108 | (assoc reservoir jj (first elements)) 109 | (inc ii)) 110 | (recur 111 | (rest elements) 112 | reservoir 113 | (inc ii))))))))) 114 | 115 | (defn reservoir-sample-seq [reservoir-size elements] 116 | ((make-reservoir-sampler reservoir-size) elements)) 117 | 118 | 119 | (comment 120 | 121 | ((make-reservoir-sampler 10) (range 10)) 122 | [0 1 2 3 4 5 6 7 8 9] 123 | 124 | ((make-reservoir-sampler 10) (range 100)) 125 | [53 83 73 70 91 78 49 7 52 9] 126 | 127 | ((make-reservoir-sampler 10) (range 10000)) 128 | [5388 3861 8622 9700 4658 5334 1517 8222 8591 6114] 129 | 130 | (let [sampler (make-reservoir-sampler 1)] 131 | (->> 132 | (range 99) 133 | (mapv (fn [trial] (sampler [1 2 3]))) 134 | (reduce (fn [acc choice] 135 | (assoc acc choice 136 | (inc (acc choice 0)))) 137 | {}))) 138 | 139 | 140 | 141 | 142 | ) 143 | 144 | 145 | 146 | ;; TODO: remove this, it is a re-implementation of partition-by which 147 | ;; is in the core in clojure 1.2 148 | (defn 149 | ^{:doc " 150 | (group-seq identity [1 1 2 3 4 5 5 5 6 1 1]) 151 | ;; => [[1 1] [2] [3] [4] [5 5 5] [6] [1 1]] 152 | " 153 | :added "1.0.0"} 154 | group-with [f s] 155 | 156 | (if (empty? s) 157 | nil 158 | (let [k (f (first s)) 159 | pred #(= k (f %)) 160 | [grp rst] (split-with pred s)] 161 | (lazy-cat 162 | [grp] 163 | (group-with f rst))))) 164 | 165 | (comment 166 | 167 | (group-with identity [1 1 2 3 3 4 4 4 5 6 7 8 9 9 9 9 9 9 9]) 168 | ([1 1] [2] [3 3] [4 4 4] [5] [6] [7] [8] [9 9 9 9 9 9 9]) 169 | 170 | (group-with 171 | (fn [#^String s] 172 | (.charAt s 0)) 173 | ["this" "that" "other" "othello" "flub" "flubber" "flugelhorn" "potatoe"]) 174 | 175 | ) 176 | 177 | (defn ^{:doc "Given a comparator function (-1, 0, 1) and a set of 178 | sequences, this function will return the minimal head value across all 179 | of the given sequences, and the set of sequences with the minimal 180 | value dropped from the sequence it was identified within." 181 | :added "1.0.0"} 182 | minval-from-seqs [cmpfn sequences] 183 | (let [sqs (sort #(cmpfn (first %1) (first %2)) (filter (complement empty?) sequences))] 184 | [(first (first sqs)) 185 | (filter (complement empty?) (conj (drop 1 sqs) (drop 1 (first sqs))))])) 186 | 187 | (comment 188 | 189 | (minval-from-seqs 190 | (fn [a b] 191 | (cond (< a b) -1 192 | (= a b) 0 193 | :else 1)) 194 | [[2 2 4 6 8 10 12 14 16 18]]) 195 | 196 | 197 | (minval-from-seqs 198 | (fn [a b] 199 | (cond (< a b) -1 200 | (= a b) 0 201 | :else 1)) 202 | [[2 2 4 6 8 10 12 14 16 18] 203 | [1 2 3 3 3 9 9 9 14 15 16 20 20 20] 204 | [-5 0 0 0 99 999]]) 205 | 206 | ) 207 | 208 | 209 | (defn ^{:doc "Given a comparator function and one or more sequences 210 | this function will merge them taking the next most minimal value 211 | from each of the given sequences. A good way to think about this 212 | is: if you have a set of already sorted sequences, this function 213 | will produce a merged, sorted sequence that combines the given 214 | sequences. 215 | 216 | Example: 217 | 218 | (merge-seqs 219 | (fn [a b] 220 | (cond (< a b) -1 221 | (= a b) 0 222 | :else 1)) 223 | [2 2 4 6 8 10 12 14 16 18] 224 | [1 2 3 3 3 9 9 9 14 15 16 20 20 20] 225 | [-5 0 0 0 99 999]) 226 | 227 | (-5 0 0 0 1 2 2 2 3 3 3 4 6 8 9 9 9 10 12 14 14 15 16 16 18 20 20 20 99 999) 228 | 229 | " 230 | :added "1.0.0"} 231 | merge-seqs [cmpfn & sequences] 232 | (if (or (empty? sequences) 233 | (every? empty? sequences)) 234 | nil 235 | (let [[minval rest-seqs] (minval-from-seqs cmpfn sequences)] 236 | (lazy-cat 237 | [minval] 238 | (apply merge-seqs cmpfn rest-seqs))))) 239 | 240 | (comment 241 | 242 | (merge-seqs 243 | (fn [a b] 244 | (cond (< a b) -1 245 | (= a b) 0 246 | :else 1)) 247 | [2 2 4 6 8 10 12 14 16 18] 248 | [1 2 3 3 3 9 9 9 14 15 16 20 20 20] 249 | [-5 0 0 0 99 999]) 250 | 251 | (-5 0 0 0 1 2 2 2 3 3 3 4 6 8 9 9 9 10 12 14 14 15 16 16 18 20 20 20 99 999) 252 | 253 | (merge-seqs 254 | (fn [a b] 255 | (cond (< a b) -1 256 | (= a b) 0 257 | :else 1)) 258 | [2 2 4 6 8 10 12 14 16 18] 259 | [1 2 3 3 3 9 9 9 14 15 16 20 20 20]) 260 | 261 | (merge-seqs 262 | (fn [a b] 263 | (cond (< a b) -1 264 | (= a b) 0 265 | :else 1)) 266 | [] 267 | [] 268 | [1]) 269 | 270 | 271 | ) 272 | 273 | (defn 274 | ^{:doc "Enumerates all pairs of items. 275 | 276 | " 277 | :added "1.0.0"} 278 | all-pairs [things] 279 | (for [this things] 280 | (for [that (remove #(= this %1) things)] 281 | [this that]))) 282 | 283 | (defn n-choose-2 [n] 284 | (apply + (range 1 n))) 285 | 286 | (comment 287 | 288 | (all-pairs [1 2 3]) 289 | 290 | ) 291 | 292 | 293 | (defn 294 | ^{:doc "Given a sequence of numeric values, results in a lazy 295 | sequence of the running averge of those values. 296 | (running-avg-seq [1 2 3 4 5 6 7 8 9 9 8 7 6 5 4 3 2 1]) 297 | => (1 3/2 2 5/2 3 7/2 4 9/2 5 27/5 62/11 23/4 298 | 75/13 40/7 28/5 87/16 89/17 5) 299 | 300 | (running-avg-seq [1.0 2 3 4 5 6 7 8 9 9 8 7 6 5 4 3 2 1]) 301 | => (1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.4 302 | 5.636363636363637 5.75 5.769230769230769 303 | 5.714285714285714 5.6 5.4375 5.235294117647059 304 | 5.0) 305 | " 306 | :added "1.0.20"} 307 | running-avg-seq [s] 308 | (letfn [(averager 309 | [s items-seen total] 310 | (if (empty? s) 311 | nil 312 | (lazy-cat 313 | [(/ (+ total (first s)) (inc items-seen))] 314 | (averager 315 | (drop 1 s) 316 | (inc items-seen) 317 | (+ total (first s))))))] 318 | (averager s 0 0))) 319 | 320 | (comment 321 | 322 | (running-avg-seq []) 323 | (running-avg-seq [1]) 324 | (running-avg-seq [1 2 3]) 325 | (running-avg-seq [1 2 3 4 5 6 7 8 9 9 8 7 6 5 4 3 2 1]) 326 | 327 | ) 328 | 329 | (defn 330 | ^{:doc "Given a numeric sequence, results in a lazy sequence 331 | of the average of the `n' elements from the sequence - averaging over 332 | a window of size `n'. 333 | 334 | (windowed-avg-seq 3 [1 1 1 1 2 2 2 2 2]) 335 | => (1 1 4/3 5/3 2 2) 336 | 337 | (map #(* 1.0 %) (windowed-avg-seq 3 [1 1 1 1 2 2 2 2 2])) 338 | => (1.0 1.0 1.333333333333333 1.666666666666667 2.0 2.0) 339 | 340 | " 341 | :added "1.0.20"} 342 | windowed-avg-seq [n s] 343 | (letfn [(averager 344 | [s buffer] 345 | (if (empty? s) 346 | nil 347 | (lazy-cat 348 | [(/ (apply + buffer) 349 | (count buffer))] 350 | (averager (drop 1 s) 351 | (concat (drop 1 buffer) 352 | [(first s)])))))] 353 | (averager (drop n s) 354 | (take n s)))) 355 | 356 | (comment 357 | (windowed-avg-seq 3 [1 1 1 1 2 2 2 2 2]) 358 | 359 | ) 360 | 361 | -------------------------------------------------------------------------------- /src/clj_etl_utils/landmark_parser.clj: -------------------------------------------------------------------------------- 1 | (ns ^{:doc "Semi-structured Text parsing library. The library uses an 2 | automation and a command set to extract portions of a document. 3 | Atomic commands are instructions such as: start, end 4 | forward-char, backward-char, forward-past, rewind-to and so on. 5 | The landmarks can be either literal or logical (regular 6 | expressions, 'types', etc). This extractor can often succeed in 7 | situations where there is mixed media, or inconsisten structure. 8 | It should be able to operate on a document as long as there are 9 | identifiable sets of landmarks." 10 | :author "Kyle Burton"} 11 | clj-etl-utils.landmark-parser 12 | (:import [java.util.regex Pattern Matcher]) 13 | (:use [clj-etl-utils.lang-utils :only (raise seq-like?)]) 14 | (:require 15 | [clj-etl-utils.regex :as regex] 16 | [clojure.tools.logging :as log])) 17 | 18 | 19 | (declare lp-commands) 20 | 21 | (defstruct parser :pos :doc :ldoc :doclen) 22 | 23 | (defn make-parser [#^String doc] 24 | (struct-map parser 25 | :pos (atom 0) 26 | :ldoc (.toLowerCase doc) 27 | :doclen (.length doc) 28 | :doc doc)) 29 | 30 | (defn forward-past [parser ^String landmark] 31 | (let [pos (.indexOf ^String (:ldoc parser) (.toLowerCase landmark) (int @(:pos parser)))] 32 | (if (= -1 pos) 33 | false 34 | (do 35 | (reset! (:pos parser) (+ pos (count landmark))) 36 | @(:pos parser))))) 37 | 38 | (defn forward-to [parser ^String landmark] 39 | (let [start (:post parser) 40 | pos (.indexOf ^String (:ldoc parser) ^String (.toLowerCase landmark) (int @(:pos parser)))] 41 | (if (= -1 pos) 42 | false 43 | (do 44 | (reset! (:pos parser) pos) 45 | @(:pos parser))))) 46 | 47 | (defn set-pos! [parser pos] 48 | (if (or (> pos (:doclen parser)) 49 | (< pos 0)) 50 | false 51 | (do 52 | (reset! (:pos parser) pos) 53 | true))) 54 | 55 | ;; TODO: remove the need to have 1 and only 1 parameter to each 56 | ;; command (see do-commands) 57 | (defn move-to-start [p & [_]] 58 | (reset! (:pos p) 0)) 59 | 60 | (defn move-to-end [p & [_]] 61 | (reset! (:pos p) 62 | (:doclen p))) 63 | 64 | (defn forward [parser cnt] 65 | (let [pos (+ cnt @(:pos parser))] 66 | (if (> pos (:doclen parser)) 67 | false 68 | (do 69 | (reset! (:pos parser) pos) 70 | true)))) 71 | 72 | (defn rewind [parser cnt] 73 | (let [pos (- @(:pos parser) cnt)] 74 | (if (< pos 0) 75 | false 76 | (do 77 | (reset! (:pos parser) pos) 78 | true)))) 79 | 80 | 81 | (defn rewind-to [p ^String landmark] 82 | (let [pos (.lastIndexOf ^String (:ldoc p) 83 | (.toLowerCase landmark) 84 | (int @(:pos p)))] 85 | (if (= -1 pos) 86 | false 87 | (do 88 | (reset! (:pos p) (+ pos (count landmark))) 89 | @(:pos p))))) 90 | 91 | (defn rewind-past [p ^String landmark] 92 | (let [pos (.lastIndexOf ^String (:ldoc p) 93 | (.toLowerCase landmark) 94 | (int @(:pos p)))] 95 | (if (= -1 pos) 96 | false 97 | (do 98 | (reset! (:pos p) pos) 99 | @(:pos p))))) 100 | 101 | ;; support either '((:fp "foo") (:fp "bar")) 102 | ;; or '(:fp "foo" :fp "bar") 103 | (defn parse-cmds [cmds] 104 | (cond (and (seq-like? cmds) 105 | (seq-like? (first cmds)) 106 | (= 2 (count (first cmds)))) 107 | cmds 108 | (= 1 (mod (count cmds) 2)) 109 | (raise (format "parse-cmds: error, odd number of commands (expected even, symbol/landmark): cmds=%s" cmds)) 110 | true 111 | (partition 2 cmds))) 112 | 113 | (defn apply-commands [parser & cmds] 114 | (loop [[[cmd & args] & cmds] (parse-cmds cmds)] 115 | (if cmd 116 | (do 117 | (if (apply (lp-commands cmd) (cons parser args)) 118 | (do 119 | (recur cmds)) 120 | false)) 121 | true))) 122 | 123 | (defn do-commands [parser cmds] 124 | (loop [[[cmd & args] & cmds] (parse-cmds cmds)] 125 | (if cmd 126 | (do 127 | (if (not (lp-commands cmd)) 128 | (raise "Error: invalid command: %s" cmd)) 129 | (if (apply (lp-commands cmd) (cons parser args)) 130 | (do 131 | (recur cmds)) 132 | false)) 133 | true))) 134 | 135 | (defn forward-past-regex 136 | "See also regex/common-regexes" 137 | [p regex] 138 | (log/infof "forward-past-regex regex=%s" regex) 139 | (let [^java.util.regex.Pattern pat (if (and (keyword? regex) (regex regex/common-regexes)) 140 | (regex regex/common-regexes) 141 | (Pattern/compile (str regex) (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) 142 | ^java.util.regex.Matcher m (.matcher pat (:doc p))] 143 | (log/infof "forward-past-regex: pat=%s m=%s" pat m) 144 | (if (.find m @(:pos p)) 145 | (do 146 | (log/infof "forward-past-regex: found reg:%s at:(%d,%d,)" regex (.start m) (.end m)) 147 | (reset! (:pos p) (.end m)) 148 | @(:pos p)) 149 | false))) 150 | 151 | (defn forward-to-regex [p regex] 152 | "See also regex/common-regexes" 153 | (let [pat (if (and (keyword? regex) (regex regex/common-regexes)) 154 | (regex regex/common-regexes) 155 | (Pattern/compile (str regex) (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) 156 | m ^java.util.regex.Matcher (.matcher ^java.util.regex.Pattern pat ^String (:doc p))] 157 | (log/infof "forward-to-regex: using pat=%s" pat) 158 | (if (.find m @(:pos p)) 159 | (do 160 | (reset! (:pos p) (.start m)) 161 | @(:pos p)) 162 | false))) 163 | 164 | 165 | (def lp-commands 166 | {:apply-commands apply-commands 167 | :a apply-commands 168 | :do-commands do-commands 169 | :d do-commands 170 | :forward forward 171 | :f forward 172 | :forward-past forward-past 173 | :fp forward-past 174 | :forward-past-regex forward-past-regex 175 | :fpr forward-past-regex 176 | :forward-to forward-to 177 | :ft forward-to 178 | :forward-to-regex forward-to-regex 179 | :ftr forward-to-regex 180 | :rewind rewind 181 | :r rewind 182 | :rewind-to rewind-to 183 | :rt rewind-to 184 | :rewind-past rewind-past 185 | :rp rewind-past 186 | :beginning move-to-start 187 | :b move-to-start 188 | :start move-to-start 189 | :s move-to-start 190 | :end move-to-end 191 | :e move-to-end}) 192 | 193 | 194 | (defn doc-substr [parser cnt] 195 | (.substring ^String (:doc parser) 196 | @(:pos parser) 197 | (+ @(:pos parser) 198 | cnt))) 199 | 200 | (defn extract [p start-cmds end-cmds] 201 | (let [orig-pos @(:pos p)] 202 | (if (do-commands p start-cmds) 203 | (let [spos @(:pos p)] 204 | (if (do-commands p end-cmds) 205 | (.substring ^String (:doc p) 206 | spos 207 | @(:pos p)) 208 | (do (set-pos! p orig-pos) 209 | false))) 210 | (do (set-pos! p orig-pos) 211 | false)))) 212 | 213 | (defn extract-from [html start-cmds end-cmds] 214 | (extract (make-parser html) start-cmds end-cmds)) 215 | 216 | 217 | (defn extract-all [p start-cmds end-cmds] 218 | (loop [res []] 219 | (if (do-commands p start-cmds) 220 | (let [spos @(:pos p)] 221 | (if (do-commands p end-cmds) 222 | (recur (conj res (.substring ^String (:doc p) spos @(:pos p)))) 223 | res)) 224 | res))) 225 | 226 | (defn extract-all-from [html start-cmds end-cmds] 227 | (extract-all (make-parser html) start-cmds end-cmds)) 228 | 229 | (defn table-rows [html] 230 | (extract-all-from html 231 | '(:ft "cells [html] 235 | (extract-all-from html 236 | '(:fp "") 237 | '(:ft ""))) 238 | 239 | (defn html->links [html] 240 | (extract-all-from html 241 | '(:fp "href=\"") 242 | '(:ft "\""))) 243 | 244 | (defn html->anchors [html] 245 | (extract-all-from html 246 | '(:ft ""))) 248 | 249 | (defn anchor->href [html] 250 | (first (regex/re-find-first #"href=\"([^\"]+)\"" html))) 251 | 252 | (defn anchor->body [html] 253 | (first (regex/re-find-first #">(.+?)" html))) 254 | 255 | (defn html-find-link-with-body [^String html ^String text] 256 | (first 257 | (regex/re-find-first 258 | #"href=\"([^\"]+)\"" 259 | (first 260 | (filter #(.contains ^String % text) 261 | (html->anchors html)))))) 262 | 263 | (defn html->tables [html] 264 | (extract-all-from html 265 | '(:ft ""))) 267 | 268 | (defn html-table->matrix [html] 269 | (map row->cells (table-rows html))) 270 | 271 | (defn html->form-blocks [html] 272 | (extract-all-from html 273 | '(:ft ""))) 275 | 276 | 277 | ;; ( def p (make-parser (com.github.kyleburton.sandbox.web/get->string "http://asymmetrical-view.com/"))) 278 | ;; (forward-past-regex p :num-real) 279 | ;; (forward-to-regex p #"\d{4}") 280 | 281 | ;; ( def pat (Pattern/compile (str #"\d{4}") (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) 282 | ;; ( def m (.matcher pat (:doc p))) 283 | 284 | 285 | ;; (html->links (com.github.kyleburton.sandbox.web/get->string "http://asymmetrical-view.com/")) 286 | 287 | (defn parse-input-element [html] 288 | {:tag :input 289 | :type (first (regex/re-find-first "(?-ims:type=\"([^\"]+)\")" html)) 290 | :name (first (regex/re-find-first "(?-ims:name=\"([^\"]+)\")" html)) 291 | :value (first (regex/re-find-first "(?-ims:value=\"([^\"]+)\")" html)) 292 | }) 293 | 294 | ;; This technique won't work reliably...need to implement :forward-to-first-of '(:ft "" "" 295 | ;; TODO: parse out textarea, button and select 296 | (defn parse-form-elements [html] 297 | (apply concat [(map parse-input-element (extract-all-from html '(:ft ""))) 298 | ;; (extract-all-from html '(:ft "")) 299 | ;; (extract-all-from html '(:ft "")) 300 | ;; (extract-all-from html '(:ft "")) 301 | ])) 302 | 303 | ;;(parse-form-elements (first (html->form-blocks com.github.kyleburton.sandbox.web/html))) 304 | 305 | (defn parse-form [html] 306 | {:method (or (first (regex/re-find-first "(?-ims:method=\"([^\"]+)\")" html)) 307 | "GET") 308 | :action (or (first (regex/re-find-first "(?-ims:action=\"([^\"]+)\")" html)) 309 | nil) 310 | :params (vec (parse-form-elements html)) 311 | }) 312 | -------------------------------------------------------------------------------- /resources/clj_etl_utils/ref_data/usps-abbreviations.tab: -------------------------------------------------------------------------------- 1 | PRIMARY_NAME COMMON_ABBREVIATION USPS_ABBREVIATION 2 | ALLEY ALLEE ALY 3 | ALLEY ALLEY ALY 4 | ALLEY ALLY ALY 5 | ALLEY ALY ALY 6 | ANNEX ANEX ANX 7 | ANNEX ANNEX ANX 8 | ANNEX ANNX ANX 9 | ANNEX ANX ANX 10 | ARCADE ARC ARC 11 | ARCADE ARCADE ARC 12 | AVENUE AV AVE 13 | AVENUE AVE AVE 14 | AVENUE AVEN AVE 15 | AVENUE AVENU AVE 16 | AVENUE AVENUE AVE 17 | AVENUE AVN AVE 18 | AVENUE AVNUE AVE 19 | BAYOO BAYOO BYU 20 | BAYOO BAYOU BYU 21 | BEACH BCH BCH 22 | BEACH BEACH BCH 23 | BEND BEND BND 24 | BEND BND BND 25 | BLUFF BLF BLF 26 | BLUFF BLUF BLF 27 | BLUFF BLUFF BLF 28 | BLUFFS BLUFFS BLFS 29 | BOTTOM BOT BTM 30 | BOTTOM BOTTM BTM 31 | BOTTOM BOTTOM BTM 32 | BOTTOM BTM BTM 33 | BOULEVARD BLVD BLVD 34 | BOULEVARD BOUL BLVD 35 | BOULEVARD BOULEVARD BLVD 36 | BOULEVARD BOULV BLVD 37 | BRANCH BR BR 38 | BRANCH BRANCH BR 39 | BRANCH BRNCH BR 40 | BRIDGE BRDGE BRG 41 | BRIDGE BRIDGE BRG 42 | BROOK BRK BRK 43 | BROOK BROOK BRK 44 | BROOKS BROOKS BRKS 45 | BURG BURG BG 46 | BURGS BURGS BGS 47 | BYPASS BYP BYP 48 | BYPASS BYPA BYP 49 | BYPASS BYPAS BYP 50 | BYPASS BYPASS BYP 51 | BYPASS BYPS BYP 52 | CAMP CAMP CP 53 | CAMP CMP CP 54 | CAMP CP CP 55 | CANYON CANYN CYN 56 | CANYON CANYON CYN 57 | CANYON CNYN CYN 58 | CANYON CYN CYN 59 | CAPE CAPE CPE 60 | CAPE CPE CPE 61 | CAUSEWAY CAUSEWAY CSWY 62 | CAUSEWAY CAUSWAY CSWY 63 | CAUSEWAY CSWY CSWY 64 | CENTER CEN CTR 65 | CENTER CENT CTR 66 | CENTER CENTER CTR 67 | CENTER CENTR CTR 68 | CENTER CENTRE CTR 69 | CENTER CNTER CTR 70 | CENTER CNTR CTR 71 | CENTER CTR CTR 72 | CENTERS CENTERS CTRS 73 | CIRCLE CIR CIR 74 | CIRCLE CIRC CIR 75 | CIRCLE CIRCL CIR 76 | CIRCLE CIRCLE CIR 77 | CIRCLE CRCL CIR 78 | CIRCLE CRCLE CIR 79 | CIRCLES CIRCLES CIRS 80 | CLIFF CLF CLF 81 | CLIFF CLIFF CLF 82 | CLIFFS CLFS CLFS 83 | CLIFFS CLIFFS CLFS 84 | CLUB CLB CLB 85 | CLUB CLUB CLB 86 | COMMON COMMON CMN 87 | CORNER COR COR 88 | CORNER CORNER COR 89 | CORNERS CORNERS CORS 90 | CORNERS CORS CORS 91 | COURSE COURSE CRSE 92 | COURSE CRSE CRSE 93 | COURT COURT CT 94 | COURT CRT CT 95 | COURT CT CT 96 | COURTS COURTS CTS 97 | COURTS CT CTS 98 | COVE COVE CV 99 | COVE CV CV 100 | COVES COVES CVS 101 | CREEK CK CRK 102 | CREEK CR CRK 103 | CREEK CREEK CRK 104 | CREEK CRK CRK 105 | CRESCENT CRECENT CRES 106 | CRESCENT CRES CRES 107 | CRESCENT CRESCENT CRES 108 | CRESCENT CRESENT CRES 109 | CRESCENT CRSCNT CRES 110 | CRESCENT CRSENT CRES 111 | CRESCENT CRSNT CRES 112 | CREST CREST CRST 113 | CROSSING CROSSING XING 114 | CROSSING CRSSING XING 115 | CROSSING CRSSNG XING 116 | CROSSING XING XING 117 | CROSSROAD CROSSROAD XRD 118 | CURVE CURVE CURV 119 | DALE DALE DL 120 | DALE DL DL 121 | DAM DAM DM 122 | DAM DM DM 123 | DIVIDE DIV DV 124 | DIVIDE DIVIDE DV 125 | DIVIDE DV DV 126 | DIVIDE DVD DV 127 | DRIVE DR DR 128 | DRIVE DRIV DR 129 | DRIVE DRIVE DR 130 | DRIVE DRV DR 131 | DRIVES DRIVES DRS 132 | ESTATE EST EST 133 | ESTATE ESTATE EST 134 | ESTATES ESTATES ESTS 135 | ESTATES ESTS ESTS 136 | EXPRESSWAY EXP EXPY 137 | EXPRESSWAY EXPR EXPY 138 | EXPRESSWAY EXPRESS EXPY 139 | EXPRESSWAY EXPRESSWAY EXPY 140 | EXPRESSWAY EXPW EXPY 141 | EXPRESSWAY EXPY EXPY 142 | EXTENSION EXT EXT 143 | EXTENSION EXTENSION EXT 144 | EXTENSION EXTN EXT 145 | EXTENSION EXTNSN EXT 146 | EXTENSIONS EXTENSIONS EXTS 147 | EXTENSIONS EXTS EXTS 148 | FALL FALL FALL 149 | FALLS FALLS FLS 150 | FALLS FLS FLS 151 | FERRY FERRY FRY 152 | FERRY FRRY FRY 153 | FERRY FRY FRY 154 | FIELD FIELD FLD 155 | FIELD FLD FLD 156 | FIELDS FIELDS FLDS 157 | FIELDS FLDS FLDS 158 | FLAT FLAT FLT 159 | FLAT FLT FLT 160 | FLATS FLATS FLTS 161 | FLATS FLTS FLTS 162 | FORD FORD FRD 163 | FORD FRD FRD 164 | FORDS FORDS FRDS 165 | FOREST FOREST FRST 166 | FOREST FORESTS FRST 167 | FOREST FRST FRST 168 | FORGE FORG FRG 169 | FORGE FORGE FRG 170 | FORGE FRG FRG 171 | FORGES FORGES FRGS 172 | FORK FORK FRK 173 | FORK FRK FRK 174 | FORKS FORKS FRKS 175 | FORKS FRKS FRKS 176 | FORT FORT FT 177 | FORT FRT FT 178 | FORT FT FT 179 | FREEWAY FREEWAY FWY 180 | FREEWAY FREEWY FWY 181 | FREEWAY FRWAY FWY 182 | FREEWAY FRWY FWY 183 | FREEWAY FWY FWY 184 | GARDEN GARDEN GDN 185 | GARDEN GARDN GDN 186 | GARDEN GDN GDN 187 | GARDEN GRDEN GDN 188 | GARDEN GRDN GDN 189 | GARDENS GARDENS GDNS 190 | GARDENS GDNS GDNS 191 | GARDENS GRDNS GDNS 192 | GATEWAY GATEWAY GTWY 193 | GATEWAY GATEWY GTWY 194 | GATEWAY GATWAY GTWY 195 | GATEWAY GTWAY GTWY 196 | GATEWAY GTWY GTWY 197 | GLEN GLEN GLN 198 | GLEN GLN GLN 199 | GLENS GLENS GLNS 200 | GREEN GREEN GRN 201 | GREEN GRN GRN 202 | GREENS GREENS GRNS 203 | GROVE GROV GRV 204 | GROVE GROVE GRV 205 | GROVE GRV GRV 206 | GROVES GROVES GRVS 207 | HARBOR HARB HBR 208 | HARBOR HARBOR HBR 209 | HARBOR HARBR HBR 210 | HARBOR HBR HBR 211 | HARBOR HRBOR HBR 212 | HARBORS HARBORS HBRS 213 | HAVEN HAVEN HVN 214 | HAVEN HAVN HVN 215 | HAVEN HVN HVN 216 | HEIGHTS HEIGHT HTS 217 | HEIGHTS HEIGHTS HTS 218 | HEIGHTS HGTS HTS 219 | HEIGHTS HT HTS 220 | HEIGHTS HTS HTS 221 | HIGHWAY HIGHWAY HWY 222 | HIGHWAY HIGHWY HWY 223 | HIGHWAY HIWAY HWY 224 | HIGHWAY HIWY HWY 225 | HIGHWAY HWAY HWY 226 | HIGHWAY HWY HWY 227 | HILL HILL HL 228 | HILL HL HL 229 | HILLS HILLS HLS 230 | HILLS HLS HLS 231 | HOLLOW HLLW HOLW 232 | HOLLOW HOLLOW HOLW 233 | HOLLOW HOLLOWS HOLW 234 | HOLLOW HOLW HOLW 235 | HOLLOW HOLWS HOLW 236 | INLET INLET INLT 237 | INLET INLT INLT 238 | ISLAND IS IS 239 | ISLAND ISLAND IS 240 | ISLAND ISLND IS 241 | ISLANDS ISLANDS ISS 242 | ISLANDS ISLNDS ISS 243 | ISLANDS ISS ISS 244 | ISLE ISLE ISLE 245 | ISLE ISLES ISLE 246 | JUNCTION JCT JCT 247 | JUNCTION JCTION JCT 248 | JUNCTION JCTN JCT 249 | JUNCTION JUNCTION JCT 250 | JUNCTION JUNCTN JCT 251 | JUNCTION JUNCTON JCT 252 | JUNCTIONS JCTNS JCTS 253 | JUNCTIONS JCTS JCTS 254 | JUNCTIONS JUNCTIONS JCTS 255 | KEY KEY KY 256 | KEY KY KY 257 | KEYS KEYS KYS 258 | KEYS KYS KYS 259 | KNOLL KNL KNL 260 | KNOLL KNOL KNL 261 | KNOLL KNOLL KNL 262 | KNOLLS KNLS KNLS 263 | KNOLLS KNOLLS KNLS 264 | LAKE LAKE LK 265 | LAKE LK LK 266 | LAKES LAKES LKS 267 | LAKES LKS LKS 268 | LAND LAND LAND 269 | LANDING LANDING LNDG 270 | LANDING LNDG LNDG 271 | LANDING LNDNG LNDG 272 | LANE LA LN 273 | LANE LANE LN 274 | LANE LANES LN 275 | LANE LN LN 276 | LIGHT LGT LGT 277 | LIGHT LIGHT LGT 278 | LIGHTS LIGHTS LGTS 279 | LOAF LF LF 280 | LOAF LOAF LF 281 | LOCK LCK LCK 282 | LOCK LOCK LCK 283 | LOCKS LCKS LCKS 284 | LOCKS LOCKS LCKS 285 | LODGE LDG LDG 286 | LODGE LDGE LDG 287 | LODGE LODG LDG 288 | LODGE LODGE LDG 289 | LOOP LOOP LOOP 290 | LOOP LOOPS LOOP 291 | MALL MALL MALL 292 | MANOR MANOR MNR 293 | MANOR MNR MNR 294 | MANORS MANORS MNRS 295 | MANORS MNRS MNRS 296 | MEADOW MDW MDW 297 | MEADOW MEADOW MDW 298 | MEADOWS MDWS MDWS 299 | MEADOWS MEADOWS MDWS 300 | MEADOWS MEDOWS MDWS 301 | MEWS MEWS MEWS 302 | MILL MILL ML 303 | MILL ML ML 304 | MILLS MILLS MLS 305 | MILLS MLS MLS 306 | MISSION MISSION MSN 307 | MISSION MISSN MSN 308 | MISSION MSN MSN 309 | MISSION MSSN MSN 310 | MOTORWAY MOTORWAY MTWY 311 | MOUNT MNT MT 312 | MOUNT MOUNT MT 313 | MOUNT MT MT 314 | MOUNTAIN MNTAIN MTN 315 | MOUNTAIN MNTN MTN 316 | MOUNTAIN MOUNTAIN MTN 317 | MOUNTAIN MOUNTIN MTN 318 | MOUNTAIN MTIN MTN 319 | MOUNTAIN MTN MTN 320 | MOUNTAINS MNTNS MTNS 321 | MOUNTAINS MOUNTAINS MTNS 322 | NECK NCK NCK 323 | NECK NECK NCK 324 | ORCHARD ORCH ORCH 325 | ORCHARD ORCHARD ORCH 326 | ORCHARD ORCHRD ORCH 327 | OVAL OVAL OVAL 328 | OVAL OVL OVAL 329 | OVERPASS OVERPASS OPAS 330 | PARK PARK PARK 331 | PARK PK PARK 332 | PARK PRK PARK 333 | PARKS PARKS PARK 334 | PARKWAY PARKWAY PKWY 335 | PARKWAY PARKWY PKWY 336 | PARKWAY PKWAY PKWY 337 | PARKWAY PKWY PKWY 338 | PARKWAY PKY PKWY 339 | PARKWAYS PARKWAYS PKWY 340 | PARKWAYS PKWYS PKWY 341 | PASS PASS PASS 342 | PASSAGE PASSAGE PSGE 343 | PATH PATH PATH 344 | PATH PATHS PATH 345 | PIKE PIKE PIKE 346 | PIKE PIKES PIKE 347 | PINE PINE PNE 348 | PINES PINES PNES 349 | PINES PNES PNES 350 | PLACE PL PL 351 | PLACE PLACE PL 352 | PLAIN PLAIN PLN 353 | PLAIN PLN PLN 354 | PLAINS PLAINES PLNS 355 | PLAINS PLAINS PLNS 356 | PLAINS PLNS PLNS 357 | PLAZA PLAZA PLZ 358 | PLAZA PLZ PLZ 359 | PLAZA PLZA PLZ 360 | POINT POINT PT 361 | POINT PT PT 362 | POINTS POINTS PTS 363 | POINTS PTS PTS 364 | PORT PORT PRT 365 | PORT PRT PRT 366 | PORTS PORTS PRTS 367 | PORTS PRTS PRTS 368 | PRAIRIE PR PR 369 | PRAIRIE PRAIRIE PR 370 | PRAIRIE PRARIE PR 371 | PRAIRIE PRR PR 372 | RADIAL RAD RADL 373 | RADIAL RADIAL RADL 374 | RADIAL RADIEL RADL 375 | RADIAL RADL RADL 376 | RAMP RAMP RAMP 377 | RANCH RANCH RNCH 378 | RANCH RANCHES RNCH 379 | RANCH RNCH RNCH 380 | RANCH RNCHS RNCH 381 | RAPID RAPID RPD 382 | RAPID RPD RPD 383 | RAPIDS RAPIDS RPDS 384 | RAPIDS RPDS RPDS 385 | REST REST RST 386 | REST RST RST 387 | RIDGE RDG RDG 388 | RIDGE RDGE RDG 389 | RIDGE RIDGE RDG 390 | RIDGES RDGS RDGS 391 | RIDGES RIDGES RDGS 392 | RIVER RIV RIV 393 | RIVER RIVER RIV 394 | RIVER RIVR RIV 395 | RIVER RVR RIV 396 | ROAD RD RD 397 | ROAD ROAD RD 398 | ROADS RDS RDS 399 | ROADS ROADS RDS 400 | ROUTE ROUTE RTE 401 | ROW ROW ROW 402 | RUE RUE RUE 403 | RUN RUN RUN 404 | SHOAL SHL SHL 405 | SHOAL SHOAL SHL 406 | SHOALS SHLS SHLS 407 | SHOALS SHOALS SHLS 408 | SHORE SHOAR SHR 409 | SHORE SHORE SHR 410 | SHORE SHR SHR 411 | SHORES SHOARS SHRS 412 | SHORES SHORES SHRS 413 | SHORES SHRS SHRS 414 | SKYWAY SKYWAY SKWY 415 | SPRING SPG SPG 416 | SPRING SPNG SPG 417 | SPRING SPRING SPG 418 | SPRING SPRNG SPG 419 | SPRINGS SPGS SPGS 420 | SPRINGS SPNGS SPGS 421 | SPRINGS SPRINGS SPGS 422 | SPRINGS SPRNGS SPGS 423 | SPUR SPUR SPUR 424 | SPURS SPURS SPUR 425 | SQUARE SQ SQ 426 | SQUARE SQR SQ 427 | SQUARE SQRE SQ 428 | SQUARE SQU SQ 429 | SQUARE SQUARE SQ 430 | SQUARES SQRS SQS 431 | SQUARES SQUARES SQS 432 | STATION STA STA 433 | STATION STATION STA 434 | STATION STATN STA 435 | STATION STN STA 436 | STRAVENUE STRA STRA 437 | STRAVENUE STRAV STRA 438 | STRAVENUE STRAVE STRA 439 | STRAVENUE STRAVEN STRA 440 | STRAVENUE STRAVENUE STRA 441 | STRAVENUE STRAVN STRA 442 | STRAVENUE STRVN STRA 443 | STRAVENUE STRVNUE STRA 444 | STREAM STREAM STRM 445 | STREAM STREME STRM 446 | STREAM STRM STRM 447 | STREET ST ST 448 | STREET STR ST 449 | STREET STREET ST 450 | STREET STRT ST 451 | STREETS STREETS STS 452 | SUMMIT SMT SMT 453 | SUMMIT SUMIT SMT 454 | SUMMIT SUMITT SMT 455 | SUMMIT SUMMIT SMT 456 | TERRACE TER TER 457 | TERRACE TERR TER 458 | TERRACE TERRACE TER 459 | THROUGHWAY THROUGHWAY TRWY 460 | TRACE TRACE TRCE 461 | TRACE TRACES TRCE 462 | TRACE TRCE TRCE 463 | TRACK TRACK TRAK 464 | TRACK TRACKS TRAK 465 | TRACK TRAK TRAK 466 | TRACK TRK TRAK 467 | TRACK TRKS TRAK 468 | TRAFFICWAY TRAFFICWAY TRFY 469 | TRAFFICWAY TRFY TRFY 470 | TRAIL TR TRL 471 | TRAIL TRAIL TRL 472 | TRAIL TRAILS TRL 473 | TRAIL TRL TRL 474 | TRAIL TRLS TRL 475 | TUNNEL TUNEL TUNL 476 | TUNNEL TUNL TUNL 477 | TUNNEL TUNLS TUNL 478 | TUNNEL TUNNEL TUNL 479 | TUNNEL TUNNELS TUNL 480 | TURNPIKE TPKE TPKE 481 | TURNPIKE TRNPK TPKE 482 | TURNPIKE TRPK TPKE 483 | TURNPIKE TURNPIKE TPKE 484 | TURNPIKE TURNPK TPKE 485 | UNDERPASS UNDERPASS UPAS 486 | UNION UN UN 487 | UNION UNION UN 488 | UNIONS UNIONS UNS 489 | VALLEY VALLEY VLY 490 | VALLEY VALLY VLY 491 | VALLEY VLLY VLY 492 | VALLEY VLY VLY 493 | VALLEYS VALLEYS VLYS 494 | VALLEYS VLYS VLYS 495 | VIADUCT VDCT VIA 496 | VIADUCT VIA VIA 497 | VIADUCT VIADCT VIA 498 | VIADUCT VIADUCT VIA 499 | VIEW VIEW VW 500 | VIEW VW VW 501 | VIEWS VIEWS VWS 502 | VIEWS VWS VWS 503 | VILLAGE VILL VLG 504 | VILLAGE VILLAG VLG 505 | VILLAGE VILLAGE VLG 506 | VILLAGE VILLG VLG 507 | VILLAGE VILLIAGE VLG 508 | VILLAGE VLG VLG 509 | VILLAGES VILLAGES VLGS 510 | VILLAGES VLGS VLGS 511 | VILLE VILLE VL 512 | VILLE VL VL 513 | VISTA VISTA VIS 514 | VISTA VST VIS 515 | VISTA VSTA VIS 516 | WALK WALK WALK 517 | WALKS WALKS WALK 518 | WALL WALL WALL 519 | WAY WAY WAY 520 | WAY WY WAY 521 | WAYS WAYS WAYS 522 | WELL WELL WL 523 | WELLS WELLS WLS 524 | WELLS WLS WLS 525 | -------------------------------------------------------------------------------- /src/clj_etl_utils/lang_utils.clj: -------------------------------------------------------------------------------- 1 | (ns 2 | ^{:doc "Core, shared utility functions that aid in development with Clojure, or with development on the JVM." 3 | :author "Kyle Burton"} 4 | clj-etl-utils.lang-utils 5 | (:require 6 | clojure.set) 7 | (:import [org.apache.commons.io IOUtils] 8 | [java.net InetAddress])) 9 | 10 | (defn- raise-dispatch-fn [& [fst snd thrd & rst]] 11 | (cond 12 | ;; (and (isa? (class fst) Exception) 13 | ;; (isa? (class snd) Class) 14 | ;; (isa? (class thrd) String)) 15 | ;; [:type-to-throw :caused-by :fmt-and-args] 16 | 17 | ;; (and (isa? (class fst) Class) 18 | ;; (isa? (class snd) Exception) 19 | ;; (isa? (class thrd) String)) 20 | ;; [:caused-by :type-to-throw :fmt-and-args] 21 | 22 | (and 23 | (isa? (class fst) Throwable) 24 | (isa? (class snd) String) 25 | thrd) 26 | [:caused-by :fmt-and-args] 27 | 28 | (and 29 | (isa? (class fst) Throwable) 30 | (isa? (class snd) String) 31 | (not thrd)) 32 | [:caused-by :msg] 33 | 34 | (and 35 | (isa? (class fst) String) 36 | snd) 37 | [:fmt-and-args] 38 | 39 | :else 40 | :default)) 41 | 42 | 43 | (defmulti raise raise-dispatch-fn 44 | #_(fn [& [fst snd thrd & rst]] 45 | (cond 46 | ;; (and (isa? (class fst) Exception) 47 | ;; (isa? (class snd) Class) 48 | ;; (isa? (class thrd) String)) 49 | ;; [:type-to-throw :caused-by :fmt-and-args] 50 | 51 | ;; (and (isa? (class fst) Class) 52 | ;; (isa? (class snd) Exception) 53 | ;; (isa? (class thrd) String)) 54 | ;; [:caused-by :type-to-throw :fmt-and-args] 55 | 56 | (and 57 | (isa? (class fst) Throwable) 58 | (isa? (class snd) String) 59 | thrd) 60 | [:caused-by :fmt-and-args] 61 | 62 | (and 63 | (isa? (class fst) Throwable) 64 | (isa? (class snd) String) 65 | (not thrd)) 66 | [:caused-by :msg] 67 | 68 | (and 69 | (isa? (class fst) String) 70 | snd) 71 | [:fmt-and-args] 72 | 73 | :else 74 | :default))) 75 | 76 | 77 | (defmethod raise 78 | [:caused-by :fmt-and-args] 79 | [#^Throwable caused-by #^String fmt & args] 80 | (throw (RuntimeException. (apply format fmt args) caused-by))) 81 | 82 | (defmethod raise 83 | [:caused-by :msg] 84 | [#^Throwable caused-by #^String msg] 85 | (throw (RuntimeException. msg caused-by))) 86 | 87 | (defmethod raise 88 | [:fmt-and-args] 89 | [#^String fmt & args] 90 | (throw (RuntimeException. ^String (apply format fmt args)))) 91 | 92 | (defmethod raise 93 | :default 94 | [& stuff] 95 | (throw (RuntimeException. ^String (apply str stuff)))) 96 | 97 | (defn seq-like? [thing] 98 | (or (seq? thing) 99 | (vector? thing))) 100 | 101 | (defn resource-as-stream [res-url] 102 | (.getResourceAsStream (.getClass ^Object *ns*) res-url)) 103 | 104 | (defn resource-as-string [res-url] 105 | (let [^java.io.InputStream strm (resource-as-stream res-url)] 106 | (if (not strm) 107 | nil 108 | (with-open [istr strm] 109 | (IOUtils/toString istr))))) 110 | 111 | (defn rest-params->map [params] 112 | (reduce 113 | (fn [m pair] 114 | (apply assoc m pair)) 115 | {} 116 | (partition 2 params))) 117 | 118 | ;; (rest-params->map [:follow-redirects true :basic-auth {:user "bob" :pass "sekret"}]) 119 | 120 | (defn valid-keys? [params key-lists] 121 | (some 122 | (fn [key-list] 123 | (= (set key-list) 124 | (set (keys params)))) 125 | key-lists)) 126 | 127 | (defn valid-request-opts? [params required-keys allowable-keys] 128 | (let [param-keyset (set (keys params))] 129 | (and 130 | (empty? (clojure.set/difference param-keyset allowable-keys)) 131 | (empty? (clojure.set/difference required-keys param-keyset))))) 132 | 133 | 134 | (defn assert-allowed-keys! [m allowed-keys] 135 | (let [allowed-keys (apply hash-set allowed-keys)] 136 | (doseq [k (keys m)] 137 | (if (not (allowed-keys k)) 138 | (raise "Error: disallowed key: %s not in %s" k allowed-keys))))) 139 | 140 | ;; (assert-allowed-keys! {:a 1 :b 2} [:a :b]) 141 | ;; (assert-allowed-keys! {:a 1 :b 2 :c 3} [:a :b]) 142 | 143 | (defn make-periodic-invoker 144 | "Takes a count `count' and a function `f'. Returns a function 145 | that takes an optional 'action' and number of arguments. After 146 | `count' invocations it will invoke the originally supplied function 147 | `f'. `f' will be invoked with the current 'count' value and will be 148 | passed any arguments passed into the returned function. 149 | 150 | Useful for 'long' running processes where you would like to 151 | periodically see a progress update. For example: 152 | 153 | (let [progress-bar (make-periodic-invoker 10000 (fn [count chr] (.print System/err chr))] 154 | (doseq [line (line-seq (clojure.java.io/reader \"some/file\"))] 155 | (progress-bar \".\") 156 | (process-line line))) 157 | 158 | Actions: actions are used to interact with the returned function. The 159 | following actions are supported: 160 | 161 | :final or :invoke 162 | Causes the wrapped function `f' to be invoked immediately. Does not 163 | modify the value of the counter. 164 | 165 | :state 166 | Returns the current value of the counter. 167 | 168 | :reset 169 | Sets the counter back to zero. Allows the periodic function to be re-used. 170 | 171 | :set 172 | Set the counter to the supplied value. 173 | 174 | " 175 | [count f] 176 | (let [ctr (java.util.concurrent.atomic.AtomicLong.)] 177 | (fn [& args] 178 | (let [action (first args)] 179 | (cond 180 | (or (= :final action) 181 | (= :invoke action)) 182 | (apply f action (.get ctr) (rest args)) 183 | 184 | (= :state action) 185 | (.get ctr) 186 | 187 | (= :reset action) 188 | (.set ctr 0) 189 | 190 | (= :set action) 191 | (.set ctr (second args)) 192 | 193 | :else 194 | (let [nextval (.incrementAndGet ctr)] 195 | (if (= 0 (mod nextval count)) 196 | (apply f action nextval args)))))))) 197 | 198 | 199 | 200 | 201 | (defmacro prog1 [res & body] 202 | `(let [res# ~res] 203 | ~@body 204 | res#)) 205 | 206 | (defmacro aprog1 [res & body] 207 | `(let [~'it ~res] 208 | ~@body 209 | ~'it)) 210 | 211 | (defmacro prog2 [fst res & body] 212 | `(do 213 | ~fst 214 | (let [res# ~res] 215 | ~@body 216 | res#))) 217 | 218 | 219 | (defmacro aprog2 [fst res & body] 220 | `(do 221 | ~fst 222 | (let [~'it ~res] 223 | ~@body 224 | ~'it))) 225 | 226 | (defmacro aprogn [res & body] 227 | `(let [~'it ~res] 228 | ~@body)) 229 | 230 | 231 | (defmacro with-hit-timer [[sym-name block-size] & body] 232 | `(let [start-time# (- (.getTime (java.util.Date.)) 1.0) 233 | ~sym-name (clj-etl-utils.lang-utils/make-periodic-invoker 234 | ~block-size 235 | (fn [action# val# & args#] 236 | (let [elapsed# (- (.getTime (java.util.Date.)) 237 | start-time#) 238 | elapsed-secs# (/ elapsed# 1000.0) 239 | rate# (/ val# elapsed-secs#)] 240 | (if (= action# :final) 241 | (printf "COMPLETED: %d in %ss @ %s/s\n" val# elapsed-secs# rate#) 242 | (printf "%d in %ss @ %s/s\n" val# elapsed-secs# rate#)))))] 243 | (prog1 244 | (do ~@body) 245 | (~sym-name :final)))) 246 | 247 | 248 | ;; NB: this now exists in clojure.core as some-> 249 | (defmacro ..? 250 | ([x form] `(if (nil? ~x) nil (. ~x ~form))) 251 | ([x form & more] `(..? (if (nil? ~x) nil (. ~x ~form)) ~@more))) 252 | 253 | (defn array? [^Object thing] 254 | (..? thing (getClass) (isArray))) 255 | 256 | 257 | (defn iterable? [^Object thing] 258 | (or 259 | (seq? thing) 260 | (..? thing (getClass) (isArray)) 261 | (isa? (class thing) java.lang.Iterable) 262 | (isa? (class thing) java.util.List))) 263 | 264 | (def rec-bean 265 | (let [primitive? #{Class 266 | String 267 | clojure.lang.Keyword 268 | clojure.lang.Symbol 269 | Number 270 | Integer 271 | Long 272 | Double 273 | Float 274 | java.util.Map 275 | clojure.lang.IFn}] 276 | (fn rec-bean [thing] 277 | (if (or (nil? thing) 278 | (seq? thing) 279 | (primitive? (class thing))) 280 | thing 281 | (let [bn (dissoc (bean thing) :class)] 282 | (reduce (fn [res k] 283 | (assoc res k 284 | (if (iterable? (get bn k)) 285 | (vec (map rec-bean (get bn k))) 286 | (rec-bean (get bn k))))) 287 | {} 288 | (keys bn))))))) 289 | 290 | (defn get-stack-trace [^Throwable ex] 291 | (format "Exception Message: %s, Stack Trace: %s" 292 | (.getMessage ex) 293 | (with-out-str 294 | (.printStackTrace 295 | ex 296 | (java.io.PrintWriter. *out*))))) 297 | 298 | (defn caused-by-seq [^Throwable th] 299 | (loop [res [] 300 | next th] 301 | (if next 302 | (recur (conj res next) 303 | (.getCause next)) 304 | res))) 305 | 306 | (defmacro restructure-map [& vars] 307 | (reduce (fn [accum var] 308 | (assoc accum (keyword var) var)) 309 | {} 310 | vars)) 311 | 312 | 313 | (defmacro defn! [fn-name arg-spec & body] 314 | `(defn ~fn-name ~arg-spec 315 | ~@(map 316 | (fn [arg] 317 | `(if-not (isa? ~(:tag (meta arg)) (class ~arg)) 318 | (raise "Error: type-mismatch: expected:'%s' to be a '%s', it was a '%s'" 319 | '~arg ~(:tag (meta arg)) (class ~arg)))) 320 | arg-spec) 321 | ~@body)) 322 | 323 | 324 | (defn hostname [] 325 | (-> (java.net.InetAddress/getLocalHost) (.getHostName))) 326 | 327 | (defn string->int? [s] 328 | (try 329 | (Integer/parseInt s) 330 | (catch Exception e 331 | nil))) 332 | 333 | (defn string->long? [s] 334 | (try 335 | (Long/parseLong s) 336 | (catch Exception e 337 | nil))) 338 | 339 | 340 | ;; SRFI-??s cut macro 341 | (defn- cutpoint? [thing] 342 | (let [sthing (str thing)] 343 | (if (or (= '<> thing) 344 | (and (.startsWith sthing "<") 345 | (.endsWith sthing ">"))) 346 | (let [pfx (.replaceAll sthing "[<>]" "")] 347 | (if (empty? pfx) 348 | (gensym) 349 | (gensym (str pfx "-")))) 350 | nil))) 351 | 352 | (defmacro cut [f & arg-sig] 353 | (let [gsyms (map cutpoint? (filter cutpoint? arg-sig)) 354 | arg-spec (loop [[arg & args] arg-sig 355 | gsyms gsyms 356 | res []] 357 | (cond 358 | (not arg) 359 | res 360 | (cutpoint? arg) 361 | (recur args 362 | (rest gsyms) 363 | (conj res (first gsyms))) 364 | :else 365 | (recur args 366 | gsyms 367 | (conj res arg))))] 368 | `(fn [~@gsyms] 369 | (~f ~@arg-spec)))) 370 | 371 | 372 | (comment 373 | 374 | (cut format "this") 375 | 376 | ;; shall we make it look like ascii scissors? 377 | (defmacro %< [f & stuff] 378 | `(cut ~f ~@stuff)) 379 | 380 | (%< format "this") 381 | 382 | ) 383 | 384 | (defmacro assoc-if [test m & kvs] 385 | `(if ~test 386 | (assoc ~m ~@kvs) 387 | ~m)) 388 | 389 | (defmacro nth-let [[rec & bindings] & body] 390 | (let [rec-gensym `rec#] 391 | `(let [~rec-gensym ~rec 392 | ~@(vec (mapcat 393 | (fn [[sym idx]] 394 | [sym `(nth ~rec-gensym ~idx)]) (partition 2 bindings)))] 395 | ~@body))) 396 | 397 | (defmacro assoc-in-if [test m ks v] 398 | `(if ~test 399 | (assoc-in ~m ~ks ~v) 400 | ~m)) 401 | 402 | (defn resource->file-path [^String resource] 403 | (let [cl (.getClassLoader (.getClass *ns*)) 404 | url (.getResource ^ClassLoader cl ^String resource)] 405 | (.getFile ^java.net.URL url))) 406 | 407 | (comment 408 | 409 | (resource->file-path "xsd/iso-2-country-codes.xsd") 410 | (resource->file-path "xsd/iso-3-country-codes.xsd") 411 | 412 | ) 413 | -------------------------------------------------------------------------------- /src/clj_etl_utils/text.clj: -------------------------------------------------------------------------------- 1 | (ns 2 | ^{:doc "Text manipulation utilities." 3 | :author "Kyle Burton"} 4 | clj-etl-utils.text 5 | (:use [clj-etl-utils.lang-utils :only [raise]]) 6 | (:require [clojure.string :as str-utils]) 7 | (:import [org.apache.commons.lang WordUtils] 8 | [java.text NumberFormat DecimalFormat] 9 | [org.apache.commons.codec.binary Base64])) 10 | 11 | (defn 12 | ^{:doc "Convert string to upper case, null safe (returns empty string on null)."} 13 | uc [^String s] 14 | (if (nil? s) 15 | "" 16 | (.toUpperCase s))) 17 | 18 | (defn 19 | ^{:doc "Convert string to lower case, null safe (returns empty string on null)."} 20 | lc [^String s] 21 | (if (nil? s) 22 | "" 23 | (.toLowerCase s))) 24 | 25 | (defmacro 26 | ^{:doc "Binds a temporary file to the symbol indicated by var (java.io.File/createTempFile). 27 | prefix and suffix default to \"pfx\" and \"sfx\" respectively. Note that this macro does not 28 | create or clean up the actual temporary file itself. 29 | "} 30 | with-tmp-file [[var & [prefix suffix]] & body] 31 | `(let [prefix# ~prefix 32 | suffix# ~suffix 33 | ~var (java.io.File/createTempFile (or prefix# "pfx") (or suffix# "sfx"))] 34 | ~@body)) 35 | 36 | (defn 37 | ^{:doc "Compute the MD5 sum of a byte buffer, returning it as a hex-encoded string."} 38 | md5->string [^bytes bytes] 39 | (let [digester (java.security.MessageDigest/getInstance "MD5")] 40 | (.update digester bytes) 41 | (.toString 42 | (java.math.BigInteger. 1 (.digest digester)) 43 | 16))) 44 | 45 | (defn 46 | ^{:doc "Compute the SHA1 sum of a byte buffer, returning it as a hex-encoded string."} 47 | sha1->string [^bytes bytes] 48 | (let [digester (java.security.MessageDigest/getInstance "SHA1")] 49 | (.update digester bytes) 50 | (.toString 51 | (java.math.BigInteger. 1 (.digest digester)) 52 | 16))) 53 | 54 | (defn 55 | ^{:doc "Returns a sequence of all the security providers available in the current JVM. 56 | The sequence consists of pairs of [provider-type provider-algorithm]"} 57 | security-providers-type-algorithm-seq [] 58 | (mapcat (fn [provider] 59 | (map (fn [^java.security.Provider$Service svc] 60 | [(.getType svc) (.getAlgorithm svc)]) 61 | (.getServices ^java.security.Provider provider))) 62 | (java.security.Security/getProviders))) 63 | 64 | (defn 65 | ^{:doc "Returns a seq of all of the provider types available in the current JVM."} 66 | security-providers-types [] 67 | (vec (set (map first (security-providers-type-algorithm-seq))))) 68 | 69 | (defn 70 | ^{:doc "Filters security-providers-type-algorithm-seq for those that match the given type. 71 | (security-providers-for-type \"MessageDigest\") 72 | "} 73 | security-providers-for-type [type] 74 | (filter #(= (first %) type) 75 | (security-providers-type-algorithm-seq))) 76 | 77 | (defn 78 | ^{:doc "Sequence of all the MessageDigest providers available in the current JVM."} 79 | message-digest-algorithms [] 80 | (security-providers-for-type "MessageDigest")) 81 | 82 | (comment 83 | 84 | (security-providers-types) 85 | 86 | (message-digest-algorithms) 87 | 88 | ) 89 | 90 | (defn 91 | ^{:doc "Compute and return the SHA1 sum of the given string, returned as a hex-encoded string."} 92 | string->sha1 [^String s] 93 | (sha1->string (.getBytes s))) 94 | 95 | (defn 96 | ^{:doc "Compute and return the MD5 sum of the given string, returned as a hex-encoded string."} 97 | string->md5 [^String s] 98 | (md5->string (.getBytes s))) 99 | 100 | 101 | (defn 102 | ^{:doc "Compute and return the SHA256 sum of the given byte array, returned as a hex-encoded string."} 103 | sha256->string [^bytes bytes] 104 | (let [digester (java.security.MessageDigest/getInstance "SHA-256")] 105 | (.update digester bytes) 106 | (apply str (map (fn [byte] 107 | (Integer/toHexString (bit-and 0xFF byte))) 108 | (.digest digester))))) 109 | 110 | (defn 111 | ^{:doc "Compute and return the SHA256 sum of the given string, returned as a hex-encoded string."} 112 | string->sha256 [^String s] 113 | (sha256->string (.getBytes s))) 114 | 115 | 116 | (defn 117 | ^{:doc "Compute and return the SHA384 sum of the byte array, returned as a hex-encoded string."} 118 | sha384->string [^bytes bytes] 119 | (let [digester (java.security.MessageDigest/getInstance "SHA-384")] 120 | (.update digester bytes) 121 | (apply str (map (fn [byte] 122 | (Integer/toHexString (bit-and 0xFF byte))) 123 | (.digest digester))))) 124 | 125 | (defn 126 | ^{:doc "Compute and return the SHA384 sum of the given string, returned as a hex-encoded string."} 127 | string->sha384 [^String s] 128 | (sha384->string (.getBytes s))) 129 | 130 | (defn 131 | ^{:doc "Compute and return the SHA512 sum of the byte array, returned as a hex-encoded string."} 132 | sha512->string [^bytes bytes] 133 | (let [digester (java.security.MessageDigest/getInstance "SHA-512")] 134 | (.update digester bytes) 135 | (apply str (map (fn [byte] 136 | (Integer/toHexString (bit-and 0xFF byte))) 137 | (.digest digester))))) 138 | 139 | (defn 140 | ^{:doc "Compute and return the SHA512 sum of the given string, returned as a hex-encoded string."} 141 | string->sha512 [^String s] 142 | (sha512->string (.getBytes s))) 143 | 144 | (comment 145 | 146 | (count (string->sha1 "foof")) ;; 40 147 | (count (string->sha256 "foof")) ;; 63 148 | (count (string->sha384 "foof")) ;; 90 149 | (count (string->sha512 "foof")) ;; 126 150 | 151 | (time 152 | (dotimes [ii 10000] 153 | (string->sha1 "foof"))) 154 | 155 | (time 156 | (dotimes [ii 10000] 157 | (string->sha256 "foof"))) 158 | 159 | (time 160 | (dotimes [ii 10000] 161 | (string->sha512 "foof"))) 162 | 163 | 164 | ) 165 | 166 | 167 | 168 | ;; TODO this doesn't belong in text.clj, couldn't think of a better place for it 169 | (defn 170 | ^{:doc "Current time in milliseconds."} 171 | now-milliseconds [] 172 | (.getTime (java.util.Date.))) 173 | 174 | 175 | (defn 176 | ^{:doc "Substring that supports negative starting positions (negative takes the last N'th characters from the right-hand side of the string). 177 | 178 | (substr \"the quick brown fox\" 10) => \"brown fox\" 179 | (substr \"the quick brown fox\" -3) => \"fox\" 180 | 181 | "} 182 | substr [^String s start & [end]] 183 | (cond 184 | (and (< start 0) 185 | (not end)) 186 | (let [start (+ (count s) start)] 187 | (if (< start 0) 188 | s 189 | (.substring s start))) 190 | 191 | (> start (count s)) 192 | "" 193 | (or (not end) 194 | (> end (count s))) 195 | (.substring s start) 196 | 197 | :else 198 | (.substring s start end))) 199 | 200 | 201 | (comment 202 | 203 | (= "" (substr "" 0 0)) 204 | (= "" (substr "a" 0 0)) 205 | (= "a" (substr "a" 0)) 206 | (= "a" (substr "a" 0 1)) 207 | (= "a" (substr "a" 0 99)) 208 | (= "" (substr "a" 99)) 209 | (= "" (substr "a" 99 199)) 210 | (= "a" (substr "a" -1)) 211 | (= "bc" (clj-etl-utils.text/substr "abc" -2)) 212 | (= "" (substr "abc" -9))) 213 | 214 | 215 | 216 | ;; "public static String humanReadableByteCount(long bytes, boolean si) { 217 | ;; int unit = si ? 1000 : 1024; 218 | ;; if (bytes < unit) return bytes + " B "; 219 | ;; int exp = (int) (Math.log(bytes) / Math.log(unit)); 220 | ;; String pre = (si ? "kMGTPE " : "KMGTPE ").charAt(exp-1) + (si ? " " : "i "); 221 | ;; return String.format("%.1f %sB ", bytes / Math.pow(unit, exp), pre); 222 | ;; }" 223 | 224 | (defn 225 | ^{:doc "Prodcues a human-readable (friendly unit sizes) count of the number of bytes provided (as a string). 226 | 227 | (human-readable-byte-count 1023) \"1023B\" 228 | (human-readable-byte-count 1024) \"1.00KiB\" 229 | (human-readable-byte-count (* 1024 1024)) \"1.00MiB\" 230 | (human-readable-byte-count (* 1024 1024 1024)) \"1.00GiB\" 231 | (human-readable-byte-count (+ 1 (* 1024 1024 1024))) \"1.00GiB\" 232 | (human-readable-byte-count (* 1024 1024 1024 1024)) \"1.00TiB\" 233 | (human-readable-byte-count (* 1024 1024 1024 1024 1024)) \"1.00PiB\" 234 | (human-readable-byte-count (* 1024 1024 1024 1024 1024 1024)) \"1.00EiB\" 235 | (human-readable-byte-count (* 1024 1024 1024 1024 1024 1024 1024)) => Error, no Si prefix for this size 236 | 237 | 238 | Taken from: http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java 239 | 240 | "} 241 | human-readable-byte-count 242 | ([nbytes] 243 | (human-readable-byte-count nbytes false)) 244 | ([nbytes use-si] 245 | (let [unit (if use-si 1000 1024) 246 | exp (int (/ (Math/log nbytes) (Math/log unit)))] 247 | (if (< nbytes unit) 248 | (str nbytes "B") 249 | (format "%.2f%sB" 250 | (/ nbytes (Math/pow unit exp)) 251 | (str 252 | (.charAt 253 | (if use-si 254 | "kMGTPE" 255 | "KMGTPE") 256 | (dec exp)) 257 | (if use-si 258 | "" 259 | "i"))))))) 260 | 261 | (comment 262 | 263 | 264 | ) 265 | 266 | 267 | (defn 268 | ^{:doc "Wrap a string (sentence or paragraph) at a maximum length. 269 | 270 | (word-split \"This is a long sentence, if it were documentation someone would be happy and someone would be unsatisified. That is the way of things.\" 50) 271 | => 272 | (\"This is a long sentence, if it were documentation\" \"someone would be happy and someone would be\" \"unsatisified. That is the way of things.\") 273 | 274 | "} 275 | word-split 276 | ([^String str size] 277 | (word-split str size "\0")) 278 | ([^String str size ^String delim] 279 | (if (>= (.indexOf str delim) 0) 280 | (raise "Input string must not contain delimiter string (%s). Unable to split (input string=%s" delim str) 281 | (seq 282 | (.split 283 | (WordUtils/wrap 284 | str 285 | size 286 | delim 287 | false) 288 | delim))))) 289 | 290 | 291 | (comment 292 | 293 | 294 | ) 295 | 296 | 297 | (def formatter-setters 298 | {:negative-prefix (fn [^DecimalFormat nf ^String x] (.setNegativePrefix nf x)) 299 | :negative-suffix (fn [^DecimalFormat nf ^String x] (.setNegativeSuffix nf x)) 300 | :positive-prefix (fn [^DecimalFormat nf ^String x] (.setPositivePrefix nf x)) 301 | :positive-suffix (fn [^DecimalFormat nf ^String x] (.setPositiveSuffix nf x))}) 302 | 303 | (defn 304 | ^{:doc ""} 305 | apply-format-setter [^NumberFormat nf k v] 306 | (if-not (contains? formatter-setters k) 307 | (raise "set-formatter-option: option not yet implemented: %s" k)) 308 | ((get formatter-setters k) nf v) 309 | nf) 310 | 311 | (declare default-formatters) 312 | 313 | (defn get-currency-formatter [opts-or-keyword] 314 | (cond 315 | (map? opts-or-keyword) 316 | (reduce (fn [formatter [k v]] 317 | (apply-format-setter formatter k v)) 318 | (java.text.NumberFormat/getCurrencyInstance) 319 | opts-or-keyword) 320 | (keyword? opts-or-keyword) 321 | (or (get @default-formatters opts-or-keyword) 322 | (raise "Error: formatter not found for keyword: %s" opts-or-keyword)) 323 | :else 324 | (raise "Error: unrecognized formatter spec (not a map or keyword): [%s] %s" 325 | (class opts-or-keyword) opts-or-keyword))) 326 | 327 | (def currency-with-negative (get-currency-formatter {:negative-prefix "-$" :negative-suffix ""})) 328 | 329 | (def default-formatters 330 | (atom 331 | {:currency-with-negative currency-with-negative 332 | :default (get-currency-formatter {})})) 333 | 334 | 335 | 336 | 337 | (defn format-as-currency 338 | ([num] 339 | (format-as-currency num :default)) 340 | ([num opts] 341 | (.format ^java.text.Format (get-currency-formatter opts) 342 | num))) 343 | 344 | (defonce rx-clean-phone-number #"\D+") 345 | 346 | 347 | (defn canonical-phone-number [^String mobile-number] 348 | (if (nil? mobile-number) 349 | "" 350 | (let [num (str-utils/replace mobile-number rx-clean-phone-number "")] 351 | (if (= 10 (count num)) 352 | (str 1 num) 353 | num)))) 354 | 355 | 356 | (defn uncanonicalize-phone-number [^String mobile-number] 357 | (let [phone-number (canonical-phone-number mobile-number) 358 | [_ area-code central-office subscriber-number] (re-find #"\d{1}(\d{3})(\d{3})(\d{4})" phone-number)] 359 | (format "%s-%s-%s" area-code central-office subscriber-number))) 360 | 361 | (defn snake-case [^String s] 362 | (.toString 363 | ^StringBuilder 364 | (reduce 365 | (fn [^StringBuilder b c] 366 | (if (Character/isUpperCase ^char c) 367 | (do 368 | (.append b "-") 369 | (.append b ^CharSequence (clojure.string/lower-case c))) 370 | (.append ^StringBuilder b c))) 371 | (StringBuilder.) 372 | (name s)))) 373 | 374 | (defn camel->snake [^java.util.Map params] 375 | (reduce 376 | (fn [accum [k v]] 377 | (assoc accum (keyword (snake-case k)) v)) 378 | {} 379 | params)) 380 | 381 | (defn camel->underscore [^java.util.Map params] 382 | (reduce 383 | (fn [accum [k v]] 384 | (assoc accum (keyword (.replaceAll ^String (snake-case k) "-" "_")) v)) 385 | {} 386 | params)) 387 | 388 | (defn snake->underscore [^java.util.Map params] 389 | (reduce 390 | (fn [accum [k v]] 391 | (assoc accum (keyword (.replaceAll (name k) "-" "_")) v)) 392 | {} 393 | params)) 394 | 395 | (defn underscore->snake [^java.util.Map params] 396 | (reduce 397 | (fn [accum [k v]] 398 | (assoc accum (keyword (.replaceAll (name k) "_" "-")) v)) 399 | {} 400 | params)) 401 | 402 | (defn camelize-keyword [k] 403 | (let [[res & parts] (.split (name k) "[-_]")] 404 | (loop [res res 405 | [n & parts] parts] 406 | (if-not n 407 | (keyword res) 408 | (recur (str res (org.apache.commons.lang.WordUtils/capitalize n)) 409 | parts))))) 410 | 411 | (defn camelize-map-keys [m] 412 | (reduce 413 | (fn [accum [k v]] 414 | (assoc accum 415 | (camelize-keyword k) v)) 416 | {} 417 | m)) 418 | 419 | (def encode-base64 420 | (let [b (Base64.)] 421 | (fn encode-base64 [raw] 422 | (.encode b raw)))) 423 | 424 | (def decode-base64 425 | (let [b (Base64.)] 426 | (fn decode-base64 [coded] 427 | (.decode b coded)))) 428 | 429 | 430 | (defn summarize-message 431 | ([msg len] 432 | (summarize-message msg len "'" "...")) 433 | ([msg len delimiter summary-marker] 434 | (if (> (count msg) len) 435 | (str delimiter (first (word-split msg len)) summary-marker delimiter) 436 | (str delimiter msg delimiter)))) 437 | 438 | 439 | (comment 440 | (.format (java.text.NumberFormat/getCurrencyInstance) -1234) 441 | (format-as-currency -1234 :currency-with-negative) 442 | (format-as-currency -1234 :default) 443 | (format-as-currency -1234) 444 | (format-as-currency 1234 :currency-with-negative) 445 | 446 | (human-readable-byte-count 1024) 447 | (human-readable-byte-count 1024 true) 448 | (human-readable-byte-count (* 3 1024 1024)) 449 | (human-readable-byte-count (* 3 1024 1024) true) 450 | ) 451 | 452 | 453 | (defn trim-and-truncate 454 | ([^String value ^Number max-len] 455 | (trim-and-truncate value max-len nil)) 456 | ([^String value ^Number max-len ^String default-value] 457 | (cond 458 | (nil? value) 459 | default-value 460 | 461 | :trim-and-truncate 462 | (let [value (.trim value)] 463 | (if (empty? value) 464 | default-value 465 | (.trim ^String (substr value 0 max-len))))))) 466 | -------------------------------------------------------------------------------- /src/clj_etl_utils/indexer.clj: -------------------------------------------------------------------------------- 1 | (ns ^{:doc "Indexing functions for working with delimited and fixed 2 | width files in situ, allowing them to be searched and iterated 3 | through in other than natural order, without having to load the 4 | data into a database." 5 | :author "Kyle Burton"} 6 | clj-etl-utils.indexer 7 | (:require 8 | [clojure.java.shell :as sh] 9 | [clojure.string :as string] 10 | [clj-etl-utils.sequences :as sequences] 11 | [clj-etl-utils.io :as io] 12 | [clojure.java.io :as cljio]) 13 | (:import 14 | [java.io RandomAccessFile FileInputStream InputStreamReader BufferedReader] 15 | [org.apache.commons.io.input BoundedInputStream])) 16 | 17 | ;; index line oriented files 18 | 19 | 20 | ;; TODO: convention for escaping the key (URI? - it could contain a 21 | ;; tab character...), handling null or empty values, when the key fn 22 | ;; throws an exception, etc. 23 | 24 | ;; TODO: consider updating or refreshing - incrementally, the index files 25 | 26 | (defn line-position-seq [^RandomAccessFile fp] 27 | (let [start-pos (.getFilePointer fp) 28 | line (.readLine fp) 29 | end-pos (.getFilePointer fp)] 30 | (if (nil? line) 31 | (do (.close fp) 32 | nil) 33 | (lazy-cat 34 | [[line start-pos end-pos]] 35 | (line-position-seq fp))))) 36 | 37 | (defn line-index-seq 38 | "Given a random access file (need not be positioned at the start) 39 | and a key function (run on the line to compute the keys for the line) 40 | this will return a sequence of: 41 | 42 | ([[key-value ...] line-start-pos line-end-pos] ...) 43 | 44 | For all the lines in the file. 45 | 46 | " 47 | [^RandomAccessFile fp key-fn] 48 | (pmap (fn [[line start-pos end-pos]] 49 | [(key-fn line) start-pos end-pos]) 50 | (line-position-seq fp))) 51 | 52 | (comment 53 | 54 | (take 10 (line-index-seq 55 | (RandomAccessFile. "/home/superg/data/citi/relay_incremental_9_2010-10k-sample.rpt.fix" "r") 56 | (fn [line] 57 | [(str (.charAt line 0))]))) 58 | 59 | ) 60 | 61 | ;; returns a sequnce of [key line-start-byte-pos line-endbyte-pos] 62 | ;; given a key-fn that takes a line of text and returns a string key that represents the line. 63 | 64 | (defn file-index-seq [^String file key-fn] 65 | (line-index-seq (RandomAccessFile. file "r") key-fn)) 66 | 67 | (defn extract-range [^RandomAccessFile fp start end] 68 | (.seek fp start) 69 | (let [data-bytes (byte-array (- end start))] 70 | (.read fp data-bytes) 71 | (String. data-bytes))) 72 | 73 | ;; NB: decide on sort behavior - string collation or numeric? we're 74 | ;; going to shell out to GNU sort for this so that is a concern... 75 | (defn create-index-file [^String input-file ^String index-file key-fn] 76 | ;; run the indexer (seq), emit to index-file 77 | ;; sort index-file 78 | (with-open [^java.io.Writer outp (cljio/writer index-file)] 79 | (loop [[[kvals start end] & vals] (file-index-seq input-file key-fn)] 80 | (if (or (nil? kvals) 81 | (empty? kvals)) 82 | true 83 | (do 84 | (doseq [val kvals] 85 | (.write outp (format "%s\t%s\t%s" val start end)) 86 | (.write outp "\n")) 87 | (recur vals)))))) 88 | 89 | ;; NB: return value isn't taking into account error statuses 90 | ;; NB: will not work on platforms that don't have sort and mv, fix this... 91 | (defn sort-index-file [^String index-file] 92 | (let [tmp (java.io.File/createTempFile "idx-srt" "tmp") 93 | tmpnam (.getName tmp)] 94 | (sh/sh "sort" "-o" tmpnam index-file 95 | :env {"LANG" "C"}) 96 | (sh/sh "mv" tmpnam index-file)) 97 | true) 98 | 99 | (defn index-file! [^String input-file ^String index-file key-fn] 100 | (create-index-file input-file index-file key-fn) 101 | (sort-index-file index-file)) 102 | 103 | 104 | (comment 105 | 106 | (defn rand-elt [s] 107 | (let [idx (mod (.nextInt (java.util.Random.)) 108 | (count s))] 109 | (nth s idx))) 110 | 111 | (require 'clj-etl-utils.ref-data) 112 | (let [rnd (java.util.Random.) 113 | states (vec (map first clj-etl-utils.ref-data/*us-states*))] 114 | (with-open [wtr (java.io.PrintWriter. "file.txt")] 115 | (dotimes [ii 100] 116 | (.println wtr 117 | (str 118 | (rand-elt states) 119 | "\t" 120 | (.nextInt rnd)))))) 121 | 122 | (index-file! "file.txt" ".file.txt.id-idx" 123 | (fn [l] 124 | [(.toLowerCase (first (.split l "\t")))])) 125 | 126 | ) 127 | 128 | ;; TODO: this is splitting multiple times, rework to only split 1x 129 | (defn index-blocks-seq [^String index-file] 130 | (map (fn [grp] 131 | (map (fn [^String l] 132 | (let [[val spos epos] (.split l "\t")] 133 | [val (Integer/parseInt spos) (Integer/parseInt epos)])) 134 | grp)) 135 | (sequences/group-with (fn [^String l] 136 | (first (.split l "\t"))) 137 | (io/lazy-read-lines index-file)))) 138 | 139 | ;; This is the new form of above (only call split 1x), needs to be tested 140 | #_(defn index-blocks-seq [^String index-file] 141 | (sequences/group-with 142 | first 143 | (map 144 | (fn [l] 145 | (let [[val spos epos] (.split l "\t")] 146 | [val (Long/parseLong spos) (Long/parseLong epos)])) 147 | (io/lazy-read-lines index-file)))) 148 | 149 | 150 | 151 | (comment 152 | (index-blocks-seq ".file.txt.id-idx") 153 | ((["1" 24 48]) (["2" 48 65]) (["3" 65 88]) (["99" 0 24] ["99" 88 115])) 154 | ) 155 | 156 | 157 | (defn records-for-idx-block ^String [inp-file idx-block] 158 | (loop [recs [] 159 | [[k start-pos end-pos] & idx-block] idx-block] 160 | (if (not k) 161 | recs 162 | (recur 163 | ;; NB: range should be 1 and only 1 line/record 164 | (conj recs (first (vec (io/read-lines-from-file-segment inp-file start-pos end-pos)))) 165 | idx-block)))) 166 | 167 | (comment 168 | 169 | (records-for-idx-block "file.txt" [["99" 0 24] ["99" 88 115]]) 170 | 171 | ) 172 | 173 | ;; TODO: building an index has to stream the records, if we are to 174 | ;; build N indicies we will have to stream the records N times, modify 175 | ;; the implementation such that we can create multiple indicies by 176 | ;; streaming only one time... 177 | 178 | (defn record-blocks-via-index 179 | "Given an data file and an index file, this stream through the distinct 180 | index values returning records from the data file." 181 | [^String inp-file ^String index-file] 182 | (map (partial records-for-idx-block inp-file) 183 | (index-blocks-seq index-file))) 184 | 185 | ;; ((["1" 24 48]) (["2" 48 65]) (["3" 65 88]) (["99" 0 24] ["99" 88 115])) 186 | 187 | (comment 188 | 189 | (record-blocks-via-index "file.txt" ".file.txt.id-idx") 190 | 191 | ) 192 | 193 | ;; 1 MB 194 | (def min-streaming-threshold (* 1024 1024 1)) 195 | 196 | ;; matcher = (fn [index-val term]) => bool 197 | (defn index-search 198 | ([idx-file term] 199 | (index-search idx-file term =)) 200 | ([idx-file term matcher] 201 | (with-open [rdr (cljio/reader idx-file)] 202 | (index-search idx-file term matcher rdr))) 203 | ([idx-file term matcher ^java.io.BufferedReader rdr] 204 | (loop [line (.readLine rdr) 205 | res []] 206 | (if (nil? line) 207 | res 208 | (let [[v s e] (.split line "\t" 3) 209 | direction (compare v term)] 210 | (cond 211 | (matcher v term) 212 | (recur (.readLine rdr) 213 | (conj res [v (Long/parseLong s) (Long/parseLong e)])) 214 | 215 | (pos? direction) 216 | (do 217 | #_(println (format "direction was positive, indicating we've gone past: (compare \"%s\" \"%s\") %d" 218 | v term direction)) 219 | res) 220 | 221 | :continue 222 | (recur (.readLine rdr) res))))))) 223 | 224 | ;; NB: this only works with \n (byte value 10) as a line separator 225 | (defn rewind-to-newline [^RandomAccessFile fp min-pos] 226 | #_(println "rewind-to-newline: remove the max iters") 227 | (loop [] ;; max-iters 10000 228 | (let [b (.readByte fp) 229 | ch (Byte/toString b)] 230 | (cond 231 | ;; (<= max-iters 0) 232 | ;; (raise "Too much recursion") 233 | 234 | ;; (int (aget (.getBytes "a") 0)) 235 | (= (int b) 10) 236 | (do 237 | #_(println (format "rewind-to-newline: Found newline at %d" (.getFilePointer fp))) 238 | true) 239 | 240 | (<= (.getFilePointer fp) min-pos) 241 | (do 242 | #_(println (format "rewind-to-newline: Rewound to start")) 243 | (.seek fp 0) 244 | true) 245 | 246 | :rewind 247 | (do 248 | #_(println (format "rewind-to-newline: [%d/%s] Did not find newline at %d, going back to %d" 249 | (.intValue b) 250 | ch 251 | (.getFilePointer fp) 252 | (- (.getFilePointer fp) 2))) 253 | (.seek fp (- (.getFilePointer fp) 2)) 254 | ;; (recur (dec max-iters)) 255 | (recur)))))) 256 | 257 | 258 | ;; spos must point at either the start of the file, or the beginning of a line 259 | ;; epos must point at either the end of the file, or a newline 260 | (defn index-search-prefix-impl [^String idx-file ^String term spos epos] 261 | #_(println (format "index-search-prefix-impl %s %s %d %d" 262 | idx-file term spos epos)) 263 | (if (<= (- epos spos) min-streaming-threshold) 264 | (with-open [rdr (BufferedReader. 265 | (InputStreamReader. 266 | (BoundedInputStream. 267 | (doto (FileInputStream. idx-file) 268 | (.skip spos)) 269 | (- epos spos))))] 270 | #_(println (format "before binary search, spos=%d to epos=%d under THRESH, falling back to streaming search" spos epos)) 271 | (index-search idx-file 272 | term 273 | (fn [^String idx-val ^String term] 274 | (.startsWith idx-val term)) 275 | rdr)) 276 | (with-open [fp (RandomAccessFile. idx-file "r")] 277 | (loop [;;max-iters 25 278 | spos spos 279 | epos epos 280 | middle (long (/ (- epos spos) 2))] 281 | #_(println (format "loop: spos=%d epos=%d middle=%d" spos epos middle)) 282 | ;; (when (<= max-iters 0) 283 | ;; (raise "too much recursion")) 284 | (.seek fp middle) 285 | (rewind-to-newline fp spos) 286 | (let [middle (.getFilePointer fp) 287 | line (.readLine fp) 288 | [iterm bstart bend] (.split line "\t" 3) 289 | order (compare term iterm)] 290 | #_(println (format "Looking at[%d] line=%s" 291 | (.getFilePointer fp) 292 | line)) 293 | (cond 294 | (<= (- epos spos) min-streaming-threshold) 295 | (with-open [rdr (BufferedReader. 296 | (InputStreamReader. 297 | (BoundedInputStream. 298 | (doto (FileInputStream. idx-file) 299 | (.skip spos)) 300 | (- epos spos))))] 301 | #_(println (format "in binary search, spos=%d to epos=%d under THRESH, falling back to streaming search" spos epos)) 302 | (index-search 303 | idx-file 304 | term 305 | (fn [^String idx-val ^String term] 306 | #_(println (format "(.startsWith \"%s\" \"%s\") => %s" 307 | idx-val term (.startsWith idx-val term))) 308 | (.startsWith idx-val term)) 309 | rdr)) 310 | 311 | (neg? order) 312 | (do 313 | #_(println (format "order was: %d, go left" order)) 314 | (recur ;; (dec max-iters) 315 | spos 316 | middle 317 | (long (- middle (/ (- middle spos) 2))))) 318 | 319 | (zero? order) 320 | (do 321 | #_(println (format "order was: %d, we're in the block, need to find the start" order))) 322 | 323 | (pos? order) 324 | (do 325 | #_(println (format "order was: %d, go right" order)) 326 | (recur 327 | ;; (dec max-iters) 328 | middle 329 | epos 330 | (long (- epos (/ (- epos middle) 2))))))))))) 331 | 332 | 333 | (defn index-search-prefix [^String idx-file ^String term] 334 | ;; binary search the file 335 | ;; open a RandomAccessFile 336 | ;; start=0 end=LEN 337 | ;; if (- end len) < THRESH, just stream through the section 338 | ;; jump to the middle, rewind to '\n' 339 | (let [epos (.length (java.io.File. idx-file))] 340 | (if (<= epos min-streaming-threshold) 341 | (do 342 | #_(println (format "file size %d < thresh %d, falling back to streaming" 343 | epos min-streaming-threshold)) 344 | (index-search idx-file term #(= %1 %2))) 345 | (index-search-prefix-impl idx-file term 0 epos)))) 346 | 347 | 348 | (defn index-search-file 349 | ([^String input-file ^String index-file term] 350 | (index-search-file input-file index-file term =)) 351 | ([^String input-file ^String index-file term matcher] 352 | (map 353 | (fn [[v s e]] 354 | (first (vec (io/read-lines-from-file-segment input-file s e)))) 355 | (filter 356 | (fn [[v s e]] 357 | (matcher v term)) 358 | (index-search-prefix index-file term))))) 359 | 360 | (comment 361 | 362 | 363 | (index-search-file "file.txt" ".file.txt.id-idx" "IA") 364 | 365 | 366 | (index-file! "file.txt" ".file.txt.id-idx" (fn [line] [(first (.split line "\t"))])) 367 | (record-blocks-via-index "file.txt" ".file.txt.id-idx") 368 | 369 | 370 | ) 371 | 372 | 373 | ;; (defn csv-parse [^String s] 374 | ;; (with-in-str s 375 | ;; (first (csv/read-csv *in*)))) 376 | 377 | (defn index-file-path ^String [src idx-name] 378 | (let [^String fname (-> src :config :file) 379 | src-file (java.io.File. fname)] 380 | (format "%s/.%s.%s-idx" 381 | (.getParent src-file) 382 | (.getName src-file) 383 | (name idx-name)))) 384 | 385 | (defn ensure-indexes [src] 386 | (doseq [[idx-name idx] (-> src :config :indexes)] 387 | (println (format "idx:%s" idx)) 388 | (let [src-path (-> src :config :file) 389 | src-file (java.io.File. ^String src-path) 390 | idx-path (index-file-path src (:name idx)) 391 | idx-file (java.io.File. idx-path)] 392 | ;; only if the idx-file doesn't exist or the src file is newer 393 | (println (format "src-path:%s idx-path:%s" src-path idx-path)) 394 | (when (or (not (.exists idx-file)) 395 | (> (.lastModified src-file) 396 | (.lastModified idx-file))) 397 | (index-file! 398 | src-path 399 | idx-path 400 | (:fn idx)))))) 401 | 402 | (defn make-candidate-keyfile [sources index-name ^String candfile] 403 | ;; combine the index values, sort and count them 404 | (with-open [wtr (java.io.PrintWriter. candfile)] 405 | (doseq [src sources] 406 | (let [idx (-> src :config :indexes index-name) 407 | idx-file (index-file-path src index-name)] 408 | (with-open [rdr (java.io.BufferedReader. (java.io.FileReader. idx-file))] 409 | (doall 410 | (for [^String line (line-seq rdr)] 411 | (.println wtr (first (.split line "\t" 2))))))))) 412 | (let [tmp (java.io.File/createTempFile "cand-srt" "tmp") 413 | tmpnam (.getName tmp)] 414 | (sh/sh "sort" "-u" "-o" tmpnam candfile 415 | :env {"LANG" "C"}) 416 | (println (format "mv %s %s" tmpnam candfile)) 417 | (sh/sh "mv" tmpnam candfile))) 418 | 419 | (defn ensure-candidate-keyfile [sources index-name ^String candfile] 420 | (let [f (java.io.File. candfile)] 421 | (when-not (.exists f) 422 | (make-candidate-keyfile sources index-name candfile)))) 423 | 424 | (defn process-candidte-clusters [sources index-name f] 425 | (let [candfile (format "%s-by-%s.candidates" 426 | (string/join "" (map #(name (:name %1)) sources)) 427 | (name index-name))] 428 | ;; ensure indexes, then the candidate file 429 | (ensure-candidate-keyfile sources index-name candfile) 430 | (with-open [rdr (java.io.BufferedReader. (java.io.FileReader. candfile))] 431 | (doseq [term (line-seq rdr)] 432 | (let [cluster (vec (map (fn [src] 433 | (let [input-file (-> src :config :file) 434 | index-file (index-file-path src index-name)] 435 | {:source (-> src :name) 436 | :index-file index-file 437 | :recs (vec (index-search-file input-file index-file term))})) 438 | sources))] 439 | (f term cluster)))))) 440 | -------------------------------------------------------------------------------- /src/clj_etl_utils/io.clj: -------------------------------------------------------------------------------- 1 | (ns 2 | ^{:doc "I/O Utilities" 3 | :author "Kyle Burton"} 4 | clj-etl-utils.io 5 | (:require 6 | [clojure.java.shell :as sh] 7 | [clojure.java.io :as cljio] 8 | [clojure.tools.logging :as log]) 9 | (:use [clj-etl-utils.lang-utils :only [raise]] 10 | [clojure.string :only [join]]) 11 | (:import 12 | [java.io 13 | InputStream FileInputStream File InputStreamReader RandomAccessFile 14 | BufferedReader Reader FileReader] 15 | [org.apache.commons.io.input BoundedInputStream])) 16 | 17 | ;; 18 | ;; Unicode BOM handling is Based off of nu.xom.xinclude.EncodingHeuristics 19 | ;; FAIL: use the BOM stuff from commons-io!o 20 | 21 | ;; TODO: can fail for streams that don't support marking 22 | (defn 23 | ^{:doc "Read the first n-bytes available in the stream, if the stream supports 24 | marking, it will be reset back so that the bytes are not actually read." 25 | :added "1.0.0"} 26 | first-n-bytes-available [#^Reader stream n-bytes] 27 | (let [res (atom [])] 28 | (try 29 | (if (.markSupported stream) 30 | (.mark stream 1024)) 31 | (dotimes [nn n-bytes] 32 | (let [next-byte (.read stream)] 33 | (if (not (= -1 next-byte)) 34 | (reset! res (conj @res next-byte))))) 35 | (finally 36 | ;; NB: this is no good for already created streams IOW one's 37 | ;; that can be recreated, b/c the goal is to advance past the 38 | ;; BOM and no further, so we need to consume at most 1 byte at a 39 | ;; time 40 | (if (.markSupported stream) 41 | (.reset stream)))) 42 | @res)) 43 | 44 | ;; ;; from: http://unicode.org/faq/utf_bom.html 45 | ;; ( def *utf-16be* 46 | ;; {:encoding "UTF-16BE" 47 | ;; :name :utf-16be 48 | ;; :marker "\u00FE\u00FF" 49 | ;; :marker-bytes [0xFE 0xFF]}) 50 | 51 | ;; ( def *utf-16le* 52 | ;; {:encoding "UTF-16LE" 53 | ;; :name :utf-16le 54 | ;; :marker "\u00FF\u00FE" 55 | ;; :marker-bytes [0xFF 0xFE]}) 56 | 57 | ;; ( def *utf-32be* 58 | ;; {:encoding "UTF-32BE" 59 | ;; :name :utf-32be 60 | ;; :marker "\u0000\u0000\u00FE\u00FF" 61 | ;; :marker-bytes [0x00 0x00 0xFE 0xFF]}) 62 | 63 | ;; ( def *utf-32le* 64 | ;; {:encoding "UTF-32LE" 65 | ;; :name :utf-32le 66 | ;; :marker "\u00FF\u00FE\u0000\u0000" 67 | ;; :marker-bytes [0xFF 0xFE 0x00 0x00]}) 68 | 69 | ;; ( def *utf-8* 70 | ;; {:encoding "UTF-8" 71 | ;; :name :utf-8 72 | ;; :marker "\u00EF\u00BB\u00BF" 73 | ;; :marker-bytes [0xEF 0xBB 0xBF]}) 74 | 75 | ;; ( def *iso-8851-1* 76 | ;; {:encoding "ISO-8859-1" 77 | ;; :name :iso-8859-1 78 | ;; :marker "" 79 | ;; :marker-bytes []}) 80 | 81 | ;; ( def *us-ascii* 82 | ;; {:encoding "US-ASCII" 83 | ;; :name :us-ascii 84 | ;; :marker "" 85 | ;; :marker-bytes []}) 86 | 87 | ;; ( def *bom-markers* 88 | ;; [*utf-32be* 89 | ;; *utf-32le* 90 | ;; *utf-16be* 91 | ;; *utf-16le* 92 | ;; *utf-8*]) 93 | 94 | ;; ( def *default-encoding* *iso-8851-1*) 95 | 96 | 97 | (defn ^{:doc "Test if a given marker is equivalent to the given set of 98 | bytes from the file. This is a prefix test, and will return true of 99 | the shorter marker matches the beginning of the longer marker." 100 | :added "1.0.0"} 101 | byte-marker-matches? [marker-bytes file-bytes] 102 | (cond 103 | (empty? marker-bytes) false 104 | (empty? file-bytes) false 105 | :else 106 | (loop [[marker & marker-bytes] marker-bytes 107 | [byte & file-bytes] file-bytes] 108 | (cond 109 | (or (not marker) (not byte)) true 110 | (= marker byte) (recur marker-bytes file-bytes) 111 | :else false)))) 112 | 113 | ;; TODO: what if the stream doesn't support mark? 114 | ;; TODO: may return a false positive on arbitrary binary data 115 | ;; (defn detect-stream-encoding-via-bom [stream & [default-encoding]] 116 | ;; (let [file-bytes (first-n-bytes-available stream 4)] 117 | ;; (loop [[encoding & encodings] *bom-markers*] 118 | ;; (cond 119 | ;; (not encoding) 120 | ;; ;; TODO: return the default encoding here 121 | ;; (or default-encoding *default-encoding*) 122 | ;; (byte-marker-matches? (:marker-bytes encoding) file-bytes) 123 | ;; encoding 124 | ;; :else 125 | ;; (recur encodings))))) 126 | 127 | 128 | 129 | 130 | ;; (defmulti detect-file-encoding-via-bom (fn [x & [default-encoding]] (class x))) 131 | 132 | ;; (defmethod detect-file-encoding-via-bom String [#^String file & [#^String default-encoding]] 133 | ;; (detect-file-encoding-via-bom (File. file) default-encoding)) 134 | 135 | ;; (defmethod detect-file-encoding-via-bom File [#^File file & [#^String default-encoding]] 136 | ;; (with-open [inp (FileReader. file)] 137 | ;; (detect-stream-encoding-via-bom inp default-encoding))) 138 | 139 | ;; (defmethod detect-file-encoding-via-bom :default [file & [default-encoding]] 140 | ;; (throw (format "Error: fell through to :default for detect-stream-encoding-via-bom file=%s" file))) 141 | 142 | 143 | ;; (defn unicode-input-stream [#^String path] 144 | ;; (InputStreamReader. 145 | ;; (FileInputStream. path) 146 | ;; #^String (:encoding (detect-file-encoding-via-bom path)))) 147 | 148 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 149 | 150 | (defn ^{:doc "Wraps a StringReader around the given string." 151 | :added "1.0.0"} 152 | string-reader [^String s] 153 | (java.io.StringReader. s)) 154 | 155 | (defn 156 | ^{:doc "Creates an InputStream (ByteArrayInputStream) for the given string." 157 | :added "1.0.0"} 158 | string-input-stream [^String s] 159 | (java.io.ByteArrayInputStream. (.getBytes s))) 160 | 161 | (defn 162 | ^{:doc "Read a specific number of characters from the InputStream, return a string." 163 | :added "1.0.0"} 164 | read-fixed-length-string [#^InputStream inp nchars] 165 | (let [dest #^bytes (make-array Byte/TYPE nchars) 166 | nread (.read inp dest 0 nchars)] 167 | (String. dest 0 nread))) 168 | 169 | (defn 170 | ^{:doc "Drain a buffered reader into a sequence." 171 | :added "1.0.0"} 172 | drain-line-reader 173 | [#^java.io.BufferedReader rdr] 174 | (loop [res [] 175 | line (.readLine rdr)] 176 | (if line 177 | (recur (conj res line) 178 | (.readLine rdr)) 179 | res))) 180 | 181 | (defn 182 | ^{:doc "Simple wrapper around Runtime.exec" 183 | :added "1.0.0"} 184 | exec 185 | [#^String cmd] 186 | (let [proc #^Process (.exec (Runtime/getRuntime) cmd) 187 | rv (.waitFor proc)] 188 | {:error (drain-line-reader (java.io.BufferedReader. (java.io.InputStreamReader. (.getErrorStream proc)))) 189 | :output (drain-line-reader (java.io.BufferedReader. (java.io.InputStreamReader. (.getInputStream proc)))) 190 | :exit rv})) 191 | 192 | (defn 193 | ^{:doc "Change a file or directory's permissions. Shells out to perform the chmod." 194 | :added "1.0.0"} 195 | chmod 196 | [perms file] 197 | (let [cmd (format "chmod %s %s" perms file) 198 | res (exec cmd)] 199 | (log/infof "chmod: %s" cmd) 200 | (if (not (= 0 (:exit res))) 201 | (log/errorf "%s" (:error res))))) 202 | 203 | (defmacro with-tmp-dir [[var & [prefix suffix]] & body] 204 | `(let [prefix# ~prefix 205 | suffix# ~suffix 206 | ~var (java.io.File/createTempFile (or prefix# "pfx") (or suffix# "sfx"))] 207 | (try 208 | (do 209 | (.delete ~var) 210 | ~@body) 211 | (finally 212 | ;; TODO: this will fail if dir is not empty!, should this recrusively remove all the files? 213 | (.delete ~var))))) 214 | 215 | (defn basename 216 | "Strip off the last part of the file name." 217 | [fname] 218 | (if (instance? java.io.File fname) 219 | (.getParent #^java.io.File fname) 220 | (.getParent (java.io.File. #^String (str fname))))) 221 | 222 | (defn #^java.io.File $HOME 223 | "Construct a path relative to the user's home directory." 224 | [& paths] 225 | (java.io.File. 226 | #^String (apply str 227 | (cons (str (System/getProperty "user.home") "/") 228 | (apply str (interpose "/" paths)))))) 229 | 230 | (defmulti expand-file-name 231 | "Perform bash style expansion on the given path. Eg: ~/file.txt." 232 | class) 233 | 234 | (defn #^String get-user-home 235 | "Get the user's home dir as a string." 236 | [] 237 | (System/getProperty "user.home")) 238 | 239 | (defmethod expand-file-name String [#^String path] 240 | (cond (.startsWith path "~/") 241 | (.replaceFirst path "^~(/|$)" (str (get-user-home) "/")) 242 | (.startsWith path "file://~/") 243 | (.replaceFirst path "^file://~/" (str "file://" (get-user-home) "/")) 244 | :else 245 | path)) 246 | 247 | 248 | (defn mkdir 249 | "Create the given directory path, fall back gracefuly if the path 250 | exists, warning if it's not a directory." 251 | [path] 252 | (let [f (java.io.File. (str path))] 253 | (if (not (.exists f)) 254 | (do 255 | ;(log/infof "mkdir: creating %s" path) 256 | (.mkdirs f) 257 | true) 258 | (if (not (.isDirectory f)) 259 | (do 260 | ;(log/warnf "mkdir: %s exists and is not a directory!" path) 261 | false) 262 | (do 263 | ;(log/debugf "mkdir: exists: %s" path) 264 | true))))) 265 | 266 | ;; NB: Should be able to specify read/write/exec perms per-subdirectory 267 | (defn mkdir-p [dirs perm owner-only] 268 | (loop [dir [(first dirs)] 269 | sub-dirs (next dirs)] 270 | (let [next-dir (java.io.File. (join "/" dir ))] 271 | (.mkdir next-dir) 272 | (.setReadable next-dir perm owner-only) 273 | (.setWritable next-dir perm owner-only) 274 | (.setExecutable next-dir perm owner-only)) 275 | 276 | (if-not (empty? sub-dirs) 277 | (recur (conj dir (first sub-dirs)) 278 | (next sub-dirs))))) 279 | 280 | 281 | (defmulti exists? class) 282 | (defmethod exists? String [#^String s] (.exists (File. s))) 283 | (defmethod exists? File [#^File f] (.exists f)) 284 | (defmethod exists? :default [x] (throw (Exception. (str "Do not know how to test <" (pr-str x) "> if it `exists?'")))) 285 | 286 | (defn symlink 287 | "Create a symlink." 288 | [#^String src #^String dst] 289 | (let [src (java.io.File. (str src)) 290 | dst (java.io.File. (str dst))] 291 | (if (not (.exists src)) 292 | (raise "symlink: src does not exist: %s" src)) 293 | (if (.exists dst) 294 | (log/infof "symlink: dst exists %s => %s" src dst) 295 | (let [cmd (format "ln -s %s %s" src dst) 296 | res (exec cmd)] 297 | (log/infof "symlink: %s=>%s : %s" src dst cmd) 298 | (if (not (= 0 (:exit res))) 299 | (log/errorf "%s" (:error res))))))) 300 | 301 | (defn delete 302 | "Remove a file if it exists." 303 | [#^String path] 304 | (let [path (java.io.File. (str path))] 305 | (if (.exists path) 306 | (.delete path)))) 307 | 308 | (defn url-get 309 | "Very simplistic retreival of a url target." 310 | [url] 311 | (with-open [is (.openStream (java.net.URL. url))] 312 | (loop [sb (StringBuffer.) 313 | chr (.read is)] 314 | (if (= -1 chr) 315 | sb 316 | (do 317 | (.append sb (char chr)) 318 | (recur sb 319 | (.read is))))))) 320 | 321 | (defn url-download 322 | "Shell's out to wget to pull the file into the target directory." 323 | [url #^String target-dir] 324 | (let [cmd (format "wget -P %s -c %s" target-dir url) 325 | res (exec cmd)] 326 | (log/infof "wget: %s" cmd) 327 | (if (not (= 0 (:exit res))) 328 | (log/errorf "%s" (:error res))))) 329 | 330 | 331 | (defn object->file 332 | "Use Java Serialization to emit an object to a file (binary format)." 333 | [#^Object obj #^String file] 334 | (with-open [outp (java.io.ObjectOutputStream. (java.io.FileOutputStream. file))] 335 | (.writeObject outp obj))) 336 | 337 | 338 | (defn file->object 339 | "Use Java Serialization to pull an object from a file (see object->file)." 340 | [#^String file] 341 | (with-open [inp (java.io.ObjectInputStream. (java.io.FileInputStream. file))] 342 | (.readObject inp))) 343 | 344 | ;; clojure.lang.PersistentVector$Node ins't serializable any longer...is this an oversight? ignore for now... 345 | (defn freeze 346 | "Serialize an object to a byte array." 347 | [#^Object obj] 348 | (with-open [baos (java.io.ByteArrayOutputStream. 1024) 349 | oos (java.io.ObjectOutputStream. baos)] 350 | (.writeObject oos obj) 351 | (.toByteArray baos))) 352 | 353 | ;; (freeze "foo") 354 | ;; (freeze "foo" "bar" "qux") 355 | 356 | (defn thaw 357 | "Deserialize from a byte array to the object." 358 | [#^bytes bytes] 359 | (with-open [bais (java.io.ByteArrayInputStream. bytes) 360 | ois (java.io.ObjectInputStream. bais)] 361 | (.readObject ois))) 362 | 363 | ;; (thaw (freeze "foo")) 364 | 365 | ;; (object->file "foo" ($HOME "/foo.bin")) 366 | ;; (file->object ($HOME "/foo.bin")) 367 | 368 | (defmacro with-stdout-to-file 369 | "Wrap code, redirecting stdout to a file." 370 | [file & body] 371 | `(with-open [out# (ds/writer ~file)] 372 | (binding [*out* out#] 373 | ~@body))) 374 | 375 | (defmacro with-stderr-to-file 376 | "Wrap code, redirecting stderr to a file." 377 | [file & body] 378 | `(with-open [out# (ds/writer ~file)] 379 | (binding [*err* out#] 380 | ~@body))) 381 | 382 | 383 | (defn ensure-directory 384 | "Create the directory if it does not already exist." 385 | [#^String dir] 386 | (let [f (java.io.File. dir)] 387 | (if (not (.exists f)) 388 | (.mkdirs f)))) 389 | 390 | ;; TODO: port to pure java, rm is unix specific... 391 | (defn deltree 392 | "Remove the given directory tree, all files and subdirectories." 393 | [#^String dir] 394 | (sh/sh "rm" "-rf" dir)) 395 | 396 | ;; TODO this doesn't belong in io.clj, couldn't think of a better place for it 397 | (defn string-gzip [#^String s] 398 | (with-open [bout (java.io.ByteArrayOutputStream.) 399 | gzout (java.util.zip.GZIPOutputStream. bout)] 400 | (.write gzout (.getBytes s)) 401 | (.finish gzout) 402 | (.toByteArray bout))) 403 | 404 | (defn file-size [f] 405 | (.length (java.io.File. (str f)))) 406 | 407 | (defn byte-partitions-at-line-boundaries [#^String file-name desired-block-size-bytes] 408 | (with-open [fp (RandomAccessFile. file-name "r")] 409 | (let [file-length (.length fp)] 410 | (loop [byte-positions [0] 411 | next-seek-point desired-block-size-bytes] 412 | (if (>= next-seek-point file-length) 413 | (conj byte-positions file-length) 414 | (do 415 | (.seek fp next-seek-point) 416 | (if (nil? (.readLine fp)) 417 | byte-positions 418 | (recur (conj byte-positions (.getFilePointer fp)) 419 | (+ (.getFilePointer fp) desired-block-size-bytes))))))))) 420 | 421 | 422 | (defn- bounded-input-stream-line-seq [#^BufferedReader bis] 423 | (let [line (.readLine bis)] 424 | (if-not line 425 | (do 426 | (.close bis) 427 | nil) 428 | 429 | (lazy-cat 430 | [line] 431 | (bounded-input-stream-line-seq bis))))) 432 | 433 | (defn read-lines-from-file-segment [#^String file-name start end] 434 | (let [bis (BoundedInputStream. 435 | (doto (FileInputStream. file-name) 436 | (.skip start)) 437 | (- end start))] 438 | (bounded-input-stream-line-seq (BufferedReader. (InputStreamReader. bis))))) 439 | 440 | 441 | 442 | ;; (defn #^{:doc "Map over the lines of a file - in parallel. This 443 | ;; function will partition the given file into blocks of lines (where 444 | ;; each block size is approximately equal to `block-size', which defaults 445 | ;; to 8Mb). " 446 | ;; :added "1.0.18"} 447 | ;; pmap-file-lines [file-name f & [block-size]] 448 | ;; (mapcat identity 449 | ;; (pmap (fn [[start end]] 450 | ;; (map f (read-lines-from-file-segment inf start end))) 451 | ;; (partition 2 1 (byte-partitions-at-line-boundaries inf (or block-size (* 8 1024 1024))))))) 452 | 453 | 454 | (defn list-files [^String f] 455 | (map str (.listFiles (java.io.File. f)))) 456 | 457 | (defmulti ensure-directory! class) 458 | 459 | (defmethod ensure-directory! String [^String path] 460 | (ensure-directory! (File. path))) 461 | 462 | (defmethod ensure-directory! File [^File path] 463 | (if (not (.exists path)) 464 | (.mkdirs path))) 465 | 466 | (defmethod ensure-directory! :default [path] 467 | (raise "Error: ensure-directory!, don't know how to handle path=%s of type:%s" 468 | path (class type))) 469 | 470 | (comment 471 | (ensure-directory! "/tmp") 472 | 473 | ) 474 | 475 | (defmulti ensure-directory-for-file! class) 476 | 477 | (defmethod ensure-directory-for-file! String [^String path] 478 | (ensure-directory! (.getParentFile (File. path)))) 479 | 480 | (defmethod ensure-directory-for-file! File [path] 481 | (ensure-directory! (.getParentFile ^File path))) 482 | 483 | (comment 484 | (ensure-directory-for-file! "/tmp/foo/bar") 485 | 486 | ) 487 | 488 | (defn rewind-to-line-boundary [^java.io.RandomAccessFile fp] 489 | (loop [fp fp] 490 | (cond 491 | (= 0 (.getFilePointer fp)) 492 | fp 493 | 494 | (= (.length fp) (.getFilePointer fp)) 495 | (do 496 | (.seek fp (- (.getFilePointer fp) 1)) 497 | (recur fp)) 498 | 499 | (let [byte (.readByte fp)] 500 | (= \newline (char byte))) 501 | fp 502 | 503 | :else 504 | (do 505 | (.seek fp (- (.getFilePointer fp) 2)) 506 | (recur fp))))) 507 | 508 | (comment 509 | (char (first (.getBytes "\n"))) 510 | 511 | (let [fp (RandomAccessFile. "/home/relay/b" "r")] 512 | (.seek fp 3) 513 | (rewind-to-line-boundary fp) 514 | (printf "line=%s\n" (.readLine fp))) 515 | 516 | ) 517 | 518 | (defn seek-to-before-segment [^RandomAccessFile fp ^String value] 519 | (let [seek-size (* 1024 1024 1)] 520 | (loop [curr-val (.readLine ^RandomAccessFile (rewind-to-line-boundary fp))] 521 | (cond 522 | (< (.compareTo ^String curr-val value) 0) 523 | fp 524 | 525 | (< (.getFilePointer fp) seek-size) 526 | (do 527 | (.seek fp 0) 528 | fp) 529 | 530 | :else 531 | (do 532 | (.seek fp (- (.getFilePointer fp) 533 | seek-size)) 534 | (recur (.readLine ^RandomAccessFile (rewind-to-line-boundary fp)))))))) 535 | 536 | 537 | (defn stream-segment-lines [^String file-name start ^String value] 538 | (filter #(.startsWith ^String % value) 539 | (bounded-input-stream-line-seq 540 | (BufferedReader. 541 | (InputStreamReader. 542 | (doto (FileInputStream. file-name) 543 | (.skip start))))))) 544 | 545 | ;; makes the assumption that a line starts with the index value and a tab and is sorted(!) 546 | (defn binary-search-index [^String value ^String idx-file] 547 | (with-open [fp (RandomAccessFile. idx-file "r")] 548 | (loop [spos 0 549 | epos (.length fp) 550 | max 256] 551 | (.seek fp (/ (+ epos spos) 2)) 552 | (rewind-to-line-boundary fp) 553 | (let [mid-point (long (/ (+ spos epos) 2)) 554 | [val-from-idx] (.split (.readLine fp) "\t")] 555 | (cond 556 | (zero? max) 557 | nil 558 | 559 | (= epos spos) ;; nowhere else to seek to 560 | nil 561 | 562 | (= spos (.length fp)) 563 | nil 564 | 565 | (= value val-from-idx) 566 | (do 567 | (seek-to-before-segment fp value) 568 | (stream-segment-lines 569 | idx-file 570 | (.getFilePointer fp) 571 | value)) 572 | 573 | (< (.compareTo value val-from-idx) 0) 574 | (recur spos (long mid-point) (dec max)) 575 | 576 | (= 1 (- epos spos)) 577 | nil 578 | 579 | :else 580 | (recur mid-point (long epos) (dec max))))))) 581 | 582 | 583 | 584 | (defn lazy-read-lines [filename] 585 | (let [rdr ^java.io.BufferedReader (cljio/reader filename) 586 | read-next-line (fn read-next [] 587 | (if-let [line (.readLine rdr)] 588 | (cons line (lazy-seq (read-next))) 589 | (.close rdr)))] 590 | (lazy-seq (read-next-line)))) 591 | --------------------------------------------------------------------------------