├── .gitignore ├── .idea ├── compiler.xml ├── libraries │ ├── Maven__com_github_haifengl_smile_core_1_5_4.xml │ ├── Maven__com_github_haifengl_smile_data_1_5_4.xml │ ├── Maven__com_github_haifengl_smile_graph_1_5_4.xml │ ├── Maven__com_github_haifengl_smile_math_1_5_4.xml │ ├── Maven__com_github_haifengl_smile_nlp_1_5_3.xml │ ├── Maven__com_google_code_findbugs_jsr305_3_0_2.xml │ ├── Maven__com_google_errorprone_error_prone_annotations_2_3_2.xml │ ├── Maven__com_google_guava_failureaccess_1_0_1.xml │ ├── Maven__com_google_guava_guava_28_0_jre.xml │ ├── Maven__com_google_guava_listenablefuture_9999_0_empty_to_avoid_conflict_with_guava.xml │ ├── Maven__com_google_j2objc_j2objc_annotations_1_3.xml │ ├── Maven__com_google_re2j_re2j_1_3.xml │ ├── Maven__junit_junit_4_12.xml │ ├── Maven__org_checkerframework_checker_qual_2_8_1.xml │ ├── Maven__org_codehaus_mojo_animal_sniffer_annotations_1_17.xml │ ├── Maven__org_hamcrest_hamcrest_core_1_3.xml │ └── Maven__org_slf4j_slf4j_api_1_7_25.xml └── uiDesigner.xml ├── LICENSE ├── README.md ├── categorical.md ├── java ├── feature-examples.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── tdunning │ │ └── examples │ │ ├── CooData.java │ │ ├── Jacobi.java │ │ ├── Sort.java │ │ └── VectorText.java │ └── test │ └── java │ └── com │ └── tdunning │ └── examples │ ├── CooDataTest.java │ ├── JacobiTest.java │ └── VectorTextTest.java └── src └── python ├── README.md ├── cooc.py ├── onehot.py ├── time-encodings.py └── wiki-data-download.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | # Compiled class file 3 | *.class 4 | 5 | # Log file 6 | *.log 7 | 8 | # BlueJ files 9 | *.ctxt 10 | 11 | # Mobile Tools for Java (J2ME) 12 | .mtj.tmp/ 13 | 14 | # Package Files # 15 | *.jar 16 | *.war 17 | *.nar 18 | *.ear 19 | *.zip 20 | *.tar.gz 21 | *.rar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_haifengl_smile_core_1_5_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_haifengl_smile_data_1_5_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_haifengl_smile_graph_1_5_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_haifengl_smile_math_1_5_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_haifengl_smile_nlp_1_5_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_code_findbugs_jsr305_3_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_errorprone_error_prone_annotations_2_3_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_failureaccess_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_guava_28_0_jre.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_listenablefuture_9999_0_empty_to_avoid_conflict_with_guava.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_j2objc_j2objc_annotations_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_re2j_re2j_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__junit_junit_4_12.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_checkerframework_checker_qual_2_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_codehaus_mojo_animal_sniffer_annotations_1_17.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_25.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical Feature Extraction 2 | 3 | This repository contains a compendium of useful feature extraction techniques I have learned about over the years. If you have a favorite that I have missed, let me know. 4 | 5 | # Techniques covered (aspirationally) 6 | ## Categorical 7 | One-hot encoding 8 | 9 | Hashed one-hot encoding 10 | 11 | Unique ID 12 | 13 | Binary encoding after sorting 14 | 15 | Count encoding 16 | 17 | Rank encoding 18 | 19 | Rank-change 20 | 21 | Naive Bayes Rate Encoding 22 | 23 | Semantic embedding 24 | 25 | tf.idf 26 | 27 | Luduan terms * 28 | ## Numerical 29 | Binning * 30 | 31 | Rounding 32 | 33 | Log 34 | ## Temporal 35 | Day of week, Hour of day, Weekend/holiday indicators 36 | 37 | Quadrature encodings 38 | 39 | Distance to event 40 | 41 | Lagged features 42 | ## Geographical 43 | Pre-clustering 44 | 45 | S2 Geo Points 46 | 47 | Proximity to cities 48 | 49 | MSA 50 | 51 | Zip3 52 | ## Word-like and Text 53 | 54 | tf.idf 55 | 56 | Luduan terms 57 | 58 | Semantic embeddings 59 | 60 | Glove 61 | https://nlp.stanford.edu/projects/glove/?source=post_page 62 | 63 | Indicator detection 64 | ## IP Address 65 | Reverse resolution 66 | 67 | CIDR 68 | 69 | CIDR prefix 70 | 71 | ## Missing Data 72 | As a special value (unknown word) 73 | 74 | Means 75 | 76 | Reverse model 77 | 78 | ## Consolidation 79 | Unknown word 80 | 81 | Stemming 82 | 83 | ## Parsing and Modeling 84 | User agent 85 | 86 | IP domains 87 | 88 | Email address 89 | 90 | Headers 91 | 92 | Referrer 93 | 94 | 5P energy models 95 | ## Scaling 96 | Q scaling 97 | 98 | Z scaling 99 | 100 | Min-max scaling 101 | 102 | Log 103 | ## Cross modeling 104 | Other models 105 | 106 | Modeled structure 107 | 108 | Word2vec 109 | -------------------------------------------------------------------------------- /categorical.md: -------------------------------------------------------------------------------- 1 | # Categorical features 2 | There are many ways to transform a categorical variable with high cardinality. I will describe the following methods here: 3 | 4 | *one-hot* -- the simplest of all techniques, very useful in a number of settings with low cardinality 5 | 6 | *rare-word tagging* -- this allows the cardinality to vary 7 | 8 | *frequency binning* -- often of great use in anomaly detection, fraud prevention and intrusion detection 9 | 10 | *random embedding* -- the grandparent of modern semantic embedding 11 | 12 | *the hash trick* -- random embedding for people who like binary features 13 | 14 | *Luduan features* -- how to encode operational structure efficiently by observing cooccurrence. 15 | 16 | ## Background 17 | But before we get into all this too much, let’s settle some terminology. Let’s take “low cardinality” to be less than 10 or so, “medium cardinality” to be from 10 to 100, “high cardinality” to be 100 to 1000 and “ultra high cardinality” to be above 1000. These boundaries aren’t hard and fast and we should be willing to wiggle a bit on them. Some categorical variables are ordered (birth year) and some are not (car make). We should also keep in mind categorical variables where we do not know the full cardinality, be it low, medium, high or ultra high. 18 | 19 | Examples of features in these different cardinality ranges include: 20 | 21 | *low cardinality* -- gender, rotation direction (CW or CCW), cardinal points (N, S, E, W, NE, etc), phone type (land line, cell, VOIP) 22 | 23 | *medium cardinality* -- car make, telephone brand, key on keyboard, US state 24 | 25 | *high cardinality* -- country of birth, birth year 26 | 27 | *ultra high cardinality* -- word from text, URL, domain name, IP address, post code 28 | 29 | Examples of categorical variables where we can’t easily know the full cardinality with absolute certainty or where we might wish to allow for change might include brand names, countries and gender. Examples of categorical variables where the cardinality is not just not currently known, but is growing continually include domain names, IP addresses and words from text. 30 | 31 | There are lots of techniques and tricks for handling variables of this general class. Which techniques work best depend a lot on the rough cardinality and whether the cardinality is fixed. When cardinality is even moderately high, you start to encounter problems due to the fact that some values will be much more rare than others. As you get to ultra high cardinality, this problem becomes very severe as frequencies can vary between different values by many orders of magnitude. 32 | 33 | When the cardinality is not fixed or is simply not yet known, a different problem arises in that essentially all machine learning techniques want to deal with a fixed number of input variables. That means that we have to figure out some way to convert an unbounded kind of input into a strictly bounded number of inputs without losing information. With numerical features, we also have a large cardinality, but the mathematical structure of numbers such as distance and ordering usually allows us to treat such inputs much more simply. With true categorical values, we have to discover or impose this structure. 34 | 35 | ## One-hot Encoding 36 | 37 | In the simplest of all cases, we have low and fixed cardinality. In such a case, we can have a model feature for each possible value that the variable can take on and set all of these features to zero except for the one corresponding to the value of our categorical feature. This works great and is known as one-hot encoding. This might lead to encoding days of the week as Monday = (1,0,0,0,0,0,0), Tuesday = (0,1,0,0,0,0,0) and so on. 38 | 39 | ## Rare-word Collapse 40 | 41 | As the cardinality increases, however, this works less and less well, largely because some values will be much more rare than other values and increasing the number of features to a model beyond a few thousand generally has very bad effects on the ability to build a model. Even worse, high cardinality generally goes hand in hand with indefinite cardinality. Even so, it is common in natural language models to simply group all but the 𝑘 most common values of a categorical variable as a single “RARE-WORD” value. This reduction allows us to have a 𝑘+1-hot encoding. If 𝑘 is big enough, this will work pretty well because the “RARE-WORD” value will itself be pretty rare. 42 | 43 | ## Frequency Binning 44 | 45 | We can take this idea of collapsing to a radical and surprisingly effective extreme. This is done by reducing a high cardinality categorical feature to a single number that represents the frequency of the value of the feature. Alternately, you might use the quantile of the rank of the frequency, or bin the frequency of the value. In any case, this works in applications where a specific value isn’t as important as the fact that you have seen a surprisingly rare value. Consider, network intrusion detection where suddenly seeing lots of data going to a previously almost unknown external network address could be very informative. It doesn’t really matter which previously unknown address is being used, just that it is previously unknown or nearly so. Note that you can combine this kind of frequency feature with other features as well so that you not only get these desirable novelty effects, but you can keep the precise resolution about exactly which categorical value was seen. 46 | 47 | ## Random Vector Embedding 48 | 49 | Another way to keep a fixed sized encoding with values of large or unknown cardinality without collapsing rare values together is to use a random embedding or projection. One simple way to do this is convert each possible value to a 50–300 dimensional vector. Commonly, these vectors will be constrained to have unit length You can actually do this in a consistent way without knowing the categorical values ahead of time by using the actual value as a seed for a random number generator and then using that generator to sample a “random” unit vector. If the dimension of the vector is high enough (say 100 to 500 dimensions or more) then the vectors corresponding to any two categorical values will be nearly orthogonal with high probability. This quasi-orthogonality of random vectors is very handy since it makes each different value be sufficiently different from all other values so that machine learning algorithms can pick out important structure. 50 | 51 | These random vectors can also be tuned somewhat using simple techniques to build a semantic space, or using more advanced techniques to get some very fancy results. Such random projections can be used to do linear algebraic decompositions as well. 52 | 53 | [1] Context Vectors: A Step Toward a “Grand Unified Representation” 54 | https://link.springer.com/chapter/10.1007/10719871_14 55 | 56 | [2] Word2Vec 57 | https://en.wikipedia.org/wiki/Word2vec 58 | 59 | [3] BERT word embeddings 60 | https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/ 61 | 62 | [4] Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decomposition 63 | https://arxiv.org/pdf/0909.4061.pdf 64 | 65 | 66 | 67 | ## The Hash Trick 68 | 69 | We can use different random projections to get something much more like the one-hot encoding as well without having to collapse rare features or, indeed, without having to even know which features are rare. For each distinct value, we can encode that value using a 𝑛 binary values of which exactly 𝑘 randomly chosen values are set to 1 with the rest set to 0 using the same seeding trick as before. Commonly 𝑛 is taken to be a few thousand while 𝑘 can be relatively small, typically less than 20. When 𝑘=1, we get one-hot encoding again. This technique works because of the same mathematical techniques as random projection, but is generally described more in terms of analogies to Bloom filters. 70 | 71 | ## Luduan Features 72 | 73 | Finally, you can derive a numerical features by grouping values that have anomalous correlation with some objective observation and then weighting by the underlying frequency of the feature value (or the inverse log of that frequency). This reduction is known as a Luduan feature and is based on the use of log-likelihood ratio tests for finding interesting cooccurrence. I gave a talk on using these techniques for transaction mining some time ago that described how to do this. 74 | 75 | [5] Finding Structure in Text, Genome and Other Symbolic Sequences 76 | https://arxiv.org/abs/1207.1847 77 | 78 | [6] Accurate Methods for the Statistics of Surprise and Coincidence 79 | https://aclweb.org/anthology/J93-1003 80 | 81 | [7] Mining Transactional Data 82 | https://www.slideshare.net/MapRTechnologies/transactional-data-mining-ted-dunning-2004 83 | -------------------------------------------------------------------------------- /java/feature-examples.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.tdunning 8 | feature-examples 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 8 17 | 8 18 | 19 | 20 | 21 | 22 | 23 | 24 | com.google.guava 25 | guava 26 | 28.0-jre 27 | 28 | 29 | com.google.re2j 30 | re2j 31 | 1.3 32 | 33 | 34 | com.github.haifengl 35 | smile-core 36 | 1.5.4 37 | 38 | 39 | com.github.haifengl 40 | smile-nlp 41 | 1.5.3 42 | 43 | 44 | junit 45 | junit 46 | 4.13.1 47 | test 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /java/src/main/java/com/tdunning/examples/CooData.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import smile.math.matrix.SparseMatrix; 4 | 5 | import java.util.Arrays; 6 | 7 | /** 8 | * Data structure that is used to build a sparse matrix if given a bunch of i,j,x triples. 9 | * This just maintains arrays of i, j, and m[i,j], but has some cleverness about reallocating 10 | * as data arrives and about sorting into a good order for being a real SparseMatrix. 11 | *

12 | * This is really inefficient for anything except accumulating entries. Any real processing will 13 | * need to be done by converting to csr or csc sparse formats. The SMILE SparseMatrix is a csc format. 14 | */ 15 | public class CooData { 16 | private int entriesAddedSinceCompression = 0; 17 | private ElementOrdering lastOrdering = ElementOrdering.NONE; 18 | 19 | int entries; 20 | private int nrows; 21 | private int ncols; 22 | int[] rows; 23 | int[] cols; 24 | double[] values; 25 | 26 | public CooData() { 27 | nrows = -1; 28 | ncols = -1; 29 | init(100, 100); 30 | } 31 | 32 | public CooData(int rows, int cols) { 33 | this.nrows = rows; 34 | this.ncols = cols; 35 | init(rows, cols); 36 | } 37 | 38 | private void init(int rows, int cols) { 39 | int n = Math.max(rows, cols) + 5; 40 | this.rows = new int[n]; 41 | this.cols = new int[n]; 42 | this.values = new double[n]; 43 | } 44 | 45 | /** 46 | * Adds a value to the value already at i,j. 47 | * 48 | * @param i The row 49 | * @param j The column 50 | * @param x The increment to the value at A[i,j] 51 | */ 52 | public void add(int i, int j, double x) { 53 | if (i < 0 || (nrows != -1 && i >= nrows)) { 54 | throw new IllegalArgumentException(String.format("Invalid row %d (should be in [0,%d)", i, nrows)); 55 | } 56 | if (j < 0 || (ncols != -1 && i >= ncols)) { 57 | throw new IllegalArgumentException(String.format("Invalid row %d (should be in [0,%d)", j, ncols)); 58 | } 59 | 60 | if (entries >= rows.length) { 61 | if (entriesAddedSinceCompression > entries / 4.0) { 62 | compress(ElementOrdering.BY_COL, false); 63 | } 64 | int n = 2 * entries; 65 | if (n > rows.length) { 66 | rows = Arrays.copyOf(rows, n); 67 | cols = Arrays.copyOf(cols, n); 68 | values = Arrays.copyOf(values, n); 69 | } 70 | } 71 | rows[entries] = i; 72 | cols[entries] = j; 73 | values[entries] = x; 74 | entries++; 75 | lastOrdering = ElementOrdering.NONE; 76 | entriesAddedSinceCompression++; 77 | } 78 | 79 | /** 80 | * Reorder and aggregate data and indexes to be a proper sparse matrix. 81 | * 82 | * @return The resulting matrix 83 | */ 84 | public SparseMatrix asSparseMatrix() { 85 | compress(ElementOrdering.BY_COL, false); 86 | resolveSizing(); 87 | 88 | // data is now sorted by col, then row 89 | // we just need to make a short column index 90 | // note that we create one last element to point to the end of all data 91 | int[] colIndex = new int[ncols + 1]; 92 | int last = -1; 93 | int j = 0; 94 | for (int k = 0; k < entries; ) { 95 | assert rows[k] >= 0 && rows[k] < nrows; 96 | assert cols[k] >= 0 && cols[k] < ncols; 97 | 98 | while (j <= cols[k]) { 99 | colIndex[j++] = k; 100 | } 101 | last = cols[k]; 102 | while (k < entries && cols[k] == last) { 103 | k++; 104 | } 105 | } 106 | colIndex[ncols] = entries; 107 | return new SparseMatrix(nrows, ncols, values, rows, colIndex); 108 | } 109 | 110 | private void resolveSizing() { 111 | if (ncols == -1 || nrows == -1) { 112 | for (int k = 0; k < entries; k++) { 113 | ncols = Math.max(ncols, cols[k] + 1); 114 | nrows = Math.max(nrows, rows[k] + 1); 115 | } 116 | } 117 | } 118 | 119 | enum ElementOrdering { 120 | NONE, BY_ROW, BY_COL 121 | } 122 | 123 | @SuppressWarnings("WeakerAccess") 124 | public void compress(ElementOrdering elementOrdering, boolean force) { 125 | if (!force && lastOrdering == elementOrdering) { 126 | return; 127 | } 128 | entriesAddedSinceCompression = 0; 129 | lastOrdering = elementOrdering; 130 | 131 | int[] major; 132 | int[] minor; 133 | switch (elementOrdering) { 134 | case BY_ROW: 135 | major = this.rows; 136 | minor = this.cols; 137 | break; 138 | case BY_COL: 139 | default: 140 | major = this.cols; 141 | minor = this.rows; 142 | } 143 | 144 | // first sort everything in row order 145 | int[] order = new int[entries]; 146 | Sort.sort(order, major, 0, entries); 147 | untangle(order, major, 0, entries); 148 | untangle(order, minor, 0, entries); 149 | untangle(order, values, 0, entries); 150 | 151 | // now scan through all the data 152 | int fill = 0; 153 | for (int i = 0; i < entries; ) { 154 | // for each range of constant row number, sort by column 155 | int j = i + 1; 156 | while (j < entries && major[j] == major[i]) { 157 | j++; 158 | } 159 | if (j > i + 1) { 160 | Sort.sort(order, minor, i, j - i); 161 | untangle(order, major, i, j); 162 | untangle(order, minor, i, j); 163 | untangle(order, values, i, j); 164 | } 165 | 166 | // and now collapse ranges of constant column number 167 | for (int k = i; k < j; ) { 168 | int r = major[k]; 169 | int c = minor[k]; 170 | double sum = 0; 171 | for (; k < j && minor[k] == c; k++) { 172 | sum += values[k]; 173 | } 174 | major[fill] = r; 175 | minor[fill] = c; 176 | values[fill] = sum; 177 | fill++; 178 | } 179 | i = j; 180 | } 181 | 182 | entries = fill; 183 | } 184 | 185 | private void untangle(int[] order, int[] values, int start, int end) { 186 | int[] tmp = Arrays.copyOfRange(values, start, end); 187 | for (int i = start; i < end; i++) { 188 | tmp[i - start] = values[order[i]]; 189 | } 190 | System.arraycopy(tmp, 0, values, start, end - start); 191 | } 192 | 193 | private void untangle(int[] order, double[] values, int start, int end) { 194 | double[] tmp = Arrays.copyOfRange(values, start, end); 195 | for (int i = start; i < end; i++) { 196 | tmp[i - start] = values[order[i]]; 197 | } 198 | System.arraycopy(tmp, 0, values, start, end - start); 199 | } 200 | 201 | public void append(CooData other) { 202 | if (entries + other.entries > rows.length) { 203 | int n = entries + other.entries; 204 | rows = Arrays.copyOf(rows, n); 205 | cols = Arrays.copyOf(cols, n); 206 | values = Arrays.copyOf(values, n); 207 | } 208 | System.arraycopy(rows, entries, other.rows, 0, other.entries); 209 | System.arraycopy(cols, entries, other.cols, 0, other.entries); 210 | System.arraycopy(values, entries, other.values, 0, other.entries); 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /java/src/main/java/com/tdunning/examples/Jacobi.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import smile.math.matrix.SparseMatrix; 4 | 5 | import java.util.Arrays; 6 | 7 | /** 8 | * Classic iterative solver for sparse systems. This converges if the matrix A is diagonally dominant or if 9 | * it is symmetrical and positive definite. 10 | */ 11 | public class Jacobi { 12 | private SparseMatrix a; 13 | 14 | public Jacobi(SparseMatrix a) { 15 | if (a.ncols() != a.nrows()) { 16 | throw new IllegalArgumentException("Matrix must be square"); 17 | } 18 | this.a = a; 19 | } 20 | 21 | public double[] solve(double[] b) { 22 | return solve(b, 1e-10, 10000); 23 | } 24 | 25 | public double[] solve(double[] b, double tolerance, int maxIteration) { 26 | final int n = a.ncols(); 27 | if (b.length != n) { 28 | throw new IllegalArgumentException("Must have b vector same size as matrix"); 29 | } 30 | 31 | double[] x = new double[n]; 32 | double[] diagonal = new double[n]; 33 | a.foreachNonzero((i, j, value) -> { 34 | if (i == j) { 35 | diagonal[i] = value; 36 | } 37 | }); 38 | 39 | double dMax = Double.POSITIVE_INFINITY; 40 | int iteration = 0; 41 | while (dMax > tolerance && iteration < maxIteration) { 42 | // z = b - Rx, where R is A except for diagonal elements 43 | double[] tmp = Arrays.copyOf(b, n); 44 | a.foreachNonzero((i, j, value) -> { 45 | if (i != j) { 46 | tmp[i] -= value * x[j]; 47 | } 48 | }); 49 | 50 | dMax = 0; 51 | for (int i = 0; i < n; i++) { 52 | double v = tmp[i] / diagonal[i]; 53 | dMax = Math.max(Math.abs(x[i] - v), dMax); 54 | x[i] = v; 55 | } 56 | iteration++; 57 | System.out.printf("%10.2f\n", dMax); 58 | } 59 | return x; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /java/src/main/java/com/tdunning/examples/Sort.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Ted Dunning under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Originally from t-digest. 18 | */ 19 | 20 | package com.tdunning.examples; 21 | 22 | import java.util.Random; 23 | 24 | /** 25 | * Static sorting methods 26 | */ 27 | public class Sort { 28 | private static final Random prng = new Random(); // for choosing pivots during quicksort 29 | /** 30 | * Quick sort using an index array. On return, 31 | * values[order[i]] is in order as i goes 0..values.length 32 | * 33 | * @param order Indexes into values 34 | * @param values The values to sort. 35 | */ 36 | @SuppressWarnings("WeakerAccess") 37 | public static void sort(int[] order, int[] values) { 38 | sort(order, values, 0, values.length); 39 | } 40 | 41 | /** 42 | * Quick sort using an index array. On return, 43 | * values[order[i]] is in order as i goes 0..n 44 | * 45 | * @param order Indexes into values 46 | * @param values The values to sort. 47 | * @param n The number of values to sort 48 | */ 49 | @SuppressWarnings("WeakerAccess") 50 | public static void sort(int[] order, int[] values, int n) { 51 | sort(order, values, 0, n); 52 | } 53 | 54 | /** 55 | * Quick sort using an index array. On return, 56 | * values[order[i]] is in order as i goes start..n 57 | * 58 | * @param order Indexes into values 59 | * @param values The values to sort. 60 | * @param start The first element to sort 61 | * @param n The number of values to sort 62 | */ 63 | @SuppressWarnings("WeakerAccess") 64 | public static void sort(int[] order, int[] values, int start, int n) { 65 | for (int i = start; i < start + n; i++) { 66 | order[i] = i; 67 | } 68 | quickSort(order, values, start, start + n, 64); 69 | insertionSort(order, values, start, start + n, 64); 70 | } 71 | 72 | /** 73 | * Standard quick sort except that sorting is done on an index array rather than the values themselves 74 | * 75 | * @param order The pre-allocated index array 76 | * @param values The values to sort 77 | * @param start The beginning of the values to sort 78 | * @param end The value after the last value to sort 79 | * @param limit The minimum size to recurse down to. 80 | */ 81 | private static void quickSort(int[] order, int[] values, int start, int end, int limit) { 82 | // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases 83 | while (end - start > limit) { 84 | 85 | // pivot by a random element 86 | int pivotIndex = start + prng.nextInt(end - start); 87 | double pivotValue = values[order[pivotIndex]]; 88 | 89 | // move pivot to beginning of array 90 | swap(order, start, pivotIndex); 91 | 92 | // we use a three way partition because many duplicate values is an important case 93 | 94 | int low = start + 1; // low points to first value not known to be equal to pivotValue 95 | int high = end; // high points to first value > pivotValue 96 | int i = low; // i scans the array 97 | while (i < high) { 98 | // invariant: values[order[k]] == pivotValue for k in [0..low) 99 | // invariant: values[order[k]] < pivotValue for k in [low..i) 100 | // invariant: values[order[k]] > pivotValue for k in [high..end) 101 | // in-loop: i < high 102 | // in-loop: low < high 103 | // in-loop: i >= low 104 | double vi = values[order[i]]; 105 | if (vi == pivotValue) { 106 | if (low != i) { 107 | swap(order, low, i); 108 | } else { 109 | i++; 110 | } 111 | low++; 112 | } else if (vi > pivotValue) { 113 | high--; 114 | swap(order, i, high); 115 | } else { 116 | // vi < pivotValue 117 | i++; 118 | } 119 | } 120 | // invariant: values[order[k]] == pivotValue for k in [0..low) 121 | // invariant: values[order[k]] < pivotValue for k in [low..i) 122 | // invariant: values[order[k]] > pivotValue for k in [high..end) 123 | // assert i == high || low == high therefore, we are done with partition 124 | 125 | // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are > 126 | // we have to move the values equal to the pivot into the middle. To do this, we swap pivot 127 | // values into the top end of the [low,high) range stopping when we run out of destinations 128 | // or when we run out of values to copy 129 | int from = start; 130 | int to = high - 1; 131 | for (i = 0; from < low && to >= low; i++) { 132 | swap(order, from++, to--); 133 | } 134 | if (from == low) { 135 | // ran out of things to copy. This means that the the last destination is the boundary 136 | low = to + 1; 137 | } else { 138 | // ran out of places to copy to. This means that there are uncopied pivots and the 139 | // boundary is at the beginning of those 140 | low = from; 141 | } 142 | 143 | // checkPartition(order, values, pivotValue, start, low, high, end); 144 | 145 | // now recurse, but arrange it so we handle the longer limit by tail recursion 146 | if (low - start < end - high) { 147 | quickSort(order, values, start, low, limit); 148 | 149 | // this is really a way to do 150 | // quickSort(order, values, high, end, limit); 151 | start = high; 152 | } else { 153 | quickSort(order, values, high, end, limit); 154 | // this is really a way to do 155 | // quickSort(order, values, start, low, limit); 156 | end = low; 157 | } 158 | } 159 | } 160 | 161 | /** 162 | * Quick sort in place of several paired arrays. On return, 163 | * keys[...] is in order and the values[] arrays will be 164 | * reordered as well in the same way. 165 | * 166 | * @param key Values to sort on 167 | * @param values The auxilliary values to sort. 168 | */ 169 | @SuppressWarnings("WeakerAccess") 170 | public static void sort(double[] key, double[] ... values) { 171 | sort(key, 0, key.length, values); 172 | } 173 | 174 | /** 175 | * Quick sort using an index array. On return, 176 | * values[order[i]] is in order as i goes start..n 177 | * @param key Values to sort on 178 | * @param start The first element to sort 179 | * @param n The number of values to sort 180 | * @param values The auxilliary values to sort. 181 | */ 182 | @SuppressWarnings("WeakerAccess") 183 | public static void sort(double[] key, int start, int n, double[]... values) { 184 | quickSort(key, values, start, start + n, 8); 185 | insertionSort(key, values, start, start + n, 8); 186 | } 187 | 188 | /** 189 | * Standard quick sort except that sorting rearranges parallel arrays 190 | * 191 | * @param key Values to sort on 192 | * @param values The auxilliary values to sort. 193 | * @param start The beginning of the values to sort 194 | * @param end The value after the last value to sort 195 | * @param limit The minimum size to recurse down to. 196 | */ 197 | private static void quickSort(double[] key, double[][] values, int start, int end, int limit) { 198 | // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases 199 | while (end - start > limit) { 200 | 201 | // median of three values for the pivot 202 | int a = start; 203 | int b = (start + end) / 2; 204 | int c = end - 1; 205 | 206 | int pivotIndex; 207 | double pivotValue; 208 | double va = key[a]; 209 | double vb = key[b]; 210 | double vc = key[c]; 211 | //noinspection Duplicates 212 | if (va > vb) { 213 | if (vc > va) { 214 | // vc > va > vb 215 | pivotIndex = a; 216 | pivotValue = va; 217 | } else { 218 | // va > vb, va >= vc 219 | if (vc < vb) { 220 | // va > vb > vc 221 | pivotIndex = b; 222 | pivotValue = vb; 223 | } else { 224 | // va >= vc >= vb 225 | pivotIndex = c; 226 | pivotValue = vc; 227 | } 228 | } 229 | } else { 230 | // vb >= va 231 | if (vc > vb) { 232 | // vc > vb >= va 233 | pivotIndex = b; 234 | pivotValue = vb; 235 | } else { 236 | // vb >= va, vb >= vc 237 | if (vc < va) { 238 | // vb >= va > vc 239 | pivotIndex = a; 240 | pivotValue = va; 241 | } else { 242 | // vb >= vc >= va 243 | pivotIndex = c; 244 | pivotValue = vc; 245 | } 246 | } 247 | } 248 | 249 | // move pivot to beginning of array 250 | swap(start, pivotIndex, key, values); 251 | 252 | // we use a three way partition because many duplicate values is an important case 253 | 254 | int low = start + 1; // low points to first value not known to be equal to pivotValue 255 | int high = end; // high points to first value > pivotValue 256 | int i = low; // i scans the array 257 | while (i < high) { 258 | // invariant: values[order[k]] == pivotValue for k in [0..low) 259 | // invariant: values[order[k]] < pivotValue for k in [low..i) 260 | // invariant: values[order[k]] > pivotValue for k in [high..end) 261 | // in-loop: i < high 262 | // in-loop: low < high 263 | // in-loop: i >= low 264 | double vi = key[i]; 265 | if (vi == pivotValue) { 266 | if (low != i) { 267 | swap(low, i, key, values); 268 | } else { 269 | i++; 270 | } 271 | low++; 272 | } else if (vi > pivotValue) { 273 | high--; 274 | swap(i, high, key, values); 275 | } else { 276 | // vi < pivotValue 277 | i++; 278 | } 279 | } 280 | // invariant: values[order[k]] == pivotValue for k in [0..low) 281 | // invariant: values[order[k]] < pivotValue for k in [low..i) 282 | // invariant: values[order[k]] > pivotValue for k in [high..end) 283 | // assert i == high || low == high therefore, we are done with partition 284 | 285 | // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are > 286 | // we have to move the values equal to the pivot into the middle. To do this, we swap pivot 287 | // values into the top end of the [low,high) range stopping when we run out of destinations 288 | // or when we run out of values to copy 289 | int from = start; 290 | int to = high - 1; 291 | for (i = 0; from < low && to >= low; i++) { 292 | swap(from++, to--, key, values); 293 | } 294 | if (from == low) { 295 | // ran out of things to copy. This means that the the last destination is the boundary 296 | low = to + 1; 297 | } else { 298 | // ran out of places to copy to. This means that there are uncopied pivots and the 299 | // boundary is at the beginning of those 300 | low = from; 301 | } 302 | 303 | // checkPartition(order, values, pivotValue, start, low, high, end); 304 | 305 | // now recurse, but arrange it so we handle the longer limit by tail recursion 306 | if (low - start < end - high) { 307 | quickSort(key, values, start, low, limit); 308 | 309 | // this is really a way to do 310 | // quickSort(order, values, high, end, limit); 311 | start = high; 312 | } else { 313 | quickSort(key, values, high, end, limit); 314 | // this is really a way to do 315 | // quickSort(order, values, start, low, limit); 316 | end = low; 317 | } 318 | } 319 | } 320 | 321 | 322 | /** 323 | * Limited range insertion sort. We assume that no element has to move more than limit steps 324 | * because quick sort has done its thing. This version works on parallel arrays of keys and values. 325 | * 326 | * @param key The array of keys 327 | * @param values The values we are sorting 328 | * @param start The starting point of the sort 329 | * @param end The ending point of the sort 330 | * @param limit The largest amount of disorder 331 | */ 332 | @SuppressWarnings("SameParameterValue") 333 | private static void insertionSort(double[] key, double[][] values, int start, int end, int limit) { 334 | // loop invariant: all values start ... i-1 are ordered 335 | for (int i = start + 1; i < end; i++) { 336 | double v = key[i]; 337 | int m = Math.max(i - limit, start); 338 | for (int j = i; j >= m; j--) { 339 | if (j == m || key[j - 1] <= v) { 340 | if (j < i) { 341 | System.arraycopy(key, j, key, j + 1, i - j); 342 | key[j] = v; 343 | for (double[] value : values) { 344 | double tmp = value[i]; 345 | System.arraycopy(value, j, value, j + 1, i - j); 346 | value[j] = tmp; 347 | } 348 | } 349 | break; 350 | } 351 | } 352 | } 353 | } 354 | 355 | private static void swap(int[] order, int i, int j) { 356 | int t = order[i]; 357 | order[i] = order[j]; 358 | order[j] = t; 359 | } 360 | 361 | private static void swap(int i, int j, double[] key, double[]...values) { 362 | double t = key[i]; 363 | key[i] = key[j]; 364 | key[j] = t; 365 | 366 | for (int k = 0; k < values.length; k++) { 367 | t = values[k][i]; 368 | values[k][i] = values[k][j]; 369 | values[k][j] = t; 370 | } 371 | } 372 | 373 | /** 374 | * Check that a partition step was done correctly. For debugging and testing. 375 | * 376 | * @param order The array of indexes representing a permutation of the keys. 377 | * @param values The keys to sort. 378 | * @param pivotValue The value that splits the data 379 | * @param start The beginning of the data of interest. 380 | * @param low Values from start (inclusive) to low (exclusive) are < pivotValue. 381 | * @param high Values from low to high are equal to the pivot. 382 | * @param end Values from high to end are above the pivot. 383 | */ 384 | @SuppressWarnings("UnusedDeclaration") 385 | public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) { 386 | if (order.length != values.length) { 387 | throw new IllegalArgumentException("Arguments must be same size"); 388 | } 389 | 390 | if (!(start >= 0 && low >= start && high >= low && end >= high)) { 391 | throw new IllegalArgumentException(String.format("Invalid indices %d, %d, %d, %d", start, low, high, end)); 392 | } 393 | 394 | for (int i = 0; i < low; i++) { 395 | double v = values[order[i]]; 396 | if (v >= pivotValue) { 397 | throw new IllegalArgumentException(String.format("Value greater than pivot at %d", i)); 398 | } 399 | } 400 | 401 | for (int i = low; i < high; i++) { 402 | if (values[order[i]] != pivotValue) { 403 | throw new IllegalArgumentException(String.format("Non-pivot at %d", i)); 404 | } 405 | } 406 | 407 | for (int i = high; i < end; i++) { 408 | double v = values[order[i]]; 409 | if (v <= pivotValue) { 410 | throw new IllegalArgumentException(String.format("Value less than pivot at %d", i)); 411 | } 412 | } 413 | } 414 | 415 | /** 416 | * Limited range insertion sort. We assume that no element has to move more than limit steps 417 | * because quick sort has done its thing. 418 | * 419 | * @param order The permutation index 420 | * @param values The values we are sorting 421 | * @param start Where to start the sort 422 | * @param n How many elements to sort 423 | * @param limit The largest amount of disorder 424 | */ 425 | @SuppressWarnings("SameParameterValue") 426 | private static void insertionSort(int[] order, int[] values, int start, int n, int limit) { 427 | for (int i = start + 1; i < n; i++) { 428 | int t = order[i]; 429 | double v = values[order[i]]; 430 | int m = Math.max(i - limit, start); 431 | for (int j = i; j >= m; j--) { 432 | if (j == 0 || values[order[j - 1]] <= v) { 433 | if (j < i) { 434 | System.arraycopy(order, j, order, j + 1, i - j); 435 | order[j] = t; 436 | } 437 | break; 438 | } 439 | } 440 | } 441 | } 442 | 443 | /** 444 | * Reverses an array in-place. 445 | * 446 | * @param order The array to reverse 447 | */ 448 | @SuppressWarnings("WeakerAccess") 449 | public static void reverse(int[] order) { 450 | reverse(order, 0, order.length); 451 | } 452 | 453 | /** 454 | * Reverses part of an array. See {@link #reverse(int[])} 455 | * 456 | * @param order The array containing the data to reverse. 457 | * @param offset Where to start reversing. 458 | * @param length How many elements to reverse 459 | */ 460 | @SuppressWarnings("WeakerAccess") 461 | public static void reverse(int[] order, int offset, int length) { 462 | for (int i = 0; i < length / 2; i++) { 463 | int t = order[offset + i]; 464 | order[offset + i] = order[offset + length - i - 1]; 465 | order[offset + length - i - 1] = t; 466 | } 467 | } 468 | 469 | /** 470 | * Reverses part of an array. See {@link #reverse(int[])} 471 | * 472 | * @param order The array containing the data to reverse. 473 | * @param offset Where to start reversing. 474 | * @param length How many elements to reverse 475 | */ 476 | @SuppressWarnings({"WeakerAccess", "SameParameterValue"}) 477 | public static void reverse(double[] order, int offset, int length) { 478 | for (int i = 0; i < length / 2; i++) { 479 | double t = order[offset + i]; 480 | order[offset + i] = order[offset + length - i - 1]; 481 | order[offset + length - i - 1] = t; 482 | } 483 | } 484 | } 485 | -------------------------------------------------------------------------------- /java/src/main/java/com/tdunning/examples/VectorText.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import java.util.*; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | import java.util.stream.Collectors; 7 | import java.util.stream.Stream; 8 | import java.util.stream.StreamSupport; 9 | 10 | public class VectorText { 11 | private static Pattern word = Pattern.compile(String.join("", "", 12 | // "([A-Z]\\.)+", // a word 13 | "\\d+:(\\.\\d)+", // a number 14 | "|(\\w+)", // a word 15 | "|(https?://)?(\\w+\\.)(\\w{2,})+([\\w/]+)?", // URL 16 | "|[@#]?\\w+(?:[-']\\w+)*", // twitter-like user reference 17 | "|\\$\\d+(\\.\\d+)?%?", // dollar amount 18 | "|\\\\[Uu]\\w+", // normal word 19 | "|\\\\[Uu]\\w+'t" // contraction 20 | )); 21 | 22 | @SuppressWarnings("WeakerAccess") 23 | public static Stream tokenize(CharSequence s) { 24 | Iterator is = new Iterator() { 25 | int position = 0; 26 | Matcher m = word.matcher(s); 27 | 28 | @Override 29 | public boolean hasNext() { 30 | return m.find(position); 31 | } 32 | 33 | @Override 34 | public String next() { 35 | position = m.end(); 36 | return m.group().toLowerCase(); 37 | } 38 | }; 39 | int characteristics = Spliterator.DISTINCT | Spliterator.SORTED | Spliterator.IMMUTABLE; 40 | Spliterator spliterator = Spliterators.spliteratorUnknownSize(is, characteristics); 41 | 42 | return StreamSupport.stream(spliterator, false); 43 | } 44 | 45 | @SuppressWarnings("WeakerAccess") 46 | public static List tokenizeAsList(CharSequence s) { 47 | return tokenize(s).collect(Collectors.toList()); 48 | } 49 | 50 | public static int[] vectorize(Map dictionary, String s) { 51 | int[] result = new int[dictionary.size()]; 52 | VectorText.tokenize(s).forEach(w -> { 53 | if (dictionary.containsKey(w)) { 54 | result[dictionary.get(w)] = 1; 55 | } 56 | }); 57 | return result; 58 | } 59 | 60 | public static int[] count(Map dictionary, String s) { 61 | int[] result = new int[dictionary.size()]; 62 | VectorText.tokenize(s).forEach(w -> { 63 | if (dictionary.containsKey(w)) { 64 | result[dictionary.get(w)]++; 65 | } 66 | }); 67 | return result; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /java/src/test/java/com/tdunning/examples/CooDataTest.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import org.junit.Test; 4 | import smile.math.matrix.SparseMatrix; 5 | 6 | import java.util.*; 7 | 8 | import static org.junit.Assert.*; 9 | 10 | public class CooDataTest { 11 | 12 | public static final int ROWS = 100000; 13 | public static final int COLS = 100000; 14 | 15 | @Test 16 | public void basics() { 17 | Random rand = new Random(); 18 | CooData m = new CooData(ROWS, COLS); 19 | Map ref = new TreeMap<>(); 20 | for (int step = 0; step < 100000; step++) { 21 | int i; 22 | int j; 23 | 24 | if (rand.nextDouble() < 0.2) { 25 | i = rand.nextInt(20); 26 | j = rand.nextInt(20); 27 | } else { 28 | i = (int) (-10000 * Math.log(rand.nextDouble())); 29 | if (i >= ROWS) { 30 | i = ROWS - 1; 31 | } 32 | j = (int) (-10000 * Math.log(rand.nextDouble())); 33 | if (j >= COLS) { 34 | j = COLS - 1; 35 | } 36 | } 37 | double x = rand.nextGaussian(); 38 | Pair k = new Pair(i, j); 39 | if (ref.containsKey(k)) { 40 | ref.put(k, ref.get(k) + x); 41 | } else { 42 | ref.put(k, x); 43 | } 44 | m.add(i, j, x); 45 | } 46 | m.compress(CooData.ElementOrdering.BY_ROW, true); 47 | 48 | for (int step = 0; step < m.entries; step++) { 49 | assertEquals(ref.get(new Pair(m.rows[step], m.cols[step])), m.values[step], 0); 50 | } 51 | } 52 | 53 | @Test 54 | public void small() { 55 | CooData m = new CooData(5, 7); 56 | for (int i = 0; i < 5; i++) { 57 | for (int j = 0; j < 7; j++) { 58 | m.add(i, j, 100 * i + j); 59 | } 60 | } 61 | for (int i = 0; i < 5; i++) { 62 | m.add(i, i, 2); 63 | m.add(i, i + 1, 3); 64 | } 65 | m.compress(CooData.ElementOrdering.BY_ROW, true); 66 | int k = 0; 67 | for (int i = 0; i < 5; i++) { 68 | for (int j = 0; j < 7; j++) { 69 | assertEquals(m.rows[k], i); 70 | assertEquals(m.cols[k], j); 71 | assertEquals(m.values[k], 100 * i + j + ((i == j) ? 2 : 0) + ((j == i + 1) ? 3 : 0), 0); 72 | k++; 73 | } 74 | } 75 | 76 | SparseMatrix mx = m.asSparseMatrix(); 77 | k = 0; 78 | for (int j = 0; j < 7; j++) { 79 | for (int i = 0; i < 5; i++) { 80 | assertEquals(m.rows[k], i); 81 | assertEquals(m.cols[k], j); 82 | assertEquals(m.values[k], 100 * i + j + ((i == j) ? 2 : 0) + ((j == i + 1) ? 3 : 0), 0); 83 | assertEquals(m.values[k], mx.get(i,j), 0); 84 | k++; 85 | } 86 | } 87 | 88 | 89 | } 90 | 91 | private class Pair implements Comparable { 92 | int i, j; 93 | 94 | public Pair(int i, int j) { 95 | this.i = i; 96 | this.j = j; 97 | } 98 | 99 | @Override 100 | public boolean equals(Object o) { 101 | if (this == o) return true; 102 | if (o == null || getClass() != o.getClass()) return false; 103 | Pair pair = (Pair) o; 104 | return i == pair.i && 105 | j == pair.j; 106 | } 107 | 108 | @Override 109 | public int hashCode() { 110 | return 31 * i + j; 111 | } 112 | 113 | @Override 114 | public int compareTo(Pair other) { 115 | int r = this.i - other.j; 116 | if (r == 0) { 117 | return this.j - other.j; 118 | } else { 119 | return r; 120 | } 121 | } 122 | } 123 | } -------------------------------------------------------------------------------- /java/src/test/java/com/tdunning/examples/JacobiTest.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import org.junit.Test; 4 | import smile.math.matrix.SparseMatrix; 5 | 6 | import static org.junit.Assert.*; 7 | 8 | public class JacobiTest { 9 | 10 | @Test 11 | public void solve() { 12 | CooData connectionData = new CooData(); 13 | // 100 x 100 mesh encoded as 10,000 elements in a vector 14 | for (int i = 0; i < 100; i++) { 15 | for (int j = 0; j < 100; j++) { 16 | int k0 = coord(i, j); 17 | double sum = 0; 18 | for (int dx = -1; dx <= 1; dx++) { 19 | for (int dy = -1; dy <= 1; dy++) { 20 | if ((dx != 0 || dy != 0) && i + dx >= 0 && i + dx < 100 && j + dy >= 0 && j + dy < 100) { 21 | double w = 0.125; 22 | sum += w; 23 | connectionData.add(k0, coord(i + dx, j + dy), w); 24 | } 25 | } 26 | } 27 | connectionData.add(k0, k0, -sum); 28 | } 29 | } 30 | 31 | SparseMatrix transfer = connectionData.asSparseMatrix(); 32 | 33 | Jacobi jSolver = new Jacobi(transfer); 34 | double[] b = new double[10000]; 35 | for (int i = 0; i < 100; i++) { 36 | b[coord(i, 0)] = 1; 37 | b[coord(i, 99)] = -1; 38 | } 39 | double[] x = jSolver.solve(b); 40 | for (int j = 0; j < 10; j++) { 41 | System.out.printf("%.2f ", x[j]); 42 | } 43 | for (int j = 10; j < 100; j += 5) { 44 | System.out.printf("%.2f ", x[j]); 45 | } 46 | System.out.printf("\n"); 47 | 48 | } 49 | 50 | private int coord(int i, int j) { 51 | return 100 * i + j; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /java/src/test/java/com/tdunning/examples/VectorTextTest.java: -------------------------------------------------------------------------------- 1 | package com.tdunning.examples; 2 | 3 | import com.google.common.collect.HashMultiset; 4 | import com.google.common.collect.Multiset; 5 | import org.junit.Test; 6 | import smile.math.matrix.DenseMatrix; 7 | import smile.math.matrix.JMatrix; 8 | import smile.math.matrix.SparseMatrix; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | import java.nio.charset.StandardCharsets; 13 | import java.nio.file.Files; 14 | import java.nio.file.Paths; 15 | import java.util.*; 16 | import java.util.concurrent.atomic.AtomicInteger; 17 | import java.util.function.Function; 18 | import java.util.stream.Collectors; 19 | import java.util.stream.Stream; 20 | 21 | import static org.junit.Assert.*; 22 | 23 | public class VectorTextTest { 24 | private String sample = "We stayed for 5 nights last week. " + 25 | "The cold food of fruit/pastries/cereals is great. " + 26 | "$10 for a small OJ!! "; 27 | private String[] tokens = { 28 | "we", "stayed", "for", "5", "nights", "last", "week", 29 | "the", "cold", "food", "of", "fruit", "pastries", 30 | "cereals", "is", "great", "$10", "for", "a", "small", "oj" 31 | }; 32 | 33 | @org.junit.Test 34 | public void tokenizeAsStream() { 35 | Stream ref = Stream.of(tokens); 36 | Iterator ix = ref.iterator(); 37 | VectorText.tokenize(sample) 38 | .forEachOrdered(s -> assertEquals(ix.next(), s)); 39 | assertFalse(ix.hasNext()); 40 | } 41 | 42 | @org.junit.Test 43 | public void tokenize() { 44 | Iterator is = Arrays.asList(tokens).iterator(); 45 | for (String s : VectorText.tokenizeAsList(sample)) { 46 | assertEquals(is.next(), s); 47 | } 48 | } 49 | 50 | @Test 51 | public void vectorize() { 52 | // build a dictionary of all words we see in a subset of the text 53 | Map dict = VectorText.tokenize(sample.split("\\.")[0]).collect( 54 | TreeMap::new, 55 | (Map d, String s) -> d.put(s, d.size()), 56 | Map::putAll 57 | ); 58 | dict.remove("5"); 59 | dict.put("tiger", dict.size()); 60 | 61 | int[] v = VectorText.vectorize(dict, sample); 62 | assertArrayEquals(new int[]{1, 1, 1, 0, 1, 1, 1}, v); 63 | } 64 | 65 | @Test 66 | public void count() { 67 | // build a dictionary of all words we see in a subset of the text 68 | Map dict = VectorText.tokenize(sample.split("\\.")[0]).collect( 69 | TreeMap::new, 70 | (Map d, String s) -> d.put(s, d.size()), 71 | Map::putAll 72 | ); 73 | dict.remove("5"); 74 | dict.put("tiger", dict.size()); 75 | 76 | int[] v = VectorText.count(dict, sample); 77 | assertArrayEquals(new int[]{1, 1, 2, 0, 1, 1, 1}, v); 78 | } 79 | 80 | @Test 81 | public void gloveVectors() throws IOException { 82 | int nDocs = 50000; 83 | Progress p = new Progress(); 84 | 85 | AtomicInteger docCount = new AtomicInteger(); 86 | double t0 = System.nanoTime() / 1e9; 87 | Multiset counts = docs(p, nDocs) 88 | .flatMap(VectorText::tokenize) 89 | .collect( 90 | HashMultiset::create, 91 | (strings, element) -> { 92 | docCount.incrementAndGet(); 93 | strings.add(element); 94 | }, 95 | HashMultiset::addAll); 96 | AtomicInteger wordCount = new AtomicInteger(); 97 | Map dict = counts.elementSet().stream() 98 | .filter(w -> counts.count(w) > 3) 99 | .collect( 100 | TreeMap::new, 101 | (d, w) -> d.put(w, wordCount.getAndIncrement()), 102 | TreeMap::putAll); 103 | List undict = new ArrayList<>(dict.keySet()); 104 | 105 | Progress p1 = new Progress(); 106 | DenseMatrix wordVectors = Files.lines(Paths.get("/Users/tdunning/Downloads/glove.6B/glove.6B.100d.txt")) 107 | .collect( 108 | () -> new JMatrix(dict.size(), 100), 109 | (DenseMatrix m, String rawWordVector) -> { 110 | p1.log(); 111 | int i = rawWordVector.indexOf(' '); 112 | String word = rawWordVector.substring(0, i); 113 | if (dict.containsKey(word)) { 114 | int row = dict.get(word); 115 | int j = 0; 116 | for (String v : rawWordVector.substring(i + 1).split(" ")) { 117 | try { 118 | m.set(row, j++, Double.parseDouble(v)); 119 | } catch (NumberFormatException e) { 120 | System.out.printf("Error in %s\n%s\n", v, rawWordVector); 121 | } 122 | } 123 | } 124 | }, 125 | DenseMatrix::add 126 | ); 127 | 128 | DenseMatrix idf = new JMatrix(dict.size(), 1); 129 | for (String w : dict.keySet()) { 130 | idf.set(dict.get(w), 0, Math.log(counts.size() / counts.count(w))); 131 | } 132 | 133 | docs(p, 100) 134 | .forEach( 135 | doc -> { 136 | // for each document, build out sum of idf-weighted one-hot vectors 137 | DenseMatrix docVector = new JMatrix(100, 1); 138 | VectorText.tokenize(doc) 139 | .filter(dict::containsKey) 140 | .forEach( 141 | w -> { 142 | int iw = dict.get(w); 143 | for (int i = 0; i < 100; i++) { 144 | docVector.add(i, 0, wordVectors.get(iw, i) * idf.get(iw, 0)); 145 | } 146 | } 147 | ); 148 | 149 | // now multiply back at the word vectors to find nearest terms 150 | // (dict.size() x 100) * (100 x 1) => (dict.size() x 1) 151 | DenseMatrix r = wordVectors.abmm(docVector); 152 | 153 | // find words with highest score 154 | PriorityQueue pq = new PriorityQueue<>(Comparator.comparingDouble(a -> a.score)); 155 | for (int i = 0; i < r.nrows(); i++) { 156 | pq.add(new ScoredPair(i, 0, r.get(i, 0))); 157 | while (pq.size() > 50) { 158 | pq.poll(); 159 | } 160 | } 161 | 162 | // reverse into descending order 163 | List best = pq.stream() 164 | .map(scoredItem -> scoredItem.i) 165 | .collect(Collectors.toList()); 166 | Collections.reverse(best); 167 | 168 | // and let's take a look 169 | System.out.printf("%s\n ", doc.substring(0, Math.min(50, doc.length()))); 170 | for (Integer w : best) { 171 | System.out.printf(" %s", undict.get(w)); 172 | } 173 | System.out.printf("\n"); 174 | } 175 | ); 176 | } 177 | 178 | private static class Progress { 179 | int step = 1; 180 | int order = 1; 181 | 182 | int count = 0; 183 | double t0 = System.nanoTime() * 1e-9; 184 | private int oldCount = 0; 185 | 186 | void log() { 187 | count++; 188 | if (count == step * order) { 189 | double t1 = System.nanoTime() * 1e-9; 190 | double rate = (count - oldCount) / (t1 - t0); 191 | t0 = t1; 192 | oldCount = count; 193 | System.out.printf("%d (%.0f / sec)\n", count, rate); 194 | 195 | step = (int) (2.51 * step); 196 | if (step > 10) { 197 | step = 1; 198 | order *= 10; 199 | } 200 | } 201 | } 202 | } 203 | 204 | @Test 205 | public void documentSpeed() throws IOException { 206 | int nDocs = -1; 207 | double frequencyCut = 1000; 208 | int minScore = 12; 209 | int maxAssociates = 100; 210 | 211 | double t0 = System.nanoTime() / 1e9; 212 | Progress p = new Progress(); 213 | // count all the words in our corpus 214 | Multiset counts = docs(p, nDocs) 215 | .flatMap(VectorText::tokenize) 216 | .collect( 217 | HashMultiset::create, 218 | Multiset::add, 219 | HashMultiset::addAll); 220 | System.out.printf("%d total terms processed\n", counts.size()); 221 | // build a dictionary with words that occur sufficiently 222 | Map dict = counts.stream() 223 | .filter(w -> counts.count(w) > 3) 224 | .collect( 225 | TreeMap::new, 226 | (d, w) -> d.put(w, d.size()), 227 | TreeMap::putAll); 228 | 229 | // invert our dictionary as well 230 | Map undict = new HashMap<>(); 231 | for (String w : dict.keySet()) { 232 | undict.put(dict.get(w), w); 233 | } 234 | 235 | 236 | double t1 = System.nanoTime() / 1e9; 237 | System.out.printf("built dictionaries %.1f MB/s\n", new File("/Users/tdunning/tmp/OpinRank/hotels.txt").length() / (t1 - t0) / 1e6); 238 | p = new Progress(); 239 | 240 | Random rand = new Random(); 241 | 242 | // print some documents out for reference and checking 243 | AtomicInteger id = new AtomicInteger(0); 244 | Map> ref = docs(p, 10) 245 | .collect( 246 | TreeMap::new, 247 | (m, raw) -> { 248 | int currentDoc = id.getAndIncrement(); 249 | // downsample our words according to limit max frequency 250 | // and translate to integer form 251 | Set words = VectorText.tokenize(raw) 252 | .filter(w -> dict.containsKey(w) && (rand.nextDouble() < frequencyCut / counts.count(w))) 253 | .map(w -> w + "-" + dict.get(w)) 254 | .collect(Collectors.toSet()); 255 | m.put(currentDoc, words); 256 | }, 257 | Map::putAll); 258 | 259 | for (Integer docId : ref.keySet()) { 260 | System.out.printf("%d: (", docId); 261 | for (String w : ref.get(docId)) { 262 | System.out.printf("%s ", w); 263 | } 264 | System.out.printf(")\n"); 265 | } 266 | System.out.printf("\n"); 267 | 268 | p = new Progress(); 269 | 270 | // do the cooccurrence counting with downsampling of common items 271 | t0 = System.nanoTime() / 1e9; 272 | AtomicInteger docid = new AtomicInteger(0); 273 | CooData binaryTerms = docs(p, nDocs) 274 | .collect( 275 | CooData::new, 276 | (CooData m, String raw) -> { 277 | int currentDoc = docid.getAndIncrement(); 278 | // downsample our words according to limit max frequency 279 | // and translate to integer form 280 | CooData words = VectorText.tokenize(raw) 281 | .filter(w -> dict.containsKey(w) && (rand.nextDouble() < frequencyCut / counts.count(w))) 282 | .collect( 283 | () -> m, 284 | (CooData mx, String w) -> mx.add(currentDoc, dict.get(w), 1.0), 285 | CooData::append); 286 | }, 287 | CooData::append); 288 | binaryTerms.compress(CooData.ElementOrdering.BY_COL, false); 289 | for (int k = 0; k < binaryTerms.entries; k++) { 290 | binaryTerms.values[k] = 1; 291 | } 292 | t1 = System.nanoTime() / 1e9; 293 | System.out.printf("build doc matrix %.1f MB/s\n", new File("/Users/tdunning/tmp/OpinRank/hotels.txt").length() / (t1 - t0) / 1e6); 294 | 295 | SparseMatrix docByTerms = binaryTerms.asSparseMatrix(); 296 | double[] finalCounts = new double[docByTerms.ncols()]; 297 | docByTerms.foreachNonzero( 298 | (doc, word, k) -> { 299 | finalCounts[word]++; 300 | }); 301 | int totalDocuments = docByTerms.nrows(); 302 | int totalWords = docByTerms.ncols(); 303 | 304 | System.out.printf("doc matrix is %d x %d (%d vs %d non-zeros)\n", docByTerms.nrows(), docByTerms.ncols(), docByTerms.size(), binaryTerms.entries); 305 | SparseMatrix cooc = docByTerms.ata(); 306 | System.out.printf("%d x %d (%d non-zeros)\n", cooc.nrows(), cooc.ncols(), cooc.size()); 307 | 308 | // build associates matrix for words 309 | CooData rawConnections = new CooData(cooc.nrows(), cooc.ncols()); 310 | for (int word = 0; word < totalWords; word++) { 311 | PriorityQueue highScores = new PriorityQueue<>(Comparator.comparingDouble(t12 -> t12.score)); 312 | 313 | // scan through each column, scoring cooccurrences 314 | cooc.foreachNonzero(word, word + 1, 315 | (w1, w2, k11) -> { 316 | double k1x = finalCounts[w1]; 317 | double kx1 = finalCounts[w2]; 318 | double k12 = k1x - k11; 319 | double k21 = kx1 - k11; 320 | double k22 = totalDocuments - k11 - k12 - k21; 321 | double score = llr(k11, k12, k21, k22); 322 | if (score > minScore && (highScores.size() < maxAssociates || score > highScores.peek().score)) { 323 | highScores.add(new ScoredPair(w1, w2, score)); 324 | } 325 | while (highScores.size() > maxAssociates) { 326 | highScores.poll(); 327 | } 328 | }); 329 | while (highScores.size() > 0) { 330 | ScoredPair associate = highScores.poll(); 331 | rawConnections.add(associate.i, associate.j, 1); 332 | } 333 | } 334 | 335 | SparseMatrix associates = rawConnections.asSparseMatrix(); 336 | SparseMatrix similar = associates.ata(); 337 | for (String w : new String[]{"wild", "bad", "good", "lovely", "hotel", "rail"}) { 338 | System.out.printf("%s: ", w); 339 | similar.foreachNonzero(dict.get(w), dict.get(w) + 1, 340 | (w1, w2, x) -> { 341 | if (x > 8) { 342 | System.out.printf("%s-%.0f ", undict.get(w1), x); 343 | } 344 | }); 345 | System.out.printf("\n"); 346 | } 347 | } 348 | 349 | private double h(double... kxx) { 350 | double sum = 0; 351 | for (double k : kxx) { 352 | sum += k; 353 | } 354 | double r = 0; 355 | for (double k : kxx) { 356 | if (k > 0) { 357 | r -= k * Math.log(k / sum); 358 | } 359 | } 360 | return r; 361 | } 362 | 363 | private double llr(double k11, double k12, double k21, double k22) { 364 | return 2 * (h(k11 + k12, k21 + k22) + h(k11 + k21, k12 + k22) - h(k11, k12, k21, k22)); 365 | } 366 | 367 | @Test 368 | public void testHash() { 369 | int[] counts = new int[65536]; 370 | for (int i = 0; i < 65536; i++) { 371 | for (int j = 0; j < 65536; j++) { 372 | int k = new IntPair(i, j).hashCode(); 373 | k = k % counts.length; 374 | if (k < 0) { 375 | k += counts.length; 376 | } 377 | counts[k]++; 378 | } 379 | } 380 | int[] tmp = Arrays.copyOf(counts, counts.length); 381 | Arrays.sort(tmp); 382 | 383 | double qSoFar = 0; 384 | System.out.printf("%10.3f %d\n", qSoFar, 0); 385 | for (double q = 0; q < 0.9; q += 0.1) { 386 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]); 387 | } 388 | for (double q = 0.9; q < 0.99; q += 0.01) { 389 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]); 390 | } 391 | for (double q = 0.99; q < 1; q += 0.001) { 392 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]); 393 | } 394 | 395 | System.out.printf("\n\nbig\n"); 396 | int last = 0; 397 | for (int i = 0; i < 65536; i++) { 398 | if (counts[i] >= 1082052) { 399 | System.out.printf("%10x %10d %10d\n", i, i, i - last); 400 | last = i; 401 | } 402 | } 403 | System.out.printf("\nend\n"); 404 | } 405 | 406 | private class IntPair { 407 | public IntPair(int i, int j) { 408 | this.i = i; 409 | this.j = j; 410 | } 411 | 412 | int i, j; 413 | 414 | @Override 415 | public boolean equals(Object o) { 416 | if (this == o) return true; 417 | if (o == null || getClass() != o.getClass()) return false; 418 | IntPair intPair = (IntPair) o; 419 | return i == intPair.i && 420 | j == intPair.j; 421 | } 422 | 423 | @Override 424 | public int hashCode() { 425 | int seed = 3; 426 | // murmur is nice for general bit mixing, but it has some nasty favored patterns 427 | return 1037 * murmur(seed, i, j) + 17 * i + 53 * j; 428 | } 429 | 430 | private int murmur(int seed, int i, int j) { 431 | // one round of murmur 432 | int c1 = 0xcc9e2d51; 433 | int c2 = 0x1b873593; 434 | 435 | int k = i; 436 | k *= c1; 437 | k = (k << 15) | (k >> 17); 438 | k *= c2; 439 | 440 | int h = seed ^ k; 441 | h = (h << 13) | (h >> 19); 442 | h = h * 5 + 0xe6546b64; 443 | 444 | k = j; 445 | k *= c1; 446 | k = (k << 15) | (k >> 17); 447 | k *= c2; 448 | h = h ^ k; 449 | h = (h << 13) | (h >> 19); 450 | h = h * 5 + 0xe6546b64; 451 | 452 | h ^= h >>> 16; 453 | h *= 0x85ebca6b; 454 | h ^= h >>> 13; 455 | h *= 0xc2b2ae35; 456 | h ^= h >>> 16; 457 | 458 | return h; 459 | } 460 | } 461 | 462 | private Stream docs(Progress p) throws IOException { 463 | return docs(p, -1); 464 | } 465 | 466 | private Stream docs(Progress p, int limit) throws IOException { 467 | Function parser = line -> { 468 | p.log(); 469 | int k = line.indexOf('\t'); 470 | if (k >= 0) { 471 | k = line.indexOf('\t', k + 1); 472 | if (k >= 0) { 473 | return line.substring(k + 1); 474 | } else { 475 | throw new IllegalArgumentException("Couldn't find second tab"); 476 | } 477 | } else { 478 | throw new IllegalArgumentException("Couldn't find first tab"); 479 | } 480 | }; 481 | if (limit <= 0) { 482 | return Files.lines(Paths.get("/Users/tdunning/tmp/OpinRank/hotels.txt"), StandardCharsets.ISO_8859_1) 483 | .map(parser); 484 | } else { 485 | return Files.lines(Paths.get("/Users/tdunning/tmp/OpinRank/hotels.txt"), StandardCharsets.ISO_8859_1) 486 | .limit(limit) 487 | .map(parser); 488 | 489 | } 490 | } 491 | 492 | private class ScoredPair { 493 | private final int i; 494 | private final int j; 495 | private final double score; 496 | 497 | public ScoredPair(int i, int j, double score) { 498 | this.i = i; 499 | this.j = j; 500 | this.score = score; 501 | } 502 | 503 | public int getI() { 504 | return i; 505 | } 506 | 507 | public int getJ() { 508 | return j; 509 | } 510 | 511 | public double getScore() { 512 | return score; 513 | } 514 | } 515 | } -------------------------------------------------------------------------------- /src/python/README.md: -------------------------------------------------------------------------------- 1 | # Python Feature Extraction Examples 2 | 3 | ## Methods Illustrated 4 | Symbol combinations 5 | 6 | Quantiles, log-odds and binning 7 | 8 | Reduction to integers using ordinal encoding 9 | 10 | One hot encoding (and counting) 11 | 12 | Frequency encoding and unknown entity 13 | 14 | Luduan features 15 | ## Use Cases 16 | Web log 17 | * Domain, referer, user agent 18 | * referer + domain hashed encoding 19 | 20 | Header fields 21 | * ordering 22 | * language frequency 23 | * language + charset combos 24 | * unknown word 25 | 26 | Purchase amount history 27 | * log-odds, binning on purchase size 28 | * symbol combination (store + quantile-bin) 29 | 30 | Viewership 31 | * time quadrature, one-hot 32 | * one-hot time encodings 33 | 34 | Common point of compromise 35 | * Luduan 36 | 37 | Energy models 38 | * 5P model parameters 39 | * residuals 40 | 41 | Credit card gangs 42 | * card velocity 43 | -------------------------------------------------------------------------------- /src/python/cooc.py: -------------------------------------------------------------------------------- 1 | ### We will use the hotel reviews from https://kavita-ganesan.com/entity-ranking-data 2 | ### to build word representations using cooccurrence 3 | import collections 4 | import random 5 | import re 6 | import time 7 | from math import floor 8 | from typing import Optional, Set, Callable, List 9 | 10 | import numpy as np 11 | import scipy.sparse as sparse 12 | from sklearn import preprocessing as pre 13 | 14 | # We just use a simple regex here to define words 15 | # this is a bit lossy compared to fancier tokenizers 16 | # but it is also about 50x faster 17 | wordPattern = re.compile(r'''(?x) 18 | ([A-Z]\.)+ 19 | |\d+:(\.\d)+ 20 | |(https?://)?(\w+\.)(\w{2,})+([\w/]+)? 21 | |[@#]?\w+(?:[-']\w+)* 22 | |\$\d+(\.\d+)?%? 23 | |\\[Uu]\w+ 24 | |\\[Uu]\w+'t 25 | |\.\.\. 26 | |[!?]+ 27 | ''') 28 | 29 | 30 | def docs(max_docs=-1, ignore=None): 31 | """Returns a generator of generators. The inner generators return the tokens of 32 | each document in our corpus.""" 33 | if ignore is None: 34 | ignore = set() 35 | with open("/Users/tdunning/tmp/OpinRank/hotels.txt", "r", encoding="latin_1") as f: 36 | doc = 0 37 | step = 1 38 | scale = 10 39 | t0 = time.time_ns() / 1e9 40 | i0 = 0 41 | for line in f.read().split("\n"): 42 | doc = doc + 1 43 | if max_docs != -1 and doc > max_docs: 44 | break 45 | if doc % (step * scale) == 0: 46 | t1 = time.time_ns() / 1e9 47 | print("Doc %d (%.0f doc/s)" % (doc, (doc - i0) / (t1 - t0))) 48 | i0 = doc 49 | t0 = t1 50 | step = floor(step * 2.55) 51 | if step >= 10: 52 | step = 1 53 | scale = scale * 10 54 | pieces = line.split('\t', maxsplit=2) 55 | if len(pieces) == 3: 56 | yield (m.group(0) for m in wordPattern.finditer(pieces[2].lower()) if m and m.group(0) not in ignore) 57 | 58 | 59 | def H(k): 60 | """Computes unnormalized entropy of a stack of vectors""" 61 | if k.ndim == 2: 62 | k = k[:, :, np.newaxis] 63 | p = (k + 0.0) / k.sum(axis=1).sum(axis=1)[:, np.newaxis, np.newaxis] 64 | raw = -(k * np.log(p + (p == 0))) 65 | while raw.ndim > 1: 66 | raw = raw.sum(axis=1) 67 | return raw 68 | 69 | 70 | def llr(k): 71 | """Computes the log-likelihood ratio test for binomials in a vector-wise fashion. 72 | K is assumed to contain an n x 2 x 2 array of counts presumed to be a 2x2 table for 73 | each of n cases. We return an n-long vector of scores.""" 74 | s_row = H(k.sum(axis=1)) 75 | s_col = H(k.sum(axis=2)) 76 | s = H(k) 77 | return 2 * (s_row + s_col - s) 78 | 79 | 80 | def encode(docs: collections.abc.Iterable, lexicon: List[str], ignore=Optional[Set], 81 | matrix: Optional[Callable] = sparse.csr_matrix) -> sparse.spmatrix: 82 | if ignore is None: 83 | ignore = {} 84 | lookup = dict(zip(sorted(lexicon), range(len(lexicon)))) 85 | rows = [] 86 | cols = [] 87 | data = [] 88 | k = 0 89 | for d in docs: 90 | cols.extend({lookup[w] for w in d if w not in ignore}) 91 | n = len(cols) - len(rows) 92 | rows.extend(itertools.repeat(k, n)) 93 | data.extend(itertools.repeat(1, n)) 94 | k += 1 95 | zz = matrix((data, (rows, cols))) 96 | return (zz) 97 | 98 | 99 | from collections import Counter 100 | from nltk.corpus import brown 101 | import itertools 102 | 103 | 104 | def count(): 105 | k = Counter() 106 | for w in wordPattern.split(brown.raw()): 107 | k[w] += 1 108 | return k 109 | 110 | 111 | def test(): 112 | return Counter((w for d in docs() for w in d)) 113 | 114 | 115 | Ndocs = 50000 116 | minScore = 15 117 | maxAssociates = 30 118 | 119 | # count all the words that appear 120 | lexicon = Counter(itertools.chain.from_iterable(docs(Ndocs))) 121 | 122 | # kill words too rare to have interesting collocation 123 | kill = {w for w in lexicon if lexicon[w] < 3} 124 | for w in kill: 125 | del lexicon[w] 126 | 127 | allWords = sorted(lexicon) 128 | 129 | # build the doc x word matrix using the lexicon we have slightly tuned 130 | # note column friendly result 131 | z = encode(docs(Ndocs), allWords, ignore=kill, matrix=sparse.csc_matrix) 132 | 133 | # downsample frequent words (don't kill them entirely) 134 | targetMaxFrequency = max(200.0, Ndocs / 30.0) 135 | downSampleRate = [min(1, targetMaxFrequency / lexicon[w]) for w in allWords] 136 | print("downsample %.0f words out of %d" % (sum(1 if p < 1 else 0 for p in downSampleRate), len(lexicon))) 137 | for w in range(len(allWords)): 138 | p = downSampleRate[w] 139 | if p < 1: 140 | # only a few words will get whacked 141 | nz = z[:, w].nonzero() 142 | v = [1 if random.random() < p else 0 for i in nz[0]] 143 | z[nz[0], w] = v 144 | 145 | # so here are final counts 146 | wordCounts = z.sum(axis=0) 147 | total = sum(wordCounts) 148 | print("doc x word matrix ready") 149 | 150 | # compute raw cooccurrence 151 | cooc = z.T @ z 152 | # but avoid self-cooccurrence 153 | cooc[(range(cooc.shape[0]), range(cooc.shape[1]))] = 0 154 | print('cooccurrence computation done %.3f sparsity' % ((cooc > 0).sum() / (lambda s: s[0] * s[1])(cooc.shape))) 155 | 156 | # now find interesting cooccurrence 157 | # we build a 3D array with one 2x2 contingency tables for each non-zero in the cooccurrence table 158 | # the four elements count how often two particular words cooccur or not 159 | nz = cooc.nonzero() 160 | # A and B together 161 | k11 = cooc[nz] 162 | # A anywhere 163 | k1_ = wordCounts[0, nz[0]] 164 | # A without B 165 | k12 = k1_ - k11 166 | 167 | # B anywhere 168 | k_1 = wordCounts[0, nz[1]] 169 | # B without A 170 | k21 = k_1 - k11 171 | # neither A nor B 172 | k22 = Ndocs - k12 - k21 - k11 173 | 174 | # final shape should be n x 2 x 2 175 | k = np.array([k11, k12, k21, k22]).reshape((2, 2, k11.shape[1])).transpose() 176 | print("%d x %d x %d counts ready" % k.shape) 177 | 178 | # constructs scores whereever cooc was non-zero. Note cooc is symmetric, extra work here 179 | scores = sparse.csr_matrix((llr(k), nz)) 180 | print("scoring done") 181 | 182 | # now review each word and limit the number of associates 183 | rows = [] 184 | cols = [] 185 | for row in range(scores.shape[0]): 186 | # find nth highest score 187 | index = (scores[row, :] >= minScore).nonzero()[1] 188 | if len(index) > 0: 189 | s = sorted((scores[row, index].toarray().flat), reverse=True) 190 | cutoff = s[min(len(s), maxAssociates) - 1] 191 | cols.extend(i for i in index if scores[row, i] >= cutoff) 192 | rows.extend(itertools.repeat(row, len(cols) - len(rows))) 193 | # final result has row per word consisting of unweighted associates 194 | # might should consider idf weighting here 195 | associates = sparse.csr_matrix((list(itertools.repeat(1, len(rows))), (rows, cols)), shape=scores.shape) 196 | print("associates ready") 197 | synonyms = associates * associates.T 198 | 199 | lookup = dict(zip(sorted(lexicon), range(len(lexicon)))) 200 | unlook = list(sorted(lexicon)) 201 | print([unlook[i] for i in (synonyms[lookup['railway'], :] > 3).nonzero()[1]]) 202 | print([unlook[i] for i in (synonyms[lookup['hot'], :] > 3).nonzero()[1]]) 203 | print([unlook[i] for i in (synonyms[lookup['cold'], :] > 3).nonzero()[1]]) 204 | print([unlook[i] for i in (synonyms[lookup['food'], :] > 3).nonzero()[1]]) 205 | print([unlook[i] for i in (synonyms[lookup['room'], :] > 3).nonzero()[1]]) 206 | 207 | # lookup = dict(zip(sorted(lexicon), range(len(lexicon)))) 208 | # unlook = list(sorted(lexicon)) 209 | # m = lookup['railway'] 210 | # index = scores[m,:].nonzero()[1] 211 | # print(sorted(((scores[m,i], i, unlook[i]) for i in index), key=lambda x: -x[0])[1:15]) 212 | 213 | limit = 1000 214 | 215 | n1 = (wordCounts ** 2).sum() 216 | n2 = (wordCounts[wordCounts < limit] ** 2).sum() + ((wordCounts >= limit) * limit * limit).sum() 217 | print(n1 / n2) 218 | 219 | all_docs = [bag for bag in docs()] 220 | wordEncoder = pre.OneHotEncoder() 221 | words = [[w] for bag in all_docs for w in bag] 222 | wordEncoder.fit(words) 223 | vectors = [v.sum(axis=0) for bag in all_docs] 224 | wordEncoder.transform([[x] for x in all_docs[0]]) 225 | -------------------------------------------------------------------------------- /src/python/onehot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn import preprocessing 5 | 6 | enc = preprocessing.OrdinalEncoder() 7 | 8 | X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] 9 | enc.fit(X) 10 | enc.transform([['female', 'from US', 'uses Safari']]) 11 | -------------------------------------------------------------------------------- /src/python/time-encodings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This demonstrates some simple ways to encode time so that models can 3 | make sense of it. 4 | 5 | The problem at hand is prediction of web traffic on various wikipedia pages. 6 | 7 | The features we will use include: 8 | 9 | * Lagged values for traffic 10 | * Time of day expressed as continuous variables 11 | * Day of week expressed as continuous variables 12 | * Day of week expressed as one-hot variables 13 | * Page URL 14 | """ 15 | import handout 16 | import os 17 | 18 | os.mkdir("handouts") # handout: exclude 19 | 20 | doc = handout.Handout("handouts/time") # handout: exclude 21 | 22 | doc.show() # handout: exclude -------------------------------------------------------------------------------- /src/python/wiki-data-download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import time 5 | 6 | import urllib3 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | base_url = 'https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-07/' 11 | 12 | http = urllib3.PoolManager() 13 | 14 | r = http.request('GET', base_url) 15 | 16 | soup = BeautifulSoup(r.data, 'html.parser') 17 | 18 | page_pattern = re.compile('pagecounts-.*gz') 19 | 20 | links = [anchor['href'] for anchor in soup.find_all("a") if re.match(page_pattern, anchor['href'])] 21 | 22 | try: 23 | os.mkdir('./wiki-stats') 24 | except FileExistsError: 25 | print("wiki-stats already exists") 26 | 27 | print(os.getcwd()) 28 | for link in links: 29 | url = base_url + '/' + link 30 | local_file = os.path.join('wiki-stats', link) 31 | if os.path.exists(local_file): 32 | print("%s already exists, skipping" % local_file) 33 | else: 34 | with http.request('GET', url, preload_content=False) as resp, open(local_file, "wb") as f: 35 | shutil.copyfileobj(resp, f) 36 | print(link) 37 | time.sleep(0.5) 38 | --------------------------------------------------------------------------------