├── .gitignore
├── .idea
│   ├── compiler.xml
│   ├── libraries
│   │   ├── Maven__com_github_haifengl_smile_core_1_5_4.xml
│   │   ├── Maven__com_github_haifengl_smile_data_1_5_4.xml
│   │   ├── Maven__com_github_haifengl_smile_graph_1_5_4.xml
│   │   ├── Maven__com_github_haifengl_smile_math_1_5_4.xml
│   │   ├── Maven__com_github_haifengl_smile_nlp_1_5_3.xml
│   │   ├── Maven__com_google_code_findbugs_jsr305_3_0_2.xml
│   │   ├── Maven__com_google_errorprone_error_prone_annotations_2_3_2.xml
│   │   ├── Maven__com_google_guava_failureaccess_1_0_1.xml
│   │   ├── Maven__com_google_guava_guava_28_0_jre.xml
│   │   ├── Maven__com_google_guava_listenablefuture_9999_0_empty_to_avoid_conflict_with_guava.xml
│   │   ├── Maven__com_google_j2objc_j2objc_annotations_1_3.xml
│   │   ├── Maven__com_google_re2j_re2j_1_3.xml
│   │   ├── Maven__junit_junit_4_12.xml
│   │   ├── Maven__org_checkerframework_checker_qual_2_8_1.xml
│   │   ├── Maven__org_codehaus_mojo_animal_sniffer_annotations_1_17.xml
│   │   ├── Maven__org_hamcrest_hamcrest_core_1_3.xml
│   │   └── Maven__org_slf4j_slf4j_api_1_7_25.xml
│   └── uiDesigner.xml
├── LICENSE
├── README.md
├── categorical.md
├── java
│   ├── feature-examples.iml
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── tdunning
│       │               └── examples
│       │                   ├── CooData.java
│       │                   ├── Jacobi.java
│       │                   ├── Sort.java
│       │                   └── VectorText.java
│       └── test
│           └── java
│               └── com
│                   └── tdunning
│                       └── examples
│                           ├── CooDataTest.java
│                           ├── JacobiTest.java
│                           └── VectorTextTest.java
└── src
    └── python
        ├── README.md
        ├── cooc.py
        ├── onehot.py
        ├── time-encodings.py
        └── wiki-data-download.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | # Compiled class file
3 | *.class
4 |
5 | # Log file
6 | *.log
7 |
8 | # BlueJ files
9 | *.ctxt
10 |
11 | # Mobile Tools for Java (J2ME)
12 | .mtj.tmp/
13 |
14 | # Package Files #
15 | *.jar
16 | *.war
17 | *.nar
18 | *.ear
19 | *.zip
20 | *.tar.gz
21 | *.rar
22 |
23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
24 | hs_err_pid*
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Practical Feature Extraction
2 |
3 | This repository contains a compendium of useful feature extraction techniques I have learned about over the years. If you have a favorite that I have missed, let me know.
4 |
5 | # Techniques covered (aspirationally)
6 | ## Categorical
7 | - One-hot encoding
8 | - Hashed one-hot encoding
9 | - Unique ID
10 | - Binary encoding after sorting
11 | - Count encoding
12 | - Rank encoding
13 | - Rank-change
14 | - Naive Bayes Rate Encoding
15 | - Semantic embedding
16 | - tf.idf
17 | - Luduan terms *
18 | ## Numerical
19 | - Binning *
20 | - Rounding
21 | - Log
22 | ## Temporal
23 | - Day of week, Hour of day, Weekend/holiday indicators
24 | - Quadrature encodings
25 | - Distance to event
26 | - Lagged features
27 | ## Geographical
28 | - Pre-clustering
29 | - S2 Geo Points
30 | - Proximity to cities
31 | - MSA
32 | - Zip3
33 | ## Word-like and Text
34 | - tf.idf
35 | - Luduan terms
36 | - Semantic embeddings
37 | - GloVe: https://nlp.stanford.edu/projects/glove/
38 | - Indicator detection
39 | ## IP Address
40 | - Reverse resolution
41 | - CIDR
42 | - CIDR prefix
43 | ## Missing Data
44 | - As a special value (unknown word)
45 | - Means
46 | - Reverse model
47 | ## Consolidation
48 | - Unknown word
49 | - Stemming
50 | ## Parsing and Modeling
51 | - User agent
52 | - IP domains
53 | - Email address
54 | - Headers
55 | - Referrer
56 | - 5P energy models
57 | ## Scaling
58 | - Q scaling
59 | - Z scaling
60 | - Min-max scaling
61 | - Log
62 | ## Cross modeling
63 | - Other models
64 | - Modeled structure
65 | - Word2vec
66 |
--------------------------------------------------------------------------------
/categorical.md:
--------------------------------------------------------------------------------
1 | # Categorical features
2 | There are many ways to transform a categorical variable with high cardinality. I will describe the following methods here:
3 |
4 | *one-hot* -- the simplest of all techniques, very useful in a number of settings with low cardinality
5 |
6 | *rare-word collapse* -- this allows the cardinality to vary
7 |
8 | *frequency binning* -- often of great use in anomaly detection, fraud prevention and intrusion detection
9 |
10 | *random embedding* -- the grandparent of modern semantic embedding
11 |
12 | *the hash trick* -- random embedding for people who like binary features
13 |
14 | *Luduan features* -- how to encode operational structure efficiently by observing cooccurrence.
15 |
16 | ## Background
17 | But before we get into all this too much, let’s settle some terminology. Let’s take “low cardinality” to be less than 10 or so, “medium cardinality” to be from 10 to 100, “high cardinality” to be 100 to 1000 and “ultra high cardinality” to be above 1000. These boundaries aren’t hard and fast and we should be willing to wiggle a bit on them. Some categorical variables are ordered (birth year) and some are not (car make). We should also keep in mind categorical variables where we do not know the full cardinality, be it low, medium, high or ultra high.
18 |
19 | Examples of features in these different cardinality ranges include:
20 |
21 | *low cardinality* -- gender, rotation direction (CW or CCW), cardinal points (N, S, E, W, NE, etc), phone type (land line, cell, VOIP)
22 |
23 | *medium cardinality* -- car make, telephone brand, key on keyboard, US state
24 |
25 | *high cardinality* -- country of birth, birth year
26 |
27 | *ultra high cardinality* -- word from text, URL, domain name, IP address, post code
28 |
29 | Examples of categorical variables where we can’t easily know the full cardinality with absolute certainty, or where we might wish to allow for change, include brand names, countries and gender. Examples of categorical variables where the cardinality is not just unknown at present but growing continually include domain names, IP addresses and words from text.
30 |
31 | There are lots of techniques and tricks for handling variables of this general class. Which techniques work best depends a lot on the rough cardinality and whether the cardinality is fixed. When cardinality is even moderately high, you start to encounter problems because some values will be much rarer than others. As you get to ultra high cardinality, this problem becomes very severe because frequencies can vary between values by many orders of magnitude.
32 |
33 | When the cardinality is not fixed or is simply not yet known, a different problem arises in that essentially all machine learning techniques want to deal with a fixed number of input variables. That means that we have to figure out some way to convert an unbounded kind of input into a strictly bounded number of inputs without losing information. With numerical features, we also have a large cardinality, but the mathematical structure of numbers such as distance and ordering usually allows us to treat such inputs much more simply. With true categorical values, we have to discover or impose this structure.
34 |
35 | ## One-hot Encoding
36 |
37 | In the simplest of all cases, we have low and fixed cardinality. In such a case, we can have a model feature for each possible value that the variable can take on and set all of these features to zero except for the one corresponding to the value of our categorical feature. This works great and is known as one-hot encoding. This might lead to encoding days of the week as Monday = (1,0,0,0,0,0,0), Tuesday = (0,1,0,0,0,0,0) and so on.
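Here is a minimal sketch of this in Java (the class and method names are made up for illustration and are not part of this repository's code). The encoding is just an index lookup into a fixed, fully-known vocabulary:

```java
import java.util.Arrays;
import java.util.List;

public class OneHotExample {
    // The full, fixed vocabulary must be known up front.
    static final List<String> DAYS = Arrays.asList(
            "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday");

    // All zeros except for a single 1 at the position of the observed value.
    static double[] encode(String day) {
        int i = DAYS.indexOf(day);
        if (i < 0) {
            throw new IllegalArgumentException("Unknown value: " + day);
        }
        double[] v = new double[DAYS.size()];
        v[i] = 1;
        return v;
    }
}
```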
38 |
39 | ## Rare-word Collapse
40 |
41 | As the cardinality increases, however, this works less and less well, largely because some values will be much rarer than others and because increasing the number of features in a model beyond a few thousand generally has very bad effects on the ability to build a model. Even worse, high cardinality generally goes hand in hand with indefinite cardinality. Even so, it is common in natural language models to simply group all but the 𝑘 most common values of a categorical variable into a single “RARE-WORD” value. This reduction allows us to use a one-hot encoding over 𝑘+1 values. If 𝑘 is big enough, this will work pretty well because the “RARE-WORD” value will itself be pretty rare. A sketch of the dictionary construction follows this paragraph.
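Here is one way the dictionary might be built (hypothetical names again; note that the resulting map has the same shape as the dictionary consumed by VectorText.vectorize in this repository's Java code):

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class RareWords {
    // Keep the k most common values; every other value shares the RARE-WORD slot,
    // giving a fixed one-hot encoding over k+1 values.
    static Map<String, Integer> buildDictionary(List<String> observed, int k) {
        Map<String, Long> counts = observed.stream()
                .collect(Collectors.groupingBy(w -> w, Collectors.counting()));
        Map<String, Integer> dictionary = new HashMap<>();
        counts.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(k)
                .forEach(e -> dictionary.put(e.getKey(), dictionary.size()));
        dictionary.put("RARE-WORD", dictionary.size());
        return dictionary;
    }

    // Any value outside the top k maps to the shared RARE-WORD index.
    static int index(Map<String, Integer> dictionary, String value) {
        return dictionary.getOrDefault(value, dictionary.get("RARE-WORD"));
    }
}
```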
42 |
43 | ## Frequency Binning
44 |
45 | We can take this idea of collapsing to a radical and surprisingly effective extreme. This is done by reducing a high cardinality categorical feature to a single number that represents the frequency of the value of the feature. Alternatively, you might use the quantile of the rank of the frequency, or bin the frequency of the value. In any case, this works in applications where a specific value isn’t as important as the fact that you have seen a surprisingly rare value. Consider network intrusion detection, where suddenly seeing lots of data going to a previously almost unknown external network address could be very informative. It doesn’t really matter which previously unknown address is being used, just that it is previously unknown or nearly so. Note that you can combine this kind of frequency feature with other features as well so that you not only get these desirable novelty effects but also keep the precise resolution about exactly which categorical value was seen.
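As a sketch, assuming the raw counts come from some earlier counting pass, log-spaced bins might look like this (the exact binning scheme is a free choice):

```java
public class FrequencyBins {
    // Collapse a raw occurrence count to a small bin number: bin 0 for values
    // never seen before, then roughly one bin per order of magnitude.
    static int bin(long count) {
        if (count <= 0) {
            return 0;   // previously unknown: often the most interesting signal
        }
        return 1 + (int) Math.floor(Math.log10(count));
    }
}
```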
46 |
47 | ## Random Vector Embedding
48 |
49 | Another way to keep a fixed-size encoding with values of large or unknown cardinality, without collapsing rare values together, is to use a random embedding or projection. One simple way to do this is to convert each possible value to a 50–300 dimensional vector. Commonly, these vectors will be constrained to have unit length. You can actually do this in a consistent way without knowing the categorical values ahead of time by using the actual value as a seed for a random number generator and then using that generator to sample a “random” unit vector. If the dimension of the vector is high enough (say 100 to 500 dimensions or more) then the vectors corresponding to any two categorical values will be nearly orthogonal with high probability. This quasi-orthogonality of random vectors is very handy since it makes each value sufficiently different from all the others that machine learning algorithms can pick out important structure.
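A sketch of this seeding trick, using the value's hashCode as the seed (a simplification: a 32-bit hash can collide where a stronger hash would not). Normalizing Gaussian coordinates gives a vector uniformly distributed on the unit sphere:

```java
import java.util.Random;

public class RandomEmbedding {
    // The value itself seeds the generator, so the same value always maps to the
    // same vector without any stored dictionary. Distinct values are nearly
    // orthogonal with high probability when dim is large.
    static double[] embed(String value, int dim) {
        Random gen = new Random(value.hashCode());
        double[] v = new double[dim];
        double norm = 0;
        for (int i = 0; i < dim; i++) {
            v[i] = gen.nextGaussian();
            norm += v[i] * v[i];
        }
        norm = Math.sqrt(norm);
        for (int i = 0; i < dim; i++) {
            v[i] /= norm;    // scale to unit length
        }
        return v;
    }
}
```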
50 |
51 | These random vectors can also be tuned somewhat using simple techniques to build a semantic space, or using more advanced techniques to get some very fancy results. Such random projections can be used to do linear algebraic decompositions as well.
52 |
53 | [1] Context Vectors: A Step Toward a “Grand Unified Representation”
54 | https://link.springer.com/chapter/10.1007/10719871_14
55 |
56 | [2] Word2Vec
57 | https://en.wikipedia.org/wiki/Word2vec
58 |
59 | [3] BERT word embeddings
60 | https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
61 |
62 | [4] Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decomposition
63 | https://arxiv.org/pdf/0909.4061.pdf
64 |
65 |
66 |
67 | ## The Hash Trick
68 |
69 | We can use different random projections to get something much more like one-hot encoding as well, without having to collapse rare features or, indeed, without even having to know which features are rare. For each distinct value, we can encode that value using 𝑛 binary values of which exactly 𝑘 randomly chosen values are set to 1, with the rest set to 0, using the same seeding trick as before. Commonly 𝑛 is taken to be a few thousand while 𝑘 can be relatively small, typically less than 20. When 𝑘=1, we get one-hot encoding again. This technique works for the same mathematical reasons as random projection, but is generally described more in terms of analogies to Bloom filters.
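A sketch using the same seeding trick (hypothetical names again, with hashCode standing in for a stronger hash):

```java
import java.util.Random;

public class HashTrickExample {
    // n-dimensional binary encoding with exactly k ones; with k = 1 this
    // degenerates to a (hashed) one-hot encoding.
    static int[] encode(String value, int n, int k) {
        if (k > n) {
            throw new IllegalArgumentException("k must be at most n");
        }
        Random gen = new Random(value.hashCode());
        int[] v = new int[n];
        int set = 0;
        while (set < k) {
            int i = gen.nextInt(n);
            if (v[i] == 0) {   // retry collisions so exactly k bits end up set
                v[i] = 1;
                set++;
            }
        }
        return v;
    }
}
```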
70 |
71 | ## Luduan Features
72 |
73 | Finally, you can derive a numerical feature by grouping values that have anomalous correlation with some objective observation and then weighting by the underlying frequency of the feature value (or the inverse log of that frequency). This reduction is known as a Luduan feature and is based on the use of log-likelihood ratio tests for finding interesting cooccurrence. I gave a talk on using these techniques for transaction mining some time ago that described how to do this.
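The scoring machinery behind this is the log-likelihood ratio test described in [6]. A compact sketch of the 2×2 version, in the standard entropy formulation:

```java
public class Llr {
    // k11 = times A and B occur together, k12 = A without B,
    // k21 = B without A, k22 = neither. Large scores flag surprising cooccurrence.
    static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
        double rowEntropy = entropy(k11 + k12, k21 + k22);
        double columnEntropy = entropy(k11 + k21, k12 + k22);
        double matrixEntropy = entropy(k11, k12, k21, k22);
        if (rowEntropy + columnEntropy < matrixEntropy) {
            return 0;   // guard against tiny negative values from rounding
        }
        return 2 * (rowEntropy + columnEntropy - matrixEntropy);
    }

    // Unnormalized Shannon entropy: N log N - sum of c log c.
    private static double entropy(long... counts) {
        double sum = 0;
        long total = 0;
        for (long c : counts) {
            sum += xLogX(c);
            total += c;
        }
        return xLogX(total) - sum;
    }

    private static double xLogX(double x) {
        return x == 0 ? 0 : x * Math.log(x);
    }
}
```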
74 |
75 | [5] Finding Structure in Text, Genome and Other Symbolic Sequences
76 | https://arxiv.org/abs/1207.1847
77 |
78 | [6] Accurate Methods for the Statistics of Surprise and Coincidence
79 | https://aclweb.org/anthology/J93-1003
80 |
81 | [7] Mining Transactional Data
82 | https://www.slideshare.net/MapRTechnologies/transactional-data-mining-ted-dunning-2004
83 |
--------------------------------------------------------------------------------
/java/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tdunning</groupId>
    <artifactId>feature-examples</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>28.0-jre</version>
        </dependency>
        <dependency>
            <groupId>com.google.re2j</groupId>
            <artifactId>re2j</artifactId>
            <version>1.3</version>
        </dependency>
        <dependency>
            <groupId>com.github.haifengl</groupId>
            <artifactId>smile-core</artifactId>
            <version>1.5.4</version>
        </dependency>
        <dependency>
            <groupId>com.github.haifengl</groupId>
            <artifactId>smile-nlp</artifactId>
            <version>1.5.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
--------------------------------------------------------------------------------
/java/src/main/java/com/tdunning/examples/CooData.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import smile.math.matrix.SparseMatrix;
4 |
5 | import java.util.Arrays;
6 |
7 | /**
8 | * Data structure used to build a sparse matrix from a bunch of (i, j, x) triples.
9 | * This just maintains arrays of i, j, and m[i,j], but has some cleverness about reallocating
10 | * as data arrives and about sorting into a good order for becoming a real SparseMatrix.
11 | *
12 | * This is really inefficient for anything except accumulating entries. Any real processing will
13 | * need to be done by converting to CSR or CSC sparse formats. The SMILE SparseMatrix is in CSC format.
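 *
 * Typical use:
 *    CooData m = new CooData(rows, cols);
 *    m.add(i, j, x);                      // repeat per triple; duplicate (i, j) entries are summed
 *    SparseMatrix a = m.asSparseMatrix();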
14 | */
15 | public class CooData {
16 | private int entriesAddedSinceCompression = 0;
17 | private ElementOrdering lastOrdering = ElementOrdering.NONE;
18 |
19 | int entries;
20 | private int nrows;
21 | private int ncols;
22 | int[] rows;
23 | int[] cols;
24 | double[] values;
25 |
26 | public CooData() {
27 | nrows = -1;
28 | ncols = -1;
29 | init(100, 100);
30 | }
31 |
32 | public CooData(int rows, int cols) {
33 | this.nrows = rows;
34 | this.ncols = cols;
35 | init(rows, cols);
36 | }
37 |
38 | private void init(int rows, int cols) {
39 | int n = Math.max(rows, cols) + 5;
40 | this.rows = new int[n];
41 | this.cols = new int[n];
42 | this.values = new double[n];
43 | }
44 |
45 | /**
46 | * Adds a value to the value already at i,j.
47 | *
48 | * @param i The row
49 | * @param j The column
50 | * @param x The increment to the value at A[i,j]
51 | */
52 | public void add(int i, int j, double x) {
53 | if (i < 0 || (nrows != -1 && i >= nrows)) {
54 | throw new IllegalArgumentException(String.format("Invalid row %d (should be in [0,%d))", i, nrows));
55 | }
56 | if (j < 0 || (ncols != -1 && j >= ncols)) {
57 | throw new IllegalArgumentException(String.format("Invalid column %d (should be in [0,%d))", j, ncols));
58 | }
59 |
60 | if (entries >= rows.length) {
61 | if (entriesAddedSinceCompression > entries / 4.0) {
62 | compress(ElementOrdering.BY_COL, false);
63 | }
64 | int n = 2 * entries;
65 | if (n > rows.length) {
66 | rows = Arrays.copyOf(rows, n);
67 | cols = Arrays.copyOf(cols, n);
68 | values = Arrays.copyOf(values, n);
69 | }
70 | }
71 | rows[entries] = i;
72 | cols[entries] = j;
73 | values[entries] = x;
74 | entries++;
75 | lastOrdering = ElementOrdering.NONE;
76 | entriesAddedSinceCompression++;
77 | }
78 |
79 | /**
80 | * Reorder and aggregate data and indexes to be a proper sparse matrix.
81 | *
82 | * @return The resulting matrix
83 | */
84 | public SparseMatrix asSparseMatrix() {
85 | compress(ElementOrdering.BY_COL, false);
86 | resolveSizing();
87 |
88 | // data is now sorted by col, then row
89 | // we just need to make a short column index
90 | // note that we create one last element to point to the end of all data
91 | int[] colIndex = new int[ncols + 1];
92 | int last = -1;
93 | int j = 0;
94 | for (int k = 0; k < entries; ) {
95 | assert rows[k] >= 0 && rows[k] < nrows;
96 | assert cols[k] >= 0 && cols[k] < ncols;
97 |
98 | while (j <= cols[k]) {
99 | colIndex[j++] = k;
100 | }
101 | last = cols[k];
102 | while (k < entries && cols[k] == last) {
103 | k++;
104 | }
105 | }
106 | while (j <= ncols) { colIndex[j++] = entries; }  // also fills trailing empty columns
107 | return new SparseMatrix(nrows, ncols, values, rows, colIndex);
108 | }
109 |
110 | private void resolveSizing() {
111 | if (ncols == -1 || nrows == -1) {
112 | for (int k = 0; k < entries; k++) {
113 | ncols = Math.max(ncols, cols[k] + 1);
114 | nrows = Math.max(nrows, rows[k] + 1);
115 | }
116 | }
117 | }
118 |
119 | enum ElementOrdering {
120 | NONE, BY_ROW, BY_COL
121 | }
122 |
123 | @SuppressWarnings("WeakerAccess")
124 | public void compress(ElementOrdering elementOrdering, boolean force) {
125 | if (!force && lastOrdering == elementOrdering) {
126 | return;
127 | }
128 | entriesAddedSinceCompression = 0;
129 | lastOrdering = elementOrdering;
130 |
131 | int[] major;
132 | int[] minor;
133 | switch (elementOrdering) {
134 | case BY_ROW:
135 | major = this.rows;
136 | minor = this.cols;
137 | break;
138 | case BY_COL:
139 | default:
140 | major = this.cols;
141 | minor = this.rows;
142 | }
143 |
144 | // first sort everything in major (row or column) order
145 | int[] order = new int[entries];
146 | Sort.sort(order, major, 0, entries);
147 | untangle(order, major, 0, entries);
148 | untangle(order, minor, 0, entries);
149 | untangle(order, values, 0, entries);
150 |
151 | // now scan through all the data
152 | int fill = 0;
153 | for (int i = 0; i < entries; ) {
154 | // for each range of constant row number, sort by column
155 | int j = i + 1;
156 | while (j < entries && major[j] == major[i]) {
157 | j++;
158 | }
159 | if (j > i + 1) {
160 | Sort.sort(order, minor, i, j - i);
161 | untangle(order, major, i, j);
162 | untangle(order, minor, i, j);
163 | untangle(order, values, i, j);
164 | }
165 |
166 | // and now collapse ranges of constant column number
167 | for (int k = i; k < j; ) {
168 | int r = major[k];
169 | int c = minor[k];
170 | double sum = 0;
171 | for (; k < j && minor[k] == c; k++) {
172 | sum += values[k];
173 | }
174 | major[fill] = r;
175 | minor[fill] = c;
176 | values[fill] = sum;
177 | fill++;
178 | }
179 | i = j;
180 | }
181 |
182 | entries = fill;
183 | }
184 |
185 | private void untangle(int[] order, int[] values, int start, int end) {
186 | int[] tmp = Arrays.copyOfRange(values, start, end);
187 | for (int i = start; i < end; i++) {
188 | tmp[i - start] = values[order[i]];
189 | }
190 | System.arraycopy(tmp, 0, values, start, end - start);
191 | }
192 |
193 | private void untangle(int[] order, double[] values, int start, int end) {
194 | double[] tmp = Arrays.copyOfRange(values, start, end);
195 | for (int i = start; i < end; i++) {
196 | tmp[i - start] = values[order[i]];
197 | }
198 | System.arraycopy(tmp, 0, values, start, end - start);
199 | }
200 |
201 | public void append(CooData other) {
202 | if (entries + other.entries > rows.length) {
203 | int n = entries + other.entries;
204 | rows = Arrays.copyOf(rows, n);
205 | cols = Arrays.copyOf(cols, n);
206 | values = Arrays.copyOf(values, n);
207 | }
208 | System.arraycopy(other.rows, 0, rows, entries, other.entries);
209 | System.arraycopy(other.cols, 0, cols, entries, other.entries);
210 | System.arraycopy(other.values, 0, values, entries, other.entries);
211 | entries += other.entries;
212 | lastOrdering = ElementOrdering.NONE;
213 | }
214 | }
215 |
--------------------------------------------------------------------------------
/java/src/main/java/com/tdunning/examples/Jacobi.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import smile.math.matrix.SparseMatrix;
4 |
5 | import java.util.Arrays;
6 |
7 | /**
8 | * Classic iterative solver for sparse systems. This converges if the matrix A is strictly diagonally
9 | * dominant, or if A is symmetric positive definite and 2D - A is also positive definite (D being the diagonal part of A).
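 * Each iteration computes x' = D^{-1} (b - R x), where R = A - D holds the off-diagonal elements,
 * and iteration stops when the largest change in any component of x falls below the tolerance.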
10 | */
11 | public class Jacobi {
12 | private SparseMatrix a;
13 |
14 | public Jacobi(SparseMatrix a) {
15 | if (a.ncols() != a.nrows()) {
16 | throw new IllegalArgumentException("Matrix must be square");
17 | }
18 | this.a = a;
19 | }
20 |
21 | public double[] solve(double[] b) {
22 | return solve(b, 1e-10, 10000);
23 | }
24 |
25 | public double[] solve(double[] b, double tolerance, int maxIteration) {
26 | final int n = a.ncols();
27 | if (b.length != n) {
28 | throw new IllegalArgumentException("Must have b vector same size as matrix");
29 | }
30 |
31 | double[] x = new double[n];
32 | double[] diagonal = new double[n];
33 | a.foreachNonzero((i, j, value) -> {
34 | if (i == j) {
35 | diagonal[i] = value;
36 | }
37 | });
38 |
39 | double dMax = Double.POSITIVE_INFINITY;
40 | int iteration = 0;
41 | while (dMax > tolerance && iteration < maxIteration) {
42 | // z = b - Rx, where R is A except for diagonal elements
43 | double[] tmp = Arrays.copyOf(b, n);
44 | a.foreachNonzero((i, j, value) -> {
45 | if (i != j) {
46 | tmp[i] -= value * x[j];
47 | }
48 | });
49 |
50 | dMax = 0;
51 | for (int i = 0; i < n; i++) {
52 | double v = tmp[i] / diagonal[i];
53 | dMax = Math.max(Math.abs(x[i] - v), dMax);
54 | x[i] = v;
55 | }
56 | iteration++;
57 | System.out.printf("%10.3e%n", dMax);  // trace convergence; fixed-point format would round tiny residuals to zero
58 | }
59 | return x;
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/java/src/main/java/com/tdunning/examples/Sort.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | * Originally from t-digest.
18 | */
19 |
20 | package com.tdunning.examples;
21 |
22 | import java.util.Random;
23 |
24 | /**
25 | * Static sorting methods
26 | */
27 | public class Sort {
28 | private static final Random prng = new Random(); // for choosing pivots during quicksort
29 | /**
30 | * Quick sort using an index array. On return,
31 | * values[order[i]] is in order as i goes 0..values.length
32 | *
33 | * @param order Indexes into values
34 | * @param values The values to sort.
35 | */
36 | @SuppressWarnings("WeakerAccess")
37 | public static void sort(int[] order, int[] values) {
38 | sort(order, values, 0, values.length);
39 | }
40 |
41 | /**
42 | * Quick sort using an index array. On return,
43 | * values[order[i]] is in order as i goes 0..n
44 | *
45 | * @param order Indexes into values
46 | * @param values The values to sort.
47 | * @param n The number of values to sort
48 | */
49 | @SuppressWarnings("WeakerAccess")
50 | public static void sort(int[] order, int[] values, int n) {
51 | sort(order, values, 0, n);
52 | }
53 |
54 | /**
55 | * Quick sort using an index array. On return,
56 | * values[order[i]] is in order as i goes start..start+n
57 | *
58 | * @param order Indexes into values
59 | * @param values The values to sort.
60 | * @param start The first element to sort
61 | * @param n The number of values to sort
62 | */
63 | @SuppressWarnings("WeakerAccess")
64 | public static void sort(int[] order, int[] values, int start, int n) {
65 | for (int i = start; i < start + n; i++) {
66 | order[i] = i;
67 | }
68 | quickSort(order, values, start, start + n, 64);
69 | insertionSort(order, values, start, start + n, 64);
70 | }
71 |
72 | /**
73 | * Standard quick sort except that sorting is done on an index array rather than the values themselves
74 | *
75 | * @param order The pre-allocated index array
76 | * @param values The values to sort
77 | * @param start The beginning of the values to sort
78 | * @param end The value after the last value to sort
79 | * @param limit The minimum size to recurse down to.
80 | */
81 | private static void quickSort(int[] order, int[] values, int start, int end, int limit) {
82 | // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases
83 | while (end - start > limit) {
84 |
85 | // pivot by a random element
86 | int pivotIndex = start + prng.nextInt(end - start);
87 | double pivotValue = values[order[pivotIndex]];
88 |
89 | // move pivot to beginning of array
90 | swap(order, start, pivotIndex);
91 |
92 | // we use a three way partition because many duplicate values is an important case
93 |
94 | int low = start + 1; // low points to first value not known to be equal to pivotValue
95 | int high = end; // high points to first value > pivotValue
96 | int i = low; // i scans the array
97 | while (i < high) {
98 | // invariant: values[order[k]] == pivotValue for k in [0..low)
99 | // invariant: values[order[k]] < pivotValue for k in [low..i)
100 | // invariant: values[order[k]] > pivotValue for k in [high..end)
101 | // in-loop: i < high
102 | // in-loop: low < high
103 | // in-loop: i >= low
104 | double vi = values[order[i]];
105 | if (vi == pivotValue) {
106 | if (low != i) {
107 | swap(order, low, i);
108 | } else {
109 | i++;
110 | }
111 | low++;
112 | } else if (vi > pivotValue) {
113 | high--;
114 | swap(order, i, high);
115 | } else {
116 | // vi < pivotValue
117 | i++;
118 | }
119 | }
120 | // invariant: values[order[k]] == pivotValue for k in [0..low)
121 | // invariant: values[order[k]] < pivotValue for k in [low..i)
122 | // invariant: values[order[k]] > pivotValue for k in [high..end)
123 | // assert i == high || low == high therefore, we are done with partition
124 |
125 | // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are >
126 | // we have to move the values equal to the pivot into the middle. To do this, we swap pivot
127 | // values into the top end of the [low,high) range stopping when we run out of destinations
128 | // or when we run out of values to copy
129 | int from = start;
130 | int to = high - 1;
131 | for (i = 0; from < low && to >= low; i++) {
132 | swap(order, from++, to--);
133 | }
134 | if (from == low) {
135 | // ran out of things to copy. This means that the last destination is the boundary
136 | low = to + 1;
137 | } else {
138 | // ran out of places to copy to. This means that there are uncopied pivots and the
139 | // boundary is at the beginning of those
140 | low = from;
141 | }
142 |
143 | // checkPartition(order, values, pivotValue, start, low, high, end);
144 |
145 | // now recurse, but arrange it so we handle the longer limit by tail recursion
146 | if (low - start < end - high) {
147 | quickSort(order, values, start, low, limit);
148 |
149 | // this is really a way to do
150 | // quickSort(order, values, high, end, limit);
151 | start = high;
152 | } else {
153 | quickSort(order, values, high, end, limit);
154 | // this is really a way to do
155 | // quickSort(order, values, start, low, limit);
156 | end = low;
157 | }
158 | }
159 | }
160 |
161 | /**
162 | * Quick sort in place of several paired arrays. On return,
163 | * keys[...] is in order and the values[] arrays will be
164 | * reordered as well in the same way.
165 | *
166 | * @param key Values to sort on
167 | * @param values The auxiliary values to sort.
168 | */
169 | @SuppressWarnings("WeakerAccess")
170 | public static void sort(double[] key, double[] ... values) {
171 | sort(key, 0, key.length, values);
172 | }
173 |
174 | /**
175 | * Quick sort in place of several paired arrays. On return,
176 | * key[i] is in order as i goes start..start+n and the values[] arrays are reordered in the same way.
177 | * @param key Values to sort on
178 | * @param start The first element to sort
179 | * @param n The number of values to sort
180 | * @param values The auxiliary values to sort.
181 | */
182 | @SuppressWarnings("WeakerAccess")
183 | public static void sort(double[] key, int start, int n, double[]... values) {
184 | quickSort(key, values, start, start + n, 8);
185 | insertionSort(key, values, start, start + n, 8);
186 | }
187 |
188 | /**
189 | * Standard quick sort except that sorting rearranges parallel arrays
190 | *
191 | * @param key Values to sort on
192 | * @param values The auxiliary values to sort.
193 | * @param start The beginning of the values to sort
194 | * @param end The value after the last value to sort
195 | * @param limit The minimum size to recurse down to.
196 | */
197 | private static void quickSort(double[] key, double[][] values, int start, int end, int limit) {
198 | // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases
199 | while (end - start > limit) {
200 |
201 | // median of three values for the pivot
202 | int a = start;
203 | int b = (start + end) / 2;
204 | int c = end - 1;
205 |
206 | int pivotIndex;
207 | double pivotValue;
208 | double va = key[a];
209 | double vb = key[b];
210 | double vc = key[c];
211 | //noinspection Duplicates
212 | if (va > vb) {
213 | if (vc > va) {
214 | // vc > va > vb
215 | pivotIndex = a;
216 | pivotValue = va;
217 | } else {
218 | // va > vb, va >= vc
219 | if (vc < vb) {
220 | // va > vb > vc
221 | pivotIndex = b;
222 | pivotValue = vb;
223 | } else {
224 | // va >= vc >= vb
225 | pivotIndex = c;
226 | pivotValue = vc;
227 | }
228 | }
229 | } else {
230 | // vb >= va
231 | if (vc > vb) {
232 | // vc > vb >= va
233 | pivotIndex = b;
234 | pivotValue = vb;
235 | } else {
236 | // vb >= va, vb >= vc
237 | if (vc < va) {
238 | // vb >= va > vc
239 | pivotIndex = a;
240 | pivotValue = va;
241 | } else {
242 | // vb >= vc >= va
243 | pivotIndex = c;
244 | pivotValue = vc;
245 | }
246 | }
247 | }
248 |
249 | // move pivot to beginning of array
250 | swap(start, pivotIndex, key, values);
251 |
252 | // we use a three way partition because many duplicate values is an important case
253 |
254 | int low = start + 1; // low points to first value not known to be equal to pivotValue
255 | int high = end; // high points to first value > pivotValue
256 | int i = low; // i scans the array
257 | while (i < high) {
258 | // invariant: values[order[k]] == pivotValue for k in [0..low)
259 | // invariant: values[order[k]] < pivotValue for k in [low..i)
260 | // invariant: values[order[k]] > pivotValue for k in [high..end)
261 | // in-loop: i < high
262 | // in-loop: low < high
263 | // in-loop: i >= low
264 | double vi = key[i];
265 | if (vi == pivotValue) {
266 | if (low != i) {
267 | swap(low, i, key, values);
268 | } else {
269 | i++;
270 | }
271 | low++;
272 | } else if (vi > pivotValue) {
273 | high--;
274 | swap(i, high, key, values);
275 | } else {
276 | // vi < pivotValue
277 | i++;
278 | }
279 | }
280 | // invariant: values[order[k]] == pivotValue for k in [0..low)
281 | // invariant: values[order[k]] < pivotValue for k in [low..i)
282 | // invariant: values[order[k]] > pivotValue for k in [high..end)
283 | // assert i == high || low == high therefore, we are done with partition
284 |
285 | // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are >
286 | // we have to move the values equal to the pivot into the middle. To do this, we swap pivot
287 | // values into the top end of the [low,high) range stopping when we run out of destinations
288 | // or when we run out of values to copy
289 | int from = start;
290 | int to = high - 1;
291 | for (i = 0; from < low && to >= low; i++) {
292 | swap(from++, to--, key, values);
293 | }
294 | if (from == low) {
295 | // ran out of things to copy. This means that the last destination is the boundary
296 | low = to + 1;
297 | } else {
298 | // ran out of places to copy to. This means that there are uncopied pivots and the
299 | // boundary is at the beginning of those
300 | low = from;
301 | }
302 |
303 | // checkPartition(order, values, pivotValue, start, low, high, end);
304 |
305 | // now recurse, but arrange it so we handle the longer limit by tail recursion
306 | if (low - start < end - high) {
307 | quickSort(key, values, start, low, limit);
308 |
309 | // this is really a way to do
310 | // quickSort(order, values, high, end, limit);
311 | start = high;
312 | } else {
313 | quickSort(key, values, high, end, limit);
314 | // this is really a way to do
315 | // quickSort(order, values, start, low, limit);
316 | end = low;
317 | }
318 | }
319 | }
320 |
321 |
322 | /**
323 | * Limited range insertion sort. We assume that no element has to move more than limit steps
324 | * because quick sort has done its thing. This version works on parallel arrays of keys and values.
325 | *
326 | * @param key The array of keys
327 | * @param values The values we are sorting
328 | * @param start The starting point of the sort
329 | * @param end The ending point of the sort
330 | * @param limit The largest amount of disorder
331 | */
332 | @SuppressWarnings("SameParameterValue")
333 | private static void insertionSort(double[] key, double[][] values, int start, int end, int limit) {
334 | // loop invariant: all values start ... i-1 are ordered
335 | for (int i = start + 1; i < end; i++) {
336 | double v = key[i];
337 | int m = Math.max(i - limit, start);
338 | for (int j = i; j >= m; j--) {
339 | if (j == m || key[j - 1] <= v) {
340 | if (j < i) {
341 | System.arraycopy(key, j, key, j + 1, i - j);
342 | key[j] = v;
343 | for (double[] value : values) {
344 | double tmp = value[i];
345 | System.arraycopy(value, j, value, j + 1, i - j);
346 | value[j] = tmp;
347 | }
348 | }
349 | break;
350 | }
351 | }
352 | }
353 | }
354 |
355 | private static void swap(int[] order, int i, int j) {
356 | int t = order[i];
357 | order[i] = order[j];
358 | order[j] = t;
359 | }
360 |
361 | private static void swap(int i, int j, double[] key, double[]...values) {
362 | double t = key[i];
363 | key[i] = key[j];
364 | key[j] = t;
365 |
366 | for (int k = 0; k < values.length; k++) {
367 | t = values[k][i];
368 | values[k][i] = values[k][j];
369 | values[k][j] = t;
370 | }
371 | }
372 |
373 | /**
374 | * Check that a partition step was done correctly. For debugging and testing.
375 | *
376 | * @param order The array of indexes representing a permutation of the keys.
377 | * @param values The keys to sort.
378 | * @param pivotValue The value that splits the data
379 | * @param start The beginning of the data of interest.
380 | * @param low Values from start (inclusive) to low (exclusive) are < pivotValue.
381 | * @param high Values from low to high are equal to the pivot.
382 | * @param end Values from high to end are above the pivot.
383 | */
384 | @SuppressWarnings("UnusedDeclaration")
385 | public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) {
386 | if (order.length != values.length) {
387 | throw new IllegalArgumentException("Arguments must be same size");
388 | }
389 |
390 | if (!(start >= 0 && low >= start && high >= low && end >= high)) {
391 | throw new IllegalArgumentException(String.format("Invalid indices %d, %d, %d, %d", start, low, high, end));
392 | }
393 |
394 | for (int i = start; i < low; i++) {
395 | double v = values[order[i]];
396 | if (v >= pivotValue) {
397 | throw new IllegalArgumentException(String.format("Value greater than pivot at %d", i));
398 | }
399 | }
400 |
401 | for (int i = low; i < high; i++) {
402 | if (values[order[i]] != pivotValue) {
403 | throw new IllegalArgumentException(String.format("Non-pivot at %d", i));
404 | }
405 | }
406 |
407 | for (int i = high; i < end; i++) {
408 | double v = values[order[i]];
409 | if (v <= pivotValue) {
410 | throw new IllegalArgumentException(String.format("Value less than pivot at %d", i));
411 | }
412 | }
413 | }
414 |
415 | /**
416 | * Limited range insertion sort. We assume that no element has to move more than limit steps
417 | * because quick sort has done its thing.
418 | *
419 | * @param order The permutation index
420 | * @param values The values we are sorting
421 | * @param start Where to start the sort
423 | * @param n The end of the range to sort (exclusive)
423 | * @param limit The largest amount of disorder
424 | */
425 | @SuppressWarnings("SameParameterValue")
426 | private static void insertionSort(int[] order, int[] values, int start, int n, int limit) {
427 | for (int i = start + 1; i < n; i++) {
428 | int t = order[i];
429 | double v = values[order[i]];
430 | int m = Math.max(i - limit, start);
431 | for (int j = i; j >= m; j--) {
432 | if (j == m || values[order[j - 1]] <= v) {
433 | if (j < i) {
434 | System.arraycopy(order, j, order, j + 1, i - j);
435 | order[j] = t;
436 | }
437 | break;
438 | }
439 | }
440 | }
441 | }
442 |
443 | /**
444 | * Reverses an array in-place.
445 | *
446 | * @param order The array to reverse
447 | */
448 | @SuppressWarnings("WeakerAccess")
449 | public static void reverse(int[] order) {
450 | reverse(order, 0, order.length);
451 | }
452 |
453 | /**
454 | * Reverses part of an array. See {@link #reverse(int[])}
455 | *
456 | * @param order The array containing the data to reverse.
457 | * @param offset Where to start reversing.
458 | * @param length How many elements to reverse
459 | */
460 | @SuppressWarnings("WeakerAccess")
461 | public static void reverse(int[] order, int offset, int length) {
462 | for (int i = 0; i < length / 2; i++) {
463 | int t = order[offset + i];
464 | order[offset + i] = order[offset + length - i - 1];
465 | order[offset + length - i - 1] = t;
466 | }
467 | }
468 |
469 | /**
470 | * Reverses part of an array. See {@link #reverse(int[])}
471 | *
472 | * @param order The array containing the data to reverse.
473 | * @param offset Where to start reversing.
474 | * @param length How many elements to reverse
475 | */
476 | @SuppressWarnings({"WeakerAccess", "SameParameterValue"})
477 | public static void reverse(double[] order, int offset, int length) {
478 | for (int i = 0; i < length / 2; i++) {
479 | double t = order[offset + i];
480 | order[offset + i] = order[offset + length - i - 1];
481 | order[offset + length - i - 1] = t;
482 | }
483 | }
484 | }
485 |
--------------------------------------------------------------------------------
/java/src/main/java/com/tdunning/examples/VectorText.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import java.util.*;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 | import java.util.stream.Collectors;
7 | import java.util.stream.Stream;
8 | import java.util.stream.StreamSupport;
9 |
10 | public class VectorText {
11 | private static Pattern word = Pattern.compile(String.join("", "",
12 | //            "([A-Z]\\.)+",                  // an abbreviation like U.S.A.
13 | "\\d+:(\\.\\d)+", // a number
14 | "|(\\w+)", // a word
15 | "|(https?://)?(\\w+\\.)(\\w{2,})+([\\w/]+)?", // URL
16 | "|[@#]?\\w+(?:[-']\\w+)*", // twitter-like user reference
17 | "|\\$\\d+(\\.\\d+)?%?", // dollar amount
18 |             "|\\\\[Uu]\\w+",                    // unicode escape left in the text
19 |             "|\\\\[Uu]\\w+'t"                   // unicode escape ending in a contraction
20 | ));
21 |
22 | @SuppressWarnings("WeakerAccess")
23 |     public static Stream<String> tokenize(CharSequence s) {
24 |         Iterator<String> is = new Iterator<String>() {
25 | int position = 0;
26 | Matcher m = word.matcher(s);
27 |
28 | @Override
29 | public boolean hasNext() {
30 | return m.find(position);
31 | }
32 |
33 | @Override
34 | public String next() {
35 | position = m.end();
36 | return m.group().toLowerCase();
37 | }
38 | };
39 |         // tokens repeat and are not sorted, so DISTINCT and SORTED would be wrong here
40 |         int characteristics = Spliterator.ORDERED | Spliterator.IMMUTABLE | Spliterator.NONNULL;
41 |         Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(is, characteristics);
41 |
42 | return StreamSupport.stream(spliterator, false);
43 | }
44 |
45 | @SuppressWarnings("WeakerAccess")
46 |     public static List<String> tokenizeAsList(CharSequence s) {
47 | return tokenize(s).collect(Collectors.toList());
48 | }
49 |
50 |     public static int[] vectorize(Map<String, Integer> dictionary, String s) {
51 | int[] result = new int[dictionary.size()];
52 | VectorText.tokenize(s).forEach(w -> {
53 | if (dictionary.containsKey(w)) {
54 | result[dictionary.get(w)] = 1;
55 | }
56 | });
57 | return result;
58 | }
59 |
60 |     public static int[] count(Map<String, Integer> dictionary, String s) {
61 | int[] result = new int[dictionary.size()];
62 | VectorText.tokenize(s).forEach(w -> {
63 | if (dictionary.containsKey(w)) {
64 | result[dictionary.get(w)]++;
65 | }
66 | });
67 | return result;
68 | }
69 | }
70 |
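71 | // A hedged usage sketch (not part of the original file): builds a small dictionary
72 | // from one sentence and applies count() to it. The class name is hypothetical.
73 | class VectorTextExample {
74 |     public static void main(String[] args) {
75 |         Map<String, Integer> dict = new TreeMap<>();
76 |         for (String w : VectorText.tokenizeAsList("the cat sat on the mat")) {
77 |             dict.putIfAbsent(w, dict.size());
78 |         }
79 |         // "the" occurs twice, every other word once: prints [2, 1, 1, 1, 1]
80 |         System.out.println(Arrays.toString(VectorText.count(dict, "the cat sat on the mat")));
81 |     }
82 | }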
--------------------------------------------------------------------------------
/java/src/test/java/com/tdunning/examples/CooDataTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import org.junit.Test;
4 | import smile.math.matrix.SparseMatrix;
5 |
6 | import java.util.*;
7 |
8 | import static org.junit.Assert.*;
9 |
10 | public class CooDataTest {
11 |
12 | public static final int ROWS = 100000;
13 | public static final int COLS = 100000;
14 |
15 | @Test
16 | public void basics() {
17 | Random rand = new Random();
18 | CooData m = new CooData(ROWS, COLS);
19 |         Map<Pair, Double> ref = new TreeMap<>();
20 | for (int step = 0; step < 100000; step++) {
21 | int i;
22 | int j;
23 |
24 | if (rand.nextDouble() < 0.2) {
25 | i = rand.nextInt(20);
26 | j = rand.nextInt(20);
27 | } else {
28 | i = (int) (-10000 * Math.log(rand.nextDouble()));
29 | if (i >= ROWS) {
30 | i = ROWS - 1;
31 | }
32 | j = (int) (-10000 * Math.log(rand.nextDouble()));
33 | if (j >= COLS) {
34 | j = COLS - 1;
35 | }
36 | }
37 | double x = rand.nextGaussian();
38 | Pair k = new Pair(i, j);
39 | if (ref.containsKey(k)) {
40 | ref.put(k, ref.get(k) + x);
41 | } else {
42 | ref.put(k, x);
43 | }
44 | m.add(i, j, x);
45 | }
46 | m.compress(CooData.ElementOrdering.BY_ROW, true);
47 |
48 | for (int step = 0; step < m.entries; step++) {
49 | assertEquals(ref.get(new Pair(m.rows[step], m.cols[step])), m.values[step], 0);
50 | }
51 | }
52 |
53 | @Test
54 | public void small() {
55 | CooData m = new CooData(5, 7);
56 | for (int i = 0; i < 5; i++) {
57 | for (int j = 0; j < 7; j++) {
58 | m.add(i, j, 100 * i + j);
59 | }
60 | }
61 | for (int i = 0; i < 5; i++) {
62 | m.add(i, i, 2);
63 | m.add(i, i + 1, 3);
64 | }
65 | m.compress(CooData.ElementOrdering.BY_ROW, true);
66 | int k = 0;
67 | for (int i = 0; i < 5; i++) {
68 | for (int j = 0; j < 7; j++) {
69 | assertEquals(m.rows[k], i);
70 | assertEquals(m.cols[k], j);
71 | assertEquals(m.values[k], 100 * i + j + ((i == j) ? 2 : 0) + ((j == i + 1) ? 3 : 0), 0);
72 | k++;
73 | }
74 | }
75 |
76 | SparseMatrix mx = m.asSparseMatrix();
77 | k = 0;
78 | for (int j = 0; j < 7; j++) {
79 | for (int i = 0; i < 5; i++) {
80 | assertEquals(m.rows[k], i);
81 | assertEquals(m.cols[k], j);
82 | assertEquals(m.values[k], 100 * i + j + ((i == j) ? 2 : 0) + ((j == i + 1) ? 3 : 0), 0);
83 | assertEquals(m.values[k], mx.get(i,j), 0);
84 | k++;
85 | }
86 | }
87 |
88 |
89 | }
90 |
91 |     private class Pair implements Comparable<Pair> {
92 | int i, j;
93 |
94 | public Pair(int i, int j) {
95 | this.i = i;
96 | this.j = j;
97 | }
98 |
99 | @Override
100 | public boolean equals(Object o) {
101 | if (this == o) return true;
102 | if (o == null || getClass() != o.getClass()) return false;
103 | Pair pair = (Pair) o;
104 | return i == pair.i &&
105 | j == pair.j;
106 | }
107 |
108 | @Override
109 | public int hashCode() {
110 | return 31 * i + j;
111 | }
112 |
113 | @Override
114 | public int compareTo(Pair other) {
115 |             int r = this.i - other.i;
116 | if (r == 0) {
117 | return this.j - other.j;
118 | } else {
119 | return r;
120 | }
121 | }
122 | }
123 | }
--------------------------------------------------------------------------------
/java/src/test/java/com/tdunning/examples/JacobiTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import org.junit.Test;
4 | import smile.math.matrix.SparseMatrix;
5 |
6 | import static org.junit.Assert.*;
7 |
8 | public class JacobiTest {
9 |
10 | @Test
11 | public void solve() {
12 | CooData connectionData = new CooData();
13 | // 100 x 100 mesh encoded as 10,000 elements in a vector
14 | for (int i = 0; i < 100; i++) {
15 | for (int j = 0; j < 100; j++) {
16 | int k0 = coord(i, j);
17 | double sum = 0;
18 | for (int dx = -1; dx <= 1; dx++) {
19 | for (int dy = -1; dy <= 1; dy++) {
20 | if ((dx != 0 || dy != 0) && i + dx >= 0 && i + dx < 100 && j + dy >= 0 && j + dy < 100) {
21 | double w = 0.125;
22 | sum += w;
23 | connectionData.add(k0, coord(i + dx, j + dy), w);
24 | }
25 | }
26 | }
27 | connectionData.add(k0, k0, -sum);
28 | }
29 | }
30 |
31 | SparseMatrix transfer = connectionData.asSparseMatrix();
32 |
33 | Jacobi jSolver = new Jacobi(transfer);
34 | double[] b = new double[10000];
35 | for (int i = 0; i < 100; i++) {
36 | b[coord(i, 0)] = 1;
37 | b[coord(i, 99)] = -1;
38 | }
39 | double[] x = jSolver.solve(b);
40 | for (int j = 0; j < 10; j++) {
41 | System.out.printf("%.2f ", x[j]);
42 | }
43 | for (int j = 10; j < 100; j += 5) {
44 | System.out.printf("%.2f ", x[j]);
45 | }
46 | System.out.printf("\n");
47 |
48 | }
49 |
50 | private int coord(int i, int j) {
51 | return 100 * i + j;
52 | }
53 | }
54 |
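55 | // A hedged sketch (not in the original test) of assertions that could back up the
56 | // printouts above: the source vector b is antisymmetric about the mesh midline
57 | // (+1 on column 0, -1 on column 99), so the solution should be too, up to solver
58 | // tolerance, and should shrink in magnitude toward the center, e.g.:
59 | //     assertEquals(x[coord(50, 0)], -x[coord(50, 99)], 1e-6);
60 | //     assertTrue(Math.abs(x[coord(50, 50)]) < Math.abs(x[coord(50, 0)]));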
--------------------------------------------------------------------------------
/java/src/test/java/com/tdunning/examples/VectorTextTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.examples;
2 |
3 | import com.google.common.collect.HashMultiset;
4 | import com.google.common.collect.Multiset;
5 | import org.junit.Test;
6 | import smile.math.matrix.DenseMatrix;
7 | import smile.math.matrix.JMatrix;
8 | import smile.math.matrix.SparseMatrix;
9 |
10 | import java.io.File;
11 | import java.io.IOException;
12 | import java.nio.charset.StandardCharsets;
13 | import java.nio.file.Files;
14 | import java.nio.file.Paths;
15 | import java.util.*;
16 | import java.util.concurrent.atomic.AtomicInteger;
17 | import java.util.function.Function;
18 | import java.util.stream.Collectors;
19 | import java.util.stream.Stream;
20 |
21 | import static org.junit.Assert.*;
22 |
23 | public class VectorTextTest {
24 | private String sample = "We stayed for 5 nights last week. " +
25 | "The cold food of fruit/pastries/cereals is great. " +
26 | "$10 for a small OJ!! ";
27 | private String[] tokens = {
28 | "we", "stayed", "for", "5", "nights", "last", "week",
29 | "the", "cold", "food", "of", "fruit", "pastries",
30 | "cereals", "is", "great", "$10", "for", "a", "small", "oj"
31 | };
32 |
33 | @org.junit.Test
34 | public void tokenizeAsStream() {
35 |         Stream<String> ref = Stream.of(tokens);
36 |         Iterator<String> ix = ref.iterator();
37 | VectorText.tokenize(sample)
38 | .forEachOrdered(s -> assertEquals(ix.next(), s));
39 | assertFalse(ix.hasNext());
40 | }
41 |
42 | @org.junit.Test
43 | public void tokenize() {
44 |         Iterator<String> is = Arrays.asList(tokens).iterator();
45 | for (String s : VectorText.tokenizeAsList(sample)) {
46 | assertEquals(is.next(), s);
47 | }
48 | }
49 |
50 | @Test
51 | public void vectorize() {
52 | // build a dictionary of all words we see in a subset of the text
53 |         Map<String, Integer> dict = VectorText.tokenize(sample.split("\\.")[0]).collect(
54 | TreeMap::new,
55 |                 (Map<String, Integer> d, String s) -> d.put(s, d.size()),
56 | Map::putAll
57 | );
58 | dict.remove("5");
59 | dict.put("tiger", dict.size());
60 |
61 | int[] v = VectorText.vectorize(dict, sample);
62 | assertArrayEquals(new int[]{1, 1, 1, 0, 1, 1, 1}, v);
63 | }
64 |
65 | @Test
66 | public void count() {
67 | // build a dictionary of all words we see in a subset of the text
68 |         Map<String, Integer> dict = VectorText.tokenize(sample.split("\\.")[0]).collect(
69 | TreeMap::new,
70 |                 (Map<String, Integer> d, String s) -> d.put(s, d.size()),
71 | Map::putAll
72 | );
73 | dict.remove("5");
74 | dict.put("tiger", dict.size());
75 |
76 | int[] v = VectorText.count(dict, sample);
77 | assertArrayEquals(new int[]{1, 1, 2, 0, 1, 1, 1}, v);
78 | }
79 |
80 | @Test
81 | public void gloveVectors() throws IOException {
82 | int nDocs = 50000;
83 | Progress p = new Progress();
84 |
85 | AtomicInteger docCount = new AtomicInteger();
86 | double t0 = System.nanoTime() / 1e9;
87 |         Multiset<String> counts = docs(p, nDocs)
88 | .flatMap(VectorText::tokenize)
89 | .collect(
90 | HashMultiset::create,
91 | (strings, element) -> {
92 | docCount.incrementAndGet();
93 | strings.add(element);
94 | },
95 | HashMultiset::addAll);
96 | AtomicInteger wordCount = new AtomicInteger();
97 |         Map<String, Integer> dict = counts.elementSet().stream()
98 | .filter(w -> counts.count(w) > 3)
99 | .collect(
100 | TreeMap::new,
101 | (d, w) -> d.put(w, wordCount.getAndIncrement()),
102 | TreeMap::putAll);
103 |         List<String> undict = new ArrayList<>(dict.keySet());
104 |
105 | Progress p1 = new Progress();
106 | DenseMatrix wordVectors = Files.lines(Paths.get("/Users/tdunning/Downloads/glove.6B/glove.6B.100d.txt"))
107 | .collect(
108 | () -> new JMatrix(dict.size(), 100),
109 | (DenseMatrix m, String rawWordVector) -> {
110 | p1.log();
111 | int i = rawWordVector.indexOf(' ');
112 | String word = rawWordVector.substring(0, i);
113 | if (dict.containsKey(word)) {
114 | int row = dict.get(word);
115 | int j = 0;
116 | for (String v : rawWordVector.substring(i + 1).split(" ")) {
117 | try {
118 | m.set(row, j++, Double.parseDouble(v));
119 | } catch (NumberFormatException e) {
120 | System.out.printf("Error in %s\n%s\n", v, rawWordVector);
121 | }
122 | }
123 | }
124 | },
125 | DenseMatrix::add
126 | );
127 |
128 | DenseMatrix idf = new JMatrix(dict.size(), 1);
129 | for (String w : dict.keySet()) {
130 |             idf.set(dict.get(w), 0, Math.log((double) counts.size() / counts.count(w)));  // avoid integer division
131 | }
132 |
133 | docs(p, 100)
134 | .forEach(
135 | doc -> {
136 | // for each document, build out sum of idf-weighted one-hot vectors
137 | DenseMatrix docVector = new JMatrix(100, 1);
138 | VectorText.tokenize(doc)
139 | .filter(dict::containsKey)
140 | .forEach(
141 | w -> {
142 | int iw = dict.get(w);
143 | for (int i = 0; i < 100; i++) {
144 | docVector.add(i, 0, wordVectors.get(iw, i) * idf.get(iw, 0));
145 | }
146 | }
147 | );
148 |
149 | // now multiply back at the word vectors to find nearest terms
150 | // (dict.size() x 100) * (100 x 1) => (dict.size() x 1)
151 | DenseMatrix r = wordVectors.abmm(docVector);
152 |
153 | // find words with highest score
154 |                             PriorityQueue<ScoredPair> pq = new PriorityQueue<>(Comparator.comparingDouble(a -> a.score));
155 | for (int i = 0; i < r.nrows(); i++) {
156 | pq.add(new ScoredPair(i, 0, r.get(i, 0)));
157 | while (pq.size() > 50) {
158 | pq.poll();
159 | }
160 | }
161 |
162 | // reverse into descending order
163 |                             List<Integer> best = pq.stream()
164 | .map(scoredItem -> scoredItem.i)
165 | .collect(Collectors.toList());
166 | Collections.reverse(best);
167 |
168 | // and let's take a look
169 | System.out.printf("%s\n ", doc.substring(0, Math.min(50, doc.length())));
170 | for (Integer w : best) {
171 | System.out.printf(" %s", undict.get(w));
172 | }
173 | System.out.printf("\n");
174 | }
175 | );
176 | }
177 |
178 | private static class Progress {
179 | int step = 1;
180 | int order = 1;
181 |
182 | int count = 0;
183 | double t0 = System.nanoTime() * 1e-9;
184 | private int oldCount = 0;
185 |
186 | void log() {
187 | count++;
188 | if (count == step * order) {
189 | double t1 = System.nanoTime() * 1e-9;
190 | double rate = (count - oldCount) / (t1 - t0);
191 | t0 = t1;
192 | oldCount = count;
193 | System.out.printf("%d (%.0f / sec)\n", count, rate);
194 |
195 | step = (int) (2.51 * step);
196 | if (step > 10) {
197 | step = 1;
198 | order *= 10;
199 | }
200 | }
201 | }
202 | }
203 |
204 | @Test
205 | public void documentSpeed() throws IOException {
206 | int nDocs = -1;
207 | double frequencyCut = 1000;
208 | int minScore = 12;
209 | int maxAssociates = 100;
210 |
211 | double t0 = System.nanoTime() / 1e9;
212 | Progress p = new Progress();
213 | // count all the words in our corpus
214 |         Multiset<String> counts = docs(p, nDocs)
215 | .flatMap(VectorText::tokenize)
216 | .collect(
217 | HashMultiset::create,
218 | Multiset::add,
219 | HashMultiset::addAll);
220 | System.out.printf("%d total terms processed\n", counts.size());
221 | // build a dictionary with words that occur sufficiently
222 |         Map<String, Integer> dict = counts.elementSet().stream()  // distinct words, not one entry per occurrence
223 | .filter(w -> counts.count(w) > 3)
224 | .collect(
225 | TreeMap::new,
226 | (d, w) -> d.put(w, d.size()),
227 | TreeMap::putAll);
228 |
229 | // invert our dictionary as well
230 |         Map<Integer, String> undict = new HashMap<>();
231 | for (String w : dict.keySet()) {
232 | undict.put(dict.get(w), w);
233 | }
234 |
235 |
236 | double t1 = System.nanoTime() / 1e9;
237 | System.out.printf("built dictionaries %.1f MB/s\n", new File("/Users/tdunning/tmp/OpinRank/hotels.txt").length() / (t1 - t0) / 1e6);
238 | p = new Progress();
239 |
240 | Random rand = new Random();
241 |
242 | // print some documents out for reference and checking
243 | AtomicInteger id = new AtomicInteger(0);
244 |         Map<Integer, Set<String>> ref = docs(p, 10)
245 | .collect(
246 | TreeMap::new,
247 | (m, raw) -> {
248 | int currentDoc = id.getAndIncrement();
249 | // downsample our words according to limit max frequency
250 | // and translate to integer form
251 |                         Set<String> words = VectorText.tokenize(raw)
252 | .filter(w -> dict.containsKey(w) && (rand.nextDouble() < frequencyCut / counts.count(w)))
253 | .map(w -> w + "-" + dict.get(w))
254 | .collect(Collectors.toSet());
255 | m.put(currentDoc, words);
256 | },
257 | Map::putAll);
258 |
259 | for (Integer docId : ref.keySet()) {
260 | System.out.printf("%d: (", docId);
261 | for (String w : ref.get(docId)) {
262 | System.out.printf("%s ", w);
263 | }
264 | System.out.printf(")\n");
265 | }
266 | System.out.printf("\n");
267 |
268 | p = new Progress();
269 |
270 | // do the cooccurrence counting with downsampling of common items
271 | t0 = System.nanoTime() / 1e9;
272 | AtomicInteger docid = new AtomicInteger(0);
273 | CooData binaryTerms = docs(p, nDocs)
274 | .collect(
275 | CooData::new,
276 | (CooData m, String raw) -> {
277 | int currentDoc = docid.getAndIncrement();
278 | // downsample our words according to limit max frequency
279 | // and translate to integer form
280 | CooData words = VectorText.tokenize(raw)
281 | .filter(w -> dict.containsKey(w) && (rand.nextDouble() < frequencyCut / counts.count(w)))
282 | .collect(
283 | () -> m,
284 | (CooData mx, String w) -> mx.add(currentDoc, dict.get(w), 1.0),
285 | CooData::append);
286 | },
287 | CooData::append);
288 | binaryTerms.compress(CooData.ElementOrdering.BY_COL, false);
289 | for (int k = 0; k < binaryTerms.entries; k++) {
290 | binaryTerms.values[k] = 1;
291 | }
292 | t1 = System.nanoTime() / 1e9;
293 | System.out.printf("build doc matrix %.1f MB/s\n", new File("/Users/tdunning/tmp/OpinRank/hotels.txt").length() / (t1 - t0) / 1e6);
294 |
295 | SparseMatrix docByTerms = binaryTerms.asSparseMatrix();
296 | double[] finalCounts = new double[docByTerms.ncols()];
297 | docByTerms.foreachNonzero(
298 | (doc, word, k) -> {
299 | finalCounts[word]++;
300 | });
301 | int totalDocuments = docByTerms.nrows();
302 | int totalWords = docByTerms.ncols();
303 |
304 | System.out.printf("doc matrix is %d x %d (%d vs %d non-zeros)\n", docByTerms.nrows(), docByTerms.ncols(), docByTerms.size(), binaryTerms.entries);
305 | SparseMatrix cooc = docByTerms.ata();
306 | System.out.printf("%d x %d (%d non-zeros)\n", cooc.nrows(), cooc.ncols(), cooc.size());
307 |
308 | // build associates matrix for words
309 | CooData rawConnections = new CooData(cooc.nrows(), cooc.ncols());
310 | for (int word = 0; word < totalWords; word++) {
311 |             PriorityQueue<ScoredPair> highScores = new PriorityQueue<>(Comparator.comparingDouble(t12 -> t12.score));
312 |
313 | // scan through each column, scoring cooccurrences
314 | cooc.foreachNonzero(word, word + 1,
315 | (w1, w2, k11) -> {
316 | double k1x = finalCounts[w1];
317 | double kx1 = finalCounts[w2];
318 | double k12 = k1x - k11;
319 | double k21 = kx1 - k11;
320 | double k22 = totalDocuments - k11 - k12 - k21;
321 | double score = llr(k11, k12, k21, k22);
322 | if (score > minScore && (highScores.size() < maxAssociates || score > highScores.peek().score)) {
323 | highScores.add(new ScoredPair(w1, w2, score));
324 | }
325 | while (highScores.size() > maxAssociates) {
326 | highScores.poll();
327 | }
328 | });
329 | while (highScores.size() > 0) {
330 | ScoredPair associate = highScores.poll();
331 | rawConnections.add(associate.i, associate.j, 1);
332 | }
333 | }
334 |
335 | SparseMatrix associates = rawConnections.asSparseMatrix();
336 | SparseMatrix similar = associates.ata();
337 | for (String w : new String[]{"wild", "bad", "good", "lovely", "hotel", "rail"}) {
338 | System.out.printf("%s: ", w);
339 | similar.foreachNonzero(dict.get(w), dict.get(w) + 1,
340 | (w1, w2, x) -> {
341 | if (x > 8) {
342 | System.out.printf("%s-%.0f ", undict.get(w1), x);
343 | }
344 | });
345 | System.out.printf("\n");
346 | }
347 | }
348 |
349 | private double h(double... kxx) {
350 | double sum = 0;
351 | for (double k : kxx) {
352 | sum += k;
353 | }
354 | double r = 0;
355 | for (double k : kxx) {
356 | if (k > 0) {
357 | r -= k * Math.log(k / sum);
358 | }
359 | }
360 | return r;
361 | }
362 |
363 | private double llr(double k11, double k12, double k21, double k22) {
364 | return 2 * (h(k11 + k12, k21 + k22) + h(k11 + k21, k12 + k22) - h(k11, k12, k21, k22));
365 | }
366 |
367 | @Test
368 | public void testHash() {
369 | int[] counts = new int[65536];
370 | for (int i = 0; i < 65536; i++) {
371 | for (int j = 0; j < 65536; j++) {
372 | int k = new IntPair(i, j).hashCode();
373 | k = k % counts.length;
374 | if (k < 0) {
375 | k += counts.length;
376 | }
377 | counts[k]++;
378 | }
379 | }
380 | int[] tmp = Arrays.copyOf(counts, counts.length);
381 | Arrays.sort(tmp);
382 |
383 | double qSoFar = 0;
384 | System.out.printf("%10.3f %d\n", qSoFar, 0);
385 | for (double q = 0; q < 0.9; q += 0.1) {
386 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]);
387 | }
388 | for (double q = 0.9; q < 0.99; q += 0.01) {
389 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]);
390 | }
391 | for (double q = 0.99; q < 1; q += 0.001) {
392 | System.out.printf("%10.3f %d\n", q, tmp[(int) (q * tmp.length)]);
393 | }
394 |
395 | System.out.printf("\n\nbig\n");
396 | int last = 0;
397 | for (int i = 0; i < 65536; i++) {
398 | if (counts[i] >= 1082052) {
399 | System.out.printf("%10x %10d %10d\n", i, i, i - last);
400 | last = i;
401 | }
402 | }
403 | System.out.printf("\nend\n");
404 | }
405 |
406 | private class IntPair {
407 | public IntPair(int i, int j) {
408 | this.i = i;
409 | this.j = j;
410 | }
411 |
412 | int i, j;
413 |
414 | @Override
415 | public boolean equals(Object o) {
416 | if (this == o) return true;
417 | if (o == null || getClass() != o.getClass()) return false;
418 | IntPair intPair = (IntPair) o;
419 | return i == intPair.i &&
420 | j == intPair.j;
421 | }
422 |
423 | @Override
424 | public int hashCode() {
425 | int seed = 3;
426 | // murmur is nice for general bit mixing, but it has some nasty favored patterns
427 | return 1037 * murmur(seed, i, j) + 17 * i + 53 * j;
428 | }
429 |
430 | private int murmur(int seed, int i, int j) {
431 | // one round of murmur
432 | int c1 = 0xcc9e2d51;
433 | int c2 = 0x1b873593;
434 |
435 | int k = i;
436 | k *= c1;
437 | k = (k << 15) | (k >> 17);
438 | k *= c2;
439 |
440 | int h = seed ^ k;
441 | h = (h << 13) | (h >> 19);
442 | h = h * 5 + 0xe6546b64;
443 |
444 | k = j;
445 | k *= c1;
446 | k = (k << 15) | (k >> 17);
447 | k *= c2;
448 | h = h ^ k;
449 | h = (h << 13) | (h >> 19);
450 | h = h * 5 + 0xe6546b64;
451 |
452 | h ^= h >>> 16;
453 | h *= 0x85ebca6b;
454 | h ^= h >>> 13;
455 | h *= 0xc2b2ae35;
456 | h ^= h >>> 16;
457 |
458 | return h;
459 | }
460 | }
461 |
462 |     private Stream<String> docs(Progress p) throws IOException {
463 | return docs(p, -1);
464 | }
465 |
466 |     private Stream<String> docs(Progress p, int limit) throws IOException {
467 |         Function<String, String> parser = line -> {
468 | p.log();
469 | int k = line.indexOf('\t');
470 | if (k >= 0) {
471 | k = line.indexOf('\t', k + 1);
472 | if (k >= 0) {
473 | return line.substring(k + 1);
474 | } else {
475 | throw new IllegalArgumentException("Couldn't find second tab");
476 | }
477 | } else {
478 | throw new IllegalArgumentException("Couldn't find first tab");
479 | }
480 | };
481 | if (limit <= 0) {
482 | return Files.lines(Paths.get("/Users/tdunning/tmp/OpinRank/hotels.txt"), StandardCharsets.ISO_8859_1)
483 | .map(parser);
484 | } else {
485 | return Files.lines(Paths.get("/Users/tdunning/tmp/OpinRank/hotels.txt"), StandardCharsets.ISO_8859_1)
486 | .limit(limit)
487 | .map(parser);
488 |
489 | }
490 | }
491 |
492 | private class ScoredPair {
493 | private final int i;
494 | private final int j;
495 | private final double score;
496 |
497 | public ScoredPair(int i, int j, double score) {
498 | this.i = i;
499 | this.j = j;
500 | this.score = score;
501 | }
502 |
503 | public int getI() {
504 | return i;
505 | }
506 |
507 | public int getJ() {
508 | return j;
509 | }
510 |
511 | public double getScore() {
512 | return score;
513 | }
514 | }
515 | }
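516 | // A hedged worked example for the h()/llr() helpers above (numbers are made up):
517 | // a 2x2 table with heavy cooccurrence, k11=100, k12=10, k21=10, k22=100, gives
518 | // llr(...) ≈ 171, while a fully independent table with every cell equal to 25
519 | // gives exactly 0. That scale is why documentSpeed() keeps pairs scoring above
520 | // minScore = 12.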
--------------------------------------------------------------------------------
/src/python/README.md:
--------------------------------------------------------------------------------
1 | # Python Feature Extraction Examples
2 |
3 | ## Methods Illustrated
4 | Symbol combinations
5 |
6 | Quantiles, log-odds and binning
7 |
8 | Reduction to integers using ordinal encoding
9 |
10 | One-hot encoding (and counting)
11 |
12 | Frequency encoding and unknown entity
13 |
14 | Luduan features
15 | ## Use Cases
16 | Web log
17 | * Domain, referer, user agent
18 | * referer + domain hashed encoding
19 |
20 | Header fields
21 | * ordering
22 | * language frequency
23 | * language + charset combos
24 | * unknown word
25 |
26 | Purchase amount history
27 | * log-odds, binning on purchase size
28 | * symbol combination (store + quantile-bin)
29 |
30 | Viewership
31 | * time quadrature, one-hot
32 | * one-hot time encodings
33 |
34 | Common point of compromise
35 | * Luduan
36 |
37 | Energy models
38 | * 5P model parameters
39 | * residuals
40 |
41 | Credit card gangs
42 | * card velocity
43 |
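44 | ## Sketch: frequency encoding
45 |
46 | As a hedged illustration of the frequency-encoding idea above (not one of the
47 | scripts in this directory; names are made up), replace each categorical value by
48 | how often it occurred in training, mapping unknown entities to zero:
49 |
50 | ```python
51 | from collections import Counter
52 |
53 | train = ["safari", "firefox", "safari", "chrome"]
54 | freq = Counter(train)
55 |
56 | def encode(v):
57 |     return freq[v] / len(train)  # Counter gives 0 for an unknown entity
58 |
59 | print([encode(v) for v in ["safari", "opera"]])  # [0.5, 0.0]
60 | ```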
--------------------------------------------------------------------------------
/src/python/cooc.py:
--------------------------------------------------------------------------------
1 | ### We will use the hotel reviews from https://kavita-ganesan.com/entity-ranking-data
2 | ### to build word representations using cooccurrence
3 | import collections
4 | import random
5 | import re
6 | import time
7 | from math import floor
8 | from typing import Optional, Set, Callable, List
9 |
10 | import numpy as np
11 | import scipy.sparse as sparse
12 | from sklearn import preprocessing as pre
13 |
14 | # We just use a simple regex here to define words
15 | # this is a bit lossy compared to fancier tokenizers
16 | # but it is also about 50x faster
17 | wordPattern = re.compile(r'''(?x)
18 | ([A-Z]\.)+
19 | |\d+:(\.\d)+
20 | |(https?://)?(\w+\.)(\w{2,})+([\w/]+)?
21 | |[@#]?\w+(?:[-']\w+)*
22 | |\$\d+(\.\d+)?%?
23 | |\\[Uu]\w+
24 | |\\[Uu]\w+'t
25 | |\.\.\.
26 | |[!?]+
27 | ''')
28 |
29 |
30 | def docs(max_docs=-1, ignore=None):
31 | """Returns a generator of generators. The inner generators return the tokens of
32 | each document in our corpus."""
33 | if ignore is None:
34 | ignore = set()
35 | with open("/Users/tdunning/tmp/OpinRank/hotels.txt", "r", encoding="latin_1") as f:
36 | doc = 0
37 | step = 1
38 | scale = 10
39 | t0 = time.time_ns() / 1e9
40 | i0 = 0
41 | for line in f.read().split("\n"):
42 | doc = doc + 1
43 | if max_docs != -1 and doc > max_docs:
44 | break
45 | if doc % (step * scale) == 0:
46 | t1 = time.time_ns() / 1e9
47 | print("Doc %d (%.0f doc/s)" % (doc, (doc - i0) / (t1 - t0)))
48 | i0 = doc
49 | t0 = t1
50 | step = floor(step * 2.55)
51 | if step >= 10:
52 | step = 1
53 | scale = scale * 10
54 | pieces = line.split('\t', maxsplit=2)
55 | if len(pieces) == 3:
56 | yield (m.group(0) for m in wordPattern.finditer(pieces[2].lower()) if m and m.group(0) not in ignore)
57 |
58 |
59 | def H(k):
60 | """Computes unnormalized entropy of a stack of vectors"""
61 | if k.ndim == 2:
62 | k = k[:, :, np.newaxis]
63 | p = (k + 0.0) / k.sum(axis=1).sum(axis=1)[:, np.newaxis, np.newaxis]
64 | raw = -(k * np.log(p + (p == 0)))
65 | while raw.ndim > 1:
66 | raw = raw.sum(axis=1)
67 | return raw
68 |
69 |
70 | def llr(k):
71 | """Computes the log-likelihood ratio test for binomials in a vector-wise fashion.
72 | K is assumed to contain an n x 2 x 2 array of counts presumed to be a 2x2 table for
73 | each of n cases. We return an n-long vector of scores."""
74 | s_row = H(k.sum(axis=1))
75 | s_col = H(k.sum(axis=2))
76 | s = H(k)
77 | return 2 * (s_row + s_col - s)
78 |
79 |
80 | def encode(docs: collections.abc.Iterable, lexicon: List[str], ignore: Optional[Set] = None,
81 |            matrix: Callable = sparse.csr_matrix) -> sparse.spmatrix:
82 |     if ignore is None:
83 |         ignore = set()
84 | lookup = dict(zip(sorted(lexicon), range(len(lexicon))))
85 | rows = []
86 | cols = []
87 | data = []
88 | k = 0
89 | for d in docs:
90 | cols.extend({lookup[w] for w in d if w not in ignore})
91 | n = len(cols) - len(rows)
92 | rows.extend(itertools.repeat(k, n))
93 | data.extend(itertools.repeat(1, n))
94 | k += 1
95 | zz = matrix((data, (rows, cols)))
96 | return (zz)
97 |
98 |
99 | from collections import Counter
100 | from nltk.corpus import brown
101 | import itertools
102 |
103 |
104 | def count():
105 |     k = Counter()
106 |     for m in wordPattern.finditer(brown.raw().lower()):
107 |         k[m.group(0)] += 1
108 |     return k
109 |
110 |
111 | def test():
112 | return Counter((w for d in docs() for w in d))
113 |
114 |
115 | Ndocs = 50000
116 | minScore = 15
117 | maxAssociates = 30
118 |
119 | # count all the words that appear
120 | lexicon = Counter(itertools.chain.from_iterable(docs(Ndocs)))
121 |
122 | # kill words too rare to have interesting collocation
123 | kill = {w for w in lexicon if lexicon[w] < 3}
124 | for w in kill:
125 | del lexicon[w]
126 |
127 | allWords = sorted(lexicon)
128 |
129 | # build the doc x word matrix using the lexicon we have slightly tuned
130 | # note column friendly result
131 | z = encode(docs(Ndocs), allWords, ignore=kill, matrix=sparse.csc_matrix)
132 |
133 | # downsample frequent words (don't kill them entirely)
134 | targetMaxFrequency = max(200.0, Ndocs / 30.0)
135 | downSampleRate = [min(1, targetMaxFrequency / lexicon[w]) for w in allWords]
136 | print("downsample %.0f words out of %d" % (sum(1 if p < 1 else 0 for p in downSampleRate), len(lexicon)))
137 | for w in range(len(allWords)):
138 | p = downSampleRate[w]
139 | if p < 1:
140 | # only a few words will get whacked
141 | nz = z[:, w].nonzero()
142 | v = [1 if random.random() < p else 0 for i in nz[0]]
143 | z[nz[0], w] = v
144 |
145 | # so here are final counts
146 | wordCounts = z.sum(axis=0)
147 | total = sum(wordCounts)
148 | print("doc x word matrix ready")
149 |
150 | # compute raw cooccurrence
151 | cooc = z.T @ z
152 | # but avoid self-cooccurrence
153 | cooc[(range(cooc.shape[0]), range(cooc.shape[1]))] = 0
154 | print('cooccurrence computation done %.3f sparsity' % ((cooc > 0).sum() / (cooc.shape[0] * cooc.shape[1])))
155 |
156 | # now find interesting cooccurrence
157 | # we build a 3D array with one 2x2 contingency tables for each non-zero in the cooccurrence table
158 | # the four elements count how often two particular words cooccur or not
159 | nz = cooc.nonzero()
160 | # A and B together
161 | k11 = cooc[nz]
162 | # A anywhere
163 | k1_ = wordCounts[0, nz[0]]
164 | # A without B
165 | k12 = k1_ - k11
166 |
167 | # B anywhere
168 | k_1 = wordCounts[0, nz[1]]
169 | # B without A
170 | k21 = k_1 - k11
171 | # neither A nor B
172 | k22 = Ndocs - k12 - k21 - k11
173 |
174 | # final shape should be n x 2 x 2
175 | k = np.array([k11, k12, k21, k22]).reshape((2, 2, k11.shape[1])).transpose()
176 | print("%d x %d x %d counts ready" % k.shape)
177 |
178 | # construct scores wherever cooc was non-zero; note cooc is symmetric, so this does double work
179 | scores = sparse.csr_matrix((llr(k), nz))
180 | print("scoring done")
181 |
182 | # now review each word and limit the number of associates
183 | rows = []
184 | cols = []
185 | for row in range(scores.shape[0]):
186 | # find nth highest score
187 | index = (scores[row, :] >= minScore).nonzero()[1]
188 | if len(index) > 0:
189 | s = sorted((scores[row, index].toarray().flat), reverse=True)
190 | cutoff = s[min(len(s), maxAssociates) - 1]
191 | cols.extend(i for i in index if scores[row, i] >= cutoff)
192 | rows.extend(itertools.repeat(row, len(cols) - len(rows)))
193 | # final result has row per word consisting of unweighted associates
194 | # we might consider idf weighting here
195 | associates = sparse.csr_matrix((list(itertools.repeat(1, len(rows))), (rows, cols)), shape=scores.shape)
196 | print("associates ready")
197 | synonyms = associates * associates.T
198 |
199 | lookup = dict(zip(sorted(lexicon), range(len(lexicon))))
200 | unlook = list(sorted(lexicon))
201 | print([unlook[i] for i in (synonyms[lookup['railway'], :] > 3).nonzero()[1]])
202 | print([unlook[i] for i in (synonyms[lookup['hot'], :] > 3).nonzero()[1]])
203 | print([unlook[i] for i in (synonyms[lookup['cold'], :] > 3).nonzero()[1]])
204 | print([unlook[i] for i in (synonyms[lookup['food'], :] > 3).nonzero()[1]])
205 | print([unlook[i] for i in (synonyms[lookup['room'], :] > 3).nonzero()[1]])
206 |
207 | # lookup = dict(zip(sorted(lexicon), range(len(lexicon))))
208 | # unlook = list(sorted(lexicon))
209 | # m = lookup['railway']
210 | # index = scores[m,:].nonzero()[1]
211 | # print(sorted(((scores[m,i], i, unlook[i]) for i in index), key=lambda x: -x[0])[1:15])
212 |
213 | limit = 1000
214 |
215 | n1 = (wordCounts ** 2).sum()
216 | n2 = (wordCounts[wordCounts < limit] ** 2).sum() + ((wordCounts >= limit) * limit * limit).sum()
217 | print(n1 / n2)
218 |
219 | all_docs = [list(bag) for bag in docs()]  # materialize: the inner generators can be read only once
220 | wordEncoder = pre.OneHotEncoder()
221 | words = [[w] for bag in all_docs for w in bag]
222 | wordEncoder.fit(words)
223 | vectors = [wordEncoder.transform([[w] for w in bag]).sum(axis=0) for bag in all_docs]
224 | wordEncoder.transform([[x] for x in all_docs[0]])
225 |
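226 | # Hedged sanity check for llr() above (counts are made up): a strongly associated
227 | # pair should score high while an independent one scores ~0.
228 | dependent = np.array([[[100, 10], [10, 100]]])    # A and B cooccur heavily
229 | independent = np.array([[[25, 25], [25, 25]]])    # no association at all
230 | print(llr(dependent), llr(independent))           # roughly [171.] and [0.]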
--------------------------------------------------------------------------------
/src/python/onehot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from sklearn import preprocessing
5 |
6 | enc = preprocessing.OrdinalEncoder()
7 |
8 | X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
9 | enc.fit(X)
10 | enc.transform([['female', 'from US', 'uses Safari']])
11 |
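12 | # The ordinal encoder above maps each category to a single integer column. A
13 | # hedged sketch of the one-hot counterpart this file is named after:
14 | onehot = preprocessing.OneHotEncoder()
15 | onehot.fit(X)
16 | # one 0/1 indicator column per known value of each feature
17 | print(onehot.transform([['female', 'from US', 'uses Safari']]).toarray())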
--------------------------------------------------------------------------------
/src/python/time-encodings.py:
--------------------------------------------------------------------------------
1 | """
2 | This demonstrates some simple ways to encode time so that models can
3 | make sense of it.
4 |
5 | The problem at hand is prediction of web traffic on various wikipedia pages.
6 |
7 | The features we will use include:
8 |
9 | * Lagged values for traffic
10 | * Time of day expressed as continuous variables
11 | * Day of week expressed as continuous variables
12 | * Day of week expressed as one-hot variables
13 | * Page URL
14 | """
15 | import handout
16 | import os
17 |
18 | os.makedirs("handouts", exist_ok=True)  # handout: exclude
19 |
20 | doc = handout.Handout("handouts/time") # handout: exclude
21 |
22 | doc.show() # handout: exclude
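23 |
24 | # A hedged sketch of the encodings listed in the docstring (the handout body was
25 | # left empty; variable names here are illustrative, not from the original):
26 | import numpy as np
27 |
28 | hours = np.arange(48)                         # two days of hourly timestamps
29 | # time of day as continuous variables: sin/cos keep 23:00 adjacent to 00:00
30 | tod_sin = np.sin(2 * np.pi * (hours % 24) / 24)
31 | tod_cos = np.cos(2 * np.pi * (hours % 24) / 24)
32 | # day of week as one-hot variables
33 | day = (hours // 24) % 7
34 | day_onehot = np.eye(7)[day]
35 | # lagged traffic values (random stand-in data here)
36 | traffic = np.random.rand(48)
37 | lag24 = np.roll(traffic, 24)                  # naive 24-hour lag; the first day wraps
38 | features = np.column_stack([tod_sin, tod_cos, day_onehot, lag24])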
--------------------------------------------------------------------------------
/src/python/wiki-data-download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | import time
5 |
6 | import urllib3
7 | from bs4 import BeautifulSoup
8 |
9 |
10 | base_url = 'https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-07/'
11 |
12 | http = urllib3.PoolManager()
13 |
14 | r = http.request('GET', base_url)
15 |
16 | soup = BeautifulSoup(r.data, 'html.parser')
17 |
18 | page_pattern = re.compile('pagecounts-.*gz')
19 |
20 | links = [anchor['href'] for anchor in soup.find_all("a") if re.match(page_pattern, anchor['href'])]
21 |
22 | try:
23 | os.mkdir('./wiki-stats')
24 | except FileExistsError:
25 | print("wiki-stats already exists")
26 |
27 | print(os.getcwd())
28 | for link in links:
29 |     url = base_url + link  # base_url already ends in '/'
30 | local_file = os.path.join('wiki-stats', link)
31 | if os.path.exists(local_file):
32 | print("%s already exists, skipping" % local_file)
33 | else:
34 | with http.request('GET', url, preload_content=False) as resp, open(local_file, "wb") as f:
35 | shutil.copyfileobj(resp, f)
36 | print(link)
37 | time.sleep(0.5)
38 |
--------------------------------------------------------------------------------