├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── vec-rot
│   ├── README.md
│   └── vec-rot.py
└── vec2word.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Logan Kearsley

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# vec2word
Semi-automated vocabulary generation from semantic vector models

This script generates a list of potential conlang word forms, along with associated possible glosses, based on a word-shape template and a word2vec-style semantic vector model. The process works something like this (a rough code sketch follows the list):

1. Acquire a word2vec-style semantic vector model (in either word2vec binary format or text format).
2. Define a word-shape template.
3. Figure out how many word forms can be made from that template, and group the model's vectors into that many clusters.
4. Use Principal Component Analysis to project the vector model down to the same number of dimensions as there are slots in your template.
5. Match the new model dimensions to slots based on how many phonemes can go in a slot vs. the variance in a given dimension (a large phoneme inventory pairs with a large variance), and then discretize those dimensions into the appropriate number of buckets.
6. Use the buckets that each cluster-centroid vector falls into to select phonemes for each template slot and generate new conlang words, along with a list of all of the model words whose vectors ended up in that cluster.
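The core of that process takes only a few library calls. The sketch below is purely illustrative, not the actual script: it assumes `wv` is a gensim `KeyedVectors` model you have already loaded, uses a made-up two-slot CV template, and skips the sampling, slot-to-dimension matching, and cluster-membership bookkeeping that `vec2word.py` (included below) actually performs.

```python
# Illustrative sketch of steps 3-6 only; `wv` is assumed to be a loaded
# gensim KeyedVectors model, and the two-slot template is made up.
import numpy as np
from itertools import product
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

slots = [["t", "k", "p"], ["a", "i", "u"]]   # toy CV template
K = int(np.prod([len(s) for s in slots]))    # 9 possible word forms

# Step 3: one cluster per possible word form
centers = KMeans(n_clusters=K).fit(wv.vectors).cluster_centers_

# Step 4: one PCA dimension per template slot
coords = PCA(n_components=len(slots)).fit_transform(centers)

# Steps 5-6, greatly simplified: order the centroids by their projected
# coordinates and pair them off with the Cartesian product of slot phonemes.
for form, idx in zip(product(*slots), np.lexsort(coords.T)):
    gloss, _ = wv.similar_by_vector(centers[idx], topn=1)[0]
    print("".join(form), "~", gloss)
```

The real script additionally matches each PCA dimension to a template slot by comparing its variance against the slot's phoneme count, and it prints every word in a cluster rather than just the one nearest the centroid.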
This results in word forms in which each phoneme represents a category in some semantic classification scheme, rather like a traditional philosophical language--except that the categories are not always the obviously-sensible, human-defined categories you might find in a thesaurus, but weird collections of whatever happens to project into similar places in low-dimensional space. Getting reasonable definitions for your new words will still require some work: selecting among the various options provided to you, or making up a new one in a similar semantic space--whatever you decide that means. Ideally, this should result in a lexicon with lots of discoverable sound-symbolism, but very little obvious regular morphology. The various source words in each cluster are output in order of their distance from the most central prototype of the cluster--which usually means being ordered from most generic to most specific, but not always.

You could also decide that, rather than generating complete words, you just want to generate, e.g., individual syllables, which could then be compounded together to produce words with more specific meanings--essentially simulating the process by which Chinese produced lots of homophones (single phonetic forms with wildly varying ambiguous meanings) and then used compounding to re-disambiguate the lexicon.

Or generate triliteral consonant roots, whose semantics will be narrowed down by intercalated vowel patterns.

Or something else entirely! Play around, experiment, have fun!

# Example use

`python vec2word.py model.bin "t,d,n,k,g,q,p,b,m" "i,u,e" "t,n,k,q,p,m" > syllables.txt`

This uses the `model.bin` model to produce "words" on a CVC template and saves the results in `syllables.txt`. For longer templates, just add more command-line arguments, each consisting of a comma-separated list of the phonemes/graphemes that are allowed in that slot.

Most models are really big, which means the clusters are *really big*, which can be overwhelming. So, if you put a number greater than zero between the model file name and the first slot definition, the script will output only that many of the most central members of each cluster.
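For example (the model file and phoneme inventories here are just placeholders), asking for only the 20 most central words in each cluster would look like this:

`python vec2word.py model.bin 20 "t,d,n,k,g,q,p,b,m" "i,u,e" "t,n,k,q,p,m" > syllables.txt`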
Many pre-built word2vec models suitable for use with this script can be downloaded from the [NLPL Word Vectors Repository](http://vectors.nlpl.eu/repository/).

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
gensim==4.0.1
scikit-learn==1.0.1
sklearn==0.0

--------------------------------------------------------------------------------
/vec-rot/README.md:
--------------------------------------------------------------------------------
# Discover fuzzy relations... of some sort

`python vec-rot.py {model file} > output.txt`

The program expects its output to be redirected to a file, and prints diagnostic info to stderr.

This is an accidental side-product of trying to automatically extract analogy sets. It completely fails at that, but the first time I tested this general approach (take a random difference vector, apply it to a new base after rotating it from its original position to the new base position, and then see whether wherever it takes you lets you get back again; the idea was to come up with some way of judging whether a relationship is "real" other than an arbitrary similarity cut-off), it seemed to spit out hyponym sets. In fact, it's a lot messier than that--it seems to extract sets of words that all have a similar relationship *of some sort* with a common target word, of which hyponymy is one possibility, and I am not sure exactly what it is doing. I thought I had it figured out, until I realized that I had a less-than sign accidentally exchanged for a greater-than sign... and "fixing" that ended up producing garbage!

So, I have tried to make it do whatever it is that it does reasonably quickly, and cleaned it up for other people to look at. Maybe it will be useful, or at least interpretable, to someone else.

--------------------------------------------------------------------------------
/vec-rot/vec-rot.py:
--------------------------------------------------------------------------------
import sys
import numpy as np
from gensim.models import KeyedVectors

# Rotate v in the plane of x and y through the angle between x and y
def rot_plane(v, x, y):
    vx = np.dot(v, x)
    vy = np.dot(v, y)
    xcomp = -vx - vy
    ycomp = vx - vy
    return v + x * xcomp + y * ycomp

# Printable representation of a possibly-None vocabulary key
def s(w):
    return "None" if w is None else repr(w.encode("utf-8", errors='replace'))[2:-1]

def diffs(keys):
    # Maybe this could be replaced with just a random vector generator?
    # But then why does the rotation matter? I dunno!
    for i, w1 in enumerate(keys):
        if w1 is None: continue
        v = wv.vectors[i]
        for j in range(i+1, len(keys)):
            w2 = keys[j]
            if w2 is None: continue
            yield (i, j, v, wv.vectors[j])

used = set()
def test_relation(i, j, v1, v2):
    d = v2 - v1
    n = np.linalg.norm(d)
    print("d", n)
    print("d", n, file=sys.stderr)
    if n > 1.1: return  # this may be a model-specific parameter
    for k, w1 in enumerate(keys):
        if w1 is None or k == i or k == j: continue
        v3 = wv.vectors[k]
        r = rot_plane(d, v1, v3)
        (w2, e) = wv.similar_by_vector(v3 + r, topn=1)[0]
        # This e > 0.75 check feels backwards, but it seems to be critical.
        # Why does filtering out results that are "too good" actually make the results better?
        # I haven't a single clue!
        if w2 is None or e > 0.75 or w2 == w1 or (w1, w2) in used: continue
        (w3, _) = wv.similar_by_vector(wv[w2] - r, topn=1)[0]
        if w3 == w1:
            used.add((w1, w2))
            yield (w1, w2, e)

model_file = sys.argv[1]
wv = KeyedVectors.load_word2vec_format(model_file, binary=(model_file.endswith('.bin')))

keys = wv.index_to_key

# just testing diffs between unique pairs, ignoring sign
count = float(len(keys) * (len(keys) - 1) / 2)

for p, (i, j, v1, v2) in enumerate(diffs(keys)):
    print(p, round(100 * p / count), "%", file=sys.stderr)

    s1 = s(keys[i])
    s2 = s(keys[j])
    print(s2, ">", s1, file=sys.stderr)

    rel = list(test_relation(i, j, v2, v1))
    if len(rel) > 0:
        print(s2, ">", s1)
        for w1, w2, d in rel:
            s1 = s(w1)
            s2 = s(w2)
            print(s1, s2, d)
            print(s1, s2, d, file=sys.stderr)

--------------------------------------------------------------------------------
/vec2word.py:
--------------------------------------------------------------------------------
import sys
import random
import operator
import numpy as np
from functools import reduce
from itertools import product
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Pick k distinct random indices from range(n) via a partial Fisher-Yates
# shuffle over a sparse "virtual" array
def select(k, n):
    value = list(range(k))
    rest = dict()
    for i in range(k):
        j = random.randrange(i, n)
        if j < k:
            # Both elements to be swapped are in the selection
            value[i], value[j] = value[j], value[i]
        elif j in rest:
            # Element j has been swapped before
            value[i], rest[j] = rest[j], value[i]
        else:
            # The value at j is still j, we now add it to the virtual array
            rest[j] = value[i]
            value[i] = j
    return value

cluster_size = 0
try:
    cluster_size = int(sys.argv[2])
except:
    pass

segments = [a.split(',') for a in sys.argv[(2 if cluster_size == 0 else 3):]]
dims = [len(a) for a in segments]
D = len(dims)
K = reduce(lambda a, b: a * b, dims)

d_order = np.lexsort((dims,))

model_file = sys.argv[1]
wv = KeyedVectors.load_word2vec_format(model_file, binary=(model_file.endswith('.bin')))

sample_count = K * 50
vector_count = len(wv.vectors)
if sample_count > vector_count:
    sample_count = vector_count

print("Used", sample_count, "of", vector_count, "vectors for clustering")
samples = [wv.vectors[i, :] for i in select(sample_count, vector_count)]

km = KMeans(n_clusters=K)

if cluster_size == 0:
    labels2words = [list() for _ in range(K)]
    labels = km.fit_predict(wv.vectors)
    centers = km.cluster_centers_
    for i, l in enumerate(labels):
        w = wv.index_to_key[i]
        if w is not None:
            labels2words[l].append(w)
else:
    labels2words = []
    km.fit(wv.vectors)
    centers = km.cluster_centers_
    for center in centers:
        cluster = [r[0] for r in wv.most_similar(positive=[center], topn=(cluster_size + 1)) if r[0] is not None]
        if len(cluster) > cluster_size: cluster.pop()
        labels2words.append(cluster)

pca = PCA(n_components=D)
result = pca.fit_transform(centers)
r_order = np.lexsort((result[:, d_order]).T)

print("Forms:", K)

for i, emes in enumerate(product(*segments)):
    center = centers[r_order[i], :]
    cluster = labels2words[r_order[i]]
    if cluster_size == 0:
        distances = wv.distances(center, cluster)
        cluster = [k for k, _ in sorted(zip(cluster, distances), key=operator.itemgetter(1))]

    word = cluster[0]
    print(
        "".join(emes), ':',
        repr(word.encode("utf-8", errors='replace'))[2:-1],
        len(cluster)
    )
    for w in cluster[1:]:
        print('\t', repr(w.encode("utf-8", errors='replace'))[2:-1])
    print('==========')

--------------------------------------------------------------------------------