├── .gitignore ├── README.md ├── _src ├── ngrams3.db ├── requirements.txt ├── setup.cfg └── wordgen.py ├── _tests └── test_one.json └── kappa.yml /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # kappa cache directories 6 | .kappa/ 7 | 8 | # zip files 9 | *.zip 10 | 11 | # Python files 12 | *.pyc 13 | *.egg-info 14 | *.dist-info 15 | 16 | # Ignore kappa generated config file 17 | **/_src/config.json 18 | 19 | # Pymodules dists 20 | pymodules/**/dist 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wordgen 2 | ======= 3 | 4 | **Wordgen** generates English-like words to use for product and company names. 5 | 6 | This is what happens when your product manager is an engineer. :) 7 | 8 | It's designed to be used with AWS Lambda, and we highly recommend deploying with [kappa](https://github.com/garnaat/kappa). 
# NOTE: this module (/_src/wordgen.py) comes from a flattened repository dump.
# The non-code text that shared its lines in the dump is preserved here:
#   README.md (last line):
#     The program was used to come up with the name
#     [Yeobot](https://cloudnative.io/yeobot)
#   /_src/ngrams3.db:
#     https://raw.githubusercontent.com/jedberg/wordgen/f078b0ad9d3c9d4b271cbee2e8bf73f0ae760703/_src/ngrams3.db
#   /_src/requirements.txt:
#     https://raw.githubusercontent.com/jedberg/wordgen/f078b0ad9d3c9d4b271cbee2e8bf73f0ae760703/_src/requirements.txt
#   /_src/setup.cfg:
#     [install]
#     prefix=
"""Generate English-like words from positional trigram statistics.

The code is written to run unchanged on both Python 2.7 and Python 3.
"""
import sqlite3
import random
import math
import logging

from bisect import bisect

LOG = logging.getLogger()
LOG.setLevel(logging.DEBUG)


class ngram(object):
    """Thin wrapper around the sqlite table ``ngrams3``.

    The queries below reference a text column ``ngram`` and numeric columns
    named ``pos<N>``; the exact schema and the meaning of each ``pos<N>``
    count live in the database file, not in this module -- TODO confirm
    against ngrams3.db.
    """

    def __init__(self, db_path='ngrams3.db'):
        """Open the trigram database.

        ``db_path`` defaults to the original hard-coded ``'ngrams3.db'``
        (resolved against the current working directory).
        """
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()

    def close(self):
        """Release the underlying sqlite connection."""
        self.conn.close()

    def problist(self, pos=1, context="", reverse=False):
        """Return ``[(letter, weight), ...]`` candidates for one position.

        pos      -- 1-based position; selects the ``pos<N>`` column to sum.
        context  -- "" (no context), one or two known neighbouring letters,
                    or a string containing ``%`` which is used verbatim as
                    the SQL LIKE pattern (the "infix" lookup).
        reverse  -- when true, the context letters are matched at the end
                    of the trigram instead of the start.

        Weights are each letter's share of the summed counts, scaled to
        100 and rounded up.  An empty list means no trigram matched.
        """
        #TODO: Error check the context to make sure it's 3 letters and only uses a % as a wildcard
        #TODO: Error check that the position isn't greater than 52
        #TODO: Error check that context and position make sense (ie. no context with position 1 if not being reversed

        whichletter = 3
        letter_position = "pos%d" % pos
        where_clause = "%%%"

        if len(context) == 2:
            if reverse:
                where_clause = "%" + context
                whichletter = 1
            else:
                where_clause = context + "%"

        if len(context) == 1:
            whichletter = 2
            if reverse:
                where_clause = "%%" + context
            else:
                where_clause = context + "%%"

        if "%" in context:
            # Infix lookup: the caller supplies the full LIKE pattern and we
            # read the middle letter, counted one position earlier.
            whichletter = 2
            letter_position = "pos%d" % (pos - 1)
            where_clause = "%s" % context

        # Only the column name and integer offsets are interpolated (both
        # come from %d formatting); the LIKE pattern is a bound parameter.
        if context == "":
            # No context? Then we return a list of possible letters for that position
            whichletter = 1
            query = ("SELECT SUBSTR(ngram, %d, %d) as choices, SUM(%s) as vals "
                     "from ngrams3 where %s > 0 group by choices" %
                     (whichletter, 1, letter_position, letter_position))
            rows = self.c.execute(query).fetchall()
        else:
            query = ("SELECT SUBSTR(ngram, %d, %d) as choices, SUM(%s) as vals "
                     "from ngrams3 where (ngram like ?) AND %s > 0 group by choices" %
                     (whichletter, 1, letter_position, letter_position))
            rows = self.c.execute(query, (where_clause,)).fetchall()

        # normalize values for probabilities
        # this will come out to around 100, but may be higher due to rounding
        # it's here so when we generate a list for selecting a letter the list isn't too big
        # Also it gives the less likely letters a slightly better chance, which
        # gives a better chance of not generating a real word.
        total = sum(count for _, count in rows)
        return [(letter.lower(), math.ceil(float(count) / float(total) * 100))
                for letter, count in rows]


def weighted_choice(choices):
    """Pick one value from ``[(value, weight), ...]`` proportionally to weight.

    ``choices`` must be non-empty (unpacking an empty list raises
    ``ValueError``).
    """
    values, weights = zip(*choices)
    total = 0
    cum_weights = []
    for w in weights:
        total += w
        cum_weights.append(total)
    # x is in [0, total), so bisect always lands on a valid index.
    x = random.random() * total
    return values[bisect(cum_weights, x)]


def _blank_word(length, prefix, suffix):
    """Return ``(blank_count, letters)`` with ``%`` marking unfilled slots."""
    blanks = length - len(prefix) - len(suffix)
    return blanks, list(prefix) + list("%" * blanks) + list(suffix)


def genword(length=6, prefix="", suffix="", ngrams=None):
    """Generate one word of ``length`` letters.

    length  -- total word length, including ``prefix`` and ``suffix``.
    prefix  -- fixed leading letters.
    suffix  -- fixed trailing letters.  When a suffix is given without a
               prefix the blanks are filled right-to-left; otherwise
               left-to-right.
    ngrams  -- optional already-open ``ngram`` instance to reuse.  When
               omitted, one is opened for this call and closed afterwards.

    Returns the generated word as a string.

    NOTE: there is no retry cap.  Whenever no candidate letter is found the
    word is restarted, so a prefix/suffix with no continuation in the table
    loops until the caller's timeout.
    """
    owns_table = ngrams is None
    n = ngram() if owns_table else ngrams
    backward = bool(suffix) and prefix == ""
    try:
        blanks, result = _blank_word(length, prefix, suffix)
        while "%" in result:
            if backward:
                position = ''.join(result).rfind("%")
                context = ''.join(result[position + 1:position + 3])
            else:
                position = result.index("%")
                context = ''.join(result[max(0, position - 2):position])

            if blanks == 1 and position < length and position != 0:
                # Last blank: also consult the letters on both sides of it.
                infix_context = (result[position - 1] + "%" +
                                 result[min(position + 1, length - 1)])
                choice_list = (
                    n.problist(pos=position + 1, context=context) +
                    n.problist(pos=position + 1, context=infix_context))
            else:
                choice_list = n.problist(pos=position + 1, context=context,
                                         reverse=backward)

            if not choice_list:
                # Went down a bad path, start over
                LOG.debug("Whoops")
                blanks, result = _blank_word(length, prefix, suffix)
                # Skip the decrement below: nothing was filled this round,
                # and an off-by-one count would trigger the infix lookup
                # one letter too early.
                continue

            result[position] = weighted_choice(choice_list)
            blanks -= 1
        return ''.join(result)
    finally:
        if owns_table:
            n.close()


def handler(event, context):
    """AWS Lambda entry point.

    ``event`` keys:
      words  -- required; number of words to generate (int or numeric str).
      prefix -- optional fixed leading letters (default "").
      suffix -- optional fixed trailing letters (default "").
      length -- optional word length (default 8).

    Returns a dict whose keys are the title-cased generated words and whose
    values are empty strings.  Duplicate words collapse into one key, so
    fewer than ``words`` entries may be returned.  ``context`` is unused.
    """
    length = int(event.get('length', 8))
    prefix = event.get('prefix', "")
    suffix = event.get('suffix', "")
    ret = {}
    # One connection for the whole invocation instead of one per word.
    table = ngram()
    try:
        for _ in range(int(event['words'])):
            w = genword(length=length, prefix=prefix, suffix=suffix,
                        ngrams=table)
            ret[w.lower().title()] = ""
    finally:
        table.close()
    return ret
-------------------------------------------------------------------------------- /_tests/test_one.json: -------------------------------------------------------------------------------- 1 | { 2 | "words": "20", 3 | "prefix": "ops" 4 | } 5 | -------------------------------------------------------------------------------- /kappa.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: wordgen 3 | environments: 4 | dev: 5 | profile: jedberg 6 | region: us-west-2 7 | policy: 8 | resources: 9 | - arn: arn:aws:logs:*:*:* 10 | actions: 11 | - "*" 12 | lambda: 13 | description: An "english" word generator 14 | handler: wordgen.handler 15 | runtime: python2.7 16 | memory_size: 128 17 | timeout: 30 18 | 19 | --------------------------------------------------------------------------------