├── .gitignore
├── README.md
├── _src
    ├── ngrams3.db
    ├── requirements.txt
    ├── setup.cfg
    └── wordgen.py
├── _tests
    └── test_one.json
└── kappa.yml


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | logs
 3 | *.log
 4 | 
 5 | # kappa cache directories
 6 | .kappa/
 7 | 
 8 | # zip files
 9 | *.zip
10 | 
11 | # Python files
12 | *.pyc
13 | *.egg-info
14 | *.dist-info
15 | 
16 | # Ignore kappa generated config file
17 | **/_src/config.json
18 | 
19 | # Pymodules dists
20 | pymodules/**/dist
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Wordgen
 2 | =======
 3 | 
 4 | **Wordgen** generates English-like words for use as product and company names.
 5 | 
 6 | This is what happens when your product manager is an engineer. :)
 7 | 
 8 | It's designed to be used with AWS Lambda, and we highly recommend deploying with [kappa](https://github.com/garnaat/kappa).
 9 | 
10 | The program was used to come up with the name [Yeobot](https://cloudnative.io/yeobot).
11 | 


--------------------------------------------------------------------------------
/_src/ngrams3.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jedberg/wordgen/f078b0ad9d3c9d4b271cbee2e8bf73f0ae760703/_src/ngrams3.db


--------------------------------------------------------------------------------
/_src/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jedberg/wordgen/f078b0ad9d3c9d4b271cbee2e8bf73f0ae760703/_src/requirements.txt


--------------------------------------------------------------------------------
/_src/setup.cfg:
--------------------------------------------------------------------------------
1 | [install]
2 | prefix=
3 | 


--------------------------------------------------------------------------------
/_src/wordgen.py:
--------------------------------------------------------------------------------
  1 | import sqlite3
  2 | import random
  3 | import math
  4 | import logging
  5 | 
  6 | from bisect import bisect
  7 | 
# Module-level handle on the root logger (getLogger() is called without a
# name). Setting DEBUG here therefore applies to every logger that does not
# set a level of its own.
LOG = logging.getLogger()
LOG.setLevel(logging.DEBUG)
 10 | 
 11 | class ngram(object):
 12 |     def __init__(self):
 13 |         self.conn = sqlite3.connect('ngrams3.db')
 14 |         self.c = self.conn.cursor()
 15 | 
 16 |     def problist(self, pos=1, context="", reverse=False):
 17 |         #TODO: Error check the context to make sure it's 3 letters and only uses a % as a wildcard
 18 |         #TODO: Error check that the position isn't greater than 52
 19 |         #TODO: Error check that context and position make sense (ie. no context with position 1 if not being reversed
 20 | 
 21 |         rows = []
 22 |         whichletter = 3
 23 |         letter_position = "pos%d" % pos
 24 |         where_clause = "%%%"
 25 | 
 26 |         if len(context) == 2:
 27 |             if reverse:
 28 |                 where_clause = "%%%s" % context
 29 |                 whichletter = 1
 30 |             else:
 31 |                 where_clause = "%s%%" % context
 32 | 
 33 |         if len(context) == 1:
 34 |             whichletter = 2
 35 |             if reverse:
 36 |                 where_clause = "%%%%%s" % context
 37 |             else:
 38 |                 where_clause = "%s%%%%" % context
 39 | 
 40 |         if "%" in context:
 41 |             whichletter = 2
 42 |             letter_position = "pos%d" % (pos - 1)
 43 |             where_clause = "%s" % context
 44 | 
 45 |         # No context? Then we return a list of possible letters for that position
 46 |         if context == "":
 47 |             whichletter = 1
 48 |             rows = [row for row in self.c.execute(
 49 |                 "SELECT SUBSTR(ngram, %d, %d) as choices, SUM(%s) as vals from ngrams3 where %s > 0 group by choices" %
 50 |                 (whichletter, 1, letter_position, letter_position))]
 51 | 
 52 |         else:
 53 |             rows = [row for row in self.c.execute(
 54 |                 "SELECT SUBSTR(ngram, %d, %d) as choices, SUM(%s) as vals from ngrams3 where (ngram like ?) AND %s > 0 group by choices" %
 55 |                 (whichletter, 1, letter_position, letter_position), (where_clause,))]
 56 | 
 57 |         # normalize values for probabilities
 58 |         # this will come out to around 100, but may be higher due to rounding
 59 |         # it's here so when we generate a list for selecting a letter the list isn't too big
 60 |         # Also it gives the less likely letters a slightly better chance, which
 61 |         # gives a better chance of not generating a real word.
 62 |         total = sum([v[1] for v in rows])
 63 |         return [(v[0].lower(), math.ceil((float(v[1])/float(total)*100))) for v in rows]
 64 | 
 65 | def weighted_choice(choices):
 66 |     values, weights = zip(*choices)
 67 |     total = 0
 68 |     cum_weights = []
 69 |     for w in weights:
 70 |         total += w
 71 |         cum_weights.append(total)
 72 |     x = random.random() * total
 73 |     i = bisect(cum_weights, x)
 74 |     return values[i]
 75 | 
 76 | def genword(length=6, prefix="", suffix=""):
 77 |     blanks = length - len(prefix) - len(suffix)
 78 |     result = list(prefix) + list("%" * blanks) + list(suffix)
 79 |     n = ngram()
 80 |     choice_list = []
 81 | 
 82 |     while "%" in result:
 83 |         position = result.index("%")
 84 |         context = ''.join(result[max(0,(position - 2)):max(0,(position))])
 85 |         if suffix and prefix == "":
 86 |             position = ''.join(result).rfind("%")
 87 |             context = ''.join(result[max(0,(position+1)):max(0,(position + 3))])
 88 |         if blanks == 1 and (position < length) and position != 0:
 89 |             infix_context = result[max(0,position-1)] + "%" + result[min(position+1,length - 1)]
 90 |             choice_list = n.problist(pos=(position + 1), context=context) + n.problist(pos=(position + 1), context=infix_context)
 91 |         else:
 92 |             if suffix and prefix == "":
 93 |                 choice_list = n.problist(pos=(position + 1), context=context, reverse=True)
 94 |             else:
 95 |                 choice_list = n.problist(pos=(position + 1), context=context)
 96 |         if len(choice_list) > 0:
 97 |             result[position] = weighted_choice(choice_list)
 98 |         else:
 99 |             # Went down a bad path, start over
100 |             print "Whoops"
101 |             blanks = length - len(prefix) - len(suffix)
102 |             result = list(prefix) + list("%" * blanks) + list(suffix)
103 | 
104 |         blanks = blanks - 1
105 |     return ''.join(result)
106 | 
def handler(event, context):
    """AWS Lambda entry point: generate a batch of 8-letter words.

    Args:
        event: Mapping with ``'words'`` (the number of words to generate,
            as an int or a numeric string) and optionally ``'prefix'``
            (letters every word must start with; defaults to no prefix).
        context: Lambda context object; unused.

    Returns:
        A dict whose keys are the generated words in title case and whose
        values are empty strings.  Duplicate words collapse into one key,
        so fewer than ``'words'`` entries may be returned.
    """
    prefix = event.get('prefix', "")
    ret = {}
    for _ in range(int(event['words'])):
        w = genword(length=8, prefix=prefix)
        ret[w.lower().title()] = ""
    return ret
114 | 
115 | 


--------------------------------------------------------------------------------
/_tests/test_one.json:
--------------------------------------------------------------------------------
1 | {
2 |     "words": "20",
3 |     "prefix": "ops"
4 | }
5 | 


--------------------------------------------------------------------------------
/kappa.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: wordgen
 3 | environments:
 4 |   dev:
 5 |     profile: jedberg
 6 |     region: us-west-2
 7 |     policy:
 8 |       resources:
 9 |         - arn: arn:aws:logs:*:*:*
10 |           actions:
11 |           - "*"
12 | lambda:    
13 |   description: An "english" word generator
14 |   handler: wordgen.handler
15 |   runtime: python2.7
16 |   memory_size: 128
17 |   timeout: 30
18 |   
19 | 


--------------------------------------------------------------------------------