"""Scrape Google result counts for "<verb> from X to Y" phrases and rank items.

Usage::

    python analyze.py inputs/<name>.json

You probably need to run it through a proxy, e.g.
``HTTP_PROXY=xyz python analyze.py inputs/<name>.json``.

The input JSON file must provide three keys: ``tag`` (used in axis labels and
output file names), ``items`` (the things being compared) and ``verbs`` (the
verbs substituted into the search phrases).

Result counts are appended to ``<input file>.cache.csv`` (one tab-separated
``query<TAB>count`` pair per line) so an interrupted run can be resumed.
Two images are written to the working directory: ``<tag>_matrix.png`` (raw
counts, items in lexicographical order) and ``<tag>_matrix_eig.png``
(column-normalised matrix, items ordered by the power-iteration score).
"""
import itertools
import json
import os
import random
import re
import sys
import time

import bs4
import matplotlib
import numpy
import requests
from matplotlib import pyplot


input_fn = sys.argv[1]
cache_fn = input_fn + '.cache.csv'

# Maps the exact query string to the result count previously fetched for it.
cache = {}
if os.path.exists(cache_fn):
    with open(cache_fn) as cache_file:
        for line in cache_file:
            q, sep, n = line.strip().rpartition('\t')
            if not sep:
                # Blank or truncated line (e.g. from an interrupted write):
                # skip it so the query is simply fetched again.
                continue
            cache[q] = int(n)


def get_n_results_dumb(q):
    """Return the number of results Google reports for the query string *q*.

    A 503 response is treated as rate limiting: the function sleeps and
    retries, starting at 60 seconds and tripling the delay up to a cap of
    1200 seconds. Any other non-200 status raises ``Exception``.

    The count is the first run of digits/commas found in the text of the
    ``<div id="resultStats">`` element. If that element exists but its text
    is empty, 0 is returned. If the element is missing, or its text holds no
    number, an ``Exception`` is raised rather than guessing, so that a bogus
    count never ends up in the cache.
    """
    sleep = 60
    while True:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get('http://www.google.com/search',
                         params={'q': q,
                                 "tbs": "li:1"},
                         timeout=60)
        if r.status_code == 200:
            break
        elif r.status_code == 503:
            print('%50s... got rate limited, sleeping %ds then retrying' % (q, sleep))
            time.sleep(sleep)
            sleep = min(3*sleep, 1200)
        else:
            raise Exception('Got HTTP status code %d' % r.status_code)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    stats = soup.find('div', {'id': 'resultStats'})
    if stats is None:
        raise Exception('No resultStats element in the response for %s' % q)
    s = stats.text
    if not s:
        return 0
    match = re.search(r'([0-9,]+)', s)
    if match is None:
        raise Exception('Could not find a result count in %r' % s)
    m = int(match.group(1).replace(',', ''))
    print('%50s... %d' % (q, m))
    return m


with open(input_fn) as input_file:
    data = json.load(input_file)
tag = data['tag']
items = data['items']
verbs = data['verbs']

item2i = {item: i for i, item in enumerate(items)}

# Each entry is (index of the "from" item, index of the "to" item, query).
qs = []
for item1, item2 in itertools.product(items, items):
    if item1 != item2:
        for verb in verbs:
            qs.append((item2i[item1], item2i[item2], '"%s from %s to %s"' % (verb, item1, item2)))
            qs.append((item2i[item1], item2i[item2], '"%s to %s from %s"' % (verb, item2, item1)))

m = numpy.zeros((len(items), len(items)))
random.shuffle(qs)
n_cached = len(set(cache).intersection(q for _, _, q in qs))
# With fewer than two items (or no verbs) there is nothing to fetch.
progress = float(n_cached) / len(qs) if qs else 1.
print('Progress so far: %5.2f%%' % (100. * progress))

for i, j, q in qs:
    if q in cache:
        n = cache[q]
    else:
        n = get_n_results_dumb(q)
        cache[q] = n
        # Append right away so progress survives a crash or rate-limit abort.
        with open(cache_fn, 'a') as cache_file:
            cache_file.write('%s\t%d\n' % (q, n))
    # m is indexed [to][from]: column i accumulates everything leaving item i.
    m[j][i] += n


def plot_mat(m, items, cm, fn, fmt, dir_text=None):
    """Draw the square matrix *m* as an annotated heat map and save it to *fn*.

    Parameters:
        m: square numpy array with one row/column per entry of *items*.
            ``m[i][j]`` is drawn at x position ``i`` and y position ``j``.
        items: tick labels, in the same order as the rows/columns of *m*.
        cm: matplotlib colormap used for the cells.
        fn: path of the image file to write (saved at 300 dpi).
        fmt: %-format applied to each cell value for its text label; cells
            whose label equals ``fmt % 0`` are left blank.
        dir_text: optional quantity name added to the axis labels as
            "Smaller ... Larger" direction hints.

    The axis labels also use the module-level ``tag``.
    """
    s = 4 + len(items) * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position('top')
    # Values are shifted by one so zero cells are valid on the log colour scale.
    ax.matshow(m.T + 1, cmap=cm, norm=matplotlib.colors.LogNorm(vmin=numpy.min(m+1), vmax=numpy.max(m+1)))

    if dir_text:
        ax.set_xlabel('To %s\n< Smaller %s %10s Larger %s >' % (tag, dir_text, '', dir_text))
        ax.set_ylabel('From %s\n< Larger %s %10s Smaller %s >' % (tag, dir_text, '', dir_text))
    else:
        ax.set_xlabel('To %s' % tag)
        ax.set_ylabel('From %s'% tag)
    ax.set_xticks(numpy.arange(0, len(items)))
    ax.set_yticks(numpy.arange(0, len(items)))
    ax.set_xticklabels(items, rotation=90, ha='center')
    ax.set_yticklabels(items, va='center')
    # Minor ticks sit on the cell borders so the grid separates the cells.
    ax.set_xticks(numpy.arange(0.5, len(items)+0.5), minor=True)
    ax.set_yticks(numpy.arange(0.5, len(items)+0.5), minor=True)
    ax.grid(which='minor')

    blank = fmt % 0
    for i in range(len(items)):
        for j in range(len(items)):
            text = fmt % m[i][j]
            if text != blank:
                ax.text(i, j, text, va='center', ha='center', size=7)

    fig.tight_layout()
    fig.savefig(fn, dpi=300)
    # Release the figure; otherwise every call keeps one alive in pyplot.
    pyplot.close(fig)


# Plot lexicographical
ps = sorted(range(len(items)), key=lambda i: items[i])
plot_mat(m[ps,:][:,ps], sorted(items), pyplot.cm.OrRd, '%s_matrix.png' % tag, '%.0f')

for item, pop in zip(items, m.sum(axis=0) + m.sum(axis=1)):
    print('%20s %6d' % (item, pop))

# Normalise every column to sum to one. A column with no recorded outflow
# would otherwise be divided 0/0 and turn the whole vector u into NaN, so
# such columns are left as all zeros.
col_sums = m.sum(axis=0)
col_sums[col_sums == 0] = 1.
m /= col_sums[numpy.newaxis,:]
u = numpy.ones(len(items))

# Power iteration: repeatedly apply m and rescale u to sum to one.
for _ in range(100):
    u = numpy.dot(m, u)
    total = u.sum()
    if total == 0:
        # No counts at all: keep the all-zero vector instead of dividing 0/0.
        break
    u /= total

# Create a new matrix where rows/columns are ordered by u
ps = sorted(range(len(items)), key=lambda i: u[i])
for p in reversed(ps):
    print('| %5.2f%% | %20s |' % (u[p]*100, items[p]))

m_new = m[ps,:][:,ps]
plot_mat(m_new, [items[p] for p in ps], pyplot.cm.BuGn, '%s_matrix_eig.png' % tag, '%.2f', dir_text='future popularity')
Louis", "Charlotte", "Orlando", "San Antonio", "Portland", "Pittsburgh", "Sacramento", "Las Vegas", "Cincinnati", "Kansas City", "Austin", "Columbus", "Cleveland", "Indianapolis", "San Jose", "Nashville", "Virginia Beach", "Providence", "Milwaukee", "Jacksonville", "Oklahoma City", "Memphis", "Raleigh", "Richmond", "Louisville", "New Orleans", "Hartford", "Salt Lake City", "Birmingham", "Buffalo"], 4 | "verbs": ["move"] 5 | } 6 | -------------------------------------------------------------------------------- /inputs/database.json: -------------------------------------------------------------------------------- 1 | { 2 | "tag": "database", 3 | "items": ["mysql", "postgres", "mongodb", "cassandra", "dynamodb", "mariadb", "riak", "redis"], 4 | "verbs": ["switch", "switched", "move", "moved"] 5 | } 6 | -------------------------------------------------------------------------------- /inputs/editors.json: -------------------------------------------------------------------------------- 1 | { 2 | "tag": "editors", 3 | "items": ["emacs", "vim", "notepad++", "sublime", "visual studio", "atom", "brackets", "nano", "ultraedit", "textmate", "text wrangler", "jedit"], 4 | "verbs": ["switch", "switched", "move", "moved"] 5 | } 6 | -------------------------------------------------------------------------------- /inputs/js_frameworks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tag": "js_frameworks", 3 | "items": ["react", "angular", "vue", "backbone", "ember", "knockout", "jquery"], 4 | "verbs": ["switch", "switched", "move", "moved"] 5 | } 6 | -------------------------------------------------------------------------------- /inputs/prog_lang.json: -------------------------------------------------------------------------------- 1 | { 2 | "tag": "prog_lang", 3 | "items": ["java", "c", "c++", "c#", "python", "visual basic", "node", "perl", "php", "ruby", "go", "swift", "objective c", "cobol", "fortran", "lua", "scala", "lisp", 
"haskell", "rust", "erlang", "clojure", "matlab", "pascal", "r", "kotlin"], 4 | "verbs": ["switch", "switched", "move", "moved", "code"] 5 | } 6 | -------------------------------------------------------------------------------- /js_framework_matrix_eig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/eigenstuff/79cf3fb4b54e59efac7c4630e22ae005d91f0556/js_framework_matrix_eig.png -------------------------------------------------------------------------------- /prog_lang_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/eigenstuff/79cf3fb4b54e59efac7c4630e22ae005d91f0556/prog_lang_matrix.png -------------------------------------------------------------------------------- /prog_lang_matrix_eig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikbern/eigenstuff/79cf3fb4b54e59efac7c4630e22ae005d91f0556/prog_lang_matrix_eig.png --------------------------------------------------------------------------------