├── .gitignore
├── README.md
├── recruit_bot.py
├── recruit_bot
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── data.cpython-35.pyc
│   │   ├── inference.cpython-35.pyc
│   │   └── module.cpython-35.pyc
│   ├── data.py
│   ├── inference.py
│   └── module.py
└── title_crawler.py

/.gitignore:
--------------------------------------------------------------------------------
/vocab.p
/model/
/title.txt
/.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RecruitBot

## Requirements

RecruitBot requires Python 3.5+ and the following Python libraries. Note that the code relies on `tf.contrib` and `tf.enable_eager_execution()`, so it needs a TensorFlow 1.x release rather than TensorFlow 2.

```bash
pip install webcollector
pip install flask
pip install beautifulsoup4

# CPU
pip install tensorflow

# GPU
pip install tensorflow-gpu
```

## Run

__recruit_bot.py__ is used for both training and serving.

### Training

To train, set `training = True` in __recruit_bot.py__ and run it. Training reads job titles from `title.txt` (produced by __title_crawler.py__) and saves checkpoints under `model/`.

### Serving

To serve, set `training = False` in __recruit_bot.py__ and run it.

After that, you can visit [http://127.0.0.1:5002](http://127.0.0.1:5002) to generate job-post titles.
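
The root route of the server returns the generated titles as newline-separated plain text, so you can also fetch them programmatically. Below is a minimal sketch using only the standard library; the URL assumes the default host and port hard-coded in __recruit_bot.py__.

```python
from urllib.request import urlopen

# The "/" route of the Flask app returns generated titles separated by "\n".
with urlopen("http://127.0.0.1:5002/") as response:
    titles = response.read().decode("utf-8").splitlines()

for title in titles:
    print(title)
```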
--------------------------------------------------------------------------------
/recruit_bot.py:
--------------------------------------------------------------------------------
# coding=utf-8
import tensorflow as tf

from recruit_bot.data import build_vocab, preprocess_all, get_start_and_end_indices
from recruit_bot.inference import predict

tf.enable_eager_execution()
import tensorflow.contrib.learn as learn
import os
from recruit_bot.module import RecruitBot
import numpy as np
from tqdm import tqdm

# Switch between training (True) and serving (False); see the README.
training = False

if training:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Hyperparameters and paths.
max_len = 50
embedding_size = 150
drop_rate = 0.3
batch_size = 50
vocab_path = "vocab.p"
model_path = "model/model"
model_dir = os.path.dirname(model_path)
checkpoint_path = tf.train.latest_checkpoint(model_dir)

if training:
    # Build (or restore) the vocabulary from the crawled titles.
    with open("title.txt", "r", encoding="utf-8") as f:
        text_list = [line.strip() for line in f.readlines()]
    words_list = preprocess_all(text_list)

    if checkpoint_path is None:
        vp = build_vocab(words_list, max_len, vocab_path)
    else:
        vp = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

    indices_list = np.array(list(vp.transform(words_list)), dtype=np.int32)
else:
    vp = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

vocab_size = len(vp.vocabulary_)
recruit_bot = RecruitBot(vocab_size, embedding_size, drop_rate)

start_index, end_index = get_start_and_end_indices(vp)

optimizer = tf.train.AdamOptimizer(learning_rate=5e-3)
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    model=recruit_bot,
    global_step=tf.train.get_or_create_global_step()
)

if training:

    if checkpoint_path is not None:
        print("restore")
        checkpoint.restore(checkpoint_path)

    for epoch in range(1000):
        for step, batch_data in tqdm(enumerate(
                tf.data.Dataset.from_tensor_slices(indices_list).shuffle(1000).batch(batch_size))):
            # Next-character objective: the input is each sequence without its
            # last token, the target is the same sequence shifted left by one.
            batch_input = batch_data[:, :-1]
            batch_output = batch_data[:, 1:]

            with tf.GradientTape() as tape:
                logits = recruit_bot(batch_input, training=True)
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits,
                    labels=tf.one_hot(batch_output, depth=vocab_size)
                )

            watched_vars = tape.watched_variables()
            grads = tape.gradient(losses, watched_vars)
            optimizer.apply_gradients(zip(grads, watched_vars),
                                      global_step=tf.train.get_or_create_global_step())

            if step == 0 and epoch % 5 == 0:
                # Every 5 epochs: print a sample title and save a checkpoint.
                words = recruit_bot.predict_words(vp, max_len)[0]
                print("".join(words))

                mean_loss = tf.reduce_mean(losses)
                checkpoint.save(model_path)
                print(epoch, step, mean_loss)
else:
    checkpoint.restore(tf.train.latest_checkpoint(model_dir))
    words = recruit_bot.predict_words(vp, max_len)[0]
    print("".join(words))

from flask import Flask

app = Flask('aop')


@app.route("/")
def index():
    words_list = recruit_bot.predict_words(vp, max_len)
    texts = ["".join(words) for words in words_list]
    return "\n".join(texts)


app.run(host='0.0.0.0', port=5002)
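
The training loop above implements a plain next-character language model: each padded index sequence is shifted by one position, so the model is asked to predict, at every position of the input, the character that follows it. A toy sketch of that shift with made-up indices (the real ones come from the `VocabularyProcessor`):

```python
import numpy as np

# Two made-up padded titles: <start> c1 c2 c3 <end> <pad> ...
batch_data = np.array([
    [2, 7, 5, 9, 3, 0],
    [2, 4, 8, 3, 0, 0],
])

batch_input = batch_data[:, :-1]   # everything except the last position
batch_output = batch_data[:, 1:]   # the same sequences shifted left by one

for inp, out in zip(batch_input, batch_output):
    print(inp, "->", out)          # position i of inp is trained to predict out[i]
```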
--------------------------------------------------------------------------------
/recruit_bot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__init__.py
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/data.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/data.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/inference.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/inference.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/module.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/module.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/data.py:
--------------------------------------------------------------------------------
# coding=utf-8
from tensorflow.contrib import learn
import numpy as np

# Sentinel tokens marking the start and end of every title. The literal strings
# did not survive in this copy of the file; "<s>" and "</s>" are assumed
# stand-ins for the original start/end markers.
START_TOKEN = "<s>"
END_TOKEN = "</s>"


def preprocess(text):
    # Character-level tokenization: a title becomes a list of characters
    # wrapped in the start/end sentinels.
    return [START_TOKEN] + list(text) + [END_TOKEN]


def preprocess_all(text_list):
    return [preprocess(text) for text in text_list]


def tokenizer_fn(s):
    # Each document is already a list of characters, so no further splitting.
    return s


def build_vocab(words_list, max_len, vocab_path):
    vp = learn.preprocessing.VocabularyProcessor(
        max_document_length=max_len, tokenizer_fn=tokenizer_fn, min_frequency=10)
    vp.fit(words_list)
    vp.save(vocab_path)
    return vp


def get_start_and_end_indices(vp):
    return vp.vocabulary_.get(START_TOKEN), vp.vocabulary_.get(END_TOKEN)
--------------------------------------------------------------------------------
/recruit_bot/inference.py:
--------------------------------------------------------------------------------
# coding=utf-8

import tensorflow as tf
tf.enable_eager_execution()
from recruit_bot.data import get_start_and_end_indices
from tqdm import tqdm

import numpy as np


def predict(recruit_bot, vp, max_len):
    # Sample one title character by character, feeding the growing prefix
    # back into the model at every step.
    start_index, end_index = get_start_and_end_indices(vp)
    vocab_size = len(vp.vocabulary_)

    prefixes = [[start_index]]
    words = []

    for _ in tqdm(range(max_len)):
        inputs = tf.constant(np.array(prefixes), dtype=tf.int32)
        logits = recruit_bot(inputs)[0, -1]
        probs = tf.nn.softmax(logits, axis=-1).numpy()
        # Resample whenever the padding/unknown id (0) is drawn.
        while True:
            random_index = np.random.choice(vocab_size, 1, p=probs)[0]
            if random_index != 0:
                break

        prefixes[0].append(random_index)

        if random_index == end_index:
            break
        else:
            words.append(vp.vocabulary_.reverse(random_index))

    return words
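
A note on index 0: the `VocabularyProcessor` built in data.py pads or truncates every title to `max_len` and maps characters it never saw during fitting (or that fall below `min_frequency`) to id 0, which is why both inference.py and module.py resample whenever they draw 0. A small sketch of that behaviour, assuming a TensorFlow 1.x install where `tf.contrib` is still available (toy corpus; `min_frequency` is left at its default of 0 so the tiny example is not trimmed away):

```python
from tensorflow.contrib import learn
import numpy as np


def tokenizer_fn(s):
    return s  # documents are already lists of characters


vp = learn.preprocessing.VocabularyProcessor(
    max_document_length=8, tokenizer_fn=tokenizer_fn)
vp.fit([list("ruby dev"), list("rails dev")])

# Characters unseen during fit() ('g', 'o') map to 0, and trailing positions
# are zero-padded up to max_document_length.
indices = np.array(list(vp.transform([list("go dev")])))
print(indices)
print(len(vp.vocabulary_))  # vocabulary size, including the reserved 0 id
```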
--------------------------------------------------------------------------------
/recruit_bot/module.py:
--------------------------------------------------------------------------------
# coding=utf-8

import tensorflow as tf
import math

from recruit_bot.data import get_start_and_end_indices
from tqdm import tqdm
tf.enable_eager_execution()

from tensorflow.python import keras
import numpy as np


class RecruitBot(keras.Model):
    def __init__(self, vocab_size, embedding_size, drop_rate, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # Row 0 of the embedding table is reserved for the padding/unknown id
        # and stays a fixed zero vector; only the remaining rows are trained.
        self.partial_embeddings = tf.Variable(
            tf.truncated_normal([vocab_size - 1, embedding_size], stddev=1 / math.sqrt(embedding_size)))
        self.lstm = keras.layers.LSTM(embedding_size, return_sequences=True)
        self.dropout_layer = keras.layers.Dropout(drop_rate)
        self.dense_layer = keras.layers.Dense(vocab_size)

    def call(self, inputs, training=None, mask=None):
        embeddings = tf.concat([
            tf.zeros([1, self.embedding_size], dtype=tf.float32),
            self.partial_embeddings
        ], axis=0)

        embedded = tf.nn.embedding_lookup(embeddings, inputs)

        lstm_output = self.lstm(embedded, initial_state=self.create_initial_state(inputs))
        dropped_lstm_output = self.dropout_layer(lstm_output, training=training)
        logits = self.dense_layer(dropped_lstm_output)
        return logits

    def create_initial_state(self, inputs):
        if isinstance(inputs, list):
            batch_size = len(inputs)
        else:
            batch_size = inputs.shape[0]
        states = [
            tf.zeros([batch_size, self.embedding_size], dtype=tf.float32),
            tf.zeros([batch_size, self.embedding_size], dtype=tf.float32)
        ]
        return states

    def predict_words(self, vp, max_len):
        # Generate titles by stepping the LSTM cell one character at a time
        # and sampling from the softmax distribution at each step.
        vocab_size = len(vp.vocabulary_)
        start_index, end_index = get_start_and_end_indices(vp)

        current_inputs = [start_index]

        embeddings = tf.concat([
            tf.zeros([1, self.embedding_size], dtype=tf.float32),
            self.partial_embeddings
        ], axis=0)

        h, c = self.create_initial_state(current_inputs)
        outputs_list = []

        for i in tqdm(range(max_len - 1)):
            current_inputs = tf.constant(current_inputs, dtype=tf.int32)
            embedded = tf.nn.embedding_lookup(embeddings, current_inputs)
            _, [h, c] = self.lstm.cell(embedded, [h, c])
            logits = self.dense_layer(h)

            probs = tf.nn.softmax(logits).numpy()
            outputs = []
            for prob in probs:
                # Retry a few times to avoid sampling the padding/unknown id 0.
                for _ in range(5):
                    random_index = np.random.choice(vocab_size, 1, p=prob)[0]
                    if random_index != 0:
                        break
                outputs.append(random_index)
            outputs_list.append(outputs)
            current_inputs = outputs

        # outputs_list has shape [max_len - 1, batch]; stack to [batch, max_len - 1].
        indices_list = np.stack(outputs_list, axis=1)
        words_list = []
        for indices in indices_list:
            words = []
            for index in indices:
                if index in [0, start_index]:
                    continue
                if index == end_index:
                    break
                else:
                    words.append(vp.vocabulary_.reverse(index))
            words_list.append(words)
        return words_list
--------------------------------------------------------------------------------
/title_crawler.py:
--------------------------------------------------------------------------------
# coding=utf-8

import webcollector as wc
from bs4 import BeautifulSoup

f = open("title.txt", "w", encoding="utf-8")


class RubyCrawler(wc.RamCrawler):
    def __init__(self, **kwargs):
        super().__init__(auto_detect=False, **kwargs)
        self.num_threads = 10
        # Seed the crawler with the paginated job listings on ruby-china.org.
        self.add_seeds(["https://ruby-china.org/jobs?page={}".format(i) for i in range(1, 151)])

    def visit(self, page, detected):
        soup = BeautifulSoup(page.content, "html.parser")

        # Each job entry's title is the title attribute of the link inside
        # <div class="title media-heading">.
        for a in soup.select("div.title.media-heading > a[title]"):
            title = a["title"].strip()
            f.write("{}\n".format(title))


crawler = RubyCrawler()
crawler.start(10)
f.close()
--------------------------------------------------------------------------------
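
For reference, the crawler's extraction step hinges on a single CSS selector. A small offline sketch of what `div.title.media-heading > a[title]` matches, using made-up markup shaped like a ruby-china.org job listing (no network access needed):

```python
from bs4 import BeautifulSoup

# Made-up HTML in the shape of a ruby-china.org jobs page entry.
html = """
<div class="title media-heading">
  <a title="Ruby Backend Engineer (Remote)" href="/topics/1">Ruby Backend Engineer (Remote)</a>
</div>
<div class="title media-heading">
  <a title="Rails Full-Stack Engineer" href="/topics/2">Rails Full-Stack Engineer</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
for a in soup.select("div.title.media-heading > a[title]"):
    print(a["title"].strip())
```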