├── .gitignore
├── README.md
├── recruit_bot.py
├── recruit_bot
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── data.cpython-35.pyc
│   │   ├── inference.cpython-35.pyc
│   │   └── module.cpython-35.pyc
│   ├── data.py
│   ├── inference.py
│   └── module.py
└── title_crawler.py

/.gitignore:
--------------------------------------------------------------------------------
/vocab.p
/model/
/title.txt
/.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RecruitBot

## Requirements

RecruitBot requires Python 3.5+ and the following Python libraries. Note that the code relies on `tf.contrib` and `tf.enable_eager_execution()`, so it needs a TensorFlow 1.x release rather than TensorFlow 2.

```bash
pip install webcollector
pip install flask
pip install beautifulsoup4

# CPU
pip install tensorflow

# GPU
pip install tensorflow-gpu
```

## Run

__recruit_bot.py__ is used for both training and serving.

### Training

To train, set `training = True` in __recruit_bot.py__ and run it. Training reads job titles from `title.txt` (produced by __title_crawler.py__) and saves checkpoints under `model/`.

### Serving

To serve, set `training = False` in __recruit_bot.py__ and run it.

After that, you can visit [http://127.0.0.1:5002](http://127.0.0.1:5002) to generate job-post titles.
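
The root route of the server returns the generated titles as newline-separated plain text, so you can also fetch them programmatically. Below is a minimal sketch using only the standard library; the URL assumes the default host and port hard-coded in __recruit_bot.py__.

```python
from urllib.request import urlopen

# The "/" route of the Flask app returns generated titles separated by "\n".
with urlopen("http://127.0.0.1:5002/") as response:
    titles = response.read().decode("utf-8").splitlines()

for title in titles:
    print(title)
```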
--------------------------------------------------------------------------------
/recruit_bot.py:
--------------------------------------------------------------------------------
# coding=utf-8
import tensorflow as tf

from recruit_bot.data import build_vocab, preprocess_all, get_start_and_end_indices
from recruit_bot.inference import predict

tf.enable_eager_execution()
import tensorflow.contrib.learn as learn
import os
from recruit_bot.module import RecruitBot
import numpy as np
from tqdm import tqdm

# Switch between training (True) and serving (False); see the README.
training = False

if training:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Hyperparameters and paths.
max_len = 50
embedding_size = 150
drop_rate = 0.3
batch_size = 50
vocab_path = "vocab.p"
model_path = "model/model"
model_dir = os.path.dirname(model_path)
checkpoint_path = tf.train.latest_checkpoint(model_dir)

if training:
    # Build (or restore) the vocabulary from the crawled titles.
    with open("title.txt", "r", encoding="utf-8") as f:
        text_list = [line.strip() for line in f.readlines()]
    words_list = preprocess_all(text_list)

    if checkpoint_path is None:
        vp = build_vocab(words_list, max_len, vocab_path)
    else:
        vp = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

    indices_list = np.array(list(vp.transform(words_list)), dtype=np.int32)
else:
    vp = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

vocab_size = len(vp.vocabulary_)
recruit_bot = RecruitBot(vocab_size, embedding_size, drop_rate)

start_index, end_index = get_start_and_end_indices(vp)

optimizer = tf.train.AdamOptimizer(learning_rate=5e-3)
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    model=recruit_bot,
    global_step=tf.train.get_or_create_global_step()
)

if training:

    if checkpoint_path is not None:
        print("restore")
        checkpoint.restore(checkpoint_path)

    for epoch in range(1000):
        for step, batch_data in tqdm(enumerate(
                tf.data.Dataset.from_tensor_slices(indices_list).shuffle(1000).batch(batch_size))):
            # Next-character objective: the input is each sequence without its
            # last token, the target is the same sequence shifted left by one.
            batch_input = batch_data[:, :-1]
            batch_output = batch_data[:, 1:]

            with tf.GradientTape() as tape:
                logits = recruit_bot(batch_input, training=True)
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits,
                    labels=tf.one_hot(batch_output, depth=vocab_size)
                )

            watched_vars = tape.watched_variables()
            grads = tape.gradient(losses, watched_vars)
            optimizer.apply_gradients(zip(grads, watched_vars),
                                      global_step=tf.train.get_or_create_global_step())

            if step == 0 and epoch % 5 == 0:
                # Every 5 epochs: print a sample title and save a checkpoint.
                words = recruit_bot.predict_words(vp, max_len)[0]
                print("".join(words))

                mean_loss = tf.reduce_mean(losses)
                checkpoint.save(model_path)
                print(epoch, step, mean_loss)
else:
    checkpoint.restore(tf.train.latest_checkpoint(model_dir))
    words = recruit_bot.predict_words(vp, max_len)[0]
    print("".join(words))

from flask import Flask

app = Flask('aop')


@app.route("/")
def index():
    words_list = recruit_bot.predict_words(vp, max_len)
    texts = ["".join(words) for words in words_list]
    return "\n".join(texts)


app.run(host='0.0.0.0', port=5002)
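
The training loop above implements a plain next-character language model: each padded index sequence is shifted by one position, so the model is asked to predict, at every position of the input, the character that follows it. A toy sketch of that shift with made-up indices (the real ones come from the `VocabularyProcessor`):

```python
import numpy as np

# Two made-up padded titles: <start> c1 c2 c3 <end> <pad> ...
batch_data = np.array([
    [2, 7, 5, 9, 3, 0],
    [2, 4, 8, 3, 0, 0],
])

batch_input = batch_data[:, :-1]   # everything except the last position
batch_output = batch_data[:, 1:]   # the same sequences shifted left by one

for inp, out in zip(batch_input, batch_output):
    print(inp, "->", out)          # position i of inp is trained to predict out[i]
```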
--------------------------------------------------------------------------------
/recruit_bot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__init__.py
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/data.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/data.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/inference.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/inference.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/__pycache__/module.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrawlScript/RecruitBot/b61d3d5d4abfef67c529659b341a1aed5d90d7f1/recruit_bot/__pycache__/module.cpython-35.pyc
--------------------------------------------------------------------------------
/recruit_bot/data.py:
--------------------------------------------------------------------------------
# coding=utf-8
from tensorflow.contrib import learn
import numpy as np

# Sentinel tokens marking the start and end of every title. The literal strings
# did not survive in this copy of the file; "<s>" and "</s>" are assumed
# stand-ins for the original start/end markers.
START_TOKEN = "<s>"
END_TOKEN = "</s>"


def preprocess(text):
    # Character-level tokenization: a title becomes a list of characters
    # wrapped in the start/end sentinels.
    return [START_TOKEN] + list(text) + [END_TOKEN]


def preprocess_all(text_list):
    return [preprocess(text) for text in text_list]


def tokenizer_fn(s):
    # Each document is already a list of characters, so no further splitting.
    return s


def build_vocab(words_list, max_len, vocab_path):
    vp = learn.preprocessing.VocabularyProcessor(
        max_document_length=max_len, tokenizer_fn=tokenizer_fn, min_frequency=10)
    vp.fit(words_list)
    vp.save(vocab_path)
    return vp


def get_start_and_end_indices(vp):
    return vp.vocabulary_.get(START_TOKEN), vp.vocabulary_.get(END_TOKEN)
--------------------------------------------------------------------------------
/recruit_bot/inference.py:
--------------------------------------------------------------------------------
# coding=utf-8

import tensorflow as tf
tf.enable_eager_execution()
from recruit_bot.data import get_start_and_end_indices
from tqdm import tqdm

import numpy as np


def predict(recruit_bot, vp, max_len):
    # Sample one title character by character, feeding the growing prefix
    # back into the model at every step.
    start_index, end_index = get_start_and_end_indices(vp)
    vocab_size = len(vp.vocabulary_)

    prefixes = [[start_index]]
    words = []

    for _ in tqdm(range(max_len)):
        inputs = tf.constant(np.array(prefixes), dtype=tf.int32)
        logits = recruit_bot(inputs)[0, -1]
        probs = tf.nn.softmax(logits, axis=-1).numpy()
        # Resample whenever the padding/unknown id (0) is drawn.
        while True:
            random_index = np.random.choice(vocab_size, 1, p=probs)[0]
            if random_index != 0:
                break

        prefixes[0].append(random_index)

        if random_index == end_index:
            break
        else:
            words.append(vp.vocabulary_.reverse(random_index))

    return words
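
A note on index 0: the `VocabularyProcessor` built in data.py pads or truncates every title to `max_len` and maps characters it never saw during fitting (or that fall below `min_frequency`) to id 0, which is why both inference.py and module.py resample whenever they draw 0. A small sketch of that behaviour, assuming a TensorFlow 1.x install where `tf.contrib` is still available (toy corpus; `min_frequency` is left at its default of 0 so the tiny example is not trimmed away):

```python
from tensorflow.contrib import learn
import numpy as np


def tokenizer_fn(s):
    return s  # documents are already lists of characters


vp = learn.preprocessing.VocabularyProcessor(
    max_document_length=8, tokenizer_fn=tokenizer_fn)
vp.fit([list("ruby dev"), list("rails dev")])

# Characters unseen during fit() ('g', 'o') map to 0, and trailing positions
# are zero-padded up to max_document_length.
indices = np.array(list(vp.transform([list("go dev")])))
print(indices)
print(len(vp.vocabulary_))  # vocabulary size, including the reserved 0 id
```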
--------------------------------------------------------------------------------
/recruit_bot/module.py:
--------------------------------------------------------------------------------
# coding=utf-8

import tensorflow as tf
import math

from recruit_bot.data import get_start_and_end_indices
from tqdm import tqdm
tf.enable_eager_execution()

from tensorflow.python import keras
import numpy as np


class RecruitBot(keras.Model):
    def __init__(self, vocab_size, embedding_size, drop_rate, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # Row 0 of the embedding table is reserved for the padding/unknown id
        # and stays a fixed zero vector; only the remaining rows are trained.
        self.partial_embeddings = tf.Variable(
            tf.truncated_normal([vocab_size - 1, embedding_size], stddev=1 / math.sqrt(embedding_size)))
        self.lstm = keras.layers.LSTM(embedding_size, return_sequences=True)
        self.dropout_layer = keras.layers.Dropout(drop_rate)
        self.dense_layer = keras.layers.Dense(vocab_size)

    def call(self, inputs, training=None, mask=None):
        embeddings = tf.concat([
            tf.zeros([1, self.embedding_size], dtype=tf.float32),
            self.partial_embeddings
        ], axis=0)

        embedded = tf.nn.embedding_lookup(embeddings, inputs)

        lstm_output = self.lstm(embedded, initial_state=self.create_initial_state(inputs))
        dropped_lstm_output = self.dropout_layer(lstm_output, training=training)
        logits = self.dense_layer(dropped_lstm_output)
        return logits

    def create_initial_state(self, inputs):
        if isinstance(inputs, list):
            batch_size = len(inputs)
        else:
            batch_size = inputs.shape[0]
        states = [
            tf.zeros([batch_size, self.embedding_size], dtype=tf.float32),
            tf.zeros([batch_size, self.embedding_size], dtype=tf.float32)
        ]
        return states

    def predict_words(self, vp, max_len):
        # Generate titles by stepping the LSTM cell one character at a time
        # and sampling from the softmax distribution at each step.
        vocab_size = len(vp.vocabulary_)
        start_index, end_index = get_start_and_end_indices(vp)

        current_inputs = [start_index]

        embeddings = tf.concat([
            tf.zeros([1, self.embedding_size], dtype=tf.float32),
            self.partial_embeddings
        ], axis=0)

        h, c = self.create_initial_state(current_inputs)
        outputs_list = []

        for i in tqdm(range(max_len - 1)):
            current_inputs = tf.constant(current_inputs, dtype=tf.int32)
            embedded = tf.nn.embedding_lookup(embeddings, current_inputs)
            _, [h, c] = self.lstm.cell(embedded, [h, c])
            logits = self.dense_layer(h)

            probs = tf.nn.softmax(logits).numpy()
            outputs = []
            for prob in probs:
                # Retry a few times to avoid sampling the padding/unknown id 0.
                for _ in range(5):
                    random_index = np.random.choice(vocab_size, 1, p=prob)[0]
                    if random_index != 0:
                        break
                outputs.append(random_index)
            outputs_list.append(outputs)
            current_inputs = outputs

        # outputs_list has shape [max_len - 1, batch]; stack to [batch, max_len - 1].
        indices_list = np.stack(outputs_list, axis=1)
        words_list = []
        for indices in indices_list:
            words = []
            for index in indices:
                if index in [0, start_index]:
                    continue
                if index == end_index:
                    break
                else:
                    words.append(vp.vocabulary_.reverse(index))
            words_list.append(words)
        return words_list
--------------------------------------------------------------------------------
/title_crawler.py:
--------------------------------------------------------------------------------
# coding=utf-8

import webcollector as wc
from bs4 import BeautifulSoup

f = open("title.txt", "w", encoding="utf-8")


class RubyCrawler(wc.RamCrawler):
    def __init__(self, **kwargs):
        super().__init__(auto_detect=False, **kwargs)
        self.num_threads = 10
        # Seed the crawler with the paginated job listings on ruby-china.org.
        self.add_seeds(["https://ruby-china.org/jobs?page={}".format(i) for i in range(1, 151)])

    def visit(self, page, detected):
        soup = BeautifulSoup(page.content, "html.parser")

        # Each job entry's title is the title attribute of the link inside
        # <div class="title media-heading">.
        for a in soup.select("div.title.media-heading > a[title]"):
            title = a["title"].strip()
            f.write("{}\n".format(title))


crawler = RubyCrawler()
crawler.start(10)
f.close()
--------------------------------------------------------------------------------
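
For reference, the crawler's extraction step hinges on a single CSS selector. A small offline sketch of what `div.title.media-heading > a[title]` matches, using made-up markup shaped like a ruby-china.org job listing (no network access needed):

```python
from bs4 import BeautifulSoup

# Made-up HTML in the shape of a ruby-china.org jobs page entry.
html = """
<div class="title media-heading">
  <a title="Ruby Backend Engineer (Remote)" href="/topics/1">Ruby Backend Engineer (Remote)</a>
</div>
<div class="title media-heading">
  <a title="Rails Full-Stack Engineer" href="/topics/2">Rails Full-Stack Engineer</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
for a in soup.select("div.title.media-heading > a[title]"):
    print(a["title"].strip())
```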