├── LICENSE ├── README.md └── markov_bot.py /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/39bit/Markov_Bot/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Markov_Bot 2 | 3 | Generates messages in chats based on Markov chains. 4 | 5 | Requires at least Python 3.3. 6 | 7 | By default, only sh-compatible shells and Unix-based systems are supported. espeak and opusenc are required for TTS: see the generateMarkovOgg function. 8 | 9 | Used libraries: requests (https://pypi.python.org/pypi/requests) -------------------------------------------------------------------------------- /markov_bot.py: -------------------------------------------------------------------------------- 1 | 2 | import os.path, pickle, hashlib, logging, time, sys, traceback, random, unicodedata, os, gc, json, urllib.error, urllib.parse, urllib.request, socket, requests, shlex 3 | # minimal Telegram bot library 4 | SENT = False 5 | 6 | T = "BOT_TOKEN_GOES_HERE" 7 | UA = "A_BROWSER_USER_AGENT_GOES_HERE" 8 | custom_urlopen = lambda u,**kw:urllib.request.urlopen(urllib.request.Request(u, headers={'User-Agent': UA}),**kw) 9 | class TelegramBot(): 10 | class attribute_dict(): 11 | def __init__(self, data): 12 | self.__data__ = data 13 | def __getattr__(self, index): 14 | if index == "__data__": return object.__getattr__(self, "__data__") 15 | try: 16 | return self.__getitem__(index) 17 | except KeyError: 18 | raise AttributeError 19 | def __getitem__(self, index): 20 | return self.__data__[index] 21 | def __setattr__(self, index, value): 22 | if index == "__data__": return object.__setattr__(self, "__data__", value) 23 | self.__setitem__(index) 24 | def __setitem__(self, index, value): 25 | self.__data__[index] = value 26 | def __delattr__(self, index, value): 27 | if index == "__data__": return object.__delattr__(self, "__data__", value) 28 | self.__delitem__(index) 29 | def __delitem__(self, index, value): 30 | del self.__data__[index] 31 | def __repr__(self): 32 | return repr(self.__data__) 33 | def __iter__(self): 34 | return iter(self.__data__) 35 | def __len__(self): 36 | return len(self.__data__) 37 | def keys(self): 38 | return self.__data__.keys() 39 | def has(self, key): 40 | return key in self.__data__.keys() and self.__data__[key] != None 41 | def __init__(self, token): 42 | self.token = token 43 | self.retry = 0 44 | def __getattr__(self, attr): 45 | return self.func_wrapper(attr) 46 | def get_url(self, fname, **kw): 47 | url_par={} 48 | for key in kw.keys(): 49 | if kw[key] != None: 50 | url_par[key] = urllib.parse.quote_plus(TelegramBot.escape(kw[key])) 51 | return (url_par,("https://api.telegram.org/bot" + self.token + "/" + (fname.replace("__UNSAFE","") if fname.endswith("__UNSAFE") else fname) + "?" + 52 | "&".join(map(lambda x:x+"="+url_par[x],url_par.keys())))) 53 | @staticmethod 54 | def default_urlopen(u): 55 | with custom_urlopen(u,timeout=90) as f: 56 | raw = f.read().decode('utf-8') 57 | return raw 58 | def func_wrapper(self, fname): 59 | def func(self, unsafe, _urlopen_hook=bot.default_urlopen, **kw): 60 | url_par, url = self.get_url(fname, **kw) 61 | RETRY = True 62 | while RETRY: 63 | try: 64 | raw = _urlopen_hook(url) 65 | RETRY = False 66 | except urllib.error.HTTPError as e: 67 | if "bad request" in str(e).lower() and not unsafe: 68 | print(fname, url) 69 | print(json.dumps(url_par)) 70 | print(e.read().decode('utf-8')) 71 | traceback.print_exc() 72 | return 73 | elif "forbidden" in str(e).lower() and not unsafe: 74 | print(fname, url) 75 | print(json.dumps(url_par)) 76 | print(e.read().decode('utf-8')) 77 | traceback.print_exc() 78 | return 79 | else: 80 | raise e 81 | except socket.timeout: 82 | if unsafe: 83 | raise ValueError("timeout") 84 | else: 85 | print("timeout!") 86 | time.sleep(1) 87 | except BaseException as e: 88 | print(str(e)) 89 | time.sleep(0.5) 90 | if "too many requests" in str(e).lower(): 91 | self.retry += 1 92 | time.sleep(self.retry * 5) 93 | elif "unreachable" in str(e).lower() or "bad gateway" in str(e).lower() or "name or service not known" in str(e).lower() or "network" in str(e).lower() or "handshake operation timed out" in str(e).lower(): 94 | time.sleep(3) 95 | elif "bad request" in str(e).lower() and not unsafe: 96 | print(fname, url) 97 | print(json.dumps(url_par)) 98 | traceback.print_exc() 99 | return 100 | elif "forbidden" in str(e).lower() and not unsafe: 101 | print(fname, url) 102 | print(json.dumps(url_par)) 103 | traceback.print_exc() 104 | return 105 | else: 106 | raise e 107 | self.retry = 0 108 | return TelegramBot.attributify(json.loads(raw)) 109 | return lambda **kw:func(self,fname.endswith("__UNSAFE"),**kw) 110 | @staticmethod 111 | def escape(obj): 112 | if type(obj) == str: 113 | return obj 114 | else: 115 | return json.dumps(obj).encode('utf-8') 116 | @staticmethod 117 | def attributify(obj): 118 | if type(obj)==list: 119 | return list(map(TelegramBot.attributify,obj)) 120 | elif type(obj)==dict: 121 | d = obj 122 | for k in d.keys(): 123 | d[k] = TelegramBot.attributify(d[k]) 124 | return TelegramBot.attribute_dict(d) 125 | else: 126 | return obj 127 | 128 | groups = {} 129 | 130 | # Unicode character categories considered 131 | ALLOWABLE = ["Lc","Ll","Lm","Lo","Lt","Lu","Nd","Nl","No"] 132 | COMMON_T = 0 133 | 134 | SPLIT_LINES = False 135 | LAST_USER = {} 136 | 137 | # Supported TTS languages 138 | LANGS = ["af","an","bg","bs","ca","cs","cy","da","de","el","en","en-gb","en-sc","en-uk-north","en-uk-rp","en-uk-wmids","en-us","en-wi","eo","es","es-la","et","fa","fa-pin","fi","fr-be","fr-fr","ga","grc","hi","hr","hu","hy","hy-west","id","is","it","jbo","ka","kn","ku","la","lfn","lt","lv","mk","ml","ms","ne","nl","no","pa","pl","pt-br","pt-pt","ro","ru","sk","sq","sr","sv","sw","ta","tr","vi","vi-hue","vi-sgn","zh","zh-yue"] 139 | 140 | gcache = [] 141 | # how many groups will be cached at most at one time 142 | max_cache_size = 10 143 | # GC is forced every N group unloads 144 | gc_every_unload = 30 145 | gc_counter = gc_every_unload 146 | 147 | # obtained when the bot is initialized 148 | MY_USERNAME = "" 149 | 150 | # whether to auto-restart? 151 | Restart = False 152 | 153 | try: 154 | from urllib.error import URLError 155 | except ImportError: 156 | from urllib2 import URLError 157 | 158 | def save(reason): 159 | print("SAVING ",reason) 160 | for key in groups: 161 | save_group(key) 162 | print("SAVED") 163 | 164 | bot = TelegramBot(T) 165 | MY_USERNAME = bot.getMe().result.username.lower() 166 | 167 | last_msg_id = 0 168 | 169 | def addMessage(message, g): 170 | w = [""] + message.lower().split(" ") + [""] 171 | for i in range(1,len(w)): 172 | lw = "".join(filter(lambda x:(unicodedata.category(x) in ALLOWABLE),w[i-1])) 173 | nw = w[i] 174 | if len(lw) < 50 and len(nw) < 50: 175 | if lw not in g.keys(): 176 | g[lw] = [] 177 | g[lw].append(nw) 178 | 179 | def limit(s): 180 | t = " ".join(s.split(" ")[:50]) 181 | return t[:400] 182 | 183 | def load_group(chat_id): 184 | global gcache 185 | try: 186 | with open("markov/chat_" + str(chat_id) + ".dat", "rb") as f: 187 | groups[chat_id] = pickle.load(f) 188 | gcache.append(chat_id) 189 | except KeyboardInterrupt as e: 190 | raise e 191 | except: 192 | pass 193 | check_cache() 194 | 195 | def check_cache(): 196 | global gcache 197 | while len(gcache) > max_cache_size: 198 | unload_group(gcache[0]) 199 | 200 | def unload_group(chat_id): 201 | global gcache, gc_counter 202 | try: 203 | with open("markov/chat_" + str(chat_id) + ".dat", "wb") as f: 204 | pickle.dump(groups[chat_id], f) 205 | groups[chat_id] = None 206 | del groups[chat_id] 207 | gcache.remove(chat_id) 208 | gc_counter -= 1 209 | if gc_counter < 1: 210 | gc_counter = gc_every_unload 211 | gc.collect() 212 | except KeyboardInterrupt as e: 213 | raise e 214 | except: 215 | pass 216 | 217 | def save_group(chat_id): 218 | try: 219 | with open("markov/chat_" + str(chat_id) + ".dat", "wb") as f: 220 | pickle.dump(groups[chat_id], f) 221 | except: 222 | pass 223 | 224 | def generateMarkovOgg(msg, g): 225 | # g are the group settings 226 | # msg is the message data 227 | # call espeak and opusenc 228 | os.system("rm markov.ogg 2>nul") 229 | os.system("espeak -s" + str(g[2]) + " -v" + g[1] + " " + shlex.quote(limit(msg)) + " --stdout | opusenc - markov.ogg >nul 2>&1") 230 | 231 | import logging 232 | 233 | tried_to = 0 234 | saferes = True 235 | OFF = 0 236 | try: 237 | def autoreset(): 238 | time.sleep(600) 239 | while not saferes: 240 | time.sleep(0.5) 241 | tried_to = 10000 242 | time.sleep(30) 243 | save("quitting - backup thread") 244 | os.execl(sys.executable, sys.executable, *sys.argv) 245 | if Restart: 246 | threading.Thread(target=autoreset, daemon=True).start() 247 | while True: 248 | tried_to += 1 249 | if tried_to >= 1000 and Restart: 250 | save("quitting") 251 | os.execl(sys.executable, sys.executable, *sys.argv) 252 | print("poll " + str(time.time()),end=":") 253 | saferes = False 254 | try: 255 | updates = bot.getUpdates__UNSAFE(offset=OFF, timeout=5).result 256 | except KeyboardInterrupt as e: 257 | print("E") 258 | raise e 259 | except BaseException as e: 260 | print("0") 261 | if str(e).strip().lower() != "timeout": 262 | print("poll failed: ", e) 263 | continue 264 | print(len(updates), end="") 265 | print("(" + str(OFF) + ")") 266 | for update in updates: 267 | last_msg_id = update.update_id 268 | OFF = update.update_id + 1 269 | if not update.has("message"): 270 | continue 271 | if update.message == None: 272 | continue 273 | chat_id = update.message.chat.id 274 | chat_type = update.message.chat.type 275 | if update.message.has("migrate_from_chat_id"): 276 | nid = update.message.chat.id 277 | oid = update.message.migrate_from_chat_id 278 | if oid == nid: 279 | continue 280 | if oid in gcache: 281 | unload_group(oid) 282 | # rename db file 283 | try: 284 | os.rename("markov/chat_" + str(oid) + ".dat", "markov/chat_" + str(nid) + ".dat") 285 | except: # file does not exist, ignore 286 | pass 287 | continue 288 | if update.message.has("text"): 289 | message = update.message.text 290 | else: 291 | message = "" 292 | replyto = update.message.message_id 293 | if update.message.has("from"): 294 | user = update.message["from"].id 295 | else: 296 | user = -1 297 | admbypass = False 298 | try: 299 | admbypass = admbypass or update.message.chat.all_members_are_administrators 300 | except: 301 | pass 302 | 303 | if chat_id not in gcache: 304 | load_group(chat_id) 305 | 306 | if chat_id not in groups.keys(): 307 | groups[chat_id] = {} 308 | gcache.append(chat_id) 309 | check_cache() 310 | 311 | # g contents 312 | # [mlimit, tts language, tts speed, markov collecting (pause/resume), ~ maximum words] 313 | g = groups[chat_id] 314 | if g == None: 315 | groups[chat_id] = {} 316 | g = {} 317 | if 0 not in g.keys(): 318 | g[0] = 1 319 | if 1 not in g.keys(): 320 | g[1] = "en" 321 | if 2 not in g.keys(): 322 | g[2] = 100 323 | if 3 not in g.keys(): 324 | g[3] = True 325 | if 4 not in g.keys(): 326 | g[4] = 10000 327 | 328 | curtime = time.time() 329 | t = str(user) + ":" + str(chat_id) 330 | 331 | if len(message) < 1: 332 | continue 333 | if message[0] == "/": 334 | rcmd = message.split(" ")[0].split("@")[0] 335 | if "@" in message.split(" ")[0]: 336 | cmdtarget = message.split(" ")[0].split("@")[1] 337 | # if the command is aimed at some other bot 338 | if cmdtarget.lower() != MY_USERNAME: 339 | continue 340 | cmd = rcmd.lower() 341 | if cmd == "/markov": 342 | if t in LAST_USER.keys(): 343 | if (curtime - LAST_USER[t]) < g[0]: 344 | continue 345 | 346 | LAST_USER[t] = curtime 347 | COMMON_T += 1 348 | if COMMON_T == 8: 349 | COMMON_T = 0 350 | tries_o = 0 351 | if "" in g.keys(): 352 | while True: 353 | tries_o += 1 354 | words = [] 355 | word = "" 356 | if random.randint(0,10)<5: 357 | word = random.choice(list(filter(lambda x:type(x)==str,g.keys()))) 358 | else: 359 | word = random.choice(g[word]) 360 | while word != "" and len(words) < min(g[4],100): 361 | words.append(word) 362 | word = "".join(filter(lambda x:(unicodedata.category(x) in ALLOWABLE),word)).lower() 363 | if word not in g.keys(): 364 | word = "" 365 | else: 366 | word = random.choice(g[word]) 367 | msg = " ".join(words) 368 | if len(msg) > 0: break 369 | if tries_o > 1000: break 370 | try: 371 | bot.sendMessage(chat_id=chat_id, 372 | text=msg) 373 | except KeyboardInterrupt as e: 374 | raise e 375 | except: 376 | pass 377 | else: 378 | try: 379 | bot.sendMessage(chat_id=chat_id, 380 | text="[Chain is empty]", 381 | reply_to_message_id=replyto) 382 | except KeyboardInterrupt as e: 383 | raise e 384 | except: 385 | pass 386 | if cmd == "/mlimit": 387 | if t in LAST_USER.keys(): 388 | if (curtime - LAST_USER[t]) < 1: 389 | continue 390 | try: 391 | st = bot.getChatMember(chat_id=chat_id, user_id=user).result.status 392 | if chat_type in ["group","supergroup","channel"] and not admbypass and (st != "administrator" and st != "creator"): 393 | continue 394 | except KeyboardInterrupt as e: 395 | raise e 396 | except: 397 | pass 398 | t = " ".join(message.split(" ")[1:]).strip() 399 | if len(t) < 1: 400 | bot.sendMessage(chat_id=chat_id, 401 | text="[Usage: /mlimit seconds]", 402 | reply_to_message_id=replyto) 403 | continue 404 | try: 405 | v = int(t) 406 | except KeyboardInterrupt as e: 407 | raise e 408 | except: 409 | bot.sendMessage(chat_id=chat_id, 410 | text="[Usage: /mlimit seconds]", 411 | reply_to_message_id=replyto) 412 | continue 413 | if v <= 0 or v > 100000: 414 | bot.sendMessage(chat_id=chat_id, 415 | text="[limit must be between 1-100 000 seconds]", 416 | reply_to_message_id=replyto) 417 | continue 418 | #print(t, "=", g[0]) 419 | bot.sendMessage(chat_id=chat_id, 420 | text="[Limit set]", 421 | reply_to_message_id=replyto) 422 | g[0] = v 423 | if cmd == "/markovttsspeed": 424 | if t in LAST_USER.keys(): 425 | if (curtime - LAST_USER[t]) < 1: 426 | continue 427 | t = " ".join(message.split(" ")[1:]).strip() 428 | if len(t) < 1: 429 | bot.sendMessage(chat_id=chat_id, 430 | text="[Usage: /markovttsspeed wpm]", 431 | reply_to_message_id=replyto) 432 | continue 433 | try: 434 | v = int(t) 435 | except KeyboardInterrupt as e: 436 | raise e 437 | except: 438 | bot.sendMessage(chat_id=chat_id, 439 | text="[Usage: /markovttsspeed wpm]", 440 | reply_to_message_id=replyto) 441 | continue 442 | if v < 80 or v > 500: 443 | bot.sendMessage(chat_id=chat_id, 444 | text="[Speed must be between 80-500 wpm]", 445 | reply_to_message_id=replyto) 446 | continue 447 | bot.sendMessage(chat_id=chat_id, 448 | text="[Speed set]", 449 | reply_to_message_id=replyto) 450 | g[2] = v 451 | if cmd == "/markovmaxwords": 452 | if t in LAST_USER.keys(): 453 | if (curtime - LAST_USER[t]) < 1: 454 | continue 455 | try: 456 | st = bot.getChatMember(chat_id=chat_id, user_id=user).result.status 457 | if chat_type in ["group","supergroup","channel"] and not admbypass and (st != "administrator" and st != "creator"): 458 | continue 459 | except KeyboardInterrupt as e: 460 | raise e 461 | except: 462 | pass 463 | t = " ".join(message.split(" ")[1:]).strip() 464 | if len(t) < 1: 465 | bot.sendMessage(chat_id=chat_id, 466 | text="[Usage: /markovmaxwords words]", 467 | reply_to_message_id=replyto) 468 | continue 469 | try: 470 | v = int(t) 471 | except KeyboardInterrupt as e: 472 | raise e 473 | except: 474 | bot.sendMessage(chat_id=chat_id, 475 | text="[Usage: /markovmaxwords words]", 476 | reply_to_message_id=replyto) 477 | continue 478 | if v < 1 or v > 120: 479 | bot.sendMessage(chat_id=chat_id, 480 | text="[Limit for words is 1-120]", 481 | reply_to_message_id=replyto) 482 | continue 483 | g[4] = v 484 | save_group(chat_id) 485 | bot.sendMessage(chat_id=chat_id, 486 | text="[Maximum words set]", 487 | reply_to_message_id=replyto) 488 | if cmd == "/markovclear": 489 | if t in LAST_USER.keys(): 490 | if (curtime - LAST_USER[t]) < 1: 491 | continue 492 | try: 493 | # do not allow non-admins to clear 494 | st = bot.getChatMember(chat_id=chat_id, user_id=user).result.status 495 | if chat_type in ["group","supergroup","channel"] and not admbypass and (st != "administrator" and st != "creator"): 496 | continue 497 | except KeyboardInterrupt as e: 498 | raise e 499 | except: 500 | pass 501 | checkhash = hashlib.md5((str(chat_id)+str(user)+str(time.time()//1000)).encode("utf-8")).hexdigest()[:12].upper() 502 | what = "" 503 | try: 504 | what = message.split(" ")[1].upper() 505 | except KeyboardInterrupt as e: 506 | raise e 507 | except: 508 | pass 509 | if what == checkhash: 510 | groups[chat_id] = {} 511 | save_group(chat_id) 512 | bot.sendMessage(chat_id=chat_id, 513 | text="[Messages cleared]", 514 | reply_to_message_id=replyto) 515 | else: 516 | bot.sendMessage(chat_id=chat_id, 517 | text="[Copy this to confirm]\n/markovclear " + checkhash, 518 | reply_to_message_id=replyto) 519 | if cmd == "/markovpause": 520 | if t in LAST_USER.keys(): 521 | if (curtime - LAST_USER[t]) < 1: 522 | continue 523 | try: 524 | st = bot.getChatMember(chat_id=chat_id, user_id=user).result.status 525 | if chat_type in ["group","supergroup","channel"] and not admbypass and (st != "administrator" and st != "creator"): 526 | continue 527 | except KeyboardInterrupt as e: 528 | raise e 529 | except: 530 | pass 531 | g[3] = False 532 | save_group(chat_id) 533 | bot.sendMessage(chat_id=chat_id, 534 | text="[Reading paused]", 535 | reply_to_message_id=replyto) 536 | if cmd == "/markovresume": 537 | if t in LAST_USER.keys(): 538 | if (curtime - LAST_USER[t]) < 1: 539 | continue 540 | try: 541 | st = bot.getChatMember(chat_id=chat_id, user_id=user).result.status 542 | if chat_type in ["group","supergroup","channel"] and not admbypass and (st != "administrator" and st != "creator"): 543 | continue 544 | except KeyboardInterrupt as e: 545 | raise e 546 | except: 547 | pass 548 | g[3] = True 549 | save_group(chat_id) 550 | bot.sendMessage(chat_id=chat_id, 551 | text="[Reading resumed]", 552 | reply_to_message_id=replyto) 553 | if cmd == "/markovtts": 554 | if t in LAST_USER.keys(): 555 | if (curtime - LAST_USER[t]) < max(5,g[0]): 556 | continue 557 | LAST_USER[t] = curtime 558 | COMMON_T += 1 559 | if COMMON_T == 8: 560 | COMMON_T = 0 561 | if "" in g.keys(): 562 | while True: 563 | words = [] 564 | word = "" 565 | if random.randint(0,10)<5: 566 | word = random.choice(list(filter(lambda x:type(x)==str,g.keys()))) 567 | else: 568 | word = random.choice(g[word]) 569 | while word != "" and len(words) < min(g[4],120): 570 | words.append(word) 571 | word = "".join(filter(lambda x:(unicodedata.category(x) in ALLOWABLE),word)).lower() 572 | if word not in g.keys(): 573 | word = "" 574 | else: 575 | word = random.choice(g[word]) 576 | msg = " ".join(words) 577 | if len(msg) > 0: break 578 | try: 579 | generateMarkovOgg(msg, g) 580 | headers = {'User-Agent': UA} 581 | files = {"voice": open("markov.ogg","rb")} 582 | bot.sendVoice(_urlopen_hook=lambda u:requests.post(u, headers=headers, files=files).text, 583 | chat_id=chat_id) 584 | except KeyboardInterrupt as e: 585 | raise e 586 | except BaseException as e: 587 | exc_type, exc_value, exc_traceback = sys.exc_info() 588 | print("\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))) 589 | bot.sendMessage(chat_id=chat_id, 590 | text="Could not send voice", 591 | reply_to_message_id=replyto) 592 | else: 593 | bot.sendMessage(chat_id=chat_id, 594 | text="[Chain is empty]", 595 | reply_to_message_id=replyto) 596 | if cmd == "/markovttslang": 597 | if t in LAST_USER.keys(): 598 | if (curtime - LAST_USER[t]) < 1: 599 | continue 600 | v = " ".join(message.split(" ")[1:]).strip() 601 | if v not in LANGS: 602 | bot.sendMessage(chat_id=chat_id, 603 | text=("[Unknown language]\n" if len(v) > 0 else "") + ", ".join(LANGS), 604 | reply_to_message_id=replyto) 605 | continue 606 | bot.sendMessage(chat_id=chat_id, 607 | text="[Language set]", 608 | reply_to_message_id=replyto) 609 | g[1] = v 610 | elif message[0] != "/": 611 | if g[3]: 612 | if SPLIT_LINES: 613 | for line in message.split("\n"): 614 | addMessage(line, g) 615 | else: 616 | addMessage(message, g) 617 | saferes = True 618 | time.sleep(0.02) 619 | except KeyboardInterrupt as e: 620 | save("Quit") 621 | except BaseException as e: 622 | save("Exception") 623 | traceback.print_exc() 624 | 625 | --------------------------------------------------------------------------------