├── .gitignore
├── README.md
├── genevabot.py
├── requirements.txt
└── tscrape.py

/.gitignore:
--------------------------------------------------------------------------------
*.markov
.vscode/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Geneva Bot

Scrape Tumblr blogs for a corpus, convert it into a Markov probability matrix,
and generate text posts in the style of the original blog.

This process consists of two scripts, `tscrape.py` and `genevabot.py`. The
first scrapes a Tumblr blog for text and tags (because lots of great content is
included in Tumblr tags!) and creates a PyMarkovChain probability database from
that data. The second simply reconstitutes the probability database into memory
and generates some sentences from it.

Here is a sample usage:

```
13:03:39: leo [~/Projects/genevabot]
$ ./tscrape.py clientsfromhell 1 10 --notags
Scraping page 1
[...]
Scraping page 10
Generating database...
Dumping database to clientsfromhell.posts.markov

13:03:59: leo [~/Projects/genevabot]
$ ./genevabot.py clientsfromhell.posts.markov 10
Perfect for everything from invitations to logos, the 35+ unique fonts – for example, an idea
logo, and brand image for my sister’s cake
It’s a Samsung iPhone
Me: I removed all the details
do it for free, and still
By making it easy to make bold and professional designs,
Thanks for the time to actually review said contract, I was
After this phone call, I received some revisions from said customer that came in to work with us can pay our full fees and is standard across
I sent it over – along with a
What will we do
```
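
The dumped database can also be reloaded programmatically with PyMarkovChain.
A minimal sketch (the filename assumes the sample run above; the tag line only
works if the database was built without `--notags`):

```
from pymarkovchain import MarkovChain

# Reload the database dumped by tscrape.py
bot = MarkovChain("clientsfromhell.posts.markov")

# Generate one post body, then a tag line seeded with '#'
print(bot.generateString())
print(bot.generateStringWithSeed("#"))
```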
--------------------------------------------------------------------------------
/genevabot.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Generate text posts from a PyMarkovChain database.
"""
import argparse
import sys

import pymarkovchain
from pymarkovchain import MarkovChain

PARSER = argparse.ArgumentParser(
    description="Generate Tumblr posts from a Markov chain database.")
PARSER.add_argument("filename", metavar="CORPUS", type=str,
                    help="The corpus to use in generating text.")
PARSER.add_argument("number", metavar="NUMBER", type=int,
                    help="The number of strings to generate.")
PARSER.add_argument('--minlen', metavar="LENGTH", type=int,
                    help="Throw out strings shorter than this.", default=3)
PARSER.add_argument('--notags', action="store_true",
                    help="Don't generate tags (for compatibility with legacy databases).")

ARGS = PARSER.parse_args()

FILENAME = ARGS.filename
NUMBER = ARGS.number

BOT = MarkovChain(FILENAME)

# Keep generating until we have NUMBER strings of at least --minlen words.
VALID_SENTENCES = 0
while VALID_SENTENCES < NUMBER:
    SENTENCE = BOT.generateString()
    if len(SENTENCE.split()) < ARGS.minlen:
        continue
    VALID_SENTENCES += 1
    print(SENTENCE)

if not ARGS.notags:
    try:
        # Tag lines were stored in the corpus prefixed with '#'.
        TAGS = BOT.generateStringWithSeed("#")
        print(TAGS)
        print(" --- ")
    except pymarkovchain.StringContinuationImpossibleError:
        print("[FATAL] Your database does not have tag data.")
        print("You can still generate posts without tags using --notags")
        sys.exit(1)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
beautifulsoup4
pymarkovchain
lxml
--------------------------------------------------------------------------------
/tscrape.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Build PyMarkovChain databases from Tumblr blogs.
"""
import urllib.parse

import argparse
import requests
from bs4 import BeautifulSoup
from pymarkovchain import MarkovChain

PARSER = argparse.ArgumentParser(
    description="Build a Markov chain bot from a Tumblr " +
                "blog (or a single tag on a blog).")
PARSER.add_argument('url', metavar="URL", type=str,
                    help="The Tumblr subdomain to scrape. " +
                         "For staff.tumblr.com, URL would be staff")
PARSER.add_argument('--tag',
                    help="The tag to scrape on the given blog. " +
                         "Don't include the hash symbol '#'.")
PARSER.add_argument('start_page', type=int, metavar='START_PAGE',
                    help="The page from which to start scraping content.")
PARSER.add_argument('end_page', type=int, metavar='END_PAGE',
                    help="The final page to scrape content from.")
PARSER.add_argument("--debug", action="store_true",
                    help="Print the scraped corpus and exit without "
                         "building a database.")
PARSER.add_argument("--notags",
                    action="store_true", help="Don't scrape tags, only content.")
PARSER.add_argument("--nohash",
                    action="store_true", help="Don't add the # symbol to text from tags.")
PARSER.add_argument("--prune", action="store_true", help="Prune short tags.")

ARGS = PARSER.parse_args()

if ARGS.tag is None:
    TARGET_URL = "https://{}.tumblr.com/page/".format(ARGS.url)
    TARGET_FILE = "{}.posts.markov".format(ARGS.url)
else:
    TARGET_URL = "https://{}.tumblr.com/tagged/{}/page/"
    TARGET_URL = TARGET_URL.format(ARGS.url, urllib.parse.quote(ARGS.tag))
    TARGET_FILE = "{}.{}.markov".format(ARGS.url, urllib.parse.quote(ARGS.tag))
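
# For example, a hypothetical invocation `./tscrape.py staff --tag features 1 5`
# would yield TARGET_URL = "https://staff.tumblr.com/tagged/features/page/" and
# TARGET_FILE = "staff.features.markov".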

CORPUS = ""

for page_number in range(ARGS.start_page, ARGS.end_page + 1):
    print("Scraping page {}".format(page_number))
    soup = BeautifulSoup(
        requests.get(TARGET_URL + str(page_number)).text, 'lxml')

    # Search <p> tags for post content
    for para in soup.find_all('p'):
        t = para.get_text()
        if not t:
            continue
        if "Originally posted by" in t:
            continue
        if "replied to your post" in t:
            continue
        CORPUS += t + "\n"

    if ARGS.notags:
        continue

    # Start the tags segment
    CORPUS += "# "
    # Search <a> tags for post tags
    for tag in soup.find_all('a'):
        h = tag.get('href')
        if h is None:
            continue

        # Only extract tagged URLs
        if "/tagged" not in h:
            continue

        # If there's no text, skip
        t = tag.get_text()
        if not t:
            continue

        # Very commonly used tags
        if "//" in t:
            continue
        if "cw: " in t:
            continue

        # Prune short tags
        if ARGS.prune and len(t) <= 3:
            continue

        # Tags which are just numbers should not be in the corpus
        try:
            int(t.strip())
            continue
        except ValueError:
            pass

        if ARGS.nohash:
            CORPUS += t + " "
        else:
            CORPUS += '#' + t + " "
    CORPUS += "\n"


if ARGS.debug:
    print(CORPUS)
    exit(1)
print("Generating database...")
BOT = MarkovChain(TARGET_FILE)
BOT.generateDatabase(CORPUS)
print("Dumping database to {}".format(TARGET_FILE))
BOT.dumpdb()
--------------------------------------------------------------------------------