├── .gitignore
├── README.md
├── genevabot.py
├── requirements.txt
└── tscrape.py

/.gitignore:
--------------------------------------------------------------------------------
*.markov
.vscode/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Geneva Bot

Scrape Tumblr blogs for a corpus, convert it into a Markov probability matrix,
and generate text posts in the style of the original blog.

This process consists of two scripts, `tscrape.py` and `genevabot.py`. The
first scrapes a Tumblr blog for text and tags (because lots of great content is
included in Tumblr tags!) and creates a PyMarkovChain probability database from
that data. The second simply reconstitutes the probability database into memory
and generates some sentences from it.

Here is a sample usage:

```
13:03:39: leo [~/Projects/genevabot]
$ ./tscrape.py clientsfromhell 1 10 --notags
Scraping page 1
[...]
Scraping page 10
Generating database...
Dumping database to clientsfromhell.posts.markov

13:03:59: leo [~/Projects/genevabot]
$ ./genevabot.py clientsfromhell.posts.markov 10
Perfect for everything from invitations to logos, the 35+ unique fonts – for example, an idea
logo, and brand image for my sister’s cake
It’s a Samsung iPhone
Me: I removed all the details
do it for free, and still
By making it easy to make bold and professional designs,
Thanks for the time to actually review said contract, I was
After this phone call, I received some revisions from said customer that came in to work with us can pay our full fees and is standard across
I sent it over – along with a
What will we do
```
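
The dumped database can also be reloaded programmatically with PyMarkovChain.
A minimal sketch (the filename assumes the sample run above; the tag line only
works if the database was built without `--notags`):

```
from pymarkovchain import MarkovChain

# Reload the database dumped by tscrape.py
bot = MarkovChain("clientsfromhell.posts.markov")

# Generate one post body, then a tag line seeded with '#'
print(bot.generateString())
print(bot.generateStringWithSeed("#"))
```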
--------------------------------------------------------------------------------
/genevabot.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Generate text posts from a PyMarkovChain database.
"""
import argparse
import sys

import pymarkovchain
from pymarkovchain import MarkovChain

PARSER = argparse.ArgumentParser(
    description="Generate Tumblr posts from a Markov chain database.")
PARSER.add_argument("filename", metavar="CORPUS", type=str,
                    help="The corpus to use in generating text.")
PARSER.add_argument("number", metavar="NUMBER", type=int,
                    help="The number of strings to generate.")
PARSER.add_argument('--minlen', metavar="LENGTH", type=int,
                    help="Throw out strings shorter than this.", default=3)
PARSER.add_argument('--notags', action="store_true",
                    help="Don't generate tags (for compatibility with legacy databases).")

ARGS = PARSER.parse_args()

FILENAME = ARGS.filename
NUMBER = ARGS.number

BOT = MarkovChain(FILENAME)

# Keep generating until we have NUMBER strings of at least --minlen words.
VALID_SENTENCES = 0
while VALID_SENTENCES < NUMBER:
    SENTENCE = BOT.generateString()
    if len(SENTENCE.split()) < ARGS.minlen:
        continue
    VALID_SENTENCES += 1
    print(SENTENCE)

if not ARGS.notags:
    try:
        # Tag lines were stored in the corpus prefixed with '#'.
        TAGS = BOT.generateStringWithSeed("#")
        print(TAGS)
        print(" --- ")
    except pymarkovchain.StringContinuationImpossibleError:
        print("[FATAL] Your database does not have tag data.")
        print("You can still generate posts without tags using --notags")
        sys.exit(1)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
beautifulsoup4
pymarkovchain
lxml
--------------------------------------------------------------------------------
/tscrape.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Build PyMarkovChain databases from Tumblr blogs.
"""
import urllib.parse

import argparse
import requests
from bs4 import BeautifulSoup
from pymarkovchain import MarkovChain

PARSER = argparse.ArgumentParser(
    description="Build a Markov chain bot from a Tumblr " +
                "blog (or a single tag on a blog).")
PARSER.add_argument('url', metavar="URL", type=str,
                    help="The Tumblr subdomain to scrape. " +
                         "For staff.tumblr.com, URL would be staff")
PARSER.add_argument('--tag',
                    help="The tag to scrape on the given blog. " +
                         "Don't include the hash symbol '#'.")
PARSER.add_argument('start_page', type=int, metavar='START_PAGE',
                    help="The page from which to start scraping content.")
PARSER.add_argument('end_page', type=int, metavar='END_PAGE',
                    help="The final page to scrape content from.")
PARSER.add_argument("--debug", action="store_true",
                    help="Print the scraped corpus and exit without "
                         "building a database.")
PARSER.add_argument("--notags",
                    action="store_true", help="Don't scrape tags, only content.")
PARSER.add_argument("--nohash",
                    action="store_true", help="Don't add the # symbol to text from tags.")
PARSER.add_argument("--prune", action="store_true", help="Prune short tags.")

ARGS = PARSER.parse_args()

if ARGS.tag is None:
    TARGET_URL = "https://{}.tumblr.com/page/".format(ARGS.url)
    TARGET_FILE = "{}.posts.markov".format(ARGS.url)
else:
    TARGET_URL = "https://{}.tumblr.com/tagged/{}/page/"
    TARGET_URL = TARGET_URL.format(ARGS.url, urllib.parse.quote(ARGS.tag))
    TARGET_FILE = "{}.{}.markov".format(ARGS.url, urllib.parse.quote(ARGS.tag))
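
# For example, a hypothetical invocation `./tscrape.py staff --tag features 1 5`
# would yield TARGET_URL = "https://staff.tumblr.com/tagged/features/page/" and
# TARGET_FILE = "staff.features.markov".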

CORPUS = ""

for page_number in range(ARGS.start_page, ARGS.end_page + 1):
    print("Scraping page {}".format(page_number))
    soup = BeautifulSoup(
        requests.get(TARGET_URL + str(page_number)).text, 'lxml')

    # Search <p> tags for post content
    for para in soup.find_all('p'):
        t = para.get_text()
        if not t:
            continue
        if "Originally posted by" in t:
            continue
        if "replied to your post" in t:
            continue
        CORPUS += t + "\n"

    if ARGS.notags:
        continue

    # Start the tags segment
    CORPUS += "# "
    # Search <a> tags for post tags
    for tag in soup.find_all('a'):
        h = tag.get('href')
        if h is None:
            continue

        # Only extract tagged URLs
        if "/tagged" not in h:
            continue

        # If there's no text, skip
        t = tag.get_text()
        if not t:
            continue

        # Very commonly used tags
        if "//" in t:
            continue
        if "cw: " in t:
            continue

        # Prune short tags
        if ARGS.prune and len(t) <= 3:
            continue

        # Tags which are just numbers should not be in the corpus
        try:
            int(t.strip())
            continue
        except ValueError:
            pass

        if ARGS.nohash:
            CORPUS += t + " "
        else:
            CORPUS += '#' + t + " "
    CORPUS += "\n"


if ARGS.debug:
    print(CORPUS)
    exit(1)
print("Generating database...")
BOT = MarkovChain(TARGET_FILE)
BOT.generateDatabase(CORPUS)
print("Dumping database to {}".format(TARGET_FILE))
BOT.dumpdb()
--------------------------------------------------------------------------------