├── .gitignore ├── assets └── imgs │ └── medium.jpg ├── external.py ├── requirements.txt ├── LICENSE.txt ├── README.md ├── medium.py └── functions.py /.gitignore: -------------------------------------------------------------------------------- 1 | *_blogs/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /assets/imgs/medium.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xAmmar/Medium-Miner/HEAD/assets/imgs/medium.jpg -------------------------------------------------------------------------------- /external.py: -------------------------------------------------------------------------------- 1 | class bcolors: 2 | OKCYAN = "\033[96m" 3 | OKGREEN = "\033[92m" 4 | WARNING = "\033[93m" 5 | FAIL = "\033[91m" 6 | ENDC = "\033[0m" 7 | BOLD = "\033[1m" 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.12.2 2 | bs4==0.0.1 3 | certifi==2023.7.22 4 | charset-normalizer==3.2.0 5 | click==8.1.6 6 | cssselect==1.2.0 7 | feedfinder2==0.0.4 8 | filelock==3.12.2 9 | google==3.0.0 10 | idna==3.4 11 | jieba3k==0.35.1 12 | joblib==1.3.2 13 | lxml==4.9.3 14 | markdownify==0.11.6 15 | nltk==3.8.1 16 | Pillow==10.0.0 17 | python-dateutil==2.8.2 18 | PyYAML==6.0.1 19 | regex==2023.8.8 20 | requests==2.31.0 21 | requests-file==1.5.1 22 | sgmllib3k==1.0.0 23 | six==1.16.0 24 | soupsieve==2.4.1 25 | tinysegmenter==0.3 26 | tldextract==3.4.4 27 | tqdm==4.66.1 28 | urllib3==2.0.4 29 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Medium Miner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Medium Miner 2 | 3 | > A Medium Scraper That You Need! 4 | 5 | 6 | 7 | 8 | ## 🚩 Table of Contents 9 | 10 | - Why Should I Use It? 11 | - Install 12 | - How To Use 13 | - Preview 14 | 15 | ### Why Should I Use It? 16 | 1. It's Free. 17 | 2. Reading Medium Blogs Offline. 18 | 3. You Can Read On Your Favourite Markdown Reader for me it's Obsidian. 19 | 4. Easy To Use. 20 | 5. It's Free. 21 | 22 | ### Install: 23 | ```bash 24 | git clone https://github.com/xLe0x/Medium-Miner.git 25 | ``` 26 | ```bash 27 | cd Medium-Miner 28 | ``` 29 | **Install required Dependencies:** 30 | ```bash 31 | pip3 install -r requirements.txt 32 | ``` 33 | 34 | **Show Commands:** 35 | ```bash 36 | python3 medium.py -h 37 | ``` 38 | 39 | ### How To Use: 40 | 41 | **To search for a specific topic (word) on medium** 42 | ```bash 43 | python3 medium.py -w "how to lose?" -c 20 44 | ``` 45 | **To list and download all author's blogs** 46 | ```bash 47 | python3 medium.py -a ammarmosaber 48 | ``` 49 | 50 | ### Preview 51 | 52 | https://github.com/xLe0x/Medium-Miner/assets/112324284/fe885e86-d69a-4369-9b63-3ab861d3ffd7 53 | 54 | This program is licensed under the [MIT](https://github.com/xLe0x/Medium-Miner/blob/main/LICENSE.txt) © [xLe0x](https://github.com/xle0x). 55 | -------------------------------------------------------------------------------- /medium.py: -------------------------------------------------------------------------------- 1 | from functions import ( 2 | searchGoogle, 3 | SearchAuthorMedium, 4 | ) 5 | from external import bcolors 6 | import argparse,sys 7 | 8 | 9 | parser = argparse.ArgumentParser(description="Medium Miner.") 10 | 11 | word_group = parser.add_mutually_exclusive_group() 12 | word_group.add_argument( 13 | "-w", 14 | "--word", 15 | type=str, 16 | help="Search by word in google example: python3 medium.py -w xss", 17 | ) 18 | 19 | parser.add_argument( 20 | "-c", 21 | "--count", 22 | type=int, 23 | help="How many output you want? example: python3 medium.py -w idor -c 10", 24 | ) 25 | 26 | parser.add_argument( 27 | "-a", 28 | "--author", 29 | type=str, 30 | help="Search by author in medium example: python3 medium.py -a ammarmosaber", 31 | ) 32 | 33 | parser.add_argument("-v", action="store_true", help="Prints the program version") 34 | 35 | args = parser.parse_args() 36 | 37 | 38 | try: 39 | if args.word: 40 | if not args.count: 41 | args.count = 10 42 | searchGoogle(str(args.word), args.count) 43 | 44 | elif args.author: 45 | SearchAuthorMedium(args.author) 46 | 47 | elif args.v: 48 | print("1.0 Beta") 49 | 50 | else: 51 | print() 52 | parser.print_help() 53 | print() 54 | 55 | except ValueError: 56 | print(f"{bcolors.FAIL}Username is wrong or there is no blogs!{bcolors.ENDC}") 57 | except: 58 | print(f"{bcolors.FAIL}Something went wrong please try again!{bcolors.ENDC}") 59 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | import requests, markdownify, os 2 | from re import match 3 | from googlesearch import search 4 | from external import bcolors 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def save_MD_files(blog, i): 9 | print(f"{bcolors.OKCYAN}Started to download {i}!{bcolors.ENDC}") 10 | try: 11 | with open( 12 | f'{i} - {blog[int(i)]["title"]}.md', 13 | "x", 14 | ) as mdfile: 15 | mdfile.write(markdownify.markdownify(blog[int(i)]["content"])) 16 | 17 | except FileExistsError: 18 | print(f"#{i} file is already downloaded") 19 | 20 | 21 | def download_MD_files(blog, user_choice, isNumber, isComma, isAll): 22 | if isComma: 23 | if user_choice.split(",")[-1] == "": 24 | print("you suck") 25 | elif "," in user_choice: 26 | for i in user_choice.split(","): 27 | save_MD_files(blog, user_choice) 28 | 29 | if isNumber: 30 | save_MD_files(blog, user_choice) 31 | 32 | if isAll: 33 | for i, _ in enumerate(blog): 34 | save_MD_files(blog, int(i)) 35 | 36 | 37 | def getResults(url): 38 | blog_list = [] 39 | r = requests.get(url) 40 | soup = BeautifulSoup(r.content, features="xml") 41 | blogs = soup.findAll("item") 42 | for a in blogs: 43 | title = a.find("title").text 44 | link = a.find("link").text 45 | content = a.find("content:encoded") 46 | content = content.text if content is not None else "" 47 | blog = {"title": title, "link": link, "content": content} 48 | blog_list.append(blog) 49 | return blog_list 50 | 51 | 52 | def listBlogs(blogs): 53 | if blogs: 54 | print() 55 | print("Here is a list of all the blogs:") 56 | for i, blog in enumerate(blogs): 57 | if blog["content"] == "": 58 | print( 59 | f"{bcolors.BOLD}{i} - {blog['title']}{bcolors.WARNING} (Member-Only Story){bcolors.ENDC}{bcolors.ENDC}" 60 | ) 61 | else: 62 | print(f"{bcolors.BOLD}{i} - {blog['title']}{bcolors.ENDC}") 63 | 64 | else: 65 | raise ValueError() 66 | 67 | 68 | def checkUserInput(blogs): 69 | print() 70 | user_choice = input( 71 | f"{bcolors.OKGREEN}(A number,numbers (comma separated) or 'all' to downlaod): {bcolors.ENDC}" 72 | ) 73 | print() 74 | if user_choice.isnumeric(): 75 | download_MD_files(blogs, user_choice, isNumber=True, isAll=False, isComma=False) 76 | elif "," in user_choice: 77 | download_MD_files(blogs, user_choice, isComma=True, isNumber=False, isAll=False) 78 | elif user_choice == "all": 79 | download_MD_files(blogs, user_choice, isAll=True, isComma=False, isNumber=False) 80 | else: 81 | raise Exception() 82 | 83 | 84 | def checkForDir(author): 85 | if getResults(f"https://medium.com/@{author}/feed"): 86 | if not os.path.exists(f"{author}_blogs"): 87 | os.mkdir(f"{author}_blogs") 88 | 89 | os.chdir(f"{author}_blogs") 90 | 91 | 92 | def searchGoogle(userInput, count): 93 | def extractUsername(url): 94 | try: 95 | return match(r"https:\/\/(.*)\/@(.*)/", url).group(2) 96 | except: 97 | return Exception() 98 | 99 | query = f"site:medium.com {userInput}" 100 | 101 | for result in search(query, num=count, stop=count, pause=1): 102 | if "@" in result: 103 | print() 104 | print(f"{bcolors.BOLD}The Author: {extractUsername(result)} {bcolors.ENDC}") 105 | print(f"{bcolors.OKGREEN}{result} {bcolors.ENDC}") 106 | 107 | 108 | def SearchAuthorMedium(author): 109 | data = getResults(f"https://medium.com/@{author}/feed") 110 | listBlogs(data) 111 | checkForDir(author) 112 | checkUserInput(data) 113 | --------------------------------------------------------------------------------