├── .gitignore
├── README.md
├── mediumScraping.py
├── mediumscraper
├── .gitignore
├── db.sqlite3
├── home
│ ├── __init__.py
│ ├── admin.py
│ ├── apps.py
│ ├── forms.py
│ ├── migrations
│ │ └── __init__.py
│ ├── models.py
│ ├── tests.py
│ ├── urls.py
│ └── views.py
├── manage.py
├── mediumscrapper
│ ├── __init__.py
│ ├── settings.py
│ ├── urls.py
│ └── wsgi.py
└── templates
│ └── home
│ └── index.html
├── myscrapy.png
├── read.mp3
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | ./mediumscraper/env
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Medium Scraper
2 |
3 | **NOTE: This project is developed for educational purposes only and does not try to imitate any legitimate application.**
4 |
5 | This project scrapes the content of any Medium article and also allows the user to get an audio version of that article.
6 | The user has to provide the link to the Medium article in the user interface and submit it. If the user wants to generate an *mp3* file
7 | of that article, they should run the `mediumScraping.py` file after changing the URL inside the script.
8 |
9 | ## How to run
10 | Open terminal/cmd and run the following commands :
11 |
12 | ```git
13 | git clone https://github.com/globefire/ArticleScraping.git
14 | cd ArticleScraping
15 | pip install -r requirements.txt
16 | ```
17 |
18 | To run the Python script, run this command ([change the link inside the file](https://github.com/globefire/ArticleScraping/blob/76ddf728769d562b8fbe2e7eadd02e5495bd20c5/mediumScraping.py#L6)):
19 |
20 | ```git
21 | python mediumScraping.py
22 | ```
23 |
24 | To launch the web interface run the following commands
25 |
26 | ```git
27 | cd mediumscraper
28 | python manage.py runserver
29 | ```
30 |
31 | ## UI
32 |
33 |
34 |
35 | ## Demo
36 | [Press Me :)](https://www.youtube.com/watch?v=GGjnNVrm7rw&feature=youtu.be)
37 |
38 |
39 | ### Support Me
40 | If you like this repository, please leave a star on it. It motivates me to contribute more to open-source projects like this in the future.
41 | ### Happy Coding =)
42 |
--------------------------------------------------------------------------------
/mediumScraping.py:
--------------------------------------------------------------------------------
1 | import requests as req
2 | import bs4
3 | import warnings
4 | warnings.simplefilter('ignore')
5 |
# Link of the page to scrape.
ARTICLE_URL = 'https://web.whatsapp.com/'  # your link here

# Text of the scraped article; stays empty when nothing could be extracted.
content = ''

try:
    # A timeout keeps the script from waiting forever on an unresponsive host.
    res = req.get(ARTICLE_URL, timeout=10)
except req.exceptions.RequestException as exc:
    print("Could not fetch the page:", exc)
else:
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    articles = soup.select('article')
    if articles:
        # Concatenate the text of every <article> element on the page.
        content = ''.join(article.getText() for article in articles)
        print(content)
    else:
        print("This is not a medium article link..!")
17 |
18 | ###Audio file generating###
19 | # from gtts import gTTS
20 | # tts = gTTS(text = content, lang = 'en')
21 | # tts.save("read.mp3")
22 |
23 | '''
24 | use speechsynthesis of js to implement voice in web:
25 | https://devhints.io/js-speech,
26 | https://responsivevoice.org/,
27 | https://developer.mozilla.org/en-US/docs/Web/API/SpeechSynthesis
28 | this code should be called on button click
29 | Stop the web synthesis when new link is added****
30 | '''
31 |
32 | '''
33 | CSS improvising links
34 | https://stackoverflow.com/questions/10870564/how-to-exclude-a-class-with-all-children-in-style-definition
35 | https://stackoverflow.com/questions/17599035/django-how-can-i-call-a-view-function-from-template/19761466#19761466
36 | https://learn.shayhowe.com/advanced-html-css/complex-selectors/
37 | https://stackoverflow.com/questions/15603957/css-select-all-child-elements-except-first-two-and-last-two
38 | '''
39 |
--------------------------------------------------------------------------------
/mediumscraper/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | env
3 | *__pycache__
4 |
--------------------------------------------------------------------------------
/mediumscraper/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudonitin/MediumScraper/9da300dccc04f2ca24c7e6aa4e43a8f15f8e29eb/mediumscraper/db.sqlite3
--------------------------------------------------------------------------------
/mediumscraper/home/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudonitin/MediumScraper/9da300dccc04f2ca24c7e6aa4e43a8f15f8e29eb/mediumscraper/home/__init__.py
--------------------------------------------------------------------------------
/mediumscraper/home/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/mediumscraper/home/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
class HomeConfig(AppConfig):
    """Application configuration for the ``home`` app."""

    # Python path of the application this configuration applies to.
    name = 'home'
6 |
--------------------------------------------------------------------------------
/mediumscraper/home/forms.py:
--------------------------------------------------------------------------------
1 | from django import forms
2 |
class NameForm(forms.Form):
    """Form with a single field holding the link of the article to scrape."""

    # URLField rejects input that is not a well-formed URL during validation,
    # so malformed links are reported as form errors. The plain text input
    # widget and its CSS classes are kept for the existing page styling.
    urlink = forms.URLField(
        label="Article link",
        widget=forms.TextInput(
            attrs={
                'class': 'form-control form-control-sm type-txt',
            },
        ),
    )
9 |
--------------------------------------------------------------------------------
/mediumscraper/home/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudonitin/MediumScraper/9da300dccc04f2ca24c7e6aa4e43a8f15f8e29eb/mediumscraper/home/migrations/__init__.py
--------------------------------------------------------------------------------
/mediumscraper/home/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
--------------------------------------------------------------------------------
/mediumscraper/home/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/mediumscraper/home/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 | from . import views
3 |
# Route table of the ``home`` app: the empty path is served by ``get_link``.
urlpatterns = [
    path('', views.get_link, name='get_link'),
]
--------------------------------------------------------------------------------
/mediumscraper/home/views.py:
--------------------------------------------------------------------------------
1 | from django.http import HttpResponseRedirect
2 | from django.shortcuts import render, redirect
3 |
4 | from .forms import NameForm
5 |
6 | import requests as req
7 | import bs4
8 | import warnings
9 | warnings.simplefilter('ignore')
10 |
def get_link(request):
    """Render the home page and, on POST, the text of the submitted article.

    GET requests show an empty form together with a prompt. POST requests
    validate the submitted link, download the page and display the
    concatenated text of its ``<article>`` elements. When the link is
    invalid, the page cannot be fetched, or it contains no ``<article>``
    element, an explanatory message is displayed instead.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``home/index.html`` response with ``form`` and ``text``
        in its context.
    """
    template = 'home/index.html'

    if request.method != 'POST':
        return render(request, template,
                      {'form': NameForm(), 'text': "Please Enter a link"})

    form = NameForm(request.POST)
    if not form.is_valid():
        # Keep the bound form, which carries the validation errors.
        return render(request, template,
                      {'form': form,
                       'text': "Please enter a valid article link."})

    # NOTE(review): the server downloads a user-supplied URL, which lets a
    # visitor make it contact internal hosts (SSRF). Consider restricting the
    # allowed hosts before deploying this publicly.
    try:
        # Without a timeout an unresponsive host would block this worker.
        res = req.get(form.cleaned_data['urlink'], timeout=10)
    except req.exceptions.RequestException:
        return render(request, template,
                      {'form': form,
                       'text': "We could not fetch this page. Please check the link and try again."})

    articles = bs4.BeautifulSoup(res.text, 'lxml').select('article')
    if not articles:
        return render(request, template,
                      {'form': form,
                       'text': "We could not find an article in this page. Please make sure this is a medium article link."})

    content = ''.join(article.getText() for article in articles)
    # A fresh, unbound form is returned after a successful scrape.
    return render(request, template, {'form': NameForm(), 'text': content})
38 |
--------------------------------------------------------------------------------
/mediumscraper/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
def main():
    """Run Django's command-line utility with this project's settings."""
    # Only applies when the environment has not already chosen a settings module.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mediumscrapper.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()
16 |
--------------------------------------------------------------------------------
/mediumscraper/mediumscrapper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudonitin/MediumScraper/9da300dccc04f2ca24c7e6aa4e43a8f15f8e29eb/mediumscraper/mediumscrapper/__init__.py
--------------------------------------------------------------------------------
/mediumscraper/mediumscrapper/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for mediumscrapper project.
3 |
4 | Generated by 'django-admin startproject' using Django 2.0.3.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.0/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/2.0/ref/settings/
11 | """
12 |
13 | import os
14 |
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# Set DJANGO_SECRET_KEY in the environment for any real deployment; the
# fallback below is only meant for local development.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    '-yhh1ophs^z9^!=#p1*+21yc&6yzj0%b4=t9(v7l%(o#99$jey',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Set DJANGO_DEBUG to "false" (or "0") to turn debug mode off.
DEBUG = os.environ.get('DJANGO_DEBUG', 'True').strip().lower() in (
    '1', 'true', 'yes', 'on',
)

# Comma-separated host names in DJANGO_ALLOWED_HOSTS; every host is allowed
# when the variable is not set.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '*').split(',')
    if host.strip()
]


# Application definition

INSTALLED_APPS = [
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'mediumscrapper.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [os.path.join(BASE_DIR, "templates")],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'mediumscrapper.wsgi.application'


# Database
# https://docs.djangoproject.com/en/2.0/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}


# Password validation
# https://docs.djangoproject.com/en/2.0/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/2.0/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.0/howto/static-files/

STATIC_URL = '/static/'

# Directory where collectstatic gathers the files for deployment.
STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles')

# Extra places for collectstatic to find static files.
STATICFILES_DIRS = (
    os.path.join(BASE_DIR, 'static'),
)
135 |
--------------------------------------------------------------------------------
/mediumscraper/mediumscrapper/urls.py:
--------------------------------------------------------------------------------
1 |
2 | from django.urls import include, path
3 |
# Project-level routes: every path is delegated to the ``home`` app's URLconf.
urlpatterns = [
    path('', include('home.urls')),
]
7 |
--------------------------------------------------------------------------------
/mediumscraper/mediumscrapper/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for mediumscrapper project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.0/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
# Fall back to the project settings unless the environment already names a
# settings module.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mediumscrapper.settings")

# Module-level WSGI callable.
application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/mediumscraper/templates/home/index.html:
--------------------------------------------------------------------------------
1 |