├── app ├── __init__.py ├── utils │ └── manga_utils.py ├── requirements.txt ├── services │ ├── consumer_processor.py │ ├── producer_processor.py │ ├── save_page_service.py │ ├── manga_service.py │ ├── muitomanga_service.py │ └── mangalivre_service.py ├── dockerfile.consumer ├── dockerfile ├── adapter │ ├── queue │ │ └── consumer.py │ └── web │ │ └── routes │ │ ├── chapter.py │ │ └── page.py └── app.py ├── .gitignore ├── files └── diagram.jpg ├── nginx.conf ├── environment └── create-queues.sh ├── LICENSE ├── app.sh ├── docker-compose.yaml ├── .github └── workflows │ └── deploy.yml └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | /mangas 3 | /app/mangas 4 | desktop.ini 5 | 6 | /volume -------------------------------------------------------------------------------- /files/diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jjeanjacques10/manga-scrapper-api/HEAD/files/diagram.jpg -------------------------------------------------------------------------------- /app/utils/manga_utils.py: -------------------------------------------------------------------------------- 1 | def get_folder_name(manga_name, chapter_number): 2 | folder_name = manga_name.lower().replace(" ", "_") 3 | print("save in folder: " + folder_name) 4 | return f"mangas/{folder_name}/{chapter_number}" 5 | -------------------------------------------------------------------------------- /app/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | requests==2.28.1 3 | boto3 4 | click 5 | distlib 6 | et-xmlfile 7 | filelock 8 | Flask 9 | Flask-Cors 10 | importlib-metadata 11 | MarkupSafe 12 | msrest 13 | oauthlib 14 | requests 15 | requests-oauthlib 16 | urllib3 -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | listen [::]:80; 4 | 5 | server_name ec2-184-72-101-57.compute-1.amazonaws.com; 6 | 7 | location / { 8 | proxy_pass http://localhost:3000/; 9 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 10 | proxy_set_header X-Forwarded-Proto $scheme; 11 | proxy_set_header X-Forwarded-Port $server_port; 12 | } 13 | } -------------------------------------------------------------------------------- /app/services/consumer_processor.py: -------------------------------------------------------------------------------- 1 | from services.manga_service import MangaService 2 | from services.save_page_service import upload_chapter_pages 3 | 4 | 5 | def process_message(event): 6 | source = event.get('source', None) 7 | manga = event.get('manga', None) 8 | chapter = str(event.get('chapter', None)) 9 | try: 10 | manga_service = MangaService() 11 | manga_service.get_chapter_from_internet(source, manga, chapter, True) 12 | upload_chapter_pages(manga, chapter) 13 | except Exception as e: 14 | print(e) 15 | -------------------------------------------------------------------------------- /app/dockerfile.consumer: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | ARG AWS_ACCESS_KEY_ID 4 | ARG AWS_SECRET_ACCESS_KEY 5 | ARG AWS_DEFAULT_REGION 6 | 7 | ENV AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 8 | ENV AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 9 | ENV AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION 10 | 11 | # Create a workdir for our app 12 | WORKDIR /usr/src/app 13 | COPY . /usr/src/app 14 | 15 | # Install dependencies 16 | RUN pip install --upgrade pip && pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt && pip install --ignore-installed six watson-developer-cloud 17 | 18 | # Run the app 19 | CMD [ "python", "./app.py", "consumer"] -------------------------------------------------------------------------------- /app/dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | ARG AWS_ACCESS_KEY_ID 4 | ARG AWS_SECRET_ACCESS_KEY 5 | ARG AWS_DEFAULT_REGION 6 | ARG API_HOST 7 | 8 | ENV AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 9 | ENV AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 10 | ENV AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION 11 | ENV API_HOST=$API_HOST 12 | 13 | # Create a workdir for our app 14 | WORKDIR /usr/src/app 15 | COPY . /usr/src/app 16 | 17 | # Install dependencies 18 | RUN pip install --upgrade pip 19 | RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt 20 | RUN pip install --ignore-installed six watson-developer-cloud 21 | 22 | # Expose port 3000 23 | EXPOSE 3000 24 | 25 | # Run the app 26 | CMD [ "python", "./app.py", "app" ] -------------------------------------------------------------------------------- /app/services/producer_processor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import boto3 4 | import json 5 | 6 | sqs = boto3.resource('sqs', region_name='us-east-1', 7 | endpoint_url=os.environ.get("SQS_ENDPOINT")) 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | 12 | def send_message(message): 13 | try: 14 | # Retrieving a queue by its name 15 | queue = sqs.get_queue_by_name(QueueName='manga-queue') 16 | 17 | # Create a new message 18 | response = queue.send_message(MessageBody=json.dumps(message)) 19 | 20 | # The response is not a resource, but gives you a message ID and MD5 21 | logging.info("MessageId created: {0}".format( 22 | response.get('MessageId'))) 23 | except Exception as e: 24 | logging.error(e) 25 | -------------------------------------------------------------------------------- /app/adapter/queue/consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import sleep 3 | import boto3 4 | import json 5 | 6 | from services.consumer_processor import process_message 7 | 8 | 9 | class Consumer: 10 | 11 | def __init__(self) -> None: 12 | self.sqs = boto3.resource('sqs', region_name='us-east-1', 13 | endpoint_url=os.environ.get("SQS_ENDPOINT")) 14 | 15 | def start(self): 16 | 17 | queue = self.sqs.get_queue_by_name(QueueName='manga-queue') 18 | 19 | print("Checking for messages...") 20 | while True: 21 | for message in queue.receive_messages(): 22 | try: 23 | print(message.body) 24 | process_message(json.loads(message.body)) 25 | except Exception as e: 26 | print(e) 27 | message.delete() 28 | # print("Waiting for messages...") 29 | sleep(5) 30 | -------------------------------------------------------------------------------- /app/adapter/web/routes/chapter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from flask import request, Blueprint 3 | from services.manga_service import MangaService 4 | 5 | 6 | chapter_bp = Blueprint('chapter_bp', __name__) 7 | 8 | """Get all pages from a chapter""" 9 | 10 | 11 | @chapter_bp.route("/chapter", methods=["GET"]) 12 | def get_all_chapter_pages(): 13 | source = request.args.get("source", None) 14 | manga = request.args.get("manga", None) 15 | number = request.args.get("number", None) 16 | 17 | logging.info(f"{source}, {manga}, {number}") 18 | 19 | if not source or not manga or not number: 20 | return {"message": "Invalid request"}, 422 21 | 22 | mangaService = MangaService() 23 | 24 | images = mangaService.get_chapter(source, manga, number) 25 | 26 | if not images: 27 | return {"message": "Chapter not found"}, 404 28 | 29 | return { 30 | "manga": manga, 31 | "chapter": number, 32 | "pages": images 33 | }, 200 34 | -------------------------------------------------------------------------------- /environment/create-queues.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | # enable debug 6 | # set -x 7 | 8 | echo "configure aws" 9 | echo "===================" 10 | 11 | aws configure set aws_access_key_id admin 12 | aws configure set aws_secret_access_key admin 13 | aws configure set region us-east-1 14 | 15 | echo "create sqs queue" 16 | echo "===================" 17 | 18 | # https://docs.aws.amazon.com/cli/latest/reference/sqs/create-queue.html 19 | create_queue() { 20 | local QUEUE_NAME_TO_CREATE=$1 21 | awslocal --endpoint-url=http://sqs:4576 sqs create-queue --queue-name ${QUEUE_NAME_TO_CREATE} --region us-east-1 --attributes VisibilityTimeout=30 22 | } 23 | 24 | create_queue "manga-queue" 25 | 26 | #aws sqs create-queue --queue-name manga-queue --endpoint-url=http://localhost:4576 27 | #aws sqs send-message --queue-url http://localhost:4576/000000000000/manga-queue --endpoint-url=http://localhost:4576 --message-body 123 28 | #aws sqs receive-message --queue-url http://localhost:4576/000000000000/manga-queue --endpoint-url=http://localhost:4576 -------------------------------------------------------------------------------- /app/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from flask import Flask 5 | from flask_cors import CORS 6 | from adapter.queue.consumer import Consumer 7 | 8 | from adapter.web.routes.page import page_bp 9 | from adapter.web.routes.chapter import chapter_bp 10 | 11 | 12 | def create_app(): 13 | app = Flask(__name__) 14 | CORS(app) 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | app.register_blueprint(page_bp) 19 | app.register_blueprint(chapter_bp) 20 | 21 | port = int(os.environ.get("PORT", 3000)) 22 | app.run(host="0.0.0.0", port=port, debug=True) 23 | 24 | 25 | def create_consumer(): 26 | consumer = Consumer() 27 | consumer.start() 28 | 29 | 30 | if __name__ == "__main__": 31 | if len(sys.argv) < 2: 32 | print("Usage: python main.py ") 33 | sys.exit(1) 34 | 35 | if sys.argv[1] == "app": 36 | create_app() 37 | elif sys.argv[1] == "consumer": 38 | create_consumer() 39 | else: 40 | print("Invalid argument:", sys.argv[1]) 41 | sys.exit(1) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jean Jacques Nascimento Barros 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /app/services/save_page_service.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import requests 4 | import logging 5 | 6 | from utils.manga_utils import get_folder_name 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | def upload_chapter_pages(manga_name, chapter_number): 13 | folder = get_folder_name(manga_name, chapter_number) 14 | 15 | # Get all images in the folder 16 | images = os.listdir(folder) 17 | 18 | for img in images: 19 | # Get the image path 20 | img_path = os.path.join(folder, img) 21 | 22 | url = "http://ec2-184-72-101-57.compute-1.amazonaws.com/page" 23 | 24 | payload = { 25 | 'source': 'manga_livre', 26 | 'manga': manga_name, 27 | 'number': chapter_number, 28 | 'page': img.split('.')[0] 29 | } 30 | files = [ 31 | ('image', ('MangaJJLogo.png', open(img_path, 'rb'), 'image/png')) 32 | ] 33 | headers = {} 34 | 35 | response = requests.request( 36 | "POST", url, headers=headers, data=payload, files=files) 37 | 38 | if response.status_code != 201: 39 | logger.error(f"Error uploading image {img} - {response.text}") 40 | 41 | logger.info(f"Chapter {chapter_number} uploaded") 42 | -------------------------------------------------------------------------------- /app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Starting to Deploy...' 4 | 5 | # Install required dependencies 6 | sudo apt-get update 7 | sudo apt-get upgrade 8 | yes | sudo apt-get install nginx 9 | yes | sudo apt install apt-transport-https ca-certificates curl software-properties-common 10 | yes | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 11 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable" 12 | apt-cache policy docker-ce 13 | yes | sudo apt install docker-ce 14 | 15 | # make sure manga-scrapper docker is not running 16 | sudo docker rm $(sudo docker stop $(sudo docker ps -a -q --filter ancestor=manga-scrapper:latest --format="{{.ID}}")) 17 | 18 | # copy nginx conf to default 19 | sudo cp nginx.conf /etc/nginx/conf.d/default.conf 20 | 21 | sudo systemctl restart nginx 22 | 23 | cd app 24 | 25 | # build dockerfile 26 | sudo docker build --build-arg AWS_ACCESS_KEY_ID=$1 \ 27 | --build-arg AWS_SECRET_ACCESS_KEY=$2 \ 28 | --build-arg AWS_DEFAULT_REGION=$3 \ 29 | --build-arg API_HOST=$4 \ 30 | -f dockerfile -t manga-scrapper:latest --no-cache . 31 | 32 | echo 'AWS default region: ' $3 33 | 34 | # run in detached mode 35 | sudo docker run -p 3000:3000 -v /home/ubuntu/mangas:/usr/src/app/mangas -d manga-scrapper:latest 36 | 37 | sleep 15 38 | 39 | PORT=3000 40 | 41 | echo 'Deployment completed successfully' -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | sqs: 4 | container_name: sqs 5 | image: localstack/localstack:latest 6 | restart: always 7 | environment: 8 | - AWS_DEFAULT_REGION=us-east-1 9 | - EDGE_PORT=4576 10 | - SERVICES=sqs 11 | - HOSTNAME=localhost 12 | - HOSTNAME_EXTERNAL=sqs 13 | - AWS_ACCESS_KEY_ID=admin 14 | - AWS_SECRET_ACCESS_KEY=admin 15 | ports: 16 | - '4576:4576' 17 | volumes: 18 | - "${LOCALSTACK_VOLUME_DIR:-./volume}:/var/lib/localstack" 19 | - "/var/run/docker.sock:/var/run/docker.sock" 20 | - ./environment/:/docker-entrypoint-initaws.d/ 21 | manga_app: 22 | container_name: manga_app 23 | build: ./app 24 | depends_on: 25 | - sqs 26 | ports: 27 | - "3000:3000" 28 | volumes: 29 | - manga_data:/usr/src/app/mangas 30 | environment: 31 | - AWS_ACCESS_KEY_ID=admin 32 | - AWS_SECRET_ACCESS_KEY=admin 33 | - AWS_DEFAULT_REGION=us-east-1 34 | - SQS_ENDPOINT=http://sqs:4576 35 | - API_HOST=http://localhost:3000 36 | manga_consumer: 37 | build: 38 | context: ./app 39 | dockerfile: dockerfile.consumer 40 | depends_on: 41 | - sqs 42 | - manga_app 43 | volumes: 44 | - manga_data:/usr/src/app/mangas 45 | environment: 46 | - AWS_ACCESS_KEY_ID=admin 47 | - AWS_SECRET_ACCESS_KEY=admin 48 | - AWS_DEFAULT_REGION=us-east-1 49 | - SQS_ENDPOINT=http://sqs:4576 50 | 51 | volumes: 52 | manga_data: 53 | -------------------------------------------------------------------------------- /app/services/manga_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | from services.producer_processor import send_message 3 | 4 | from services.mangalivre_service import get_manga_from_mangalivre 5 | from services.muitomanga_service import get_manga_from_muitomanga 6 | from utils.manga_utils import get_folder_name 7 | 8 | HOST_API = os.environ.get("API_HOST", "http://localhost:3000") 9 | 10 | 11 | class MangaService: 12 | 13 | def __init__(self) -> None: 14 | pass 15 | 16 | def get_chapter(self, source: str, manga: str, chapter: str): 17 | folder = get_folder_name(manga, chapter) 18 | if not os.path.exists(folder): 19 | send_message({ 20 | "source": source, 21 | "manga": manga, 22 | "chapter": chapter 23 | }) 24 | return self.get_chapter_from_internet(source, manga, chapter, False) 25 | 26 | images = os.listdir(folder) 27 | 28 | for i, img in enumerate(images): 29 | images[ 30 | i] = f"{HOST_API}/page?source={source}&manga={manga}&number={chapter}&page={img.split('.')[0]}" 31 | return self.order_pages(images) 32 | 33 | def get_chapter_from_internet(self, source: str, manga: str, chapter: str, download_pages: bool): 34 | if source == "manga_livre": 35 | print(f"Mangalivre - {manga}") 36 | return get_manga_from_mangalivre(manga, chapter, download_pages) 37 | elif source == "muito_manga": 38 | print(f"Muitomanga - {manga}") 39 | return get_manga_from_muitomanga(manga, chapter, download_pages) 40 | else: 41 | raise Exception("Invalid option") 42 | 43 | def order_pages(self, pages: list): 44 | pages.sort(key=lambda x: int(x.split("=").pop().split("_")[0])) 45 | return pages 46 | -------------------------------------------------------------------------------- /app/adapter/web/routes/page.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from flask import request, send_file, Blueprint 4 | from utils.manga_utils import get_folder_name 5 | 6 | page_bp = Blueprint('page_bp', __name__) 7 | 8 | """Save a image to the folder""" 9 | 10 | 11 | @page_bp.route("/page", methods=["POST"]) 12 | def save_page(): 13 | source = request.form.get("source", None) 14 | manga = request.form.get("manga", None) 15 | number = request.form.get("number", None) 16 | page = request.form.get("page", "1") 17 | 18 | # remove characters that are not numbers from page 19 | page = ''.join(filter(str.isdigit, page)) 20 | 21 | logging.info(f"{source}, {manga}, {number}, {page}") 22 | 23 | if not source or not manga or not number: 24 | return {"message": "Invalid request"}, 422 25 | 26 | image = request.files["image"] 27 | 28 | folder = get_folder_name(manga, number) 29 | if not os.path.exists(folder): 30 | os.makedirs(folder) 31 | 32 | image.save(os.path.join( 33 | folder, f"{page}.{'png' if source == 'manga_livre' else 'jpg'}")) 34 | return { 35 | "message": "Image saved" 36 | }, 201 37 | 38 | 39 | """Get a image from the folder""" 40 | 41 | 42 | @page_bp.route("/page", methods=["GET"]) 43 | def get_page(): 44 | source = request.args.get("source", None) 45 | manga = request.args.get("manga", None) 46 | number = request.args.get("number", None) 47 | page = request.args.get("page", "1") 48 | 49 | logging.info(f"{source}, {manga}, {number}, {page}") 50 | 51 | if not source or not manga or not number: 52 | return {"message": "Invalid request"}, 422 53 | 54 | folder = get_folder_name(manga, number) 55 | if not os.path.exists(folder): 56 | return {"message": "Page not found"}, 404 57 | 58 | # try png first then jpg 59 | try: 60 | image = open(os.path.join( 61 | folder, f"{page}.{'png' if source == 'manga_livre' else 'jpg'}"), "rb") 62 | except FileNotFoundError: 63 | image = open(os.path.join( 64 | folder, f"{page}.{'jpg' if source == 'manga_livre' else 'png'}"), "rb") 65 | 66 | return send_file(image, mimetype='image/jpeg'), 200 67 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy AWS EC2 2 | on: 3 | push: 4 | branches: 5 | - deploy-ec2 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Get Github action IP 11 | id: ip 12 | uses: haythem/public-ip@v1.2 13 | 14 | - name: AWS configure 15 | uses: aws-actions/configure-aws-credentials@v1 16 | with: 17 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 18 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 19 | aws-region: ${{ secrets.AWS_DEFAULT_REGION }} 20 | 21 | - name: Add github actions IP to security group 22 | env: 23 | AWS_SECURITY_GROUP: ${{ secrets.AWS_SECURITY_GROUP }} 24 | run: | 25 | aws ec2 authorize-security-group-ingress --group-id ${{ env.AWS_SECURITY_GROUP }} --protocol tcp --port 22 --cidr ${{ steps.ip.outputs.ipv4 }}/16 26 | sleep 10 27 | 28 | - name: Create Private Key 29 | env: 30 | PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} 31 | run: 'echo "${{ secrets.SSH_PRIVATE_KEY }}" > private_key.pem && chmod 600 private_key.pem' 32 | 33 | - name: Deploy on AWS EC2 34 | env: 35 | HOSTNAME: ${{ secrets.HOSTNAME }} 36 | USER_NAME: ${{ secrets.USERNAME }} 37 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 38 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 39 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 40 | AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} 41 | API_HOST: ${{ secrets.API_HOST }} 42 | run: | 43 | ssh -o StrictHostKeyChecking=no -i private_key.pem ${USER_NAME}@${HOSTNAME} ' 44 | rm -rf /home/ubuntu/manga-scrapper-api 45 | git clone https://github.com/jjeanjacques10/manga-scrapper-api.git 46 | cd manga-scrapper-api 47 | git checkout main 48 | git pull origin main 49 | sudo chmod +x ./app.sh 50 | ./app.sh ${{ secrets.AWS_ACCESS_KEY_ID }} ${{ secrets.AWS_SECRET_ACCESS_KEY }} ${{ secrets.AWS_DEFAULT_REGION }} ${{ secrets.API_HOST }} 51 | ' 52 | rm -rf private_key.pem 53 | 54 | - name: Remove github actions IP from security group 55 | env: 56 | AWS_SECURITY_GROUP: ${{ secrets.AWS_SECURITY_GROUP }} 57 | run: | 58 | aws ec2 revoke-security-group-ingress --group-id ${{ env.AWS_SECURITY_GROUP }} --protocol tcp --port 22 --cidr ${{ steps.ip.outputs.ipv4 }}/16 59 | if: always() 60 | -------------------------------------------------------------------------------- /app/services/muitomanga_service.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import multiprocessing 3 | from multiprocessing import Process 4 | import os 5 | 6 | import requests 7 | from bs4 import BeautifulSoup 8 | 9 | from utils.manga_utils import get_folder_name 10 | 11 | logger = logging.getLogger() 12 | logger.setLevel(logging.INFO) 13 | 14 | 15 | def search_manga(manga_name): 16 | url = f"https://muitomanga.com/buscar?q={manga_name}" 17 | 18 | headers = { 19 | 'Cookie': 'PHPSESSID=pt9g3ip57qeearso69ocvvh2lv' 20 | } 21 | 22 | response = requests.get(url, headers=headers, data={}) 23 | 24 | soup = BeautifulSoup(response.text, 'html.parser') 25 | mangas = [] 26 | i = 0 27 | for manga in soup.find_all("div", {"class": "anime"}): 28 | i = i + 1 29 | mangas.append([ 30 | i, 31 | manga.find("a").get("href").split("/")[-1] 32 | ]) 33 | return mangas 34 | 35 | 36 | def get_chapter(manga_name, chapter_number): 37 | manager = multiprocessing.Manager() 38 | return_dict = manager.dict() 39 | 40 | # Try to get 30 pages (max) 41 | pages = [] 42 | for page in range(1, 30): 43 | p = Process(target=download_page, args=( 44 | manga_name, chapter_number, page, return_dict)) 45 | pages.append(p) 46 | p.start() 47 | 48 | for page in pages: 49 | page.join() 50 | 51 | return return_dict.values() 52 | 53 | 54 | def download_page(manga_name, chapter_number, page): 55 | url = f"https://imgs.muitomanga.com/imgs/{manga_name}/{chapter_number}/{page}.jpg" 56 | response = requests.get(url) 57 | 58 | # Create folder if not exists 59 | folder = get_folder_name(manga_name, chapter_number) 60 | if not os.path.exists(folder): 61 | os.makedirs(folder) 62 | 63 | if response.status_code == 200: 64 | print(f"Downloading {manga_name} {chapter_number} page {page}") 65 | with open(f"{folder}/{''.join(filter(str.isdigit, page))}.jpg", "wb") as f: 66 | f.write(response.content) 67 | return url 68 | else: 69 | print(f"Page {page} not found") 70 | return 71 | 72 | 73 | def get_manga_from_muitomanga(name, chapter, save_pages=False): 74 | mangas = search_manga(name.replace(" ", "+")) 75 | 76 | if len(mangas) == 0: 77 | logger.error("Manga não encontrado") 78 | raise Exception("Manga não encontrado") 79 | 80 | id_manga = mangas[0][0] 81 | logger.info(f"Manga encontrado: {id_manga}") 82 | manga_name = mangas[id_manga][1] 83 | logger.info(f"Nome do manga: {manga_name}") 84 | 85 | logger.info(f"Downloading {manga_name} chapter {chapter}") 86 | get_chapter(manga_name, chapter) 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Manga Page Scrapper 2 | 3 |

4 | 5 |
6 |
7 | Report Bug 8 | · 9 | Request Feature 10 |

11 | 12 | This is a simple script to scrape manga pages from a websites and save them to a folder on AWS EC2 instance. There is an api and a consumer, the api is a Flask app that takes a chapter from a manga and if not already scraped, send a message to the consumer SQS to scrape the pages. 13 | 14 | ## SQS Queue 15 | 16 | SQS Message 17 | 18 | ``` json 19 | { 20 | "source": "manga_livre", 21 | "manga": "Naruto", 22 | "chapter": "692" 23 | } 24 | ``` 25 | 26 | ## Endpoints 27 | 28 | - Get a single chapter page 29 | 30 | `GET /page` 31 | 32 | | Query Param | Type | Description | 33 | | :---------- | :--------- | :---------------------------------- | 34 | | `source` | `string` | **Required**. manga_livre or muito_manga | 35 | | `manga` | `string` | **Required**. manga name | 36 | | `number` | `string` | **Required**. chapter number | 37 | | `page` | `string` | **Required**. page number | 38 | 39 | - Save a single chapter page on EBS 40 | 41 | `POST /page` 42 | 43 | | Form | Type | Description | 44 | | :---------- | :--------- | :---------------------------------- | 45 | | `source` | `string` | **Required**. manga_livre or muito_manga | 46 | | `manga` | `string` | **Required**. manga name | 47 | | `number` | `string` | **Required**. chapter number | 48 | | `page` | `string` | **Required**. number of pages | 49 | | `image` | `file` | **Required**. image file | 50 | 51 | - Get a chapter 52 | 53 | `GET /chapter` 54 | 55 | | Query Param | Type | Description | 56 | | :---------- | :--------- | :---------------------------------- | 57 | | `source` | `string` | **Required**. manga_livre or muito_manga | 58 | | `manga` | `string` | **Required**. manga name | 59 | | `number` | `string` | **Required**. chapter number | 60 | 61 | ## Sites Supported 62 | 63 | - [Manga Livre](https://mangalivre.net/) 64 | - [Muito Manga](https://muitomanga.com/) 65 | 66 | ## Architecture 67 | 68 | 69 | 70 | ## GitHub Actions 71 | 72 | - Variables to be set in the repository secrets 73 | 74 | ``` bash 75 | AWS_ACCESS_KEY_ID= 76 | AWS_SECRET_ACCESS_KEY= 77 | AWS_DEFAULT_REGION= 78 | AWS_SECURITY_GROUP = 79 | SSH_PRIVATE_KEY= 80 | HOSTNAME= 81 | USERNAME= 82 | ``` 83 | 84 | - Workflow to deploy to EC2 instance 85 | 86 | > [.github/workflows/deploy.yml](.github/workflows/deploy.yml) 87 | 88 | - Script to config the EC2 instance, install docker, update nginx and run the container 89 | 90 | > [app.sh](app.sh) 91 | 92 | ## Run Locally 93 | 94 | Use docker-compose to run both the api and the consumer 95 | 96 | ``` bash 97 | docker-compose up --build --scale manga_consumer=10 -d 98 | ``` 99 | 100 | ```--scale manga_consumer=10``` will run 10 consumers in parallel 101 | 102 | ## Licença 103 | 104 | [MIT](https://choosealicense.com/licenses/mit/) 105 | 106 | ## ⚠ Atention ⚠ 107 | 108 | This project is for study purposes only, I do not encourage piracy. If you like the manga, buy it. If you want to read it for free, go to the official website. I am not responsible for any misuse of this project. 109 | 110 | --- 111 | Developed by [Jean Jacques Barros](https://github.com/jeanjacques10) 112 | -------------------------------------------------------------------------------- /app/services/mangalivre_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import requests 5 | 6 | from utils.manga_utils import get_folder_name 7 | 8 | headers = { 9 | 'authority': 'mangalivre.net', 10 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 11 | 'accept-language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7', 12 | 'cache-control': 'max-age=0', 13 | 'cookie': '_ga=e88959f7-3144-4ae1-af76-cc99b0df6e80; __cf_bm=9SAugMSkcEA8uS.y5FOZY3v.0aNn9ixEZPjsM2iSsP0-1665619463-0-Aa+7SXgc7e2OpVQtzIm4aZ1YSXqEz3CXssnfWO+zBj1yG+g21hUjq8u++ogPiGr0NsHH7kKPUHHmQ8ezanE581oEECXcKvZJjK3vx/FgdoQ/zjLBoyEmVvFfKu+rX16jiw==; cf_use_ob=0', 14 | 'referer': 'https://mangalivre.net/ler/naruto/online/70908/700-09', 15 | 'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"', 16 | 'sec-ch-ua-mobile': '?0', 17 | 'sec-ch-ua-platform': '"Linux"', 18 | 'sec-fetch-dest': 'document', 19 | 'sec-fetch-mode': 'navigate', 20 | 'sec-fetch-site': 'same-origin', 21 | 'sec-fetch-user': '?1', 22 | 'upgrade-insecure-requests': '1', 23 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.42' 24 | } 25 | 26 | 27 | def search_manga(name): 28 | url = "https://mangalivre.net/lib/search/series.json" 29 | 30 | payload = f"search={name}" 31 | headers = { 32 | 'authority': 'mangalivre.net', 33 | 'accept': 'application/json, text/javascript, */*; q=0.01', 34 | 'accept-language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7', 35 | 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 36 | 'cookie': '_ga=e88959f7-3144-4ae1-af76-cc99b0df6e80; __cf_bm=5KDbO1NiKCprQ4ALceOr5UTG7B1n2U4tZ.19Fskp1os-1665617107-0-AYqASKEgB0jSGmmoLUnqambiOQwd4gxR17TEWSoslp2twJ6U1oFj+5PRkZkwGmCyJ+LI61LAFCiQKnu0z9k0XsP3iTmSsVrhSaMLbLFJfS/vjg5l3nqZtNTuNydyN1ceLA==; cf_use_ob=0', 37 | 'origin': 'https://mangalivre.net', 38 | 'referer': 'https://mangalivre.net/lista-de-mangas/ordenar-por-numero-de-leituras/todos/desde-o-comeco', 39 | 'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"', 40 | 'sec-ch-ua-mobile': '?0', 41 | 'sec-ch-ua-platform': '"Linux"', 42 | 'sec-fetch-dest': 'empty', 43 | 'sec-fetch-mode': 'cors', 44 | 'sec-fetch-site': 'same-origin', 45 | 'Access-Control-Allow-Origin': '*', 46 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.42', 47 | 'x-requested-with': 'XMLHttpRequest' 48 | } 49 | 50 | response = requests.post(url, headers=headers, data=payload) 51 | 52 | return response.json().get("series") 53 | 54 | 55 | def get_chapter(id_serie, number_chapter, page=1): 56 | url = f"https://mangalivre.net/series/chapters_list.json?page={page}&id_serie={id_serie}" 57 | print(url) 58 | headers = { 59 | 'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"', 60 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 61 | 'Referer': 'https://mangalivre.net/manga/naruto/1', 62 | 'X-Requested-With': 'XMLHttpRequest', 63 | 'sec-ch-ua-mobile': '?0', 64 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.42', 65 | 'sec-ch-ua-platform': '"Linux"' 66 | } 67 | 68 | try: 69 | response = requests.request("GET", url, headers=headers, data={}) 70 | 71 | if response.status_code == 200: 72 | for chapter in response.json().get('chapters'): 73 | if chapter.get('number') == number_chapter: 74 | release_scan = list(chapter.get("releases").keys())[0] 75 | return chapter.get("releases").get(release_scan).get("id_release"), chapter.get("releases").get( 76 | release_scan).get("link") 77 | return get_chapter(id_serie, number_chapter, page + 1) 78 | else: 79 | print( 80 | f"Capítulo {number_chapter} não encontrado - " + response.text) 81 | except Exception as e: 82 | print(f"Erro para encontrar capítulo {number_chapter} - {e}") 83 | 84 | 85 | def get_key(link): 86 | url = f"https://mangalivre.net{link}" 87 | response = requests.get(url, headers=headers, data={}) 88 | key_trash = re.findall(r'window\.READER_TOKEN = \'(.+)\';', response.text) 89 | key = key_trash[0] 90 | return key 91 | 92 | 93 | def get_page(id_release, key): 94 | url = f"https://mangalivre.net/leitor/pages/{id_release}.json?key={key}" 95 | response = requests.get(url, headers=headers, data={}) 96 | pages = [] 97 | for page in response.json().get("images"): 98 | pages.append(page.get("legacy")) 99 | return pages 100 | 101 | 102 | def save_chapter_pages(manga_name, chapter_number, pages): 103 | # Create folder if not exists 104 | folder = get_folder_name(manga_name, chapter_number) 105 | if not os.path.exists(folder): 106 | os.makedirs(folder) 107 | 108 | for page_number, page in enumerate(pages): 109 | print(f"Downloading {page}") 110 | response = requests.get(page, headers=headers, data={}) 111 | if response.status_code == 200: 112 | with open(f"{folder}/{page_number}.jpg", 'wb') as f: 113 | f.write(response.content) 114 | else: 115 | print(f"Erro ao baixar página {page} - " + response.text) 116 | 117 | 118 | def get_manga_from_mangalivre(name, chapter, download_pages=False): 119 | print(f"Searching Manga {name}") 120 | mangas = search_manga(name) 121 | if not mangas: 122 | print("Manga não encontrado") 123 | raise Exception("Manga não encontrado") 124 | 125 | id_serie = mangas.pop(0).get("id_serie") 126 | print(f"ID da série: {id_serie}") 127 | 128 | id_release, link = get_chapter(id_serie, chapter) 129 | print(f"ID da release: {id_release} - Link: {link}") 130 | 131 | key = get_key(link) 132 | 133 | pages = get_page(id_release, key) 134 | if download_pages: 135 | save_chapter_pages(name, chapter, pages) 136 | else: 137 | return pages 138 | --------------------------------------------------------------------------------