├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── app ├── tasks.py └── worker.py ├── docker-compose.yml └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | Dockerfile 3 | README.md 4 | *.pyc 5 | *.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | .vscode 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.6 2 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1 3 | 4 | WORKDIR / 5 | COPY requirements.txt ./ 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | RUN rm requirements.txt 8 | 9 | COPY . / 10 | WORKDIR /app -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # celery-docker 2 | -------------------------------------------------------------------------------- /app/tasks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import newspaper 3 | import hashlib 4 | 5 | from io import BytesIO 6 | from urllib.parse import urlparse 7 | from celery.utils.log import get_task_logger 8 | from minio import Minio 9 | from minio.error import BucketAlreadyExists, BucketAlreadyOwnedByYou, NoSuchKey 10 | from worker import app 11 | 12 | 13 | logger = get_task_logger(__name__) 14 | 15 | 16 | @app.task(bind=True, name='refresh') 17 | def refresh(self, urls): 18 | for url in urls: 19 | fetch_source.s(url).delay() 20 | 21 | 22 | @app.task(bind=True, name='fetch_source') 23 | def fetch_source(self, url): 24 | logger.info(f'Build articles: {url}') 25 | source = newspaper.build(url) 26 | for article in source.articles: 27 | fetch_article.s(article.url).delay() 28 | 29 | 30 | @app.task(bind=True, name='fetch_article') 31 | def fetch_article(self, url): 32 | logger.info(f'Download {url}') 33 | article = newspaper.Article(url) 34 | article.download() 35 | logger.info(f'Parse {url}') 36 | article.parse() 37 | 38 | url = urlparse(article.source_url) 39 | save_article.s(url.netloc, article.title, article.text).delay() 40 | 41 | 42 | @app.task(bind=True, name='save_article', queue='minio') 43 | def save_article(self, bucket, key, text): 44 | 45 | minio_client = Minio(os.environ['MINIO_HOST'], 46 | access_key=os.environ['MINIO_ACCESS_KEY'], 47 | secret_key=os.environ['MINIO_SECRET_KEY'], 48 | secure=False) 49 | 50 | try: 51 | minio_client.make_bucket(bucket, location="us-east-1") 52 | except BucketAlreadyExists: 53 | pass 54 | except BucketAlreadyOwnedByYou: 55 | pass 56 | 57 | hexdigest = hashlib.md5(text.encode()).hexdigest() 58 | try: 59 | st = minio_client.stat_object(bucket, key) 60 | update = st.etag != hexdigest 61 | except NoSuchKey as err: 62 | update = True 63 | 64 | if update: 65 | logger.info(f'Write {bucket}/{key} to minio') 66 | stream = BytesIO(text.encode()) 67 | minio_client.put_object(bucket, key, stream, stream.getbuffer().nbytes) 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /app/worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | from celery import Celery 3 | 4 | 5 | app = Celery(include=('tasks',)) 6 | app.conf.beat_schedule = { 7 | 'refresh': { 8 | 'task': 'refresh', 9 | 'schedule': float(os.environ['NEWSPAPER_SCHEDULE']), 10 | 'args': (os.environ['NEWSPAPER_URLS'].split(','),) 11 | }, 12 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | services: 3 | worker: 4 | build: . 5 | image: &img worker 6 | command: [celery, worker, --app=worker.app, --pool=gevent, --concurrency=20, --loglevel=INFO] 7 | environment: &env 8 | - CELERY_BROKER_URL=amqp://guest:guest@rabbitmq:5672 9 | - MINIO_HOST=minio:9000 10 | - MINIO_ACCESS_KEY=AKIAIOSFODNN7EXAMPLE 11 | - MINIO_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY 12 | - NEWSPAPER_URLS=https://www.theguardian.com,https://www.nytimes.com 13 | - NEWSPAPER_SCHEDULE=300 14 | depends_on: 15 | - beat 16 | - rabbitmq 17 | restart: 'no' 18 | volumes: 19 | - ./app:/app 20 | 21 | worker-minio: 22 | build: . 23 | image: *img 24 | command: [celery, worker, --app=worker.app, --pool=gevent, --concurrency=20, --queues=minio, --loglevel=INFO] 25 | environment: *env 26 | depends_on: 27 | - beat 28 | - rabbitmq 29 | restart: 'no' 30 | volumes: 31 | - ./app:/app 32 | 33 | beat: 34 | build: . 35 | image: *img 36 | command: [celery, beat, --app=worker.app, --loglevel=INFO] 37 | environment: *env 38 | depends_on: 39 | - rabbitmq 40 | restart: 'no' 41 | volumes: 42 | - ./app:/app 43 | 44 | rabbitmq: 45 | image: rabbitmq:3.7.8 46 | 47 | minio: 48 | image: minio/minio:RELEASE.2018-11-06T01-01-02Z 49 | command: server /export 50 | environment: *env 51 | ports: 52 | - 80:9000 53 | volumes: 54 | - minio:/data 55 | 56 | volumes: 57 | minio: -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | celery==4.2.1 2 | minio==4.0.6 3 | newspaper3k==0.2.8 4 | gevent==1.3.7 --------------------------------------------------------------------------------