├── myproject ├── __init__.py ├── core │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── models.py │ ├── admin.py │ ├── apps.py │ ├── views.py │ ├── crawler.py │ └── tests.py ├── urls.py ├── wsgi.py └── settings.py ├── requirements.txt ├── contrib └── secret_gen.py ├── manage.py ├── crawler.py ├── README.md └── .gitignore /myproject/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /myproject/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /myproject/core/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /myproject/core/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /myproject/core/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dj-database-url==0.4.1 2 | Django==1.10.4 3 | python-decouple==3.0 4 | requests==2.12.3 5 | -------------------------------------------------------------------------------- /myproject/core/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class CoreConfig(AppConfig): 5 | name = 'core' 6 | -------------------------------------------------------------------------------- /myproject/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import url 2 | from django.contrib import admin 3 | from myproject.core.views import counter_word 4 | 5 | urlpatterns = [ 6 | url(r'^api/$', counter_word, name='counter_word'), 7 | url(r'^admin/', admin.site.urls), 8 | ] 9 | -------------------------------------------------------------------------------- /myproject/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for myproject project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproject.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /contrib/secret_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Django SECRET_KEY generator. 5 | """ 6 | from django.utils.crypto import get_random_string 7 | 8 | 9 | chars = 'abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*(-_=+)' 10 | 11 | CONFIG_STRING = """ 12 | DEBUG=True 13 | SECRET_KEY=%s 14 | ALLOWED_HOSTS=127.0.0.1, .localhost 15 | """.strip() % get_random_string(50, chars) 16 | 17 | # Writing our configuration file to '.env' 18 | with open('.env', 'w') as configfile: 19 | configfile.write(CONFIG_STRING) 20 | -------------------------------------------------------------------------------- /myproject/core/views.py: -------------------------------------------------------------------------------- 1 | from django.http import HttpResponse, JsonResponse 2 | from .crawler import find_word_occurrences 3 | 4 | 5 | def counter_word(request): 6 | ''' 7 | url is the parameter to url. 8 | w is the parameter to word. 9 | Type: localhost:8000/api/?url=python.org&w=Python 10 | ''' 11 | try: 12 | url = request.GET['url'] 13 | except KeyError: 14 | url = '' 15 | try: 16 | word = request.GET['w'] 17 | except KeyError: 18 | word = '' 19 | if url is not None and word is not None: 20 | res = find_word_occurrences(url, word) 21 | return HttpResponse(JsonResponse(res)) 22 | -------------------------------------------------------------------------------- /myproject/core/crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | 4 | def contains_http(url): 5 | ''' Checks if url contains http or https. ''' 6 | STARTS_WITH_HTTP_OR_HTTPS = re.compile(r'(https?:\/\/[^\s]+)') 7 | return 'http://' + url if not STARTS_WITH_HTTP_OR_HTTPS.match(url) else url 8 | 9 | def get_text_content(url): 10 | """Retrieves the page text content given a URL.""" 11 | try: 12 | url = contains_http(url) 13 | return requests.get(url).text 14 | except Exception as e: 15 | print('Erro na conexão', e) 16 | return None 17 | 18 | 19 | def find_word_occurrences(url, word=''): 20 | """Find word occurrences based on the content retrieved.""" 21 | occurrences = {} 22 | occurrences[word] = get_text_content(url).count(word) 23 | return occurrences 24 | 25 | 26 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproject.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /crawler.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import requests 4 | 5 | parser = argparse.ArgumentParser(description='Crawler to count word in site.') 6 | parser.add_argument('url', help='url of the site') 7 | parser.add_argument('word', help='word to count in the site') 8 | args = parser.parse_args() 9 | 10 | pat_is_http = re.compile(r'(https?:\/\/[^\s]+)') 11 | dic = {} 12 | 13 | 14 | def url_contain_http(url, pat_is_http): 15 | ''' Checks if url contains http ''' 16 | if not pat_is_http.match(url): 17 | url = 'http://' + url 18 | return url 19 | 20 | 21 | def crawler(url): 22 | try: 23 | url = url_contain_http(args.url, pat_is_http) 24 | return requests.get(url).text 25 | except Exception as e: 26 | print('Erro na conexão', e) 27 | return None 28 | 29 | 30 | def quantity_word(word=''): 31 | ''' Count words in text returned ''' 32 | res = crawler(args.url) 33 | dic[word] = res.count(word) 34 | return dic 35 | 36 | 37 | if __name__ == '__main__': 38 | print(quantity_word(args.word)) 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Api Crawler Challenge 2 | 3 | ## Objetivo 4 | 5 | Desenvolver uma API que receba dois parametros: 6 | 7 | 1 - Uma url de um site qualquer 8 | 9 | 2 - Uma palavra qualquer 10 | 11 | A API deve ser capaz de fazer um crawler no site informado e retornar uma resposta contendo um json com a quantidade de ocorrências da palavra informada. 12 | 13 | ## Versão 14 | 15 | * Python 3.5.0 16 | * Django==1.10.4 17 | 18 | 19 | ## Instalação 20 | 21 | ```bash 22 | git clone https://github.com/rg3915/api_crawler_challenge.git 23 | cd api_crawler_challenge 24 | python -m venv .venv 25 | source .venv/bin/activate 26 | pip install -r requirements.txt 27 | python contrib/secret_gen.py 28 | ``` 29 | 30 | ## Crawler 31 | 32 | Temos um primeiro exemplo simples feito com Python puro. No terminal digite 33 | 34 | ```bash 35 | python crawler.py -h 36 | python crawler.py python.org Python 37 | ``` 38 | 39 | ## Api Rest 40 | 41 | ```bash 42 | python manage.py runserver 43 | ``` 44 | 45 | No Browser digite, por exemplo 46 | 47 | ``` 48 | localhost:8000/api/?url=python.org&w=Python 49 | ``` 50 | 51 | *pontotel* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | .venv/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | *.sqlite3 93 | *.DS_Store 94 | -------------------------------------------------------------------------------- /myproject/core/tests.py: -------------------------------------------------------------------------------- 1 | 2 | from django.test import TestCase 3 | from myproject.core.crawler import contains_http, get_text_content 4 | 5 | 6 | class APIGetTest(TestCase): 7 | """Tests for API GET Method.""" 8 | 9 | def setUp(self): 10 | self.res = self.client.get('/api/?url=python.org&w=Python') 11 | 12 | def test_get(self): 13 | """Must return status code 200.""" 14 | self.assertEqual(200, self.res.status_code) 15 | 16 | def test_content(self): 17 | """Response must contain word chosen (Python) and its number of word occurrences (98).""" 18 | CONTENTS = ('Python', '98') 19 | 20 | for content in CONTENTS: 21 | with self.subTest(): 22 | self.assertContains(self.res, content) 23 | 24 | 25 | class CrawlerTest(TestCase): 26 | """Tests for crawler.""" 27 | 28 | def test_response_content(self): 29 | """Response must return website content.""" 30 | res = get_text_content('http://www.python.org') 31 | self.assertTrue(res) 32 | 33 | 34 | class UrlSchemesTest(TestCase): 35 | """Tests for HTTP/HTTPS schemes.""" 36 | 37 | def test_url_not_contains_http(self): 38 | """Must add scheme HTTP in the beginning of the URL.""" 39 | url = 'www.python.org' 40 | self.assertEqual('http://' + url, contains_http(url)) 41 | 42 | def test_url_contains_http(self): 43 | """Must keep the URL the same when they have either HTTP or HTTPS schema.""" 44 | urls = ['http://www.python.org', 'https://www.python.org'] 45 | for url in urls: 46 | with self.subTest(): 47 | self.assertEqual(url, contains_http(url)) 48 | -------------------------------------------------------------------------------- /myproject/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from decouple import config, Csv 3 | from dj_database_url import parse as dburl 4 | 5 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 6 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | 8 | 9 | # Quick-start development settings - unsuitable for production 10 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ 11 | 12 | # SECURITY WARNING: keep the secret key used in production secret! 13 | SECRET_KEY = config('SECRET_KEY') 14 | 15 | DEBUG = config('DEBUG', default=False, cast=bool) 16 | 17 | ALLOWED_HOSTS = config('ALLOWED_HOSTS', default=[], cast=Csv()) 18 | 19 | # Application definition 20 | 21 | INSTALLED_APPS = [ 22 | 'django.contrib.admin', 23 | 'django.contrib.auth', 24 | 'django.contrib.contenttypes', 25 | 'django.contrib.sessions', 26 | 'django.contrib.messages', 27 | 'django.contrib.staticfiles', 28 | 'myproject.core', 29 | ] 30 | 31 | MIDDLEWARE = [ 32 | 'django.middleware.security.SecurityMiddleware', 33 | 'django.contrib.sessions.middleware.SessionMiddleware', 34 | 'django.middleware.common.CommonMiddleware', 35 | 'django.middleware.csrf.CsrfViewMiddleware', 36 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 37 | 'django.contrib.messages.middleware.MessageMiddleware', 38 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 39 | ] 40 | 41 | ROOT_URLCONF = 'myproject.urls' 42 | 43 | TEMPLATES = [ 44 | { 45 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 46 | 'DIRS': [], 47 | 'APP_DIRS': True, 48 | 'OPTIONS': { 49 | 'context_processors': [ 50 | 'django.template.context_processors.debug', 51 | 'django.template.context_processors.request', 52 | 'django.contrib.auth.context_processors.auth', 53 | 'django.contrib.messages.context_processors.messages', 54 | ], 55 | }, 56 | }, 57 | ] 58 | 59 | WSGI_APPLICATION = 'myproject.wsgi.application' 60 | 61 | 62 | # Database 63 | # https://docs.djangoproject.com/en/1.10/ref/settings/#databases 64 | 65 | default_dburl = 'sqlite:///' + os.path.join(BASE_DIR, 'db.sqlite3') 66 | DATABASES = { 67 | 'default': config('DATABASE_URL', default=default_dburl, cast=dburl), 68 | } 69 | 70 | 71 | # Password validation 72 | # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators 73 | 74 | AUTH_PASSWORD_VALIDATORS = [ 75 | { 76 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 77 | }, 78 | { 79 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 80 | }, 81 | { 82 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 83 | }, 84 | { 85 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 86 | }, 87 | ] 88 | 89 | 90 | # Internationalization 91 | # https://docs.djangoproject.com/en/1.10/topics/i18n/ 92 | 93 | LANGUAGE_CODE = 'pt-br' 94 | 95 | TIME_ZONE = 'America/Sao_Paulo' 96 | 97 | USE_I18N = True 98 | 99 | USE_L10N = True 100 | 101 | USE_TZ = True 102 | 103 | 104 | # Static files (CSS, JavaScript, Images) 105 | # https://docs.djangoproject.com/en/1.10/howto/static-files/ 106 | 107 | STATIC_URL = '/static/' 108 | STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles') 109 | 110 | LOGIN_URL = '/admin/login/' 111 | --------------------------------------------------------------------------------