├── .gitignore ├── .gitmodules ├── LICENSE.txt ├── README.md ├── logs └── .gitkeep ├── manage.py ├── meta.json ├── requirements.txt ├── sqlite └── .gitkeep ├── src ├── css │ └── codemirror.css ├── img │ └── zup-logo.png ├── js │ ├── app.js │ ├── controllers.js │ ├── libs │ │ ├── angular-ui-codemirror.js │ │ ├── codemirror.js │ │ ├── jquery.scrolltofixed.min.js │ │ ├── jquery.toastmessage.js │ │ └── less-1.7.0.min.js │ └── services.js └── less │ └── style.less ├── templates └── zup │ └── index.html ├── test └── urls.tsv ├── zup.gif └── zup ├── __init__.py ├── admin.py ├── api.py ├── forms.py ├── local_settings.py.example ├── management ├── __init__.py └── commands │ ├── __init__.py │ ├── clean.py │ ├── start_job.py │ └── urls_to_zip.py ├── models.py ├── settings.py ├── tests.py ├── urls.py ├── utils.py ├── views.py └── wsgi.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.sublime-* 3 | *.pyc 4 | *.sqlite3 5 | static 6 | *.log 7 | local_settings.py 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "glue"] 2 | path = glue 3 | url = git@github.com:danieleguido/glue.git 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 
14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | zup 2 | === 3 | 4 | a simple interface to extract texts from (almost) any url. 5 | 6 | ![https://raw.githubusercontent.com/medialab/zup/master/zup.gif](https://raw.githubusercontent.com/medialab/zup/master/zup.gif "ZUP") 7 | 8 | ## Installation 9 | Clone the repository and its submodules 10 | 11 | git clone --recursive https://github.com/medialab/zup.git zup 12 | 13 | If the `--recursive` option is not available, use these commands to install the submodules 14 | 15 | cd zup 16 | git submodule init 17 | git submodule update 18 | 19 | Create and activate a dedicated virtualenv for zup. If you're not sure, please follow this [how-to](http://docs.python-guide.org/en/latest/dev/virtualenvs/) 20 | 21 | Then install the dependencies via pip: 22 | 23 | cd zup 24 | pip install -r requirements.txt 25 | 26 | Some Python packages that zup requires, like lxml, need other libraries to be available. On a Unix environment, make sure the development packages of libxml2 and libxslt are installed. 27 | 28 | sudo apt-get install libxml2-dev libxslt1-dev python-dev 29 | 30 | In case you are using Ubuntu/Lubuntu 13.04 or Ubuntu 13.10 and have problems with "/usr/bin/ld: cannot find -lz", you may also need to install the zlib1g-dev package. 31 | 32 | sudo apt-get install zlib1g-dev 33 | 34 | ## Configuration 35 | Once installation has been completed, there is one more step: configuration. 
36 | 37 | cd zup 38 | cp local_settings.py.example local_settings.py 39 | 40 | And modify these two lines according to your own virtualenv 41 | 42 | SECRET_KEY = 'your own generated secret key' 43 | 44 | PYTHON_INTERPRETER = '/.virtualenvs/zup/bin/python' 45 | 46 | Also define in which directories you want to install Django's static files: 47 | 48 | STATIC_ROOT = '/static' 49 | MEDIA_ROOT = '/media' 50 | 51 | ## Run 52 | Zup needs a light sqlite database 53 | 54 | cd zup 55 | python manage.py syncdb 56 | python manage.py test 57 | python manage.py collectstatic 58 | 59 | Test your installation with 60 | 61 | python manage.py runserver 62 | 63 | 64 | -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/zup/c53c0d67d830ea679952053344d0e6e9b53c4b28/logs/.gitkeep -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "zup.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ZUP", 3 | "authors": ["Daniele Guido"], 4 | "url": "", 5 | "source": "https://github.com/medialab/zup", 6 | "licence": "LGPL/CECILL-C", 7 | "visual": "", 8 | "description": "A simple interface to extract texts from (almost) any url." 
9 | } 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | Django==1.6.5 3 | Pillow==2.5.0 4 | PyYAML==3.11 5 | cssselect==0.9.1 6 | goose-extractor==1.0.22 7 | jieba==0.32 8 | lxml==3.3.5 9 | nltk==2.0.4 10 | unicodecsv==0.9.4 11 | wsgiref==0.1.2 12 | -------------------------------------------------------------------------------- /sqlite/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/zup/c53c0d67d830ea679952053344d0e6e9b53c4b28/sqlite/.gitkeep -------------------------------------------------------------------------------- /src/css/codemirror.css: -------------------------------------------------------------------------------- 1 | /* BASICS */ 2 | 3 | .CodeMirror { 4 | /* Set height, width, borders, and global font properties here */ 5 | font-family: monospace; 6 | height: 300px; 7 | } 8 | .CodeMirror-scroll { 9 | /* Set scrolling behaviour here */ 10 | overflow: auto; 11 | } 12 | 13 | /* PADDING */ 14 | 15 | .CodeMirror-lines { 16 | padding: 4px 0; /* Vertical padding around content */ 17 | } 18 | .CodeMirror pre { 19 | padding: 0 4px; /* Horizontal padding of content */ 20 | } 21 | 22 | .CodeMirror-scrollbar-filler, .CodeMirror-gutter-filler { 23 | background-color: white; /* The little square between H and V scrollbars */ 24 | } 25 | 26 | /* GUTTER */ 27 | 28 | .CodeMirror-gutters { 29 | border-right: 1px solid #ddd; 30 | background-color: #f7f7f7; 31 | white-space: nowrap; 32 | } 33 | .CodeMirror-linenumbers {} 34 | .CodeMirror-linenumber { 35 | padding: 0 3px 0 5px; 36 | min-width: 20px; 37 | text-align: right; 38 | color: #999; 39 | -moz-box-sizing: content-box; 40 | box-sizing: content-box; 41 | } 42 | 43 | .CodeMirror-guttermarker { color: black; } 44 | .CodeMirror-guttermarker-subtle { color: #999; } 45 | 46 | 
/* CURSOR */ 47 | 48 | .CodeMirror div.CodeMirror-cursor { 49 | border-left: 1px solid black; 50 | } 51 | /* Shown when moving in bi-directional text */ 52 | .CodeMirror div.CodeMirror-secondarycursor { 53 | border-left: 1px solid silver; 54 | } 55 | .CodeMirror.cm-keymap-fat-cursor div.CodeMirror-cursor { 56 | width: auto; 57 | border: 0; 58 | background: #7e7; 59 | } 60 | .cm-animate-fat-cursor { 61 | width: auto; 62 | border: 0; 63 | -webkit-animation: blink 1.06s steps(1) infinite; 64 | -moz-animation: blink 1.06s steps(1) infinite; 65 | animation: blink 1.06s steps(1) infinite; 66 | } 67 | @-moz-keyframes blink { 68 | 0% { background: #7e7; } 69 | 50% { background: none; } 70 | 100% { background: #7e7; } 71 | } 72 | @-webkit-keyframes blink { 73 | 0% { background: #7e7; } 74 | 50% { background: none; } 75 | 100% { background: #7e7; } 76 | } 77 | @keyframes blink { 78 | 0% { background: #7e7; } 79 | 50% { background: none; } 80 | 100% { background: #7e7; } 81 | } 82 | 83 | /* Can style cursor different in overwrite (non-insert) mode */ 84 | div.CodeMirror-overwrite div.CodeMirror-cursor {} 85 | 86 | .cm-tab { display: inline-block; } 87 | 88 | .CodeMirror-ruler { 89 | border-left: 1px solid #ccc; 90 | position: absolute; 91 | } 92 | 93 | /* DEFAULT THEME */ 94 | 95 | .cm-s-default .cm-keyword {color: #708;} 96 | .cm-s-default .cm-atom {color: #219;} 97 | .cm-s-default .cm-number {color: #164;} 98 | .cm-s-default .cm-def {color: #00f;} 99 | .cm-s-default .cm-variable, 100 | .cm-s-default .cm-punctuation, 101 | .cm-s-default .cm-property, 102 | .cm-s-default .cm-operator {} 103 | .cm-s-default .cm-variable-2 {color: #05a;} 104 | .cm-s-default .cm-variable-3 {color: #085;} 105 | .cm-s-default .cm-comment {color: #a50;} 106 | .cm-s-default .cm-string {color: #a11;} 107 | .cm-s-default .cm-string-2 {color: #f50;} 108 | .cm-s-default .cm-meta {color: #555;} 109 | .cm-s-default .cm-qualifier {color: #555;} 110 | .cm-s-default .cm-builtin {color: #30a;} 111 | 
.cm-s-default .cm-bracket {color: #997;} 112 | .cm-s-default .cm-tag {color: #170;} 113 | .cm-s-default .cm-attribute {color: #00c;} 114 | .cm-s-default .cm-header {color: blue;} 115 | .cm-s-default .cm-quote {color: #090;} 116 | .cm-s-default .cm-hr {color: #999;} 117 | .cm-s-default .cm-link {color: #00c;} 118 | 119 | .cm-negative {color: #d44;} 120 | .cm-positive {color: #292;} 121 | .cm-header, .cm-strong {font-weight: bold;} 122 | .cm-em {font-style: italic;} 123 | .cm-link {text-decoration: underline;} 124 | 125 | .cm-s-default .cm-error {color: #f00;} 126 | .cm-invalidchar {color: #f00;} 127 | 128 | /* Default styles for common addons */ 129 | 130 | div.CodeMirror span.CodeMirror-matchingbracket {color: #0f0;} 131 | div.CodeMirror span.CodeMirror-nonmatchingbracket {color: #f22;} 132 | .CodeMirror-matchingtag { background: rgba(255, 150, 0, .3); } 133 | .CodeMirror-activeline-background {background: #e8f2ff;} 134 | 135 | /* STOP */ 136 | 137 | /* The rest of this file contains styles related to the mechanics of 138 | the editor. You probably shouldn't touch them. */ 139 | 140 | .CodeMirror { 141 | line-height: 1; 142 | position: relative; 143 | overflow: hidden; 144 | background: white; 145 | color: black; 146 | } 147 | 148 | .CodeMirror-scroll { 149 | /* 30px is the magic margin used to hide the element's real scrollbars */ 150 | /* See overflow: hidden in .CodeMirror */ 151 | margin-bottom: -30px; margin-right: -30px; 152 | padding-bottom: 30px; 153 | height: 100%; 154 | outline: none; /* Prevent dragging from highlighting the element */ 155 | position: relative; 156 | -moz-box-sizing: content-box; 157 | box-sizing: content-box; 158 | } 159 | .CodeMirror-sizer { 160 | position: relative; 161 | border-right: 30px solid transparent; 162 | -moz-box-sizing: content-box; 163 | box-sizing: content-box; 164 | } 165 | 166 | /* The fake, visible scrollbars. 
Used to force redraw during scrolling 167 | before actuall scrolling happens, thus preventing shaking and 168 | flickering artifacts. */ 169 | .CodeMirror-vscrollbar, .CodeMirror-hscrollbar, .CodeMirror-scrollbar-filler, .CodeMirror-gutter-filler { 170 | position: absolute; 171 | z-index: 6; 172 | display: none; 173 | } 174 | .CodeMirror-vscrollbar { 175 | right: 0; top: 0; 176 | overflow-x: hidden; 177 | overflow-y: scroll; 178 | } 179 | .CodeMirror-hscrollbar { 180 | bottom: 0; left: 0; 181 | overflow-y: hidden; 182 | overflow-x: scroll; 183 | } 184 | .CodeMirror-scrollbar-filler { 185 | right: 0; bottom: 0; 186 | } 187 | .CodeMirror-gutter-filler { 188 | left: 0; bottom: 0; 189 | } 190 | 191 | .CodeMirror-gutters { 192 | position: absolute; left: 0; top: 0; 193 | padding-bottom: 30px; 194 | z-index: 3; 195 | } 196 | .CodeMirror-gutter { 197 | white-space: normal; 198 | height: 100%; 199 | -moz-box-sizing: content-box; 200 | box-sizing: content-box; 201 | padding-bottom: 30px; 202 | margin-bottom: -32px; 203 | display: inline-block; 204 | /* Hack to make IE7 behave */ 205 | *zoom:1; 206 | *display:inline; 207 | } 208 | .CodeMirror-gutter-elt { 209 | position: absolute; 210 | cursor: default; 211 | z-index: 4; 212 | } 213 | 214 | .CodeMirror-lines { 215 | cursor: text; 216 | } 217 | .CodeMirror pre { 218 | /* Reset some styles that the rest of the page might have set */ 219 | -moz-border-radius: 0; -webkit-border-radius: 0; border-radius: 0; 220 | border-width: 0; 221 | background: transparent; 222 | font-family: inherit; 223 | font-size: inherit; 224 | margin: 0; 225 | white-space: pre; 226 | word-wrap: normal; 227 | line-height: inherit; 228 | color: inherit; 229 | z-index: 2; 230 | position: relative; 231 | overflow: visible; 232 | } 233 | .CodeMirror-wrap pre { 234 | word-wrap: break-word; 235 | white-space: pre-wrap; 236 | word-break: normal; 237 | } 238 | 239 | .CodeMirror-linebackground { 240 | position: absolute; 241 | left: 0; right: 0; top: 0; bottom: 0; 
242 | z-index: 0; 243 | } 244 | 245 | .CodeMirror-linewidget { 246 | position: relative; 247 | z-index: 2; 248 | overflow: auto; 249 | } 250 | 251 | .CodeMirror-widget {} 252 | 253 | .CodeMirror-wrap .CodeMirror-scroll { 254 | overflow-x: hidden; 255 | } 256 | 257 | .CodeMirror-measure { 258 | position: absolute; 259 | width: 100%; 260 | height: 0; 261 | overflow: hidden; 262 | visibility: hidden; 263 | } 264 | .CodeMirror-measure pre { position: static; } 265 | 266 | .CodeMirror div.CodeMirror-cursor { 267 | position: absolute; 268 | border-right: none; 269 | width: 0; 270 | } 271 | 272 | div.CodeMirror-cursors { 273 | visibility: hidden; 274 | position: relative; 275 | z-index: 1; 276 | } 277 | .CodeMirror-focused div.CodeMirror-cursors { 278 | visibility: visible; 279 | } 280 | 281 | .CodeMirror-selected { background: #d9d9d9; } 282 | .CodeMirror-focused .CodeMirror-selected { background: #d7d4f0; } 283 | .CodeMirror-crosshair { cursor: crosshair; } 284 | 285 | .cm-searching { 286 | background: #ffa; 287 | background: rgba(255, 255, 0, .4); 288 | } 289 | 290 | /* IE7 hack to prevent it from returning funny offsetTops on the spans */ 291 | .CodeMirror span { *vertical-align: text-bottom; } 292 | 293 | /* Used to force a border model for a node */ 294 | .cm-force-border { padding-right: .1px; } 295 | 296 | @media print { 297 | /* Hide the cursor when printing */ 298 | .CodeMirror div.CodeMirror-cursors { 299 | visibility: hidden; 300 | } 301 | } 302 | -------------------------------------------------------------------------------- /src/img/zup-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/zup/c53c0d67d830ea679952053344d0e6e9b53c4b28/src/img/zup-logo.png -------------------------------------------------------------------------------- /src/js/app.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | angular.module('zup', [ 3 | 
'ngRoute', 4 | 'zup.controllers', 5 | 'zup.services' 6 | 7 | ]) 8 | .config(['$routeProvider', '$httpProvider', function($routeProvider, $httpProvider, ToastFactory, $cookies) { 9 | $httpProvider.defaults.xsrfHeaderName = 'X-CSRFToken'; 10 | $httpProvider.defaults.xsrfCookieName = 'csrftoken'; 11 | 12 | $httpProvider.responseInterceptors.push(['$q','$log', function($q, $log) { 13 | return function(promise) { 14 | return promise.then(function(response) { 15 | response.data.extra = ''; 16 | if(response.data.status == "error"){ 17 | $log.error('ZUP api errors',response.data.error); 18 | return $q.reject(response); 19 | } 20 | if(response.data.meta && response.data.meta.warnings){ // form error from server! 21 | // if(response.data.meta.warnings.invalid && response.data.meta.warnings.limit): 22 | // exceute, but send a message 23 | $log.info('warnings',response.data.meta.warnings); 24 | // return $q.reject(response); 25 | } 26 | return response; 27 | }, function(response) { // The HTTP request was not successful. 28 | if (response.status === 401) { 29 | response.data = { 30 | status: 'error', 31 | description: 'Authentication required, or TIMEOUT session!' 32 | }; 33 | return response; 34 | } 35 | return $q.reject(response); 36 | }); 37 | }; 38 | }]); 39 | }]); -------------------------------------------------------------------------------- /src/js/controllers.js: -------------------------------------------------------------------------------- 1 | const JOB_GET_PENDING = 'JOB_GET_PENDING'; 2 | const JOB_CREATED = 'JOB_CREATED'; 3 | const JOB_LOADED = 'JOB_LOADED'; 4 | const JOB_RUNNING = 'JOB_RUNNING'; 5 | 6 | 7 | angular.module('zup.controllers', ['ngCookies', 'ui.codemirror']) 8 | /* 9 | 10 | The very main controller. 
11 | === 12 | */ 13 | .controller('zupCtrl', ['$scope', '$rootScope', '$log', 'JobsFactory', 'ToastFactory', function($scope, $rootScope, $log, JobsFactory, ToastFactory) { 14 | //ToastFactory.toast('ciao'); 15 | $scope.job = {}; 16 | 17 | 18 | $scope.save = function() { 19 | if($scope.job.id) { 20 | 21 | 22 | } else { 23 | JobsFactory.save($scope.job, function(data) { 24 | console.log(data); 25 | ToastFactory.toast('url list saved, starting ...'); 26 | 27 | $scope.job = data.object; 28 | // start listening 29 | $rootScope.$emit(JOB_CREATED, $scope.job.id); 30 | }) 31 | } 32 | }; 33 | 34 | $scope.download = function(job) { 35 | window.open(location.pathname + 'api/job/' + job.id + '/download', '_blank', ''); 36 | } 37 | 38 | 39 | $rootScope.$on(JOB_GET_PENDING, function(e, pending_job_ids){ 40 | $log.info('zupCtrl @JOB_GET_PENDING', pending_job_ids); 41 | var last_job_id = pending_job_ids.pop(); 42 | if(!isNaN(parseFloat(last_job_id)) && isFinite(last_job_id)) 43 | $rootScope.$emit(JOB_LOADED, last_job_id); 44 | 45 | }); 46 | 47 | $rootScope.$on(JOB_RUNNING, function(e, job){ 48 | $log.info('zupCtrl @JOB_RUNNING', job); 49 | $scope.job = job; 50 | }); 51 | 52 | $log.info('zupCtrl loaded'); 53 | }]) 54 | 55 | .controller('jobCtrl', ['$scope', '$rootScope', '$log', '$timeout', 'JobFactory', function($scope, $rootScope, $log, $timeout, JobFactory){ 56 | $scope.job_id = 0; // we keep the id separated, because ... 
i do not know 57 | $scope.job = {}; 58 | $scope.listening = false; 59 | 60 | 61 | function tick() { 62 | JobFactory.query({id: $scope.job_id}, function(data){ 63 | console.log(data); 64 | $timeout(tick, 3617); 65 | $rootScope.$emit(JOB_RUNNING, data.object); 66 | }, function(data){ 67 | $log.info('ticking error',data); // status 500 or 404 or other stuff 68 | $timeout(tick, 3917); 69 | }); /// todo HANDLE correctly connection refused 70 | }; 71 | 72 | 73 | $rootScope.$on(JOB_CREATED, function(e, id){ 74 | $log.info('jobCtrl @JOB_CREATED', id); 75 | $scope.job_id = id; 76 | !$scope.listening && tick(); 77 | $scope.listening = true; 78 | }); 79 | 80 | 81 | $rootScope.$on(JOB_LOADED, function(e, id){ 82 | $log.info('jobCtrl @JOB_LOADED', id); 83 | $scope.job_id = id; 84 | !$scope.listening && tick(); 85 | $scope.listening = true; 86 | }); 87 | 88 | $log.info('jobCtrl loaded'); 89 | }]) 90 | 91 | .controller('pendingCtrl', ['$scope', '$rootScope', '$log', '$cookies', function($scope, $rootScope, $log, $cookies) { 92 | $scope.pendings = $cookies.pendings? $cookies.pendings.split(','): []; 93 | 94 | $rootScope.$on(JOB_CREATED, function(e, id){ 95 | $log.info('pendingCtrl @JOB_CREATED'); 96 | $scope.pendings.push(id); 97 | $cookies.pendings = $scope.pendings.join('').split('').join(','); 98 | }); 99 | 100 | $log.info('pendingCtrl loaded. pendings:', $scope.pendings ); 101 | // $rootScope.$emit(JOB_GET_PENDING, angular.copy($scope.pendings)); 102 | }]) -------------------------------------------------------------------------------- /src/js/libs/angular-ui-codemirror.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | /** 3 | * Binds a CodeMirror widget to a 113 | 114 | 115 |
116 |
117 | 118 |
119 |
{{$index + 1}}
120 |
{{u.url}}
121 |
122 |
123 |
124 | 125 | 126 | 127 | 128 | 136 | 137 | 140 | 141 | {% endverbatim %} 142 | 143 | 144 | 145 | 146 | 147 | 148 |
 
149 |
 
150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /test/urls.tsv: -------------------------------------------------------------------------------- 1 | urls 2 | http://www.corriere.it/cronache/14_luglio_07/estate-ritirata-piogge-temporali-8e993fc8-0607-11e4-9ae2-2d514cff7f8f.shtml 3 | http://www.theguardian.com/news/datablog/2014/jul/07/which-phones-battery-life-stop-boarding-flight 4 | http://www.corrieredellosport.it/calcio/mondiali_2014/2014/07/11-368890/Roma%2C+visite+mediche+per+Emanuelson -------------------------------------------------------------------------------- /zup.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/zup/c53c0d67d830ea679952053344d0e6e9b53c4b28/zup.gif -------------------------------------------------------------------------------- /zup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/zup/c53c0d67d830ea679952053344d0e6e9b53c4b28/zup/__init__.py -------------------------------------------------------------------------------- /zup/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from zup.models import Url, Job 3 | 4 | class JobAdmin(admin.ModelAdmin): 5 | search_fields = ['name'] 6 | 7 | 8 | class UrlAdmin(admin.ModelAdmin): 9 | search_fields = ['url'] 10 | 11 | 12 | admin.site.register(Job, JobAdmin) 13 | admin.site.register(Url, UrlAdmin) -------------------------------------------------------------------------------- /zup/api.py: -------------------------------------------------------------------------------- 1 | from django.conf import settings 2 | from django.db import transaction 3 | from glue import Epoxy, 
def job_download(request, pk):
    '''
    Send the zip archive built for the job `pk` as a file attachment.

    The archive is looked up as `urls_to_zip.zip` inside the folder returned
    by `Job.get_path()`.

    Returns the Epoxy JSON error response (code API_EXCEPTION_DOESNOTEXIST)
    when the job does not exist or when no archive is found on disk;
    otherwise an HttpResponse wrapping the file content.
    '''
    import os
    from mimetypes import guess_type
    from django.core.servers.basehttp import FileWrapper
    from django.http import HttpResponse

    epoxy = Epoxy(request)  # only used here to build the error responses
    try:
        j = Job.objects.get(pk=pk)
    except Job.DoesNotExist as e:
        return epoxy.throw_error(code=API_EXCEPTION_DOESNOTEXIST, error=e).json()

    filepath = os.path.join(j.get_path(), 'urls_to_zip.zip')

    if not os.path.exists(filepath):
        return epoxy.throw_error(code=API_EXCEPTION_DOESNOTEXIST, error='Job does not seem to have any downloadable file associated').json()

    content_type = guess_type(filepath)
    # A zip archive is binary data: open it in 'rb' mode so that no newline
    # translation can corrupt it. `open` replaces the Python-2-only `file`.
    wrapper = FileWrapper(open(filepath, 'rb'))
    response = HttpResponse(wrapper, content_type=content_type[0])
    response['Content-Length'] = os.path.getsize(filepath)
    response['Content-Disposition'] = 'attachment; filename=%s--%s[zup].zip' % (j.slug, j.date_created.strftime('%Y-%m-%d--%H-%M-%S'))
    return response
class Command(BaseCommand):
    '''
    Delete the jobs that have not been modified for a while.

    usage type:

        python manage.py clean

    Every job whose status is not Job.RUNNING and whose last modification is
    older than settings.CLEANING_AFTER_SECONDS is deleted.
    '''
    args = ''
    help = 'delete the non-running jobs older than CLEANING_AFTER_SECONDS.'
    option_list = BaseCommand.option_list

    def handle(self, *args, **options):
        '''
        Walk through the non-running jobs and delete the obsolete ones.

        A failure on one job is logged and does not stop the cleaning of
        the remaining jobs.
        '''
        logger.info('doing some cleaning')

        now = timezone.now()

        for job in Job.objects.exclude(status=Job.RUNNING):
            try:
                # age of the job in seconds, computed once and reused below
                age = (now - job.date_last_modified).total_seconds()
                if age > settings.CLEANING_AFTER_SECONDS:
                    logger.info('removing job "%s" because of its obsolesence of %s seconds' % (job.name, age))
                    job.delete()
            except Exception as e:
                # best effort: keep cleaning the other jobs
                logger.info('problem during cleaning ...')
                logger.exception(e)

        logger.info('cleaning complete')
def _scrape(self, job, fields=['title', 'tags', 'meta_keywords']):
    '''
    Scrape every url attached to `job` and zip the extracted texts.

    For each url a text file (title followed by the cleaned text returned by
    `gooseapi`) is written in a scratch folder and added to
    `<job path>/urls_to_zip.zip`, together with a `report.csv` holding one
    row per scraped url ('id', 'path', 'url' plus the given `fields`).

    job    -- the Job instance to process; its status goes RUNNING, then
              COMPLETED.
    fields -- attribute names read from the goose result and copied into the
              report. The list is never modified here.

    A url that cannot be scraped is marked Url.ERROR, the error message is
    stored in its `log` field and the loop moves on to the next url.
    '''
    logger.debug('starting command "scrape"')
    job.status = Job.RUNNING
    job.save()

    job_path = job.get_path()
    # scratch folder for the text files, removed once everything is zipped
    path = unique_mkdir(os.path.join(job_path, 'files'))

    urls = job.urls.all()
    # create zip filename and remove previous one
    zip_path = os.path.join(job_path, 'urls_to_zip.zip')
    if os.path.exists(zip_path):
        os.remove(zip_path)

    # create csv report
    rep_path = os.path.join(path, 'report.csv')
    reports = []

    logger.debug('zip path: %s' % zip_path)

    # filename length
    max_length = 64

    with ZipFile(zip_path, 'w') as zip_file:
        for position, url in enumerate(urls):  # urls are handled one after the other
            index = '%0*d' % (5, position + 1)
            url.status = Url.READY
            url.save()

            try:
                g = gooseapi(url=url.url)
            except (urllib2.HTTPError, urllib2.URLError, ValueError, IOError) as e:
                # expected failures: http/network error, invalid url, or
                # (IOError) probably the stopword file was not found
                url.status = Url.ERROR
                url.log = '%s' % e
                url.save()
                continue
            except Exception as e:
                # unexpected failure: keep the traceback in the log, but do
                # not leave the url in the READY state
                logger.exception(e)
                url.status = Url.ERROR
                url.log = '%s' % e
                url.save()
                continue

            logger.debug('title: %s', g.title)
            logger.debug('url: %s', url.url)
            # handling not found title stuff
            slug = '%s-%s' % (index, slugify(g.title if g.title else url.url)[:max_length])
            slug_base = slug

            # Look for a free filename. The check is done on the real
            # '.txt' path, and the suffix is appended without any regular
            # expression (`re` is not imported in this module).
            textified = os.path.join(path, '%s.txt' % slug)
            c = 1
            while os.path.exists(textified):
                slug = '%s-%s' % (slug_base, c)
                textified = os.path.join(path, '%s.txt' % slug)
                c += 1

            with codecs.open(textified, encoding='utf-8', mode='w') as f:
                f.write('\n\n%s\n\n\n\n' % g.title)
                f.write(g.cleaned_text)

            # completed url scraping
            url.status = Url.COMPLETED
            url.save()

            zip_file.write(textified, os.path.basename(textified))

            # WRITE SOME REPORT
            result = {
                'id': index,
                'path': os.path.basename(textified),
                # the csv module of Python 2 only writes byte strings
                'url': url.url.encode('utf8')
            }

            for field in fields:
                if field == 'tags':
                    # tags come as a collection, joined with commas
                    value = ', '.join(getattr(g, field))
                else:
                    value = getattr(g, field)
                if value is None:
                    value = u''
                result[field] = value.encode('utf8')
            reports.append(result)

        # JOB FINISHED, WRITING REPORT
        # 'wb': binary mode is what the Python 2 csv module asks for
        with open(rep_path, 'wb') as report_file:
            writer = csv.DictWriter(report_file, ['id', 'path', 'url'] + fields)
            writer.writeheader()
            for row in reports:
                writer.writerow(row)

        zip_file.write(rep_path, os.path.basename(rep_path))

    shutil.rmtree(path)
    # close job
    job.status = Job.COMPLETED
    job.save()
please provide a job id to record logs and other stuff") 171 | 172 | # maximum 5 jobs at the same time 173 | try: 174 | job = Job.objects.get(pk=options['job_pk']) 175 | except Job.DoesNotExist, e: 176 | raise CommandError("\n ouch. Try again, job pk=%s does not exist!" % options['job_pk']) 177 | 178 | cmd = '_%s' % options['cmd'] 179 | 180 | getattr(self, cmd)(job=job) # no job will be charged! 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /zup/management/commands/urls_to_zip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os, csv 4 | from optparse import make_option 5 | from datetime import datetime 6 | 7 | from django.core.management.base import BaseCommand, CommandError 8 | from zup.utils import urls_to_zip, unicode_dict_reader 9 | 10 | 11 | class Command(BaseCommand): 12 | ''' 13 | usage type 14 | 15 | 16 | ''' 17 | args = '' 18 | help = 'From a list of urls (one per line) get the title and the plaintext content.' 19 | option_list = BaseCommand.option_list + ( 20 | make_option('--tsv', 21 | action='store', 22 | dest='tsv', 23 | type='string', 24 | default=None, 25 | help='tsv file for the list. Header "urls" has to be provided'), 26 | ) 27 | 28 | def handle(self, *args, **options): 29 | # set default owner if ldap is not 30 | self.stdout.write("\n * .--.\n / / `\n + | |\n ' \\ \\__,\n * + '--' *\n + /\\\n + .' '. *\n * /======\\ +\n ;:. _ ;\n |:. (_) |\n |:. _ |\n + |:. (_) | *\n ;:. ;\n .' \:. / `.\n / .-'':._.'`-. 
\\\n |/ /||\\ \\|\n jgs _..--\"\"\"````\"\"\"--.._\n _.-'`` ``'-._\n -' '-\n\n") 31 | 32 | if not options['tsv']: 33 | self.stderr.write(" csv file must be specified") 34 | return 35 | 36 | self.stdout.write(" opening file: '%s'" % options['tsv']) 37 | 38 | f = open(options['tsv'], 'rb') 39 | d = unicode_dict_reader(f, delimiter='\t') 40 | 41 | 42 | # check fields 43 | urls = [row['urls'] for i,row in enumerate(d)] 44 | 45 | urls_to_zip(urls=urls) 46 | 47 | 48 | self.stdout.write(" done!") 49 | self.stdout.write(''' 50 | 51 | + 52 | + * 53 | 54 | * 55 | 56 | 57 | _..--"""````"""--.._ 58 | _.-'`` ``'-._ 59 | -' '- 60 | 61 | ''') 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /zup/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re, os, shutil, subprocess 4 | 5 | from django.conf import settings 6 | from django.db import models 7 | from django.db.models.signals import pre_delete, post_save 8 | from django.contrib.auth.models import User 9 | from django.dispatch import receiver 10 | from django.utils.text import slugify 11 | 12 | def helper_uuslug(model, instance, value, max_length=128): 13 | ''' 14 | create a unique slug key for a specific value, according to other model instances. 15 | instance should be provided not to change instance's own name. 
class Job(models.Model):
    '''
    A scraping job: a named list of urls handled by a background process.

    The work itself is done by the `start_job` management command, launched
    as a separate process by `start()`.
    '''
    STARTED = 'BOO'
    RUNNING = 'RUN'
    LOST = 'RIP'
    COMPLETED = 'END'
    # NOTE(review): same code as LOST, so the two states cannot be told
    # apart once stored. Left unchanged here; to be confirmed.
    TOBEREMOVED = 'RIP'

    STATUS_CHOICES = (
        (STARTED, u'started'),
        (RUNNING, u'running'),
        (LOST, u'process not found'),
        (COMPLETED, u'job completed'),
        (TOBEREMOVED, u'to be deleted')
    )

    name = models.CharField(max_length=64)
    slug = models.CharField(max_length=64, unique=True)

    date_created = models.DateTimeField(auto_now_add=True)
    date_last_modified = models.DateTimeField(auto_now=True)

    urls = models.ManyToManyField(Url)
    command = models.TextField()

    status = models.CharField(max_length=3, choices=STATUS_CHOICES, default=STARTED)


    def __unicode__(self):
        return '%s %s' % (self.name, self.status)


    def json(self, deep=False):
        '''
        Return a dict representation of the job.

        With deep=True the dict also contains the json of each url
        ('urls'), the ratio of urls that left the Url.STARTED state
        ('completion', 0.0 when the job has no url) and a human readable
        'completion_label'.
        '''
        d = {
            'id': self.id,
            'name': self.name,
            'status': self.status,
            'date_created': self.date_created.isoformat(),
            'date_last_modified': self.date_last_modified.isoformat() if self.date_last_modified else None,
        }
        if deep:
            urls = [u.json() for u in self.urls.all()]
            total = len(urls)
            processed = sum(1 for u in urls if u['status'] != Url.STARTED)

            d['urls'] = urls
            # a job without urls must not raise ZeroDivisionError
            d['completion'] = float(processed) / total if total else 0.0
            d['completion_label'] = '%s of %s' % (processed, total)
        return d


    def get_path(self):
        '''
        Return the working folder of the job, creating it when missing.

        The folder is `job-<pk>-<slug>` under settings.TMP_ROOT.
        '''
        path = os.path.join(settings.TMP_ROOT, "job-%s-%s" % (self.pk, self.slug))
        if not os.path.exists(path):
            os.makedirs(path)
        return path


    def save(self, *args, **kwargs):
        '''
        Save the job; a brand new job first receives its unique slug.

        Positional and keyword arguments are handed over to Model.save().
        '''
        if self.pk is None:
            self.slug = helper_uuslug(model=Job, instance=self, value=self.name)

        super(Job, self).save(*args, **kwargs)

        # get_path makes use of newborn slug: called to create the folder
        self.get_path()


    def start(self, cmd=''):
        '''
        Launch `manage.py start_job` for this job in a separate process.

        cmd -- name of the start_job function to run; 'scrape' is used when
               empty. Nothing is launched unless the status is Job.STARTED.
        '''
        import logging

        popen_args = [
            settings.PYTHON_INTERPRETER,
            os.path.join(settings.BASE_DIR, 'manage.py'),
            'start_job',
            '--cmd', cmd or 'scrape',
            '--job', str(self.pk)]
        if self.status == Job.STARTED:
            subprocess.Popen(popen_args, stdout=None, stderr=None, close_fds=True)

        # replaces a Python 2 print statement writing on stdout
        logging.getLogger('zup').debug('job %s command: %s', self.pk, popen_args)
161 | We should provide a zip with the whole text content under the name ..zip, @todo 162 | ''' 163 | path = instance.get_path() 164 | shutil.rmtree(path) 165 | 166 | -------------------------------------------------------------------------------- /zup/settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Django settings for zup project. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.6/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.6/ref/settings/ 11 | """ 12 | 13 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 14 | import os 15 | import local_settings 16 | BASE_DIR = os.path.dirname(os.path.dirname(__file__)) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.6/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = local_settings.SECRET_KEY 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = local_settings.DEBUG 27 | 28 | TEMPLATE_DEBUG = True 29 | 30 | ALLOWED_HOSTS = [] 31 | 32 | 33 | # Application definition 34 | 35 | INSTALLED_APPS = ( 36 | 'django.contrib.admin', 37 | 'django.contrib.auth', 38 | 'django.contrib.contenttypes', 39 | 'django.contrib.sessions', 40 | 'django.contrib.messages', 41 | 'django.contrib.staticfiles', 42 | 43 | 'glue', 44 | 'zup' 45 | ) 46 | 47 | MIDDLEWARE_CLASSES = ( 48 | 'django.contrib.sessions.middleware.SessionMiddleware', 49 | 'django.middleware.common.CommonMiddleware', 50 | 'django.middleware.csrf.CsrfViewMiddleware', 51 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 52 | 'django.contrib.messages.middleware.MessageMiddleware', 53 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 54 | 'django.middleware.locale.LocaleMiddleware', 55 | ) 56 | 57 | ROOT_URLCONF = 'zup.urls' 58 | 59 | WSGI_APPLICATION = 'zup.wsgi.application' 60 | 61 | 62 | # Database 63 | # https://docs.djangoproject.com/en/1.6/ref/settings/#databases 64 | 65 | DATABASES = { 66 | 'default': { 67 | 'ENGINE': local_settings.DB_ENGINE, 68 | 'NAME': local_settings.DB_NAME, 69 | } 70 | } 71 | 72 | # Internationalization 73 | # https://docs.djangoproject.com/en/1.6/topics/i18n/ 74 | 75 | LANGUAGE_CODE = local_settings.LANGUAGE_CODE 76 | 77 | TIME_ZONE = 'UTC' 78 | 79 | USE_I18N = True 80 | 81 | USE_L10N = True 82 | 83 | USE_TZ = True 84 | 85 | 86 | # Static files (CSS, JavaScript, Images) 87 | # https://docs.djangoproject.com/en/1.6/howto/static-files/ 88 | 89 | STATIC_URL = '/static/' 90 | 91 | 92 | TITLE = local_settings.TITLE 93 | STATIC_URL = local_settings.STATIC_URL 94 | STATIC_ROOT = local_settings.STATIC_ROOT 95 | MEDIA_ROOT = local_settings.MEDIA_ROOT 96 | MEDIA_URL = local_settings.MEDIA_URL 97 | TMP_ROOT = local_settings.TMP_ROOT 98 | URLS_LIMIT = local_settings.URLS_LIMIT 99 | 100 | STATICFILES_DIRS = ( 101 | os.path.join(BASE_DIR, 'src'), 102 | ) 103 | 104 | TEMPLATE_DIRS = ( 105 | os.path.join(BASE_DIR, 
'templates'), 106 | ) 107 | 108 | ENABLE_CDN_SERVICES = local_settings.ENABLE_CDN_SERVICES 109 | 110 | PYTHON_INTERPRETER = local_settings.PYTHON_INTERPRETER 111 | 112 | EN = 'en' 113 | IT = 'it' 114 | FR = 'fr' 115 | NL = 'nl' 116 | 117 | LANGUAGE_CHOICES = ( 118 | (EN, u'english'), 119 | (FR, u'french'), 120 | (NL, u'dutch'), 121 | (IT, u'italian'), 122 | ) 123 | 124 | LOGGING_ROOT = os.path.join(BASE_DIR, 'logs') 125 | 126 | LOGGING = { 127 | 'version': 1, 128 | 'disable_existing_loggers': False, 129 | 'formatters': { 130 | 'verbose': { 131 | 'format': u'%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d «%(message)s»' 132 | }, 133 | 'simple': { 134 | 'format': u'%(asctime)s «%(message)s» %(module)s.%(funcName)s (%(lineno)s)' 135 | }, 136 | }, 137 | 'handlers': { 138 | 'file': { 139 | 'level': 'DEBUG', 140 | 'class': 'logging.FileHandler', 141 | 'filename': os.path.join(LOGGING_ROOT, 'debug.log'), 142 | 'formatter': 'simple' 143 | }, 144 | 'file.clean': { 145 | 'level': 'DEBUG', 146 | 'class': 'logging.FileHandler', 147 | 'filename': os.path.join(LOGGING_ROOT, 'clean.log'), 148 | 'formatter': 'simple' 149 | }, 150 | }, 151 | 'loggers': { 152 | 'zup': { 153 | 'handlers': ['file'], 154 | 'level': 'DEBUG', 155 | 'propagate': True, 156 | }, 157 | 'zup.clean': { 158 | 'handlers': ['file.clean'], 159 | 'level': 'DEBUG', 160 | 'propagate': True, 161 | }, 162 | }, 163 | } 164 | 165 | 166 | LOG_FILE = LOGGING['handlers']['file']['filename'] 167 | 168 | CLEANING_AFTER_SECONDS = local_settings.CLEANING_AFTER_SECONDS -------------------------------------------------------------------------------- /zup/tests.py: -------------------------------------------------------------------------------- 1 | import os, logging 2 | from django.conf import settings 3 | from django.test import TestCase 4 | from utils import urls_to_zip 5 | 6 | 7 | 8 | logger = logging.getLogger('zup') 9 | 10 | 11 | 12 | 13 | class LoggerTest(TestCase): 14 | def setUp(self): 15 | ''' 16 | it will 
class UtilsTest(TestCase):
    '''
    Check that `urls_to_zip` delivers a zip file for a list of real urls.
    '''
    def setUp(self):
        '''
        it will create TMP_ROOT and MEDIA_ROOT
        '''
        # makedirs: do not fail when an intermediate folder is missing too
        if not os.path.exists(settings.TMP_ROOT):
            os.makedirs(settings.TMP_ROOT)
        if not os.path.exists(settings.MEDIA_ROOT):
            os.makedirs(settings.MEDIA_ROOT)

    def test_urls_to_zip(self):
        '''
        urls_to_zip returns the path of an existing file.
        '''
        # the `&region=` query parameter was garbled into `®ion=` in the
        # first two urls
        zipified = urls_to_zip([
            "http://www.nytimes.com/2014/07/07/us/mayor-mike-duggans-pledges-echo-in-detroits-north-end.html?hp&action=click&pgtype=Homepage&version=LargeMediaHeadlineSum&module=photo-spot-region&region=photo-spot&WT.nav=photo-spot&_r=0",
            "http://www.nytimes.com/2014/07/08/world/europe/eduard-shevardnadze-soviet-foreign-minister-under-gorbachev-is-dead-at-86.html?rref=homepage&module=Ribbon&version=origin&region=Header&action=click&contentCollection=Home%20Page&pgtype=article",
            "http://www.corriere.it/cronache/14_luglio_07/estate-ritirata-piogge-temporali-8e993fc8-0607-11e4-9ae2-2d514cff7f8f.shtml",
            "http://www.theguardian.com/news/datablog/2014/jul/07/which-phones-battery-life-stop-boarding-flight",
            "http://www.corrieredellosport.it/calcio/mondiali_2014/2014/07/11-368890/Roma%2C+visite+mediche+per+Emanuelson",
            "http://gasexchange.com/questions/do-labor-epidurals-increase-the-risk-of-instrumental-or-surgical-delivery/",
            "http://ghsm.hms.harvard.edu/uploads/pdf/PGSSC_Publications_2012.pdf",
            "http://guidance.nice.org.uk/CG/Published",
            "http://volunteermovement.org/ehsen-amri-activism-in-tunisia/"
        ])
        try:
            self.assertTrue(os.path.exists(zipified))
        finally:
            # remove the archive even when the assertion fails
            if os.path.exists(zipified):
                os.remove(zipified)
def unicode_dict_reader(utf8_data, **kwargs):
    '''
    Smart csv reader for unicode chars.

    Yield one dict per csv row, like csv.DictReader, with every utf-8 byte
    string value decoded to unicode text.

    utf8_data -- file-like object (or iterable of lines) given to
                 csv.DictReader.
    kwargs    -- handed over to csv.DictReader (e.g. delimiter).

    Values that are not byte strings are left as they are: this covers the
    None that DictReader uses for the missing fields of a short row. The
    list of extra fields of a long row is decoded item by item.
    '''
    def _decode(value):
        # decode byte strings, look inside lists, keep everything else
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, list):
            return [_decode(item) for item in value]
        return value

    for row in csv.DictReader(utf8_data, **kwargs):
        yield dict((key, _decode(value)) for key, value in row.items())
def unique_mkdir(path):
    '''
    Create a new folder at `path` and return the path really used.

    When `path` already exists, the first free sibling among `<path>-1`,
    `<path>-2`, ... is created instead.

    Raises OSError when the folder cannot be created (e.g. the parent
    folder is missing).
    '''
    candidate = path
    c = 1
    while os.path.exists(candidate):
        # The suffix is appended to the given path itself. Joining the
        # candidate onto the base path again nested or broke relative paths.
        candidate = '%s-%s' % (path, c)
        c += 1
    os.mkdir(candidate)
    return candidate
fill zip with each page body 84 | for i,url in enumerate(urls): 85 | index = '%0*d' % (5, int(i) + 1) 86 | logger.debug('url %s of %s' % (i+1, len(urls))) 87 | try: 88 | g = gooseapi(url=url) 89 | except urllib2.HTTPError, e: 90 | logger.debug('HTTPError %s for url %s'% (e, url)) 91 | except Exception, e: 92 | logger.exception(e) 93 | continue 94 | 95 | slug = '%s-%s' % (index,slugify(g.title)[:max_length]) 96 | slug_base = slug 97 | 98 | textified = os.path.join(path, slug) 99 | 100 | c = 1 # unique filename for the text file 101 | while os.path.exists(textified): 102 | candidate = '%s-%s-%s' % (index, slug_base, c) 103 | print "writing on %s" % candidate 104 | if len(candidate) > max_length: 105 | slug = slug[:max_length-len('-%s' % c)] 106 | slug = re.sub('\-+','-',candidate) 107 | textified = os.path.join(path, slug) 108 | c += 1 109 | 110 | textified = "%s.txt" % textified 111 | 112 | # open textified file and write goose body content, with title. 113 | with codecs.open(textified, encoding='utf-8', mode='w') as f: 114 | f.write('\n\n%s\n\n\n\n' % g.title) 115 | f.write(g.cleaned_text) 116 | # the row dict to be written as csv row 117 | result = { 118 | 'id': i, 119 | 'path': os.path.basename(textified), 120 | 'url': url 121 | } 122 | # if there are tags given from goose, they are given as a list. We join it with nice commas. 123 | for field in fields: 124 | if field == 'tags': 125 | result[field] = ', '.join(getattr(g, field)) 126 | else: 127 | result[field] = getattr(g, field) 128 | # push our line with the others 129 | reports.append(result) 130 | logger.debug('txt file added to zip file') 131 | 132 | myzip.write(textified, os.path.basename(textified)) 133 | 134 | # 2 of 2. 
def _helper_shared_context(request, tags=None, d=None):
    '''
    Return the context values shared by the views.

    request -- current request; its LANGUAGE_CODE and user are read.
    tags    -- not used by this function.
    d       -- optional dict to complete; it is updated in place and
               returned. A new dict is created when omitted, so that
               nothing is shared between two calls.
    '''
    if d is None:
        d = {}

    d.update({
        'TITLE': settings.TITLE,
        'DEBUG': settings.DEBUG,
        'ENABLE_CDN_SERVICES': settings.ENABLE_CDN_SERVICES,
        'LANGUAGE': request.LANGUAGE_CODE,
        'URLS_LIMIT': settings.URLS_LIMIT,
        'URLS_LIMIT_ENABLE': not request.user.is_staff
    })
    return d
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.6/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "zup.settings") 12 | 13 | from django.core.wsgi import get_wsgi_application 14 | application = get_wsgi_application() 15 | --------------------------------------------------------------------------------