├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── TODO ├── manage ├── api_website_config.py ├── check-style.sh ├── clean-db.sh ├── config.sh ├── dump-db.py ├── exec-in-virtualenv.sh ├── mailserver.py ├── pre-commit ├── quickinstall ├── rebuild-html.py ├── rebuild-index.sh ├── reinstall.sh ├── run-list.sh ├── setenv.sh ├── setup.py ├── sopaper.net9.org └── tolower.py ├── pdf-compress.py ├── report ├── Makefile ├── img │ ├── author.png │ ├── chaos.png │ ├── index.png │ ├── logo.png │ ├── reading.png │ ├── redirect.png │ ├── reference.png │ ├── summary.png │ └── various.png ├── intro.tex ├── mint-defs.tex ├── refs.bib └── report.tex ├── setup.cfg ├── setup.py ├── sopaper.py ├── sopaper ├── __init__.py ├── __main__.py ├── authorfetch.py ├── contentsearch.py ├── dbsearch.py ├── fetcher │ ├── __init__.py │ ├── arxiv.py │ ├── base.py │ ├── direct.py │ ├── dlacm.py │ ├── ieee.py │ └── sciencedirect.py ├── job.py ├── lib │ ├── __init__.py │ ├── abbr.dic │ ├── downloader.py │ ├── exc.py │ ├── mailutil.py │ ├── pdf2html.py │ ├── pdfutil.py │ ├── sanitize.py │ ├── singleton.py │ ├── textutil.py │ ├── timeout.py │ └── ukutil.py ├── pdfprocess.py ├── queryhandler.py ├── searcher │ ├── __init__.py │ ├── google.py │ └── gscholar.py ├── test-fetcher.py ├── test-searcher.py ├── ukconfig.py ├── ukdbconn.py ├── uklogger.py └── xpengine │ ├── __init__.py │ ├── indexer.py │ ├── run-xp.py │ ├── searcher.py │ ├── stopwords.txt │ ├── wordlists │ ├── Caps.txt │ └── words.txt.gz │ └── xpcommon.py └── webapi ├── __init__.py ├── api ├── __init__.py ├── comment.py ├── download.py ├── html.py ├── mark.py ├── query.py ├── test.py └── view.py ├── standalone_server.py ├── static ├── css │ ├── homepage.css │ └── semantic.css ├── fonts │ ├── basic.icons.eot │ ├── basic.icons.svg │ ├── basic.icons.ttf │ ├── basic.icons.woff │ ├── icons.eot │ ├── icons.otf │ ├── icons.svg │ ├── icons.ttf │ └── icons.woff ├── images │ ├── bg.jpg │ ├── dark-bg.png │ ├── loader-large-inverted.gif │ ├── loader-large.gif │ ├── logo.png │ ├── ltz.jpg │ ├── tile-bg.png │ ├── wyc.jpg │ └── wyx.jpg ├── index.html ├── javascript │ ├── ZeroClipboard.js │ ├── angular-resource.min.js │ ├── angular-route.min.js │ ├── angular-sanitize.min.js │ ├── angular.min.js │ ├── homepage.js │ ├── jquery.easing.1.3.js │ ├── jquery.js │ ├── search.js │ ├── semantic.js │ └── semantic.min.js └── search.html └── templates ├── ZeroClipboard.swf ├── index.html └── search.html /.gitattributes: -------------------------------------------------------------------------------- 1 | webapi/static/javascript/semantic* linguist-vendored 2 | webapi/static/css/semantic* linguist-vendored 3 | webapi/static/javascript/ZeroClipboard.js linguist-vendored 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | common/person-profile.txt* 2 | old-result 3 | report/output 4 | 5 | *.py[cod] 6 | 7 | *.pdf 8 | # C extensions 9 | *.so 10 | 11 | # Packages 12 | *.egg 13 | *.egg-info 14 | dist 15 | build 16 | eggs 17 | parts 18 | bin 19 | env 20 | .download_cache 21 | var 22 | sdist 23 | develop-eggs 24 | .installed.cfg 25 | __pycache__ 26 | 27 | # Installer logs 28 | pip-log.txt 29 | 30 | # Unit test / coverage reports 31 | .coverage 32 | .tox 33 | nosetests.xml 34 | 35 | # Translations 36 | *.mo 37 | 38 | # Mr Developer 39 | .mr.developer.cfg 40 | .project 41 | .pydevproject 42 | 43 | # virtualenv 44 | .env 45 | 46 | # vim swap file 47 | 
*.swp 48 | *.pyc 49 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include sopaper *.py 2 | recursive-include sopaper *.dic 3 | recursive-exclude webapi * 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## SoPaper, So Easy 2 | This is a project designed for researchers to conveniently access the papers they need. 3 | 4 | The command line tool ``sopaper`` can __automatically search and download__ papers 5 | from the Internet, given the title. 6 | The downloaded paper will have a readable file name 7 | (I originally wrote this because I was tired of downloaded papers whose file names were random strings). 8 | It mainly supports searching for papers in computer science. 9 | 10 | 11 | 12 | ## How to Use 13 | Install command line dependencies: 14 | * [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) command line executable. 15 | + Using pdftk on OS X 10.11 might lead to hangs. See [here](http://stackoverflow.com/questions/32505951/pdftk-server-on-os-x-10-11) for more info. 16 | * poppler-utils (optional) 17 | 18 | Install the python package: 19 | ``pip install --user sopaper`` 20 | 21 | Usage: 22 | ```bash 23 | $ sopaper --help 24 | $ sopaper "Distinctive image features from scale-invariant keypoints" 25 | $ sopaper "https://arxiv.org/abs/1606.06160" 26 | ``` 27 | NOTE: If you are not on a campus network, you may need to set a proxy through the environment variables `http_proxy` and `https_proxy` 28 | to be able to download from certain sites (such as 'dl.acm.org'). 29 | 30 | ## Features 31 | The ``searcher`` module will fuzzy-search and analyse results from 32 | * Google Scholar 33 | * Google 34 | 35 | and the ``fetcher`` module will further analyse the results and download papers from the following possible sources: 36 | * direct pdf link 37 | * [dl.acm.org](http://dl.acm.org/) 38 | * [ieeexplore.ieee.org](http://ieeexplore.ieee.org) 39 | * [arxiv.org](http://arxiv.org) 40 | 41 | ``Searcher`` and ``Fetcher`` are __extensible__ to support more websites. 42 | 43 | The command line tool will directly download the paper with a __clean filename__. 44 | All downloaded papers will be __compressed__ using `ps2pdf` (a ghostscript tool), if available. 45 | 46 | 61 | 62 | ## TODO 63 | * Fetcher dedup: when both the arxiv abs and pdf pages appear in the search results, the page would be downloaded twice (maybe add a cache for requests) 64 | * Don't trust the arxiv link from google scholar 65 | * Is the title correctly updated for dlacm? 66 | * Extract title from bibtex -- more accurate?
67 | * Fetcher for other sites 68 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | get title from pdf file 2 | 3 | paper metadata: year 4 | run parser in parrallel 5 | 6 | citecnt problem 7 | 8 | return cited cnt 9 | 10 | parser for: 11 | http://www.sciencedirect.com/science/article/pii/S1570870513000073 12 | 13 | Fast Sorted-Set Intersection using SIMD Instructions - cite cnt error 14 | 15 | failure: Blocks World Revisited 16 | -------------------------------------------------------------------------------- /manage/api_website_config.py: -------------------------------------------------------------------------------- 1 | 2 | API_PORT = 1085 3 | 4 | API_RUN_OPTIONS = { 5 | 'debug': True, 6 | 'use_reloader': True 7 | } 8 | -------------------------------------------------------------------------------- /manage/check-style.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # File: check-style.sh 3 | # Date: Thu May 08 10:16:54 2014 +0800 4 | # Author: Yuxin Wu 5 | 6 | [[ -n $ZSH_VERSION ]] && script_dir=$(dirname $0) || script_dir=$(dirname ${BASH_SOURCE[0]}) 7 | source $script_dir/setenv.sh 8 | 9 | pep8=pep8 10 | type $pep8 2> /dev/null || pep8=pep8-python2 # for archlinux 11 | 12 | realpath() { 13 | [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}" 14 | } 15 | 16 | if [ `uname` = 'Darwin' ] 17 | then 18 | real_dir=$(realpath $script_dir)/.. 19 | else 20 | real_dir=$(readlink -f $script_dir)/.. 21 | fi 22 | 23 | $pep8 $real_dir --exclude=.env,.git,bin,lib --statistics 24 | 25 | -------------------------------------------------------------------------------- /manage/clean-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # File: clean-db.sh 3 | # Date: Sat May 24 11:26:04 2014 +0000 4 | # Author: Yuxin Wu 5 | 6 | 7 | PROG_NAME=`readlink -f "$0"` 8 | PROG_DIR=`dirname "$PROG_NAME"` 9 | cd "$PROG_DIR" 10 | 11 | mongo sopaper --quiet <<< " 12 | db.paper.count(); 13 | db.dropDatabase()" 14 | 15 | ./rebuild-index.sh 16 | -------------------------------------------------------------------------------- /manage/config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # $File: config.sh 3 | # $Date: Thu May 08 10:17:09 2014 +0800 4 | # $Author: jiakai 5 | 6 | sopapermodules="webapi common" # 'modules' conflicts with some read-only shell variable 7 | -------------------------------------------------------------------------------- /manage/dump-db.py: -------------------------------------------------------------------------------- 1 | #!./exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: dump-db.py 4 | # Date: Sat May 24 11:39:14 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import sys 8 | import os 9 | 10 | from ukdbconn import get_mongo 11 | 12 | 13 | def dump(pid, output): 14 | OUTPUT = os.path.join(os.path.dirname(__file__), output) 15 | pid = int(pid) 16 | db = get_mongo('paper') 17 | 18 | doc = list(db.find({'_id': pid}).limit(1))[0] 19 | pdf = doc['pdf'] 20 | title = doc['title'] 21 | 22 | try: 23 | os.mkdir(OUTPUT) 24 | except: 25 | pass 26 | fout = open(os.path.join(OUTPUT, title + '.pdf'), 'wb') 27 | fout.write(pdf) 28 | fout.close() 29 | 30 | npage = doc.get('page') 31 | if npage: 32 | for i in range(npage + 1): 33 | fout = open(os.path.join(OUTPUT, title + '.html.{0}'.format(i)), 'wb') 34 | 
fout.write(doc['html'][i]) 35 | fout.close() 36 | 37 | if __name__ == '__main__': 38 | if len(sys.argv) != 3: 39 | print("Usage: {0} ".format(sys.argv[0])) 40 | sys.exit() 41 | 42 | dump(sys.argv[1], sys.argv[2]) 43 | 44 | -------------------------------------------------------------------------------- /manage/exec-in-virtualenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # $File: exec-in-virtualenv.sh 3 | # $Date: Thu May 22 15:12:40 2014 +0800 4 | # $Author: jiakai 5 | 6 | # start a python script in virtualenv, with appropriate envrionment variables 7 | # set 8 | 9 | source $(dirname $0)/setenv.sh 10 | 11 | if [ -z "$1" ] 12 | then 13 | echo "usage: $0 " 14 | exit 15 | fi 16 | 17 | python "$@" 18 | 19 | -------------------------------------------------------------------------------- /manage/mailserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: mailserver.py 4 | # Date: Sun Jun 29 10:05:10 2014 +0800 5 | # Author: Yuxin Wu 6 | 7 | 8 | from flask import Flask, request 9 | app = Flask(__name__) 10 | import traceback 11 | 12 | import smtplib 13 | from email.mime.text import MIMEText 14 | 15 | @app.route('/mail', methods=['POST']) 16 | def mail(): 17 | js = request.get_json(force=True) 18 | print(js) 19 | try: 20 | sendmail(js['addr'], js['subject'], js['content']) 21 | except Exception as e: 22 | traceback.format_exc() 23 | return {'status': 'error', 24 | 'reason': str(e)} 25 | return {'status': 'ok'} 26 | 27 | server = smtplib.SMTP('localhost') 28 | def sendmail(addr, subject, content): 29 | me = 'sopaper@net9.org' 30 | you = addr 31 | msg = MIMEText(content) 32 | msg['Subject'] = subject 33 | msg['From'] = me 34 | msg['To'] = you 35 | server.sendmail(me, [you], msg.as_string()) 36 | 37 | if __name__ == '__main__': 38 | app.run(port=5184, debug=True) 39 | 40 | -------------------------------------------------------------------------------- /manage/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # File: pre-commit 3 | # Date: Tue Dec 10 15:02:36 2013 +0800 4 | # Author: Yuxin Wu 5 | 6 | ./manage/check-style.sh || (echo "Please fix all the warnings on coding style" && exit 1) 7 | -------------------------------------------------------------------------------- /manage/quickinstall: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # This script create a virtual environment in directory $ENV, 3 | # install dependencies and setup executables according to setup.py. 4 | # 5 | # You need to install virtualenv and pip before running this script. 6 | 7 | #[[ -d "../.git" ]] && cp pre-commit ../.git/hooks 8 | 9 | script_dir=$(dirname $0) 10 | cd $script_dir 11 | : "DLC = ${DLC:=./.download_cache}, ENV = ${ENV:=../.env}" 12 | 13 | # find the right python version 14 | if [ -z "$PYTHON" ]; then 15 | for PYTHON in python{2.7,2.6,2,}; do 16 | type "$PYTHON" >/dev/null 2>&1 && break 17 | done 18 | fi 19 | 20 | # find the right virtualenv version 21 | if [ -z "$VIRTUALENV" ]; then 22 | for VIRTUALENV in virtualenv{2.7,2.6,2,}; do 23 | type "$VIRTUALENV" >/dev/null 2>&1 && break 24 | done 25 | fi 26 | 27 | "$VIRTUALENV" --no-site-packages --python "$PYTHON" "$ENV" || exit 1 28 | 29 | . "$ENV"/bin/activate || exit 1 30 | 31 | pip install --download-cache="$DLC" -e . 
32 | -------------------------------------------------------------------------------- /manage/rebuild-html.py: -------------------------------------------------------------------------------- 1 | #!./exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: rebuild-html.py 4 | # Date: 一 6月 09 17:34:27 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from pdfprocess import do_addhtml 8 | from ukdbconn import get_mongo 9 | 10 | db = get_mongo('paper') 11 | itr = db.find({'_id': 67}) 12 | for paper in itr: 13 | try: 14 | data = paper['pdf'] 15 | except: 16 | print(paper['_id'], paper['title']) 17 | continue 18 | pid = paper['_id'] 19 | do_addhtml(data, pid) 20 | -------------------------------------------------------------------------------- /manage/rebuild-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # File: rebuild-index.sh 3 | # Date: Sat May 24 11:00:06 2014 +0000 4 | # Author: Yuxin Wu 5 | 6 | PROG_NAME=`readlink -f "$0"` 7 | PROG_DIR=`dirname "$PROG_NAME"` 8 | cd "$PROG_DIR" 9 | 10 | ../common/contentsearch.py 11 | -------------------------------------------------------------------------------- /manage/reinstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | yes | pip2 uninstall sopaper 4 | pip2 install . --user 5 | -------------------------------------------------------------------------------- /manage/run-list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # File: run-list.sh 3 | # Date: 五 6月 13 18:38:18 2014 +0000 4 | # Author: Yuxin Wu 5 | 6 | while read title; do 7 | echo $title 8 | timeout 300 ../common/queryhandler.py "$title" 9 | echo "Done! Press Ctrl-C to stop" 10 | sleep 0.5 11 | done < ~/dm-list.txt 12 | -------------------------------------------------------------------------------- /manage/setenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # $File: setenv.sh 3 | # $Date: Fri May 23 20:04:36 2014 +0800 4 | # $Author: jiakai 5 | 6 | # zsh compatibility when direct sourcing from shell 7 | [[ -n $ZSH_VERSION ]] && script_dir=$(dirname $0) || script_dir=$(dirname ${BASH_SOURCE[0]}) 8 | source $script_dir/config.sh 9 | 10 | realpath() { 11 | [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}" 12 | } 13 | 14 | if [ `uname` = 'Darwin' ] 15 | then 16 | project_root=`realpath $script_dir/..` 17 | else 18 | project_root=$(readlink -f $script_dir/..) 19 | fi 20 | 21 | env_dir=$project_root/.env 22 | 23 | if [ ! -d "$env_dir" ] 24 | then 25 | echo "$env_dir not found; please first run 'quickinstall'" 26 | fi 27 | 28 | . 
$env_dir/bin/activate 29 | 30 | 31 | # zsh compatibility 32 | [[ -n $ZSH_VERSION ]] && set -o shwordsplit 33 | 34 | for i in $sopapermodules 35 | do 36 | PYTHONPATH=$PYTHONPATH:$project_root/$i 37 | done 38 | export PYTHONPATH=$PYTHONPATH:$project_root 39 | 40 | DEBIAN_PYTHON_PATH=/usr/lib/python2.7/dist-packages 41 | SYS_PYTHON_PATH=/usr/lib/python2.7/site-packages 42 | [[ -d $DEBIAN_PYTHON_PATH ]] && export PYTHONPATH=$PYTHONPATH:$DEBIAN_PYTHON_PATH 43 | [[ -d $SYS_PYTHON_PATH ]] && export PYTHONPATH=$PYTHONPATH:$SYS_PYTHON_PATH 44 | 45 | if [ `uname` = 'Darwin' ] 46 | then 47 | export SOPAPER_CONFIG=$(realpath -f $script_dir) 48 | else 49 | export SOPAPER_CONFIG=$(readlink -f $script_dir) 50 | fi 51 | 52 | -------------------------------------------------------------------------------- /manage/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup_args = dict( 4 | name='sopaper', 5 | install_requires=[ 6 | # web framework 7 | 'Flask>=0.9', 8 | 'Flask-Login>=0.2.7', 9 | #'pyjade>=1.6', 10 | 11 | # database 12 | 'pymongo>=2.7', 13 | 14 | # IR & NLP 15 | 'xappy>=0.5', 16 | 17 | # tool 18 | 'sphinx>=1.1.3', 19 | 'pep8>=1.4.6', 20 | 'termcolor>=1.1.0', 21 | 22 | # network 23 | 'beautifulsoup4>=4.3.2', 24 | 'requests>=2.1.0', 25 | 26 | # pdf 27 | 'python-magic>=0.4.6', 28 | ], 29 | entry_points=dict( 30 | console_scripts=[ 31 | 'api-website = standalone_server:main', 32 | ], 33 | ), 34 | ) 35 | 36 | if __name__ == '__main__': 37 | setup(**setup_args) 38 | -------------------------------------------------------------------------------- /manage/sopaper.net9.org: -------------------------------------------------------------------------------- 1 | upstream sopaper { 2 | server 0:8088; 3 | } 4 | 5 | server { 6 | listen 80; 7 | listen [::]:80; 8 | server_name sopaper.net9.org; 9 | 10 | return 301 https://$host$request_uri; 11 | } 12 | 13 | server { 14 | listen 443 ssl; 15 | listen [::]:443 ssl; 16 | server_name sopaper.net9.org; 17 | access_log /var/log/nginx/uknow.access.log; 18 | error_log /var/log/nginx/uknow.error.log; 19 | 20 | location / { 21 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 22 | proxy_set_header X-Forwarded-Proto $scheme; 23 | proxy_set_header Host $http_host; 24 | proxy_redirect off; 25 | proxy_pass http://sopaper; 26 | } 27 | } 28 | 29 | # vim: ft=nginx 30 | -------------------------------------------------------------------------------- /manage/tolower.py: -------------------------------------------------------------------------------- 1 | #!./exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: tolower.py 4 | # Date: 二 6月 10 04:03:22 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from pdfprocess import do_addhtml 8 | from ukdbconn import get_mongo 9 | 10 | db = get_mongo('paper') 11 | #itr = db.find({'_id': 67L}) 12 | itr = db.find({}, {'author': 1, 'title': 1}) 13 | for paper in itr: 14 | try: 15 | data = paper['author'] 16 | except: 17 | print(paper['_id'], paper['title']) 18 | continue 19 | pid = paper['_id'] 20 | db.update({'_id': pid}, {'$set': {'author': [x.lower() for x in data]}}) 21 | -------------------------------------------------------------------------------- /pdf-compress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: pdf-compress.py 4 | # Date: Mon Jun 01 01:32:07 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | import sys 8 | import os 9 | import os.path 10 | 
import argparse 11 | 12 | from sopaper.lib.pdfutil import pdf_compress 13 | 14 | def get_args(): 15 | desc = 'Compress Pdf By ps2pdf' 16 | parser = argparse.ArgumentParser(description = desc) 17 | 18 | parser.add_argument('file', help='file name') 19 | ret = parser.parse_args() 20 | return ret 21 | 22 | def main(): 23 | global args 24 | args = get_args() 25 | data = open(args.file, 'rb').read() 26 | newdata = pdf_compress(data) 27 | 28 | if len(newdata) < len(data): 29 | newfilename = args.file + '.compressed' 30 | with open(newfilename, 'wb') as fout: 31 | fout.write(newdata) 32 | os.remove(args.file) 33 | os.rename(newfilename, args.file) 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /report/Makefile: -------------------------------------------------------------------------------- 1 | TARGET=report 2 | TEX=xelatex -interaction=nonstopmode -shell-escape 3 | BIBTEX=bibtex 4 | READER=mupdf 5 | 6 | all: rebuild 7 | 8 | rebuild output/$(TARGET).pdf: *.tex *.bib output 9 | cd output && rm -f *.tex *.bib && ln -s ../*.tex ../*.bib . 10 | cd output && $(TEX) $(TARGET).tex 11 | cp output/$(TARGET).pdf ../ -f 12 | cp output/$(TARGET).pdf ./ -f 13 | 14 | 15 | output: 16 | mkdir output -p 17 | cd output && rm -f img src && ln -s ../img ../src ./ 18 | 19 | view: output/$(TARGET).pdf 20 | $(READER) output/$(TARGET).pdf & 21 | 22 | clean: 23 | rm -rf output 24 | 25 | # compile twice to get cross reference right 26 | final : *.tex *.bib output 27 | cd output && rm -f *.tex *.bib && ln -s ../*.tex ../*.bib . 28 | cd output && $(TEX) $(TARGET).tex && $(BIBTEX) $(TARGET).aux && \ 29 | $(TEX) $(TARGET).tex && $(TEX) $(TARGET).tex 30 | 31 | count: 32 | @word_count *.tex 33 | 34 | run: view 35 | 36 | .PHONY: all view clean rebuild run 37 | -------------------------------------------------------------------------------- /report/img/author.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/author.png -------------------------------------------------------------------------------- /report/img/chaos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/chaos.png -------------------------------------------------------------------------------- /report/img/index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/index.png -------------------------------------------------------------------------------- /report/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/logo.png -------------------------------------------------------------------------------- /report/img/reading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/reading.png -------------------------------------------------------------------------------- /report/img/redirect.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/redirect.png -------------------------------------------------------------------------------- /report/img/reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/reference.png -------------------------------------------------------------------------------- /report/img/summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/summary.png -------------------------------------------------------------------------------- /report/img/various.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/report/img/various.png -------------------------------------------------------------------------------- /report/intro.tex: -------------------------------------------------------------------------------- 1 | %File: intro.tex 2 | %Date: Thu Jun 26 13:01:10 2014 +0800 3 | %Author: Yuxin Wu 4 | 5 | \section{项目概述} 6 | \large 7 | \setlength{\baselineskip}{15pt} 8 | \subsection{功能} 9 | 10 | 如今的研究者在搜索论文中, 在种类繁多的论文资源提供平台中, 常常会遇到跳转过多, 文件名混乱等问题. 11 | \begin{figure}[H] 12 | \centering 13 | \includegraphics[width=0.7\textwidth]{img/chaos.png} 14 | \end{figure} 15 | \begin{figure}[H] 16 | \centering 17 | \includegraphics[width=0.7\textwidth]{img/redirect.png} 18 | \end{figure} 19 | 20 | 搭建一个学术搜索引擎.根据用户输入查询内容(论文的标题、作者、内容摘要等),检索并呈现指定的论文. 21 | 在获得版权前提下,提供直接下载链接,并保证下载完成后论文名称正确. 22 | 23 | \subsection{文件结构} 24 | \dirtree{% 25 | .1 /. 26 | .2 common/\DTcomment{功能及主要逻辑的实现}. 27 | .3 fetcher/\DTcomment{各论文提供源爬虫}. 28 | .3 lib/\DTcomment{公共库}. 29 | .3 searcher/\DTcomment{搜索引擎爬虫}. 30 | .3 xpengine/\DTcomment{内容数据库引擎}. 31 | .3 contentsearch.py\DTcomment{内容搜索逻辑实现}. 32 | .3 dbsearch.py\DTcomment{论文数据库搜索逻辑实现}. 33 | .3 pdfprocess.py\DTcomment{pdf处理库实现}. 34 | .3 queryhandler.py\DTcomment{外部请求处理逻辑}. 35 | .3 test-{fetcher,searcher}.py\DTcomment{fetcher与searcher的测试}. 36 | .3 job.py\DTcomment{定义查询的任务上下文}. 37 | .3 ukconfig.py\DTcomment{库功能的配置}. 38 | .3 ukdbconn.py\DTcomment{与mongo数据库交互的基本功能实现}. 39 | .3 uklogger.py\DTcomment{日志系统实现}. 40 | .2 report/\DTcomment{本报告的\LaTeX 源代码}. 41 | .2 manage/\DTcomment{开发用脚本,用于部署服务,操纵数据库等}. 42 | .2 webapi/\DTcomment{HTTP服务器实现}. 43 | .3 api/\DTcomment{RESTful API实现}. 44 | .3 static/\DTcomment{静态页面资源}. 45 | .3 templates/\DTcomment{HTML模板}. 46 | .2 paper-downloader.py\DTcomment{自动搜索下载paper的命令行工具}. 47 | .2 standalone\_server.py\DTcomment{服务器运行入口}. 48 | } 49 | \subsection{依赖} 50 | 运行命令行工具需要如下python包: 51 | \begin{enumerate} 52 | \item requests \footnote{\url{http://docs.python-requests.org/en/latest/}} 53 | \item BeautifulSoup4 \footnote{\url{http://www.crummy.com/software/BeautifulSoup/bs4/doc/}} 54 | \item termcolor \footnote{\url{https://pypi.python.org/pypi/termcolor}} 55 | \end{enumerate} 56 | 57 | 部署服务器需要系统中除python包外, 拥有如下库或软件: 58 | \begin{enumerate} 59 | \item Mongodb安装并运行. 可在\verb|common/ukconfig.py|中配置其位置. 
60 | \item python virtualenv 61 | \item ghostscript 62 | \item libcurl 63 | \item xapian 64 | \item pdf2htmlEx 65 | \item poppler-utils 66 | \end{enumerate} 67 | \subsection{运行方法} 68 | \begin{enumerate} 69 | \item 命令行使用爬虫下载paper: 70 | \begin{lstlisting}[language=bash] 71 | ./paper-downloader.py -t "title of the paper" -d /tmp 72 | \end{lstlisting} 73 | 74 | \item 启动服务器: 75 | 76 | 通过如下命令安装python包依赖, 并部署虚拟环境: 77 | \begin{lstlisting}[language=bash] 78 | $ cd manage 79 | $ ./quickinstall 80 | \end{lstlisting} 81 | 82 | 部署后, 通过\verb|./standalone_server.py|直接运行. 83 | 服务器配置在\verb|manage/api_website_config.py|中. 84 | \end{enumerate} 85 | 86 | 87 | \section{后端实现} 88 | 89 | \subsection{系统架构} 90 | HTTP服务器使用python的Flask框架实现. 91 | 各功能的实现也均使用python. 92 | 系统中, HTTP服务器与后台逻辑完全解耦, 因此在不启动服务时仍然可以通过\verb|paper-downloader.py|调用 93 | sopaper爬虫进行论文的搜索,下载, 方便日常使用. 94 | 95 | 系统后端除服务器外, 主要由三部分构成:Searcher,Parser,Processor. 96 | 其中Searcher主要负责将查询送往搜索引擎, 97 | Parser负责对异构的网页及进行解析和抓取资源, 98 | Processor对获取的论文进行展示前的压缩,转码处理. 99 | 100 | \subsection{查询处理流程} 101 | 102 | 当一个查询进入处理后, 会立刻建立一个任务上下文\verb|JobContext|对象. 103 | 该对象会依次被各个searcher及parser所更新, 以汇总多方的数据. 104 | 105 | 例如, 可以利用google / google scholar的结果来对paper的标题进行纠正. 106 | google搜索所提供的citation信息也可以加以利用. 同时, 打开google所提供的搜索结果后, 107 | 也可能能够进一步parse出citation信息作为补充. 108 | 正是由于不同服务都可能提供不同的数据, 将paper数据全部封装在一个上下文中进行Stream式的处理会使得项目逻辑更加清晰. 109 | 110 | 在searcher阶段, 查询会被送往搜索引擎, 以寻找互联网上的论文资源. 在parser阶段, 111 | 对搜索引擎查找到的异构结果进行解析得到结构化数据. 112 | 过程中一旦获得有用的标识, 就会及时查询本地数据库是否已有相应数据, 以免浪费网络流量. 113 | 在元信息获取,整合完毕后, 将结果返回给前端, 同时后端开始异步下载并通知前端下载进度. 114 | 下载完成后, 对paper进行后处理, 送往前端展示并提供下载链接. 115 | 116 | \subsection{插件式爬虫} 117 | 118 | 所有资源爬虫位于\verb|common/fetcher/|目录下, 爬虫采取插件式设计, 119 | 每个爬虫继承自\verb|FetcherBase|,在系统启动时被系统自动发现, 120 | 并通过decorator进行自注册. 121 | 注册的爬虫实现如下接口: 122 | \begin{enumerate} 123 | \item 爬虫可处理的url格式 124 | \item 爬虫可能提供的paper信息 125 | \item 爬虫优先级 126 | \item 获取各类paper信息 127 | \item 下载paper 128 | \end{enumerate} 129 | 130 | \subsection{论文后处理} 131 | 132 | 当获取到paper的pdf原始文件后, 会对其进行如下后处理: 133 | \begin{enumerate} 134 | \item 压缩: 135 | 136 | 使用libpoppler中的工具对pdf进行压缩, 数据库中的所有论文平均压缩至原大小的55\%, 137 | 尤其对于较大的pdf压缩更为明显 (因为较大的pdf大多是由于生成方式不高效导致的). 138 | 压缩使得从sopaper可以下载到更小的论文. 139 | 140 | \item 转HTML: 141 | 142 | 使用pdf2htmlEx工具将pdf转为分页HTML, 并存储在数据库中. 143 | 展示时, 由前端发送一个或多个页码的http请求获取html再进行渲染. 144 | 145 | \item 转文本: 146 | 147 | 使用libpoppler库将pdf转为纯文本, 用于建立数据索引进行内容搜索. 148 | 149 | \end{enumerate} 150 | \subsection{数据库索引及查询} 151 | 152 | 内容数据库采用xapian的python封装xappy实现. 153 | xapian\footnote{\url{http://xapian.org}}是一个轻量, 高效的搜索引擎后端, 且支持模糊查询, 布尔查询, 多域查询等功能. 154 | 155 | xapian默认使用BM25作为结果的相关程度指标, 考虑到paper搜索的特殊性, 还加入了citation等因素综合评判. 156 | 157 | 同时,系统使用mongodb作为论文数据库, 支持按照标题/作者进行查询, 同时系统还支持基于标题的近似查询. 158 | 159 | \subsection{其他功能} 160 | 161 | 在正规的论文信息之外,Sopaper提供了社交化的功能接口.用户可以对一篇论文进行如下操作: 162 | \begin{itemize} 163 | \item 表态 支持/反对 164 | \item 评分 165 | \item 对论文发表评论,与其他用户探讨 166 | \item 将论文分享到其他社交圈 167 | \end{itemize} 168 | 这是为了基于此构建了论文的大众评价和社区生态系统. 169 | 170 | \section{前端实现} 171 | 172 | \subsection{前端框架及技术} 173 | 174 | 前端使用的CSS框架为Semantic UI. 175 | 176 | 页面数据绑定通过Angular JS实现. 177 | 178 | 网页渲染模板为Flask默认的 Jinja. 179 | 180 | PDF 解析器基于开源项目pdf2htmlex修改. 181 | 182 | 与服务器的通信请求利用Ajax. 
183 | 184 | \subsection{界面设计} 185 | 186 | Logo设计: 187 | 188 | \begin{figure}[H] 189 | \small 190 | \centering 191 | \includegraphics[width=11cm]{img/logo.png} 192 | \caption{Sopaper Logo} 193 | \end{figure} 194 | 195 | Logo由"Sopaper"的英文构成,Paper设计为纸带的形式,取其双关义.搜索引擎主要的对象:堆叠的论文形成一架纸飞机,字母S 构成飞机的轨迹.富有趣味的意向意味着Sopaper 可以为研究工作插上翅膀.Logo 采取了涂鸦风格及明亮色系的配色.切合了"So easy"的口号. 196 | 197 | \begin{figure}[H] 198 | \small 199 | \centering 200 | \includegraphics[width=11cm]{img/index.png} 201 | \caption{主页设计} 202 | \end{figure} 203 | 204 | 主页采取搜索框居中大气简洁式设计.页面下方为作者的漫画化形象. 205 | 206 | \begin{figure}[H] 207 | \small 208 | \centering 209 | \includegraphics[width=11cm]{img/reading.png} 210 | \caption{阅读页面设计} 211 | \end{figure} 212 | 可直接阅读的论文位于页面正中,左侧栏显示垂直搜索信息,包括作者、引用量、下载量、标签、引用列表等,以及提供下载、评论、分享等相关功能 按钮. 213 | 214 | \begin{figure}[H] 215 | \small 216 | \centering 217 | \includegraphics[width=11cm]{img/summary.png} 218 | \caption{摘要页面设计} 219 | \end{figure} 220 | 221 | 摘要页面位于页面右侧栏,单个摘要包含标题及高亮关键字的正文摘要. 222 | 223 | \begin{figure}[H] 224 | \small 225 | \centering 226 | \includegraphics[width=11cm]{img/author.png} 227 | \caption{作者搜索} 228 | \end{figure} 229 | 230 | 231 | \begin{figure}[H] 232 | \small 233 | \centering 234 | \includegraphics[width=11cm]{img/reference.png} 235 | \caption{论文引用情况} 236 | \end{figure} 237 | 238 | 采用了弹出式的Modal,点击链接可直接转到引用源网页. 239 | 240 | \subsection{交互设计} 241 | 242 | 相较于传统的搜索引擎交互模式,Sopaper做出了一些改变性的尝试: 243 | \begin{itemize} 244 | \item \textbf{论文为主}:在交互上,将搜索的单个内容本身而非摘要列表作为搜索引擎的默认主体呈现内容. 245 | \item \textbf{立即阅读}: 提供在网页上直接阅读论文的功能.将 PDF 文件精确转换成可供 Firefox、Google Chrome 等现代浏览器直接浏览的 HTML 文件.用户无须点选,下载等复杂操作便可直接阅读论文,以便用户以最快速度是否是自己想找的论文,简要浏览论文内容,在论文中摘录信息等需求. 246 | \item \textbf{最大似然匹配} :当用户检索的结果可以精确的匹配到某论文的标题时,Sopaper将不返回结果的拍循序列表,而是直接进入该最有可能paper的阅读模式.跳过选择, 令用户尽可能的直达自己想要的搜索结果. 247 | \end{itemize} 248 | 249 | 250 | \section{总结} 251 | 由于开发团队有长期合作的经验,且分工明确,通力合作,最终成功完 252 | 成了项目.另外,这次课程设计的经历在开发之外也给我们带来了许 多宝贵的体验和教育. 253 | 254 | 整个项目的出发点简单而明确,就是要整合和改进分散、繁琐的下载流程,提供一个便捷的定 255 | 位、下载与阅读服务.消灭用户在各种页面之间手动切换、反复确认、 重命名、同步数据的 256 | 冗余工作. 257 | 258 | 如何能够删繁就简,我们两人开动脑筋设计了许多新颖的产品交互形式.经过激烈的几次讨论 259 | ,设计会,以及寻找身边同学做调研,最终决定了这样一种形式,我们在这一推敲的过程中体验到了在构建易用的产品时,苛求细节,反复迭代打磨 260 | 的工匠情怀. 261 | 262 | 目前的Sopaper还有许多待完善之处,如受到下载限制和版权限制的索引量还仍需尽可能的提 263 | 高,如收集到了论文的引用关系,用户的评论与反馈后,还可以 尝试基于此做更多的引用关系 264 | 分析,专家推荐,内容推荐等工作.有待日后的学习与研究中进一步完善. 
265 | -------------------------------------------------------------------------------- /report/mint-defs.tex: -------------------------------------------------------------------------------- 1 | % $File: mint-defs.tex 2 | % $Date: Sun Oct 27 23:49:39 2013 +0800 3 | % $Author: wyx 4 | 5 | 6 | % \inputmintedConfigured[additional minted options]{lang}{file path}{ 7 | \newcommand{\inputmintedConfigured}[3][]{\inputminted[fontsize=\footnotesize, 8 | label=#3,linenos,frame=lines,framesep=0.8em,tabsize=4,#1]{#2}{#3}} 9 | 10 | % \phpsrc[additional minted options]{file path}: show highlighted php source 11 | \newcommand{\phpsrc}[2][]{\inputmintedConfigured[#1]{php}{#2}} 12 | % \phpsrcpart[additional minted options]{file path}{first line}{last line}: show part of highlighted php source 13 | \newcommand{\phpsrcpart}[4][]{\phpsrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 14 | % \phpsrceg{example id} 15 | \newcommand{\phpeg}[1]{\inputminted[startinline, 16 | firstline=2,lastline=2]{php}{res/php-src-eg/#1.php}} 17 | 18 | \newcommand{\txtsrc}[2][]{\inputmintedConfigured[#1]{text}{#2}} 19 | \newcommand{\txtsrcpart}[4][]{\txtsrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 20 | 21 | \newcommand{\pysrc}[2][]{\inputmintedConfigured[#1]{py}{#2}} 22 | \newcommand{\pysrcpart}[4][]{\pysrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 23 | 24 | \newcommand{\confsrc}[2][]{\inputmintedConfigured[#1]{squidconf}{#2}} 25 | \newcommand{\confsrcpart}[4][]{\confsrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 26 | 27 | \newcommand{\cppsrc}[2][]{\inputmintedConfigured[#1]{cpp}{#2}} 28 | \newcommand{\cppsrcpart}[4][]{\cppsrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 29 | 30 | \newcommand{\matsrc}[2][]{\inputmintedConfigured[#1]{matlab}{#2}} 31 | \newcommand{\matsrcpart}[4][]{\matsrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 32 | 33 | \newcommand{\versrc}[2][]{\inputmintedConfigured[#1]{verilog}{#2}} 34 | \newcommand{\versrcpart}[4][]{\versrc[firstline=#3,firstnumber=#3,lastline=#4,#1]{#2}} 35 | -------------------------------------------------------------------------------- /report/refs.bib: -------------------------------------------------------------------------------- 1 | @ONLINE{beer, 2 | title ={Wikipedia page for Beer-Lambert Law}, 3 | url ={http://en.wikipedia.org/wiki/Beer-Lambert_law} 4 | } 5 | @ONLINE{fresnel, 6 | title ={Wikipedia page for Fresnel Equations}, 7 | url ={https://en.wikipedia.org/wiki/Fresnel_equations} 8 | } 9 | @ONLINE{triangle, 10 | title ={Ray-triangle intersection}, 11 | author={Brian Curless}, 12 | url ={http://www.cs.washington.edu/education/courses/cse457/09au/lectures/triangle_intersection.pdf} 13 | } 14 | 15 | @ONLINE{triangle_code, 16 | title={Lighthouse3d Page for Ray-Triangle Intersection}, 17 | url={http://www.lighthouse3d.com/tutorials/maths/ray-triangle-intersection/} 18 | 19 | } 20 | 21 | @ONLINE{phong, 22 | title ={Wikipedia page for Phong Reflection Model}, 23 | url ={http://en.wikipedia.org/wiki/Phong_reflection_model} 24 | } 25 | 26 | @ONLINE{pathtracing, 27 | title ={Wikipedia page for Path Tracing}, 28 | url ={http://en.wikipedia.org/wiki/Path_tracing} 29 | } 30 | 31 | @inproceedings{aabb, 32 | title={An efficient and robust ray-box intersection algorithm}, 33 | author={Williams, Amy and Barrus, Steve and Morley, R Keith and Shirley, Peter}, 34 | booktitle={ACM SIGGRAPH 2005 Courses}, 35 | pages={9}, 36 | year={2005}, 37 | organization={ACM} 38 | } 39 | 40 | 41 | @inproceedings{kdtree, 42 | title={On building fast kd-trees for ray tracing, and 
on doing that in O (N log N)}, 43 | author={Wald, Ingo and Havran, Vlastimil}, 44 | booktitle={Interactive Ray Tracing 2006, IEEE Symposium on}, 45 | pages={61--69}, 46 | year={2006}, 47 | organization={IEEE} 48 | } 49 | 50 | @article{mesh, 51 | title={A simple, fast, and effective polygon reduction algorithm}, 52 | author={Melax, Stan}, 53 | journal={Game Developer}, 54 | volume={5}, 55 | number={11}, 56 | pages={44--49}, 57 | year={1998} 58 | } 59 | 60 | @article{montecarlo, 61 | title={Monte Carlo path tracing}, 62 | author={Koivisto, Jukka} 63 | } 64 | -------------------------------------------------------------------------------- /report/report.tex: -------------------------------------------------------------------------------- 1 | % $File: report.tex 2 | % $Date: Thu Jun 26 13:02:24 2014 +0800 3 | % $Author: wyx 4 | 5 | \documentclass[11pt,a4paper]{article} 6 | \usepackage{fontspec,amsmath,amssymb,zhspacing,verbatim,minted,listings,zhmath} 7 | \usepackage{dirtree} 8 | \usepackage{titlesec, titletoc} 9 | \usepackage{enumerate} 10 | \usepackage[hyperfootnotes=false,colorlinks,linkcolor=blue,anchorcolor=blue,citecolor=blue]{hyperref} 11 | \usepackage{indentfirst} 12 | \usepackage{float} % don't automatically change location of figure [H] 13 | \usepackage{chngpage} % use \changetext to change page size 14 | \usepackage{caption}\captionsetup{hypcap=true} % ref to jump to object instead of caption 15 | \newfontfamily\zhfont[BoldFont=SimHei,ItalicFont=KaiTi_GB2312]{SimSun} 16 | \lstset{keywordstyle=\color{blue!70}, commentstyle=\color{red!50!green!50!blue!50},frame=shadowbox,rulesepcolor=\color{red!20!green!20!blue!20}, 17 | basicstyle=\footnotesize\ttfamily} 18 | 19 | 20 | \zhspacing 21 | \usepackage[framemethod=TikZ]{mdframed} 22 | \mdfdefinestyle{MyFrame}{% 23 | linecolor=black, 24 | outerlinewidth=0.5pt, 25 | roundcorner=10pt, 26 | innertopmargin=\baselineskip, 27 | innerbottommargin=\baselineskip, 28 | innerrightmargin=20pt, 29 | innerleftmargin=20pt, 30 | backgroundcolor=gray!50!white} 31 | 32 | 33 | 34 | \setlength{\parindent}{2em} 35 | 36 | \usepackage{fancyhdr} 37 | \changetext{}{2.2cm}{-1.1cm}{-1.1cm}{} 38 | \pagestyle{fancy} 39 | \setlength{\headheight}{15.2pt} 40 | \lhead[]{}\rhead[]{} 41 | \fancyhead[C]{\emph{Sopaper}} 42 | 43 | 44 | %use cell in tabular 45 | \newcommand{\tabincell}[2]{\begin{tabular}{@{}#1@{}}#2\end{tabular}} 46 | 47 | %thick shline 48 | \newlength\savewidth 49 | \newcommand\shline{\noalign{\global\savewidth\arrayrulewidth\global\arrayrulewidth 1pt} 50 | \hline 51 | \noalign{\global\arrayrulewidth\savewidth}} 52 | 53 | 54 | \renewcommand{\abstractname}{摘要} 55 | \renewcommand{\contentsname}{目录} 56 | \renewcommand{\tablename}{表} 57 | \renewcommand{\figurename}{图} 58 | \newcommand{\figref}[1]{\hyperref[fig:#1]{图\ref*{fig:#1}}} 59 | \newcommand{\secref}[1]{\hyperref[sec:#1]{\ref*{sec:#1}节}} 60 | \newcommand{\tabref}[1]{\hyperref[tab:#1]{表\ref*{tab:#1}}} 61 | 62 | \input{mint-defs.tex} 63 | 64 | \title{Sopaper - SOA课程设计报告} 65 | \author{吴育昕~李铁峥~汪忆辰} 66 | \date{} 67 | 68 | 69 | \begin{document} 70 | %\fontsize{10pt}{\baselineskip} 71 | %\selectfont 72 | \maketitle 73 | \tableofcontents 74 | 75 | %\begin{abstract} 76 | 77 | %{\bf 关键词} 78 | %\end{abstract} 79 | 80 | \titleformat*{\section}{\centering\Large\bf} 81 | 82 | \input{intro} 83 | \end{document} 84 | 85 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #from distutils.core import setup 2 | from setuptools import setup 3 | kwargs = dict( 4 | name = 'sopaper', 5 | version = '0.8', 6 | description = 'Automatically search and download paper', 7 | author = 'Yuxin Wu', 8 | author_email = 'ppwwyyxxc@gmail.com', 9 | url = 'https://github.com/ppwwyyxx/sopaper', 10 | keywords = ['Utility'], 11 | packages = ['sopaper', 'sopaper.fetcher', 12 | 'sopaper.lib', 'sopaper.searcher'], 13 | entry_points={ 14 | 'console_scripts': ['sopaper = sopaper.__main__:main'] 15 | }, 16 | include_package_data=True, 17 | install_requires=['termcolor', 'requests', 'beautifulsoup4', 'python-magic'] 18 | ) 19 | setup(**kwargs) 20 | -------------------------------------------------------------------------------- /sopaper.py: -------------------------------------------------------------------------------- 1 | sopaper/__main__.py -------------------------------------------------------------------------------- /sopaper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/sopaper/__init__.py -------------------------------------------------------------------------------- /sopaper/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: __main__.py 4 | # Author: Yuxin Wu 5 | 6 | # Command line script to use paper-downloader 7 | # You don't need to setup virtualenv to use script 8 | # But you'll need requests and BeautifulSoup4 installed 9 | 10 | import sys 11 | import os 12 | import re 13 | import os.path 14 | import argparse 15 | from multiprocessing import Pool 16 | 17 | # Config must be set at the beginning 18 | from sopaper import ukconfig 19 | ukconfig.download_method = 'wget' 20 | ukconfig.USE_DB = False 21 | ukconfig.LOG_DIR = None 22 | 23 | from sopaper import searcher 24 | from sopaper.searcher import searcher_run 25 | from sopaper.job import JobContext, SearchResult 26 | from sopaper import fetcher 27 | from sopaper.lib.pdfutil import pdf_compress 28 | from sopaper.lib.textutil import finalize_filename, md5 29 | from sopaper.uklogger import * 30 | 31 | def get_args(): 32 | desc = 'SoPaper command line tool -- ' \ 33 | 'Fully Automated Paper Searcher & Downloader' \ 34 | '\nSoPaper, So Easy' 35 | parser = argparse.ArgumentParser(description = desc) 36 | 37 | parser.add_argument('title', nargs='+', help='Title of the paper or URL of an arxiv/ieee/dlacm page') 38 | parser.add_argument('-u', '--url', action='store_true', help='do not download, print URL only') 39 | parser.add_argument('-d', '--directory', 40 | help='Output Directory (default: current directory)', 41 | required=False, default='.') 42 | parser.add_argument('-o', '--output', 43 | help='Manually specify a output file, rather than automatically determine the correct name.') 44 | ret = parser.parse_args() 45 | ret.title = ' '.join(ret.title) 46 | return ret 47 | 48 | def main(): 49 | global args 50 | args = get_args() 51 | query = args.title.strip() 52 | directory = args.directory 53 | 54 | 55 | searchers = searcher.register_searcher.get_searcher_list() 56 | parsers = fetcher.register_parser.get_parser_list() 57 | download_candidates = [] 58 | if re.match('^http[s]?://', 
query): 59 | # skip search 60 | ctx = JobContext("") 61 | sr = SearchResult(None, query) 62 | for parser in parsers: 63 | if parser.can_handle(sr): 64 | parser.fetch_info(ctx, sr) # will update title 65 | download_candidates.append((parser, sr)) 66 | else: 67 | #query = "Distinctive image features from scale-invariant keypoint" 68 | ctx = JobContext(query) 69 | 70 | search_args = list(zip(searchers, [ctx] * len(searchers))) 71 | pool = Pool() 72 | as_results = [pool.apply_async(searcher_run, arg) for arg in search_args] 73 | #results = [searcher_run(*arg) for arg in search_args] # for debug 74 | 75 | url_seen = set() # avoid repeated url 76 | for s in as_results: 77 | s = s.get(ukconfig.PYTHON_POOL_TIMEOUT) 78 | if s is None: 79 | continue 80 | ctx.update_meta_dict(s['ctx_update']) 81 | ctx.try_update_title_from_search_result(s) 82 | 83 | for sr in s['results']: 84 | if sr.url in url_seen: 85 | continue 86 | url_seen.add(sr.url) 87 | for parser in parsers: 88 | if parser.can_handle(sr): 89 | parser.fetch_info(ctx, sr) # will update title 90 | download_candidates.append((parser, sr)) 91 | pool.terminate() 92 | 93 | download_candidates = sorted( 94 | download_candidates, 95 | key=lambda x: x[0].priority, 96 | reverse=True) 97 | 98 | if ctx.title: 99 | ctx.title = finalize_filename(ctx.title) 100 | else: 101 | log_info("Failed to guess paper title!") 102 | ctx.title = "Unnamed Paper" 103 | if args.url: 104 | # url mode 105 | print(("Results for {}:".format(ctx.title))) 106 | for (_, sr) in download_candidates: 107 | print((sr.url)) 108 | return 109 | 110 | for (parser, sr) in download_candidates: 111 | data = parser.download(sr) 112 | if not data: 113 | continue 114 | data = pdf_compress(data) 115 | 116 | filename = os.path.join(directory, ctx.title + ".pdf") 117 | if os.path.exists(filename): 118 | log_err("File \"{}\" exists! overwrite? (y/n)".format(os.path.basename(filename))) 119 | resp = input() 120 | if resp not in ['y', 'Y']: 121 | log_info("No file written. 
Exiting...") 122 | break 123 | with open(filename, 'wb') as f: 124 | f.write(data) 125 | if args.output: 126 | os.rename(filename, args.output) 127 | break 128 | else: 129 | log_err("Failed to download {0}".format(ctx.title)) 130 | return 131 | if ctx.meta.get('bibtex'): 132 | log_info("Bibtex:\n{}".format(ctx.meta['bibtex'])) 133 | if ctx.meta.get('author'): 134 | log_info("Author: {0}".format(ctx.meta['author'])) 135 | if ctx.meta.get('citecnt'): 136 | log_info("Cite count: {0}".format(ctx.meta['citecnt'])) 137 | log_info("Successfully downloaded to {0}".format(filename)) 138 | 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /sopaper/authorfetch.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: authorfetch.py 4 | # Date: 五 6月 13 18:06:08 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from collections import defaultdict 8 | 9 | from .lib.mailutil import sendmail 10 | from .ukdbconn import get_mongo 11 | 12 | def get_paper_list(name): 13 | db = get_mongo('paper') 14 | res = list(db.find({'author': name.lower(), 'pdf': {'$exists': True}}, 15 | {'title': 1})) 16 | def transform(r): 17 | return (r["_id"], r["title"]) 18 | return list(map(transform, res)) 19 | 20 | def process_fetch_author(name, email): 21 | l = get_paper_list(name) 22 | sendmail(email, name, l) 23 | print("Mail Sent to {0}".format(email)) 24 | 25 | 26 | if __name__ == '__main__': 27 | print(get_paper_list('jie tang')) 28 | -------------------------------------------------------------------------------- /sopaper/contentsearch.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: contentsearch.py 4 | # Date: 五 6月 13 16:55:19 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import tempfile 8 | import os 9 | from threading import Lock, Condition 10 | from .lib.timeout import timeout, timeout_command 11 | 12 | from .xpengine.indexer import XapianIndexer 13 | from .xpengine.searcher import XapianSearcher 14 | from .lib.singleton import Singleton 15 | from . 
import ukconfig 16 | from .ukdbconn import get_mongo 17 | from .uklogger import * 18 | from .lib.textutil import filter_nonascii 19 | from .lib.pdfutil import pdf2text 20 | 21 | DB_DIR = ukconfig.XP_DB_DIR 22 | 23 | 24 | class SoPaperSearcher(object, metaclass=Singleton): 25 | """ Search by content of paper 26 | Don't instantiate me 27 | """ 28 | 29 | def __init__(self): 30 | if not os.path.isdir(DB_DIR): 31 | SoPaperIndexer().rebuild() 32 | self.searcher = XapianSearcher(DB_DIR) 33 | 34 | def search(self, query, offset=0, 35 | page_size=ukconfig.SEARCH_PAGE_SIZE, 36 | summary_len=ukconfig.SEARCH_SUMMARY_LEN): 37 | res = self.searcher.search(query, offset, page_size, summary_len) 38 | return res 39 | 40 | class SoPaperIndexer(object, metaclass=Singleton): 41 | """ Don't instantiate me 42 | """ 43 | 44 | def __init__(self): 45 | self.indexer = XapianIndexer(DB_DIR) 46 | 47 | def _do_add_paper(self, doc): 48 | try: 49 | self.indexer.add_doc(doc) 50 | except: 51 | log_exc("Exception in add_paper") 52 | log_info("Error with this doc: {0}".format(doc['id'])) 53 | 54 | def add_paper(self, doc): 55 | assert doc.get('text') 56 | assert doc.get('title') 57 | assert doc.get('id') 58 | self._do_add_paper(doc) 59 | self.indexer.flush() 60 | SoPaperSearcher().searcher.reopen() 61 | 62 | def rebuild(self): 63 | self.indexer.clear() 64 | 65 | db = get_mongo('paper') 66 | itr = db.find({}, {'pdf': 1, 'title': 1, 'text': 1}) 67 | for res in itr: 68 | text = res.get('text') 69 | if not text: 70 | log_info("About to add text for paper {0}".format(res['_id'])) 71 | try: 72 | data = res['pdf'] 73 | text = pdf2text(data) 74 | except KeyError: 75 | log_err("No pdf in pid={0},title={1}".format( 76 | res['_id'], res['title'])) 77 | continue 78 | except Exception: 79 | log_exc("Exception in pdf2text") 80 | 81 | db.update({'_id': res['_id']}, {'$set': {'text': text}}) 82 | doc = {'text': text, 83 | 'title': res['title'], 84 | 'id': res['_id'] 85 | } 86 | self._do_add_paper(doc) 87 | self.indexer.flush() 88 | 89 | indexer_lock = Lock() 90 | def do_add_paper(doc): 91 | indexer_lock.acquire() 92 | idxer = SoPaperIndexer() 93 | idxer.add_paper(doc) 94 | SoPaperSearcher().searcher.reopen() 95 | indexer_lock.release() 96 | 97 | if __name__ == '__main__': 98 | print("Rebuilding...") 99 | SoPaperIndexer().rebuild() 100 | -------------------------------------------------------------------------------- /sopaper/dbsearch.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: dbsearch.py 4 | # Date: 六 6月 14 03:18:57 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import operator 8 | 9 | from .ukdbconn import get_mongo 10 | from .uklogger import * 11 | from .lib.textutil import title_beautify, levenshtein 12 | 13 | SEARCH_RETURN_FIELDS = {'view_cnt': 1, 'download_cnt': 1, 14 | 'title': 1, 'page': 1, 'source': 1, 15 | 'page_url': 1, 16 | 'author': 1, 'bibtex': 1, 'abstract': 1, 17 | 'references': 1, 'citedby': 1, 18 | 'comments': {'$slice': 10}, 19 | 'cmt_count': 1} 20 | 21 | def beautify_results(): 22 | def wrap(func): 23 | def call(query): 24 | res = func(query.lower()) 25 | for k in res: 26 | k['title'] = title_beautify(k['title']) 27 | return res 28 | return call 29 | return wrap 30 | 31 | @beautify_results() 32 | def search_exact(query): 33 | db = get_mongo('paper') 34 | res = list(db.find({'title': query}, SEARCH_RETURN_FIELDS)) 35 | return res 36 | 37 | @beautify_results() 38 | def search_startswith(query): 39 | db = 
get_mongo('paper') 40 | res = list(db.find({'title': 41 | {'$regex': '^{0}'.format(query) } }, 42 | SEARCH_RETURN_FIELDS)) 43 | res = [k for k in res if levenshtein(k['title'], query) < 10] 44 | print(res) 45 | return res 46 | 47 | @beautify_results() 48 | def search_regex(regex): 49 | db = get_mongo('paper') 50 | res = list(db.find({'title': {'$regex': 51 | '{0}'.format(query) } 52 | }, SEARCH_RETURN_FIELDS)) 53 | return res 54 | 55 | 56 | # XXX Hack!! 57 | # Similar Search in cached memory 58 | 59 | all_titles = [] 60 | def similar_search(query): 61 | """ return one result that is most similar to query""" 62 | ret = [] 63 | query = query.strip().lower() 64 | for cand in all_titles: 65 | dist = levenshtein(query, cand[0]) 66 | if dist < 3: 67 | ret.append((cand, dist)) 68 | if not ret: 69 | return None 70 | res = max(ret, key=operator.itemgetter(1)) 71 | 72 | db = get_mongo('paper') 73 | res = db.find_one({'_id': res[0][1]}, SEARCH_RETURN_FIELDS) 74 | return res 75 | 76 | 77 | def add_title_for_similar_search(cand): 78 | """ cand = (title, id) """ 79 | all_titles.append((cand[0].strip().lower(), cand[1])) 80 | 81 | def init_title_for_similar_search(): 82 | if len(all_titles) > 0: 83 | return 84 | db = get_mongo('paper') 85 | itr = db.find({}, {'title': 1}) 86 | for cand in itr: 87 | add_title_for_similar_search((cand['title'], cand['_id'])) 88 | 89 | init_title_for_similar_search() 90 | 91 | if __name__ == '__main__': 92 | print(search_exact(title_beautify('Intriguing properties of neural networks'))) 93 | -------------------------------------------------------------------------------- /sopaper/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | #!../../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: __init__.py 4 | # Author: Yuxin Wu 5 | 6 | from ..lib.downloader import ProgressPrinter 7 | from ..lib.textutil import title_beautify 8 | from ..lib.ukutil import import_all_modules, ensure_unicode 9 | from ..lib.pdfutil import check_buf_pdf, check_legal_pdf 10 | from ..uklogger import * 11 | from .. import ukconfig 12 | from ..job import SearchResult 13 | from ..lib.exc import RecoverableErr 14 | 15 | if ukconfig.USE_DB: 16 | import ukdbconn 17 | from dbsearch import search_exact 18 | 19 | from functools import wraps 20 | import re 21 | 22 | class register_parser(object): 23 | parser_dict = {} 24 | """ save the original parser func""" 25 | 26 | def __init__(self, *args, **kwargs): 27 | self.name = kwargs.pop('name') 28 | self.url_match = re.compile(kwargs.pop('urlmatch')) 29 | 30 | """ priority of this parser, higher for those unlikely to be blocked""" 31 | self.priority = kwargs.pop('priority', 5) 32 | 33 | self.type_match = kwargs.pop('typematch', None) 34 | self.legal = kwargs.pop('legal', True) 35 | 36 | """ what meta field this fetcher might provide""" 37 | self.support_meta_field = kwargs.pop('meta_field', []) 38 | 39 | """ whether this fetcher should be considered multiple times l 40 | during a search. 
41 | For now, only 'DirectPdfParser' shall be considered multiple times 42 | """ 43 | self.repeatable = kwargs.pop('repeatable', False) 44 | 45 | assert self.name not in self.parser_dict 46 | 47 | @staticmethod 48 | def get_parser_list(): 49 | lst = list(register_parser.parser_dict.values()) 50 | return sorted(lst, key=lambda x: x.priority, reverse=True) 51 | 52 | def __call__(self, fetcher_cls): 53 | """ fetcher_cls: subclass of FetcherBase to be used 54 | """ 55 | self.fetcher_cls = fetcher_cls 56 | 57 | @wraps(fetcher_cls) 58 | def wrapper(res): 59 | assert isinstance(res, SearchResult) 60 | try: 61 | fetcher = fetcher_cls(res) 62 | fetcher.name = self.name 63 | fetcher.get_title() 64 | fetcher.get_meta() 65 | return fetcher 66 | except KeyboardInterrupt: 67 | raise 68 | except Exception: 69 | log_exc("Error in parser '{0}' with url '{1}'". 70 | format(self.name, res.url)) 71 | return None 72 | self.parser_dict[self.name] = self 73 | self.cb = wrapper 74 | return wrapper 75 | 76 | def can_handle(self, sr): 77 | if (self.type_match is None 78 | or self.type_match != sr.type) and \ 79 | len(self.url_match.findall(sr.url)) == 0: 80 | return False 81 | return True 82 | 83 | def fetch_info(self, ctx, sr): 84 | """ run this parser against the SearchResult given 85 | return True/False indicate success, 86 | will update ctx metadata and ctx.success, 87 | 88 | on success, either ctx will be filled with fetched data, 89 | or ctx.existing will contain a existing doc in db. 90 | """ 91 | url = sr.url 92 | log_info("Parsing url {0} with parser {1}". 93 | format(url, self.name)) 94 | fetcher_inst = self.cb(sr) 95 | if fetcher_inst is None: 96 | return False 97 | 98 | newt = fetcher_inst.get_title() 99 | if newt and ctx.update_new_title(newt): 100 | # check updated title against db before download 101 | if ukconfig.USE_DB: 102 | doc = search_exact(newt) 103 | if doc: 104 | ctx.existing = doc[0] 105 | ukdbconn.update_meta(doc[0]['_id'], fetcher_inst.get_meta()) 106 | return True 107 | meta = fetcher_inst.get_meta() 108 | if len(meta): 109 | log_info("Fetcher {} Update Metadata: {}".format( 110 | fetcher_inst.name, str(list(meta.keys())))) 111 | ctx.update_meta_dict(meta) 112 | return True 113 | 114 | def download(self, sr, progress_updater=None): 115 | """ return binary data or None""" 116 | fetcher_inst = self.fetcher_cls(sr) 117 | 118 | succ = fetcher_inst.download(progress_updater) 119 | if not succ: 120 | return None 121 | 122 | data = fetcher_inst.get_data() 123 | ft = check_buf_pdf(data) 124 | if ft == True: 125 | ft = check_legal_pdf(data) 126 | if ft == True: 127 | return data 128 | else: 129 | log_err("Found a broken pdf") 130 | else: 131 | log_err("Wrong Format.") 132 | 133 | 134 | if __name__ != '__main__': 135 | import_all_modules(__file__, __name__) 136 | 137 | -------------------------------------------------------------------------------- /sopaper/fetcher/arxiv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: arxiv.py 4 | # Date: Thu Jun 18 23:32:54 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from . 
import register_parser 8 | from .base import FetcherBase, direct_download 9 | from ..uklogger import * 10 | from ..ukconfig import BS_PARSER 11 | 12 | import re 13 | from bs4 import BeautifulSoup 14 | import requests 15 | 16 | #ARXIV_PAT = re.compile('arxiv\.org/[^/]*/(?P.*)') 17 | 18 | @register_parser(name='arxiv.org', urlmatch='arxiv.org', 19 | meta_field=['author', 'bibtex', 'abstract'], 20 | priority=7) 21 | class Arxiv(FetcherBase): 22 | def _do_pre_parse(self): 23 | if 'pdf' in self.url: # change /pdf/xxx.xxx to /abs/xxx.xxx 24 | self.url = self.url.replace('pdf', 'abs') 25 | if self.url.endswith('.abs'): 26 | self.url = self.url[:-4] 27 | text = requests.get(self.url).text.encode('utf-8') 28 | self.soup = BeautifulSoup(text, BS_PARSER) 29 | 30 | def _do_download(self, updater): 31 | full_text_div = self.soup.findAll('div', attrs={'class': 'full-text'})[0] 32 | link = full_text_div.findAll('li')[0] 33 | partial_link = link.children.next().get('href') 34 | 35 | prefix = 'http://arxiv.org' 36 | if 'cn.arxiv.org' in self.url: # handle cn.arxiv 37 | prefix = 'http://cn.arxiv.org' 38 | return direct_download(prefix + partial_link, updater) 39 | 40 | #match = ARXIV_PAT.search(self.url).groupdict() 41 | #pid = match['id'] 42 | #pdflink = "http://arxiv.org/pdf/{0}.pdf".format(pid) 43 | #return direct_download(pdflink, updater) 44 | 45 | def _do_get_title(self): 46 | title = self.soup.findAll(attrs={'name': 'citation_title'})[0] 47 | title = title.get('content') 48 | return title 49 | 50 | def _do_get_meta(self): 51 | meta = {} 52 | try: 53 | authors = self.soup.findAll(attrs={'class': 'authors'})[0] 54 | authors = authors.findAll('a') 55 | author = [a.text for a in authors] 56 | meta['author'] = author 57 | except: 58 | pass 59 | 60 | try: 61 | abstract = self.soup.findAll(attrs={'class': 'abstract mathjax'})[0] 62 | abstract = abstract.text.strip() 63 | abstract = abstract[abstract.find(':')+1:].strip() 64 | meta['abstract'] = abstract 65 | except: 66 | pass 67 | 68 | try: 69 | bibtex_url = self.soup.findAll(attrs={'title': 'DBLP bibtex record'}) 70 | bibtex_url = bibtex_url[0].get('href') 71 | bibtex_text = requests.get(bibtex_url).text.encode('utf-8') 72 | bibtex_soup = BeautifulSoup(bibtex_text, BS_PARSER) 73 | pre = bibtex_soup.findAll('pre')[0] 74 | bibtex = pre.text 75 | meta['bibtex'] = bibtex 76 | except: 77 | pass 78 | return meta 79 | -------------------------------------------------------------------------------- /sopaper/fetcher/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: base.py 4 | # Date: Thu Jun 25 16:29:22 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from ..uklogger import * 8 | from ..lib.downloader import direct_download, ProgressPrinter 9 | from ..lib.exc import RecoverableErr 10 | from ..lib.textutil import title_beautify 11 | 12 | class FetcherBase(object): 13 | 14 | def __init__(self, search_result): 15 | self.search_result = search_result 16 | self.url = self.search_result.url 17 | self.title = None 18 | self.meta = None 19 | self.name = None 20 | self.data = None 21 | 22 | self.headers = None 23 | try: 24 | self._do_pre_parse() 25 | except Exception as e: 26 | log_exc("Exception in pre-parse") 27 | 28 | def _do_pre_parse(self): 29 | """ parse right after getting the url""" 30 | 31 | def _do_download(self, updater): 32 | """ return data, or raise""" 33 | 34 | def _do_get_meta(self): 35 | """ return dict""" 36 | pass 37 | 38 | def _do_get_title(self): 39 | 
""" return string, or raise""" 40 | pass 41 | 42 | def get_meta(self): 43 | if self.meta is not None: 44 | return self.meta 45 | self.meta = self._do_get_meta() 46 | return self.meta 47 | 48 | def get_title(self): 49 | if self.title is not None: 50 | if not self.title: 51 | return None 52 | else: 53 | return self.title 54 | try: 55 | self.title = title_beautify(self._do_get_title()) 56 | return self.title 57 | except: 58 | self.title = "" 59 | return None 60 | 61 | def download(self, updater=None): 62 | """ save self.data""" 63 | if updater is None: 64 | updater = ProgressPrinter() 65 | try: 66 | self.data = self._do_download(updater) 67 | return True 68 | except KeyboardInterrupt: 69 | raise 70 | except RecoverableErr as e: 71 | log_err(str(e)) 72 | return False 73 | except Exception: 74 | log_exc("Error while downloading with" \ 75 | "url '{0}'".format(self.url)) 76 | return False 77 | 78 | def get_data(self): 79 | if self.data is None: 80 | raise Exception("Cannot call get_data() before download succeeds") 81 | return self.data 82 | -------------------------------------------------------------------------------- /sopaper/fetcher/direct.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: direct.py 4 | # Date: 一 6月 09 14:29:17 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from . import register_parser 8 | from .base import FetcherBase, direct_download 9 | 10 | @register_parser(name='direct link', urlmatch='.*\.pdf', 11 | typematch='directpdf', repeatable=True, 12 | priority=10) 13 | class DirectPdf(FetcherBase): 14 | def _do_download(self, updater): 15 | return direct_download(self.url, updater) 16 | 17 | def _do_get_title(self): 18 | raise Exception("Cannot get title from direct pdf link" ) 19 | 20 | def _do_get_meta(self): 21 | return {} 22 | -------------------------------------------------------------------------------- /sopaper/fetcher/dlacm.py: -------------------------------------------------------------------------------- 1 | #!../../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: dlacm.py 4 | # Date: Thu Jun 25 16:33:15 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | import re 8 | from . import register_parser, RecoverableErr 9 | from .base import FetcherBase 10 | from ..lib.downloader import wget_download 11 | from ..uklogger import * 12 | from .. import ukconfig 13 | from ..ukconfig import BS_PARSER 14 | 15 | from urllib.parse import urlparse 16 | import requests 17 | from bs4 import BeautifulSoup 18 | 19 | HOSTNAME = 'dl.acm.org' 20 | DEFAULT_TIMEOUT = '300.0' # 5 minutes 21 | 22 | def get_headers(url): 23 | return {'Host': urlparse(url).netloc, 24 | 'User-Agent': ukconfig.USER_AGENT, 25 | 'Connection': 'Keep-Alive' 26 | } 27 | 28 | # Bug in requests: 29 | # requests would fail to download paper from dl.acm.org. 
use wget instead 30 | def download(url, updater): 31 | log_info("Custom Directly Download with URL {0} ...".format(url)) 32 | headers = get_headers(url) 33 | 34 | resp = requests.get(url, headers=headers, allow_redirects=False) 35 | pdfurl = resp.headers.get('location') 36 | if pdfurl: 37 | headers['Host'] = urlparse(pdfurl).netloc 38 | return wget_download(pdfurl, updater, headers) 39 | else: 40 | return wget_download(url, updater) 41 | 42 | @register_parser(name='dl.acm.org', urlmatch='dl.acm.org', 43 | meta_field=['author', 'bibtex', 'citedby', 'references', 44 | 'abstract'], 45 | priority=2) 46 | class DLAcm(FetcherBase): 47 | def _do_pre_parse(self): 48 | self.text = requests.get( 49 | self.url, headers=get_headers(self.url)).text.encode('utf-8') 50 | #with open("/tmp/b.html", 'w') as f: 51 | #f.write(self.text) 52 | #text = open("/tmp/b.html").read() 53 | self.soup = BeautifulSoup(self.text, BS_PARSER) 54 | 55 | def _do_download(self, updater): 56 | pdf = self.soup.findAll(attrs={'name': 'FullTextPDF'}) 57 | if not pdf: 58 | pdf = self.soup.findAll(attrs={'name': 'FullTextPdf'}) 59 | if pdf: 60 | try: 61 | url = pdf[0].get('href') 62 | url = 'http://{0}/'.format(HOSTNAME) + url 63 | log_info("dl.acm origin url: {0}".format(url)) 64 | r = requests.get(url, headers=get_headers(url), allow_redirects=False) 65 | pdfurl = r.headers.get('location') 66 | assert pdfurl 67 | except: 68 | # probably something need to be fixed 69 | log_exc('') 70 | else: 71 | raise RecoverableErr("dl.acm has no available download at {0}".format(self.url)) 72 | return download(pdfurl, updater) 73 | 74 | def _do_get_title(self): 75 | titles = self.soup.findAll(attrs={'name': 'citation_title'}) 76 | return titles[0]['content'] 77 | 78 | def _do_get_meta(self): 79 | meta = {} 80 | try: 81 | log_info("Getting author...") 82 | authors = self.soup.findAll( 83 | attrs={'title': 'Author Profile Page'}) 84 | author = [a.text for a in authors] 85 | meta['author'] = author 86 | except KeyboardInterrupt: 87 | raise 88 | except: 89 | pass 90 | 91 | try: 92 | log_info("Getting abstract...") 93 | abstract_url = re.findall(r'\'tab_abstract.+\d+\'', self.text)[0][1:-1] 94 | abstract_text = requests.get('http://{0}/'.format(HOSTNAME) + abstract_url).text.encode('utf-8') 95 | abstract_soup = BeautifulSoup(abstract_text, BS_PARSER) 96 | abstract = abstract_soup.findAll('p')[0].text 97 | meta['abstract'] = abstract 98 | except KeyboardInterrupt: 99 | raise 100 | except: 101 | pass 102 | 103 | try: 104 | log_info("Getting refs ...") 105 | ref_url = re.findall(r'\'tab_references.+\d+\'', self.text)[0][1:-1] 106 | ref_text = requests.get('http://{0}/'.format(HOSTNAME) + ref_url).text.encode('utf-8') 107 | ref_soup = BeautifulSoup(ref_text, BS_PARSER) 108 | trs = ref_soup.findAll('tr') 109 | reference = [] 110 | for tr in trs: 111 | records = tr.findAll('a') 112 | if len(records) > 0: 113 | href = 'http://{0}/'.format(HOSTNAME) + records[0].get('href') 114 | ref = records[0].text.strip() 115 | reference.append({'ref': ref, 'href': href}) 116 | meta['references'] = reference 117 | except KeyboardInterrupt: 118 | raise 119 | except: 120 | pass 121 | 122 | try: 123 | log_info("Getting cited ...") 124 | cite_url = re.findall(r'\'tab_citings.+\d+\'', self.text)[0][1:-1] 125 | cite_text = requests.get('http://{0}/'.format(HOSTNAME) + 126 | cite_url, timeout=5 127 | ).text.encode('utf-8') 128 | cite_soup = BeautifulSoup(cite_text, BS_PARSER) 129 | trs = cite_soup.findAll('tr') 130 | citing = [] 131 | for tr in trs: 132 | records = 
tr.findAll('a') 133 | if len(records) > 0: 134 | href = 'http://{0}/'.format(HOSTNAME) + records[0].get('href') 135 | cite = records[0].text.strip() 136 | citing.append({'citing': cite, 'href': href}) 137 | meta['citedby'] = citing 138 | except KeyboardInterrupt: 139 | raise 140 | except requests.exceptions.Timeout: 141 | pass 142 | except: 143 | pass 144 | 145 | try: 146 | log_info("Getting bibtex...") 147 | bibtex_url = re.findall(r'exportformats.+bibtex', self.text)[0] 148 | bibtex_text = requests.get('http://{0}/'.format(HOSTNAME) + bibtex_url).text.encode('utf-8') 149 | bibtex_soup = BeautifulSoup(bibtex_text, BS_PARSER) 150 | pre = bibtex_soup.find('pre') 151 | bibtex = pre.text.strip() 152 | meta['bibtex'] = bibtex 153 | except KeyboardInterrupt: 154 | raise 155 | except: 156 | pass 157 | return meta 158 | -------------------------------------------------------------------------------- /sopaper/fetcher/ieee.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: ieee.py 4 | # Date: 五 6月 13 18:22:19 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from . import register_parser 8 | from .base import FetcherBase, direct_download 9 | from ..uklogger import * 10 | from ..ukconfig import BS_PARSER 11 | 12 | import re 13 | from bs4 import BeautifulSoup 14 | import requests 15 | 16 | HOSTNAME = 'ieeexplore.ieee.org' 17 | STAMP_URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={0}" 18 | REFERENCE_URL = "http://ieeexplore.ieee.org/xpl/abstractReferences.jsp?tp=&arnumber={0}" 19 | CITATION_URL = "http://ieeexplore.ieee.org/xpl/abstractCitations.jsp?tp=&arnumber={0}" 20 | 21 | @register_parser(name='ieeexplore.ieee.org', 22 | urlmatch='ieeexplore.ieee.org', 23 | meta_field=['author', 'abstract', 'references', 'citedby']) 24 | class IEEE(FetcherBase): 25 | def _do_pre_parse(self): 26 | text = requests.get(self.url).text.encode('utf-8') 27 | self.soup = BeautifulSoup(text, BS_PARSER) 28 | 29 | number = re.findall('arnumber=[0-9]*', self.url)[0] 30 | self.number = re.findall('[0-9]+', number)[0] 31 | 32 | def _do_download(self, updater): 33 | url2 = STAMP_URL.format(self.number) 34 | text = requests.get(url2).text.encode('utf-8') 35 | soup = BeautifulSoup(text, BS_PARSER) 36 | fr = soup.findAll('frame')[-1] 37 | pdflink = fr.get('src') 38 | return direct_download(pdflink, updater) 39 | 40 | def _do_get_title(self): 41 | titles = self.soup.findAll('h1') 42 | title = titles[0].text.strip() 43 | return title 44 | 45 | def _do_get_meta(self): 46 | meta = {} 47 | try: 48 | authors = self.soup.findAll(attrs={'name': 'citation_author'}) 49 | author = [a.get('content') for a in authors] 50 | meta['author'] = author 51 | except KeyboardInterrupt: 52 | raise 53 | except: 54 | pass 55 | 56 | try: 57 | abstract_div = self.soup.findAll(attrs={'class': 'article'}) 58 | abstract = abstract_div[0].text.strip() 59 | meta['abstract'] = abstract 60 | except KeyboardInterrupt: 61 | raise 62 | except: 63 | pass 64 | 65 | try: 66 | ref_url = REFERENCE_URL.format(self.number) 67 | ref_text = requests.get(ref_url).text 68 | ref_soup = BeautifulSoup(ref_text, BS_PARSER) 69 | ol = ref_soup.findAll('ol')[0] 70 | lis = ol.findAll('li') 71 | reference = [] 72 | for li in lis: 73 | ref = li.text.strip() 74 | ref = ref.replace('\t', '') 75 | ref = ref.replace('\n', '') 76 | ref = ref.replace('\r', '') 77 | ref = ref.replace('\\', '') 78 | ref = ref.replace('\xa0', ' ') 79 | if 'Abstract' in ref: 80 | ref = 
ref[:ref.find('Abstract')] 81 | if 'Cross' in ref: 82 | ref = ref[:ref.find('[')] 83 | href = '' 84 | link = li.findAll('a') 85 | if len(link) > 0: 86 | href = link[0].get('href') 87 | if not 'http' in href: 88 | href = "http://{0}{1}".format(HOSTNAME, href) 89 | reference.append({'ref': ref, 'href': href}) 90 | meta['references'] = reference 91 | except KeyboardInterrupt: 92 | raise 93 | except: 94 | pass 95 | 96 | try: 97 | cite_url = CITATION_URL.format(self.number) 98 | cite_text = requests.get(cite_url).text.encode('utf-8') 99 | cite_soup = BeautifulSoup(cite_text, BS_PARSER) 100 | html = cite_soup.findAll('ol')[0] 101 | lis = html.findAll('li') 102 | citing = [] 103 | for li in lis: 104 | cite = li.text.strip() 105 | cite = cite.replace('\t', '') 106 | cite = cite.replace('\n', '') 107 | cite = cite.replace('\r', '') 108 | cite = cite.replace('\\', '') 109 | cite = cite.replace(' ', '') 110 | cite = cite.replace('\xa0', ' ') 111 | if 'Abstract' in cite: 112 | cite = cite[:cite.find('Abstract')] 113 | if 'Cross' in cite: 114 | cite = cite[:cite.find('[')] 115 | href = '' 116 | link = li.findAll('a') 117 | if len(link) > 0: 118 | href = "http://{0}{1}".format(HOSTNAME, 119 | re.findall(r'/xpl/articleDetails.jsp\?arnumber=[0-9]+', link[0].get('href'))[0]) 120 | citing.append({'citing': cite, 'href': href}) 121 | meta['citedby'] = citing 122 | except KeyboardInterrupt: 123 | raise 124 | except: 125 | pass 126 | 127 | """ 128 | there are still difficulties getting the bibtex 129 | """ 130 | 131 | return meta 132 | -------------------------------------------------------------------------------- /sopaper/fetcher/sciencedirect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: sciencedirect.py 4 | # Date: 一 6月 09 17:06:26 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import re 8 | from . import register_parser, RecoverableErr 9 | from .base import FetcherBase, direct_download 10 | from ..uklogger import * 11 | from .. 
import ukconfig 12 | 13 | from urllib.parse import urlparse 14 | import requests 15 | from bs4 import BeautifulSoup 16 | 17 | HOSTNAME = 'www.sciencedirect.com' 18 | 19 | # not working right now 20 | #@register_parser(name='sciencedirect.com', urlmatch='sciencedirect.com', 21 | #meta_field=['author', 'bibtex', 'abstract'], 22 | #priority=8) 23 | class ScienceDirect(FetcherBase): 24 | def _do_pre_parse(self): 25 | self.text = requests.get(self.url).text.encode('utf-8') 26 | with open("/tmp/b.html", 'w') as f: 27 | f.write(self.text) 28 | #text = open("/tmp/b.html").read() 29 | self.soup = BeautifulSoup(self.text) 30 | 31 | def _do_download(self, updater): 32 | pdf = self.soup.findAll(attrs={'id': 'pdfLink'}) 33 | if pdf: 34 | try: 35 | url = pdf[0]['pdfurl'] 36 | print(url) 37 | except: 38 | # probably something need to be fixed 39 | log_exc('') 40 | else: 41 | raise RecoverableErr("No available download at {0}".format(self.url)) 42 | return direct_download(url, updater) 43 | 44 | def _do_get_title(self): 45 | titles = self.soup.findAll(attrs={'name': 'citation_title'}) 46 | return titles[0]['content'] 47 | 48 | def _do_get_meta(self): 49 | meta = {} 50 | try: 51 | log_info("Getting author...") 52 | authors = self.soup.findAll( 53 | attrs={'title': 'Author Profile Page'}) 54 | author = [a.text for a in authors] 55 | meta['author'] = author 56 | except KeyboardInterrupt: 57 | raise 58 | except: 59 | pass 60 | 61 | try: 62 | log_info("Getting abstract...") 63 | abstract_url = re.findall(r'\'tab_abstract.+\d+\'', self.text)[0][1:-1] 64 | abstract_text = requests.get('http://{0}/'.format(HOSTNAME) + abstract_url).text.encode('utf-8') 65 | abstract_soup = BeautifulSoup(abstract_text) 66 | abstract = abstract_soup.findAll('p')[0].text 67 | meta['abstract'] = abstract 68 | except KeyboardInterrupt: 69 | raise 70 | except: 71 | pass 72 | 73 | try: 74 | log_info("Getting refs ...") 75 | ref_url = re.findall(r'\'tab_references.+\d+\'', self.text)[0][1:-1] 76 | ref_text = requests.get('http://{0}/'.format(HOSTNAME) + ref_url).text.encode('utf-8') 77 | ref_soup = BeautifulSoup(ref_text) 78 | trs = ref_soup.findAll('tr') 79 | reference = [] 80 | for tr in trs: 81 | records = tr.findAll('a') 82 | if len(records) > 0: 83 | href = 'http://{0}/'.format(HOSTNAME) + records[0].get('href') 84 | ref = records[0].text.strip() 85 | reference.append({'ref': ref, 'href': href}) 86 | meta['references'] = reference 87 | except KeyboardInterrupt: 88 | raise 89 | except: 90 | pass 91 | 92 | try: 93 | log_info("Getting cited ...") 94 | cite_url = re.findall(r'\'tab_citings.+\d+\'', self.text)[0][1:-1] 95 | cite_text = requests.get('http://{0}/'.format(HOSTNAME) + 96 | cite_url, timeout=5 97 | ).text.encode('utf-8') 98 | cite_soup = BeautifulSoup(cite_text) 99 | trs = cite_soup.findAll('tr') 100 | citing = [] 101 | for tr in trs: 102 | records = tr.findAll('a') 103 | if len(records) > 0: 104 | href = 'http://{0}/'.format(HOSTNAME) + records[0].get('href') 105 | cite = records[0].text.strip() 106 | citing.append({'citing': cite, 'href': href}) 107 | meta['citedby'] = citing 108 | except KeyboardInterrupt: 109 | raise 110 | except requests.exceptions.Timeout: 111 | pass 112 | except: 113 | pass 114 | 115 | try: 116 | log_info("Getting bibtex...") 117 | bibtex_url = re.findall(r'exportformats.+bibtex', self.text)[0] 118 | bibtex_text = requests.get('http://{0}/'.format(HOSTNAME) + bibtex_url).text.encode('utf-8') 119 | bibtex_soup = BeautifulSoup(bibtex_text) 120 | pre = bibtex_soup.find('pre') 121 | bibtex = 
pre.text.strip() 122 | meta['bibtex'] = bibtex 123 | except KeyboardInterrupt: 124 | raise 125 | except: 126 | pass 127 | return meta 128 | -------------------------------------------------------------------------------- /sopaper/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: job.py 4 | # Date: Thu Jun 18 23:11:07 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from .lib.textutil import title_beautify 8 | from .lib.ukutil import ensure_unicode 9 | from .uklogger import * 10 | 11 | class JobContext(object): 12 | def __init__(self, query): 13 | query = title_beautify(query) 14 | self.query = query 15 | self.success = False 16 | self.title = query 17 | self.existing = None 18 | self.meta = {} 19 | 20 | def update_meta_dict(self, meta): 21 | if 'title' in meta: 22 | del meta['title'] 23 | if 'citecnt' not in self.meta and 'citedby' in meta: 24 | self.meta['citecnt'] = len(meta['citedby']) 25 | self.meta.update(meta) 26 | 27 | def need_field(self, fields): 28 | for f in fields: 29 | if f not in self.meta: 30 | return True 31 | return False 32 | 33 | def update_new_title(self, title): 34 | if title != self.title: 35 | log_info("Using new title: {0}".format(ensure_unicode(title))) 36 | self.title = title 37 | return True 38 | return False 39 | 40 | def try_update_title_from_search_result(self, s): 41 | try: 42 | updated_title = s['ctx_update']['title'] 43 | except KeyError: 44 | pass 45 | else: 46 | self.update_new_title(updated_title) 47 | 48 | def __str__(self): 49 | d = {'title': self.title, 50 | 'success': self.success, 51 | 'meta': list(self.meta.keys()) 52 | } 53 | return str(d) 54 | 55 | 56 | class SearchResult(object): 57 | def __init__(self, type, url): 58 | self.url = url 59 | self.type = type 60 | self.searcher = None 61 | 62 | def __str__(self): 63 | return str(self.searcher) + "; " + \ 64 | str(self.type) + "; " + \ 65 | self.url 66 | -------------------------------------------------------------------------------- /sopaper/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/sopaper/lib/__init__.py -------------------------------------------------------------------------------- /sopaper/lib/abbr.dic: -------------------------------------------------------------------------------- 1 | convolutional neural networks? CNN 2 | convolutional networks? CNN 3 | deep neural networks? DNN 4 | recurrent neural networks? RNN 5 | deep reinforcement learning DRL 6 | reinforcement learning RL 7 | neural networks? NN 8 | slam SLAM 9 | -------------------------------------------------------------------------------- /sopaper/lib/downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: downloader.py 4 | # Date: Fri Jun 02 10:10:59 2017 -0700 5 | # Author: Yuxin Wu 6 | 7 | import sys 8 | if __name__ == '__main__': 9 | sys.path.append('../') 10 | from ..uklogger import * 11 | from ..lib.textutil import parse_file_size 12 | from ..lib.exc import FileCorrupted 13 | from .. 
import ukconfig 14 | 15 | import urllib.request, urllib.parse, urllib.error 16 | import os 17 | import tempfile 18 | import requests 19 | from urllib.parse import urlparse 20 | 21 | class ProgressPrinter(object): 22 | def __init__(self): 23 | self.total = 0 24 | self.last_done_len = -1 25 | 26 | def finish(self, data): 27 | sys.stdout.write("\n") 28 | sys.stdout.flush() 29 | log_info("Download finished") 30 | 31 | def update(self, done): 32 | assert self.total != 0 33 | width = 50 34 | done_len = int(width * done / self.total) 35 | if done_len > self.last_done_len: 36 | sys.stdout.write("\r[{0}>{1}]".format('=' * done_len, 37 | ' ' * (width - done_len))) 38 | sys.stdout.flush() 39 | self.last_done_len = done_len 40 | 41 | def set_total(self, size): 42 | """size: number of bytes""" 43 | log_info("File size is {0}".format(parse_file_size(size))) 44 | self.total = size 45 | 46 | def wget_download(url, progress_updater, headers=None): 47 | log_info("Download with wget on {0} ...".format(url)) 48 | 49 | headers = ' '.join(['--header="{0}: {1}"'.format(k, v) for k, v 50 | in headers.items()]) 51 | tf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) 52 | tf.close() 53 | # set timeout and retry number 54 | cmd = 'wget "{0}" -O "{1}" {2} --timeout=5 -t 3'.format(url, tf.name, headers) 55 | ret = os.system(cmd) 56 | if ret: 57 | if os.path.exists(tf.name): 58 | os.remove(tf.name) 59 | raise FileCorrupted("wget failed with return code {}".format(ret)) 60 | else: 61 | data = open(tf.name, 'rb').read() 62 | progress_updater.finish(data) 63 | os.remove(tf.name) 64 | return data 65 | 66 | def requests_download(url, progress_updater, headers=None): 67 | resp = requests.get(url, stream=True, headers=headers) 68 | total_length = resp.headers.get('content-length') 69 | if total_length is None: 70 | data = resp.content 71 | progress_updater.finish(data) 72 | return data 73 | else: 74 | total_length = int(total_length) 75 | if total_length < ukconfig.FILE_SIZE_MINIMUM: 76 | raise FileCorrupted("File too small: " + parse_file_size(total_length)) 77 | if total_length > ukconfig.FILE_SIZE_MAXIMUM: 78 | raise FileCorrupted("File too large: " + parse_file_size(total_length)) 79 | progress_updater.set_total(total_length) 80 | dl = 0 81 | ret = "" 82 | for data in resp.iter_content(chunk_size=1024*1024): 83 | dl += len(data) 84 | ret += data 85 | progress_updater.update(dl) 86 | progress_updater.finish(data) 87 | return ret 88 | 89 | def direct_download(url, progress_updater, headers=None): 90 | """ download with methods given by ukconfig.download_method 91 | return the data 92 | """ 93 | log_info("Directly Download with URL {0} ...".format(url)) 94 | 95 | if headers is None: 96 | headers = {'Host': urlparse(url).netloc, 97 | 'User-Agent': ukconfig.USER_AGENT, 98 | 'Connection': 'Keep-Alive' 99 | } 100 | 101 | # for test and cmd tools only 102 | if ukconfig.download_method == 'wget': 103 | data = wget_download(url, progress_updater, headers) 104 | else: 105 | data = requests_download(url, progress_updater, headers) 106 | if len(data) < ukconfig.FILE_SIZE_MINIMUM: 107 | raise FileCorrupted("File too small: " + parse_file_size(len(data))) 108 | return data 109 | 110 | if __name__ == '__main__': 111 | data = direct_download('http://delivery.acm.org/10.1145/330000/322274/p615-yao.pdf?ip=59.66.132.22&id=322274&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E587F3204F5B62A59%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&CFID=456185443&CFTOKEN=45860210&__acm__=1399725544_eebbed2ce2719c67c7a3642f2b21d80a') 112 | print(data) 
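For illustration, here is a minimal sketch (not part of the repository) of how the download helpers above might be driven on their own, assuming the `sopaper` package is installed and its default `ukconfig` settings apply; the `fetch_pdf` wrapper and the example URL are made up for this example:

```python
# Minimal usage sketch; `fetch_pdf` is an illustrative helper, not part of the repo.
from sopaper.lib.downloader import direct_download, ProgressPrinter
from sopaper.lib.exc import FileCorrupted


def fetch_pdf(url):
    try:
        # direct_download returns the raw file contents, printing a progress
        # bar through ProgressPrinter; it raises FileCorrupted when the size
        # checks configured in ukconfig reject the file.
        return direct_download(url, ProgressPrinter())
    except FileCorrupted as e:
        print("download rejected:", e)
        return None


if __name__ == '__main__':
    data = fetch_pdf('https://arxiv.org/pdf/1606.06160')   # example URL
    if data:
        with open('paper.pdf', 'wb') as f:
            f.write(data)
```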
113 | -------------------------------------------------------------------------------- /sopaper/lib/exc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: exc.py 4 | # Date: Wed Jul 08 22:52:49 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | class RecoverableErr(Exception): 8 | pass 9 | 10 | class FileCorrupted(RecoverableErr): 11 | pass 12 | -------------------------------------------------------------------------------- /sopaper/lib/mailutil.py: -------------------------------------------------------------------------------- 1 | #!../../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: mailutil.py 4 | # Date: 五 6月 13 18:07:29 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from .textutil import title_beautify 8 | from ukconfig import MAILSERVER_HOST, MAILSERVER_PORT 9 | 10 | import json 11 | import requests 12 | import urllib.request, urllib.parse, urllib.error 13 | 14 | 15 | PAT = "https://sopaper.net9.org/download?pid={0}" 16 | 17 | def sendmail(addr, author, res): 18 | subject = "[SoPaper] Your Paper Request Has Been Processed" 19 | content = "Dear {0},
".format(addr) 20 | content += "Thanks for using SoPaper. You have recently requested for papers of {0}. ".format(title_beautify(author)) 21 | content += "Here are the results:

" 22 | for idx, (pid, title) in enumerate(res): 23 | content += "{2}. {0}
\n".format(title_beautify(title), PAT.format(pid), idx) 24 | 25 | content += "
Thanks for your support!
" 26 | 27 | dic = {'addr': addr, 'subject': subject, 'content': content} 28 | headers = {'Content-type': 'application/json', 'Accept': '*/*'} 29 | resp = requests.post("http://{0}:{1}/mail".format(MAILSERVER_HOST, MAILSERVER_PORT), 30 | data=json.dumps(dic), headers=headers) 31 | print(resp.content) 32 | 33 | 34 | if __name__ == '__main__': 35 | sendmail('ppwwyyxxc@gmail.com', 'Yuxin Wu') 36 | -------------------------------------------------------------------------------- /sopaper/lib/pdf2html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: pdf2html.py 4 | # Date: Mon May 26 15:43:21 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import os 8 | import os.path 9 | import tempfile 10 | import shutil 11 | 12 | class PDF2Html(object): 13 | def __init__(self, data, filename): 14 | """ provide either data or filename""" 15 | if data is not None: 16 | f = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') 17 | f.close() 18 | with open(f.name, 'wb') as fpdf: 19 | fpdf.write(data) 20 | self.fname = f.name 21 | self.createfile = True 22 | else: 23 | self.fname = filename 24 | self.createfile = False 25 | assert self.fname is not None 26 | self.convert() 27 | 28 | def convert(self): 29 | self.outdir = tempfile.mkdtemp(prefix='sop') 30 | ret = os.system('pdf2htmlEX "{0}" 0.html --dest-dir={1} --zoom=1.5'. 31 | format(self.fname, self.outdir) + \ 32 | ' --split-pages=1 --page-filename %d.html') 33 | if ret != 0: 34 | raise Exception("pdf2htmlEx return error! original file: {0}".format(self.fname)) 35 | self.npages = len(os.listdir(self.outdir)) - 1 36 | 37 | def clean(self): 38 | shutil.rmtree(self.outdir) 39 | if self.createfile: 40 | os.remove(self.fname) 41 | 42 | def get_npages(self): 43 | return self.npages 44 | 45 | def get(self, t): 46 | t = int(t) 47 | assert t >= 0 and t <= self.npages 48 | fname = os.path.join(self.outdir, '{0}.html'.format(t)) 49 | return open(fname).read() 50 | 51 | if __name__ == '__main__': 52 | w = PDF2Html(data=None, filename='/tmp/a.pdf') 53 | print(len(w.get(1))) 54 | w.clean() 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /sopaper/lib/pdfutil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: pdfutil.py 4 | # Date: Fri Jun 02 10:19:03 2017 -0700 5 | # Author: Yuxin Wu 6 | 7 | from ..lib.textutil import filter_nonascii, parse_file_size 8 | from ..lib.ukutil import check_file_type, check_buf_filetype 9 | from ..uklogger import * 10 | from .. import ukconfig 11 | 12 | import tempfile 13 | import os 14 | 15 | def check_buf_pdf(buf): 16 | return check_buf_filetype(buf, 'PDF document') 17 | 18 | def check_legal_pdf(buf): 19 | def is_exe(path): 20 | return os.path.isfile(path) and os.access(path, os.X_OK) 21 | for d in os.environ["PATH"].split(os.pathsep): 22 | d = d.strip('"') 23 | exe = os.path.join(d, 'pdftk') 24 | if is_exe(exe): 25 | break 26 | else: 27 | log_info('pdftk not installed. 
Not sure if pdf file is valid!') 28 | return True 29 | f = tempfile.NamedTemporaryFile(delete=False) 30 | f.write(buf) 31 | f.close() 32 | ret = os.system('pdftk "{}" output /dev/null'.format(f.name)) 33 | os.unlink(f.name) 34 | return ret == 0 35 | 36 | def pdf2text(data): 37 | f = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') 38 | f.write(data) 39 | f.close() 40 | 41 | #with timeout(seconds=30): 42 | ret = os.system('pdftotext "{0}"'.format(f.name)) 43 | #ret = timeout_command('pdftotext "{0}"'.format(os.path.realpath(f.name)), 44 | #3) 45 | if ret != 0: 46 | #raise Exception("Timeout in pdf2text") 47 | raise Exception("pdftotext return error! original file: {0}".format(f.name)) 48 | fout = f.name.replace('.pdf', '.txt') 49 | text = open(fout).read() 50 | 51 | os.remove(f.name) 52 | os.remove(fout) 53 | 54 | text = filter_nonascii(text) 55 | # TODO filter formulas.. 56 | return text 57 | 58 | def pdf_compress(data): 59 | """ take a pdf data string, return a compressed string 60 | compression is done using ps2pdf14 in ghostscript 61 | """ 62 | if os.name == 'nt': 63 | log_info("Does not support pdf compress on windows!") 64 | return data 65 | f = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') 66 | f.write(data) 67 | f.close() 68 | 69 | f2 = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') 70 | f2.close() 71 | ret = os.system('ps2pdf14 "{0}" "{1}"'.format(f.name, f2.name)) 72 | if ret != 0: 73 | log_err("Compress: ps2pdf14 failed!") 74 | newdata = None 75 | else: 76 | newdata = open(f2.name, 'rb').read() 77 | file_succ = newdata is not None and \ 78 | check_file_type(f2.name, 'PDF document') and \ 79 | len(newdata) >= ukconfig.FILE_SIZE_MINIMUM 80 | try: 81 | os.remove(f2.name) 82 | os.remove(f.name) 83 | except OSError: 84 | pass 85 | if file_succ and \ 86 | len(newdata) < len(data): 87 | log_info("Compress succeed: {0}->{1}".format( 88 | parse_file_size(len(data)), parse_file_size(len(newdata)))) 89 | return newdata 90 | else: 91 | return data 92 | -------------------------------------------------------------------------------- /sopaper/lib/sanitize.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import unicodedata 4 | import warnings 5 | 6 | 7 | # https://github.com/ksze/sanitize 8 | 9 | class ReplacementLengthWarning(UserWarning): 10 | pass 11 | 12 | 13 | warnings.filterwarnings("always", category=ReplacementLengthWarning) 14 | 15 | 16 | def _are_unicode(unicode_args=[]): 17 | if sys.version_info[0] == 2: 18 | return all((type(arg) == str) for arg in unicode_args) 19 | 20 | # Assume Python 3 21 | return all((type(arg) == str) for arg in unicode_args) 22 | 23 | 24 | def sanitize_path_fragment( 25 | original_fragment, 26 | filename_extension = '', # when you do want a filename extension, there is no need to include the leading dot. 
27 | target_file_systems = { 28 | 'btrfs', 'ext', 'ext2', 'ext3', 'ext3cow', 'ext4', 'exfat', 'fat32', 29 | 'hfs+', 'ntfs_win32', 'reiser4', 'reiserfs', 'xfs', 'zfs', 30 | }, 31 | sanitization_method = 'underscore', 32 | truncate = True, 33 | replacement = '_', 34 | additional_illegal_characters=[], 35 | ): 36 | # Enforce that these args are unicode strings 37 | unicode_args = [original_fragment, filename_extension, replacement] + additional_illegal_characters 38 | if not _are_unicode(unicode_args): 39 | raise ValueError( 40 | '`original_fragment`, `filename_extension`, `replacement`, and `additional_illegal_characters` ' 41 | 'must be of the unicode type under Python 2 or str type under Python 3.' 42 | ) 43 | 44 | if len(replacement) > 1: 45 | warnings.warn( 46 | "The replacement is longer than one character. " 47 | "The length of the resulting string cannot be guaranteed to fit the target file systems' length limit.", 48 | ReplacementLengthWarning 49 | ) 50 | 51 | sanitized_fragment = unicodedata.normalize('NFC', original_fragment) 52 | if len(filename_extension) > 0: 53 | filename_extension = unicodedata.normalize('NFC', '.' + filename_extension) 54 | 55 | if sanitization_method == 'underscore': 56 | illegal_characters = { 57 | 'btrfs': {'\0', '/'}, 58 | 'ext': {'\0', '/'}, 59 | 'ext2': {'\0', '/'}, 60 | 'ext3': {'\0', '/'}, 61 | 'ext3cow': {'\0', '/', '@'}, 62 | 'ext4': {'\0', '/'}, 63 | 'exfat': { 64 | '\00', '\01', '\02', '\03', '\04', '\05', '\06', '\07', '\10', '\11', '\12', '\13', '\14', '\15', '\16', '\17', 65 | '\20', '\21', '\22', '\23', '\24', '\25', '\26', '\27', '\30', '\31', '\32', '\33', '\34', '\35', '\36', '\37', 66 | '/', '\\', ':', '*', '?', '"', '<', '>', '|', 67 | }, 68 | 'fat32': { # TODO: Confirm this list; current list is just a wild guess, assuming UTF-16 encoding. 69 | '\00', '\01', '\02', '\03', '\04', '\05', '\06', '\07', '\10', '\11', '\12', '\13', '\14', '\15', '\16', '\17', 70 | '\20', '\21', '\22', '\23', '\24', '\25', '\26', '\27', '\30', '\31', '\32', '\33', '\34', '\35', '\36', '\37', 71 | '/', '\\', ':', '*', '?', '"', '<', '>', '|', 72 | }, 73 | # In theory, all Unicode characters, including NUL, are usable in HFS+; so this is just 74 | # a sane set for legacy compatibility - e.g. OS APIs that don't support '/' and ':'. 75 | 'hfs+': {'\0', '/', ':'}, 76 | 'ntfs_win32': {'\0', '/', '\\', ':', '*', '?', '"', '<', '>', '|'}, # NTFS Win32 namespace (stricter) 77 | 'ntfs_posix': {'\0', '/'}, # NTFS POSIX namespace (looser) 78 | 'reiser4': {'\0', '/'}, 79 | 'reiserfs': {'\0', '/'}, 80 | 'xfs': {'\0', '/'}, 81 | 'zfs': {'\0', '/'}, 82 | 'additional_illegal_characters': set(additional_illegal_characters), 83 | } 84 | 85 | # Replace illegal characters with an underscore 86 | # `target_file_systems` is used further down, so we don't want to pollute it here. 87 | _temp_target_file_systems = set.union(target_file_systems, {'additional_illegal_characters'}) 88 | 89 | illegal_character_set = set.union(*(illegal_characters[file_system] for file_system in _temp_target_file_systems)) 90 | 91 | # It would be stupid if the replacement contains an illegal character. 
92 | if any(character in replacement for character in illegal_character_set): 93 | raise ValueError('The replacement contains a character that would be illegal in the target file system(s).') 94 | 95 | for character in illegal_character_set: 96 | sanitized_fragment = sanitized_fragment.replace(character, replacement) 97 | filename_extension = filename_extension.replace(character, replacement) 98 | 99 | # "Quote" illegal filenames 100 | if target_file_systems.intersection({'fat32', 'ntfs_win32'}): 101 | windows_reserved_names = ( 102 | "CON", "PRN", "AUX", "NUL", 103 | "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", 104 | "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", 105 | ) 106 | 107 | if sanitized_fragment in windows_reserved_names: 108 | sanitized_fragment = replacement + sanitized_fragment + replacement 109 | 110 | if filename_extension in windows_reserved_names: 111 | filename_extension = replacement + filename_extension + replacement 112 | 113 | 114 | # Truncate if the resulting string is too long 115 | if truncate: 116 | max_lengths = { 117 | # For the entries of file systems commonly found with Linux, the length, 'utf-8', 118 | # and 'NFC' are only assumptions that apply to mostly vanilla kernels with default 119 | # build parameters. 120 | 121 | # Seriously, this is 2013. The fact that the Linux community does not move to a file 122 | # system with an enforced Unicode filename encoding is as bad as Windows 95's 123 | # codepage madness, some 18 years ago. 124 | 125 | # If you add more file systems, see if it is affected by Unicode Normal Forms, like 126 | # HFS+; You may have to take extra care in editing the actual sanitization routine 127 | # below. 128 | 'btrfs': (255, 'bytes', 'utf-8', 'NFC'), 129 | 'ext': (255, 'bytes', 'utf-8', 'NFC'), 130 | 'ext2': (255, 'bytes', 'utf-8', 'NFC'), 131 | 'ext3': (255, 'bytes', 'utf-8', 'NFC'), 132 | 'ext3cow': (255, 'bytes', 'utf-8', 'NFC'), 133 | 'ext4': (255, 'bytes', 'utf-8', 'NFC'), 134 | 'exfat': (255, 'characters', 'utf-16', 'NFC'), 135 | 136 | # 'utf-16' is not entirely true. FAT32 used to be used with codepages; but since 137 | # Windows XP, the default seems to be UTF-16. 138 | 'fat32': (255, 'characters', 'utf-16', 'NFC'), 139 | 140 | # FIXME: improve HFS+ handling, because it does not use the standard NFD. It's 141 | # close, but it's not exactly the same thing. 142 | 'hfs+': (255, 'characters', 'utf-16', 'NFD'), 143 | 144 | 'ntfs_win32': (255, 'characters', 'utf-16', 'NFC'), 145 | 'ntfs_posix': (255, 'characters', 'utf-16', 'NFC'), 146 | 147 | # ReiserFS 3 and 4 support filenames > 255 bytes. I don't care if the vanilla Linux 148 | # kernel can't support that. That's Linux's problem, not mine. 
149 | 'reiser4': (3976, 'bytes', 'utf-8', 'NFC'), 150 | 'reiserfs': (4032, 'bytes', 'utf-8', 'NFC'), 151 | 152 | 'xfs': (255, 'bytes', 'utf-8', 'NFC'), 153 | 'zfs': (255, 'bytes', 'utf-8', 'NFC'), 154 | } 155 | 156 | for file_system in target_file_systems: 157 | if max_lengths[file_system][1] == 'bytes': 158 | extension_bytes = unicodedata.normalize(max_lengths[file_system][3], filename_extension).encode(max_lengths[file_system][2]) 159 | 160 | temp_fragment = bytearray() 161 | 162 | for character in sanitized_fragment: 163 | encoded_bytes = unicodedata.normalize(max_lengths[file_system][3], character).encode(max_lengths[file_system][2]) 164 | 165 | if len(temp_fragment) + len(encoded_bytes) + len(extension_bytes)<= max_lengths[file_system][0]: 166 | temp_fragment = temp_fragment + encoded_bytes 167 | else: 168 | break 169 | 170 | sanitized_fragment = unicodedata.normalize('NFC', temp_fragment.decode(max_lengths[file_system][2])) 171 | 172 | else: # Assume 'characters' 173 | temp_fragment = '' 174 | 175 | if file_system == 'hfs+': 176 | normalize = unicodedata.ucd_3_2_0.normalize 177 | else: 178 | normalize = unicodedata.normalize 179 | 180 | normalized_extension = normalize(max_lengths[file_system][3], filename_extension) 181 | 182 | for character in sanitized_fragment: 183 | normalized_character = normalize(max_lengths[file_system][3], character) 184 | if len(temp_fragment) + len(normalized_character) + len(normalized_extension) <= max_lengths[file_system][0]: 185 | temp_fragment += normalized_character 186 | else: 187 | break 188 | 189 | sanitized_fragment = unicodedata.normalize('NFC', temp_fragment) 190 | 191 | sanitized_fragment = sanitized_fragment + filename_extension 192 | 193 | # Disallow a final dot or space for FAT32 and NTFS in Win32 namespace. 194 | # This can only be done after truncations because otherwise we may fix the fragment, but 195 | # still end up with a bad ending character once it's truncated 196 | if ( 197 | target_file_systems.intersection({'fat32', 'ntfs_win32'}) and 198 | (sanitized_fragment.endswith('.') or sanitized_fragment.endswith(' ')) 199 | ): 200 | 201 | if replacement.endswith('.') or replacement.endswith(' '): 202 | raise ValueError( 203 | 'The sanitized string ends with a dot or space, and the replacement also ends with a dot or space. ' 204 | 'Therefore the string cannot be sanitized for fat32 or ntfs_win32.' 
205 | ) 206 | 207 | while (sanitized_fragment.endswith('.') or sanitized_fragment.endswith(' ')): 208 | sanitized_fragment = sanitized_fragment[:-1] + replacement 209 | 210 | else: 211 | raise ValueError("sanitization_method must be a valid sanitization method.") 212 | 213 | return sanitized_fragment 214 | -------------------------------------------------------------------------------- /sopaper/lib/singleton.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: singleton.py 4 | # Date: Fri May 23 22:12:27 2014 +0800 5 | # Author: Yuxin Wu 6 | 7 | class Singleton(type): 8 | def __init__(cls, name, bases, dict): 9 | super(Singleton, cls).__init__(name, bases, dict) 10 | cls.instance = None 11 | 12 | def __call__(cls,*args,**kw): 13 | if cls.instance is None: 14 | cls.instance = super(Singleton, cls).__call__(*args, **kw) 15 | return cls.instance 16 | 17 | def dec_singleton(cls): 18 | instances = {} 19 | def getinstance(): 20 | if cls not in instances: 21 | instances[cls] = cls() 22 | return instances[cls] 23 | return getinstance 24 | -------------------------------------------------------------------------------- /sopaper/lib/textutil.py: -------------------------------------------------------------------------------- 1 | #!../../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: textutil.py 4 | # Date: Fri Jun 02 10:39:42 2017 -0700 5 | # Author: Yuxin Wu 6 | 7 | import string 8 | import re 9 | import os 10 | import hashlib 11 | import platform 12 | from .ukutil import ensure_unicode 13 | from .sanitize import sanitize_path_fragment 14 | 15 | STOPWORDS = set(['of', 'from', 'as', 'to', 'a', 'an', 'in', 'into', 'on', 16 | 'onto', 'with', 'about', 'the', 'for', 'and', 'or', 'by', 17 | 'without', 'instead', 'is', 'are', 'since', 'between', 18 | 'after', 'befoer', 'then', 'than', 'via']) 19 | 20 | ABBR_DICT = [(k, v) for k, v in 21 | [l.strip().split('\t') for l in open(os.path.join( 22 | os.path.dirname(__file__), 'abbr.dic')).readlines()]] 23 | 24 | def title_beautify(title): 25 | title = title.strip().lower() 26 | title = " ".join(title.split()) 27 | tk = title.title().split() 28 | for (idx, w) in enumerate(tk): 29 | if w.lower() in STOPWORDS and not idx == 0: 30 | tk[idx] = w.lower() 31 | else: 32 | tk[idx] = w.capitalize() 33 | return " ".join(tk) 34 | 35 | def parse_file_size(size): 36 | if size > 1048576: 37 | return "{0:.2f}MB".format(float(size) / 1024 / 1024) 38 | if size > 1024: 39 | return "{0:.2f}KB".format(float(size) / 1024) 40 | return "{0}B".format(size) 41 | 42 | def filter_title_fileformat(title): 43 | title = title.replace('[pdf]', '') 44 | title = title.replace('[PDF]', '') 45 | return title 46 | 47 | def levenshtein(s1, s2): 48 | if len(s1) < len(s2): 49 | return levenshtein(s2, s1) 50 | 51 | if len(s2) == 0: 52 | return len(s1) 53 | 54 | previous_row = range(len(s2) + 1) 55 | for i, c1 in enumerate(s1): 56 | current_row = [i + 1] 57 | for j, c2 in enumerate(s2): 58 | insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer 59 | deletions = current_row[j] + 1 # than s2 60 | substitutions = previous_row[j] + (c1 != c2) 61 | current_row.append(min(insertions, deletions, substitutions)) 62 | previous_row = current_row 63 | return previous_row[-1] 64 | 65 | def title_correct(query, title): 66 | """ return (match, update) """ 67 | title = title.replace('[PDF]', '') 68 | q = ''.join([t for t in query if t in 
string.ascii_letters]).lower() 69 | now = ''.join([t for t in title if t in string.ascii_letters]).lower() 70 | ed_thres = min(len(query), len(title)) / 5 71 | ERROR_RATIO = 0.6 72 | if levenshtein(q, now) < ed_thres: 73 | return (True, True) 74 | for k in range(min([int(len(query) * ERROR_RATIO), 30]), len(query)): 75 | if levenshtein(q[:k], now) < ed_thres: 76 | return (True, False) 77 | for k in range(int(len(title) * ERROR_RATIO), len(title)): 78 | if levenshtein(now[:k], q) < ed_thres: 79 | return (True, False) 80 | return (False, False) 81 | 82 | def name_clean(name): 83 | p = re.compile('\(.*?\)', re.DOTALL) 84 | ret = p.sub('', name).strip() 85 | return ensure_unicode(ret) 86 | 87 | def filter_nonascii(string): 88 | return [x for x in string if ord(x) < 128] 89 | 90 | def abbr_subst(s): 91 | for k, v in ABBR_DICT: 92 | s = re.sub(k, v, s, flags=re.IGNORECASE) 93 | return s 94 | 95 | def finalize_filename(s): 96 | system = platform.system() 97 | fs = { 98 | 'Windows': 'ntfs_win32', 99 | 'Linux': 'ext4', 100 | 'Darwin': 'hfs+' 101 | }[system] # hopefully the guess can work in most cases.. 102 | s = sanitize_path_fragment(s, target_file_systems={fs}, replacement='-') 103 | s = abbr_subst(s) 104 | return s 105 | 106 | def md5(s): 107 | m = hashlib.md5() 108 | m.update(s) 109 | return m.hexdigest() 110 | 111 | if __name__ == '__main__': 112 | print(title_correct("Gated Softmax Classification", 113 | "[PDF]Gated Softmax Classification - NIPS Proceedings")) 114 | -------------------------------------------------------------------------------- /sopaper/lib/timeout.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: timeout.py 4 | # Date: 二 6月 10 03:44:44 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import signal 8 | class timeout: 9 | def __init__(self, seconds=1, error_message='Timeout'): 10 | self.seconds = seconds 11 | self.error_message = error_message 12 | def handle_timeout(self, signum, frame): 13 | raise TimeoutError(self.error_message) 14 | def __enter__(self): 15 | signal.signal(signal.SIGALRM, self.handle_timeout) 16 | signal.alarm(self.seconds) 17 | def __exit__(self, type, value, traceback): 18 | signal.alarm(0) 19 | 20 | def timeout_command(command, timeout): 21 | """call shell-command and either return its output or kill it 22 | if it doesn't normally exit within timeout seconds and return None""" 23 | import subprocess, datetime, os, time, signal 24 | start = datetime.datetime.now() 25 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 26 | while process.poll() is None: 27 | time.sleep(0.1) 28 | now = datetime.datetime.now() 29 | if (now - start).seconds > timeout: 30 | os.kill(process.pid, signal.SIGKILL) 31 | os.waitpid(-1, os.WNOHANG) 32 | return False 33 | return True 34 | -------------------------------------------------------------------------------- /sopaper/lib/ukutil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | # File: ukutil.py 4 | # Date: Fri Jun 02 10:18:29 2017 -0700 5 | # Author: jiakai 6 | # Yuxin Wu 7 | 8 | """common utility functions""" 9 | from importlib import import_module 10 | from pkgutil import walk_packages 11 | from datetime import datetime 12 | from subprocess import Popen, PIPE 13 | import tempfile 14 | import os 15 | 16 | from .. 
import ukconfig 17 | from ..uklogger import * 18 | 19 | 20 | try: 21 | import magic 22 | except: 23 | pass 24 | 25 | def ensure_unicode_anytype(s): 26 | if isinstance(s, str): 27 | return ensure_unicode(s) 28 | return s 29 | 30 | def ensure_unicode(s): 31 | """assert type of s is basestring and convert s to unicode""" 32 | if isinstance(s, bytes): 33 | return s.decode('utf-8') 34 | assert isinstance(s, str) 35 | # In python 3 all strings are assumed to be unicode 36 | return s 37 | 38 | # Python 2 legacy: 39 | # assert isinstance(s, str), 's should be string' + str(s) 40 | # if isinstance(s, str): 41 | # s = s.decode('utf-8') 42 | # return s 43 | 44 | def ensure_bin_str(s): 45 | """assert type of s is basestring and convert s to byte string""" 46 | if isinstance(s, bytes): 47 | return s 48 | assert isinstance(s, str) 49 | return s.encode('utf-8') 50 | 51 | # Python 2 legacy: 52 | # assert isinstance(s, str), 's should be string' 53 | # if isinstance(s, str): 54 | # s = s.encode('utf-8') 55 | # return s 56 | 57 | def import_all_modules(file_path, pkg_name): 58 | """import all modules recursively in a package 59 | :param file_path: just pass __file__ 60 | :param pkg_name: just pass __name__ 61 | """ 62 | for _, module_name, _ in walk_packages( 63 | [os.path.dirname(file_path)], pkg_name + '.'): 64 | import_module(module_name) 65 | 66 | def check_buf_filetype(buf, need_type): 67 | if isinstance(need_type, str): 68 | need_type = need_type.encode('utf-8') 69 | 70 | if ukconfig.USE_MAGIC_LIB: 71 | s = magic.from_buffer(buf) 72 | else: 73 | assert os.name != 'nt', "Windows users please install python-magic." 74 | f = tempfile.NamedTemporaryFile(delete=False) 75 | f.write(buf) 76 | f.close() 77 | s = Popen('file "{0}"'.format(f.name), 78 | stdout=PIPE, shell=True).stdout.read() 79 | os.unlink(f.name) 80 | if s.find(need_type) != -1: 81 | return True 82 | else: 83 | return False 84 | 85 | def check_file_type(fname, need_type): 86 | if isinstance(need_type, str): 87 | need_type = need_type.encode('utf-8') 88 | 89 | s = Popen('file "{0}"'.format(fname), stdout=PIPE, shell=True).stdout.read() 90 | if s.find(need_type) != -1: 91 | return True 92 | return False 93 | 94 | 95 | if __name__ == '__main__': 96 | print(check_filetype(open("./ukconfig.py", 'rb').read(), 'PDF')) 97 | -------------------------------------------------------------------------------- /sopaper/pdfprocess.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: pdfprocess.py 4 | # Author: Yuxin Wu 5 | 6 | import tempfile 7 | import os 8 | from bson.binary import Binary 9 | 10 | from .uklogger import * 11 | from .ukdbconn import get_mongo 12 | from .lib.pdf2html import PDF2Html 13 | from .lib.textutil import parse_file_size 14 | from .lib.pdfutil import * 15 | from . 
import contentsearch 16 | 17 | def do_addhtml(data, pid): 18 | # convert to html 19 | converter = PDF2Html(data, filename=None) 20 | npage = converter.get_npages() 21 | htmls = [Binary(converter.get(x)) for x in range(npage + 1)] 22 | converter.clean() 23 | 24 | db = get_mongo('paper') 25 | db.update({'_id': pid}, {'$set': {'page': npage, 'html': htmls}}) 26 | log_info("Add html for pdf {0}, page={1}".format(pid, npage)) 27 | 28 | def do_compress(data, pid): 29 | """ this *must* succeed adding the pdf""" 30 | try: 31 | # compress 32 | data = pdf_compress(data) 33 | except: 34 | pass 35 | 36 | db = get_mongo('paper') 37 | db.update({'_id': pid}, {'$set': {'pdf': Binary(data)}} ) 38 | log_info("Updated pdf {0}: size={1}".format( 39 | pid, parse_file_size(len(data)))) 40 | return data 41 | 42 | def do_buildindex(ctx, data, pid): 43 | text = pdf2text(data) 44 | db = get_mongo('paper') 45 | db.update({'_id': pid}, {'$set': {'text': text}}) 46 | 47 | doc = {'text': text, 48 | 'title': ctx.title, 49 | 'id': pid 50 | } 51 | 52 | citedby = ctx.meta.get('citedby') 53 | if citedby: 54 | citecnt = len(citedby) 55 | doc['citecnt'] = citecnt 56 | if ctx.meta.get('citecnt'): 57 | citecnt = ctx.meta.get('citecnt') 58 | doc['citecnt'] = citecnt 59 | 60 | author = ctx.meta.get('author') 61 | if author: 62 | doc['author'] = author 63 | contentsearch.do_add_paper(doc) 64 | 65 | def postprocess(data, ctx, pid): 66 | """ post-process routine right after adding a new pdf""" 67 | log_info("Start compressing {0}".format(pid)) 68 | data = do_compress(data, pid) 69 | 70 | try: 71 | log_info("Start converting to html {0}".format(pid)) 72 | do_addhtml(data, pid) 73 | except Exception as e: 74 | log_exc("Error converting to html") 75 | 76 | try: 77 | log_info("Start converting to text {0}".format(pid)) 78 | do_buildindex(ctx, data, pid) 79 | except Exception as e: 80 | log_exc("Error converting to text") 81 | 82 | if __name__ == '__main__': 83 | import sys 84 | f = sys.argv[1] 85 | data = open(f, 'rb').read() 86 | 87 | text = pdf2text(data) 88 | print(text) 89 | -------------------------------------------------------------------------------- /sopaper/queryhandler.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: queryhandler.py 4 | # Date: Thu Jun 18 22:52:39 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from bson.binary import Binary 8 | from threading import Thread 9 | from multiprocessing import Pool 10 | 11 | from .ukdbconn import get_mongo, global_counter, new_paper 12 | from .uklogger import * 13 | from .lib.textutil import title_beautify, parse_file_size 14 | from . import searcher 15 | from .searcher import searcher_run 16 | from . import fetcher 17 | from .job import JobContext 18 | from .dbsearch import * 19 | from .pdfprocess import postprocess 20 | from .lib.downloader import ProgressPrinter 21 | from .contentsearch import SoPaperSearcher 22 | from . import ukconfig 23 | 24 | # global. 
save all ongoing download 25 | progress_dict = {} 26 | 27 | class Updater(ProgressPrinter): 28 | def __init__(self, pid): 29 | self.pid = pid 30 | super(Updater, self).__init__() 31 | 32 | def update(self, done): 33 | percent = float(done) / self.total 34 | progress_dict[self.pid] = percent 35 | super(Updater, self).update(done) 36 | 37 | def start_download(dl_candidates, ctx, pid): 38 | dl_candidates = sorted(dl_candidates, key=lambda x: x[0].priority, 39 | reverse=True) 40 | updater = Updater(pid) 41 | for (parser, sr) in dl_candidates: 42 | data = parser.download(sr, updater) 43 | if data: 44 | db = get_mongo('paper') 45 | try: 46 | db.update({'_id': pid}, 47 | {'$set': { 48 | 'pdf': Binary(data), 49 | 'page_url': sr.url, 50 | 'source': parser.name 51 | }}) 52 | except: 53 | log_exc("Save pdf data error") 54 | postprocess(data, ctx, pid) 55 | progress_dict.pop(pid, None) 56 | return 57 | progress_dict.pop(pid, None) 58 | 59 | def handle_title_query(query): 60 | query = title_beautify(query) 61 | log_info("Get title query: {0}".format(query)) 62 | 63 | #starts search 64 | res = search_startswith(query) # and the idf is large 65 | if res: 66 | log_info("Found {0} results in db: {1}".format( 67 | len(res), str([x['_id'] for x in res]))) 68 | return res 69 | # similar search 70 | res = similar_search(query) 71 | if res: 72 | log_info("Found similar results in db: {0}".format(res['_id'])) 73 | return [res] 74 | 75 | # search on web 76 | searchers = searcher.register_searcher.get_searcher_list() 77 | parsers = fetcher.register_parser.get_parser_list() 78 | ctx = JobContext(query) 79 | 80 | args = list(zip(searchers, [ctx] * len(searchers))) 81 | pool = Pool() 82 | async_results = [pool.apply_async(searcher_run, arg) for arg in args] 83 | 84 | # Search and get all the results item 85 | all_search_results = [] 86 | for s in async_results: 87 | s = s.get(ukconfig.PYTHON_POOL_TIMEOUT) 88 | if s is None: 89 | continue 90 | srs = s['results'] 91 | 92 | # try search database with updated title 93 | try: 94 | updated_title = s['ctx_update']['title'] 95 | except KeyError: 96 | pass 97 | else: 98 | if updated_title != query: 99 | query = updated_title 100 | res = search_exact(query) 101 | if res: 102 | log_info("Found {0} results in db: {1}".format( 103 | len(res), str([x['_id'] for x in res]))) 104 | return res 105 | all_search_results.extend(srs) 106 | 107 | meta = s.get('ctx_update') 108 | if meta: 109 | log_info('Meat update from searcher: {0}'.format(str(list(meta.keys())))) 110 | ctx.update_meta_dict(meta) 111 | pool.close() 112 | pool.terminate() 113 | 114 | # Analyse each result and try to parse info 115 | download_candidates = [] 116 | parser_used = set() 117 | found = False 118 | for sr in all_search_results: 119 | for parser in parsers: 120 | if parser.can_handle(sr): 121 | download_candidates.append((parser, sr)) 122 | if ctx.need_field(parser.support_meta_field): 123 | # Already tried this fetcher 124 | if not parser.repeatable and \ 125 | parser.name in parser_used: 126 | continue 127 | else: 128 | parser_used.add(parser.name) 129 | 130 | succ = parser.fetch_info(ctx, sr) 131 | if not succ: 132 | continue 133 | found = True 134 | if ctx.existing is not None: 135 | log_info("Found {0} results in db".format(len(ctx.existing))) 136 | return [ctx.existing] 137 | 138 | # no metadata or downloadable source found 139 | if not found and len(download_candidates) == 0: 140 | return None 141 | # Save data, return data and start downloading 142 | try: 143 | pid = new_paper(ctx) 144 | ret = [{'_id': 
pid, 145 | 'title': ctx.title, 146 | 'view_cnt': 1, 147 | 'download_cnt': 0 148 | }] 149 | ret[0].update(ctx.meta) 150 | 151 | progress_dict[pid] = 0.0 152 | if len(download_candidates) > 0: 153 | thread = Thread(target=start_download, args=(download_candidates, 154 | ctx, pid)) 155 | thread.start() 156 | return ret 157 | except: 158 | log_exc("Failed to save to db") 159 | 160 | sp_searcher = SoPaperSearcher() 161 | 162 | def handl_author_query(q): 163 | db = get_mongo('paper') 164 | res = list(db.find({'author': q}, SEARCH_RETURN_FIELDS)) 165 | return res 166 | 167 | def handle_content_query(query): 168 | log_info("Get content query: {0}".format(query)) 169 | res = sp_searcher.search(query) 170 | db = get_mongo('paper') 171 | 172 | def transform(r): 173 | pid = int(r['_id']) 174 | # XXX should find use '$in' and then do sorting 175 | doc = db.find_one({'_id': pid}, SEARCH_RETURN_FIELDS) 176 | if not doc: 177 | raise Exception("Impossible! Mongo doesn't have this paper in index: {0}".format(pid)) 178 | doc['content'] = r['content'] 179 | doc['weight'] = r['weight'] 180 | return doc 181 | 182 | ret = list(map(transform, res)) 183 | return ret 184 | 185 | if __name__ == '__main__': 186 | import sys 187 | if len(sys.argv) > 1: 188 | res = handle_title_query(sys.argv[1]) 189 | sys.exit(0) 190 | #res = handle_title_query('test test test this is not a paper name') 191 | #res = handle_title_query('Intriguing properties of neural networks') 192 | res = handle_content_query('neural networks') 193 | #res = handle_title_query("The WEka data mining software an update") 194 | #res = handle_title_query("linear") 195 | #print res 196 | 197 | -------------------------------------------------------------------------------- /sopaper/searcher/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: __init__.py 4 | # Author: Yuxin Wu 5 | 6 | from functools import wraps 7 | 8 | from ..lib.ukutil import import_all_modules 9 | from ..uklogger import * 10 | from ..job import JobContext 11 | 12 | def searcher_run(searcher, ctx): 13 | """ a global function to invoke with multiprocessing, 14 | run a searcher against a JobContext""" 15 | return searcher.run(ctx) 16 | 17 | class register_searcher(object): 18 | searcher_list = [] 19 | 20 | def __init__(self, *args, **kwargs): 21 | self.name = kwargs.pop('name') 22 | self.priority = kwargs.pop('priority', 5) 23 | 24 | def __call__(self, func): 25 | """ func: callable to be invoked, took a JobContext 26 | func cannot change JobContext 27 | func return a dict with keys: 28 | results : list of 'SearchResult' 29 | ctx_update: dict 30 | """ 31 | 32 | @wraps(func) 33 | def wrapper(ctx): 34 | assert isinstance(ctx, JobContext) 35 | try: 36 | log_info("Searching '{1}' with searcher: '{0}' ...". 37 | format(self.name, ctx.query)) 38 | res = func(ctx) 39 | for r in res['results']: 40 | r.searcher = self.name 41 | return res 42 | except KeyboardInterrupt: 43 | raise 44 | except Exception as e: 45 | log_exc("Error in searcher '{0}' with query '{1}': {2}". 
46 | format(self.name, ctx.query, str(e))) 47 | 48 | self.searcher_list.append(self) 49 | self.cb = wrapper 50 | return wrapper 51 | 52 | def run(self, ctx): 53 | """ run this searcher against the context given""" 54 | res = self.cb(ctx) 55 | if res and res['results']: 56 | log_info("Got the following results from {0}:\n".format(self.name) + 57 | "\n".join([str(r) for r in res['results']])) 58 | return res 59 | 60 | @staticmethod 61 | def get_searcher_list(): 62 | return sorted(register_searcher.searcher_list, key=lambda x: x.priority, reverse=True) 63 | 64 | import_all_modules(__file__, __name__) 65 | -------------------------------------------------------------------------------- /sopaper/searcher/google.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: google.py 4 | # Date: Wed Mar 11 09:21:06 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from . import register_searcher 8 | from ..job import SearchResult 9 | from ..uklogger import * 10 | from ..lib.textutil import title_correct, filter_title_fileformat 11 | from ..ukconfig import BS_PARSER 12 | 13 | import re 14 | import urllib.request, urllib.parse, urllib.error 15 | from bs4 import BeautifulSoup 16 | from urllib.parse import urlparse 17 | import traceback 18 | import requests 19 | 20 | GOOGLE_URL = "https://www.google.com/search?q={0}" 21 | 22 | def parse_google_link(url): 23 | return url # now it seems to be ok 24 | real = re.findall('http[^&]*&', url)[0] 25 | ret = urllib.parse.unquote(real[:-1]) 26 | return ret 27 | 28 | @register_searcher(name='Google') 29 | def search(ctx): 30 | query = ctx.query.lower() 31 | 32 | ret = {} 33 | ret['ctx_update'] = {} 34 | srs = [] 35 | 36 | headers = { 'Hostname': 'www.google.com', 37 | 'User-Agent': ukconfig.USER_AGENT, 38 | 'Accept-Encoding': 'gzip' 39 | } 40 | r = requests.get(GOOGLE_URL.format(query), headers=headers, verify=True) 41 | text = r.text.encode('utf-8') 42 | #with open('/tmp/a.html', 'r') as f: 43 | ##f.write(text) 44 | #text = f.read() 45 | 46 | def find_citecnt(dom): 47 | try: 48 | find = dom.findAll(attrs={'class': 'f slp'})[0] 49 | find = find.findAll('a')[0].text 50 | citecnt = re.search('[0-9]+', find).group() 51 | return int(citecnt) 52 | except: 53 | return None 54 | 55 | soup = BeautifulSoup(text, BS_PARSER) 56 | results = soup.findAll(attrs={'class': 'g'}) 57 | for rst in results: 58 | try: 59 | h3 = rst.findAll('h3') 60 | if not h3: # frame search, e.g. picture/video/kg 61 | continue 62 | real_title = h3[0].get_text() 63 | tc = title_correct(query, real_title) 64 | if not tc[0]: 65 | continue 66 | # TODO do some title update? 
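# A possible way to act on the TODO above, mirroring what gscholar.py already
# does (a sketch only, not enabled here): when title_correct() reports that the
# result's title is a usable correction (tc[1] truthy), expose it through
# ret['ctx_update'] so queryhandler can re-check the database under the fixed
# title. Note this assumes title_beautify is imported from ..lib.textutil,
# which google.py currently does not do.
#
#     if tc[1] and 'title' not in ret['ctx_update']:
#         ret['ctx_update']['title'] = title_beautify(real_title)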
67 | cnt = find_citecnt(rst) 68 | if cnt is not None: 69 | ret['ctx_update']['citecnt'] = cnt 70 | #findpdf = rst.findAll(attrs={'class': 'mime'}) 71 | findpdf = rst.findAll('span') 72 | if findpdf and findpdf[0].text == '[PDF]': 73 | pdflink = rst.findAll('a')[0].get('href') 74 | try: 75 | url = parse_google_link(pdflink) 76 | except: 77 | continue 78 | srs.append(SearchResult('directpdf', url)) 79 | else: 80 | url = rst.findAll('a')[0].get('href') 81 | try: 82 | url = parse_google_link(url) 83 | except: 84 | continue 85 | srs.append(SearchResult(None, url)) 86 | except Exception as e: 87 | log_exc("Search Item parse error: {0}".format(str(e))) 88 | ret['results'] = srs 89 | return ret 90 | -------------------------------------------------------------------------------- /sopaper/searcher/gscholar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: gscholar.py 4 | # Author: Yuxin Wu 5 | 6 | 7 | from . import register_searcher 8 | from ..job import SearchResult 9 | from ..uklogger import * 10 | from ..lib.textutil import title_correct, filter_title_fileformat, title_beautify 11 | from ..lib.ukutil import ensure_unicode, ensure_bin_str 12 | from ..ukconfig import BS_PARSER 13 | 14 | import re 15 | import requests 16 | from bs4 import BeautifulSoup 17 | import urllib.request, urllib.parse, urllib.error 18 | from urllib.parse import urlparse 19 | import traceback 20 | 21 | GOOGLE_SCHOLAR_URL = "https://scholar.google.com/scholar?hl=en&q={0}&btnG=&as_sdt=1%2C5&as_sdtp=" 22 | 23 | @register_searcher(name='Google Scholar', priority=10) 24 | def search(ctx): 25 | query = ctx.query.lower() 26 | 27 | ret = {} 28 | ret['ctx_update'] = {} 29 | srs = [] 30 | 31 | r = requests.get(GOOGLE_SCHOLAR_URL.format(query)) 32 | text = r.text.encode('utf-8') 33 | #with open('/tmp/b.html', 'r') as f: 34 | #text = f.read() 35 | 36 | def find_citecnt(dom): 37 | try: 38 | find = dom.findAll(attrs={'class': 'gs_ri'})[0] 39 | find = find.findAll(attrs={'class': 'gs_fl'})[0] 40 | find = find.findAll('a')[0].text 41 | cnt = re.search('[0-9]+', find).group() 42 | return int(cnt) 43 | except: 44 | return None 45 | 46 | 47 | soup = BeautifulSoup(text, BS_PARSER) 48 | results = soup.findAll(attrs={'class': 'gs_r'}) 49 | title_updated = None 50 | for rst in results: 51 | try: 52 | h3 = rst.findAll('h3')[0] 53 | real_title = h3.get_text() 54 | real_title = filter_title_fileformat(real_title) 55 | tc = title_correct(query, real_title) 56 | if not tc[0]: 57 | continue 58 | if not title_updated and tc[1]: 59 | title_updated = ensure_unicode(title_beautify(real_title)) 60 | while True: # fix things like '[citation][c] Title' 61 | new_title = re.sub('^\[[^\]]*\]', '', title_updated).strip() 62 | if new_title == title_updated: 63 | title_updated = new_title 64 | break 65 | title_updated = new_title 66 | log_info("Title updated: {0}".format(title_updated)) 67 | ret['ctx_update']['title'] = title_updated 68 | 69 | cnt = find_citecnt(rst) 70 | if cnt is not None: 71 | ret['ctx_update']['citecnt'] = cnt 72 | 73 | try: 74 | url = str(h3.find('a').get('href')) 75 | srs.append(SearchResult(None, url)) 76 | except: 77 | pass 78 | 79 | findpdf = rst.findAll(attrs={'class': 'gs_ggs'}) 80 | if findpdf: 81 | pdflink = findpdf[0].find('a').get('href') 82 | url = str(pdflink) 83 | srs.append(SearchResult('directpdf', url)) 84 | except Exception as e: 85 | log_exc("Search Item parse error: {0}".format(str(e))) 86 | ret['results'] = srs 87 | return 
ret 88 | -------------------------------------------------------------------------------- /sopaper/test-fetcher.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: test-fetcher.py 4 | # Date: Thu Jun 18 23:27:53 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from . import ukconfig 8 | ukconfig.USE_DB = False 9 | ukconfig.USE_INDEXER = False 10 | 11 | from .fetcher import register_parser, SearchResult 12 | from .job import JobContext 13 | from .ukdbconn import new_paper 14 | 15 | import sys 16 | 17 | if __name__ == '__main__': 18 | if len(sys.argv) == 2: 19 | ukconfig.USE_DB = True 20 | ctx = JobContext("Test Filename") 21 | 22 | parser = register_parser.parser_dict['arxiv.org'] 23 | sr = SearchResult(None, "http://arxiv.org/abs/1312.6680") 24 | #sr = SearchResult(None, " http://arxiv.org/abs/1404.3610") 25 | 26 | #parser = register_parser.parser_dict['dl.acm.org'] 27 | #url = "http://dl.acm.org/citation.cfm?id=1859761" # twitter 28 | #url = "http://dl.acm.org/citation.cfm?id=996342" # SIFT # Large Number of cited 29 | #url = "http://dl.acm.org/citation.cfm?id=2366157" # big 30 | #url = "http://dl.acm.org/citation.cfm?id=1656278" # Weka 31 | #sr = SearchResult(None, url) 32 | 33 | #parser = register_parser.parser_dict['ieeexplore.ieee.org'] 34 | #sr = SearchResult(None, "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=00726791") 35 | #sr = SearchResult(None, "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4244529") 36 | 37 | 38 | #parser = register_parser.parser_dict['sciencedirect.com'] 39 | #url = "http://www.sciencedirect.com/science/article/pii/S1570870513000073" 40 | #sr = SearchResult(None, url) 41 | 42 | 43 | #params = parser.fetch_info(ctx, sr) 44 | #print params 45 | data = parser.download(sr) 46 | 47 | print(ctx.title) 48 | if ukconfig.USE_DB and ctx.success: 49 | pid = new_paper(ctx) 50 | -------------------------------------------------------------------------------- /sopaper/test-searcher.py: -------------------------------------------------------------------------------- 1 | #!../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: test-searcher.py 4 | # Date: Tue Jan 20 14:22:43 2015 +0800 5 | # Author: Yuxin Wu 6 | 7 | from multiprocessing import Pool 8 | import sys 9 | 10 | from . 
import searcher 11 | from .job import JobContext 12 | from .searcher import searcher_run 13 | 14 | if __name__ == '__main__': 15 | query = sys.argv[1] 16 | searchers = searcher.register_searcher.get_searcher_list() 17 | searchers = searchers[1:] 18 | print([k.name for k in searchers]) 19 | ctx = JobContext(query) 20 | 21 | args = list(zip(searchers, [ctx] * len(searchers))) 22 | pool = Pool() 23 | async_results = [pool.apply_async(searcher_run, arg) for arg in args] 24 | 25 | # Search and get all the results item 26 | for s in async_results: 27 | s = s.get() 28 | if s is None: 29 | continue 30 | srs = s['results'] 31 | 32 | print(srs) 33 | 34 | meta = s.get('ctx_update') 35 | if meta: 36 | ctx.update_meta_dict(meta) 37 | pool.close() 38 | pool.terminate() 39 | -------------------------------------------------------------------------------- /sopaper/ukconfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: ukconfig.py 4 | # Date: Mon Jan 04 02:33:47 2016 +0000 5 | # Author: Yuxin Wu 6 | 7 | download_method = 'wget' 8 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36' 9 | 10 | FILE_SIZE_MINIMUM = 10000 # at least 10kb 11 | FILE_SIZE_MAXIMUM = 100000000 # at most 100mb 12 | 13 | PYTHON_POOL_TIMEOUT = 9999 # bug in python: see http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 14 | 15 | import os 16 | 17 | LOG_DIR = None 18 | 19 | # this lib is not required for command line script 20 | USE_MAGIC_LIB = False 21 | try: 22 | import magic 23 | except ImportError: 24 | USE_MAGIC_LIB = False 25 | 26 | USE_DB = True 27 | USE_INDEXER = True 28 | try: 29 | import pymongo 30 | except ImportError: 31 | USE_DB = False 32 | 33 | mongo_conn = ('127.0.0.1', 27018) 34 | mongo_db = 'sopaper' 35 | 36 | BS_PARSER = 'html.parser' # parser for beautifulsoup 37 | 38 | DB_DIR_NAME = 'xapian-db' 39 | XP_DB_DIR = os.path.join(os.path.dirname(__file__), 40 | '../{0}'.format(DB_DIR_NAME)) 41 | 42 | SEARCH_PAGE_SIZE = 10 43 | SEARCH_SUMMARY_LEN = 300 44 | 45 | MAILSERVER_HOST = 'server4.net9.org' 46 | MAILSERVER_PORT = 5184 47 | -------------------------------------------------------------------------------- /sopaper/ukdbconn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | # File: ukdbconn.py 4 | # Date: 二 6月 10 04:01:51 2014 +0000 5 | # Author: jiakai 6 | # Yuxin Wu 7 | 8 | """database connections""" 9 | 10 | 11 | try: 12 | from pymongo import MongoClient 13 | except ImportError: 14 | from pymongo import Connection as MongoClient 15 | from pymongo.errors import DuplicateKeyError 16 | 17 | from . 
import ukconfig 18 | from .uklogger import * 19 | 20 | _db = None 21 | 22 | def get_mongo(coll_name=None): 23 | global _db 24 | if _db is None: 25 | _db = MongoClient(*ukconfig.mongo_conn)[ukconfig.mongo_db] 26 | 27 | if coll_name is None: 28 | return _db 29 | return _db[coll_name] 30 | 31 | def new_paper(ctx): 32 | pid = global_counter('paper') 33 | log_info("Add new paper: {0}, pid={1}".format( 34 | ctx.title, pid)) 35 | doc = { 36 | '_id': pid, 37 | 'title': ctx.title.lower(), 38 | 'view_cnt': 1, 39 | 'download_cnt': 0 40 | } 41 | meta = ctx.meta 42 | if 'author' in meta: 43 | meta['author'] = [x.lower() for x in meta['author']] 44 | doc.update(ctx.meta) 45 | doc['title'] = doc['title'].lower() 46 | 47 | db = get_mongo('paper') 48 | db.ensure_index('title') 49 | ret = db.insert(doc) 50 | return pid 51 | 52 | def update_meta(pid, meta): 53 | db = get_mongo('paper') 54 | db.update({'_id': pid}, {'$set': meta}) 55 | 56 | def update_view_cnt(pid): 57 | db = get_mongo('paper') 58 | db.update({'_id': pid}, {'$inc': {'view_cnt': 1}}) 59 | 60 | def global_counter(name, delta=1): 61 | """ atomically change a global int64 counter and return the newest value; 62 | starting from 1 63 | mongo document structure: 64 | { 65 | _id: counter name 66 | val: current value 67 | }""" 68 | db = get_mongo('global_counter') 69 | rst = db.find_and_modify(query={'_id': name}, 70 | update={'$inc': {'val': delta}}, 71 | new=True) 72 | if rst: 73 | k = rst.get('val') 74 | if k: 75 | return k 76 | try: 77 | val = int(1) 78 | db.insert({'_id': name, 'val': val}) 79 | return val 80 | except DuplicateKeyError: 81 | return global_counter(name, delta) 82 | -------------------------------------------------------------------------------- /sopaper/uklogger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | # $File: uklogger.py 4 | # $Date: 一 6月 09 17:20:34 2014 +0000 5 | # $Author: jiakai 6 | 7 | """utilities for handling logging""" 8 | 9 | from . import ukconfig 10 | import traceback 11 | import time 12 | import os 13 | import os.path 14 | from termcolor import colored 15 | 16 | 17 | def log_api(msg): 18 | """log a message from api-website""" 19 | print(colored('API', 'green'), msg) 20 | # TODO: use log util, log to file, including time, module, etc. 21 | 22 | 23 | def log_info(msg): 24 | """log an info message""" 25 | print(colored('INFO', 'blue'), msg) 26 | if ukconfig.LOG_DIR: 27 | with open(os.path.join(ukconfig.LOG_DIR, 'info.txt'), 'a') as f: 28 | f.write(msg) 29 | f.write('\n') 30 | # TODO: use log util, log to file, including time, module, etc. 31 | 32 | 33 | def log_err(msg): 34 | """log an err message""" 35 | print(colored('ERR', 'red', attrs=['blink']), msg) 36 | if ukconfig.LOG_DIR: 37 | with open(os.path.join(ukconfig.LOG_DIR, 'error.txt'), 'a') as f: 38 | f.write(msg) 39 | f.write('\n') 40 | # TODO: use log util, log to file, including time, module, etc. 
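# One possible direction for the TODOs above (a sketch only, nothing here is
# wired in): the standard-library logging module already provides timestamps,
# log levels and file handlers, so log_info/log_err could delegate to a shared
# logger instead of appending to info.txt / error.txt by hand. The logger name
# 'sopaper' is an arbitrary choice.
#
#     import logging
#     _logger = logging.getLogger('sopaper')
#     _logger.setLevel(logging.INFO)
#     if ukconfig.LOG_DIR:
#         handler = logging.FileHandler(
#             os.path.join(ukconfig.LOG_DIR, 'sopaper.log'))
#         handler.setFormatter(logging.Formatter(
#             '%(asctime)s %(levelname)s %(module)s: %(message)s'))
#         _logger.addHandler(handler)
#
# log_info(msg) would then reduce to _logger.info(msg), keeping the colored
# console print as-is.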
41 | 42 | def log_exc(exc): 43 | """log an unexpected exception""" 44 | log_err('Caught unexpected exception: {}\n{}'.format( 45 | exc, traceback.format_exc())) 46 | -------------------------------------------------------------------------------- /sopaper/xpengine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/sopaper/xpengine/__init__.py -------------------------------------------------------------------------------- /sopaper/xpengine/indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: indexer.py 4 | # Author: Yuxin Wu 5 | 6 | __all__ = ['xapian_indexer'] 7 | 8 | import os 9 | import threading 10 | import shutil 11 | import xappy 12 | from xappy import IndexerConnection, FieldActions, UnprocessedDocument, Field 13 | from xappy import errors 14 | from lib.ukutil import ensure_unicode_anytype as ensure_unicode 15 | 16 | from .xpcommon import FIELD_NUM, STOPWORDS 17 | 18 | class XapianIndexer(object): 19 | 20 | def __init__(self, dirname): 21 | self.dbPath = os.path.abspath(dirname) 22 | 23 | self.dbconn = IndexerConnection(self.dbPath) 24 | 25 | self.dbconn.add_field_action('title', FieldActions.INDEX_FREETEXT, 26 | weight=5, language='en') 27 | self.dbconn.add_field_action('text', FieldActions.INDEX_FREETEXT, 28 | language='en', spell=True, stop=STOPWORDS) 29 | #self.dbconn.add_field_action('citecnt', FieldActions.FACET, type='float') 30 | #self.dbconn.add_field_action('citecnt', FieldActions.WEIGHT) 31 | 32 | self.lock = threading.Lock() 33 | 34 | for k in list(FIELD_NUM.keys()): 35 | self.dbconn.add_field_action(k, FieldActions.STORE_CONTENT) 36 | 37 | def add_doc(self, doc): 38 | """ doc: a dict """ 39 | content = doc['text'] 40 | document = UnprocessedDocument() 41 | document.fields.append(Field('text', content)) 42 | 43 | for k, v in doc.items(): 44 | if k in ['text', 'id']: 45 | continue 46 | if type(v) == list: 47 | for item in v: 48 | document.fields.append(Field(k, ensure_unicode(item))) 49 | else: 50 | document.fields.append(Field(k, ensure_unicode(v))) 51 | document.id = str(doc['id']) 52 | try: 53 | self.lock.acquire() 54 | self.dbconn.add(document) 55 | except errors.IndexerError as e: 56 | print(str(e)) 57 | finally: 58 | self.lock.release() 59 | 60 | 61 | def flush(self): 62 | self.dbconn.flush() 63 | 64 | def close(self): 65 | self.dbconn.close() 66 | 67 | def clear(self): 68 | self.close() 69 | shutil.rmtree(self.dbPath) 70 | self.__init__(self.dbPath) 71 | -------------------------------------------------------------------------------- /sopaper/xpengine/run-xp.py: -------------------------------------------------------------------------------- 1 | #!../../manage/exec-in-virtualenv.sh 2 | # -*- coding: UTF-8 -*- 3 | # File: run-xp.py 4 | # Date: Fri May 23 20:19:26 2014 +0800 5 | # Author: Yuxin Wu 6 | 7 | from .indexer import * 8 | from .searcher import * 9 | 10 | import json 11 | import glob 12 | import os 13 | import sys 14 | 15 | db = './xapian-database' 16 | 17 | def index(): 18 | indexer = XapianIndexer(db) 19 | 20 | for idx, f in enumerate(glob.glob('./zbigniew-herbert/*.txt')): 21 | text = open(f).read() 22 | 23 | doc = {'text': text} 24 | doc['id'] = idx 25 | doc['title'] = os.path.basename(f) 26 | doc['author'] = ['Yuxin Wu', 'Angela Doudou'] 27 | indexer.add_doc(doc) 28 | indexer.flush() 29 | 30 | def search(query): 31 
| searcher = XapianSearcher(db) 32 | ret = searcher.search(query) 33 | print(json.dumps(ret)) 34 | 35 | if sys.argv[1] == 'index': 36 | index() 37 | elif sys.argv[1] == 'search': 38 | search(sys.argv[2]) 39 | -------------------------------------------------------------------------------- /sopaper/xpengine/searcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: searcher.py 4 | # Date: 二 6月 10 04:20:24 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import os 8 | import re 9 | 10 | import xappy 11 | from xappy import SearchConnection 12 | 13 | from .xpcommon import FIELD_NUM,STOPWORDS 14 | 15 | class XapianSearcher(object): 16 | 17 | def __init__(self, dirname): 18 | self.dbPath = os.path.abspath(dirname) 19 | self.conn = SearchConnection(self.dbPath) 20 | # can use 'reopen()' to open the db again 21 | 22 | def reopen(self): 23 | self.conn.reopen() 24 | 25 | def search(self, query, offset=0, page_size=10, summary_len=300): 26 | query = self.conn.spell_correct(query) 27 | words = query.split() 28 | words = [x for x in words if x not in STOPWORDS] 29 | query = ' OR '.join(words) 30 | #query = ' '.join(words) 31 | q = self.conn.query_field('text', query) 32 | 33 | res = self.conn.search(q, offset * page_size, page_size) 34 | 35 | def transform(r): 36 | doc = {'_id': r.id, 37 | 'title': r.data['title'][0], 38 | 'content': r.summarise('text', maxlen=summary_len), 39 | 'weight': r.weight 40 | } 41 | return doc 42 | 43 | ret = list(map(transform, res)) 44 | return ret 45 | 46 | def close(self): 47 | self.conn.close() 48 | -------------------------------------------------------------------------------- /sopaper/xpengine/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | according 6 | accordingly 7 | across 8 | actually 9 | after 10 | afterwards 11 | again 12 | against 13 | ain't 14 | all 15 | allow 16 | allows 17 | almost 18 | alone 19 | along 20 | already 21 | also 22 | although 23 | always 24 | am 25 | among 26 | amongst 27 | an 28 | and 29 | another 30 | any 31 | anybody 32 | anyhow 33 | anyone 34 | anything 35 | anyway 36 | anyways 37 | anywhere 38 | apart 39 | appear 40 | appreciate 41 | appropriate 42 | are 43 | aren't 44 | around 45 | as 46 | a's 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | came 81 | can 82 | cannot 83 | cant 84 | can't 85 | cause 86 | causes 87 | certain 88 | certainly 89 | changes 90 | clearly 91 | c'mon 92 | co 93 | com 94 | come 95 | comes 96 | concerning 97 | consequently 98 | consider 99 | considering 100 | contain 101 | containing 102 | contains 103 | corresponding 104 | could 105 | couldn't 106 | course 107 | c's 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | done 121 | don't 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 
144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he'd 191 | he'll 192 | hello 193 | help 194 | hence 195 | her 196 | here 197 | hereafter 198 | hereby 199 | herein 200 | here's 201 | hereupon 202 | hers 203 | herself 204 | he's 205 | hi 206 | him 207 | himself 208 | his 209 | hither 210 | hopefully 211 | how 212 | howbeit 213 | however 214 | how's 215 | i 216 | i'd 217 | ie 218 | if 219 | ignored 220 | i'll 221 | i'm 222 | immediate 223 | in 224 | inasmuch 225 | inc 226 | indeed 227 | indicate 228 | indicated 229 | indicates 230 | inner 231 | insofar 232 | instead 233 | into 234 | inward 235 | is 236 | isn't 237 | it 238 | it'd 239 | it'll 240 | its 241 | it's 242 | itself 243 | i've 244 | j 245 | just 246 | k 247 | keep 248 | keeps 249 | kept 250 | know 251 | known 252 | knows 253 | l 254 | last 255 | lately 256 | later 257 | latter 258 | latterly 259 | least 260 | less 261 | lest 262 | let 263 | let's 264 | like 265 | liked 266 | likely 267 | little 268 | look 269 | looking 270 | looks 271 | ltd 272 | m 273 | mainly 274 | many 275 | may 276 | maybe 277 | me 278 | mean 279 | meanwhile 280 | merely 281 | might 282 | more 283 | moreover 284 | most 285 | mostly 286 | much 287 | must 288 | mustn't 289 | my 290 | myself 291 | n 292 | name 293 | namely 294 | nd 295 | near 296 | nearly 297 | necessary 298 | need 299 | needs 300 | neither 301 | never 302 | nevertheless 303 | new 304 | next 305 | nine 306 | no 307 | nobody 308 | non 309 | none 310 | noone 311 | nor 312 | normally 313 | not 314 | nothing 315 | novel 316 | now 317 | nowhere 318 | o 319 | obviously 320 | of 321 | off 322 | often 323 | oh 324 | ok 325 | okay 326 | old 327 | on 328 | once 329 | one 330 | ones 331 | only 332 | onto 333 | or 334 | other 335 | others 336 | otherwise 337 | ought 338 | our 339 | ours 340 | ourselves 341 | out 342 | outside 343 | over 344 | overall 345 | own 346 | p 347 | particular 348 | particularly 349 | per 350 | perhaps 351 | placed 352 | please 353 | plus 354 | possible 355 | presumably 356 | probably 357 | provides 358 | q 359 | que 360 | quite 361 | qv 362 | r 363 | rather 364 | rd 365 | re 366 | really 367 | reasonably 368 | regarding 369 | regardless 370 | regards 371 | relatively 372 | respectively 373 | right 374 | s 375 | said 376 | same 377 | saw 378 | say 379 | saying 380 | says 381 | second 382 | secondly 383 | see 384 | seeing 385 | seem 386 | seemed 387 | seeming 388 | seems 389 | seen 390 | self 391 | selves 392 | sensible 393 | sent 394 | serious 395 | seriously 396 | seven 397 | several 398 | shall 399 | shan't 400 | she 401 | she'd 402 | she'll 403 | she's 404 | should 405 | shouldn't 406 | since 407 | six 408 | so 409 | some 410 | somebody 411 | somehow 412 | someone 413 | something 414 | sometime 415 | sometimes 416 | somewhat 417 | somewhere 418 | soon 419 | sorry 420 | specified 421 | specify 422 | specifying 423 | still 424 | sub 425 | such 426 | sup 427 | sure 428 | t 429 | take 430 | taken 431 | tell 432 | tends 433 | th 434 | than 435 | thank 436 | thanks 437 | thanx 438 | 
that 439 | thats 440 | that's 441 | the 442 | their 443 | theirs 444 | them 445 | themselves 446 | then 447 | thence 448 | there 449 | thereafter 450 | thereby 451 | therefore 452 | therein 453 | theres 454 | there's 455 | thereupon 456 | these 457 | they 458 | they'd 459 | they'll 460 | they're 461 | they've 462 | think 463 | third 464 | this 465 | thorough 466 | thoroughly 467 | those 468 | though 469 | three 470 | through 471 | throughout 472 | thru 473 | thus 474 | to 475 | together 476 | too 477 | took 478 | toward 479 | towards 480 | tried 481 | tries 482 | truly 483 | try 484 | trying 485 | t's 486 | twice 487 | two 488 | u 489 | un 490 | under 491 | unfortunately 492 | unless 493 | unlikely 494 | until 495 | unto 496 | up 497 | upon 498 | us 499 | use 500 | used 501 | useful 502 | uses 503 | using 504 | usually 505 | v 506 | value 507 | various 508 | very 509 | via 510 | viz 511 | vs 512 | w 513 | want 514 | wants 515 | was 516 | wasn't 517 | way 518 | we 519 | we'd 520 | welcome 521 | well 522 | we'll 523 | went 524 | were 525 | we're 526 | weren't 527 | we've 528 | what 529 | whatever 530 | what's 531 | when 532 | whence 533 | whenever 534 | when's 535 | where 536 | whereafter 537 | whereas 538 | whereby 539 | wherein 540 | where's 541 | whereupon 542 | wherever 543 | whether 544 | which 545 | while 546 | whither 547 | who 548 | whoever 549 | whole 550 | whom 551 | who's 552 | whose 553 | why 554 | why's 555 | will 556 | willing 557 | wish 558 | with 559 | within 560 | without 561 | wonder 562 | won't 563 | would 564 | wouldn't 565 | x 566 | y 567 | yes 568 | yet 569 | you 570 | you'd 571 | you'll 572 | your 573 | you're 574 | yours 575 | yourself 576 | yourselves 577 | you've 578 | z 579 | zero 580 | -------------------------------------------------------------------------------- /sopaper/xpengine/wordlists/words.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/sopaper/xpengine/wordlists/words.txt.gz -------------------------------------------------------------------------------- /sopaper/xpengine/xpcommon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: xpcommon.py 4 | # Date: Fri May 23 12:35:53 2014 +0800 5 | # Author: Yuxin Wu 6 | 7 | FIELD_NUM = {'id': 0, 8 | 'title': 1, 9 | 'text': 2, 10 | 'author': 3 11 | } 12 | 13 | import os 14 | 15 | STOPWORDS_FILE = os.path.join(os.path.dirname(__file__), 'stopwords.txt') 16 | 17 | STOPWORDS = set([x.strip() for x in open(STOPWORDS_FILE).readlines()]) 18 | -------------------------------------------------------------------------------- /webapi/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # $File: __init__.py 3 | # $Author: jiakai 4 | 5 | """uknow informatin hub API website""" 6 | 7 | import os 8 | from flask import Flask 9 | 10 | from flask_login import LoginManager 11 | 12 | from lib.ukutil import import_all_modules 13 | 14 | 15 | class DefaultConfig(object): 16 | API_HOST = '0.0.0.0' 17 | API_PORT = None 18 | API_RUN_OPTIONS = {} 19 | 20 | _app = None 21 | login_manager = None 22 | 23 | 24 | def get_app(): 25 | """load API modules and return the WSGI application""" 26 | global get_app, _app, login_manager 27 | _app = Flask(__name__) 28 | _app.config.from_object(DefaultConfig()) 29 | _app.secret_key = 'WTF is this!!' 
# Should have this to work 30 | 31 | login_manager = LoginManager() 32 | login_manager.init_app(_app) 33 | 34 | import_all_modules(__file__, __name__) 35 | get_app = lambda: _app 36 | return _app 37 | -------------------------------------------------------------------------------- /webapi/api/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | # $File: __init__.py 4 | # $Date: Sat May 10 20:33:01 2014 +0800 5 | # $Author: jiakai 6 | 7 | """website API entry points""" 8 | from .. import _app as app 9 | 10 | from flask import Response, request, redirect, url_for, make_response 11 | 12 | import json 13 | import re 14 | 15 | @app.errorhandler(404) 16 | def page_not_found(_): 17 | return Response('{"error": "API not found"}', 404, 18 | mimetype='application/json') 19 | 20 | VALID_CALLBACK_RE = re.compile('^[$A-Za-z_][0-9A-Za-z_$.]*$') 21 | 22 | 23 | class api_method(object): 24 | """use as a decorator to register an API""" 25 | all_url_rule = list() 26 | """class level attribute for all url rules""" 27 | 28 | url_rule = None 29 | """url rule for current API""" 30 | 31 | api_implementation = None 32 | """a callable implementing current API, which takes no argument and 33 | returns a dict""" 34 | 35 | url_rule_extra_kwargs = None 36 | """extra keyword arguments for url rule""" 37 | 38 | def __init__(self, url_rule, **kwargs): 39 | self.url_rule = url_rule 40 | self.url_rule_extra_kwargs = kwargs 41 | 42 | meth = self.url_rule_extra_kwargs.get('methods', None) 43 | if meth is not None: 44 | if 'POST' in meth: 45 | meth.append('OPTIONS') 46 | 47 | def __call__(self, func): 48 | self.api_implementation = func 49 | endpoint = func.__module__ + '.' + func.__name__ 50 | 51 | app.add_url_rule(self.url_rule, 52 | view_func=self.view_func, 53 | endpoint=endpoint, **self.url_rule_extra_kwargs) 54 | 55 | return func 56 | 57 | def view_func(self): 58 | """the view_func passed to Flask.add_url_rule""" 59 | if request.method == 'OPTIONS': 60 | resp = Response('', 200) 61 | resp.headers['Access-Control-Allow-Origin'] = '*' 62 | resp.headers['Access-Control-Allow-Headers'] = \ 63 | 'Content-Type, Origin, Accept' 64 | return resp 65 | rst = self.api_implementation() 66 | assert isinstance(rst, dict), \ 67 | "ret value {0} is not a dict".format(str(rst)) 68 | callback = request.values.get('callback') 69 | if callback and VALID_CALLBACK_RE.match(callback) is not None: 70 | rst = '{}({})'.format(callback, json.dumps(rst)) 71 | else: 72 | rst = json.dumps(rst, indent=4) 73 | resp = Response(rst, 200, mimetype='application/json') 74 | resp.headers['Access-Control-Allow-Origin'] = '*' 75 | resp.headers['Access-Control-Allow-Headers'] = \ 76 | 'Content-Type, Origin, Accept' 77 | return resp 78 | -------------------------------------------------------------------------------- /webapi/api/comment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: comment.py 4 | # Author: Yichen Wang 5 | 6 | from uklogger import * 7 | from . 
import api_method, request 8 | from ukdbconn import get_mongo 9 | 10 | # api: /comment?pid=2&uid=xxx&cmt=xxxx 11 | @api_method('/comment') 12 | def do_comment(): 13 | """ update db with user's comment & uid """ 14 | try: 15 | pid = int(request.values.get('pid')) 16 | uid = request.values.get('uid') 17 | comment = request.values.get('cmt') 18 | except Exception: 19 | return {'status': 'error', 20 | 'reason': 'invalid request'} 21 | 22 | db = get_mongo('paper') 23 | db.update({'_id': pid}, {'$push': {'comments': {'cmt': comment, 'uid': uid}}}) 24 | db.update({'_id': pid}, {'$inc': {'cmt_count': 1}}) 25 | log_info("Add {0}'s comment to pdf {1}".format(uid, pid)) 26 | 27 | return {'status': 'ok'} 28 | 29 | # api: /getcmt?pid=2&page=0,1,2... 30 | @api_method('/getcmt') 31 | def get_comment(): 32 | """ return first 10 comments of the paper with pid """ 33 | try: 34 | pid = int(request.values.get('pid')) 35 | page = int(request.values.get('page')) 36 | except Exception: 37 | return {'status': 'error', 38 | 'reason': 'invalid request'} 39 | 40 | db = get_mongo('paper') 41 | res = db.find_one({'_id': pid}, {'comments': {'$slice': [page*10, 10]}, 'cmt_count': 1}) 42 | log_info("Return 10 comments of paper {0}".format(pid)) 43 | 44 | if res is None: 45 | return {} 46 | return res 47 | 48 | -------------------------------------------------------------------------------- /webapi/api/download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: download.py 4 | # Date: 六 6月 14 03:23:41 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from . import app, make_response, request, api_method 8 | from ukdbconn import get_mongo 9 | from uklogger import * 10 | from lib.textutil import title_beautify 11 | from queryhandler import progress_dict 12 | from authorfetch import process_fetch_author 13 | 14 | # api: /download?pid=1 15 | @app.route('/download') 16 | def download(): 17 | pid = int(request.values.get('pid')) 18 | agent = str(request.user_agent) 19 | db = get_mongo('paper') 20 | 21 | 22 | doc = db.find_and_modify(query={'_id': pid}, 23 | update={'$inc': {'download_cnt': 1}}, 24 | fields={'pdf': 1, 'title': 1} 25 | ) 26 | if not doc: # check existence before touching the result 27 | return make_response(''), 404 28 | title = title_beautify(doc['title']) 29 | data = doc['pdf'] 30 | resp = make_response(data) 31 | resp.headers['Content-Type'] = 'application/pdf' 32 | 33 | 34 | # chrome doesn't work with comma in filename 35 | #if agent.find('Chrom') != -1: 36 | #title = title.replace(',', ' ') 37 | 38 | # TODO deal with unicode name! 39 | resp.headers['Content-Disposition'] = \ 40 | 'attachment; filename="{0}.pdf"'.format(title) 41 | return resp 42 | 43 | # api: /download_available?pid=1 44 | @api_method('/download_available') 45 | def available(): 46 | try: 47 | pid = int(request.values.get('pid')) 48 | #log_info("Query available of {0} with dic={1}". 
49 | #format(pid, str(progress_dict))) 50 | except: 51 | return {'status': 'error', 52 | 'reason': 'Invalid Request'} 53 | prgs = progress_dict.get(pid) 54 | if prgs is None: 55 | db = get_mongo('paper') 56 | doc = db.find_one({'_id': pid}, {'page': 1}) 57 | if not doc: 58 | return {'status': 'error', 59 | 'reason': 'no such item'} 60 | if doc.get('page'): 61 | doc['progress'] = 'done' 62 | else: 63 | doc['progress'] = 'failed' 64 | 65 | doc.update({'status': 'ok'}) 66 | return doc 67 | return {'status': 'ok', 68 | 'progress': prgs} 69 | 70 | # api: /fetchauthor?name=xxx&email=xxx 71 | @api_method('/fetchauthor') 72 | def fetchauthor(): 73 | """ fetch all papers of the author""" 74 | name = request.values.get('name').lower() 75 | email = request.values.get('email') 76 | 77 | process_fetch_author(name, email) 78 | return {'status': 'ok'} 79 | -------------------------------------------------------------------------------- /webapi/api/html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: html.py 4 | # Date: Tue May 20 18:01:39 2014 +0800 5 | # Author: Yuxin Wu 6 | 7 | from . import api_method, request 8 | from ukdbconn import get_mongo 9 | 10 | # api: /html?pid=2&page=0,1,3,5 11 | # 0 is the html framework 12 | @api_method('/html') 13 | def html(): 14 | """ return a dict of {pagenum: 'html'} """ 15 | try: 16 | pid = int(request.values.get('pid')) 17 | page_str = request.values.get('page') 18 | pages = list(map(int, page_str.split(','))) 19 | except Exception: 20 | return {'status': 'error', 21 | 'reason': 'invalid request'} 22 | db = get_mongo('paper') 23 | doc = db.find_one({'_id': pid}, {'page': 1, 'html': 1}) 24 | 25 | if max(pages) > doc['page'] or min(pages) < 0: 26 | return {'status': 'error', 27 | 'reason': 'invalid page index'} 28 | 29 | res = {} 30 | for p in pages: 31 | res[p] = doc['html'][p] 32 | return {'status': 'ok', 33 | 'htmls': res } 34 | -------------------------------------------------------------------------------- /webapi/api/mark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: mark.py 4 | # Author: Yichen Wang 5 | 6 | from uklogger import * 7 | from . 
import api_method, request 8 | from ukdbconn import get_mongo 9 | 10 | # api: /getmark?pid=2 11 | @api_method('/getmark') 12 | def mark(): 13 | """ get marks of the paper with pid """ 14 | try: 15 | pid = int(request.values.get('pid')) 16 | except Exception: 17 | return {'status': 'error', 18 | 'reason': 'invalid request'} 19 | 20 | db = get_mongo('paper') 21 | res = db.find_one({'_id': pid}, {'upvote': 1, 'downvote': 1}) 22 | log_info("Return marks of pdf {0}".format(pid)) 23 | 24 | if res is None: 25 | return {} 26 | return res 27 | 28 | # api: /mark?pid=2&mark=1,-1 29 | # 1: good -1: bad 30 | @api_method('/mark') 31 | def do_mark(): 32 | """ update db with user's mark & uid """ 33 | try: 34 | pid = int(request.values.get('pid')) 35 | mark = int(request.values.get('mark')) 36 | except Exception: 37 | return {'status': 'error', 38 | 'reason': 'invalid request'} 39 | 40 | db = get_mongo('paper') 41 | if mark == 1: 42 | db.update({'_id': pid}, {'$inc': {'upvote': 1}}) 43 | else: 44 | db.update({'_id': pid}, {'$inc': {'downvote': 1}}) 45 | log_info("Add mark to pdf {0}".format(pid)) 46 | 47 | return {'status': 'ok'} 48 | -------------------------------------------------------------------------------- /webapi/api/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: query.py 4 | # Date: 六 6月 14 03:17:06 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | import math 8 | from . import api_method, request 9 | from lib.textutil import title_beautify 10 | from queryhandler import handle_title_query, handle_content_query, handl_author_query 11 | from ukdbconn import get_mongo 12 | from dbsearch import SEARCH_RETURN_FIELDS 13 | 14 | 15 | def transform(r): 16 | if r.get('page'): 17 | r['haspdf'] = 1 18 | else: 19 | r['haspdf'] = 0 20 | 21 | try: 22 | r['citecnt'] = len(r['citedby']) 23 | del r['citedby'] 24 | except: 25 | r['citecnt'] = 0 26 | if 'author' in r: 27 | r['author'] = [title_beautify(x) for x in r['author']] 28 | r['title'] = title_beautify(r['title']) 29 | return r 30 | 31 | def sort_content(res): 32 | def score(r): 33 | w = r['weight'] 34 | c = r['citecnt'] 35 | c = max([c, 10]) 36 | return (w ** 2) * c 37 | 38 | print([r['weight'] for r in res]) 39 | print([r['citecnt'] for r in res]) 40 | print([score(r) for r in res]) 41 | res = sorted(res, key=score) 42 | return res 43 | 44 | def do_query(query): 45 | tp = 'title' 46 | res = handle_title_query(query) 47 | if not res: 48 | res = do_search_author(query) 49 | if res: 50 | return res 51 | res = handle_content_query(query) 52 | tp = 'content' 53 | 54 | assert isinstance(res, list) 55 | 56 | res = list(map(transform, res)) 57 | 58 | if tp == 'content': 59 | res = sort_content(res) 60 | return {'status': 'ok', 61 | 'type': tp, 62 | 'results': res} 63 | 64 | # api: /query?q=test 65 | @api_method('/query') 66 | def query(): 67 | """ first try title-search, then content-search """ 68 | query = request.values.get('q') 69 | if query == 'None': 70 | return {'status': 'error', 71 | 'reason': 'invalid request'} 72 | return do_query(query) 73 | 74 | 75 | # api: /cquery?q=test 76 | @api_method('/cquery') 77 | def content_query(): 78 | """ only use content-search backend """ 79 | try: 80 | query = request.values.get('q') 81 | assert query != "None" 82 | except: 83 | return {'status': 'error', 84 | 'reason': 'invalid request'} 85 | 86 | res = handle_content_query(query) 87 | assert isinstance(res, list) 88 | res = list(map(transform, res)) 89 | 90 | return 
{'status': 'ok', 91 | 'type': 'author', 92 | 'results': res} 93 | res = sort_content(res) 94 | 95 | return {'status': 'ok', 96 | 'type': 'content', 97 | 'results': res 98 | } 99 | 100 | def do_search_author(name): 101 | res = handl_author_query(name) 102 | if not res: 103 | return None 104 | 105 | res = list(map(transform, res)) 106 | res = sorted(res, key=lambda x: x.get('citecnt', 0)) 107 | return {'status': 'ok', 108 | 'type': 'author', 109 | 'results': res} 110 | 111 | 112 | # api: /author?name=xxx 113 | @api_method('/author') 114 | def search_author(): 115 | """ search db by author name 116 | return a list of paper info """ 117 | try: 118 | name = request.values.get('name').lower() 119 | except Exception: 120 | return {'status': 'error', 121 | 'reason': 'invalid request'} 122 | 123 | ret = do_search_author(name) 124 | if ret: 125 | return ret 126 | else: 127 | return {'status': 'ok', 128 | 'type': 'author', 129 | 'results': []} 130 | -------------------------------------------------------------------------------- /webapi/api/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | # $File: test.py 4 | # $Date: Mon Mar 24 16:23:54 2014 +0800 5 | # $Author: jiakai 6 | 7 | """test with user id""" 8 | 9 | from . import api_method, request 10 | 11 | 12 | @api_method('/test') 13 | def test(): 14 | """given user id and return all items""" 15 | uid = request.values.get('uid') 16 | if not uid: 17 | return {'error': 'please visit with uid=1'} 18 | return {'data': 'hi world {0}'.format(uid)} 19 | -------------------------------------------------------------------------------- /webapi/api/view.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: UTF-8 -*- 3 | # File: view.py 4 | # Date: Mon May 26 17:10:14 2014 +0000 5 | # Author: Yuxin Wu 6 | 7 | from . import app, redirect, url_for, make_response, request, api_method 8 | 9 | from flask import render_template 10 | 11 | @app.route('/') 12 | def home(): 13 | return render_template('index.html') 14 | 15 | @app.route('/search') 16 | def nosearch(): 17 | return render_template('search.html') 18 | 19 | 20 | @app.route('/s') 21 | def search(): 22 | search_word = request.values.get('keyword', None) 23 | return render_template('search.html', searchkeyword=search_word); 24 | 25 | # redirect(url_for('static', filename='search.html')) 26 | 27 | 28 | @app.route('/api') 29 | def api(): 30 | """ show all the apis""" 31 | 32 | ret = """ 33 | api: /download?pid=1 34 | 35 | api: /query?q=test 36 | 37 | api: /html?pid=2&page=0,1,3,5 38 | 0 is the html framework 39 | 40 | api: /download_available?pid=1 41 | 42 | api: /author?name=xxx 43 | 44 | api: /mark?pid=2&mark=1,-1 45 | 46 | api: /comment?pid=2&uid=xxx&cmt=xxxx 47 | """ 48 | ret = ret.strip().replace('\n', '
') 49 | resp = make_response(ret) 50 | resp.headers['Content-Type'] = 'text/html' 51 | return resp 52 | -------------------------------------------------------------------------------- /webapi/standalone_server.py: -------------------------------------------------------------------------------- 1 | #!./manage/exec-in-virtualenv.sh 2 | # -*- coding: utf-8 -*- 3 | # $File: standalone_server.py 4 | # $Author: jiakai 5 | 6 | from webapi import get_app 7 | import signal 8 | import sys 9 | import os 10 | import os.path 11 | 12 | 13 | def sigint_handler(s, f): 14 | """receive the SIGINT signal from unittest script 15 | and exit correctly""" 16 | print('api standalone server: SIGINT received, exit') 17 | sys.exit() 18 | 19 | 20 | def main(): 21 | signal.signal(signal.SIGINT, sigint_handler) 22 | 23 | app = get_app() 24 | try: 25 | app.config.from_pyfile('../manage/api_website_config.py') 26 | except IOError as e: 27 | print(e) 28 | print('WARNING: No configuration found, using builtin defaults.') 29 | 30 | app.run(app.config['API_HOST'], app.config['API_PORT'], 31 | **app.config['API_RUN_OPTIONS']) 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /webapi/static/css/homepage.css: -------------------------------------------------------------------------------- 1 | /******************************* 2 | Global 3 | *******************************/ 4 | 5 | html, 6 | body { 7 | font-size: 14px; 8 | height: 100%; 9 | line-height: 24px; 10 | } 11 | 12 | body { 13 | /*font-family: Microsoft YaHei ! important;*/ 14 | font-family: "Open Sans", "Helvetica Neue", "Helvetica", "Arial", sans-serif; 15 | background: #FFFFFF; 16 | margin: 0px; 17 | padding: 0px; 18 | text-rendering: optimizeLegibility; 19 | min-width: 320px; 20 | } 21 | 22 | body.progress .ui.progress .bar { 23 | -webkit-animation: finish 10s ease-in-out infinite; 24 | -moz-animation: finish 10s ease-in-out infinite; 25 | -ms-animation: finish 10s ease-in-out infinite; 26 | animation: finish 10s ease-in-out infinite; 27 | } 28 | 29 | .ui.progress.inlist { 30 | padding-left:10px; 31 | } 32 | 33 | .ui.footer{ 34 | color: rgba(0, 0, 0, 0.4); 35 | line-height: 15px; 36 | font-size: 12px; 37 | text-align: center; 38 | margin-left: auto; 39 | margin-right: auto; 40 | margin-top:50px; 41 | } 42 | 43 | .ui.inverted.dimmer .ui.loader.large, 44 | .ui.loader.large { 45 | background-image: url(static/images/loader-large.gif); 46 | } 47 | 48 | .ui.author.images{ 49 | margin-left: auto; 50 | margin-right: auto; 51 | margin-top:100px; 52 | width: 360px; 53 | } 54 | 55 | .ui.author.images .image{ 56 | margin-right:20px; 57 | margin-left:20px; 58 | } 59 | 60 | 61 | /******************************* 62 | UI Overrides 63 | *******************************/ 64 | .ui.sidebar .item .icon{ 65 | display: inline; 66 | } 67 | 68 | .ui.sidebar .item .labels{ 69 | margin-left: 32px; 70 | } 71 | 72 | .ui.right.sidebar{ 73 | background: #FCFCFC url(static/images/bg.jpg) repeat; 74 | } 75 | 76 | .ui.icon.menu > .onleft.item{ 77 | text-align: left; 78 | } 79 | 80 | .ui.bib.modal{ 81 | top: 60%; 82 | left: 60%; 83 | text-align: left; 84 | width: 50%; 85 | } 86 | 87 | .ui.ref.modal{ 88 | top: 50%; 89 | left: 50%; 90 | text-align: left; 91 | /* 92 | *width: 70%; 93 | */ 94 | } 95 | 96 | .ui.com.modal{ 97 | /* 98 | *width: 80%; 99 | */ 100 | height:500px; 101 | } 102 | 103 | .ui.logo.image{ 104 | display: block; 105 | margin-left: auto; 106 | margin-right: auto 107 | } 108 | .ui.divided.list 
.description b{ 109 | color:#D95C5C; 110 | font-weight: bold; 111 | } 112 | 113 | .ui.attached.button{ 114 | z-index: 999; 115 | } 116 | .class-sprite{ 117 | background-image: url(images/allcat.jpg); 118 | background-repeat: no-repeat; 119 | margin-bottom: 10px; 120 | } 121 | 122 | #classico1{ 123 | width: 100px; 124 | height: 100px; 125 | background-position: 0 0; 126 | } 127 | #classico2{ 128 | width: 100px; 129 | height: 100px; 130 | background-position: -100px 0px ; 131 | } 132 | #classico3{ 133 | width: 100px; 134 | height: 100px; 135 | background-position: -200px 0; 136 | } 137 | #classico4{ 138 | width: 100px; 139 | height: 100px; 140 | background-position: -300px 0; 141 | } 142 | #classico5{ 143 | width: 100px; 144 | height: 100px; 145 | background-position: -400px 0px ; 146 | } 147 | #classico6{ 148 | width: 100px; 149 | height: 100px; 150 | background-position: -500px 0; 151 | } 152 | #classico7{ 153 | width: 100px; 154 | height: 100px; 155 | background-position: 0px -100px; 156 | } 157 | #classico8{ 158 | width: 100px; 159 | height: 100px; 160 | background-position: -100px -100px; 161 | } 162 | #classico9{ 163 | width: 100px; 164 | height: 100px; 165 | background-position: -200px -100px; 166 | } 167 | #classico10{ 168 | width: 100px; 169 | height: 100px; 170 | background-position: -300px -100px; 171 | } 172 | #classico11{ 173 | width: 100px; 174 | height: 100px; 175 | background-position: -400px -100px; 176 | } 177 | #classico12{ 178 | width: 100px; 179 | height: 100px; 180 | background-position: -500px -100px; 181 | } 182 | #classico13{ 183 | width: 100px; 184 | height: 100px; 185 | background-position: 0px -200px; 186 | } 187 | #classico14{ 188 | width: 100px; 189 | height: 100px; 190 | background-position: -100px -200px; 191 | } 192 | #classico15{ 193 | width: 100px; 194 | height: 100px; 195 | background-position: -200px -200px; 196 | } 197 | #classico16{ 198 | width: 100px; 199 | height: 100px; 200 | background-position: -300px -200px; 201 | } 202 | 203 | .ui.header { 204 | font-family: Microsoft YaHei ! 
important; 205 | /*'Source Sans Pro', "Helvetica Neue", "Helvetica", "Arial", sans-serif;*/ 206 | } 207 | 208 | .ui.header, 209 | h1, 210 | h2 { 211 | font-weight: normal !important; 212 | } 213 | 214 | h1.ui.header { 215 | font-size: 3em; 216 | } 217 | 218 | h4.ui.header, 219 | h5.ui.header, 220 | .ui.icon.header { 221 | font-weight: bold !important; 222 | } 223 | 224 | .ui.button { 225 | font-weight: normal; 226 | text-decoration: none; 227 | } 228 | 229 | a { 230 | color: #009FDA; 231 | text-decoration: none; 232 | -webkit-transition: color 0.3s ease; 233 | transition: color 0.3s ease; 234 | } 235 | 236 | a:hover { 237 | color: #00BAFF; 238 | } 239 | 240 | p a { 241 | font-weight: bold; 242 | } 243 | 244 | /******************************* 245 | Global 246 | *******************************/ 247 | 248 | .menu .right.menu > .mobile.item { 249 | display: none; 250 | } 251 | 252 | .ui.page.grid.segment { 253 | padding-top: 3.5rem; 254 | padding-bottom: 3.5rem; 255 | } 256 | 257 | .masthead.segment { 258 | background-image: url(images/bg.jpg); 259 | -webkit-background-size: cover; 260 | background-size: cover; 261 | margin: 0em; 262 | padding-top: 4rem; 263 | overflow: hidden; 264 | -webkit-box-shadow: 0 -20px 10px 0 rgba(0, 0, 0, 0.1) inset !important; 265 | box-shadow: 0 -20px 10px 0 rgba(0, 0, 0, 0.1) inset !important; 266 | } 267 | 268 | .masthead.segment .column { 269 | position: relative; 270 | } 271 | 272 | .masthead.segment .information { 273 | margin: 3em 1em 1em 280px; 274 | } 275 | 276 | .masthead.segment .information p { 277 | max-width: 600px; 278 | margin: 0em 0px 2em 0em; 279 | font-size: 1.2em; 280 | } 281 | 282 | .masthead.segment h1.ui.header .sub { 283 | font-size: 0.7em; 284 | } 285 | 286 | .masthead.segment .image { 287 | position: absolute; 288 | left: 0%; 289 | bottom: -110px; 290 | } 291 | 292 | .overview .divided.grid .header + p { 293 | min-height: 100px; 294 | } 295 | 296 | .ui.shape .icon, 297 | .link.list .icon { 298 | margin: 0em; 299 | padding: 0em; 300 | } 301 | 302 | .ui.cube.shape .side { 303 | background-color: transparent; 304 | } 305 | 306 | .ui.items > .row > .item, 307 | .ui.items > .item { 308 | min-height: 100px; 309 | } 310 | 311 | .ui.items > .item .avatar { 312 | max-width: 150px; 313 | } 314 | 315 | .ui.horizontal.items > .item > .content, .ui.items > .horizontal.item > .content { 316 | padding: 1% 1.7% 0% 3%; 317 | vertical-align: top; 318 | } 319 | 320 | .ui.bottom.sidebar { 321 | height: 80px !important; 322 | } 323 | 324 | .ui.active.bottom.sidebar { 325 | margin-top: -80px !important; 326 | } 327 | 328 | /*Sliding Text and Icon Menu Style*/ 329 | .sti-bg{ 330 | background:url(images/background-gray.png) repeat; 331 | width:100%; 332 | height: 348px; 333 | } 334 | 335 | .sti-menu{ 336 | width:968px; 337 | position:relative; 338 | margin:0 auto; 339 | padding-left:35px; 340 | list-style:none; 341 | } 342 | .sti-menu li{ 343 | float:left; 344 | width:242px; 345 | height:330px; 346 | } 347 | .sti-menu li a{ 348 | display:block; 349 | overflow:hidden; 350 | text-align:center; 351 | height:100%; 352 | width:100%; 353 | position:relative; 354 | 355 | } 356 | .sti-menu li a h2{ 357 | color:#fff; 358 | font-weight: normal; 359 | font-size:22px;; 360 | text-transform:uppercase; 361 | position:absolute; 362 | padding:10px; 363 | width:190px; 364 | top:215px; 365 | left:0px; 366 | text-shadow: 0px 1px 1px black; 367 | } 368 | .sti-menu li a h3{ 369 | font-size:15px; 370 | font-style:italic; 371 | color: #ebebeb; 372 | position:absolute; 373 | top:248px; 
374 | left:0px; 375 | width:190px; 376 | padding:10px; 377 | } 378 | .sti-icon{ 379 | width:178px; 380 | height:178px; 381 | position:absolute; 382 | background-position:top left; 383 | background-repeat:no-repeat; 384 | background-color:transparent; 385 | left:35px; 386 | top:35px; 387 | } 388 | .sti-icon-one{ 389 | background-image:url(images/icons/button1.png); 390 | } 391 | .sti-icon-two{ 392 | background-image:url(images/icons/button2.png); 393 | } 394 | .sti-icon-three{ 395 | background-image:url(images/icons/button3.png); 396 | } 397 | .sti-icon-four{ 398 | background-image:url(images/icons/button4.png); 399 | } 400 | 401 | .fallback{ 402 | display: none; 403 | } 404 | /******************************* 405 | Responsive 406 | *******************************/ 407 | 408 | /* Mobile Only */ 409 | 410 | @media only screen and (max-width : 768px) { 411 | .ui.page.grid.segment { 412 | padding-top: 2rem; 413 | padding-bottom: 2rem; 414 | } 415 | 416 | .menu .right.menu > .item { 417 | display: none; 418 | } 419 | 420 | .menu .right.menu > .mobile.item { 421 | display: block; 422 | } 423 | 424 | h1.ui.header { 425 | font-size: 1.5em; 426 | } 427 | 428 | .masthead.segment { 429 | -webkit-background-size: auto; 430 | background-size: auto; 431 | } 432 | 433 | .masthead.segment .information { 434 | margin-left: 190px; 435 | } 436 | 437 | .masthead.segment .image { 438 | bottom: -65px; 439 | width: 150px; 440 | } 441 | 442 | .masthead.segment .button { 443 | font-size: 1rem; 444 | } 445 | 446 | .overview .divided.grid .header .icon { 447 | font-size: 1.5em; 448 | } 449 | 450 | .overview .divided.grid .header + p { 451 | min-height: 0px; 452 | } 453 | 454 | .masthead.segment .column { 455 | font-size: 0.7rem; 456 | } 457 | 458 | .masthead.segment .column p { 459 | display: none; 460 | } 461 | 462 | .selection.list .right.floated { 463 | display: none; 464 | } 465 | 466 | .sti-menu { padding-left:16px; width:452px;} 467 | .sti-menu li {width: 226px; height:300px;} 468 | .sti-bg{height: 676px;} 469 | } 470 | 471 | @media only screen and (max-width : 1000px) { 472 | 473 | } 474 | 475 | -------------------------------------------------------------------------------- /webapi/static/fonts/basic.icons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/basic.icons.eot -------------------------------------------------------------------------------- /webapi/static/fonts/basic.icons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/basic.icons.ttf -------------------------------------------------------------------------------- /webapi/static/fonts/basic.icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/basic.icons.woff -------------------------------------------------------------------------------- /webapi/static/fonts/icons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/icons.eot -------------------------------------------------------------------------------- /webapi/static/fonts/icons.otf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/icons.otf -------------------------------------------------------------------------------- /webapi/static/fonts/icons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/icons.ttf -------------------------------------------------------------------------------- /webapi/static/fonts/icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/fonts/icons.woff -------------------------------------------------------------------------------- /webapi/static/images/bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/bg.jpg -------------------------------------------------------------------------------- /webapi/static/images/dark-bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/dark-bg.png -------------------------------------------------------------------------------- /webapi/static/images/loader-large-inverted.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/loader-large-inverted.gif -------------------------------------------------------------------------------- /webapi/static/images/loader-large.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/loader-large.gif -------------------------------------------------------------------------------- /webapi/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/logo.png -------------------------------------------------------------------------------- /webapi/static/images/ltz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/ltz.jpg -------------------------------------------------------------------------------- /webapi/static/images/tile-bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/tile-bg.png -------------------------------------------------------------------------------- /webapi/static/images/wyc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/wyc.jpg -------------------------------------------------------------------------------- /webapi/static/images/wyx.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/static/images/wyx.jpg -------------------------------------------------------------------------------- /webapi/static/index.html: -------------------------------------------------------------------------------- [HTML markup not preserved in this dump; only the rendered text "Sopaper" survives]
46 | 47 | -------------------------------------------------------------------------------- /webapi/static/javascript/angular-resource.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | AngularJS v1.3.0-beta.5 3 | (c) 2010-2014 Google, Inc. http://angularjs.org 4 | License: MIT 5 | */ 6 | (function(H,a,A){'use strict';function D(p,g){g=g||{};a.forEach(g,function(a,c){delete g[c]});for(var c in p)!p.hasOwnProperty(c)||"$"===c.charAt(0)&&"$"===c.charAt(1)||(g[c]=p[c]);return g}var v=a.$$minErr("$resource"),C=/^(\.[a-zA-Z_$][0-9a-zA-Z_$]*)+$/;a.module("ngResource",["ng"]).factory("$resource",["$http","$q",function(p,g){function c(a,c){this.template=a;this.defaults=c||{};this.urlParams={}}function t(n,w,l){function r(h,d){var e={};d=x({},w,d);s(d,function(b,d){u(b)&&(b=b());var k;if(b&& 7 | b.charAt&&"@"==b.charAt(0)){k=h;var a=b.substr(1);if(null==a||""===a||"hasOwnProperty"===a||!C.test("."+a))throw v("badmember",a);for(var a=a.split("."),f=0,c=a.length;f=b;e--)d.end&&d.end(f[e]);f.length=b}}var c,g,f=[],l=a;for(f.last=function(){return f[f.length-1]};a;){g=!0;if(f.last()&&x[f.last()])a=a.replace(RegExp("(.*)<\\s*\\/\\s*"+f.last()+"[^>]*>","i"),function(c,a){a=a.replace(H,"$1").replace(I,"$1");d.chars&&d.chars(r(a));return""}),e("",f.last());else{if(0===a.indexOf("\x3c!--"))c=a.indexOf("--",4),0<=c&&a.lastIndexOf("--\x3e",c)===c&&(d.comment&&d.comment(a.substring(4,c)),a=a.substring(c+3),g=!1);else if(y.test(a)){if(c=a.match(y))a= 8 | a.replace(c[0],""),g=!1}else if(J.test(a)){if(c=a.match(z))a=a.substring(c[0].length),c[0].replace(z,e),g=!1}else K.test(a)&&(c=a.match(A))&&(a=a.substring(c[0].length),c[0].replace(A,b),g=!1);g&&(c=a.indexOf("<"),g=0>c?a:a.substring(0,c),a=0>c?"":a.substring(c),d.chars&&d.chars(r(g)))}if(a==l)throw L("badparse",a);l=a}e()}function r(a){if(!a)return"";var d=M.exec(a);a=d[1];var b=d[3];if(d=d[2])n.innerHTML=d.replace(//g,">")}function s(a,d){var b=!1,e=h.bind(a,a.push);return{start:function(a,g,f){a=h.lowercase(a);!b&&x[a]&&(b=a);b||!0!==C[a]||(e("<"),e(a),h.forEach(g,function(b,f){var g=h.lowercase(f),k="img"===a&&"src"===g||"background"===g;!0!==P[g]||!0===D[g]&&!d(b,k)||(e(" "),e(f),e('="'),e(B(b)),e('"'))}), 10 | e(f?"/>":">"))},end:function(a){a=h.lowercase(a);b||!0!==C[a]||(e(""));a==b&&(b=!1)},chars:function(a){b||e(B(a))}}}var L=h.$$minErr("$sanitize"),A=/^<\s*([\w:-]+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,z=/^<\s*\/\s*([\w:-]+)[^>]*>/,G=/([\w:-]+)(?:\s*=\s*(?:(?:"((?:[^"])*)")|(?:'((?:[^'])*)')|([^>\s]+)))?/g,K=/^]*?)>/i,I=/]/,b=/^mailto:/;return function(e,c){function g(a){a&&m.push(E(a))}function f(a,b){m.push("');g(b);m.push("")}if(!e)return e; 14 | for(var l,k=e,m=[],n,p;l=k.match(d);)n=l[0],l[2]==l[3]&&(n="mailto:"+n),p=l.index,g(k.substr(0,p)),f(n,l[0].replace(b,"")),k=k.substring(p+l[0].length);g(k);return a(m.join(""))}}])})(window,window.angular); 15 | //# sourceMappingURL=angular-sanitize.min.js.map 16 | -------------------------------------------------------------------------------- /webapi/static/javascript/jquery.easing.1.3.js: -------------------------------------------------------------------------------- 1 | /* 2 | * jQuery Easing v1.3 - http://gsgd.co.uk/sandbox/jquery/easing/ 3 | * 4 | * Uses the built in easing capabilities added In jQuery 1.1 5 | * to offer multiple easing options 6 | * 7 | * TERMS OF USE - jQuery Easing 8 | * 9 | * Open source under the BSD License. 10 | * 11 | * Copyright © 2008 George McGinley Smith 12 | * All rights reserved. 
13 | * 14 | * Redistribution and use in source and binary forms, with or without modification, 15 | * are permitted provided that the following conditions are met: 16 | * 17 | * Redistributions of source code must retain the above copyright notice, this list of 18 | * conditions and the following disclaimer. 19 | * Redistributions in binary form must reproduce the above copyright notice, this list 20 | * of conditions and the following disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * Neither the name of the author nor the names of contributors may be used to endorse 24 | * or promote products derived from this software without specific prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 27 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 28 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 30 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 31 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 32 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 33 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 34 | * OF THE POSSIBILITY OF SUCH DAMAGE. 35 | * 36 | */ 37 | 38 | // t: current time, b: begInnIng value, c: change In value, d: duration 39 | jQuery.easing['jswing'] = jQuery.easing['swing']; 40 | 41 | jQuery.extend( jQuery.easing, 42 | { 43 | def: 'easeOutQuad', 44 | swing: function (x, t, b, c, d) { 45 | //alert(jQuery.easing.default); 46 | return jQuery.easing[jQuery.easing.def](x, t, b, c, d); 47 | }, 48 | easeInQuad: function (x, t, b, c, d) { 49 | return c*(t/=d)*t + b; 50 | }, 51 | easeOutQuad: function (x, t, b, c, d) { 52 | return -c *(t/=d)*(t-2) + b; 53 | }, 54 | easeInOutQuad: function (x, t, b, c, d) { 55 | if ((t/=d/2) < 1) return c/2*t*t + b; 56 | return -c/2 * ((--t)*(t-2) - 1) + b; 57 | }, 58 | easeInCubic: function (x, t, b, c, d) { 59 | return c*(t/=d)*t*t + b; 60 | }, 61 | easeOutCubic: function (x, t, b, c, d) { 62 | return c*((t=t/d-1)*t*t + 1) + b; 63 | }, 64 | easeInOutCubic: function (x, t, b, c, d) { 65 | if ((t/=d/2) < 1) return c/2*t*t*t + b; 66 | return c/2*((t-=2)*t*t + 2) + b; 67 | }, 68 | easeInQuart: function (x, t, b, c, d) { 69 | return c*(t/=d)*t*t*t + b; 70 | }, 71 | easeOutQuart: function (x, t, b, c, d) { 72 | return -c * ((t=t/d-1)*t*t*t - 1) + b; 73 | }, 74 | easeInOutQuart: function (x, t, b, c, d) { 75 | if ((t/=d/2) < 1) return c/2*t*t*t*t + b; 76 | return -c/2 * ((t-=2)*t*t*t - 2) + b; 77 | }, 78 | easeInQuint: function (x, t, b, c, d) { 79 | return c*(t/=d)*t*t*t*t + b; 80 | }, 81 | easeOutQuint: function (x, t, b, c, d) { 82 | return c*((t=t/d-1)*t*t*t*t + 1) + b; 83 | }, 84 | easeInOutQuint: function (x, t, b, c, d) { 85 | if ((t/=d/2) < 1) return c/2*t*t*t*t*t + b; 86 | return c/2*((t-=2)*t*t*t*t + 2) + b; 87 | }, 88 | easeInSine: function (x, t, b, c, d) { 89 | return -c * Math.cos(t/d * (Math.PI/2)) + c + b; 90 | }, 91 | easeOutSine: function (x, t, b, c, d) { 92 | return c * Math.sin(t/d * (Math.PI/2)) + b; 93 | }, 94 | easeInOutSine: function (x, t, b, c, d) { 95 | return -c/2 * (Math.cos(Math.PI*t/d) - 1) + b; 96 | }, 97 | easeInExpo: function (x, t, 
b, c, d) { 98 | return (t==0) ? b : c * Math.pow(2, 10 * (t/d - 1)) + b; 99 | }, 100 | easeOutExpo: function (x, t, b, c, d) { 101 | return (t==d) ? b+c : c * (-Math.pow(2, -10 * t/d) + 1) + b; 102 | }, 103 | easeInOutExpo: function (x, t, b, c, d) { 104 | if (t==0) return b; 105 | if (t==d) return b+c; 106 | if ((t/=d/2) < 1) return c/2 * Math.pow(2, 10 * (t - 1)) + b; 107 | return c/2 * (-Math.pow(2, -10 * --t) + 2) + b; 108 | }, 109 | easeInCirc: function (x, t, b, c, d) { 110 | return -c * (Math.sqrt(1 - (t/=d)*t) - 1) + b; 111 | }, 112 | easeOutCirc: function (x, t, b, c, d) { 113 | return c * Math.sqrt(1 - (t=t/d-1)*t) + b; 114 | }, 115 | easeInOutCirc: function (x, t, b, c, d) { 116 | if ((t/=d/2) < 1) return -c/2 * (Math.sqrt(1 - t*t) - 1) + b; 117 | return c/2 * (Math.sqrt(1 - (t-=2)*t) + 1) + b; 118 | }, 119 | easeInElastic: function (x, t, b, c, d) { 120 | var s=1.70158;var p=0;var a=c; 121 | if (t==0) return b; if ((t/=d)==1) return b+c; if (!p) p=d*.3; 122 | if (a < Math.abs(c)) { a=c; var s=p/4; } 123 | else var s = p/(2*Math.PI) * Math.asin (c/a); 124 | return -(a*Math.pow(2,10*(t-=1)) * Math.sin( (t*d-s)*(2*Math.PI)/p )) + b; 125 | }, 126 | easeOutElastic: function (x, t, b, c, d) { 127 | var s=1.70158;var p=0;var a=c; 128 | if (t==0) return b; if ((t/=d)==1) return b+c; if (!p) p=d*.3; 129 | if (a < Math.abs(c)) { a=c; var s=p/4; } 130 | else var s = p/(2*Math.PI) * Math.asin (c/a); 131 | return a*Math.pow(2,-10*t) * Math.sin( (t*d-s)*(2*Math.PI)/p ) + c + b; 132 | }, 133 | easeInOutElastic: function (x, t, b, c, d) { 134 | var s=1.70158;var p=0;var a=c; 135 | if (t==0) return b; if ((t/=d/2)==2) return b+c; if (!p) p=d*(.3*1.5); 136 | if (a < Math.abs(c)) { a=c; var s=p/4; } 137 | else var s = p/(2*Math.PI) * Math.asin (c/a); 138 | if (t < 1) return -.5*(a*Math.pow(2,10*(t-=1)) * Math.sin( (t*d-s)*(2*Math.PI)/p )) + b; 139 | return a*Math.pow(2,-10*(t-=1)) * Math.sin( (t*d-s)*(2*Math.PI)/p )*.5 + c + b; 140 | }, 141 | easeInBack: function (x, t, b, c, d, s) { 142 | if (s == undefined) s = 1.70158; 143 | return c*(t/=d)*t*((s+1)*t - s) + b; 144 | }, 145 | easeOutBack: function (x, t, b, c, d, s) { 146 | if (s == undefined) s = 1.70158; 147 | return c*((t=t/d-1)*t*((s+1)*t + s) + 1) + b; 148 | }, 149 | easeInOutBack: function (x, t, b, c, d, s) { 150 | if (s == undefined) s = 1.70158; 151 | if ((t/=d/2) < 1) return c/2*(t*t*(((s*=(1.525))+1)*t - s)) + b; 152 | return c/2*((t-=2)*t*(((s*=(1.525))+1)*t + s) + 2) + b; 153 | }, 154 | easeInBounce: function (x, t, b, c, d) { 155 | return c - jQuery.easing.easeOutBounce (x, d-t, 0, c, d) + b; 156 | }, 157 | easeOutBounce: function (x, t, b, c, d) { 158 | if ((t/=d) < (1/2.75)) { 159 | return c*(7.5625*t*t) + b; 160 | } else if (t < (2/2.75)) { 161 | return c*(7.5625*(t-=(1.5/2.75))*t + .75) + b; 162 | } else if (t < (2.5/2.75)) { 163 | return c*(7.5625*(t-=(2.25/2.75))*t + .9375) + b; 164 | } else { 165 | return c*(7.5625*(t-=(2.625/2.75))*t + .984375) + b; 166 | } 167 | }, 168 | easeInOutBounce: function (x, t, b, c, d) { 169 | if (t < d/2) return jQuery.easing.easeInBounce (x, t*2, 0, c, d) * .5 + b; 170 | return jQuery.easing.easeOutBounce (x, t*2-d, 0, c, d) * .5 + c*.5 + b; 171 | } 172 | }); 173 | 174 | /* 175 | * 176 | * TERMS OF USE - EASING EQUATIONS 177 | * 178 | * Open source under the BSD License. 179 | * 180 | * Copyright © 2001 Robert Penner 181 | * All rights reserved. 
182 | * 183 | * Redistribution and use in source and binary forms, with or without modification, 184 | * are permitted provided that the following conditions are met: 185 | * 186 | * Redistributions of source code must retain the above copyright notice, this list of 187 | * conditions and the following disclaimer. 188 | * Redistributions in binary form must reproduce the above copyright notice, this list 189 | * of conditions and the following disclaimer in the documentation and/or other materials 190 | * provided with the distribution. 191 | * 192 | * Neither the name of the author nor the names of contributors may be used to endorse 193 | * or promote products derived from this software without specific prior written permission. 194 | * 195 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 196 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 197 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 198 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 199 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 200 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 201 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 202 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 203 | * OF THE POSSIBILITY OF SUCH DAMAGE. 204 | * 205 | */ -------------------------------------------------------------------------------- /webapi/static/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Sopaper 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 42 | 43 | 241 |
[HTML markup not preserved in this dump; only the rendered text "DETAIL", "SEARCH", and the placeholder "PDF FILE GOES HERE" survives]
-------------------------------------------------------------------------------- /webapi/templates/ZeroClipboard.swf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppwwyyxx/SoPaper/2b50495d376887f46609d9dfbb6a08673434a2e6/webapi/templates/ZeroClipboard.swf -------------------------------------------------------------------------------- /webapi/templates/index.html: -------------------------------------------------------------------------------- [HTML markup not preserved in this dump; only the rendered text "Sopaper" survives]
--------------------------------------------------------------------------------