├── client
├── views
│ └── config.pug
├── src
│ ├── config.js
│ └── components
│ │ └── config
│ │ ├── dialog.jsx
│ │ ├── miniDashboard.jsx
│ │ ├── dashboard.jsx
│ │ ├── createUser.jsx
│ │ ├── invites.jsx
│ │ └── app.jsx
└── public
│ └── index.html
├── server
├── crawler
│ ├── crawlerModel.js
│ ├── intervalModel.js
│ ├── crawlerController.js
│ └── crawler.js
├── user
│ ├── userModel.js
│ └── userController.js
├── session
│ ├── sessionModel.js
│ └── sessionController.js
├── invite
│ ├── inviteModel.js
│ └── inviteController.js
├── endpoint
│ ├── endpointModel.js
│ └── endpointController.js
└── server.js
├── webpack.config.js
├── bin
├── pull.sh
└── start.sh
├── .gitignore
├── package.json
└── README.md
/client/views/config.pug:
--------------------------------------------------------------------------------
1 | html
2 | head
3 | title LiveAPI - Config
4 | link(rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/semantic-ui/2.2.11/semantic.min.css')
5 | body
6 | div#root
7 | script.
8 | const status='#{status}';
9 | script(src='static/bundles/config.bundle.js')
10 |
--------------------------------------------------------------------------------
/server/crawler/crawlerModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');
const Schema = mongoose.Schema;

// NOTE(review): each model file opens its own connection; mongoose reuses the
// default connection, but this should ideally live in one shared db module.
mongoose.connect('mongodb://localhost:27017', {
  useMongoClient: true,
});

mongoose.Promise = global.Promise;

// One document per scrape run of an endpoint.
const crawlerSchema = new Schema({
  endpoint: {type: String, required: true}, // endpoint name from the config file
  // BUG FIX: was `default: Date.now()`, which runs ONCE at schema-definition
  // time, stamping every document with the server-start timestamp. Passing the
  // function itself makes mongoose evaluate it per document.
  scrape_date: {type: Date, default: Date.now, required: true},
  data: {type: Object, required: true}, // scraped payload produced by the crawler
});

module.exports = mongoose.model("Crawler", crawlerSchema);
--------------------------------------------------------------------------------
/server/crawler/intervalModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');

mongoose.connect('mongodb://localhost:27017', {
  useMongoClient: true,
});

mongoose.Promise = global.Promise;

// Describes how often a given endpoint's start URL should be re-scraped.
const intervalSchema = new mongoose.Schema({
  endpoint: {type: String, required: true},
  url: {type: String, required: true},
  interval: {type: Number, required: true, default: 600000}, // default: 10 minutes (ms)
});

module.exports = mongoose.model("Interval", intervalSchema);
--------------------------------------------------------------------------------
/client/src/config.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { render } from 'react-dom';
3 | //import 'semantic-ui-css/semantic.min.css';
4 | import App from './components/config/app.jsx';
5 |
6 | const randomGradient = () => {
7 | return 'background: #FFAFBD; background: -webkit-linear-gradient(to right, #ffc3a0, #FFAFBD);background: linear-gradient(to right, #ffc3a0, #FFAFBD);'
8 | }
9 | document.getElementById('root').setAttribute('style', randomGradient());
10 |
11 | render(
12 | ,
13 | document.getElementById('root')
14 | );
15 |
--------------------------------------------------------------------------------
/client/src/components/config/dialog.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import { Card, Input, Divider, Button, Form, Icon } from 'semantic-ui-react'
4 |
5 | const InfoDialog = (props) => {
6 | return (
7 |
8 |
9 |
10 | {props.header}
11 |
12 |
13 |
14 |
15 | {props.linkText}
16 |
17 |
18 | );
19 | }
20 |
21 | export default InfoDialog;
--------------------------------------------------------------------------------
/server/user/userModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');
const Schema = mongoose.Schema;
const bcrypt = require('bcryptjs');

mongoose.connect('mongodb://localhost:27017', {
  useMongoClient: true,
});

mongoose.Promise = global.Promise;

const SALT_WORK_FACTOR = 10;

const userSchema = new Schema({
  username: {type: String, required: true, unique: true},
  password: {type: String, required: true},
  admin: {type: Boolean, default: false},
});

// Hash the password before persisting.
// BUG FIX: the hook previously hashed on EVERY save, so re-saving an existing
// user document would hash the already-hashed password and lock the user out.
// Only hash when the password field actually changed.
userSchema.pre('save', function(next) {
  if (!this.isModified('password')) return next();
  this.password = bcrypt.hashSync(this.password, SALT_WORK_FACTOR);
  next();
});

module.exports = mongoose.model('User', userSchema);
24 |
--------------------------------------------------------------------------------
/server/session/sessionModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');
const Schema = mongoose.Schema;
// NOTE(review): this file never calls mongoose.connect — it relies on another
// model module (e.g. userModel) having opened the default connection first.

/**
 * Session document keyed by the value stored in the client's `sid` cookie.
 *
 * The `createdAt` field uses Mongo's automatic document-expiration (TTL)
 * service via the Mongoose `expires` property. With `expires: 86400` the
 * session is removed roughly 24 hours after creation — NOT 30 seconds as the
 * original comment claimed. (Mongo's TTL cleanup runs about once per minute,
 * so a session may linger slightly past the 24-hour mark before deletion.)
 */
const sessionSchema = new Schema({
  cookieId: { type: String, required: true, unique: false },
  createdAt: { type: Date, expires: 86400, default: Date.now }
});

module.exports = mongoose.model('Session', sessionSchema);
17 |
--------------------------------------------------------------------------------
/server/invite/inviteModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');
const Schema = mongoose.Schema;

/**
 * One-time invitation generated by an admin so a new user can register.
 * - valid:    whether the invite may still be used (defaults to true)
 * - creator:  id of the user who generated the invite
 * - redeemer: id of the account created with this invite; an invite with a
 *             redeemer can no longer be used (enforced in inviteController).
 *
 * FIX: the schema variable was copy-pasted from sessionModel as
 * `sessionSchema`, and the leading comment described TTL expiration that does
 * not apply here (this schema has no `expires` field).
 */
const inviteSchema = new Schema({
  valid: { type: Boolean, required: true, default: true, },
  creator: { type: String, required: false },
  redeemer: { type: String, required: false }
});

module.exports = mongoose.model('Invite', inviteSchema);
18 |
--------------------------------------------------------------------------------
/client/src/components/config/miniDashboard.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import { Card, Menu, Label } from 'semantic-ui-react';
4 | import Invites from './invites.jsx';
5 |
6 | const Dashboard = (props) => {
7 | let message = null;
8 | if (props.message) message =
;
9 |
10 | return (
11 |
12 |
13 |
14 | LiveAPI Dashboard
15 | {message}
16 |
17 |
18 |
19 |
20 |
21 |
22 | Documentation
23 |
24 |
25 | );
26 | }
27 |
28 | export default Dashboard;
29 |
--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require('path');
2 | const webpack = require('webpack');
3 |
4 | module.exports = {
5 | entry: {
6 | // Components for '/config' route
7 | config: path.join(__dirname, 'client/src/config.js'),
8 | },
9 | output: {
10 | path: path.join(__dirname, 'client/public/bundles'),
11 | filename: '[name].bundle.js'
12 | },
13 | module: {
14 | loaders: [
15 | {
16 | test: /.jsx?$/,
17 | loader: 'babel-loader',
18 | include: path.join(__dirname, 'client/src'),
19 | exclude: /node_modules/,
20 | query: {
21 | presets: [
22 | 'react',
23 | ['env', {
24 | "modules": false,
25 | "targets": {
26 | "browsers": ["last 2 Chrome versions"]
27 | }
28 | }]
29 | ],
30 | 'plugins': [],
31 | }
32 | }
33 | ]
34 | },
35 | };
36 |
--------------------------------------------------------------------------------
/client/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | sample text in div 1
7 |
8 | - list item 1 in 1st div
9 | - list item 2 in 1st div (bold text)
10 |
11 |
12 |
13 |
14 | second div here with emphasized with deeper text
15 |
16 |
17 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/bin/pull.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# FIX: the shebang was buried on line 6 — it must be the first line of the
# file to select the interpreter when the script is executed directly.

# The purpose of this script is to pull the LAS source from github and prepare it for installation

# Install with:
# curl -s https://raw.githubusercontent.com/Live-API/LAS/4b82aa830d5691f1815f5660d99d3198c3bc4849/bin/pull.sh | bash -s

# Create a LAS directory
#INSTALL_DIR='LiveAPI'
#mkdir $INSTALL_DIR
#cd $INSTALL_DIR

# Install git

# Mac OS
if hash brew 2>/dev/null; then
  echo "Installing git with homebrew"
  brew install git
# Ubuntu/Debian
elif hash apt-get 2>/dev/null; then
  echo "Installing git with apt-get"
  sudo apt-get install -y git-all
# Enterprise Linux (e.g. Amazon Linux)
elif hash yum 2>/dev/null; then
  echo "Installing git with yum"
  sudo yum -y install git
fi

# Clone the repo
echo "Cloning git repo"
git clone -b master --single-branch https://github.com/live-api/las --depth 1

# FIX: guard the cd so start.sh is not executed from the wrong directory when
# the clone fails.
cd las || exit 1
sudo bin/start.sh
39 |
--------------------------------------------------------------------------------
/server/endpoint/endpointModel.js:
--------------------------------------------------------------------------------
const mongoose = require('mongoose');
const Schema = mongoose.Schema;

mongoose.connect('mongodb://localhost:27017', {
  useMongoClient: true,
});

mongoose.Promise = global.Promise;

// Definition data from endpoint creation POST
//{
//  url: Starting URL of scrape
//  interval: Time between scrapes in seconds, used in IntervalModel
//  endpoint: Unique name of the endpoint and where the data can be retrieved from
//  text: Object of elements to scrape (e.g. {name: [...DOM paths]})
//  images: Object of images to scrape
//  backgroundImages: Object of background images to scrape
//  pagination: Button to press after scraping a page
//}

const endpointSchema = new Schema({
  endpoint: {type: String, required: true, unique: true},
  creator: {type: String},
  // BUG FIX: `url` had `default: Date.now()` (copy-pasted from a Date field),
  // which would put a numeric timestamp into a String URL. A required URL
  // field should have no default at all.
  url: {type: String, required: true},
  text: {type: Object, required: false},
  images: {type: Object, required: false},
  backgroundImages: {type: Object, required: false},
  pagination: {type: Object, required: false},
});

module.exports = mongoose.model("Endpoint", endpointSchema);
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 |
8 | # Runtime data
9 | pids
10 | *.pid
11 | *.seed
12 | *.pid.lock
13 |
14 | # Directory for instrumented libs generated by jscoverage/JSCover
15 | lib-cov
16 |
17 | # Coverage directory used by tools like istanbul
18 | coverage
19 |
20 | # nyc test coverage
21 | .nyc_output
22 |
23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24 | .grunt
25 |
26 | # Bower dependency directory (https://bower.io/)
27 | bower_components
28 |
29 | # node-waf configuration
30 | .lock-wscript
31 |
32 | # Compiled binary addons (http://nodejs.org/api/addons.html)
33 | build/Release
34 |
35 | # Dependency directories
36 | node_modules/
37 | jspm_packages/
38 |
39 | # Typescript v1 declaration files
40 | typings/
41 |
42 | # Optional npm cache directory
43 | .npm
44 |
45 | # Optional eslint cache
46 | .eslintcache
47 |
48 | # Optional REPL history
49 | .node_repl_history
50 |
51 | # Output of 'npm pack'
52 | *.tgz
53 |
54 | # Yarn Integrity file
55 | .yarn-integrity
56 |
57 | # dotenv environment variables file
58 | .env
59 |
60 | # Database files
61 | db/
62 |
63 | # Build files
64 | client/public/bundles
65 |
66 | # SSL Keys
67 | ssl/
68 |
69 | # Hidden file for installation script to know which version is installed
70 | .LAS_status
--------------------------------------------------------------------------------
/client/src/components/config/dashboard.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import { Grid, Segment, Menu, Header, Label, Divider } from 'semantic-ui-react';
4 | import Invites from './invites.jsx';
5 |
6 | class Dashboard extends Component {
7 |
8 | // A larger dashboard that can be used when we have more content to put on it
9 |
10 | render() {
11 | let message = null;
12 | if (this.props.message) message = ;
13 | return (
14 |
15 |
16 |
22 |
23 |
24 |
25 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 | );
37 | }
38 | }
39 |
40 | export default Dashboard;
41 |
--------------------------------------------------------------------------------
/server/session/sessionController.js:
--------------------------------------------------------------------------------
const Session = require('./sessionModel');

const sessionController = {};

/**
 * isLoggedIn - look up the Session referenced by the `sid` cookie; if it
 * exists (TTL-expired sessions are removed by Mongo automatically), expose
 * the user's id as res.locals.userId and continue. Otherwise respond 401.
 *
 * FIX: the check was `if (res.locals.userId = (await ...).cookieId)` — an
 * assignment buried inside the condition, which also threw a TypeError
 * (silently caught) whenever the session was missing. The lookup and the
 * null check are now explicit.
 */
sessionController.isLoggedIn = async (req, res, next) => {
  try {
    const session = await Session.findById(req.cookies.sid);
    if (session && session.cookieId) {
      res.locals.userId = session.cookieId;
      next();
    }
    // Else respond with error status
    else res.status(401).send('Invalid or expired token');
  }
  catch (err) {
    console.log(err);
    res.status(401);
    res.send('Invalid or expired token');
  }
};

/**
 * startSession - persist a new Session for res.locals.userId and hand the
 * new session document's _id back to the client as the `sid` cookie.
 */
sessionController.startSession = async (req, res, next) => {
  try {
    const session = new Session({cookieId: res.locals.userId});
    res.cookie('sid', (await session.save())._id);
    next();
  }
  catch (err) {
    console.log('Error starting session', err);
    res.status(500);
    res.send();
  }
};

module.exports = sessionController;
46 |
--------------------------------------------------------------------------------
/server/invite/inviteController.js:
--------------------------------------------------------------------------------
const Invite = require('./inviteModel');

const inviteController = {};

// Generates an invite and exposes { id } (or { err }) on res.locals.invite.
inviteController.createInvite = async (req, res, next) => {
  // Create a new invite and save the id
  try { res.locals.invite = { id: (await Invite.create({ creator: res.locals.userId }))._id } }
  // Else pass along the error
  catch (err) { res.locals.invite = { err } }
  next();
}

// Marks the invite as redeemed by recording the new user's id.
// BUG FIX: this read only `res.locals.userid` (lowercase d) while the auth
// middleware in this codebase sets `res.locals.userId`, so the redeemer was
// recorded as undefined. Accept either casing for compatibility.
inviteController.redeemInvite = async (req, res, next) => {
  try {
    const redeemer = res.locals.userId || res.locals.userid;
    await Invite.findByIdAndUpdate(res.locals.inviteId, { redeemer });
    next();
  }
  // Else pass along error
  catch (err) { res.status(500).send() }
}

// Checks that the current invite ID exists, is valid, and is unredeemed.
inviteController.verifyInvite = async (req, res, next) => {
  try {
    const invite = await Invite.findById(res.locals.inviteId);
    if (invite && invite.valid && !invite.redeemer) next();
    // Else respond with error status
    else res.status(401).send('Invalid or expired invite')
  }
  catch (err) {
    console.log(err);
    res.status(401);
    res.send('Invalid or expired invite')
  }
};

module.exports = inviteController;
39 |
--------------------------------------------------------------------------------
/server/endpoint/endpointController.js:
--------------------------------------------------------------------------------
const Endpoint = require('./endpointModel.js');
const Interval = require('./../crawler/intervalModel.js');

const endpointController = {

  // Retrieves an endpoint definition for a given endpoint name.
  // Returns null if none is found or the lookup fails (best-effort by design).
  getEndpoint: async endpoint => {
    try { return await Endpoint.findOne({endpoint}); }
    catch (err) { return null; }
  },

  // Express middleware.
  // Upserts the endpoint definition: 200 on success, 400 on failure.
  // FIX: `next()` was previously called unconditionally AFTER the response
  // had been sent on both the success and the error path, so downstream
  // middleware ran after a 400 and could attempt a second response. next()
  // now runs only after a successful upsert.
  setEndpoint: async (req, res, next) => {
    try {
      console.log('Incoming endpoint definition: ', req.body);
      // Insert if the endpoint doesn't exist, update if it does
      const doc = await Endpoint.update(
        { endpoint: req.body.endpoint },
        {
          endpoint: req.body.endpoint,
          creator: res.locals.userId,
          url: req.body.url,
          text: req.body.text,
          images: req.body.images,
          backgroundImages: req.body.backgroundImages,
          pagination: req.body.pagination,
        },
        { upsert : true });
      console.log('doc', doc);
      console.log('endpointController', req.body.endpoint);
      res.status(200);
      res.send(`Endpoint successfully created: /crawls/${req.body.endpoint}`);
      next();
    }
    catch (err) {
      console.log('Error saving endpoint: ', err);
      res.status(400);
      res.send(err);
    }
  }

}

module.exports = endpointController;
48 |
--------------------------------------------------------------------------------
/client/src/components/config/createUser.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import { Card, Input, Divider, Button, Form, Icon } from 'semantic-ui-react'
4 |
5 | class CreateUserDialog extends Component {
6 | constructor(props) {
7 | super(props);
8 |
9 | this.state = {};
10 |
11 | this.handleChange = this.handleChange.bind(this);
12 | this.handleSubmit = this.handleSubmit.bind(this);
13 | }
14 |
15 | handleChange(event) {
16 | const state = {};
17 | state[event.target.name] = event.target.value;
18 | this.setState(state);
19 | }
20 |
21 | handleSubmit(event) {
22 | this.props.submission(this.state);
23 | event.preventDefault();
24 | }
25 |
26 | render() {
27 | return (
28 |
29 |
30 |
31 | {this.props.description}
32 |
33 |
34 |
35 |
36 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | )
51 | }
52 | }
53 |
54 | export default CreateUserDialog;
55 |
--------------------------------------------------------------------------------
/client/src/components/config/invites.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import axios from 'axios';
4 | import CopyToClipboard from 'react-copy-to-clipboard';
5 | import { Container, Button, Input } from 'semantic-ui-react'
6 |
7 | // A form for creation of invites
8 | class Invites extends Component {
9 | constructor() {
10 | super()
11 | this.generateInvite = this.generateInvite.bind(this);
12 | this.state = { inviteId: null };
13 | this.domain = window.location.href.match(/(https?:\/\/[^\/]*)/)[0];
14 | }
15 |
16 | async generateInvite() {
17 | const route = '/invites';
18 | try {
19 | const response = (await axios.post(route, {}));
20 | if (response.status === 200) this.setState({ inviteId: response.data });
21 | console.log(this.state.inviteId);
22 | }
23 | catch (err) {
24 | console.log(err);
25 | }
26 | }
27 |
28 | render() {
29 | const content = this.state.inviteId ? 'Copy Link' : 'Generate Invite Link';
30 | const onClick = this.state.inviteId ? console.log : this.generateInvite;
31 | const otherAttrs = {};
32 | const inviteUrl = `${this.domain}/invites/${this.state.inviteId}`;
33 | if (this.state.inviteId) otherAttrs.label = inviteUrl;
34 |
35 | const button = ;
41 |
42 |
43 | // If the invite has been generated, wrap the button in a CopyToClipboard
44 | return this.state.inviteId ? (
45 |
46 |
47 | {button}
48 |
49 |
50 | ) : (
51 |
52 | {button}
53 |
54 | );
55 | }
56 | }
57 |
58 | export default Invites;
59 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "liveapi",
3 | "version": "0.0.3",
4 | "description": "LiveAPI: A few-click-install, end-to-end scraping server",
5 | "scripts": {
6 | "prestart": "pgrep mongod || (mkdir -p db/data && mkdir -p db/logs && mongod --dbpath db/data --fork --logpath db/logs/mongodb.log)",
7 | "start": "nodemon server/server.js",
8 | "test": "echo \"Error: no test specified\" && exit 1",
9 | "build": "webpack --colors -w",
10 | "go": "./bin/start.sh",
11 | "update": "./bin/start.sh -f"
12 | },
13 | "repository": {
14 | "type": "git",
15 | "url": "git+https://github.com/Live-API/LAS.git"
16 | },
17 | "author": "LiveAPI",
18 | "license": "ISC",
19 | "bugs": {
20 | "url": "https://github.com/Live-API/LAS/issues"
21 | },
22 | "homepage": "https://github.com/Live-API/LAS#readme",
23 | "dependencies": {
24 | "axios": "^0.16.2",
25 | "bcryptjs": "^2.4.3",
26 | "body-parser": "^1.17.2",
27 | "commander": "^2.11.0",
28 | "cookie-parser": "^1.4.3",
29 | "crawler": "^1.0.5",
30 | "express": "^4.15.3",
31 | "mongodb": "^2.2.30",
32 | "mongoose": "^4.11.4",
33 | "path": "^0.12.7",
34 | "pug": "^2.0.0-rc.2",
35 | "react": "^15.6.1",
36 | "react-copy-to-clipboard": "^5.0.0",
37 | "react-dom": "^15.6.1",
38 | "semantic-ui-react": "^0.71.2",
39 | "webpack": "^3.4.1"
40 | },
41 | "devDependencies": {
42 | "babel-cli": "^6.24.1",
43 | "babel-core": "^6.25.0",
44 | "babel-loader": "^7.1.1",
45 | "babel-polyfill": "^6.23.0",
46 | "babel-preset-env": "^1.6.0",
47 | "babel-preset-es2015": "^6.24.1",
48 | "babel-preset-es2017": "^6.24.1",
49 | "babel-preset-react": "^6.24.1",
50 | "nodemon": "^1.11.0",
51 | "react-hot-loader": "^1.3.1",
52 | "webpack-dev-middleware": "^1.11.0"
53 | },
54 | "babel": {
55 | "plugins": [
56 | "transform-async-to-generator"
57 | ]
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LiveAPI
2 | Successful apps are built on data. As developers, we don’t always have access to the data that would help make our app successful. While the internet is a nearly-bottomless source of public data in the form of websites, that data is not always structured or available programmatically through an API. Time spent building an extraction algorithm and server is time not spent building your app.
3 |
4 | We’re developing LiveAPI, a developer tool to turn any website’s public data into an API in a few minutes. LiveAPI has two parts: a Chrome Extension to select data to extract and a user-hostable server that extracts data and serves up the user-created API endpoints.
5 |
The following three-part guide walks through how to get started with and use LiveAPI.
7 |
8 | * [Part 1: Installation](https://medium.com/@brett.beekley/using-liveapi-part-1-installation-ba1aa13bc73b)
9 | * [Part 2: Authentication](https://medium.com/@pennwu/liveapi-a-visual-data-extraction-tool-part-2-17a1d32b2d52)
10 | * [Part 3: Using the Chrome Extension](https://medium.com/@melissjs/liveapi-a-visual-data-extraction-tool-part-3-e9d60c9ab28d)
11 |
12 | ## Installation
13 | LiveAPI server can be installed in one shell command:
14 |
15 | `sudo curl -s https://raw.githubusercontent.com/live-API/LAPI-Server/master/bin/pull.sh | bash -s`
16 |
This command pulls a shell script (`./bin/pull.sh` of this repository), which installs git and clones the latest version of the LiveAPI master into a `./las` directory. This script then executes another shell script (`./bin/start.sh`), which installs the other prerequisites and starts the server. This method is currently supported on Mac OS, Ubuntu and Amazon Linux.
18 |
19 | LiveAPI server can be also installed manually using the following steps.
20 |
21 | 1. Install git, NodeJS and MongoDB
22 | 2. Clone this repository
23 | 3. Run `npm install` in the cloned folder
24 | 4. Run `npm run update`
25 | 5. To start the server in the future, run `npm start`
26 |
--------------------------------------------------------------------------------
/server/crawler/crawlerController.js:
--------------------------------------------------------------------------------
const Crawler = require('./crawlerModel.js');
const Interval = require('./intervalModel.js');
const NodeCrawler = require('./crawler.js');

// Live interval handles keyed by endpoint name, so a scrape loop can be
// replaced (or, later, paused/terminated).
const intervals = {};

const crawlerController = {
  // GET middleware: respond with every cached scrape document for :endpoint.
  getCache: async (req, res, next) => {
    try {
      const endpoint = req.params.endpoint;
      // Send documents back as a JSON array
      res.json(await Crawler.find({ endpoint }));
    } catch (err) {
      console.log(err);
    }
    next();
  },

  // Sets up a scrape to run on an interval.
  // Expects req.body = { url, endpoint, interval (in seconds) }.
  startScrapeInterval: async (req, res) => {
    const url = req.body.url;
    const endpoint = req.body.endpoint;
    const interval = req.body.interval * 1000; // seconds -> milliseconds

    // If the endpoint already has an interval, stop it before replacing it
    if (intervals[endpoint]) clearInterval(intervals[endpoint]);

    // Create a new interval
    intervals[endpoint] = setInterval(
      () => NodeCrawler(url, endpoint),
      interval
    );

    // Persist the interval definition (upsert) so it survives restarts.
    // NOTE: `interval` is stored in milliseconds here.
    try { await Interval.update({ endpoint }, { endpoint, url, interval }, { upsert : true }); }
    catch (err) { console.log(err); }
  },

  // Recreates the in-memory intervals from the Intervals collection, e.g.
  // when the server restarts.
  // BUG FIX: this previously called
  // `startScrapeInterval(endpoint.endpoint, endpoint.interval)`, but that
  // function expects an Express-style request and reads req.body, so every
  // restart attempt crashed (and the url was never passed). Build a
  // request-shaped object instead, converting the stored millisecond interval
  // back to the seconds the function expects.
  restartIntervals: async function () {
    // Get all persisted interval definitions
    const endpointsToRestart = await Interval.find({});

    // Restart each endpoint's scrape loop
    endpointsToRestart.forEach(doc => this.startScrapeInterval({
      body: {
        url: doc.url,
        endpoint: doc.endpoint,
        interval: doc.interval / 1000,
      },
    }));
  }
}

module.exports = crawlerController;
/server/user/userController.js:
--------------------------------------------------------------------------------
const User = require('./userModel');
const bcrypt = require('bcryptjs');

const userController = {};

/**
 * createUser - create a new User model and save it to the database.
 *
 * BUG FIX: the created user's id was published only as `res.locals.userid`
 * (lowercase d), while downstream middleware (sessionController.startSession)
 * reads `res.locals.userId` — so sessions started with an undefined user.
 * Set both casings: camelCase is canonical, the legacy key is kept for any
 * existing readers.
 */
userController.createUser = async (req, res, next) => {
  try {
    // If there is a user to create
    if (res.locals.newUser) {
      const user = new User(res.locals.newUser);

      // bcrypt hashing happens in the pre-save hook in userModel
      const id = (await user.save())._id;
      res.locals.userId = id;
      res.locals.userid = id; // legacy casing, kept for compatibility
    }
    next();
  } catch (err) {
    // Send an error message
    console.log('Failure to create user', err);
    res.status(500).send('Failed to create user');
  }
};

/**
 * verifyUser - authenticate req.body.{username,password} against the stored
 * bcrypt hash; on success expose the user's id as res.locals.userId.
 */
userController.verifyUser = (req, res, next) => {
  // Query db for username in req body
  User.findOne({ username: req.body.username }, (err, user) => {
    // If it exists and the password matches, continue
    if (user && bcrypt.compareSync(req.body.password, user.password)) {
      res.locals.userId = user._id;
      next();
    }
    // Else send an error message
    else {
      res.status(401);
      res.send('Incorrect username or password');
    }
  });
};

/**
 * checkFirstUser - if no admin exists yet, stage the request body as the
 * first admin user (consumed by createUser); otherwise mark that there is
 * nothing to create.
 * NOTE(review): when an admin already exists this sets newUser = false and
 * continues, so createUser silently creates nothing — confirm downstream
 * middleware tolerates a missing userId in that case.
 */
userController.checkFirstUser = async (req, res, next) => {
  // If an admin already exists, there is no first user to create
  if (await User.findOne({ admin: true }) !== null) {
    res.locals.newUser = false;
    next();
  }
  else {
    res.locals.newUser = {
      username: req.body.username,
      password: req.body.password,
      admin: true,
    }
    next();
  }
}

module.exports = userController;
70 |
--------------------------------------------------------------------------------
/client/src/components/config/app.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import { render } from 'react-dom';
3 | import { Grid } from 'semantic-ui-react';
4 | import axios from 'axios';
5 | import CreateUserDialog from './createUser.jsx';
6 | import InfoDialog from './dialog.jsx';
7 | import Dashboard from './miniDashboard.jsx';
8 |
// Root component of the /config page. Renders one of four views —
// admin signup, invited-user signup, login, or dashboard — driven by
// this.state.status, whose initial value is injected by the server
// (see client/views/config.pug: `const status='#{status}'` passed in as a prop).
class App extends Component {
  constructor(props) {
    super(props);
    // status: 'createAdmin' | 'createUser' | 'login' | 'dashboard'
    this.state = {
      status: this.props.status
    }
    // Bind once so the handlers can be passed down as props.
    this.createAdmin = this.createAdmin.bind(this);
    this.createUser = this.createUser.bind(this);
    this.authenticate = this.authenticate.bind(this);
  }

  // POSTs to /config/admin to create the initial admin user; on success,
  // switches to the dashboard view with a confirmation message.
  async createAdmin(data) {
    const route = '/config/admin';
    try {
      const status = (await axios.post(route, data)).data.status;
      // NOTE(review): a non-'OK' status (e.g. "Admin already exists") is
      // silently ignored — the user gets no feedback. Consider setting an
      // error in state, as authenticate() does.
      if (status === 'OK') this.setState({
        status: 'dashboard',
        message: 'Account successfully created'
      });
    }
    catch (err) {
      console.log(err);
    }
  }

  // POSTs to /users to create an invited user. The invite ID is taken from
  // the current URL (last non-empty path segment of /invites/:inviteId).
  async createUser(data) {
    // Extract invite ID from URL
    data.inviteId = window.location.href.match(/[^\/]+(?=\/$|$)/)[0];
    const route = '/users';
    try {
      const status = (await axios.post(route, data)).data.status;
      // NOTE(review): non-'OK' statuses are silently ignored here too.
      if (status === 'OK') this.setState({
        status: 'dashboard',
        message: 'Account successfully created'
      });
    }
    catch (err) {
      console.log(err);
    }
  }

  // POSTs credentials to /auth. HTTP 200 switches to the dashboard;
  // any other success status shows an inline error.
  // NOTE(review): a failed login typically rejects with a non-2xx status and
  // lands in the catch block, so the `else` branch may be unreachable — the
  // error message should probably be set in the catch as well; confirm
  // against the server's /auth handler.
  async authenticate(data) {
    const route = '/auth';
    try {
      const response = (await axios.post(route, data));
      if (response.status === 200) this.setState({
        status: 'dashboard',
        message: 'Successfully logged in'
      });
      else this.setState({error: 'Incorrect Username or Password'});
    }
    catch (err) {
      console.log(err);
    }
  }

  // Picks the view matching this.state.status and renders it centered in a
  // full-height grid.
  // NOTE(review): the JSX elements below (`content = ;` and the empty lines
  // inside `return (...)`) appear to have been stripped when this file was
  // exported — presumably <Dialog/>, <CreateUser/>, and <Dashboard/> per the
  // imports at the top of the file. Restore them from version control.
  render() {

    // Is this the first time to this page?
    let content;
    // Display user creation dialog
    if (this.state.status === 'createAdmin')
      content = ;
    // Display user creation dialog
    else if (this.state.status === 'createUser')
      content = ;
    // Display the info dialog
    else if (this.state.status === 'login')
      content = ;
    else if (this.state.status === 'dashboard')
      content = ;
    // So grid elements are centered on entire page
    const gridStyle = { height: '100%', margin: 0}
    return (

      {content}

    )
  }
}

export default App;
94 |
--------------------------------------------------------------------------------
/bin/start.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# The purpose of this script is to install the dependencies of LAS from npm, then start the server
# BUG FIX: the shebang moved to line 1 — it is ignored anywhere else in the
# file, so the script previously ran under the caller's default shell.

# If given a -f argument to force (re)install
FORCE_REINSTALL="false"
# BUG FIX: -f is a bare flag. The old optstring ":f:" declared it as requiring
# an argument, so the flag was only ever set via the missing-argument `:` case
# and `-f value` silently did nothing. Match the option itself instead.
while getopts ":f" opt; do
  case $opt in
    f)
      echo "Force installing..."
      FORCE_REINSTALL="true"
      ;;
  esac
done

# If no .LAS_status file, or force reinstall is selected
if [ "$FORCE_REINSTALL" == "true" ] || [ ! -f .LAS_status ]; then

  # Install NodeJS
  # Checks for and uses brew or apt-get. Else may use https://gist.github.com/isaacs/579814

  # Mac OS
  if hash brew 2>/dev/null; then
    echo "Installing Node with homebrew"
    brew install node
  else
    # Ubuntu/Debian
    if hash apt-get 2>/dev/null; then
      echo "Installing Node with apt-get"
      curl -sL https://deb.nodesource.com/setup_8.x | sudo -E bash -
      sudo apt-get install -y nodejs
    else
      # Enterprise Linux (e.g. Amazon Linux)
      if hash yum 2>/dev/null; then
        echo "Installing Node with yum"
        curl --silent --location https://rpm.nodesource.com/setup_8.x | sudo bash -
        sudo yum -y install nodejs
      fi
    fi
  fi

  # Install MongoDB

  # Mac OS
  if hash brew 2>/dev/null; then
    echo "Installing MongoDB with homebrew"
    brew install mongodb
  else
    # Ubuntu/Debian
    if hash apt-get 2>/dev/null; then
      echo "Installing MongoDB with apt-get"
      sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv EA312927
      echo "deb http://repo.mongodb.org/apt/ubuntu "$(lsb_release -sc)"/mongodb-org/3.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.2.list
      sudo apt-get install -y mongodb-org
    else
      # Enterprise Linux (e.g. Amazon Linux)
      if hash yum 2>/dev/null; then
        echo "Installing MongoDB with yum"
        echo "[mongodb-org-3.2]
name=MongoDB Repository
baseurl=https://repo.mongodb.org/yum/amazon/2013.03/mongodb-org/3.2/x86_64/
gpgcheck=1
enabled=1
gpgkey=https://www.mongodb.org/static/pgp/server-3.2.asc" |
        sudo tee -a /etc/yum.repos.d/mongodb-org-3.2.repo
        sudo yum install -y mongodb-org
      fi
    fi
  fi

  # Install npm dependencies
  echo Installing dependencies from npm
  npm install
  npm install webpack -g

  # Add/update hidden status file
  touch .LAS_status
  echo "Latest Update:" > .LAS_status
  # BUG FIX: append (>>) the date — a second `>` overwrote the header line
  # written just above, leaving only the date in the file.
  echo `date` >> .LAS_status

  # Build bundles
  echo Bundling React components
  webpack

  # Create SSL Cert
  echo Generating SSL certificate
  mkdir ssl

  # Change to your company details
  country=US
  state=CA
  locality=LosAngeles
  organization=LiveAPI
  organizationalunit=IT
  # BUG FIX: $commonname was referenced in the -subj string below but never
  # defined, producing an empty CN. Default to localhost for a dev cert.
  commonname=localhost
  email=test@test.com

  openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem -days 365 -nodes -subj "/C=$country/ST=$state/L=$locality/O=$organization/OU=$organizationalunit/CN=$commonname/emailAddress=$email"


  # Set up forwarding to port 4000
  #sudo iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-ports 4000
fi

# Start server
echo Starting LiveAPI Server
npm start

exit
--------------------------------------------------------------------------------
/server/crawler/crawler.js:
--------------------------------------------------------------------------------
1 | const NodeCrawler = require('crawler');
2 | const Crawler = require('./crawlerModel.js');
3 | const EndpointModel = require('./../endpoint/endpointModel.js');
4 |
5 | /* Crawler calls 'callback' function expression, scraping website (c.queue)
6 | for Cheerio selectors. Document is created using data array, and saved to the Crawler model. */
7 |
8 | // *** represents items that will be provided by the config file
9 |
10 | /*
11 |
12 | Passing data to the Web Crawler:
13 | {
14 | Text: [ ]
15 | Images: [ ]
16 | BackgroundImages: [ ]
17 | Pagination: Key
18 | }
19 |
20 | Our callback function is divided into different sections:
21 |
22 | Extracting Text for DOM Elements
23 | Pagination
24 | Images - TBD
25 | BackgroundImages - TBD
26 |
27 | Accessing properties would result in clean data flow
28 |
29 | */
30 |
31 | module.exports = (url, endpoint) => {
32 | let data = [];
33 | console.log('url', url);
34 | console.log('endpoint', endpoint);
35 | // look up text within the endpoint model
36 |
37 | const c = new NodeCrawler({
38 | maxConnections: 10,
39 | callback: (error, res, done) => {
40 | console.log(`Processing page scrape for ${endpoint}`);
41 | if (error) console.log(error);
42 | else {
43 | const $ = res.$;
44 | async function extractData() {
45 | try {
46 | let text = await EndpointModel.find({ endpoint });
47 | // console.log('text', text);
48 | // console.log('text[0]', text[0]);
49 | // console.log('text[0].text', text[0].text);
50 | text = text[0].text;
51 | // *** Selectors for DOM elements
52 | // Text only
53 | // JSON.stringify => added the await section
54 | // Iterate through properties
55 | let properties = Object.keys(text);
56 | let domArr;
57 | await properties.forEach((property) => {
58 | domArr = text[property];
59 | domArr.forEach((element) => {
60 | // console.log('element', element);
61 | console.log('$(element)', $(element));
62 | data.push($(element).text());
63 | });
64 | });
65 | // Support for Pagination
66 | // let href = $('[aria-label="Next page"]').attr('href');
67 | let href = false;
68 | // If href exists, scrape next page
69 | if (href) c.queue('https:' + href);
70 | // At end of pagination, add/update document
71 | else {
72 | let scrapedData = new Crawler({
73 | // *** Endpoint Name
74 | "endpoint": endpoint,
75 | data: JSON.stringify(data)
76 | })
77 | // Replace existing data property if the document exists
78 | const cachedData = (await Crawler.find({ "endpoint": endpoint }));
79 | if (cachedData.length > 0) {
80 | await Crawler.update({
81 | "endpoint": endpoint
82 | },
83 | {
84 | $set: {
85 | data: JSON.stringify(data),
86 | scrape_date: Date.now(),
87 | }
88 | });
89 | }
90 |
91 | // Create a new document if scraping for the first time
92 | else {
93 | await scrapedData.save();
94 | }
95 | }
96 | } catch (err) {
97 | console.log(err);
98 | }
99 | }
100 | extractData();
101 | console.log('datalength:', data.length);
102 | };
103 | done();
104 | }
105 | });
106 |
107 | // *** url from config file is passed to queue
108 | c.queue(url);
109 | }
110 |
--------------------------------------------------------------------------------
/server/server.js:
--------------------------------------------------------------------------------
const express = require('express');
const https = require('https');
const http = require('http');
const fs = require('fs');
const pug = require('pug');
const path = require('path');
const bodyParser = require('body-parser');
const cookieParser = require('cookie-parser');
const userController = require('./user/userController.js');
const crawlerController = require('./crawler/crawlerController.js');
const endpointController = require('./endpoint/endpointController.js');
const sessionController = require('./session/sessionController.js');
const inviteController = require('./invite/inviteController.js');

const app = express();

// Pug views live in client/views; /invites/static mirrors /static so the
// signup page served from /invites/:inviteId can resolve relative bundle URLs.
app.set('view engine', 'pug');
app.set('views', path.join(__dirname, '../client/views'));
app.use('/static', express.static(path.join(__dirname, '../client/public')));
app.use('/invites/static', express.static(path.join(__dirname, '../client/public')));
app.use(bodyParser.json());
app.use(cookieParser());

// ----------------------
// Home
// ----------------------

// Testing this index.html page to get the DOM pointer to work
app.get('/static', (req, res) => {
  // FIX(idiom): pass segments to path.join instead of concatenating a single
  // string into it (same resolved path, correct API usage).
  res.sendFile(path.join(__dirname, '../index.html'));
});


// ----------------------
// Authentication
// ----------------------

// Temporary authentication route
app.post('/auth',
  userController.verifyUser,
  sessionController.startSession,
  (req, res) => {
    res.status(200).send('Authenticated!');
  });

// Renders the config page: admin-creation dialog on the very first visit
// (no users exist yet), otherwise the login prompt.
app.get('/config',
  userController.checkFirstUser,
  (req, res) => {
    // BUG FIX: the original if/else rendered the identical ternary in both
    // branches; collapsed to the single expression with the intended meaning.
    res.render('config', { status: res.locals.newUser ? 'createAdmin' : 'login' });
  }
);

/*

For our route, we would define different endpoints, depending on the website we are looking to scrape
Brett mentioned how we would eventually support different versions of configuration

*/

// Creates the initial admin user; refuses if any user already exists.
app.post('/config/admin',
  userController.checkFirstUser,
  userController.createUser,
  (req, res) => {
    res.send({ status: (res.locals.userid ? 'OK' : 'Admin already exists') });
  }
);

// Creates and responds with a new invite id, if the request is authenticated successfully
app.post('/invites',
  sessionController.isLoggedIn,
  inviteController.createInvite,
  (req, res) => {
    res.send(res.locals.invite.id);
  }
);

// Renders a signup page for valid invite
app.get('/invites/:inviteId',
  (req, res, next) => { res.locals.inviteId = req.params.inviteId; next() },
  inviteController.verifyInvite,
  (req, res) => (res.render('config', { status: 'createUser' }))
);

// Creates a user if given a valid invite ID
app.post('/users',
  (req, res, next) => { res.locals.inviteId = req.body.inviteId; next() },
  inviteController.verifyInvite,
  (req, res, next) => { res.locals.newUser = { username: req.body.username, password: req.body.password }; next() },
  userController.createUser,
  inviteController.redeemInvite,
  (req, res) => {
    res.send({ status: (res.locals.userid ? 'OK' : 'Something went wrong') });
  }
);

// ----------------------
// Crawl endpoints
// ----------------------

// Public read of cached crawl data; writes require a session.
app.get('/crawls/:endpoint', crawlerController.getCache);

app.post('/crawls',
  sessionController.isLoggedIn,
  endpointController.setEndpoint,
  crawlerController.startScrapeInterval
);


// ----------------------
// HTTPS Config
// ----------------------

// Self-signed dev cert generated by bin/start.sh; HTTP and HTTPS are served
// from the same Express app on separate ports.
const privateKey = fs.readFileSync('ssl/key.pem', 'utf8');
const certificate = fs.readFileSync('ssl/cert.pem', 'utf8');
const credentials = { key: privateKey, cert: certificate };
const httpServer = http.createServer(app);
const httpsServer = https.createServer(credentials, app);
const HTTP_PORT = 4000;
const HTTPS_PORT = 4443;

httpServer.listen(HTTP_PORT, () => console.log(`HTTP on port ${HTTP_PORT}`));
httpsServer.listen(HTTPS_PORT, () => console.log(`HTTPS on port ${HTTPS_PORT}`));

//app.listen(PORT, () => {
//  console.log(`App is listening on Port ${PORT}`);
//});

// Demo interval scrape
//crawlerController.startScrapeInterval('pizza', 10000);
// crawlerController.restartIntervals();
135 |
--------------------------------------------------------------------------------