├── .gitignore ├── hooks └── build ├── index.yaml ├── Dockerfile ├── src ├── fetch-tweets.js ├── server.js ├── plainApi.mjs ├── twtxt-utils │ └── TwtxtTxt.mjs ├── swagger.json └── Storage.mjs ├── package.json └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | -------------------------------------------------------------------------------- /hooks/build: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker build --build-arg APP_VERSION=$SOURCE_BRANCH -f $DOCKERFILE_PATH -t $IMAGE_NAME . 3 | -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | - kind: tweets 3 | properties: 4 | - name: mentions 5 | - name: timestamp 6 | direction: desc 7 | - kind: tweets 8 | properties: 9 | - name: stems 10 | - name: timestamp 11 | direction: desc 12 | - kind: tweets 13 | properties: 14 | - name: hashTags 15 | - name: timestamp 16 | direction: desc 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 node:16.19.1-alpine3.17 2 | RUN npm install -g npm 3 | RUN mkdir -p /usr/src/app/ 4 | RUN chown node:node /usr/src/app 5 | ADD --chown=node:node package*.json /usr/src/app/ 6 | ADD --chown=node:node src /usr/src/app/src 7 | WORKDIR /usr/src/app 8 | USER node 9 | RUN npm install 10 | ARG APP_VERSION 11 | ENV APP_VERSION $APP_VERSION 12 | ENV NODE_ENV "production" 13 | ENV PORT 8080 14 | CMD npm start 15 | 16 | -------------------------------------------------------------------------------- /src/fetch-tweets.js: -------------------------------------------------------------------------------- 1 | import Storage from './Storage.mjs'; 2 | import dotenv from 'dotenv'; 3 | 
import express from 'express'; 4 | import {Datastore} from "@google-cloud/datastore"; 5 | import http from "http"; 6 | import fs from "fs"; 7 | import plainApi from './plainApi.mjs'; 8 | 9 | dotenv.config(); 10 | 11 | process.on('unhandledRejection', (reason, promise) => { 12 | throw reason; 13 | }); 14 | 15 | var storage = new Storage( 16 | new Datastore() 17 | ); 18 | 19 | const updateInterval = parseInt(process.env.UPDATING_INTERVAL || "900", 10); 20 | 21 | (async () => { 22 | await storage.executeUpdate(updateInterval); 23 | process.exit(0); 24 | })(); 25 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "twtxt-registry", 3 | "description": "", 4 | "main": "src/server.js", 5 | "type": "module", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "node src/server.js", 9 | "fetch-tweets": "node src/fetch-tweets.js" 10 | }, 11 | "repository": { 12 | "type": "git", 13 | "url": "git+https://github.com/DracoBlue/twtxt-registry.git" 14 | }, 15 | "dependencies": { 16 | "@google-cloud/datastore": "^7.3.2", 17 | "dotenv": "^16.0.3", 18 | "express": "^4.18.2", 19 | "md5": "2.0.0", 20 | "moment": "^2.29.4", 21 | "robots": "0.9.4", 22 | "stemmer": "^2.0.1", 23 | "swagger-ui-dist": "^4.16.1" 24 | }, 25 | "author": "DracoBlue ", 26 | "license": "MIT", 27 | "bugs": { 28 | "url": "https://github.com/DracoBlue/twtxt-registry/issues" 29 | }, 30 | "homepage": "https://github.com/DracoBlue/twtxt-registry#readme" 31 | } 32 | -------------------------------------------------------------------------------- /src/server.js: -------------------------------------------------------------------------------- 1 | import dotenv from 'dotenv'; 2 | dotenv.config(); 3 | 4 | import Storage from './Storage.mjs'; 5 | import express from 'express'; 6 | import {Datastore} from "@google-cloud/datastore"; 7 | import http 
from "http"; 8 | import fs from "fs"; 9 | import plainApi from './plainApi.mjs'; 10 | 11 | 12 | 13 | // Create an express instance and set a port variable 14 | var app = express(); 15 | var port = process.env.PORT || 8080; 16 | 17 | // Disable etag headers on responses 18 | app.disable('etag'); 19 | 20 | var storage = new Storage( 21 | new Datastore() 22 | ); 23 | 24 | app.use('/api/plain/', plainApi(storage)); 25 | 26 | app.get('/', function (req, res) { 27 | var response = [ 28 | "", 29 | "", 30 | "", 31 | "twtxt registry", 32 | "", 33 | "", 34 | "

Twtxt Registry

", 35 | '

This is a hosted registry for https://github.com/buckket/twtxt. The registry software is developed by dracoblue and you may find the source code at https://github.com/DracoBlue/twtxt-registry.

', 36 | '

The api doc can be found at /swagger-ui/.

', 37 | "", 38 | "" 39 | ]; 40 | res.set('Content-Type', 'text/html'); 41 | res.send(response.join("\n")); 42 | }); 43 | 44 | var renderSwaggerInitializerJson = function(req, res) { 45 | var response = fs.readFileSync( './node_modules/swagger-ui-dist/swagger-initializer.js').toString(); 46 | response = response.replace("https://petstore.swagger.io/v2/swagger.json", "/api/swagger.json"); 47 | 48 | res.set('Content-Type', 'application/json'); 49 | res.send(response); 50 | }; 51 | var renderSwaggerHtml = function(req, res) { 52 | var response = fs.readFileSync( './node_modules/swagger-ui-dist/index.html').toString(); 53 | res.set('Content-Type', 'text/html'); 54 | res.send(response); 55 | }; 56 | 57 | app.get("/swagger-ui/swagger-initializer.js", renderSwaggerInitializerJson); 58 | app.get("/swagger-ui/index.html", renderSwaggerHtml); 59 | app.get("/swagger-ui/", renderSwaggerHtml); 60 | 61 | app.get("/api/swagger.json", function(req, res) { 62 | res.set('Content-Type', 'application/json'); 63 | var response = JSON.parse(fs.readFileSync( './src/swagger.json').toString()); 64 | var info = JSON.parse(fs.readFileSync('./package.json')); 65 | response.info.version = info.version || 'dev'; 66 | res.send(JSON.stringify(response)); 67 | }); 68 | 69 | // Set /public as our static content dir 70 | app.use("/swagger-ui/", express.static("./node_modules/swagger-ui-dist/")); 71 | 72 | var server = http.createServer(app).listen(port, function() { 73 | console.log('twtxt registry listening on port ' + port); 74 | if (process.env.START_UPDATING) { 75 | const updateInterval = parseInt(process.env.UPDATING_INTERVAL || "900", 10); 76 | storage.startUpdating(updateInterval); 77 | } 78 | }); 79 | 80 | storage.addUser("https://buckket.org/twtxt.txt", "buckket", function() { 81 | }); 82 | 83 | storage.addUser("https://buckket.org/twtxt_news.txt", "twtxt_news", function() { 84 | }); 85 | 86 | storage.addUser("https://dracoblue.net/twtxt.txt", "dracoblue", function() { 87 | }); 88 | 
-------------------------------------------------------------------------------- /src/plainApi.mjs: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import url from "url"; 3 | 4 | var plainApi = function(storage) { 5 | var api = express.Router(); 6 | 7 | var renderAuthorForTweet = function(tweet) { 8 | if (tweet.author_nickname) { 9 | return tweet.author_nickname + "\t" + tweet.author_url; 10 | } 11 | 12 | return tweet.author_url + "\t" + tweet.author_url; 13 | }; 14 | 15 | api.use(function (req, res, next) { 16 | res.set('Content-Type', 'text/plain'); 17 | next(); 18 | }); 19 | 20 | api.get('/tags/:tag', function (req, res) { 21 | if (!req.params.tag) { 22 | res.status(400); 23 | res.end(); 24 | return; 25 | } 26 | 27 | var page = parseInt(req.query.page || 1, 10) || 1; 28 | 29 | storage.getTweetsByHashTag("#" + req.params.tag, page, function (tweets) { 30 | var response = []; 31 | 32 | tweets.forEach(function (tweet) { 33 | response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text); 34 | }); 35 | res.send(response.join("\n")); 36 | }) 37 | }); 38 | 39 | api.get('/tweets', function (req, res) { 40 | var page = parseInt(req.query.page || 1, 10) || 1; 41 | 42 | storage.searchTweets(req.query.q || '', page, function (tweets) { 43 | var response = []; 44 | 45 | tweets.forEach(function (tweet) { 46 | response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text); 47 | }); 48 | res.send(response.join("\n")); 49 | }) 50 | }); 51 | 52 | api.get('/mentions', function (req, res) { 53 | if (!req.query.url) { 54 | res.status(400); 55 | res.send("`url` must be provided."); 56 | res.end(); 57 | return; 58 | } 59 | 60 | var page = parseInt(req.query.page || 1, 10) || 1; 61 | 62 | storage.getTweetsByMentions(req.query.url, page, function (tweets) { 63 | var response = []; 64 | 65 | tweets.forEach(function (tweet) { 66 | 
response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text); 67 | }); 68 | res.send(response.join("\n")); 69 | }) 70 | }); 71 | 72 | api.post('/users', function (req, res) { 73 | if (!req.query.url || !req.query.nickname) { 74 | res.status(400); 75 | res.send("`nickname` and `url` must be provided."); 76 | return; 77 | } 78 | 79 | if (!req.query.nickname.match(/^[A-Za-z0-9_-]+$/)) { 80 | res.status(400); 81 | res.send("`nickname` must match ^[A-Za-z0-9_-]+$"); 82 | return ; 83 | } 84 | 85 | var urlParts = url.parse(req.query.url); 86 | if (!urlParts['hostname'] || !urlParts['protocol'] || (urlParts['protocol'] != 'https:' && urlParts['protocol'] != 'http:') ) { 87 | res.status(400); 88 | res.send("`url` must provide hostname and either protocol as http or https!"); 89 | return ; 90 | } 91 | 92 | storage.addUser(req.query.url, req.query.nickname, function () { 93 | res.send("OK"); 94 | }); 95 | }); 96 | 97 | api.get('/users', function (req, res) { 98 | var page = parseInt(req.query.page || 1, 10) || 1; 99 | 100 | storage.searchUsers(req.query.q || '', page, function (users) { 101 | var response = []; 102 | 103 | users.forEach(function (user) { 104 | response.push(user.nickname + "\t" + user.url + "\t" + user.timestamp); 105 | }); 106 | res.send(response.join("\n")); 107 | }) 108 | }); 109 | 110 | return api; 111 | }; 112 | 113 | export default plainApi; -------------------------------------------------------------------------------- /src/twtxt-utils/TwtxtTxt.mjs: -------------------------------------------------------------------------------- 1 | import moment from 'moment'; 2 | import urlUtils from "url"; 3 | import md5 from 'md5'; 4 | 5 | var TwtxtTxt = function(url, nickname, body) { 6 | this.body = body; 7 | this.url = url; 8 | this.nickname = nickname; 9 | 10 | this.setTweetsByBody(this.body); 11 | }; 12 | 13 | TwtxtTxt.prototype.getTweets = function() { 14 | return this.tweets; 15 | }; 16 | 17 | TwtxtTxt.prototype.setTweetsByBody = 
function(body) { 18 | var that = this; 19 | this.tweets = []; 20 | 21 | body.split("\n").forEach(function(row) { 22 | row = (row || "").trim(); 23 | 24 | if (row) { 25 | var match = row.match(/^([^\t]+)\t(.+)/); 26 | 27 | if (match && moment(match[1]).isValid()) { 28 | 29 | var text = match[2].trim(); 30 | var body = text.toString(); 31 | 32 | var hashTags = that.extractHashTagsByRow(text); 33 | var mentions = that.extractMentionsByRow(text); 34 | 35 | if (body) { 36 | var currentMatch = body.match(/@<([^ ]+) ([^> ]+)>/); 37 | while (currentMatch) { 38 | body = body.replace(currentMatch[0], '@' + that.encodeXml(currentMatch[1]) + ''); 39 | currentMatch = body.match(/@<([^ ]+) ([^> ]+)>/); 40 | } 41 | 42 | currentMatch = body.match(/@<([^> ]+)>/); 43 | while (currentMatch) { 44 | body = body.replace(currentMatch[0], '@' + that.encodeXml(urlUtils.parse(currentMatch[1])['hostname'] || currentMatch[1]) + ''); 45 | currentMatch = body.match(/@<([^> ]+)>/); 46 | } 47 | } 48 | 49 | 50 | that.tweets.push({ 51 | id: md5(that.url + "\t" + row), 52 | timestamp: moment(match[1]).toISOString(), 53 | hashTags: hashTags, 54 | mentions: mentions, 55 | author_url: that.url, 56 | author_nickname: that.nickname, 57 | body: body, 58 | text: text 59 | }); 60 | } 61 | } 62 | }); 63 | }; 64 | 65 | 66 | var xml_special_to_escaped_one_map = { 67 | '&': '&', 68 | '"': '"', 69 | '<': '<', 70 | '>': '>' 71 | }; 72 | 73 | var escaped_one_to_xml_special_map = { 74 | '&': '&', 75 | '"': '"', 76 | '<': '<', 77 | '>': '>' 78 | }; 79 | 80 | TwtxtTxt.prototype.encodeXml = function(string) { 81 | return string.replace(/([\&"<>])/g, function(str, item) { 82 | return xml_special_to_escaped_one_map[item]; 83 | }); 84 | }; 85 | 86 | TwtxtTxt.prototype.decodeXml = function(string) { 87 | return string.replace(/("|<|>|&)/g, 88 | function(str, item) { 89 | return escaped_one_to_xml_special_map[item]; 90 | }); 91 | }; 92 | 93 | TwtxtTxt.prototype.extractHashTagsByRow = function(string) { 94 | return 
string.match(/(#[^\s#<>'"]+)/g) || []; 95 | }; 96 | 97 | 98 | TwtxtTxt.prototype.extractMentionsByRow = function(body) { 99 | var mentions = []; 100 | var currentMatch = body.match(/@<([^ ]+ [^> ]+)>/g); 101 | if (currentMatch) { 102 | currentMatch.forEach(function(mention) { 103 | var rowMatch = mention.match(/@<([^ ]+) ([^> ]+)>/); 104 | mentions.push(rowMatch[2]); 105 | }) 106 | } 107 | 108 | currentMatch = body.match(/@<([^> ]+)>/g); 109 | if (currentMatch) { 110 | currentMatch.forEach(function(mention) { 111 | var rowMatch = mention.match(/@<([^> ]+)>/); 112 | mentions.push(rowMatch[1]); 113 | }) 114 | } 115 | 116 | return mentions; 117 | }; 118 | 119 | export default TwtxtTxt; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twtxt Registry Server 2 | 3 | A small registry server for twtxt, which allows to query for mentions and hash tags. 4 | 5 | ## Prerequisites 6 | 7 | * [gcp firestore in datastore mode](https://cloud.google.com/datastore/docs) 8 | * [npm](https://nodejs.org) installed 9 | 10 | ## Installation 11 | 12 | Set up the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to a user (or use gcloud auth login) for a gcp datastore project. 13 | Replace `$GOOGLE_PROJECT_NAME` with your google project name. 14 | 15 | ``` console 16 | $ npm install 17 | $ export PORT=8080 18 | $ export START_UPDATING=1 19 | $ export UPDATING_INTERVAL=900 20 | $ gcloud datastore indexes create --project $GOOGLE_PROJECT_NAME index.yaml 21 | Configurations to update: 22 | 23 | descriptor: [index.yaml] 24 | type: [datastore indexes] 25 | target project: [your-project-name] 26 | 27 | 28 | Do you want to continue (Y/n)? y 29 | 30 | ....done. 31 | $ node src/server.js 32 | ``` 33 | 34 | ## Example API calls for the Plain-Text-Api 35 | 36 | You can see a demo running at and a swagger-ui api doc at . 
37 | 38 | Add a new Twtxt User to the Registry: 39 | 40 | ``` console 41 | $ curl -X POST 'http://localhost:8080/api/plain/users?url=https://dracoblue.net/twtxt.txt&nickname=dracoblue' 42 | OK 43 | ``` 44 | 45 | See latest tweets in the Registry (e.g. ): 46 | 47 | ``` console 48 | $ curl 'http://localhost:8080/api/plain/tweets' 49 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-06T21:32:02.000Z @erlehmann is messing with timestamps in @buckket #twtxt :) 50 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-06T12:14:18.000Z Simple nodejs script to convert your twitter timeline to twtxt: https://t.co/txnWsC5jvA ( find my #twtxt at https://t.co/uN1KDXwJ8B ) 51 | ``` 52 | 53 | Search for tweets in the Registry (e.g. ): 54 | 55 | ``` console 56 | $ curl 'http://localhost:8080/api/plain/tweets?q=twtxt' 57 | buckket https://buckket.org/twtxt.txt 2016-02-09T12:42:26.000Z Do we need an IRC channel for twtxt? 58 | buckket https://buckket.org/twtxt.txt 2016-02-09T12:42:12.000Z Good Morning, twtxt-world! 59 | ``` 60 | 61 | Retrieve a list of all mentions of a specific twtxt User like `https://buckket.org/twtxt.txt` (e.g. ): 62 | 63 | ``` console 64 | $ curl 'http://localhost:8080/api/plain/mentions?url=https://buckket.org/twtxt.txt' 65 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-09T12:57:59.000Z @<buckket https://buckket.org/twtxt.txt> something like https://gitter.im/ or a freenode channel? 66 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-08T22:51:47.000Z @<buckket https://buckket.org/twtxt.txt> looks nice ;) 67 | ``` 68 | 69 | Retrieve a list of all tweets with a specific tag like `#twtxt` (e.g. 
): 70 | 71 | ``` console 72 | $ curl 'http://localhost:8080/api/plain/tags/twtxt' 73 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-06T21:32:02.000Z @erlehmann is messing with timestamps in @buckket #twtxt :) 74 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-06T12:14:18.000Z Simple nodejs script to convert your twitter timeline to twtxt: https://t.co/txnWsC5jvA ( find my #twtxt at https://t.co/uN1KDXwJ8B ) 75 | ``` 76 | 77 | Search for users in the Registry (e.g. ): 78 | 79 | ``` console 80 | $ curl 'http://localhost:8080/api/plain/users?q=dracoblue' 81 | dracoblue https://dracoblue.net/twtxt.txt 2016-02-09T12:42:26.000Z 82 | ``` 83 | 84 | ## License 85 | 86 | This work is copyrighted by DracoBlue (http://dracoblue.net) and licensed under the terms of the MIT License. 87 | -------------------------------------------------------------------------------- /src/swagger.json: -------------------------------------------------------------------------------- 1 | { 2 | "swagger": "2.0", 3 | "info": { 4 | "description": "This is a registry server for twtxt. 
Source code at [GitHub](https://github.com/DracoBlue/twtxt-registry).", 5 | "version": "1.0.0", 6 | "title": "Twtxt Registry", 7 | "termsOfService": "https://dracoblue.net/about/#nutzungsbedingungen", 8 | "contact": {"email": "JanS@DracoBlue.de"}, 9 | "license": { 10 | "name": "MIT", 11 | "url": "https://opensource.org/licenses/MIT" 12 | } 13 | }, 14 | "basePath": "/api", 15 | "tags": [ 16 | { 17 | "name": "tweets", 18 | "description": "Tweets in this twtxt registry" 19 | }, 20 | { 21 | "name": "users", 22 | "description": "Users in this twtxt registry" 23 | } 24 | ], 25 | "paths": { 26 | "/plain/users": { 27 | "post": { 28 | "tags": ["users"], 29 | "summary": "Add a user to the registry", 30 | "description": "", 31 | "operationId": "addUser", 32 | "consumes": [ 33 | "application/json" 34 | ], 35 | "produces": [ 36 | "text/plain" 37 | ], 38 | "parameters": [ 39 | { 40 | "in": "query", 41 | "name": "nickname", 42 | "description": "Nickname at the registry", 43 | "required": true, 44 | "type": "string" 45 | }, 46 | { 47 | "in": "query", 48 | "name": "url", 49 | "description": "Url of the twtxt.txt file", 50 | "required": true, 51 | "type": "string" 52 | } 53 | ] 54 | }, 55 | "get": { 56 | "tags": ["users"], 57 | "summary": "Finds users", 58 | "description": "", 59 | "operationId": "findUsers", 60 | "consumes": [ 61 | "application/json" 62 | ], 63 | "produces": [ 64 | "text/plain" 65 | ], 66 | "parameters": [ 67 | { 68 | "in": "query", 69 | "name": "q", 70 | "description": "Part of the nickname", 71 | "required": false, 72 | "default": "buckket", 73 | "type": "string" 74 | }, 75 | { 76 | "in": "query", 77 | "name": "page", 78 | "description": "Page to query for", 79 | "required": false, 80 | "type": "number" 81 | } 82 | ] 83 | } 84 | }, 85 | "/plain/mentions": { 86 | "get": { 87 | "tags": ["tweets"], 88 | "summary": "Finds tweets by mention", 89 | "description": "", 90 | "operationId": "findMentions", 91 | "consumes": [ 92 | "application/json" 93 | ], 94 | 
"produces": [ 95 | "text/plain" 96 | ], 97 | "parameters": [ 98 | { 99 | "in": "query", 100 | "name": "url", 101 | "description": "Url of the twtxt.txt file", 102 | "default": "https://buckket.org/twtxt.txt", 103 | "required": true, 104 | "type": "string" 105 | }, 106 | { 107 | "in": "query", 108 | "name": "page", 109 | "description": "Page to query for", 110 | "required": false, 111 | "type": "number" 112 | } 113 | ] 114 | } 115 | }, 116 | "/plain/tweets": { 117 | "get": { 118 | "tags": ["tweets"], 119 | "summary": "Finds tweets by query", 120 | "description": "", 121 | "operationId": "findTweets", 122 | "consumes": [ 123 | "application/json" 124 | ], 125 | "produces": [ 126 | "text/plain" 127 | ], 128 | "parameters": [ 129 | { 130 | "in": "query", 131 | "name": "q", 132 | "description": "Search for tweets", 133 | "required": false, 134 | "default": "twtxt", 135 | "type": "string" 136 | }, 137 | { 138 | "in": "query", 139 | "name": "page", 140 | "description": "Page to query for", 141 | "required": false, 142 | "type": "number" 143 | } 144 | ] 145 | } 146 | }, 147 | "/plain/tags/{tag}": { 148 | "get": { 149 | "tags": ["tweets"], 150 | "summary": "Finds tweets by query", 151 | "description": "", 152 | "operationId": "find", 153 | "consumes": [ 154 | "application/json" 155 | ], 156 | "produces": [ 157 | "text/plain" 158 | ], 159 | "parameters": [ 160 | { 161 | "in": "path", 162 | "name": "tag", 163 | "description": "The hash tag", 164 | "required": true, 165 | "type": "string", 166 | "default": "twtxt" 167 | }, 168 | { 169 | "in": "query", 170 | "name": "page", 171 | "description": "Page to query for", 172 | "required": false, 173 | "type": "number" 174 | } 175 | ] 176 | } 177 | } 178 | } 179 | } -------------------------------------------------------------------------------- /src/Storage.mjs: -------------------------------------------------------------------------------- 1 | import {stemmer} from 'stemmer' 2 | import md5 from 'md5'; 3 | import TwtxtTxt from 
'./twtxt-utils/TwtxtTxt.mjs'; 4 | import urlUtils from 'url'; 5 | import http from 'http'; 6 | import https from 'https'; 7 | import fs from 'fs'; 8 | import robots from 'robots'; 9 | 10 | const info = JSON.parse(fs.readFileSync( './package.json')); 11 | info.version = process.env.APP_VERSION || info.version || 'dev'; 12 | 13 | let Storage = function(datastore) { 14 | this.datastore = datastore; 15 | this.robotsParserUrlMap = {}; 16 | this.userAgent = "twtxt-registry/" + info.version; 17 | }; 18 | 19 | Storage.prototype.addUser = function(url, nickname, cb) { 20 | console.log("Save user for url", url); 21 | 22 | this.datastore.save({ 23 | key: this.datastore.key(['users', url]), 24 | data: { 25 | timestamp: new Date().toJSON(), 26 | nickname, 27 | url 28 | } 29 | }).then(cb); 30 | }; 31 | 32 | Storage.prototype.storeTweet = async function(tweet) { 33 | console.log("Save tweet for id", tweet.id); 34 | 35 | tweet.stems = tweet.text.split(" ").map((word) => { 36 | return stemmer(word.trim()).trim(); 37 | }); 38 | 39 | await this.datastore.save({ 40 | key: this.datastore.key(['tweets', tweet.id]), 41 | data: tweet 42 | }); 43 | }; 44 | 45 | Storage.prototype.getAllUsers = async function() { 46 | const query = this.datastore.createQuery('users'); 47 | const [users] = await query.run(); 48 | return users; 49 | } 50 | 51 | Storage.prototype.forEachUser = function(cb) { 52 | const query = this.datastore.createQuery('users'); 53 | query.run((err, entities) => { 54 | entities.map((entity) => { 55 | return entity; 56 | }).forEach(cb); 57 | }); 58 | }; 59 | 60 | Storage.prototype.getTweetsByHashTag = function(hashTag, page, cb) { 61 | const query = this.datastore.createQuery('tweets'); 62 | query.filter('hashTags', '=', hashTag); 63 | query.order('timestamp',{ 64 | descending: true 65 | }); 66 | query.offset((page * 20) - 20); 67 | query.limit(20); 68 | query.run((err, entities) => { 69 | cb(entities.map((entity) => { 70 | return entity; 71 | })); 72 | }); 73 | }; 74 | 75 | 
/** Number of tweets returned per page by the paginated tweet queries. */
const TWEETS_PER_PAGE = 20;

/**
 * Convert a 1-based page number into a datastore query offset.
 *
 * Anything that is not a positive integer is treated as page 1, so a request
 * such as `?page=-1` can no longer produce a negative offset.
 *
 * @param {number} page 1-based page number.
 * @returns {number} Offset of the first entity on that page.
 */
function offsetForPage(page) {
    const pageNumber = Number.isInteger(page) && page > 0 ? page : 1;
    return (pageNumber - 1) * TWEETS_PER_PAGE;
}

/**
 * Restrict a tweets query to one page, newest tweets first.
 *
 * @param {object} query Datastore query to configure (modified in place).
 * @param {number} page 1-based page number.
 */
function applyNewestFirstPage(query, page) {
    query.order('timestamp', {
        descending: true
    });
    query.offset(offsetForPage(page));
    query.limit(TWEETS_PER_PAGE);
}

/**
 * Run a datastore query and pass a copy of the resulting entity list to `cb`.
 *
 * The callbacks used by this module have no error parameter, so a datastore
 * error is thrown, exactly as getTweetsByMentions has always done. Previously
 * searchTweets/searchUsers ignored the error and failed with an unrelated
 * TypeError on `entities.map`.
 *
 * @param {object} query Datastore query to run.
 * @param {function(Array<object>)} cb Receives the entities.
 */
function runQueryForCallback(query, cb) {
    query.run((err, entities) => {
        if (err) {
            throw err;
        }
        cb([...entities]);
    });
}

/**
 * Split a search string into stems, one per word, the same way storeTweet
 * builds the `stems` property of a stored tweet.
 *
 * @param {string} queryString Raw search input.
 * @returns {Array<string>} Non-empty stems; empty array for an empty query.
 */
function stemsForQuery(queryString) {
    return String(queryString || '')
        .split(" ")
        .map((word) => stemmer(word.trim()).trim())
        .filter((stem) => stem !== '');
}

/**
 * Find tweets containing any of the words of `queryString`, newest first.
 * An empty query returns the latest tweets.
 *
 * @param {string} queryString Search words separated by spaces.
 * @param {number} page 1-based page number.
 * @param {function(Array<object>)} cb Receives the tweets of that page.
 */
Storage.prototype.searchTweets = function(queryString, page, cb) {
    const query = this.datastore.createQuery('tweets');
    // Stem each word separately: stemming the whole phrase at once only
    // stems its last word and then never matches the stored per-word stems.
    const stems = stemsForQuery(queryString);
    if (stems.length > 0) {
        query.filter('stems', 'IN', stems);
    }
    applyNewestFirstPage(query, page);
    runQueryForCallback(query, cb);
};

/**
 * Find users whose nickname equals `queryString`; an empty query returns
 * every user.
 *
 * NOTE(review): `page` is accepted but not applied here, so the result is not
 * paginated even though the API documents a `page` parameter - confirm
 * whether clients rely on receiving the full list before changing this.
 *
 * @param {string} queryString Nickname to look up (exact match).
 * @param {number} page Currently unused.
 * @param {function(Array<object>)} cb Receives the matching users.
 */
Storage.prototype.searchUsers = function(queryString, page, cb) {
    const query = this.datastore.createQuery('users');
    if (queryString) {
        query.filter('nickname', '=', queryString);
    }

    runQueryForCallback(query, cb);
};

/**
 * Find tweets that mention the given twtxt url, newest first.
 *
 * @param {string} twtxtUrl Url of the mentioned twtxt.txt file.
 * @param {number} page 1-based page number.
 * @param {function(Array<object>)} cb Receives the tweets of that page.
 */
Storage.prototype.getTweetsByMentions = function(twtxtUrl, page, cb) {
    const query = this.datastore.createQuery('tweets');
    query.filter('mentions', '=', twtxtUrl);
    applyNewestFirstPage(query, page);
    runQueryForCallback(query, cb);
};

/**
 * Decide whether a feed may be crawled in the current update run.
 *
 * Time is cut into slots of `crawlDelay` seconds. The feed is crawled when
 * "now" and "now + update interval" fall into different slots, i.e. when a
 * slot boundary is crossed before the next run.
 *
 * Example with crawlDelay = 4000 and runs every 900 seconds:
 *   at 2700: floor(2700 / 4000) = 0, floor(3600 / 4000) = 0 -> skip
 *   at 3600: floor(3600 / 4000) = 0, floor(4500 / 4000) = 1 -> crawl
 *   at 7200: floor(7200 / 4000) = 1, floor(8100 / 4000) = 2 -> crawl
 *
 * executeUpdate passes seconds since the epoch rather than seconds since the
 * start of the day; the slot arithmetic is the same for either counter.
 *
 * @param {number} crawlDelay Crawl delay in seconds; falsy means "no limit".
 * @param {number} secondsSinceStartOfDay Current time in seconds.
 * @param {number} updateIntervalInSeconds Seconds between two update runs.
 * @returns {boolean} True if the feed should be fetched now.
 */
Storage.prototype.isTimeForCrawl = function (crawlDelay, secondsSinceStartOfDay, updateIntervalInSeconds) {
    if (!crawlDelay) {
        return true;
    }

    const slotNow = Math.floor(secondsSinceStartOfDay / crawlDelay);
    const slotAtNextRun = Math.floor((secondsSinceStartOfDay + updateIntervalInSeconds) / crawlDelay);

    return slotAtNextRun > slotNow;
};

/**
 * Create a robots.txt parser for the given robots.txt url.
 *
 * @param {string} robotsTxtUrl Absolute url of the robots.txt file.
 * @returns {Promise<object>} Resolves with the parser; rejects if the parser
 *     reports that it could not be set up.
 */
Storage.prototype.getRobotsTxtParserForUrl = function (robotsTxtUrl) {
    return new Promise((resolve, reject) => {
        new robots.RobotsParser(
            robotsTxtUrl,
            this.userAgent,
            (parser, success) => {
                if (success) {
                    resolve(parser);
                } else {
                    reject(new Error('Cannot create parser for ' + robotsTxtUrl));
                }
            }
        );
    });
};

/**
 * Promise wrapper around the callback based `canFetch` of a robots parser.
 *
 * @returns {Promise<{access: boolean, url: string, reason: object}>}
 */
function checkRobotsAccess(parser, userAgent, path) {
    return new Promise((resolve) => {
        parser.canFetch(userAgent, path, (access, url, reason) => {
            resolve({
                access, url, reason
            });
        });
    });
}

/**
 * Read the stored `Last-Modified` marker of a feed.
 *
 * A failing lookup is logged and treated as "no marker", which only costs an
 * unconditional re-download of the feed.
 *
 * @returns {Promise<object|null>} The marker entity or null.
 */
async function loadLastModifiedEntity(storage, feedUrl) {
    try {
        const [entity] = await storage.datastore.get(
            storage.datastore.key(['last-modified-since', feedUrl])
        );
        return entity || null;
    } catch (error) {
        console.error("cannot read last-modified marker for", feedUrl, error);
        return null;
    }
}

/**
 * Perform the HTTP(S) request described by `options` and buffer the response.
 *
 * @returns {Promise<{statusCode: number, body: string, lastModified: (string|undefined)}>}
 */
function fetchFeed(client, options) {
    return new Promise((resolve, reject) => {
        const request = client.request(options, (res) => {
            const chunks = [];
            res.on('data', (chunk) => {
                chunks.push(chunk);
            });
            res.on('error', reject);
            res.on('end', () => {
                resolve({
                    body: Buffer.concat(chunks).toString(),
                    statusCode: res.statusCode,
                    lastModified: res.headers['last-modified']
                });
            });
        });
        request.on('error', reject);
        request.end();
    });
}

/**
 * Crawl the feed of a single user: honour robots.txt (crawl delay and access
 * rules), download the feed if it changed and store its tweets.
 *
 * @param {Storage} storage Storage instance doing the update.
 * @param {{url: string, nickname: string}} user Registered user.
 * @param {number} nowInSeconds Start time of this update run in seconds.
 * @param {number} updateInterval Seconds between two update runs.
 */
async function crawlUser(storage, user, nowInSeconds, updateInterval) {
    const options = urlUtils.parse(user.url);
    const client = options['protocol'] === "https:" ? https : http;

    options.headers = {
        "User-Agent": storage.userAgent
    };
    options.method = 'GET';

    const robotsTxtUrl = urlUtils.format({
        ...options,
        path: "/robots.txt",
        pathname: "/robots.txt"
    });

    let robotsUrlParser = null;

    try {
        robotsUrlParser = await storage.getRobotsTxtParserForUrl(robotsTxtUrl);
    } catch (error) {
        // Without a robots parser we cannot tell whether crawling is allowed.
        console.log("skip", user.url, "-", error.message);
        return;
    }

    const crawlDelay = Math.ceil(robotsUrlParser.getCrawlDelay(storage.userAgent));

    console.log("CrawlDelay: ", crawlDelay, " for ", robotsTxtUrl);
    console.log("last update at", nowInSeconds, "update interval is ", updateInterval);

    if (!storage.isTimeForCrawl(crawlDelay, nowInSeconds, updateInterval)) {
        console.log("does not match crawlDelay! STOP");
        return;
    }

    console.log("does match crawlDelay! FETCH");

    const {access, reason} = await checkRobotsAccess(robotsUrlParser, storage.userAgent, options.path);

    if (!access) {
        console.error("not allowed to fetch", user.url, "because of " + robotsTxtUrl + ":", reason && reason.type, " statusCode:", reason && reason.statusCode);
        return;
    }

    // The original declared a second, block scoped `lastModifiedEntity` inside
    // its try block, so the outer variable stayed null and the conditional
    // request header below was never sent.
    const lastModifiedEntity = await loadLastModifiedEntity(storage, user.url);

    if (lastModifiedEntity && lastModifiedEntity['timestamp']) {
        options.headers['If-Modified-Since'] = lastModifiedEntity['timestamp'];
    }

    const {statusCode, body, lastModified} = await fetchFeed(client, options);

    if (statusCode === 304) {
        return;
    }

    if (statusCode < 200 || statusCode >= 300) {
        // An error page or redirect is not a feed: do not parse it and do not
        // remember its Last-Modified header.
        console.error("cannot fetch", user.url, " statusCode:", statusCode);
        return;
    }

    const txt = new TwtxtTxt(user.url, user.nickname, body);

    await Promise.all(txt.getTweets().map(async (tweet) => {
        if (tweet.body.length > 1500) {
            console.log('Skip tweet - has more than 1500 body letters.');
        } else {
            await storage.storeTweet(tweet);
        }
    }));

    if (lastModified) {
        // FIXME: ttl 60*60*24
        await storage.datastore.save([{
            'key': storage.datastore.key(['last-modified-since', user.url]),
            'data': {
                'timestamp': lastModified
            }
        }]);
    }
}

/**
 * Crawl the feeds of all registered users once.
 *
 * Feeds are processed concurrently and independently: a feed that fails
 * (network error, datastore error, ...) is logged and does not abort or fail
 * the run for the other feeds. The returned promise still rejects if the list
 * of users itself cannot be loaded.
 *
 * @param {number} updateInterval Seconds between two update runs; used to
 *     honour the crawl delay of each host.
 * @returns {Promise<void>} Resolves once every feed has been handled.
 */
Storage.prototype.executeUpdate = async function (updateInterval) {
    const nowInSeconds = Math.ceil(Date.now() / 1000);
    const users = await this.getAllUsers();

    const outcomes = await Promise.allSettled(
        users.map((user) => crawlUser(this, user, nowInSeconds, updateInterval))
    );

    outcomes.forEach((outcome, index) => {
        if (outcome.status === 'rejected') {
            console.error("update failed for", users[index].url, outcome.reason);
        }
    });
};

/**
 * Start the periodic update: run one update immediately and then one every
 * `updateInterval` seconds. Calling it again replaces the previous timer.
 *
 * @param {number} updateInterval Seconds between two update runs.
 */
Storage.prototype.startUpdating = function(updateInterval) {
    clearInterval(this.updatingInterval);

    const execute = () => {
        // Never leave the promise floating: an unhandled rejection would
        // terminate the server process.
        this.executeUpdate(updateInterval).catch((error) => {
            console.error("update run failed:", error);
        });
    };

    this.updatingInterval = setInterval(execute, updateInterval * 1000);

    execute();
};


export default Storage;