├── .gitignore
├── hooks
    └── build
├── index.yaml
├── Dockerfile
├── src
    ├── fetch-tweets.js
    ├── server.js
    ├── plainApi.mjs
    ├── twtxt-utils
    │   └── TwtxtTxt.mjs
    ├── swagger.json
    └── Storage.mjs
├── package.json
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 | 


--------------------------------------------------------------------------------
/hooks/build:
--------------------------------------------------------------------------------
#!/bin/sh
# Build the image and bake the source branch in as APP_VERSION.
# SOURCE_BRANCH, DOCKERFILE_PATH and IMAGE_NAME are expected in the environment
# (presumably provided by the Docker Hub automated build - verify for other CI).
# The expansions are quoted so empty values or values with spaces cannot split
# into extra arguments.
docker build --build-arg "APP_VERSION=$SOURCE_BRANCH" -f "$DOCKERFILE_PATH" -t "$IMAGE_NAME" .
3 | 


--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
 1 | indexes:
 2 | - kind: tweets
 3 |   properties:
 4 |     - name: mentions
 5 |     - name: timestamp
 6 |       direction: desc
 7 | - kind: tweets
 8 |   properties:
 9 |     - name: stems
10 |     - name: timestamp
11 |       direction: desc
12 | - kind: tweets
13 |   properties:
14 |     - name: hashTags
15 |     - name: timestamp
16 |       direction: desc
17 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 node:16.19.1-alpine3.17
 2 | RUN npm install -g npm
 3 | RUN mkdir -p /usr/src/app/
 4 | RUN chown node:node /usr/src/app
 5 | ADD --chown=node:node package*.json /usr/src/app/
 6 | ADD --chown=node:node src /usr/src/app/src
 7 | WORKDIR /usr/src/app
 8 | USER node
 9 | RUN npm install
10 | ARG APP_VERSION
11 | ENV APP_VERSION $APP_VERSION
12 | ENV NODE_ENV "production"
13 | ENV PORT 8080
14 | CMD npm start
15 | 
16 | 


--------------------------------------------------------------------------------
/src/fetch-tweets.js:
--------------------------------------------------------------------------------
 1 | import Storage from './Storage.mjs';
 2 | import dotenv from 'dotenv';
 3 | import express from 'express';
 4 | import {Datastore} from "@google-cloud/datastore";
 5 | import http from "http";
 6 | import fs from "fs";
 7 | import plainApi from './plainApi.mjs';
 8 | 
 9 | dotenv.config();
10 | 
11 | process.on('unhandledRejection', (reason, promise) => {
12 |     throw reason;
13 | });
14 | 
15 | var storage = new Storage(
16 |   new Datastore()
17 | );
18 | 
19 | const updateInterval = parseInt(process.env.UPDATING_INTERVAL || "900", 10);
20 | 
21 | (async () => {
22 |     await storage.executeUpdate(updateInterval);
23 |     process.exit(0);
24 | })();
25 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "twtxt-registry",
 3 |   "description": "",
 4 |   "main": "src/server.js",
 5 |   "type": "module",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1",
 8 |     "start": "node src/server.js",
 9 |     "fetch-tweets": "node src/fetch-tweets.js"
10 |   },
11 |   "repository": {
12 |     "type": "git",
13 |     "url": "git+https://github.com/DracoBlue/twtxt-registry.git"
14 |   },
15 |   "dependencies": {
16 |     "@google-cloud/datastore": "^7.3.2",
17 |     "dotenv": "^16.0.3",
18 |     "express": "^4.18.2",
19 |     "md5": "2.0.0",
20 |     "moment": "^2.29.4",
21 |     "robots": "0.9.4",
22 |     "stemmer": "^2.0.1",
23 |     "swagger-ui-dist": "^4.16.1"
24 |   },
25 |   "author": "DracoBlue <JanS@DracoBlue.de>",
26 |   "license": "MIT",
27 |   "bugs": {
28 |     "url": "https://github.com/DracoBlue/twtxt-registry/issues"
29 |   },
30 |   "homepage": "https://github.com/DracoBlue/twtxt-registry#readme"
31 | }
32 | 


--------------------------------------------------------------------------------
/src/server.js:
--------------------------------------------------------------------------------
 1 | import dotenv from 'dotenv';
 2 | dotenv.config();
 3 | 
 4 | import Storage from './Storage.mjs';
 5 | import express from 'express';
 6 | import {Datastore} from "@google-cloud/datastore";
 7 | import http from "http";
 8 | import fs from "fs";
 9 | import plainApi from './plainApi.mjs';
10 | 
11 | 
12 | 
// Create an express instance and set a port variable
var app = express();
var port = process.env.PORT || 8080;

// Disable etag headers on responses
app.disable('etag');

var storage = new Storage(
  new Datastore()
);

// Plain text registry api
app.use('/api/plain/', plainApi(storage));

// Landing page with links to the project and the api documentation
app.get('/', function (req, res) {
  var response = [
    "<!DOCTYPE html>",
    "<html>",
    "<head>",
    "<title>twtxt registry</title>",
    "</head>",
    "<body>",
    "<h1>Twtxt Registry</h1>",
    '<p>This is a hosted registry for <a href="https://github.com/buckket/twtxt">https://github.com/buckket/twtxt</a>. The registry software is developed by <a href="https://dracoblue.net">dracoblue</a> and you may find the source code at <a href="https://github.com/DracoBlue/twtxt-registry">https://github.com/DracoBlue/twtxt-registry</a>.</p>',
    '<p>The api doc can be found at <a href="/swagger-ui/">/swagger-ui/</a>.</p>',
    "</body>",
    "</html>"
  ];
  res.set('Content-Type', 'text/html');
  res.send(response.join("\n"));
});

// Serve the swagger-ui initializer script, pointed at our own api description
// instead of the petstore demo. It is JavaScript and has to be delivered as such.
var renderSwaggerInitializerJs = function(req, res) {
  var response = fs.readFileSync('./node_modules/swagger-ui-dist/swagger-initializer.js').toString();
  response = response.replace("https://petstore.swagger.io/v2/swagger.json", "/api/swagger.json");

  res.set('Content-Type', 'application/javascript');
  res.send(response);
};

// Serve the unmodified swagger-ui start page
var renderSwaggerHtml = function(req, res) {
  var response = fs.readFileSync('./node_modules/swagger-ui-dist/index.html').toString();
  res.set('Content-Type', 'text/html');
  res.send(response);
};

// These routes are registered before the static handler below, so they win
// over the files of the same name shipped with swagger-ui-dist.
app.get("/swagger-ui/swagger-initializer.js", renderSwaggerInitializerJs);
app.get("/swagger-ui/index.html", renderSwaggerHtml);
app.get("/swagger-ui/", renderSwaggerHtml);

// Api description; the version is resolved like the user agent version in
// Storage.mjs: APP_VERSION, then package.json, then 'dev'.
app.get("/api/swagger.json", function(req, res) {
  res.set('Content-Type', 'application/json');
  var response = JSON.parse(fs.readFileSync('./src/swagger.json').toString());
  var info = JSON.parse(fs.readFileSync('./package.json'));
  response.info.version = process.env.APP_VERSION || info.version || 'dev';
  res.send(JSON.stringify(response));
});

// Serve the remaining swagger-ui assets straight from the installed package
app.use("/swagger-ui/", express.static("./node_modules/swagger-ui-dist/"));

var server = http.createServer(app).listen(port, function() {
  console.log('twtxt registry listening on port ' + port);
  if (process.env.START_UPDATING) {
    // Fall back to 900 seconds when UPDATING_INTERVAL is unset, not a number
    // or not positive.
    const parsedInterval = parseInt(process.env.UPDATING_INTERVAL || "", 10);
    const updateInterval = parsedInterval > 0 ? parsedInterval : 900;
    storage.startUpdating(updateInterval);
  }
});

// Seed the registry with some well known feeds. A failing seed is logged and
// must not take the server down. Promise.resolve() keeps this working when
// addUser() does not return a promise.
[
  ["https://buckket.org/twtxt.txt", "buckket"],
  ["https://buckket.org/twtxt_news.txt", "twtxt_news"],
  ["https://dracoblue.net/twtxt.txt", "dracoblue"]
].forEach(function(seed) {
  Promise.resolve(storage.addUser(seed[0], seed[1], function() {
  })).catch(function(error) {
    console.error("Could not add default user", seed[0], error);
  });
});
88 | 


--------------------------------------------------------------------------------
/src/plainApi.mjs:
--------------------------------------------------------------------------------
  1 | import express from "express";
  2 | import url from "url";
  3 | 
  4 | var plainApi = function(storage) {
  5 |   var api = express.Router();
  6 | 
  7 |   var renderAuthorForTweet = function(tweet) {
  8 |     if (tweet.author_nickname) {
  9 |       return tweet.author_nickname + "\t" + tweet.author_url;
 10 |     }
 11 | 
 12 |     return tweet.author_url + "\t" + tweet.author_url;
 13 |   };
 14 | 
 15 |   api.use(function (req, res, next) {
 16 |     res.set('Content-Type', 'text/plain');
 17 |     next();
 18 |   });
 19 | 
 20 |   api.get('/tags/:tag', function (req, res) {
 21 |     if (!req.params.tag) {
 22 |       res.status(400);
 23 |       res.end();
 24 |       return;
 25 |     }
 26 | 
 27 |     var page = parseInt(req.query.page || 1, 10) || 1;
 28 | 
 29 |     storage.getTweetsByHashTag("#" + req.params.tag, page, function (tweets) {
 30 |       var response = [];
 31 | 
 32 |       tweets.forEach(function (tweet) {
 33 |         response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text);
 34 |       });
 35 |       res.send(response.join("\n"));
 36 |     })
 37 |   });
 38 | 
 39 |   api.get('/tweets', function (req, res) {
 40 |     var page = parseInt(req.query.page || 1, 10) || 1;
 41 | 
 42 |     storage.searchTweets(req.query.q || '', page, function (tweets) {
 43 |       var response = [];
 44 | 
 45 |       tweets.forEach(function (tweet) {
 46 |         response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text);
 47 |       });
 48 |       res.send(response.join("\n"));
 49 |     })
 50 |   });
 51 | 
 52 |   api.get('/mentions', function (req, res) {
 53 |     if (!req.query.url) {
 54 |       res.status(400);
 55 |       res.send("`url` must be provided.");
 56 |       res.end();
 57 |       return;
 58 |     }
 59 | 
 60 |     var page = parseInt(req.query.page || 1, 10) || 1;
 61 | 
 62 |     storage.getTweetsByMentions(req.query.url, page, function (tweets) {
 63 |       var response = [];
 64 | 
 65 |       tweets.forEach(function (tweet) {
 66 |         response.push(renderAuthorForTweet(tweet) + "\t" + tweet.timestamp + "\t" + tweet.text);
 67 |       });
 68 |       res.send(response.join("\n"));
 69 |     })
 70 |   });
 71 | 
 72 |   api.post('/users', function (req, res) {
 73 |     if (!req.query.url || !req.query.nickname) {
 74 |       res.status(400);
 75 |       res.send("`nickname` and `url` must be provided.");
 76 |       return;
 77 |     }
 78 | 
 79 |     if (!req.query.nickname.match(/^[A-Za-z0-9_-]+$/)) {
 80 |       res.status(400);
 81 |       res.send("`nickname` must match ^[A-Za-z0-9_-]+$");
 82 |       return ;
 83 |     }
 84 | 
 85 |     var urlParts = url.parse(req.query.url);
 86 |     if (!urlParts['hostname'] || !urlParts['protocol'] || (urlParts['protocol'] != 'https:' && urlParts['protocol'] != 'http:') ) {
 87 |       res.status(400);
 88 |       res.send("`url` must provide hostname and either protocol as http or https!");
 89 |       return ;
 90 |     }
 91 | 
 92 |     storage.addUser(req.query.url, req.query.nickname, function () {
 93 |       res.send("OK");
 94 |     });
 95 |   });
 96 | 
 97 |   api.get('/users', function (req, res) {
 98 |     var page = parseInt(req.query.page || 1, 10) || 1;
 99 | 
100 |     storage.searchUsers(req.query.q || '', page, function (users) {
101 |       var response = [];
102 | 
103 |       users.forEach(function (user) {
104 |         response.push(user.nickname + "\t" + user.url + "\t" + user.timestamp);
105 |       });
106 |       res.send(response.join("\n"));
107 |     })
108 |   });
109 | 
110 |   return api;
111 | };
112 | 
113 | export default plainApi;


--------------------------------------------------------------------------------
/src/twtxt-utils/TwtxtTxt.mjs:
--------------------------------------------------------------------------------
  1 | import moment from 'moment';
  2 | import urlUtils from "url";
  3 | import md5 from 'md5';
  4 | 
  5 | var TwtxtTxt = function(url, nickname, body) {
  6 |   this.body = body;
  7 |   this.url = url;
  8 |   this.nickname = nickname;
  9 | 
 10 |   this.setTweetsByBody(this.body);
 11 | };
 12 | 
 13 | TwtxtTxt.prototype.getTweets = function() {
 14 |   return this.tweets;
 15 | };
 16 | 
 17 | TwtxtTxt.prototype.setTweetsByBody = function(body) {
 18 |   var that = this;
 19 |   this.tweets = [];
 20 | 
 21 |   body.split("\n").forEach(function(row) {
 22 |     row = (row || "").trim();
 23 | 
 24 |     if (row) {
 25 |       var match = row.match(/^([^\t]+)\t(.+)/);
 26 | 
 27 |       if (match && moment(match[1]).isValid()) {
 28 | 
 29 |         var text = match[2].trim();
 30 |         var body = text.toString();
 31 | 
 32 |         var hashTags = that.extractHashTagsByRow(text);
 33 |         var mentions = that.extractMentionsByRow(text);
 34 | 
 35 |         if (body) {
 36 |           var currentMatch = body.match(/@<([^ ]+) ([^> ]+)>/);
 37 |           while (currentMatch) {
 38 |             body = body.replace(currentMatch[0], '<a href="' + that.encodeXml(currentMatch[2]) + '" class="username">@' + that.encodeXml(currentMatch[1]) + '</a>');
 39 |             currentMatch = body.match(/@<([^ ]+) ([^> ]+)>/);
 40 |           }
 41 | 
 42 |           currentMatch = body.match(/@<([^> ]+)>/);
 43 |           while (currentMatch) {
 44 |             body = body.replace(currentMatch[0], '<a href="' + that.encodeXml(currentMatch[1]) + '" class="username">@' + that.encodeXml(urlUtils.parse(currentMatch[1])['hostname'] || currentMatch[1]) + '</a>');
 45 |             currentMatch = body.match(/@<([^> ]+)>/);
 46 |           }
 47 |         }
 48 | 
 49 | 
 50 |         that.tweets.push({
 51 |           id: md5(that.url + "\t" + row),
 52 |           timestamp: moment(match[1]).toISOString(),
 53 |           hashTags: hashTags,
 54 |           mentions: mentions,
 55 |           author_url: that.url,
 56 |           author_nickname: that.nickname,
 57 |           body: body,
 58 |           text: text
 59 |         });
 60 |       }
 61 |     }
 62 |   });
 63 | };
 64 | 
 65 | 
 66 | var xml_special_to_escaped_one_map = {
 67 |   '&': '&',
 68 |   '"': '"',
 69 |   '<': '&lt;',
 70 |   '>': '&gt;'
 71 | };
 72 | 
 73 | var escaped_one_to_xml_special_map = {
 74 |   '&': '&',
 75 |   '"': '"',
 76 |   '&lt;': '<',
 77 |   '&gt;': '>'
 78 | };
 79 | 
 80 | TwtxtTxt.prototype.encodeXml = function(string) {
 81 |   return string.replace(/([\&"<>])/g, function(str, item) {
 82 |     return xml_special_to_escaped_one_map[item];
 83 |   });
 84 | };
 85 | 
 86 | TwtxtTxt.prototype.decodeXml = function(string) {
 87 |   return string.replace(/("|<|>|&)/g,
 88 |     function(str, item) {
 89 |       return escaped_one_to_xml_special_map[item];
 90 |     });
 91 | };
 92 | 
 93 | TwtxtTxt.prototype.extractHashTagsByRow = function(string) {
 94 |   return string.match(/(#[^\s#<>'"]+)/g) || [];
 95 | };
 96 | 
 97 | 
 98 | TwtxtTxt.prototype.extractMentionsByRow = function(body) {
 99 |   var mentions = [];
100 |   var currentMatch = body.match(/@<([^ ]+ [^> ]+)>/g);
101 |   if (currentMatch) {
102 |     currentMatch.forEach(function(mention) {
103 |       var rowMatch = mention.match(/@<([^ ]+) ([^> ]+)>/);
104 |       mentions.push(rowMatch[2]);
105 |     })
106 |   }
107 | 
108 |   currentMatch = body.match(/@<([^> ]+)>/g);
109 |   if (currentMatch) {
110 |     currentMatch.forEach(function(mention) {
111 |       var rowMatch = mention.match(/@<([^> ]+)>/);
112 |       mentions.push(rowMatch[1]);
113 |     })
114 |   }
115 | 
116 |   return mentions;
117 | };
118 | 
119 | export default TwtxtTxt;


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Twtxt Registry Server
 2 | 
 3 | A small registry server for twtxt that lets you query for mentions and hash tags.
 4 | 
 5 | ## Prerequisites
 6 | 
 7 | * [gcp firestore in datastore mode](https://cloud.google.com/datastore/docs)
 8 | * [npm](https://nodejs.org) installed
 9 | 
10 | ## Installation
11 | 
12 | Set up the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to a user (or use gcloud auth login) for a gcp datastore project.
13 | Replace `$GOOGLE_PROJECT_NAME` with your google project name.
14 | 
15 | ``` console
16 | $ npm install
17 | $ export PORT=8080
18 | $ export START_UPDATING=1
19 | $ export UPDATING_INTERVAL=900
20 | $ gcloud datastore indexes create --project $GOOGLE_PROJECT_NAME index.yaml
21 | Configurations to update:
22 | 
23 | descriptor:      [index.yaml]
24 | type:            [datastore indexes]
25 | target project:  [your-project-name]
26 | 
27 | 
28 | Do you want to continue (Y/n)?  y
29 | 
30 | ....done.     
31 | $ node src/server.js
32 | ```
33 | 
34 | ## Example API calls for the Plain-Text-Api
35 | 
36 | You can see a demo running at <https://registry.twtxt.org> and a swagger-ui api doc at <https://registry.twtxt.org/swagger-ui/>.
37 | 
38 | Add a new Twtxt User to the Registry:
39 | 
40 | ``` console
41 | $ curl -X POST 'http://localhost:8080/api/plain/users?url=https://dracoblue.net/twtxt.txt&nickname=dracoblue'
42 | OK
43 | ```
44 | 
45 | See latest tweets in the Registry (e.g. <https://registry.twtxt.org/api/plain/tweets>):
46 | 
47 | ``` console
48 | $ curl 'http://localhost:8080/api/plain/tweets'
49 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-06T21:32:02.000Z	@erlehmann is messing with timestamps in @buckket #twtxt :)
50 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-06T12:14:18.000Z	Simple nodejs script to convert your twitter timeline to twtxt: https://t.co/txnWsC5jvA ( find my #twtxt at https://t.co/uN1KDXwJ8B )
51 | ```
52 | 
53 | Search for tweets in the Registry (e.g. <https://registry.twtxt.org/api/plain/tweets?q=twtxt>):
54 | 
55 | ``` console
56 | $ curl 'http://localhost:8080/api/plain/tweets?q=twtxt'
57 | buckket	https://buckket.org/twtxt.txt	2016-02-09T12:42:26.000Z	Do we need an IRC channel for twtxt?
58 | buckket	https://buckket.org/twtxt.txt	2016-02-09T12:42:12.000Z	Good Morning, twtxt-world!
59 | ```
60 | 
61 | Retrieve a list of all mentions of a specific twtxt User like `https://buckket.org/twtxt.txt` (e.g. <https://registry.twtxt.org/api/plain/mentions?url=https://buckket.org/twtxt.txt>):
62 | 
63 | ``` console
64 | $ curl 'http://localhost:8080/api/plain/mentions?url=https://buckket.org/twtxt.txt'
65 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-09T12:57:59.000Z	@<buckket https://buckket.org/twtxt.txt> something like https://gitter.im/ or a freenode channel?
66 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-08T22:51:47.000Z	@<buckket https://buckket.org/twtxt.txt> looks nice ;)
67 | ```
68 | 
69 | Retrieve a list of all tweets with a specific tag like `#twtxt` (e.g. <https://registry.twtxt.org/api/plain/tags/twtxt>):
70 | 
71 | ``` console
72 | $ curl 'http://localhost:8080/api/plain/tags/twtxt'
73 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-06T21:32:02.000Z	@erlehmann is messing with timestamps in @buckket #twtxt :)
74 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-06T12:14:18.000Z	Simple nodejs script to convert your twitter timeline to twtxt: https://t.co/txnWsC5jvA ( find my #twtxt at https://t.co/uN1KDXwJ8B )
75 | ```
76 | 
77 | Search for users in the Registry (e.g. <https://registry.twtxt.org/api/plain/users?q=dracoblue>):
78 | 
79 | ``` console
80 | $ curl 'http://localhost:8080/api/plain/users?q=dracoblue'
81 | dracoblue	https://dracoblue.net/twtxt.txt	2016-02-09T12:42:26.000Z
82 | ```
83 | 
84 | ## License
85 | 
86 | This work is copyright by DracoBlue (http://dracoblue.net) and licensed under the terms of MIT License.
87 | 


--------------------------------------------------------------------------------
/src/swagger.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "swagger": "2.0",
  3 |   "info": {
  4 |     "description": "This is a registry server for twtxt. Sourcecode at [Github](https://github.com/DracoBlue/twtxt-registry).",
  5 |     "version": "1.0.0",
  6 |     "title": "Twtxt Registry",
  7 |     "termsOfService": "https://dracoblue.net/about/#nutzungsbedingungen",
  8 |     "contact": {"email": "JanS@DracoBlue.de"},
  9 |     "license": {
 10 |       "name": "MIT",
 11 |       "url": "https://opensource.org/licenses/MIT"
 12 |     }
 13 |   },
 14 |   "basePath": "/api",
 15 |   "tags": [
 16 |     {
 17 |       "name": "tweets",
 18 |       "description": "Tweets in this twtxt registry"
 19 |     },
 20 |     {
 21 |       "name": "users",
 22 |       "description": "Users in this twtxt registry"
 23 |     }
 24 |   ],
 25 |   "paths": {
 26 |     "/plain/users": {
 27 |       "post": {
 28 |         "tags": ["users"],
 29 |         "summary": "Add a user to the registry",
 30 |         "description": "",
 31 |         "operationId": "addUser",
 32 |         "consumes": [
 33 |           "application/json"
 34 |         ],
 35 |         "produces": [
 36 |           "text/plain"
 37 |         ],
 38 |         "parameters": [
 39 |           {
 40 |             "in": "query",
 41 |             "name": "nickname",
 42 |             "description": "Nickname at the registry",
 43 |             "required": true,
 44 |             "type": "string"
 45 |           },
 46 |           {
 47 |             "in": "query",
 48 |             "name": "url",
 49 |             "description": "Url of the twtxt.txt file",
 50 |             "required": true,
 51 |             "type": "string"
 52 |           }
 53 |         ]
 54 |       },
 55 |       "get": {
 56 |         "tags": ["users"],
 57 |         "summary": "Finds users",
 58 |         "description": "",
 59 |         "operationId": "findUsers",
 60 |         "consumes": [
 61 |           "application/json"
 62 |         ],
 63 |         "produces": [
 64 |           "text/plain"
 65 |         ],
 66 |         "parameters": [
 67 |           {
 68 |             "in": "query",
 69 |             "name": "q",
 70 |             "description": "Part of the nickname",
 71 |             "required": false,
 72 |             "default": "buckket",
 73 |             "type": "string"
 74 |           },
 75 |           {
 76 |             "in": "query",
 77 |             "name": "page",
 78 |             "description": "Page to query for",
 79 |             "required": false,
 80 |             "type": "number"
 81 |           }
 82 |         ]
 83 |       }
 84 |     },
 85 |     "/plain/mentions": {
 86 |       "get": {
 87 |         "tags": ["tweets"],
 88 |         "summary": "Finds tweets by mention",
 89 |         "description": "",
 90 |         "operationId": "findMentions",
 91 |         "consumes": [
 92 |           "application/json"
 93 |         ],
 94 |         "produces": [
 95 |           "text/plain"
 96 |         ],
 97 |         "parameters": [
 98 |           {
 99 |             "in": "query",
100 |             "name": "url",
101 |             "description": "Url of the twtxt.txt file",
102 |             "default": "https://buckket.org/twtxt.txt",
103 |             "required": true,
104 |             "type": "string"
105 |           },
106 |           {
107 |             "in": "query",
108 |             "name": "page",
109 |             "description": "Page to query for",
110 |             "required": false,
111 |             "type": "number"
112 |           }
113 |         ]
114 |       }
115 |     },
116 |     "/plain/tweets": {
117 |       "get": {
118 |         "tags": ["tweets"],
119 |         "summary": "Finds tweets by query",
120 |         "description": "",
121 |         "operationId": "findTweets",
122 |         "consumes": [
123 |           "application/json"
124 |         ],
125 |         "produces": [
126 |           "text/plain"
127 |         ],
128 |         "parameters": [
129 |           {
130 |             "in": "query",
131 |             "name": "q",
132 |             "description": "Search for tweets",
133 |             "required": false,
134 |             "default": "twtxt",
135 |             "type": "string"
136 |           },
137 |           {
138 |             "in": "query",
139 |             "name": "page",
140 |             "description": "Page to query for",
141 |             "required": false,
142 |             "type": "number"
143 |           }
144 |         ]
145 |       }
146 |     },
147 |     "/plain/tags/{tag}": {
148 |       "get": {
149 |         "tags": ["tweets"],
150 |         "summary": "Finds tweets by hash tag",
151 |         "description": "",
152 |         "operationId": "find",
153 |         "consumes": [
154 |           "application/json"
155 |         ],
156 |         "produces": [
157 |           "text/plain"
158 |         ],
159 |         "parameters": [
160 |           {
161 |             "in": "path",
162 |             "name": "tag",
163 |             "description": "The hash tag",
164 |             "required": true,
165 |             "type": "string",
166 |             "default": "twtxt"
167 |           },
168 |           {
169 |             "in": "query",
170 |             "name": "page",
171 |             "description": "Page to query for",
172 |             "required": false,
173 |             "type": "number"
174 |           }
175 |         ]
176 |       }
177 |     }
178 |   }
179 | }


--------------------------------------------------------------------------------
/src/Storage.mjs:
--------------------------------------------------------------------------------
  1 | import {stemmer} from 'stemmer'
  2 | import md5 from 'md5';
  3 | import TwtxtTxt from './twtxt-utils/TwtxtTxt.mjs';
  4 | import urlUtils from 'url';
  5 | import http from 'http';
  6 | import https from 'https';
  7 | import fs from 'fs';
  8 | import robots from 'robots';
  9 | 
 10 | const info = JSON.parse(fs.readFileSync( './package.json'));
 11 | info.version = process.env.APP_VERSION || info.version || 'dev';
 12 | 
 13 | let Storage = function(datastore) {
 14 |   this.datastore = datastore;
 15 |   this.robotsParserUrlMap = {};
 16 |   this.userAgent = "twtxt-registry/" + info.version;
 17 | };
 18 | 
 19 | Storage.prototype.addUser = function(url, nickname, cb) {
 20 |   console.log("Save user for url", url);
 21 | 
 22 |   this.datastore.save({
 23 |     key: this.datastore.key(['users', url]),
 24 |     data: {
 25 |       timestamp: new Date().toJSON(),
 26 |       nickname,
 27 |       url
 28 |     }
 29 |   }).then(cb);
 30 | };
 31 | 
 32 | Storage.prototype.storeTweet = async function(tweet) {
 33 |   console.log("Save tweet for id", tweet.id);
 34 | 
 35 |   tweet.stems = tweet.text.split(" ").map((word) => {
 36 |     return stemmer(word.trim()).trim();
 37 |   });
 38 | 
 39 |   await this.datastore.save({
 40 |     key: this.datastore.key(['tweets', tweet.id]),
 41 |     data: tweet
 42 |   });
 43 | };
 44 | 
 45 | Storage.prototype.getAllUsers = async function() {
 46 |   const query = this.datastore.createQuery('users');
 47 |   const [users] = await query.run();
 48 |   return users;
 49 | }
 50 | 
 51 | Storage.prototype.forEachUser = function(cb) {
 52 |   const query = this.datastore.createQuery('users');
 53 |   query.run((err, entities) => {
 54 |     entities.map((entity) => {
 55 |       return entity;
 56 |     }).forEach(cb);
 57 |   });
 58 | };
 59 | 
 60 | Storage.prototype.getTweetsByHashTag = function(hashTag, page, cb) {
 61 |   const query = this.datastore.createQuery('tweets');
 62 |   query.filter('hashTags', '=', hashTag);
 63 |   query.order('timestamp',{
 64 |     descending: true
 65 |   });
 66 |   query.offset((page * 20) - 20);
 67 |   query.limit(20);
 68 |   query.run((err, entities) => {
 69 |     cb(entities.map((entity) => {
 70 |       return entity;
 71 |     }));
 72 |   });
 73 | };
 74 | 
 75 | Storage.prototype.searchTweets = function(queryString, page, cb) {
 76 |   const query = this.datastore.createQuery('tweets');
 77 |   if (queryString) {
 78 |     query.filter('stems', 'IN', stemmer(queryString).split(" "));
 79 |   }
 80 |   query.order('timestamp',{
 81 |     descending: true
 82 |   });
 83 |   query.offset((page * 20) - 20);
 84 |   query.limit(20);
 85 |   query.run((err, entities) => {
 86 |     cb(entities.map((entity) => {
 87 |       return entity;
 88 |     }));
 89 |   });
 90 | };
 91 | 
 92 | Storage.prototype.searchUsers = function(queryString, page, cb) {
 93 |   const query = this.datastore.createQuery('users');
 94 |   if (queryString) {
 95 |     query.filter('nickname', '=', queryString);
 96 |   }
 97 | 
 98 |   query.run((err, entities) => {
 99 |     cb(entities.map((entity) => {
100 |       return entity;
101 |     }));
102 |   });
103 | };
104 | 
105 | 
106 | Storage.prototype.getTweetsByMentions = function(twtxtUrl, page, cb) {
107 |   const query = this.datastore.createQuery('tweets');
108 |   query.filter('mentions', '=', twtxtUrl);
109 |   query.order('timestamp',{
110 |     descending: true
111 |   });
112 |   query.offset((page * 20) - 20);
113 |   query.limit(20);
114 |   query.run((err, entities) => {
115 |     if (err) {
116 |       throw err;
117 |     }
118 |     cb(entities.map((entity) => {
119 |       return entity;
120 |     }));
121 |   });
122 | };
123 | 
124 | Storage.prototype.isTimeForCrawl = function (crawlDelay, secondsSinceStartOfDay, updateIntervalInSeconds) {
125 |   if (!crawlDelay) {
126 |     return true;
127 |   }
128 | 
129 |   let crawlSlotNumberPreviousSlot = Math.floor(secondsSinceStartOfDay / crawlDelay);
130 |   let crawlSlotNumberCurrentSlot = Math.floor((secondsSinceStartOfDay + updateIntervalInSeconds) / crawlDelay);
131 | 
132 |   if (crawlSlotNumberCurrentSlot > crawlSlotNumberPreviousSlot) {
133 |     return true;
134 |   }
135 | 
136 |   return false;
137 | 
138 |   /* secondsSinceStartOfDay:
139 |    * 0
140 |    * 900
141 |    * 1800
142 |    * 2700
143 |    * 3600        ---> Math.floor(3600 / 4000) = 0
144 |    *    --> 4000
145 |    * 4500        ---> Math.floor(4500 / 4000) = 1
146 |    * 5400
147 |    * 6300
148 |    * 7200
149 |    *    --> 8000
150 |    * 8100
151 |    *
152 |    * crawlDelay = 4000
153 |    */
154 | 
155 | 
156 | }
157 | 
158 | Storage.prototype.getRobotsTxtParserForUrl = async function (robotsTxtUrl) {
159 |   return new Promise((resolve, reject) => {
160 |     new robots.RobotsParser(
161 |         robotsTxtUrl,
162 |         this.userAgent,
163 |         (parser, success) => {
164 |           if (success) {
165 |             resolve(parser);
166 |           } else {
167 |             reject(new Error('Cannot create parser for ' + robotsTxtUrl));
168 |           }
169 |         }
170 |     );
171 |   });
172 | };
173 | 
174 | Storage.prototype.executeUpdate = async function (updateInterval) {
175 |   const now = new Date();
176 |   const lastUpdate = Math.ceil(now.getTime() / 1000);
177 | 
178 |   let users = await this.getAllUsers();
179 | 
180 |   await Promise.all(users.map(async (user) => {
181 |     var client = http;
182 |     var options = urlUtils.parse(user.url);
183 | 
184 |     if (options['protocol'] === "https:") {
185 |       client = https;
186 |     }
187 | 
188 |     options.headers = {
189 |       "User-Agent": this.userAgent
190 |     };
191 | 
192 |     options.method = 'GET';
193 | 
194 |     var robotsTxtOptions = JSON.parse(JSON.stringify(options));
195 |     robotsTxtOptions.path = "/robots.txt";
196 |     robotsTxtOptions.pathname = "/robots.txt";
197 |     var robotsTxtUrl = urlUtils.format(robotsTxtOptions);
198 | 
199 |     let robotsUrlParser = null;
200 | 
201 |     try {
202 |       robotsUrlParser = await this.getRobotsTxtParserForUrl(robotsTxtUrl);
203 |     } catch (error) {
204 |       return ;
205 |     }
206 | 
207 |     let crawlDelay = Math.ceil(robotsUrlParser.getCrawlDelay(this.userAgent));
208 | 
209 |     /* default delay is 100 times a day (if it is 900 and seconds is 60. our setting is after 120/60 seconds) */
210 | 
211 |     console.log("CrawlDelay: ", crawlDelay, " for ", robotsTxtUrl);
212 |     console.log("last update at", lastUpdate, "update interval is ", updateInterval);
213 | 
214 |     if (!this.isTimeForCrawl(crawlDelay, lastUpdate, updateInterval)) {
215 |       console.log("does not match crawlDelay! STOP");
216 |       return ;
217 |     }
218 | 
219 |     console.log("does match crawlDelay! FETCH");
220 | 
221 |     let {access, url, reason} = await (new Promise((resolve, reject) => {
222 |       robotsUrlParser.canFetch(this.userAgent, options.path,  (access, url, reason) => {
223 |         resolve({
224 |           access, url, reason
225 |         });
226 |       });
227 |     }));
228 | 
229 |     if (!access) {
230 |       console.error("not allowed to fetch", user.url, "because of " + robotsTxtUrl + ":", reason.type, " statusCode:", reason.statusCode);
231 |       return ;
232 |     }
233 | 
234 |     let lastModifiedEntity = null;
235 | 
236 |     try {
237 |       const [lastModifiedEntity] = await this.datastore.get(this.datastore.key(['last-modified-since', user.url]));
238 |     } catch (error) {
239 |     }
240 | 
241 |     if (lastModifiedEntity) {
242 |       options.headers['If-Modified-Since'] = lastModifiedEntity['timestamp'];
243 |     }
244 | 
245 |     let {statusCode, body, lastModified} = await (new Promise((resolve, reject) => {
246 |       client.request(options, (res) => {
247 |         let body = [];
248 |         res.on('data', (chunk) => {
249 |           body.push(chunk);
250 |         }).on('end', () => {
251 |           resolve({
252 |             body: Buffer.concat(body).toString(),
253 |             statusCode: res.statusCode,
254 |             lastModified: res.headers['last-modified']
255 |           });
256 |         });
257 |       }).on('error', reject).end();
258 |     }));
259 | 
260 |     if (statusCode === 304) {
261 |       return ;
262 |     }
263 | 
264 |     var txt = new TwtxtTxt(user.url, user.nickname, body);
265 | 
266 |     await Promise.all(txt.getTweets().map(async (tweet) => {
267 |       if (tweet.body.length > 1500) {
268 |         console.log('Skip tweet - has more than 1500 body letters.');
269 |       } else {
270 |         await this.storeTweet(tweet);
271 |       }
272 |     }));
273 | 
274 |     if (lastModified) {
275 |       // FIXME: ttl 60*60*24
276 |       await this.datastore.save([{
277 |         'key': this.datastore.key(['last-modified-since', user.url]),
278 |         'data': {
279 |           'timestamp': lastModified
280 |         }
281 |       }]);
282 |     };
283 |   }));
284 | };
285 | 
286 | Storage.prototype.startUpdating = function(updateInterval) {
287 |   clearInterval(this.updatingInterval);
288 | 
289 |   let execute = () => {
290 |     this.executeUpdate(updateInterval);
291 |   };
292 | 
293 |   this.updatingInterval = setInterval(() => {
294 |     execute();
295 |   }, updateInterval * 1000);
296 | 
297 |   execute();
298 | };
299 | 
300 | 
301 | export default Storage;


--------------------------------------------------------------------------------