├── sample-data
    ├── graph.png
    ├── histo10.png
    └── histo10b.png
├── repextract.awk
├── CMakeLists.txt
├── LICENSE
├── ext
    └── sqlitewriter
    │   ├── minipsql.hh
    │   ├── psqlwriter.hh
    │   ├── sqlwriter.hh
    │   ├── minipsql.cc
    │   ├── psqlwriter.cc
    │   └── sqlwriter.cc
├── audience-minutes.js
├── audience.ipynb
├── access2sql.cc
└── README.md


/sample-data/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/graph.png


--------------------------------------------------------------------------------
/sample-data/histo10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/histo10.png


--------------------------------------------------------------------------------
/sample-data/histo10b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/histo10b.png


--------------------------------------------------------------------------------
/repextract.awk:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/awk -f
 2 | #  1      2 3   4                     5      6    7                                          8       9   10    11
 3 | # 0.0.0.0 - - [26/Jun/2021:00:01:08 +0200] "GET /articles/report.json?scrollPerc=4&count=5 HTTP/1.1" 200 0 "https://berthub.eu/articles/posts/reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/" 
 4 | 
 5 | BEGIN { printf("url,perc,count\n"); }
 6 | {
 7 |     split($11, parts, "[\"?]");
 8 |     url=parts[2];
 9 | 
10 |     #   0                      1       2 3     4
11 |     # /articles/report.json?scrollPerc=4&count=5
12 |     if($7 ~ /report.json/) {
13 |         split($7, parts, "[?=&]");
14 |     
15 |         printf("\"%s\",%s,%s\n", url, parts[3], parts[5]);
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.1)
 2 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -O3")
 3 | 
 4 | project(audience VERSION 1.0
 5 |                   DESCRIPTION "Hello deep learning"
 6 |                   LANGUAGES CXX)
 7 | 
 8 | 
 9 | #add_compile_options(-fsanitize=address)
10 | #add_link_options(-fsanitize=address)
11 |                 
12 | set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard to use")
13 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
14 | set(CMAKE_CXX_EXTENSIONS ON)
15 | 
16 | set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
17 | set(THREADS_PREFER_PTHREAD_FLAG TRUE)
18 | find_package(Threads REQUIRED)
19 | 
20 | add_executable(access2sql access2sql.cc ext/sqlitewriter/sqlwriter.cc)
21 | target_link_libraries(access2sql sqlite3 Threads::Threads)
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 bert hubert
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/minipsql.hh:
--------------------------------------------------------------------------------
 1 | #include <postgresql/libpq-fe.h>
 2 | #include <vector>
 3 | #include <string>
 4 | #include <unordered_map>
 5 | 
 6 | class MiniPSQL
 7 | {
 8 | public:
 9 |   MiniPSQL(std::string_view fname);
10 |   ~MiniPSQL();
11 |   std::vector<std::pair<std::string, std::string>> getSchema(const std::string& table);
12 |   void addColumn(const std::string& table, std::string_view name, std::string_view type);
13 |   
14 |   //!< execute a random query, for example a PRAGMA
15 |   std::vector<std::vector<std::string>> exec(const std::string& query);
16 |   //  std::vector<std::vector<std::string>> exec(const std::string& query, const std::vector<std::string>& params);
17 |   std::vector<std::vector<std::string>> exec(const std::string& query, std::vector<const char*> params);
18 | 
19 |   //!< set the prepared statement for a table, question marks as placeholder
20 |   void prepare(const std::string& table, std::string_view str, unsigned int paramsize);
21 |   // offset from 1!!
22 |   template<typename T>
23 |   void bindPrep(const std::string& table, int idx, const T& value)
24 |   {
25 |     bindPrep(table, idx, std::to_string(value));
26 |   }
27 | 
28 |   void bindPrep(const std::string& table, int idx, const std::string& value)
29 |   {
30 |     int pos = idx-1;
31 |     if(d_params[table].size() <= pos)
32 |       d_params[table].resize(pos+1);
33 |     d_params[table][pos]=value;
34 |   }
35 |   
36 |   //!< execute the prepared & bound statement
37 |   void execPrep(const std::string& table);
38 |   
39 |   void begin();
40 |   void commit();
41 |   void cycle();
42 |   
43 |   //!< do we have a prepared statement for this table
44 |   bool isPrepared(const std::string& table) const
45 |   {
46 |     return d_stmts.find(table) != d_stmts.end();
47 |   }
48 | 
49 | private:
50 |   PGconn* d_conn;
51 | 
52 |   std::unordered_map<std::string, std::string> d_stmts; // keyed on table name
53 |   std::unordered_map<std::string, std::vector<std::string>> d_params; // keyed on table name
54 | 
55 |   bool d_intransaction{false};
56 |   bool haveTable(const std::string& table);
57 | };
58 | 


--------------------------------------------------------------------------------
/audience-minutes.js:
--------------------------------------------------------------------------------
 1 | // this small script is intended to give you some insight in how people spend
 2 | // time on your site.
 3 | // It does not track users, there are no cookies or any stored data, and it only gives some probabilistic insights
 4 | 
 5 | 
 6 | // here you can pick the granularity in time
 7 | var intervalSeconds=60;
 8 | // and here the reporting probability. The busier your site is the lower you can set this &
 9 | // still get sufficient reports
10 | var reportingProbability=0.1;
11 | 
12 | var hadActivity=false;
13 | var scrollPerc=0;
14 | var activityCount=0;
15 | 
// Runs once per interval. If there was activity, the interval is counted
// and, with probability reportingProbability, reported through a GET request
// for report.json. The activity flag is cleared afterwards.
function reportOrNot()
{
    if(!hadActivity)
        return;

    activityCount += 1;

    const selected = Math.random() < reportingProbability;
    if(selected) {
        const target = '/articles/report.json?scrollPerc=' + Math.round(scrollPerc) + "&count=" + activityCount;
        const request = new XMLHttpRequest();
        request.open("GET", target);

        // ask for an uncached response...
        request.setRequestHeader("Cache-Control", "no-cache, no-store, max-age=0");
        // ...with fallbacks for IE and older browsers
        request.setRequestHeader("Expires", "Tue, 01 Jan 1980 1:00:00 GMT");
        request.setRequestHeader("Pragma", "no-cache");

        request.send();
    }

    hadActivity = false;
}
36 | 
// Once the DOM is ready: flag activity on scroll and mouse movement, and
// check every intervalSeconds whether the past interval should be reported.
document.addEventListener("DOMContentLoaded", function(event) {
    document.addEventListener('scroll', function(e) {
        hadActivity=true;
        // I found this on Stack Overflow somewhere: the document height is
        // taken as the largest of these six measures.
        let scrollHeight = Math.max(
            document.body.scrollHeight, document.documentElement.scrollHeight,
            document.body.offsetHeight, document.documentElement.offsetHeight,
            document.body.clientHeight, document.documentElement.clientHeight
        );
        // This is zero (or negative) when the whole document fits in the
        // window. Dividing by it would put NaN or Infinity in the report,
        // so the previous value is kept in that case.
        let scrollable = scrollHeight - window.innerHeight;
        if(scrollable > 0) {
            scrollPerc = 100.0*window.pageYOffset/scrollable;
        }
    });

    document.addEventListener('mousemove', function(e) {
        hadActivity=true;
    });

    setInterval(reportOrNot, intervalSeconds * 1000);
});
55 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/psqlwriter.hh:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | #include <variant>
 3 | #include <vector>
 4 | #include <thread>
 5 | #include <unordered_map>
 6 | #include <unistd.h>
 7 | 
 8 | /* ok for a remote database, two things are very different:
 9 |    1) We need to stream or batch our inserts, otherwise we get killed by latency
10 |    2) We need isolation from database server failures
11 | 
12 |    The way to do this is to send all the inserts to a worker thread, which takes
13 |    care of the batching, sending, reconnecting etc.
14 | 
15 |    There can be multiple tables, which means the batching needs to happen per
16 |    table. There could be multiple insert signatures, which makes batching harder.
 17 |    It may be possible to use NULLs to create unified signatures though.
18 | 
19 | */
20 | 
21 | 
//! Accepts rows and hands them to a worker thread over a pipe.
class PSQLWriter
{
public:
  //! The value types a field can hold
  using var_t = std::variant<double, int32_t, uint32_t, int64_t, std::string>;

  //! Creates the pipe and starts commitThread() on the worker thread.
  //! NOTE(review): fname is not used in this constructor and the result of
  //! pipe2() is not checked - confirm whether that is intended.
  explicit PSQLWriter(std::string_view fname)
  {
    pipe2(d_pipe, 0); //O_NONBLOCK);
    d_thread = std::thread(&PSQLWriter::commitThread, this);
  }

  //! Closes the write end of the pipe and waits for the worker thread.
  ~PSQLWriter()
  {
    //    std::cerr<<"Destructor called"<<std::endl;
    close(d_pipe[1]); // this is the pleasequit signal
    d_thread.join();
  }

  //! Queue one row of {name, value} pairs for a table
  void addValue(const std::initializer_list<std::pair<const char*, var_t>>& values, const std::string& table="data")
  {
    addValueGeneric(table, values);
  }

  //! Same, for values collected in a vector
  void addValue(const std::vector<std::pair<const char*, var_t>>& values, const std::string& table="data")
  {
    addValueGeneric(table, values);
  }

  //! Shared implementation of both addValue() overloads
  template<typename T>
  void addValueGeneric(const std::string& table, const T& values);

private:
  void commitThread();
  bool d_pleasequit{false};
  std::thread d_thread;

  int d_pipe[2]; // [0] = read, [1] = write

  std::unordered_map<std::string, std::vector<std::pair<std::string, std::string>>> d_columns;
  std::unordered_map<std::string, std::vector<std::string>> d_lastsig;
  bool haveColumn(const std::string& table, std::string_view name);

  //! What travels to the worker thread: the target table and the row's values
  struct Message
  {
    std::string table;
    std::unordered_map<std::string, var_t> values;
  };
};
68 | 
69 | 
70 | template<typename T>
71 | void PSQLWriter::addValueGeneric(const std::string& table, const T& values)
72 | {
73 |   auto msg = new Message({table});
74 |   for(const auto& v : values) {
75 |     msg->values[v.first] = v.second;
76 |   }
77 |   write(d_pipe[1], &msg, sizeof(msg));
78 | }
79 | 


--------------------------------------------------------------------------------
/audience.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%matplotlib notebook\n",
 10 |     "%precision 2\n",
 11 |     "import matplotlib\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "plt.rcParams['figure.figsize'] = [9.5, 6]\n",
 14 |     "\n",
 15 |     "import pandas"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "percs=pandas.read_csv(\"percs.csv\")\n",
 25 |     "print(percs.head())\n",
 26 |     "percs[\"perc\"]=pandas.to_numeric(percs[\"perc\"], errors='coerce') # NaN strings\n",
 27 |     "percs=percs[(~percs.perc.isna()) & (percs.perc <=100)]          # remove NaN\n"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "busy=percs.groupby([\"url\"]).count().sort_values([\"count\"], ascending=False).head(15)\n",
 37 |     "busy"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# given 10% sampling and once a minute measurements, one sample is 10 minutes\n",
 47 |     "# so this gives a table with reading hours\n",
 48 |     "busy.perc*10/60"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "metadata": {
 55 |     "scrolled": false
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# you can raise or lower the number of bins depending on how much data you have\n",
 60 |     "for url in busy.index:\n",
 61 |     "    plt.figure()\n",
 62 |     "    plt.hist(sep.perc, bins=10, density=True)\n",
 63 |     "    plt.grid()\n",
 64 |     "    plt.ylabel(\"Density\")\n",
 65 |     "    plt.xlabel(\"Location as percentage of page length\")\n",
 66 |     "    plt.title(\"Sampled density of readership for\\n\"+url)\n",
 67 |     "    print(url)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": []
 76 |   }
 77 |  ],
 78 |  "metadata": {
 79 |   "kernelspec": {
 80 |    "display_name": "Python 3 (ipykernel)",
 81 |    "language": "python",
 82 |    "name": "python3"
 83 |   },
 84 |   "language_info": {
 85 |    "codemirror_mode": {
 86 |     "name": "ipython",
 87 |     "version": 3
 88 |    },
 89 |    "file_extension": ".py",
 90 |    "mimetype": "text/x-python",
 91 |    "name": "python",
 92 |    "nbconvert_exporter": "python",
 93 |    "pygments_lexer": "ipython3",
 94 |    "version": "3.9.7"
 95 |   }
 96 |  },
 97 |  "nbformat": 4,
 98 |  "nbformat_minor": 4
 99 | }
100 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/sqlwriter.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
#include <atomic>
#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <mutex>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>
 9 | 
struct sqlite3;
struct sqlite3_stmt;

//! Small wrapper around a sqlite3 database handle, with one prepared
//! statement per table name.
class MiniSQLite
{
public:
  MiniSQLite(std::string_view fname);
  ~MiniSQLite();

  // The object holds raw sqlite3 handles and has a destructor. Presumably the
  // destructor releases them (its definition is not in this header), so a
  // copy would share handles that get released twice. Copying is disabled.
  MiniSQLite(const MiniSQLite&) = delete;
  MiniSQLite& operator=(const MiniSQLite&) = delete;

  //! Describes the columns of a table as pairs of strings
  std::vector<std::pair<std::string, std::string>> getSchema(const std::string& table);
  void addColumn(const std::string& table, std::string_view name, std::string_view type);
  //! Runs a query and returns its rows as strings
  std::vector<std::vector<std::string>> exec(std::string_view query);
  //! Sets the prepared statement for a table
  void prepare(const std::string& table, std::string_view str);

  //! Bind a value to parameter idx of the table's prepared statement
  void bindPrep(const std::string& table, int idx, bool value);
  void bindPrep(const std::string& table, int idx, int value);
  void bindPrep(const std::string& table, int idx, uint32_t value);
  void bindPrep(const std::string& table, int idx, long value);
  void bindPrep(const std::string& table, int idx, unsigned long value);
  void bindPrep(const std::string& table, int idx, long long value);
  void bindPrep(const std::string& table, int idx, unsigned long long value);
  void bindPrep(const std::string& table, int idx, double value);
  void bindPrep(const std::string& table, int idx, const std::string& value);

  //! Executes the table's prepared statement
  void execPrep(const std::string& table);
  void begin();
  void commit();
  void cycle();

  //! True if the table has an entry in d_stmts that is not null
  bool isPrepared(const std::string& table) const
  {
    const auto iter = d_stmts.find(table);
    return iter != d_stmts.end() && iter->second != nullptr;
  }

private:
  sqlite3* d_sqlite;
  std::unordered_map<std::string, sqlite3_stmt*> d_stmts;
  std::vector<std::vector<std::string>> d_rows; // for exec()
  static int helperFunc(void* ptr, int cols, char** colvals, char** colnames);
  bool d_intransaction{false};
  bool haveTable(const std::string& table);
};
51 | 
52 | class SQLiteWriter
53 | {
54 | 
55 | public:
56 |   explicit SQLiteWriter(std::string_view fname) : d_db(fname)
57 |   {
58 |     //    for(const auto& c : d_columns)
59 |     //      cout <<c.first<<"\t"<<c.second<<endl;
60 | 
61 |     d_db.exec("PRAGMA journal_mode='wal'");
62 |     d_db.begin(); // open the transaction
63 |     d_thread = std::thread(&SQLiteWriter::commitThread, this);
64 |   }
65 |   typedef std::variant<double, int32_t, uint32_t, int64_t, std::string> var_t;
66 |   void addValue(const std::initializer_list<std::pair<const char*, var_t>>& values, const std::string& table="data");
67 |   void addValue(const std::vector<std::pair<const char*, var_t>>& values, const std::string& table="data");
68 |   
69 |   template<typename T>
70 |   void addValueGeneric(const std::string& table, const T& values);
71 |   ~SQLiteWriter()
72 |   {
73 |     //    std::cerr<<"Destructor called"<<std::endl;
74 |     d_pleasequit=true;
75 |     d_thread.join();
76 |   }
77 | 
78 | private:
79 |   void commitThread();
80 |   bool d_pleasequit{false};
81 |   std::thread d_thread;
82 |   std::mutex d_mutex;  
83 |   MiniSQLite d_db;
84 |   std::unordered_map<std::string, std::vector<std::pair<std::string, std::string>>> d_columns;
85 |   std::unordered_map<std::string, std::vector<std::string>> d_lastsig;
86 |   bool haveColumn(const std::string& table, std::string_view name);
87 | 
88 | };
89 | 


--------------------------------------------------------------------------------
/access2sql.cc:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <string>
  3 | #include <stdio.h>
  4 | #include <vector>
  5 | #include <sstream>
  6 | #include <time.h>
  7 | #include "ext/sqlitewriter/sqlwriter.hh"
  8 | 
  9 | /* You can pipe a typical access.log into this, and it will populate an sqlite3 database ('access.sqlite3') for you, in streaming fashion.
 10 |    You can safely access that sqlite database while the program runs, see https://berthub.eu/articles/posts/big-data-storage/
 11 | 
 12 |    A fun view to create:
 13 |  create view botfree as select * from data where agent like 'Mozilla/5.0 %' and agent not like '%bot%' and agent not like '%miniflux%' 
 14 | 
 15 |    This filters out the bulk of bots right now. 
 16 | */
 17 | 
 18 | using namespace std;
 19 | 
 20 | struct Parser {
 21 |   explicit Parser(FILE* fp) : d_fp(fp)
 22 |   {}
 23 |   
 24 |   FILE* d_fp;
 25 | 
 26 |   struct EofException{};
 27 |   
 28 |   void skipSpaces()
 29 |   {
 30 |     int c;
 31 |     for(;;) {
 32 |       c = getc(d_fp);
 33 |       if(c==EOF)
 34 |         throw EofException();
 35 |       if(c!=' ') {
 36 |         ungetc(c, d_fp);
 37 |         break;
 38 |       }
 39 |     }
 40 |   }
 41 |   void skipToEol()
 42 |   {
 43 |     int c;
 44 |     for(;;) {
 45 |       c = getc(d_fp);
 46 |       if(c==EOF)
 47 |         throw EofException();
 48 |       if(c=='\n') {
 49 |         break;
 50 |       }
 51 |     }
 52 |   }
 53 | 
 54 |   string getWord()
 55 |   {
 56 |     skipSpaces();
 57 |     string ret;
 58 |     int c;
 59 |     for(;;) {
 60 |       c = getc(d_fp);
 61 |       if(c==EOF)
 62 |         throw EofException();
 63 |       if(c==' ')
 64 |         break;
 65 |       ret.append(1, (char)c);
 66 |     }
 67 |     return ret;
 68 |   }
 69 | 
 70 |   string getDelim(char start, char stop)
 71 |   {
 72 |     skipSpaces();
 73 |     string ret;
 74 |     int c;
 75 |     c = getc(d_fp);
 76 |     if(c==EOF)
 77 |       throw EofException();
 78 | 
 79 |     if(c!=start)
 80 |       throw runtime_error("Wrong delimiter, skipping");
 81 |     for(;;) {
 82 |       c = getc(d_fp);
 83 |       if(c==EOF)
 84 |         throw EofException();
 85 | 
 86 |       if(c==EOF)
 87 |         throw EofException();
 88 |       if(c==stop)
 89 |         break;
 90 |       if(c=='\n')
 91 |         throw runtime_error("Delimiter not found on line, skipping");
 92 |       ret.append(1, (char)c);
 93 |     }
 94 |     return ret;
 95 |   }
 96 | 
 97 |   int64_t getNumber()
 98 |   {
 99 |     string word = getWord();
100 |     return stol(word);
101 |   }
102 |   string getQuotedWord();
103 | };
104 | 
// 19/Mar/2023:00:00:10 +0100
//! Convert an access-log timestamp to seconds since the Unix epoch.
//! Throws std::runtime_error if the text does not match the format.
time_t getTime(const std::string& in)
{
  struct tm tm{};
  if(!strptime(in.c_str(), "%d/%b/%Y:%H:%M:%S %z", &tm))
    throw std::runtime_error("Unable to parse timestamp '" + in + "'");

  // mktime() would read the fields as local time and ignore the offset from
  // the log line. Instead, treat the fields as UTC and subtract the offset
  // that belongs to them.
  // NOTE(review): relies on strptime() storing the %z offset in tm_gmtoff,
  // which is an extension - confirm on non-glibc platforms.
  const long offset = tm.tm_gmtoff; // saved first, timegm() may rewrite tm
  return timegm(&tm) - offset;
}
114 | 
//! True if str begins with prefix (an empty prefix always matches)
bool starts_with(const std::string& str, const std::string& prefix)
{
    // searching backwards from position 0 can only match at position 0
    const auto where = str.rfind(prefix, 0);
    return where == 0;
}
119 | 
// written by ChatGPT!
//! Split input into its whitespace-separated words
std::vector<std::string> split_string(const std::string& input)
{
  std::vector<std::string> words;
  std::istringstream stream(input);
  for(std::string word; stream >> word; )
    words.push_back(word);
  return words;
}
130 | 
131 | int main(int argc, char** argv)
132 | try
133 | {
134 |   Parser p(stdin);
135 |   SQLiteWriter sqw(argc > 1 ? argv[1] : "access.sqlite3");
136 |   
137 |   // ::ffff:194.117.254.60 - - [19/Mar/2023:00:00:10 +0100] "GET /articles/posts/nerdfluisteraar/ HTTP/1.1" 200 16627 "-" "Friendica 'Giant Rhubarb' 2023.01-1502; https://friendica.se1.eu"
138 | 
139 |   for(;;) {
140 |     try {
141 |       string ip = p.getWord();
142 |       string ign1 = p.getWord();
143 |       string ign2 = p.getWord();
144 |       if(starts_with(ip, "::ffff:"))
145 |         ip = ip.substr(7);
146 |       string t = p.getDelim('[', ']');
147 |       time_t tim = getTime(t);
148 |       string req = p.getDelim('"', '"');
149 |       int64_t stat = p.getNumber();
150 |       int64_t size = p.getNumber();
151 |       string ref = p.getDelim('"', '"');
152 |       string agent = p.getDelim('"', '"');
153 |       auto parts = split_string(req);
154 |       string url;
155 |       string params;
156 |       if(parts.size() >= 2) {
157 |         url = parts[1];
158 |         
159 |         if(auto pos = url.find('?'); pos != string::npos) {
160 |           params = url.substr(pos+1);
161 |           url.resize(pos);
162 |         }
163 |       }
164 |       sqw.addValue({{"timestamp", tim}, {"ip", ip}, {"url", url},
165 |                     {"params", params}, {"agent", agent}, {"ref", ref},
166 |                     {"stat", stat}, {"siz", size}});
167 |     }
168 |     catch(std::exception& i) {
169 |       cerr<<i.what()<<endl;
170 |     }
171 |     p.skipToEol();
172 |   }
173 | }
174 | catch(Parser::EofException& )
175 | {
176 |   return 0;
177 | }
178 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/minipsql.cc:
--------------------------------------------------------------------------------
  1 | #include "minipsql.hh"
  2 | #include "sqlwriter.hh"
  3 | #include <algorithm>
  4 | using namespace std;
  5 | 
  6 | 
  7 | MiniPSQL::MiniPSQL(std::string_view fname)
  8 | {
  9 |   d_conn = PQconnectdb(&fname[0]);
 10 |   if (PQstatus(d_conn) != CONNECTION_OK) {
 11 |     throw std::runtime_error("Error connecting to postgresql: "+ string(PQerrorMessage(d_conn)));
 12 |   }  
 13 | }
 14 | 
 15 | MiniPSQL::~MiniPSQL()
 16 | {
 17 |   if(d_intransaction)
 18 |     commit();
 19 | 
 20 |   PQfinish(d_conn);
 21 | }
 22 | 
 23 | 
 24 | struct QueryResult
 25 | {
 26 |   explicit QueryResult(PGconn* conn, const std::string& query)
 27 |   {
 28 |     d_res = PQexec(conn, query.c_str());
 29 |     if(PQresultStatus(d_res) == PGRES_COMMAND_OK) {
 30 |       d_ntuples = d_nfields = 0;
 31 |     }
 32 |     else if (PQresultStatus(d_res) != PGRES_TUPLES_OK) {
 33 |       PQclear(d_res);
 34 |       throw std::runtime_error(string("query error: ") + PQerrorMessage(conn));
 35 |     }
 36 |     d_ntuples = PQntuples(d_res);
 37 |     d_nfields = PQnfields(d_res);
 38 |   }
 39 | 
 40 |   explicit QueryResult(PGconn* conn, const std::string& table, const std::string& query, int paramsize)
 41 |   {
 42 |     d_res = PQprepare(conn, ("procedure_"+table).c_str(), query.c_str(), paramsize, NULL);
 43 | 
 44 |     if(PQresultStatus(d_res) != PGRES_COMMAND_OK) {
 45 |       PQclear(d_res);
 46 |       throw std::runtime_error(string("prepare error: ") + PQerrorMessage(conn));
 47 |     }
 48 |     PQclear(d_res);
 49 |     d_res=0;
 50 |   }
 51 | 
 52 |   explicit QueryResult(PGconn* conn, const std::string& query, const std::vector<const char*>& params)
 53 |   {
 54 |     d_res = PQexecParams(conn, query.c_str(), params.size(), NULL, &params[0], NULL, NULL, 0);
 55 |     
 56 |     if (PQresultStatus(d_res) == PGRES_COMMAND_OK) {
 57 |       d_ntuples = d_nfields = 0;
 58 |     }
 59 |     else if (PQresultStatus(d_res) != PGRES_TUPLES_OK) {
 60 |       PQclear(d_res);
 61 |       throw std::runtime_error(string("parameter query error: ") + PQerrorMessage(conn));
 62 |     }
 63 |     d_ntuples = PQntuples(d_res);
 64 |     d_nfields = PQnfields(d_res);
 65 |   }
 66 |   
 67 |   explicit QueryResult(PGconn* conn, const std::string& table, const std::vector<string>& params)
 68 |   {
 69 |     vector<const char*> pms;
 70 |     for(const auto& p : params) {
 71 |       //      cout<<"Adding param: '"<<p<<"'\n";
 72 |       pms.push_back(p.c_str());
 73 |     }
 74 |     
 75 |     d_res = PQexecPrepared(conn, ("procedure_"+table).c_str(), params.size(), &pms[0], NULL, NULL, 0);
 76 |     
 77 |     if (PQresultStatus(d_res) == PGRES_COMMAND_OK) {
 78 |       d_ntuples = d_nfields = 0;
 79 |     }
 80 |     else if (PQresultStatus(d_res) != PGRES_TUPLES_OK) {
 81 |       PQclear(d_res);
 82 |       throw std::runtime_error(string("prepared query error: ") + PQerrorMessage(conn));
 83 |     }
 84 |     d_ntuples = PQntuples(d_res);
 85 |     d_nfields = PQnfields(d_res);
 86 |   }
 87 | 
 88 |   
 89 |   vector<string> getRow()
 90 |   {
 91 |     vector<string> ret;
 92 | 
 93 |     if(d_row  < d_ntuples) {
 94 |       for (unsigned int j = 0; j < d_nfields; j++)
 95 |         ret.push_back(PQgetvalue(d_res, d_row, j));
 96 |     }
 97 |     d_row++;
 98 |     return ret;
 99 |   }
100 |   
101 |   ~QueryResult()
102 |   {
103 |     if(d_res)
104 |       PQclear(d_res);
105 |   }
106 |   PGresult* d_res=0;
107 |   unsigned int d_row=0;
108 |   unsigned int d_ntuples;
109 |   unsigned int d_nfields;
110 | };
111 | 
112 | std::vector<std::vector<std::string>> MiniPSQL::exec(const std::string& query)
113 | {
114 |   std::vector<std::vector<std::string>> ret;
115 | 
116 |   QueryResult qr(d_conn, query);
117 |   for(;;) {
118 |     auto row = qr.getRow();
119 |     if(row.empty())
120 |       break;;
121 |     ret.push_back(row);
122 |   }
123 | 
124 |   return ret;
125 | }
126 | 
127 | std::vector<std::vector<std::string>> MiniPSQL::exec(const std::string& query, vector<const char*> params)
128 | {
129 |   std::vector<std::vector<std::string>> ret;
130 | 
131 |   QueryResult qr(d_conn, query, params);
132 |   for(;;) {
133 |     auto row = qr.getRow();
134 |     if(row.empty())
135 |       break;;
136 |     ret.push_back(row);
137 |   }
138 | 
139 |   return ret;
140 | }
141 | 
142 | 
143 | void MiniPSQL::execPrep(const std::string& table)
144 | {
145 |   QueryResult qr(d_conn, table, d_params[table]);
146 |   d_params[table].clear();
147 | }
148 | 
149 | void MiniPSQL::addColumn(const std::string& table, std::string_view name, std::string_view type)
150 | {
151 |   // SECURITY PROBLEM - somehow we can't do prepared statements here
152 |   
153 |   if(!haveTable(table)) {
154 |     exec("create table if not exists "+table+" ( "+(string)name+" "+(string)type+" )");
155 |   } else {
156 |     //    cout<<"Adding column "<<name<<" to table "<<table<<endl;
157 |     exec("ALTER table \""+table+"\" add column \""+string(name)+ "\" "+string(type));
158 |   }
159 | 
160 | }
161 | 
162 | //! Get field names and types from a table
163 | vector<pair<string,string> > MiniPSQL::getSchema(const std::string& table)
164 | {
165 |   vector<pair<string,string>> ret;
166 |   
167 |   auto rows = exec("SELECT column_name, udt_name FROM information_schema.columns where table_name='"+table+"'");
168 | 
169 |   for(const auto& r : rows) {
170 |     ret.push_back({r[0], r[1]});
171 |   }
172 |   sort(ret.begin(), ret.end(), [](const auto& a, const auto& b) {
173 |     return a.first < b.first;
174 |   });
175 | 
176 |   //  cout<<"returning "<<ret.size()<<" rows for table "<<table<<"\n";
177 |   return ret;
178 | }
179 | 
180 | void MiniPSQL::prepare(const std::string& table, std::string_view str, unsigned int paramsize)
181 | {
182 |   cout<<"prep"<<endl;
183 |   if(!d_stmts[table].empty()) {
184 |     cout<<"dealloc!"<<endl;
185 |     exec("deallocate procedure_"+table);
186 |   }
187 | 
188 |   d_stmts[table]=str;
189 |   d_params[table].clear();
190 |   QueryResult qr(d_conn, (string)table, (string)str, paramsize);
191 | }
192 | 
193 | void MiniPSQL::begin()
194 | {
195 |   d_intransaction=true;
196 |   exec("begin");
197 | }
198 | void MiniPSQL::commit()
199 | {
200 |   d_intransaction=false;
201 |   exec("commit");
202 | }
203 | 
204 | void MiniPSQL::cycle()
205 | {
206 |   exec("commit;begin");
207 | }
208 | 
209 | bool MiniPSQL::haveTable(const string& table)
210 | {
211 |   return !getSchema(table).empty();
212 | }
213 | 
214 | 
215 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # audience-minutes
  2 | generate statistics on the number of audience minutes your site is
  3 | receiving, and if readers make it to the end of your screeds.
  4 | 
  5 | > “If we have data, let’s look at data. If all we have are opinions, let’s go with mine.” - Jim Barksdale.
  6 | 
  7 | ![There is some sample data linked from the end of this file](/sample-data/graph.png)
  8 | 
  9 | What is this?
 10 | -------------
 11 | If you publish articles, you probably want to know if people are reading
 12 | them. You can count raw hits but especially if you aren't drawing a lot of
 13 | traffic, many of these hits are actually bots and crawlers. 
 14 | 
 15 | In addition, the fact that a page got loaded doesn't tell you whether the
 16 | visitor actually read your work.
 17 | 
 18 | Some time ago I was involved with a Dutch newspaper article, and they could
 19 | tell me how many minutes of reading time it had generated. And this made me
 20 | somewhat jealous.
 21 | 
 22 | With these three scripts, you can instrument your pages with a tiny bit of
 23 | javascript that probabilistically samples if a reader was active over the
 24 | past minute (mouse/touch movement or scrolling). In the default settings,
 25 | each active minute has a 10% chance of being reported.  
 26 | 
 27 | Or in other words, if you get 10 reports, something like 100 minutes was
 28 | spent reading your site. Sorta. 
 29 | 
 30 | In addition, these reports measure at what percentage of the content your
 31 | reader is positioned. This helps you determine if people are making it to
 32 | the end of your page or not.
 33 | 
 34 | At the end of this page I've included some discussion how to interpret the
 35 | data.
 36 | 
 37 | Privacy
 38 | -------
 39 | These scripts use no cookies and no local storage. There are no identifiers.
 40 | You can run this without having to add a cookie or GDPR banner etc. However,
 41 | I personally prefer to sample as little as possible. Many sites will track
 42 | your every click, and even note if you switch to another tab. I find that
 43 | somewhat upsetting.
 44 | 
 45 | You can tailor your level of intrusiveness with the `reportingProbability`
 46 | setting. The busier your site is the lower you can set this at and still
 47 | have decent statistics.
 48 | 
 49 | JavaScript
 50 | ----------
 51 | Insert or link [audience-minutes.js](audience-minutes.js) from all pages you
 52 | want to measure on. There are some settings at the beginning of the file
 53 | where you can tweak how intrusive you want the measurements to be. In other
 54 | words, do you want to sample 10% of every viewing minute? Or 1%? The busier
 55 | your site is the lower you can set this.
 56 | 
 57 | In the same place where you put the javascript file, also put an empty file
 58 | called `report.json`. This will receive reports of active minutes. If you
 59 | don't generate this file, the browser console will show 404s which is ugly.
 60 | 
 61 | The easiest way to parse the results is to grab them from an access.log
 62 | file.
 63 | 
 64 | AWK
 65 | ---
 66 | Yes, AWK! With [this little script](repextract.awk) you can trawl your access.log files and
 67 | generate a CSV file that only has URLs, scroll percentages and minute
 68 | counts. 
 69 | 
 70 | This CSV file has no privacy considerations, there are no IP addresses in
 71 | there. Unlike your original access.log. 
 72 | 
 73 | Jupyter notebook
 74 | ----------------
 75 | To turn the CSV file into graphs, use [this Jupyter
 76 | script](audience.ipynb), from which you can also extract Python 3 if you
 77 | don't want to run Jupyter. It is based on Pandas and Matplotlib.
 78 | 
 79 | A few words on 'density'
 80 | ------------------------
 81 | The "density" is which proportion of the samples were from that part of the
 82 | page.  So if everyone reads the page straight to the end, at a constant
 83 | speed, the density is flat for the whole page.  If it drops off, it means
 84 | some people abandoned the page.  If there is a particularly difficult part
 85 | that takes people a long time to read, this would show up as a positive bump
 86 | in the density.  Conversely, if there is a list of bulletpoints that people
 87 | can go through quickly, that would show up as a negative bump in the
 88 | density.
 89 | 
 90 | 
 91 | How to interpret the results
 92 | ----------------------------
 93 | For starters, you'll notice that even quite high visitor numbers translate
 94 | into not that many "audience minutes". This is not due to this script, it is
 95 | a common disappointment. So if your calculations show that an article had
 96 | 100 reading hours, this actually is quite a lot. 
 97 | 
 98 | For obvious reasons, professional media operations are not very forthcoming
 99 | with these statistics on their readership. One big newspaper article I
100 | worked on garnered 75 confirmed reading hours, for example.
101 | 
102 | So why are these numbers so low? For starters, you may not be seeing all
103 | readers. Perhaps the script doesn't fire on all devices. I've done some
104 | research, I don't think this is a major factor. But it could be.
105 | 
106 | On the other hand, there is a ton of automated traffic coming to sites these
107 | days, lots of crawlers, bots, strange scanners etc. Mind you, some of these
108 | will even execute JavaScript! But most won't. This non-human traffic may
109 | have been inflating your numbers previously.
110 | 
111 | In terms of the graph, if you have a ton of data, interpretation is easy. I
112 | find that you can even see where you put big photos in an article - these
113 | sections do not get a lot of reading minutes.
114 | 
115 | If you have less data, you need to reduce the number of 'bins' in the
116 | histogram. I find that 10 bins work pretty well for general conclusions, and
117 | that might get you this:
118 | 
119 | ![](/sample-data/histo10.png)
120 | 
121 | If you get this, you can conclude that 1) most people that visited the URL
122 | actually wanted to read this kind of content and 2) most readers made it to
123 | the end of your article. The profile is mostly flat, with only some drop-off
124 | near the end, and no suspicious peak at the beginning.
125 | 
126 | Contrast this with:
127 | 
128 | ![](/sample-data/histo10b.png)
129 | 
130 | This was an article that was extremely popular on HackerNews and a few other
131 | places. But we can see that readership clearly peaked in the first 10%. Lots
132 | of people decided that they had read enough at that point. This is
133 | caused by how the article got promoted, and it is not necessarily the
134 | "fault" of the writer.
135 | 
136 | If we ignore the "mistargeting", from that point on, almost everyone makes
137 | it to the end of the article. 
138 | 
139 | 
140 | Sample data
141 | -----------
142 | If you want to help improve these small tools, but you don't yet have a lot of
143 | data to work with, please find attached two weeks of data collected by 
144 | [berthub.eu/articles](https://berthub.eu/articles) in the
145 | [sample-data](sample-data/) directory.
146 | 
147 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/psqlwriter.cc:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdexcept>
  3 | #include <poll.h>
  4 | #include "minipsql.hh"
  5 | #include <string.h>
  6 | #include <unistd.h>
  7 | #include <variant>
  8 | #include <thread>
  9 | #include <mutex>
 10 | #include <fcntl.h>
 11 | #include <unordered_set>
 12 | #include <unordered_map>
 13 | #include <optional>
 14 | #include <chrono>
 15 | #include <algorithm>
 16 | #include "psqlwriter.hh"
 17 | #include <map>
 18 | using namespace std;
 19 | 
 20 | struct DTime
 21 | {
 22 |   void start()
 23 |   {
 24 |     d_start =   std::chrono::steady_clock::now();
 25 |   }
 26 |   uint32_t lapUsec()
 27 |   {
 28 |     auto usec = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now()- d_start).count();
 29 |     start();
 30 |     return usec;
 31 |   }
 32 | 
 33 |   std::chrono::time_point<std::chrono::steady_clock> d_start;
 34 | };
 35 | 
 36 | // seconds
 37 | static int waitForRWData(int fd, bool waitForRead, double* timeout, bool* error=0, bool* disconnected=0)
 38 | {
 39 |   int ret;
 40 | 
 41 |   struct pollfd pfd;
 42 |   memset(&pfd, 0, sizeof(pfd));
 43 |   pfd.fd = fd;
 44 | 
 45 |   if(waitForRead)
 46 |     pfd.events=POLLIN;
 47 |   else
 48 |     pfd.events=POLLOUT;
 49 | 
 50 |   ret = poll(&pfd, 1, timeout ? (*timeout * 1000) : -1);
 51 |   if ( ret == -1 ) {
 52 |     throw std::runtime_error("Waiting for data: "+std::string(strerror(errno)));
 53 |   }
 54 |   if(ret > 0) {
 55 |     if (error && (pfd.revents & POLLERR)) {
 56 |       *error = true;
 57 |     }
 58 |     if (disconnected && (pfd.revents & POLLHUP)) {
 59 |       *disconnected = true;
 60 |     }
 61 | 
 62 |   }
 63 |   return ret;
 64 | }
 65 | 
 66 | int waitForData(int fd, double timeout)
 67 | {
 68 |   return waitForRWData(fd, true, &timeout);
 69 | }
 70 | 
 71 | void SetNonBlocking(int sock, bool to)
 72 | {
 73 |   int flags=fcntl(sock,F_GETFL,0);
 74 |   if(flags<0)
 75 |     std::runtime_error(string("Retrieving socket flags: ")+ strerror(errno));
 76 | 
 77 |   // so we could optimize to not do it if nonblocking already set, but that would be.. semantics
 78 |   if(to) {
 79 |     flags |= O_NONBLOCK;
 80 |   }
 81 |   else 
 82 |     flags &= (~O_NONBLOCK);
 83 |       
 84 |   if(fcntl(sock, F_SETFL, flags) < 0)
 85 |     std::runtime_error(string("Setting socket flags: ")+ strerror(errno));
 86 | }
 87 | 
 88 | 
 89 | void PSQLWriter::commitThread()
 90 | {
 91 |   MiniPSQL mp("");
 92 |   mp.exec("begin");
 93 | 
 94 |   map<string,vector<pair<string,string>>> schemas;
 95 | 
 96 |   // so how does this work
 97 |   // we get a stream of messages, each aimed at a single table
 98 |   // we group the messages by table, and check if the table exists, and if all fields exist
 99 |   // if not, we create tables and fields to match
100 |   SetNonBlocking(d_pipe[0], true);
101 |   bool needWait = false;
102 |   time_t prevcommit=time(0);
103 |   for(;;) {
104 |     map<string, vector<Message*>> tabwork; // group by table
105 |     DTime dt;
106 |     dt.start();
107 |     int lim=0;
108 |     int sumparms = 0;
109 |     for(; lim < 10000 && sumparms < 60000; ++lim) {
110 |       Message* msg;
111 |       if(needWait) {
112 |         cout<<"Waiting for data.."<<endl;
113 |         waitForData(d_pipe[0], 1);
114 |       }
115 |       int rc = read(d_pipe[0], &msg, sizeof(msg));
116 | 
117 |       if(rc == 0 && !tabwork.empty())
118 |         break;
119 |       if(rc == 0 || (rc < 0 && errno != EAGAIN)) {
120 |         DTime dt2;
121 |         dt2.start();
122 |         mp.exec("commit");
123 | //        cout<<"Commit took "<<dt2.lapUsec()/1000.0<<" msec"<<endl;
124 |         return;
125 |       }
126 |       if(rc < 0 && errno==EAGAIN) {
127 |         needWait = true;
128 |         break;
129 |       }
130 |       sumparms += msg->values.size();
131 |       tabwork[msg->table].push_back(msg);
132 |       needWait=false;
133 |     }
134 |     cout<<"Received "<<lim<<" messages "<<"from "<<tabwork.size()<<" tables with work, took "<<dt.lapUsec()/1000.0<<"msec, fields:";
135 |     dt.start();
136 | 
137 |     for(auto& [table, work] : tabwork) {
138 |       if(!schemas.count(table)) // new table
139 |         schemas[table] = mp.getSchema(table);
140 | 
141 |       unordered_set<string> fields;
142 |       
143 |       for(const auto& m : work) {
144 |         for(const auto& f : m->values) {
145 |           if(auto iter = fields.find(f.first); iter == fields.end()) {
146 |             fields.insert(f.first);
147 |             pair<string, string> cmp{f.first, std::string()};
148 |             if(!binary_search(schemas[table].begin(), schemas[table].end(), cmp,
149 |                        [](const auto& a, const auto& b)
150 |                        {
151 |                          return a.first < b.first;
152 |                        })) {
153 |               cout<<"shit, we miss "<<f.first<<endl;
154 |               if(std::get_if<double>(&f.second)) {
155 |                 mp.addColumn(table, f.first, "REAL");
156 |                 schemas[table].push_back({f.first, "REAL"});
157 |               }
158 |               else if(std::get_if<string>(&f.second)) {
159 |                 mp.addColumn(table, f.first, "TEXT");
160 |                 schemas[table].push_back({f.first, "TEXT"});
161 |               } else  {
162 |                 mp.addColumn(table, f.first, "BIGINT");
163 |                 schemas[table].push_back({f.first, "BIGINT"});
164 |               }
165 |               
166 |               sort(schemas[table].begin(), schemas[table].end());
167 | 
168 |             }
169 |           }
170 |         }
171 |       }
172 |       
173 |       string query="insert into "+table+" (";
174 |     
175 |       bool first=true;
176 |       for(const auto& f : fields) {
177 |         if(!first)
178 |           query+=',';
179 |         first=false;
180 |         query+=f;
181 |         cout<<" "<<f;
182 |       }
183 |       query += ") values ";
184 |       int ctr=1;
185 |       
186 |       for(unsigned int n=0; n < work.size(); ++n) {
187 |         if(n)
188 |           query +=',';
189 |         query += '(';
190 |         first=true;              
191 |         for(const auto& f: fields ) {
192 |           if(!first)
193 |             query +=',';
194 |           first=false;
195 |           query += '$';
196 |           query += to_string(ctr);
197 |           ctr++;
198 |         }
199 |         
200 |         query += ')';
201 |       }
202 |       cout<<"building query: "<<dt.lapUsec()/1000.0<<"ms\n";
203 |       
204 |     //    cout<<query<<endl;
205 |       
206 |       vector<std::optional<string>> allstrings;
207 |       allstrings.reserve(work.size()*fields.size());
208 |       
209 |       for(const auto& m : work) {
210 |         for(const auto& f: fields) {
211 |           if(auto iter = m->values.find(f); iter!= m->values.end()) {
212 |             std::visit([&allstrings](auto&&arg) {
213 |               using T = std::decay_t<decltype(arg)>;
214 |               if constexpr (std::is_same_v<T, string>)
215 |                              allstrings.push_back(arg);
216 |               else 
217 |                 allstrings.push_back(to_string(arg));
218 |             }, iter->second);
219 |           }
220 |           else {
221 |             allstrings.push_back(std::optional<string>());
222 |           }
223 |         }
224 |       }
225 |       cout<<endl;
226 | //      cout<<"params 1 took "<<dt.lapUsec()/1000.0<<" msec\n";
227 |       // types: integers, values are variable length, the lengths are integers,
228 |       //                                      array of pointers
229 |       
230 |       vector<const char*> allptrs;
231 |       allptrs.reserve(allstrings.size());
232 |       for(const auto& p : allstrings) {
233 |         if(p)
234 |           allptrs.push_back(p->c_str());
235 |         else
236 |           allptrs.push_back(0);
237 |       }
238 |       
239 | //      cout<<"params2 took "<<dt.lapUsec()/1000.0<<" msec\n";
240 |       mp.exec(query, allptrs);
241 | //      cout<<"exec took "<<dt.lapUsec()/1000.0<<" msec\n";
242 |       for(const auto& m : work)
243 |         delete m;
244 | //      cout<<"cleanup took "<<dt.lapUsec()/1000.0<<" msec\n";
245 |       
246 |       if(prevcommit != time(0)) {
247 |         cout<<"commit"<<endl;
248 |         mp.exec("commit");
249 |         mp.exec("begin");
250 |         prevcommit = time(0);
251 |       }
252 |     }
253 |   }
254 | }
255 |   
256 | 
257 | 


--------------------------------------------------------------------------------
/ext/sqlitewriter/sqlwriter.cc:
--------------------------------------------------------------------------------
  1 | #include "sqlwriter.hh"
  2 | #include <algorithm>
  3 | #include <unistd.h>
  4 | #include "sqlite3.h"
  5 | using namespace std;
  6 | 
  7 | MiniSQLite::MiniSQLite(std::string_view fname)
  8 | {
  9 |   if ( sqlite3_open(&fname[0], &d_sqlite)!=SQLITE_OK ) {
 10 |     throw runtime_error("Unable to open "+(string)fname+" for sqlite");
 11 |   }
 12 |   exec("PRAGMA journal_mode='wal'");
 13 |   sqlite3_busy_timeout(d_sqlite, 60000);
 14 | }
 15 | 
 16 | //! Get field names and types from a table
 17 | vector<pair<string,string> > MiniSQLite::getSchema(const std::string& table)
 18 | {
 19 |   vector<pair<string,string>> ret;
 20 |   
 21 |   auto rows = exec("SELECT cid,name,type FROM pragma_table_xinfo('"+table+"')");
 22 | 
 23 |   for(const auto& r : rows) {
 24 |     ret.push_back({r[1], r[2]});
 25 |   }
 26 |   sort(ret.begin(), ret.end(), [](const auto& a, const auto& b) {
 27 |     return a.first < b.first;
 28 |   });
 29 | 
 30 |   //  cout<<"returning "<<ret.size()<<" rows for table "<<table<<"\n";
 31 |   return ret;
 32 | }
 33 | 
 34 | int MiniSQLite::helperFunc(void* ptr, int cols, char** colvals, char** colnames [[maybe_unused]])
 35 | {
 36 |   vector<string> row;
 37 |   row.reserve(cols);
 38 |   for(int n=0; n < cols ; ++n)
 39 |     row.push_back(colvals[n]);
 40 |   ((MiniSQLite*)ptr)->d_rows.push_back(row);
 41 |   return 0;
 42 | }
 43 | 
 44 | vector<vector<string>> MiniSQLite::exec(std::string_view str)
 45 | {
 46 |   char *errmsg;
 47 |   std::string errstr;
 48 |   //  int (*callback)(void*,int,char**,char**)
 49 |   d_rows.clear();
 50 |   int rc = sqlite3_exec(d_sqlite, &str[0], helperFunc, this, &errmsg);
 51 |   if (rc != SQLITE_OK) {
 52 |     errstr = errmsg;
 53 |     sqlite3_free(errmsg);
 54 |     throw std::runtime_error("Error executing sqlite3 query '"+(string)str+"': "+errstr);
 55 |   }
 56 |   return d_rows; 
 57 | }
 58 | 
 59 | void MiniSQLite::bindPrep(const std::string& table, int idx, bool value) {   sqlite3_bind_int(d_stmts[table], idx, value ? 1 : 0);   }
 60 | void MiniSQLite::bindPrep(const std::string& table, int idx, int value) {   sqlite3_bind_int(d_stmts[table], idx, value);   }
 61 | void MiniSQLite::bindPrep(const std::string& table, int idx, uint32_t value) {   sqlite3_bind_int64(d_stmts[table], idx, value);   }
 62 | void MiniSQLite::bindPrep(const std::string& table, int idx, long value) {   sqlite3_bind_int64(d_stmts[table], idx, value);   }
 63 | void MiniSQLite::bindPrep(const std::string& table, int idx, unsigned long value) {   sqlite3_bind_int64(d_stmts[table], idx, value);   }
 64 | void MiniSQLite::bindPrep(const std::string& table, int idx, long long value) {   sqlite3_bind_int64(d_stmts[table], idx, value);   }
 65 | void MiniSQLite::bindPrep(const std::string& table, int idx, unsigned long long value) {   sqlite3_bind_int64(d_stmts[table], idx, value);   }
 66 | void MiniSQLite::bindPrep(const std::string& table, int idx, double value) {   sqlite3_bind_double(d_stmts[table], idx, value);   }
 67 | void MiniSQLite::bindPrep(const std::string& table, int idx, const std::string& value) {   sqlite3_bind_text(d_stmts[table], idx, value.c_str(), value.size(), SQLITE_TRANSIENT);   }
 68 | 
 69 | 
 70 | void MiniSQLite::prepare(const std::string& table, string_view str)
 71 | {
 72 |   if(d_stmts[table]) {
 73 |     sqlite3_finalize(d_stmts[table]);
 74 |     d_stmts[table] = 0;
 75 |   }
 76 |   const char* pTail;
 77 |   
 78 |   if (sqlite3_prepare_v2(d_sqlite, &str[0], -1, &d_stmts[table], &pTail ) != SQLITE_OK) {
 79 |     throw runtime_error("Unable to prepare query "+(string)str);
 80 |   }
 81 | }
 82 | 
 83 | void MiniSQLite::execPrep(const std::string& table)
 84 | {
 85 |   int rc;
 86 |   for(;;) {
 87 |     rc = sqlite3_step(d_stmts[table]); // XXX this needs to be an error checking loop
 88 |     if(rc == SQLITE_DONE)
 89 |       break;
 90 |     else
 91 |       throw runtime_error("Sqlite error: "+std::to_string(rc));
 92 |   }
 93 |   rc= sqlite3_reset(d_stmts[table]);
 94 |   if(rc != SQLITE_OK)
 95 |     throw runtime_error("Sqlite error: "+std::to_string(rc));
 96 |   sqlite3_clear_bindings(d_stmts[table]);
 97 | }
 98 | 
 99 | void MiniSQLite::begin()
100 | {
101 |   d_intransaction=true;
102 |   exec("begin");
103 | }
104 | void MiniSQLite::commit()
105 | {
106 |   d_intransaction=false;
107 |   exec("commit");
108 | }
109 | 
110 | void MiniSQLite::cycle()
111 | {
112 |   exec("commit;begin");
113 | }
114 | 
115 | bool MiniSQLite::haveTable(const string& table)
116 | {
117 |   return !getSchema(table).empty();
118 | }
119 | 
120 | 
121 | //! Add a column to a table with a certain type
122 | void MiniSQLite::addColumn(const string& table, string_view name, string_view type)
123 | {
124 |   // SECURITY PROBLEM - somehow we can't do prepared statements here
125 |   
126 |   if(!haveTable(table)) {
127 | #if SQLITE_VERSION_NUMBER >= 3037001
128 |     exec("create table if not exists '"+table+"' ( '"+(string)name+"' "+(string)type+" ) STRICT");
129 | #else
130 |     exec("create table if not exists '"+table+"' ( '"+(string)name+"' "+(string)type+" )");
131 | #endif
132 |   } else {
133 |     //    cout<<"Adding column "<<name<<" to table "<<table<<endl;
134 |     exec("ALTER table \""+table+"\" add column \""+string(name)+ "\" "+string(type));
135 |   }
136 | }
137 | 
138 | 
139 | 
140 | void SQLiteWriter::commitThread()
141 | {
142 |   int n=0;
143 |   while(!d_pleasequit) {
144 |     usleep(50000);
145 |     if(!(n%20)) {
146 |       std::lock_guard<std::mutex> lock(d_mutex);
147 |       d_db.cycle();
148 |     }
149 |     n++;
150 |   }
151 |   //  cerr<<"Thread exiting"<<endl;
152 | }
153 | 
154 | bool SQLiteWriter::haveColumn(const std::string& table, std::string_view name)
155 | {
156 |   if(d_columns[table].empty()) {
157 |     d_columns[table] = d_db.getSchema(table);
158 |   }
159 |   //  cout<<"Do we have column "<<name<<" in table "<<table<<endl;
160 |   // this could be more efficient somehow
161 |   pair<string, string> cmp{name, std::string()};
162 |   return binary_search(d_columns[table].begin(), d_columns[table].end(), cmp,
163 |                        [](const auto& a, const auto& b)
164 |                        {
165 |                          return a.first < b.first;
166 |                        });
167 | 
168 | }
169 | 
170 | 
171 | void SQLiteWriter::addValue(const initializer_list<std::pair<const char*, var_t>>& values, const std::string& table)
172 | {
173 |   addValueGeneric(table, values);
174 | }
175 | 
176 | void SQLiteWriter::addValue(const std::vector<std::pair<const char*, var_t>>& values, const std::string& table)
177 | {
178 |   addValueGeneric(table, values);
179 | }
180 | 
181 | 
182 | template<typename T>
183 | void SQLiteWriter::addValueGeneric(const std::string& table, const T& values)
184 | {
185 |   std::lock_guard<std::mutex> lock(d_mutex);
186 |   if(!d_db.isPrepared(table) || !equal(values.begin(), values.end(),
187 |                                        d_lastsig[table].cbegin(), d_lastsig[table].cend(),
188 |                             [](const auto& a, const auto& b)
189 |   {
190 |     return a.first == b;
191 |   })) {
192 |     //    cout<<"Starting a new prepared statement"<<endl;
193 |     string q = "insert into '"+table+"' (";
194 |     string qmarks;
195 |     bool first=true;
196 |     for(const auto& p : values) {
197 |       if(!haveColumn(table, p.first)) {
198 |         if(std::get_if<double>(&p.second)) {
199 |           d_db.addColumn(table, p.first, "REAL");
200 |           d_columns[table].push_back({p.first, "REAL"});
201 |         }
202 |         else if(std::get_if<string>(&p.second)) {
203 |           d_db.addColumn(table, p.first, "TEXT");
204 |           d_columns[table].push_back({p.first, "TEXT"});
205 |         } else  {
206 |           d_db.addColumn(table, p.first, "INT");
207 |           d_columns[table].push_back({p.first, "INT"});
208 |         }
209 | 
210 |         sort(d_columns[table].begin(), d_columns[table].end());
211 |       }
212 |       if(!first) {
213 |         q+=", ";
214 |         qmarks += ", ";
215 |       }
216 |       first=false;
217 |       q+="'"+string(p.first)+"'";
218 |       qmarks +="?";
219 |     }
220 |     q+= ") values ("+qmarks+")";
221 | 
222 |     d_db.prepare(table, q);
223 |     
224 |     d_lastsig[table].clear();
225 |     for(const auto& p : values)
226 |       d_lastsig[table].push_back(p.first);
227 |   }
228 |   
229 |   int n = 1;
230 |   for(const auto& p : values) {
231 |     std::visit([this, &n, &table](auto&& arg) {
232 |       d_db.bindPrep(table, n, arg);
233 |     }, p.second);
234 |     n++;
235 |   }
236 |   d_db.execPrep(table);
237 | }
238 | 
239 | MiniSQLite::~MiniSQLite()
240 | {
241 |   // needs to close down d_sqlite3
242 |   if(d_intransaction)
243 |     commit();
244 | 
245 |   for(auto& stmt: d_stmts)
246 |     if(stmt.second)
247 |       sqlite3_finalize(stmt.second);
248 |   
249 |   sqlite3_close(d_sqlite); // same
250 | }
251 | 


--------------------------------------------------------------------------------