├── .babelrc ├── renovate.json ├── .gitignore ├── clear-translog.sh ├── .eslintrc.cjs ├── README.md ├── docker-compose.yml ├── .github └── workflows │ ├── ci.yml │ └── claude.yaml ├── LICENSE ├── package.json ├── CONTRIBUTING.md ├── dumpUser.js ├── LEGAL.md └── dumpOpenData.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["@babel/env", { 4 | "targets": { 5 | "node": "current" 6 | } 7 | }] 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | "config:semverAllMonthly" 5 | ], 6 | "ignoreDeps": [ 7 | "prettier" 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | *.log 3 | esdata 4 | **/.DS_Store 5 | data 6 | 7 | # Prevent google-github-actions/auth credentials being committed to git by accident 8 | gha-creds-*.json -------------------------------------------------------------------------------- /clear-translog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # When Elasticsearch cannot correctly mount, we need to clear corrupted translog 4 | # before an index can be correctly read. 5 | # 6 | # Usage: $ ./clear-translog 7 | # Example: $ ./clear-translog 3u8JeNJQSPaxxjTHk69qEQ 8 | # 9 | 10 | docker-compose run elasticsearch bin/elasticsearch-translog truncate -d data/nodes/0/indices/$1/0/translog/ 11 | 12 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: [ 3 | 'eslint:recommended', 4 | 'plugin:import/errors', 5 | 'plugin:import/warnings', 6 | 'prettier', 7 | ], 8 | env: { node: true, es6: true }, 9 | plugins: ['prettier'], 10 | parserOptions: { 11 | sourceType: 'module', 12 | ecmaVersion: 'latest', 13 | }, 14 | rules: { 15 | 'prettier/prettier': [ 16 | 'error', 17 | { 18 | trailingComma: 'es5', 19 | singleQuote: true, 20 | }, 21 | ], 22 | 'no-console': 'off', // It's seed script! We use no-console a hell lot. 23 | }, 24 | }; 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 【Cofacts 真的假的】Open Datasets 2 | ===== 3 | 4 | [![CI test](https://github.com/cofacts/opendata/actions/workflows/ci.yml/badge.svg)](https://github.com/cofacts/opendata/actions/workflows/ci.yml) 5 | 6 | We publish Cofacts data as Hugging Face dataset [`Cofacts/line-msg-fact-check-tw`](https://huggingface.co/datasets/Cofacts/line-msg-fact-check-tw). Application of Cofacts data has also been moved to Hugging Face. 7 | 8 | This repository hosts source code needed for the generation of the dataset. For data users, please see [go to Hugging Face](https://huggingface.co/datasets/Cofacts/line-msg-fact-check-tw) instead. 9 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Setup a elastic search server & kibana for testing mappings or inspecting production images. 2 | # 3 | # Usage 4 | # ===== 5 | # 1. $ mkdir esdata 6 | # 2. 
Extract production data to esdata/ if you want 7 | # 3. $ docker-compose up 8 | # 9 | 10 | version: '2' 11 | 12 | services: 13 | elasticsearch: 14 | image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.2 15 | volumes: 16 | - "./esdata:/usr/share/elasticsearch/data" 17 | environment: 18 | - "path.repo=/usr/share/elasticsearch/data" 19 | ports: 20 | - "62223:9200" 21 | 22 | kibana: 23 | image: docker.elastic.co/kibana/kibana-oss:6.3.2 24 | ports: 25 | - "62224:5601" 26 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI test 2 | 3 | on: 4 | # Triggers the workflow on push or pull request events 5 | - pull_request 6 | - push 7 | # Allows you to run this workflow manually from the Actions tab 8 | - workflow_dispatch 9 | 10 | jobs: 11 | install-and-test: 12 | runs-on: ubuntu-latest 13 | services: 14 | rumors-test-db: 15 | image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.2 16 | ports: 17 | - 62223:9200 18 | steps: 19 | 20 | - name: Checkout rumors-db 21 | uses: actions/checkout@v4 22 | with: 23 | repository: 'cofacts/rumors-db' 24 | - uses: actions/setup-node@v4 25 | with: 26 | node-version: '18' 27 | cache: 'npm' 28 | - run: npm ci 29 | - name: Initialize DB indexes 30 | run: npm run schema 31 | env: 32 | ELASTICSEARCH_URL: http://localhost:62223 33 | - name: Checkout opendata repo 34 | uses: actions/checkout@v4 35 | - run: npm ci 36 | - run: npm run lint 37 | - name: Test if script can generate csv files from empty database 38 | run: npm start 39 | - run: ls data/ 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-2021 Cofacts message reporting chatbot and crowd-sourced fact-checking community (「Cofacts 真的假的」訊息回報機器人與查證協作社群) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cofacts-opendata", 3 | "version": "1.0.0", 4 | "type": "module", 5 | "description": "Open data for cofacts", 6 | "main": "dumpOpenData.js", 7 | "scripts": { 8 | "prestart": "mkdir -p data", 9 | "start": "node dumpOpenData", 10 | "lint": "eslint .", 11 | "lint:fix": "eslint --fix .", 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "repository": { 15 | "type": "git", 16 | "url": "git+https://github.com/cofacts/opendata.git" 17 | }, 18 | "keywords": [ 19 | "open-data", 20 | "fact-checking", 21 | "crowd-sourcing" 22 | ], 23 | "author": "", 24 | "license": "SEE LICENSE IN README.md", 25 | "bugs": { 26 | "url": "https://github.com/cofacts/opendata/issues" 27 | }, 28 | "homepage": "https://github.com/cofacts/opendata#readme", 29 | "dependencies": { 30 | "csv-stringify": "^6.4.0", 31 | "@elastic/elasticsearch": "^6.8.6", 32 | "jszip": "^3.1.5" 33 | }, 34 | "devDependencies": { 35 | "eslint": "^8.50.0", 36 | "eslint-config-prettier": "^9.0.0", 37 | "eslint-plugin-import": "^2.28.0", 38 | "eslint-plugin-prettier": "^5.0.0", 39 | "prettier": "^3.0.0" 40 | }, 41 | "engines": { 42 | "node": ">=18" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.github/workflows/claude.yaml: -------------------------------------------------------------------------------- 1 | name: Claude PR Assistant 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude-code-action: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && contains(github.event.issue.body, '@claude')) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: write 24 | issues: write 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Authenticate to Google Cloud 33 | uses: google-github-actions/auth@v2 34 | with: 35 | workload_identity_provider: ${{ secrets.GC_WORKLOAD_IDENTITY_PROVIDER }} 36 | service_account: ${{ secrets.GC_AI_SERVICE_ACCOUNT }} 37 | 38 | - name: Run Claude PR Action 39 | uses: anthropics/claude-code-action@beta 40 | env: 41 | ANTHROPIC_VERTEX_PROJECT_ID: "${{ secrets.GC_PROJECT_ID }}" 42 | CLOUD_ML_REGION: "asia-east1" 43 | with: 44 | use_vertex: "true" 45 | timeout_minutes: "60" 46 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Cofacts opendata contributor 2 | 3 | ## Generating opendata files 4 | 5 | We generate the opendata files by backing up production DB to local machine, then run this script on local machine. 6 | 7 | ### Spin up ElasticSearch on local environment. 
8 | 9 | Run this to spin up a local Elasticsearch instance for the backed-up data: 10 | 11 | ``` 12 | $ docker-compose up 13 | ``` 14 | 15 | This spins up Elasticsearch on `localhost:62223`, with Kibana available at `localhost:62224`. 16 | 17 | ### Restore production backup from Cofacts' Google Cloud Storage bucket 18 | 19 | We use [Elasticsearch snapshots](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/modules-snapshots.html) 20 | and the [Google Cloud Storage Repository plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/6.8/repository-gcs.html) to perform backup and restore regularly. 21 | 22 | #### First-time setup 23 | 24 | First, spin up the local Elasticsearch & Kibana using `docker-compose up`. 25 | 26 | Second, ask a team member for the service account credential `gcs.json`. Put the file under `esdata/`. 27 | 28 | Open another terminal and execute: 29 | 30 | ``` 31 | # Install gcs plugin 32 | $ docker-compose exec elasticsearch bin/elasticsearch-plugin install repository-gcs 33 | # Enter "y" when asked to continue 34 | 35 | # Install service account credential 36 | $ docker-compose exec elasticsearch bin/elasticsearch-keystore add-file gcs.client.default.credentials_file data/gcs.json 37 | 38 | # Restart 39 | $ docker-compose restart elasticsearch 40 | ``` 41 | 42 | After Elasticsearch turns green, go to [Kibana](http://localhost:62224/app/kibana#/dev_tools/console) 43 | and execute the following commands: 44 | 45 | ``` 46 | # Run in Kibana 47 | 48 | # Initialize a snapshot repository named "cofacts" as a GCS repository. 49 | # Since we only read from the repository, turn on the "readonly" flag. 50 | # 51 | PUT _snapshot/cofacts 52 | { 53 | "type": "gcs", 54 | "settings": { 55 | "bucket": "rumors-db", 56 | "readonly": true 57 | } 58 | } 59 | ``` 60 | 61 | ### Loading snapshot from GCS 62 | 63 | Before publishing opendata, update your local Elasticsearch with the following commands in Kibana. 64 | 65 | ``` 66 | # Get all snapshots in the repository 67 | GET /_snapshot/cofacts/_all?verbose=false 68 | ``` 69 | 70 | Find the latest snapshot name (like `2020-07-05` below), then run the following command to 71 | restore the snapshot to your local Elasticsearch indices. 72 | 73 | ``` 74 | # You may need to remove all your local Elasticsearch indices before restoring 75 | DELETE /_all 76 | 77 | # 2020-07-05 is the snapshot name. 78 | # 79 | POST /_snapshot/cofacts/2020-07-05/_restore 80 | { 81 | "indices": "*,-urls*" 82 | } 83 | ``` 84 | 85 | #### See progress 86 | 87 | To check the current recovery progress, run this: 88 | 89 | ``` 90 | GET /_recovery?human&filter_path=*.shards.stage,*.shards.index.size.percent 91 | ``` 92 | 93 | ### Generate CSV files 94 | After all indices are restored, run `npm start` to generate the opendata files. 95 | 96 | All files are written to the `data/` directory as `*.csv.zip` files.
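To spot-check the generated archives before publishing, a minimal sketch like the one below can list each zip entry and a rough line count. It is not part of this repo's npm scripts; the file name `check-output.mjs` is hypothetical, and it assumes the default `./data` output directory.

```
// check-output.mjs — a sketch for sanity-checking the generated zips.
import fs from 'fs';
import JSZip from 'jszip';

const OUTPUT_DIR = './data';

for (const zipName of fs
  .readdirSync(OUTPUT_DIR)
  .filter((name) => name.endsWith('.csv.zip'))) {
  // Each archive produced by dumpOpenData.js contains a single CSV file.
  const zip = await JSZip.loadAsync(fs.readFileSync(`${OUTPUT_DIR}/${zipName}`));
  for (const entryName of Object.keys(zip.files)) {
    const csv = await zip.file(entryName).async('string');
    // Rough count only: quoted CSV fields (e.g. article text) may contain newlines.
    const lines = csv.split('\n').filter(Boolean).length;
    console.log(`${zipName} -> ${entryName}: ~${lines} lines (incl. header)`);
  }
}
```

Run it with `node check-output.mjs` after `npm start` finishes writing all zip files.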
97 | -------------------------------------------------------------------------------- /dumpUser.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import crypto from 'crypto'; 3 | import elasticsearch from '@elastic/elasticsearch'; 4 | import csvStringify from 'csv-stringify'; 5 | import JSZip from 'jszip'; 6 | 7 | const ELASTICSEARCH_URL = 'http://localhost:62223'; 8 | const OUTPUT_DIR = './data'; 9 | 10 | const client = new elasticsearch.Client({ 11 | node: ELASTICSEARCH_URL, 12 | }); 13 | 14 | /** 15 | * @param {any[][]} input 16 | * @returns {Promise} CSV content 17 | */ 18 | function generateCSV(input) { 19 | return new Promise((resolve, reject) => { 20 | csvStringify(input, (err, csvData) => { 21 | if (err) { 22 | return reject(err); 23 | } 24 | return resolve(csvData); 25 | }); 26 | }); 27 | } 28 | 29 | /** 30 | * @param {string} input 31 | * @returns {string} - input's sha256 hash hex string. Empty string if input is falsy. 32 | */ 33 | function sha256(input) { 34 | return input 35 | ? crypto.createHash('sha256').update(input, 'utf8').digest('hex') 36 | : ''; 37 | } 38 | 39 | async function scanIndex(index) { 40 | let result = []; 41 | 42 | const { body: initialResult } = await client.search({ 43 | index, 44 | size: 200, 45 | scroll: '5m', 46 | }); 47 | 48 | const totalCount = initialResult.hits.total; 49 | 50 | initialResult.hits.hits.forEach((hit) => { 51 | result.push(hit); 52 | }); 53 | 54 | while (result.length < totalCount) { 55 | const { body: scrollResult } = await client.scroll({ 56 | scrollId: initialResult._scroll_id, 57 | scroll: '5m', 58 | }); 59 | scrollResult.hits.hits.forEach((hit) => { 60 | result.push(hit); 61 | }); 62 | } 63 | 64 | return result; 65 | } 66 | 67 | /** 68 | * @param {object[]} articles 69 | * @returns {Promise} Generated CSV string 70 | */ 71 | function dumpUsers(users) { 72 | return generateCSV([ 73 | [ 74 | 'userIdsha256', 75 | 'name', 76 | 'email', 77 | 'facebookId', 78 | 'githubId', 79 | 'twitterId', 80 | 'updatedAt', 81 | ], 82 | ...users.map(({ _id, _source }) => [ 83 | sha256(_id), 84 | _source.name, 85 | _source.email, 86 | _source.facebookId, 87 | _source.githubId, 88 | _source.twitterId, 89 | _source.updatedAt, 90 | ]), 91 | ]); 92 | } 93 | 94 | /** 95 | * @param {string} fileName The name of file to be put in a zip file 96 | * @returns {({string}) => (none)} 97 | */ 98 | function writeFile(fileName) { 99 | return (data) => { 100 | const zip = new JSZip(); 101 | zip.file(fileName, data); 102 | 103 | // Ref: https://stuk.github.io/jszip/documentation/howto/write_zip.html#in-nodejs 104 | // 105 | zip 106 | .generateNodeStream({ 107 | type: 'nodebuffer', 108 | streamFiles: true, 109 | compression: 'DEFLATE', 110 | compressionOptions: { level: 8 }, 111 | }) 112 | .pipe(fs.createWriteStream(`${OUTPUT_DIR}/${fileName}.zip`)) 113 | .on('finish', () => console.log(`${fileName}.zip written.`)); 114 | }; 115 | } 116 | 117 | /** 118 | * Main process 119 | */ 120 | 121 | scanIndex('users').then(dumpUsers).then(writeFile('users.csv')); 122 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | # Cofacts 真的假的 資料使用者條款 2 | 3 | > Version 1, 17 February 2021 4 | 5 | 此使用者條款為利用 Cofacts API 與 Cofacts 真的假的 相關服務,取得資料進行利用之使用規範。若您以本條款「取用資料」之方式取得 Cofacts 所提供資料,即成為本條款之資料使用者,並表示完全同意本使用者條款。 6 | 7 | ## 一、API 程式授權及所提供資料 8 | 9 | 1. 
API 程式碼本身採 MIT License (MIT) 發布,程式碼的授權規範,請參閱 https://github.com/cofacts/rumors-api 檔案庫目錄裡 `LICENSE` 檔案內容所示。 10 | 11 | 2. Cofacts 所提供資料包含: 12 | 1. Cofacts 真的假的工作小組(下稱 Cofacts WG)公布於 https://github.com/cofacts/opendata 的資料存檔,或 Cofacts WG 用其他方式提供給您的資料存檔。 13 | 2. Cofacts API 以及 Cofacts 測試用 API 所回傳之資料。 14 | 15 | ## 二、資料授權與顯名規範 16 | 17 | 1. Cofacts 所提供資料,均是由 Cofacts WG 依本使用者條款發布,當前版本採「CC 授權 姓名標示-相同方式分享 4.0(CC BY-SA 4.0)」發布釋出。 18 | 19 | 2. Cofacts WG 後續或會就社會公益及開放性進行綜合評估,另擇其他適宜之資料提供條款。當資料提供條款經變更後,此時依原授權條款發布且先前已取得之資料,使用者得依原條款之規定繼續使用,然更新擇定條款之後,洽本服務取得之提供資料,後續使用時之權利及義務,悉依新條款之規定。 20 | 21 | 3. 依照 CC BY-SA 4.0 規範,您在後續重製或散布時,原社群顯名及每一則查證的出處連結(URI)皆必須被完整引用。 22 | 23 | 4. 除非以其他方式議定,否則 Cofacts WG 所要求的顯名聲明(attribution)應符合下面規範: 24 | 1. 若 Cofacts 所提供資料將在 LINE 中散布,請使用: 25 | > 本編輯資料取自「Cofacts 真的假的」訊息回報機器人與查證協作社群,採 CC BY-SA 4.0 授權提供,若欲補充資訊請訪問 Cofacts LINE bot https://line.me/R/oaMessage/@cofacts/?[url-encoded-text] 。 26 | 27 | 其中 `[url-encoded-text]` 應為 URL encode 後的網傳訊息原文,若過長可先節錄後再 URL encode。 28 | 2. 若 Cofacts 所提供資料在 LINE 之外的地方散布,請使用: 29 | > 本編輯資料取自「Cofacts 真的假的」訊息回報機器人與查證協作社群,採 CC BY-SA 4.0 授權提供。若欲補充資訊請訪問 Cofacts LINE bot https://line.me/ti/p/@cofacts 30 | 31 | 5. 本編輯資料容許採任何目的及方式利用與轉載,然限定延續採 CC BY-SA 4.0 授權方式發布,並保留以上顯名聲明。 32 | 33 | 6. 如果您對上述要求的顯名聲明有疑義,請與 Cofacts WG 聯絡,說明您散布 Cofacts 資料的媒介、方法及情況,與 Cofacts WG 議定適合您的顯名聲明方式。 34 | 35 | ## 三、取用資料 36 | 37 | 1. 您可以手動下載 Cofacts 所提供的資料存檔,或使用程式(client application)從 Cofacts API 取得提供資料。 38 | 39 | 2. 若您希望透過 Cofacts API 取得提供資料,請以本條款聯絡方式來聯絡 Cofacts WG 申請新建 client application,取得呼叫 API 所需要的 `app-id` 或 `app-secret`。若為 `app-secret` 形式,請妥善保管您的 `app-secret` 不被其他人得知。若使用者怠忽相關存取識別資訊的保密,導致濫用情狀發生,視為違反本使用者條款,而應自負相應衍生之法律責任。 40 | 41 | 3. 除 Cofacts WG 提供給您的 `app-id` 或 `app-secret`、以及在測試站使用的公開 `app-id` 或 `app-secret` 之外,請勿擅自使用其他人的 client application 之 `app-id` 或 `app-secret`。若使用者怠忽相關存取識別資訊的使用誠信,導致濫用情狀發生,亦視為違反本使用者條款,而應自負相應衍生之法律責任。 42 | 43 | 4. 除已知非直接營利的第三方公眾服務網站爬蟲,採低頻率部份內容捉取模式外(如 Internet Archive、Google、Facebook),以及依本使用者條款申請取得 `app-id` 或 `app-secret` 等 Cofacts API 應用外,本使用者條款明確禁止使用自動化方式爬取 Cofacts 網站或聊天機器人所提供的內容,所有逾此範圍之自動化資料爬取行為,皆視為對 Cofacts 真的假的 相關服務之惡意使用。 44 | 45 | ## 四、回報資訊 46 | 47 | 1. 原則上,若您欲提供「回報新內容」或「評價回應」等功能,請將使用者以 Cofacts WG 所要求的 URI 導向到 Cofacts 的聊天機器人或網站來進行上述功能。 48 | 49 | 2. 若您仍希望將前述功能整合進自己的應用程式,請與 Cofacts WG 聯繫,開通您的 client application 的 API 寫入功能。 50 | 51 | 3. 在您使用 API 寫入功能提交回報資訊時,預設採公眾領域貢獻宣告 (CC0-1.0) 方式提交至 Cofacts WG 所維運之電腦或相關設備進行存放。 52 | 53 | 4. 提交資訊即代表您同意 Cofacts WG 將存放於電腦或其相關設備的該等資訊進行編輯性整理後,得採 CC BY-SA 4.0 或其他適宜的授權模式,將具編輯性保護之資料發布於 Cofacts 網站、Cofacts 聊天機器人或 Cofacts 所提供資料存檔等處。 54 | 55 | 5. Cofacts WG 將就社會公益及開放性進行綜合評估,來選擇發布資料時使用之授權模式,亦歡迎您透過 Cofacts WG 的聯繫方式,隨時將授權釋出政策有關的寶貴意見提供給我們。 56 | 57 | ## 五、免責聲明 58 | 59 | 1. Cofacts WG 就所提供資料,依所擇定使用資料提供條款之免責條款,於法律容許的最大限度之內,主張於最大可能範圍內,免責於相關資料使用所可能產生的損害。以下並就整體服務之免責事項,進行要點聲明。 60 | 61 | 2. Cofacts WG 並不對所提供資料的正確性提供任何擔保,亦不為所提供資料中的言論負責。Cofacts 所提供資料中的任何敘述與主張,均不代表 Cofacts WG 之立場,亦不為其背書。 62 | 63 | 3. Cofacts WG 以及網站協作者不為任何第三方取用資料的聊天機器人、網站、內容農場做資料背書,也不承認單方取用 Cofacts 所提供資料的合作關係。若其他機器人因為其程式設計的限制,無法正確或有效回應使用者的提問,Cofacts WG 以及網站協作者不負擔資料庫的修正作業。 64 | 65 | 4. Cofacts 所提供資料,採公眾授權條款或相關聲明釋出之標的,僅涵蓋編輯性資料,並不保證隨附資訊之中,沒有包括或鏈結至任何第三方受著作權保護之客體。 66 | 67 | 5. 您應知悉 Cofacts 所提供資料的蒐集與實作方式有所限制、也可能受到其他因素(如尚未實作、實作錯誤、系統下線維護等系統因素,以及使用者人數等外在因素)影響,導致統計資訊可能與實際狀況有所偏誤。 68 | 69 | ## 六、聯絡方式 70 | 71 | 1. 如果您對於本使用者條款、資料授權、所提供資料的使用與提供等有任何疑慮,請寄信至 Cofacts WG 之聯絡信箱 cofacts@googlegroups.com ,由 Cofacts WG 成員在固定會議中或透過其他管道為您處理。 72 | 73 | ## 七、紛爭解決和管轄法院 74 | 75 | 1. 
採公眾授權模式發布之資料,後續資料散布若涉訟,依該條款之準據法及管轄法院規定辦理,然依本使用者條款所產生之爭議,雙方合意以臺灣臺北地方法院為第一審管轄法院。 76 | 77 | 78 | -------------------------------------------------------------------------------- /dumpOpenData.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import { pipeline } from 'stream/promises'; 3 | import { Readable } from 'stream'; 4 | import crypto from 'crypto'; 5 | import elasticsearch from '@elastic/elasticsearch'; 6 | 7 | // eslint-import-resolve does not support `exports` in package.json. 8 | // eslint-disable-next-line import/no-unresolved 9 | import { stringify as csvStringify } from 'csv-stringify/sync'; 10 | import JSZip from 'jszip'; 11 | 12 | const ELASTICSEARCH_URL = 'http://localhost:62223'; 13 | const OUTPUT_DIR = './data'; 14 | 15 | const client = new elasticsearch.Client({ 16 | node: ELASTICSEARCH_URL, 17 | }); 18 | 19 | /** 20 | * @param {string} input 21 | * @returns {string} - input's sha256 hash hex string. Empty string if input is falsy. 22 | */ 23 | function sha256(input) { 24 | return input 25 | ? crypto.createHash('sha256').update(input, 'utf8').digest('hex') 26 | : ''; 27 | } 28 | 29 | async function* scanIndex(index) { 30 | let processedCount = 0; 31 | 32 | const { body: initialResult } = await client.search({ 33 | index, 34 | size: 200, 35 | scroll: '5m', 36 | }); 37 | 38 | const totalCount = initialResult.hits.total; 39 | 40 | for (const hit of initialResult.hits.hits) { 41 | processedCount += 1; 42 | yield hit; 43 | } 44 | 45 | while (processedCount < totalCount) { 46 | const { body: scrollResult } = await client.scroll({ 47 | scrollId: initialResult._scroll_id, 48 | scroll: '5m', 49 | }); 50 | for (const hit of scrollResult.hits.hits) { 51 | processedCount += 1; 52 | yield hit; 53 | if (processedCount % 100000 === 0) { 54 | console.info(`${index}:\t${processedCount}/${totalCount}`); 55 | } 56 | } 57 | } 58 | } 59 | 60 | /** 61 | * @param {AsyncIterable} articles 62 | * @returns {Promise} Generated CSV string 63 | */ 64 | async function* dumpArticles(articles) { 65 | yield csvStringify([ 66 | [ 67 | 'id', 68 | 'articleType', 69 | 'status', 70 | 'text', 71 | 'normalArticleReplyCount', 72 | 'createdAt', 73 | 'updatedAt', 74 | 'lastRequestedAt', 75 | 'userIdsha256', 76 | 'appId', 77 | 'references', // array of strings 78 | ], 79 | ]); 80 | for await (const { _source, _id } of articles) { 81 | yield csvStringify([ 82 | [ 83 | _id, 84 | _source.articleType, 85 | _source.status, 86 | _source.text, 87 | _source.normalArticleReplyCount, 88 | _source.createdAt, 89 | _source.updatedAt, 90 | _source.lastRequestedAt, 91 | sha256(_source.userId), 92 | _source.appId, 93 | _source.references.map((ref) => ref.type).join(','), 94 | ], 95 | ]); 96 | } 97 | } 98 | 99 | /** 100 | * @param {AsyncIterable} articles 101 | * @returns {Promise} Generated CSV string 102 | */ 103 | async function* dumpArticleHyperlinks(articles) { 104 | yield csvStringify([['articleId', 'url', 'normalizedUrl', 'title']]); 105 | 106 | for await (const { _source, _id } of articles) { 107 | for (const hyperlink of _source.hyperlinks || []) { 108 | yield csvStringify([ 109 | [_id, hyperlink.url, hyperlink.normalizedUrl, hyperlink.title], 110 | ]); 111 | } 112 | } 113 | } 114 | 115 | /** 116 | * @param {AsyncIterable} articles 117 | * @returns {Promise} Generated CSV string 118 | */ 119 | async function* dumpArticleCategories(articles) { 120 | yield csvStringify([ 121 | [ 122 | 'articleId', 123 | 'categoryId', 124 | 'aiConfidence', 125 | 'aiModel', 126 | 
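/* SHA-256 of the user ID that added the category; note this CSV's header is 'userIdsha', while the other CSVs use 'userIdsha256'. */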
'userIdsha', 127 | 'appId', 128 | 'negativeFeedbackCount', 129 | 'positiveFeedbackCount', 130 | 'status', 131 | 'createdAt', 132 | 'updatedAt', 133 | ], 134 | ]); 135 | 136 | for await (const { _source, _id } of articles) { 137 | for (const ac of _source.articleCategories || []) { 138 | yield csvStringify([ 139 | [ 140 | _id, 141 | ac.categoryId, 142 | ac.aiConfidence, 143 | ac.aiModel, 144 | sha256(ac.userId), 145 | ac.appId, 146 | ac.negativeFeedbackCount, 147 | ac.positiveFeedbackCount, 148 | ac.status, 149 | ac.createdAt, 150 | ac.updatedAt, 151 | ], 152 | ]); 153 | } 154 | } 155 | } 156 | 157 | /** 158 | * @param {AsyncIterator} articles 159 | * @returns {Promise} Generated CSV string 160 | */ 161 | async function* dumpArticleReplies(articles) { 162 | yield csvStringify([ 163 | [ 164 | 'articleId', 165 | 'replyId', 166 | 'userIdsha256', 167 | 'negativeFeedbackCount', 168 | 'positiveFeedbackCount', 169 | 'replyType', 170 | 'appId', 171 | 'status', 172 | 'createdAt', 173 | 'updatedAt', 174 | ], 175 | ]); 176 | 177 | for await (const { _source, _id } of articles) { 178 | for (const ar of _source.articleReplies || []) { 179 | yield csvStringify([ 180 | [ 181 | _id, 182 | ar.replyId, 183 | sha256(ar.userId), 184 | ar.negativeFeedbackCount, 185 | ar.positiveFeedbackCount, 186 | ar.replyType, 187 | ar.appId, 188 | ar.status, 189 | ar.createdAt, 190 | ar.updatedAt, 191 | ], 192 | ]); 193 | } 194 | } 195 | } 196 | 197 | /** 198 | * @param {AsyncIterable} replies 199 | * @returns {Promise} Generated CSV string 200 | */ 201 | async function* dumpReplies(replies) { 202 | yield csvStringify([ 203 | ['id', 'type', 'reference', 'userIdsha256', 'appId', 'text', 'createdAt'], 204 | ]); 205 | 206 | for await (const { _source, _id } of replies) { 207 | yield csvStringify([ 208 | [ 209 | _id, 210 | _source.type, 211 | _source.reference, 212 | sha256(_source.userId), 213 | _source.appId, 214 | _source.text, 215 | _source.createdAt, 216 | ], 217 | ]); 218 | } 219 | } 220 | 221 | /** 222 | * @param {AsyncIterable} replies 223 | * @returns {Promise} Generated CSV string 224 | */ 225 | async function* dumpReplyHyperlinks(replies) { 226 | yield csvStringify([['replyId', 'url', 'normalizedUrl', 'title']]); 227 | 228 | for await (const { _source, _id } of replies) { 229 | for (const hyperlink of _source.hyperlinks || []) { 230 | yield csvStringify([ 231 | [_id, hyperlink.url, hyperlink.normalizedUrl, hyperlink.title], 232 | ]); 233 | } 234 | } 235 | } 236 | 237 | /** 238 | * @param {object[]} categories 239 | * @returns {Promise} Generated CSV string 240 | */ 241 | async function* dumpCategories(categories) { 242 | yield csvStringify([ 243 | ['id', 'title', 'description', 'createdAt', 'updatedAt'], 244 | ]); 245 | 246 | for await (const { _id, _source } of categories) { 247 | yield csvStringify([ 248 | [ 249 | _id, 250 | _source.title, 251 | _source.description, 252 | _source.createdAt, 253 | _source.updatedAt, 254 | ], 255 | ]); 256 | } 257 | } 258 | 259 | /** 260 | * @param {AsyncIterable} replyRequests 261 | * @returns {Promise} Generated CSV string 262 | */ 263 | async function* dumpReplyRequests(replyRequests) { 264 | yield csvStringify([ 265 | [ 266 | 'articleId', 267 | 'reason', 268 | 'status', 269 | 'positiveFeedbackCount', 270 | 'negativeFeedbackCount', 271 | 'userIdsha256', 272 | 'appId', 273 | 'createdAt', 274 | ], 275 | ]); 276 | 277 | for await (const { _source } of replyRequests) { 278 | yield csvStringify([ 279 | [ 280 | _source.articleId, 281 | _source.reason, 282 | _source.status, 283 | 
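/* positiveFeedbackCount and negativeFeedbackCount are tallied from the nested feedbacks' scores (+1 / -1). */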
(_source.feedbacks || []).reduce((sum, { score }) => { 284 | if (score === 1) sum += 1; 285 | return sum; 286 | }, 0), 287 | (_source.feedbacks || []).reduce((sum, { score }) => { 288 | if (score === -1) sum += 1; 289 | return sum; 290 | }, 0), 291 | sha256(_source.userId), 292 | _source.appId, 293 | _source.createdAt, 294 | ], 295 | ]); 296 | } 297 | } 298 | 299 | /** 300 | * @param {AsyncIterable} articleReplyFeedbacks 301 | * @returns {Promise} Generated CSV string 302 | */ 303 | async function* dumpArticleReplyFeedbacks(articleReplyFeedbacks) { 304 | yield csvStringify([ 305 | [ 306 | 'articleId', 307 | 'replyId', 308 | 'score', 309 | 'comment', 310 | 'status', 311 | 'userIdsha256', 312 | 'appId', 313 | 'createdAt', 314 | ], 315 | ]); 316 | 317 | for await (const { _source } of articleReplyFeedbacks) { 318 | yield csvStringify([ 319 | [ 320 | _source.articleId, 321 | _source.replyId, 322 | _source.score, 323 | _source.comment, 324 | _source.status, 325 | sha256(_source.userId), 326 | _source.appId, 327 | _source.createdAt, 328 | ], 329 | ]); 330 | } 331 | } 332 | 333 | /** 334 | * @param {AsyncIterable} analytics 335 | * @returns {Promise} Generated CSV string 336 | */ 337 | async function* dumpAnalytics(analytics) { 338 | yield csvStringify([ 339 | [ 340 | 'type', 341 | 'docId', 342 | 'date', 343 | 'lineUser', 344 | 'lineVisit', 345 | 'webUser', 346 | 'webVisit', 347 | 'liffUser', 348 | 'liffVisit', 349 | ], 350 | ]); 351 | 352 | for await (const { _source } of analytics) { 353 | yield csvStringify([ 354 | [ 355 | _source.type, 356 | _source.docId, 357 | _source.date, 358 | _source.stats.lineUser, 359 | _source.stats.lineVisit, 360 | _source.stats.webUser, 361 | _source.stats.webVisit, 362 | (_source.stats.liff || []).reduce((sum, { user }) => sum + user, 0), 363 | (_source.stats.liff || []).reduce((sum, { visit }) => sum + visit, 0), 364 | ], 365 | ]); 366 | } 367 | } 368 | 369 | /** 370 | * @param {AsyncIterable} users 371 | * @returns {Promise} Generated CSV string 372 | */ 373 | async function* dumpUsers(users) { 374 | yield csvStringify([ 375 | ['userIdsha256', 'appId', 'createdAt', 'lastActiveAt', 'blockedReason'], 376 | ]); 377 | 378 | for await (const { _id, _source } of users) { 379 | yield csvStringify([ 380 | [ 381 | sha256(_id), 382 | _source.appId, 383 | _source.createdAt, 384 | _source.lastActiveAt, 385 | _source.blockedReason, 386 | ], 387 | ]); 388 | } 389 | } 390 | 391 | /** 392 | * @param {string} fileName The name of file to be put in a zip file 393 | * @returns {(source: AsyncIterable) => void} 394 | */ 395 | function writeFile(fileName) { 396 | return (source) => { 397 | const zip = new JSZip(); 398 | zip.file(fileName, Readable.from(source), { binary: false }); 399 | 400 | return new Promise((resolve) => { 401 | // Ref: https://stuk.github.io/jszip/documentation/howto/write_zip.html#in-nodejs 402 | // 403 | zip 404 | .generateNodeStream({ 405 | type: 'nodebuffer', 406 | streamFiles: true, 407 | compression: 'DEFLATE', 408 | compressionOptions: { level: 8 }, 409 | }) 410 | .pipe(fs.createWriteStream(`${OUTPUT_DIR}/${fileName}.zip`)) 411 | .on('finish', () => { 412 | console.info(`${fileName}.zip written.`); 413 | resolve(fileName); 414 | }); 415 | }); 416 | }; 417 | } 418 | 419 | /** 420 | * Main process 421 | */ 422 | pipeline(scanIndex('articles'), dumpArticles, writeFile('articles.csv')); 423 | pipeline( 424 | scanIndex('articles'), 425 | dumpArticleReplies, 426 | writeFile('article_replies.csv') 427 | ); 428 | pipeline( 429 | scanIndex('articles'), 430 | 
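/* The articles index is scanned again here so that hyperlinks embedded in each article get their own CSV. */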
dumpArticleHyperlinks, 431 | writeFile('article_hyperlinks.csv') 432 | ); 433 | pipeline( 434 | scanIndex('articles'), 435 | dumpArticleCategories, 436 | writeFile('article_categories.csv') 437 | ); 438 | 439 | pipeline(scanIndex('replies'), dumpReplies, writeFile('replies.csv')); 440 | pipeline( 441 | scanIndex('replies'), 442 | dumpReplyHyperlinks, 443 | writeFile('reply_hyperlinks.csv') 444 | ); 445 | 446 | pipeline( 447 | scanIndex('replyrequests'), 448 | dumpReplyRequests, 449 | writeFile('reply_requests.csv') 450 | ); 451 | 452 | pipeline(scanIndex('categories'), dumpCategories, writeFile('categories.csv')); 453 | 454 | pipeline( 455 | scanIndex('articlereplyfeedbacks'), 456 | dumpArticleReplyFeedbacks, 457 | writeFile('article_reply_feedbacks.csv') 458 | ); 459 | 460 | pipeline(scanIndex('analytics'), dumpAnalytics, writeFile('analytics.csv')); 461 | pipeline(scanIndex('users'), dumpUsers, writeFile('anonymized_users.csv')); 462 | --------------------------------------------------------------------------------