├── .babelrc ├── renovate.json ├── .gitignore ├── clear-translog.sh ├── .eslintrc.cjs ├── README.md ├── docker-compose.yml ├── .github └── workflows │ ├── ci.yml │ └── claude.yaml ├── LICENSE ├── package.json ├── CONTRIBUTING.md ├── dumpUser.js ├── LEGAL.md └── dumpOpenData.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["@babel/env", { 4 | "targets": { 5 | "node": "current" 6 | } 7 | }] 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | "config:semverAllMonthly" 5 | ], 6 | "ignoreDeps": [ 7 | "prettier" 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | *.log 3 | esdata 4 | **/.DS_Store 5 | data 6 | 7 | # Prevent google-github-actions/auth credentials being committed to git by accident 8 | gha-creds-*.json -------------------------------------------------------------------------------- /clear-translog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # When Elasticsearch cannot correctly mount, we need to clear corrupted translog 4 | # before an index can be correctly read. 5 | # 6 | # Usage: $ ./clear-translog 7 | # Example: $ ./clear-translog 3u8JeNJQSPaxxjTHk69qEQ 8 | # 9 | 10 | docker-compose run elasticsearch bin/elasticsearch-translog truncate -d data/nodes/0/indices/$1/0/translog/ 11 | 12 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: [ 3 | 'eslint:recommended', 4 | 'plugin:import/errors', 5 | 'plugin:import/warnings', 6 | 'prettier', 7 | ], 8 | env: { node: true, es6: true }, 9 | plugins: ['prettier'], 10 | parserOptions: { 11 | sourceType: 'module', 12 | ecmaVersion: 'latest', 13 | }, 14 | rules: { 15 | 'prettier/prettier': [ 16 | 'error', 17 | { 18 | trailingComma: 'es5', 19 | singleQuote: true, 20 | }, 21 | ], 22 | 'no-console': 'off', // It's seed script! We use no-console a hell lot. 23 | }, 24 | }; 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 【Cofacts 真的假的】Open Datasets 2 | ===== 3 | 4 | [![CI test](https://github.com/cofacts/opendata/actions/workflows/ci.yml/badge.svg)](https://github.com/cofacts/opendata/actions/workflows/ci.yml) 5 | 6 | We publish Cofacts data as Hugging Face dataset [`Cofacts/line-msg-fact-check-tw`](https://huggingface.co/datasets/Cofacts/line-msg-fact-check-tw). Application of Cofacts data has also been moved to Hugging Face. 7 | 8 | This repository hosts source code needed for the generation of the dataset. For data users, please see [go to Hugging Face](https://huggingface.co/datasets/Cofacts/line-msg-fact-check-tw) instead. 9 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Setup a elastic search server & kibana for testing mappings or inspecting production images. 2 | # 3 | # Usage 4 | # ===== 5 | # 1. $ mkdir esdata 6 | # 2. 
Extract production data to esdata/ if you want 7 | # 3. $ docker-compose up 8 | # 9 | 10 | version: '2' 11 | 12 | services: 13 | elasticsearch: 14 | image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.2 15 | volumes: 16 | - "./esdata:/usr/share/elasticsearch/data" 17 | environment: 18 | - "path.repo=/usr/share/elasticsearch/data" 19 | ports: 20 | - "62223:9200" 21 | 22 | kibana: 23 | image: docker.elastic.co/kibana/kibana-oss:6.3.2 24 | ports: 25 | - "62224:5601" 26 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI test 2 | 3 | on: 4 | # Triggers the workflow on push or pull request events 5 | - pull_request 6 | - push 7 | # Allows you to run this workflow manually from the Actions tab 8 | - workflow_dispatch 9 | 10 | jobs: 11 | install-and-test: 12 | runs-on: ubuntu-latest 13 | services: 14 | rumors-test-db: 15 | image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.2 16 | ports: 17 | - 62223:9200 18 | steps: 19 | 20 | - name: Checkout rumors-db 21 | uses: actions/checkout@v4 22 | with: 23 | repository: 'cofacts/rumors-db' 24 | - uses: actions/setup-node@v4 25 | with: 26 | node-version: '18' 27 | cache: 'npm' 28 | - run: npm ci 29 | - name: Initialize DB indexes 30 | run: npm run schema 31 | env: 32 | ELASTICSEARCH_URL: http://localhost:62223 33 | - name: Checkout opendata repo 34 | uses: actions/checkout@v4 35 | - run: npm ci 36 | - run: npm run lint 37 | - name: Test if script can generate csv files from empty database 38 | run: npm start 39 | - run: ls data/ 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-2021 Cofacts message reporting chatbot and crowd-sourced fact-checking community (「Cofacts 真的假的」訊息回報機器人與查證協作社群) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cofacts-opendata", 3 | "version": "1.0.0", 4 | "type": "module", 5 | "description": "Open data for cofacts", 6 | "main": "dumpOpenData.js", 7 | "scripts": { 8 | "prestart": "mkdir -p data", 9 | "start": "node dumpOpenData", 10 | "lint": "eslint .", 11 | "lint:fix": "eslint --fix .", 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "repository": { 15 | "type": "git", 16 | "url": "git+https://github.com/cofacts/opendata.git" 17 | }, 18 | "keywords": [ 19 | "open-data", 20 | "fact-checking", 21 | "crowd-sourcing" 22 | ], 23 | "author": "", 24 | "license": "SEE LICENSE IN README.md", 25 | "bugs": { 26 | "url": "https://github.com/cofacts/opendata/issues" 27 | }, 28 | "homepage": "https://github.com/cofacts/opendata#readme", 29 | "dependencies": { 30 | "csv-stringify": "^6.4.0", 31 | "@elastic/elasticsearch": "^6.8.6", 32 | "jszip": "^3.1.5" 33 | }, 34 | "devDependencies": { 35 | "eslint": "^8.50.0", 36 | "eslint-config-prettier": "^9.0.0", 37 | "eslint-plugin-import": "^2.28.0", 38 | "eslint-plugin-prettier": "^5.0.0", 39 | "prettier": "^3.0.0" 40 | }, 41 | "engines": { 42 | "node": ">=18" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.github/workflows/claude.yaml: -------------------------------------------------------------------------------- 1 | name: Claude PR Assistant 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude-code-action: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && contains(github.event.issue.body, '@claude')) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: write 24 | issues: write 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Authenticate to Google Cloud 33 | uses: google-github-actions/auth@v2 34 | with: 35 | workload_identity_provider: ${{ secrets.GC_WORKLOAD_IDENTITY_PROVIDER }} 36 | service_account: ${{ secrets.GC_AI_SERVICE_ACCOUNT }} 37 | 38 | - name: Run Claude PR Action 39 | uses: anthropics/claude-code-action@beta 40 | env: 41 | ANTHROPIC_VERTEX_PROJECT_ID: "${{ secrets.GC_PROJECT_ID }}" 42 | CLOUD_ML_REGION: "asia-east1" 43 | with: 44 | use_vertex: "true" 45 | timeout_minutes: "60" 46 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Cofacts opendata contributor 2 | 3 | ## Generating opendata files 4 | 5 | We generate the opendata files by backing up production DB to local machine, then run this script on local machine. 6 | 7 | ### Spin up ElasticSearch on local environment. 
8 | 9 | Run this to spin up a local Elasticsearch instance for the backed-up data: 10 | 11 | ``` 12 | $ docker-compose up 13 | ``` 14 | 15 | This spins up Elasticsearch on `localhost:62223`, with Kibana available at `localhost:62224`. 16 | 17 | ### Restore production backup from Cofacts' Google Cloud Storage bucket 18 | 19 | We use [Elasticsearch snapshots](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/modules-snapshots.html) 20 | and the [Google Cloud Storage Repository plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/6.8/repository-gcs.html) to perform backup and restore regularly. 21 | 22 | #### First-time setup 23 | 24 | First, spin up the local Elasticsearch & Kibana using `docker-compose up`. 25 | 26 | Second, ask a team member for the service account credential `gcs.json`. Put the file under `esdata/`. 27 | 28 | Open another terminal and execute: 29 | 30 | ``` 31 | # Install gcs plugin 32 | $ docker-compose exec elasticsearch bin/elasticsearch-plugin install repository-gcs 33 | # Enter "y" when asked to continue 34 | 35 | # Install service account credential 36 | $ docker-compose exec elasticsearch bin/elasticsearch-keystore add-file gcs.client.default.credentials_file data/gcs.json 37 | 38 | # Restart 39 | $ docker-compose restart elasticsearch 40 | ``` 41 | 42 | After Elasticsearch turns green, go to [Kibana](http://localhost:62224/app/kibana#/dev_tools/console) 43 | and execute the following commands: 44 | 45 | ``` 46 | # Run in Kibana 47 | 48 | # Initialize a snapshot repository named "cofacts" as a GCS repository. 49 | # Since we only read from the repository, turn on the "readonly" flag. 50 | # 51 | PUT _snapshot/cofacts 52 | { 53 | "type": "gcs", 54 | "settings": { 55 | "bucket": "rumors-db", 56 | "readonly": true 57 | } 58 | } 59 | ``` 60 | 61 | ### Loading snapshot from GCS 62 | 63 | Before publishing opendata, update your local Elasticsearch with the following commands in Kibana. 64 | 65 | ``` 66 | # Get all snapshots in the repository 67 | GET /_snapshot/cofacts/_all?verbose=false 68 | ``` 69 | 70 | Find the latest snapshot name (like `2020-07-05` below), then run the following command to 71 | restore the snapshot to your local Elasticsearch indices. 72 | 73 | ``` 74 | # You may need to remove all your local Elasticsearch indices before restoring 75 | DELETE /_all 76 | 77 | # 2020-07-05 is the snapshot name. 78 | # 79 | POST /_snapshot/cofacts/2020-07-05/_restore 80 | { 81 | "indices": "*,-urls*" 82 | } 83 | ``` 84 | 85 | #### See progress 86 | 87 | To check the current recovery progress, run this: 88 | 89 | ``` 90 | GET /_recovery?human&filter_path=*.shards.stage,*.shards.index.size.percent 91 | ``` 92 | 93 | ### Generate CSV files 94 | After all indices are restored, run `npm start` to generate the opendata files. 95 | 96 | All files are written to the `data/` directory as `*.csv.zip` files.
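To spot-check the generated archives before publishing, a minimal sketch like the one below can list each zip entry and a rough line count. It is not part of this repo's npm scripts; the file name `check-output.mjs` is hypothetical, and it assumes the default `./data` output directory.

```
// check-output.mjs — a sketch for sanity-checking the generated zips.
import fs from 'fs';
import JSZip from 'jszip';

const OUTPUT_DIR = './data';

for (const zipName of fs
  .readdirSync(OUTPUT_DIR)
  .filter((name) => name.endsWith('.csv.zip'))) {
  // Each archive produced by dumpOpenData.js contains a single CSV file.
  const zip = await JSZip.loadAsync(fs.readFileSync(`${OUTPUT_DIR}/${zipName}`));
  for (const entryName of Object.keys(zip.files)) {
    const csv = await zip.file(entryName).async('string');
    // Rough count only: quoted CSV fields (e.g. article text) may contain newlines.
    const lines = csv.split('\n').filter(Boolean).length;
    console.log(`${zipName} -> ${entryName}: ~${lines} lines (incl. header)`);
  }
}
```

Run it with `node check-output.mjs` after `npm start` finishes writing all zip files.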
97 | -------------------------------------------------------------------------------- /dumpUser.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import crypto from 'crypto'; 3 | import elasticsearch from '@elastic/elasticsearch'; 4 | import csvStringify from 'csv-stringify'; 5 | import JSZip from 'jszip'; 6 | 7 | const ELASTICSEARCH_URL = 'http://localhost:62223'; 8 | const OUTPUT_DIR = './data'; 9 | 10 | const client = new elasticsearch.Client({ 11 | node: ELASTICSEARCH_URL, 12 | }); 13 | 14 | /** 15 | * @param {any[][]} input 16 | * @returns {Promise} CSV content 17 | */ 18 | function generateCSV(input) { 19 | return new Promise((resolve, reject) => { 20 | csvStringify(input, (err, csvData) => { 21 | if (err) { 22 | return reject(err); 23 | } 24 | return resolve(csvData); 25 | }); 26 | }); 27 | } 28 | 29 | /** 30 | * @param {string} input 31 | * @returns {string} - input's sha256 hash hex string. Empty string if input is falsy. 32 | */ 33 | function sha256(input) { 34 | return input 35 | ? crypto.createHash('sha256').update(input, 'utf8').digest('hex') 36 | : ''; 37 | } 38 | 39 | async function scanIndex(index) { 40 | let result = []; 41 | 42 | const { body: initialResult } = await client.search({ 43 | index, 44 | size: 200, 45 | scroll: '5m', 46 | }); 47 | 48 | const totalCount = initialResult.hits.total; 49 | 50 | initialResult.hits.hits.forEach((hit) => { 51 | result.push(hit); 52 | }); 53 | 54 | while (result.length < totalCount) { 55 | const { body: scrollResult } = await client.scroll({ 56 | scrollId: initialResult._scroll_id, 57 | scroll: '5m', 58 | }); 59 | scrollResult.hits.hits.forEach((hit) => { 60 | result.push(hit); 61 | }); 62 | } 63 | 64 | return result; 65 | } 66 | 67 | /** 68 | * @param {object[]} articles 69 | * @returns {Promise} Generated CSV string 70 | */ 71 | function dumpUsers(users) { 72 | return generateCSV([ 73 | [ 74 | 'userIdsha256', 75 | 'name', 76 | 'email', 77 | 'facebookId', 78 | 'githubId', 79 | 'twitterId', 80 | 'updatedAt', 81 | ], 82 | ...users.map(({ _id, _source }) => [ 83 | sha256(_id), 84 | _source.name, 85 | _source.email, 86 | _source.facebookId, 87 | _source.githubId, 88 | _source.twitterId, 89 | _source.updatedAt, 90 | ]), 91 | ]); 92 | } 93 | 94 | /** 95 | * @param {string} fileName The name of file to be put in a zip file 96 | * @returns {({string}) => (none)} 97 | */ 98 | function writeFile(fileName) { 99 | return (data) => { 100 | const zip = new JSZip(); 101 | zip.file(fileName, data); 102 | 103 | // Ref: https://stuk.github.io/jszip/documentation/howto/write_zip.html#in-nodejs 104 | // 105 | zip 106 | .generateNodeStream({ 107 | type: 'nodebuffer', 108 | streamFiles: true, 109 | compression: 'DEFLATE', 110 | compressionOptions: { level: 8 }, 111 | }) 112 | .pipe(fs.createWriteStream(`${OUTPUT_DIR}/${fileName}.zip`)) 113 | .on('finish', () => console.log(`${fileName}.zip written.`)); 114 | }; 115 | } 116 | 117 | /** 118 | * Main process 119 | */ 120 | 121 | scanIndex('users').then(dumpUsers).then(writeFile('users.csv')); 122 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | # Cofacts 真的假的 資料使用者條款 2 | 3 | > Version 1, 17 February 2021 4 | 5 | 此使用者條款為利用 Cofacts API 與 Cofacts 真的假的 相關服務,取得資料進行利用之使用規範。若您以本條款「取用資料」之方式取得 Cofacts 所提供資料,即成為本條款之資料使用者,並表示完全同意本使用者條款。 6 | 7 | ## 一、API 程式授權及所提供資料 8 | 9 | 1. 
API 程式碼本身採 MIT License (MIT) 發布,程式碼的授權規範,請參閱 https://github.com/cofacts/rumors-api 檔案庫目錄裡 `LICENSE` 檔案內容所示。 10 | 11 | 2. Cofacts 所提供資料包含: 12 | 1. Cofacts 真的假的工作小組(下稱 Cofacts WG)公布於 https://github.com/cofacts/opendata 的資料存檔,或 Cofacts WG 用其他方式提供給您的資料存檔。 13 | 2. Cofacts API 以及 Cofacts 測試用 API 所回傳之資料。 14 | 15 | ## 二、資料授權與顯名規範 16 | 17 | 1. Cofacts 所提供資料,均是由 Cofacts WG 依本使用者條款發布,當前版本採「CC 授權 姓名標示-相同方式分享 4.0(CC BY-SA 4.0)」發布釋出。 18 | 19 | 2. Cofacts WG 後續或會就社會公益及開放性進行綜合評估,另擇其他適宜之資料提供條款。當資料提供條款經變更後,此時依原授權條款發布且先前已取得之資料,使用者得依原條款之規定繼續使用,然更新擇定條款之後,洽本服務取得之提供資料,後續使用時之權利及義務,悉依新條款之規定。 20 | 21 | 3. 依照 CC BY-SA 4.0 規範,您在後續重製或散布時,原社群顯名及每一則查證的出處連結(URI)皆必須被完整引用。 22 | 23 | 4. 除非以其他方式議定,否則 Cofacts WG 所要求的顯名聲明(attribution)應符合下面規範: 24 | 1. 若 Cofacts 所提供資料將在 LINE 中散布,請使用: 25 | > 本編輯資料取自「Cofacts 真的假的」訊息回報機器人與查證協作社群,採 CC BY-SA 4.0 授權提供,若欲補充資訊請訪問 Cofacts LINE bot https://line.me/R/oaMessage/@cofacts/?[url-encoded-text] 。 26 | 27 | 其中 `[url-encoded-text]` 應為 URL encode 後的網傳訊息原文,若過長可先節錄後再 URL encode。 28 | 2. 若 Cofacts 所提供資料在 LINE 之外的地方散布,請使用: 29 | > 本編輯資料取自「Cofacts 真的假的」訊息回報機器人與查證協作社群,採 CC BY-SA 4.0 授權提供。若欲補充資訊請訪問 Cofacts LINE bot https://line.me/ti/p/@cofacts 30 | 31 | 5. 本編輯資料容許採任何目的及方式利用與轉載,然限定延續採 CC BY-SA 4.0 授權方式發布,並保留以上顯名聲明。 32 | 33 | 6. 如果您對上述要求的顯名聲明有疑義,請與 Cofacts WG 聯絡,說明您散布 Cofacts 資料的媒介、方法及情況,與 Cofacts WG 議定適合您的顯名聲明方式。 34 | 35 | ## 三、取用資料 36 | 37 | 1. 您可以手動下載 Cofacts 所提供的資料存檔,或使用程式(client application)從 Cofacts API 取得提供資料。 38 | 39 | 2. 若您希望透過 Cofacts API 取得提供資料,請以本條款聯絡方式來聯絡 Cofacts WG 申請新建 client application,取得呼叫 API 所需要的 `app-id` 或 `app-secret`。若為 `app-secret` 形式,請妥善保管您的 `app-secret` 不被其他人得知。若使用者怠忽相關存取識別資訊的保密,導致濫用情狀發生,視為違反本使用者條款,而應自負相應衍生之法律責任。 40 | 41 | 3. 除 Cofacts WG 提供給您的 `app-id` 或 `app-secret`、以及在測試站使用的公開 `app-id` 或 `app-secret` 之外,請勿擅自使用其他人的 client application 之 `app-id` 或 `app-secret`。若使用者怠忽相關存取識別資訊的使用誠信,導致濫用情狀發生,亦視為違反本使用者條款,而應自負相應衍生之法律責任。 42 | 43 | 4. 除已知非直接營利的第三方公眾服務網站爬蟲,採低頻率部份內容捉取模式外(如 Internet Archive、Google、Facebook),以及依本使用者條款申請取得 `app-id` 或 `app-secret` 等 Cofacts API 應用外,本使用者條款明確禁止使用自動化方式爬取 Cofacts 網站或聊天機器人所提供的內容,所有逾此範圍之自動化資料爬取行為,皆視為對 Cofacts 真的假的 相關服務之惡意使用。 44 | 45 | ## 四、回報資訊 46 | 47 | 1. 原則上,若您欲提供「回報新內容」或「評價回應」等功能,請將使用者以 Cofacts WG 所要求的 URI 導向到 Cofacts 的聊天機器人或網站來進行上述功能。 48 | 49 | 2. 若您仍希望將前述功能整合進自己的應用程式,請與 Cofacts WG 聯繫,開通您的 client application 的 API 寫入功能。 50 | 51 | 3. 在您使用 API 寫入功能提交回報資訊時,預設採公眾領域貢獻宣告 (CC0-1.0) 方式提交至 Cofacts WG 所維運之電腦或相關設備進行存放。 52 | 53 | 4. 提交資訊即代表您同意 Cofacts WG 將存放於電腦或其相關設備的該等資訊進行編輯性整理後,得採 CC BY-SA 4.0 或其他適宜的授權模式,將具編輯性保護之資料發布於 Cofacts 網站、Cofacts 聊天機器人或 Cofacts 所提供資料存檔等處。 54 | 55 | 5. Cofacts WG 將就社會公益及開放性進行綜合評估,來選擇發布資料時使用之授權模式,亦歡迎您透過 Cofacts WG 的聯繫方式,隨時將授權釋出政策有關的寶貴意見提供給我們。 56 | 57 | ## 五、免責聲明 58 | 59 | 1. Cofacts WG 就所提供資料,依所擇定使用資料提供條款之免責條款,於法律容許的最大限度之內,主張於最大可能範圍內,免責於相關資料使用所可能產生的損害。以下並就整體服務之免責事項,進行要點聲明。 60 | 61 | 2. Cofacts WG 並不對所提供資料的正確性提供任何擔保,亦不為所提供資料中的言論負責。Cofacts 所提供資料中的任何敘述與主張,均不代表 Cofacts WG 之立場,亦不為其背書。 62 | 63 | 3. Cofacts WG 以及網站協作者不為任何第三方取用資料的聊天機器人、網站、內容農場做資料背書,也不承認單方取用 Cofacts 所提供資料的合作關係。若其他機器人因為其程式設計的限制,無法正確或有效回應使用者的提問,Cofacts WG 以及網站協作者不負擔資料庫的修正作業。 64 | 65 | 4. Cofacts 所提供資料,採公眾授權條款或相關聲明釋出之標的,僅涵蓋編輯性資料,並不保證隨附資訊之中,沒有包括或鏈結至任何第三方受著作權保護之客體。 66 | 67 | 5. 您應知悉 Cofacts 所提供資料的蒐集與實作方式有所限制、也可能受到其他因素(如尚未實作、實作錯誤、系統下線維護等系統因素,以及使用者人數等外在因素)影響,導致統計資訊可能與實際狀況有所偏誤。 68 | 69 | ## 六、聯絡方式 70 | 71 | 1. 如果您對於本使用者條款、資料授權、所提供資料的使用與提供等有任何疑慮,請寄信至 Cofacts WG 之聯絡信箱 cofacts@googlegroups.com ,由 Cofacts WG 成員在固定會議中或透過其他管道為您處理。 72 | 73 | ## 七、紛爭解決和管轄法院 74 | 75 | 1. 
採公眾授權模式發布之資料,後續資料散布若涉訟,依該條款之準據法及管轄法院規定辦理,然依本使用者條款所產生之爭議,雙方合意以臺灣臺北地方法院為第一審管轄法院。 76 | 77 | 78 | -------------------------------------------------------------------------------- /dumpOpenData.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import { pipeline } from 'stream/promises'; 3 | import { Readable } from 'stream'; 4 | import crypto from 'crypto'; 5 | import elasticsearch from '@elastic/elasticsearch'; 6 | 7 | // eslint-import-resolve does not support `exports` in package.json. 8 | // eslint-disable-next-line import/no-unresolved 9 | import { stringify as csvStringify } from 'csv-stringify/sync'; 10 | import JSZip from 'jszip'; 11 | 12 | const ELASTICSEARCH_URL = 'http://localhost:62223'; 13 | const OUTPUT_DIR = './data'; 14 | 15 | const client = new elasticsearch.Client({ 16 | node: ELASTICSEARCH_URL, 17 | }); 18 | 19 | /** 20 | * @param {string} input 21 | * @returns {string} - input's sha256 hash hex string. Empty string if input is falsy. 22 | */ 23 | function sha256(input) { 24 | return input 25 | ? crypto.createHash('sha256').update(input, 'utf8').digest('hex') 26 | : ''; 27 | } 28 | 29 | async function* scanIndex(index) { 30 | let processedCount = 0; 31 | 32 | const { body: initialResult } = await client.search({ 33 | index, 34 | size: 200, 35 | scroll: '5m', 36 | }); 37 | 38 | const totalCount = initialResult.hits.total; 39 | 40 | for (const hit of initialResult.hits.hits) { 41 | processedCount += 1; 42 | yield hit; 43 | } 44 | 45 | while (processedCount < totalCount) { 46 | const { body: scrollResult } = await client.scroll({ 47 | scrollId: initialResult._scroll_id, 48 | scroll: '5m', 49 | }); 50 | for (const hit of scrollResult.hits.hits) { 51 | processedCount += 1; 52 | yield hit; 53 | if (processedCount % 100000 === 0) { 54 | console.info(`${index}:\t${processedCount}/${totalCount}`); 55 | } 56 | } 57 | } 58 | } 59 | 60 | /** 61 | * @param {AsyncIterable} articles 62 | * @returns {Promise} Generated CSV string 63 | */ 64 | async function* dumpArticles(articles) { 65 | yield csvStringify([ 66 | [ 67 | 'id', 68 | 'articleType', 69 | 'status', 70 | 'text', 71 | 'normalArticleReplyCount', 72 | 'createdAt', 73 | 'updatedAt', 74 | 'lastRequestedAt', 75 | 'userIdsha256', 76 | 'appId', 77 | 'references', // array of strings 78 | ], 79 | ]); 80 | for await (const { _source, _id } of articles) { 81 | yield csvStringify([ 82 | [ 83 | _id, 84 | _source.articleType, 85 | _source.status, 86 | _source.text, 87 | _source.normalArticleReplyCount, 88 | _source.createdAt, 89 | _source.updatedAt, 90 | _source.lastRequestedAt, 91 | sha256(_source.userId), 92 | _source.appId, 93 | _source.references.map((ref) => ref.type).join(','), 94 | ], 95 | ]); 96 | } 97 | } 98 | 99 | /** 100 | * @param {AsyncIterable} articles 101 | * @returns {Promise} Generated CSV string 102 | */ 103 | async function* dumpArticleHyperlinks(articles) { 104 | yield csvStringify([['articleId', 'url', 'normalizedUrl', 'title']]); 105 | 106 | for await (const { _source, _id } of articles) { 107 | for (const hyperlink of _source.hyperlinks || []) { 108 | yield csvStringify([ 109 | [_id, hyperlink.url, hyperlink.normalizedUrl, hyperlink.title], 110 | ]); 111 | } 112 | } 113 | } 114 | 115 | /** 116 | * @param {AsyncIterable} articles 117 | * @returns {Promise} Generated CSV string 118 | */ 119 | async function* dumpArticleCategories(articles) { 120 | yield csvStringify([ 121 | [ 122 | 'articleId', 123 | 'categoryId', 124 | 'aiConfidence', 125 | 'aiModel', 126 | 
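/* SHA-256 of the user ID that added the category; note this CSV's header is 'userIdsha', while the other CSVs use 'userIdsha256'. */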
'userIdsha', 127 | 'appId', 128 | 'negativeFeedbackCount', 129 | 'positiveFeedbackCount', 130 | 'status', 131 | 'createdAt', 132 | 'updatedAt', 133 | ], 134 | ]); 135 | 136 | for await (const { _source, _id } of articles) { 137 | for (const ac of _source.articleCategories || []) { 138 | yield csvStringify([ 139 | [ 140 | _id, 141 | ac.categoryId, 142 | ac.aiConfidence, 143 | ac.aiModel, 144 | sha256(ac.userId), 145 | ac.appId, 146 | ac.negativeFeedbackCount, 147 | ac.positiveFeedbackCount, 148 | ac.status, 149 | ac.createdAt, 150 | ac.updatedAt, 151 | ], 152 | ]); 153 | } 154 | } 155 | } 156 | 157 | /** 158 | * @param {AsyncIterator} articles 159 | * @returns {Promise} Generated CSV string 160 | */ 161 | async function* dumpArticleReplies(articles) { 162 | yield csvStringify([ 163 | [ 164 | 'articleId', 165 | 'replyId', 166 | 'userIdsha256', 167 | 'negativeFeedbackCount', 168 | 'positiveFeedbackCount', 169 | 'replyType', 170 | 'appId', 171 | 'status', 172 | 'createdAt', 173 | 'updatedAt', 174 | ], 175 | ]); 176 | 177 | for await (const { _source, _id } of articles) { 178 | for (const ar of _source.articleReplies || []) { 179 | yield csvStringify([ 180 | [ 181 | _id, 182 | ar.replyId, 183 | sha256(ar.userId), 184 | ar.negativeFeedbackCount, 185 | ar.positiveFeedbackCount, 186 | ar.replyType, 187 | ar.appId, 188 | ar.status, 189 | ar.createdAt, 190 | ar.updatedAt, 191 | ], 192 | ]); 193 | } 194 | } 195 | } 196 | 197 | /** 198 | * @param {AsyncIterable} replies 199 | * @returns {Promise} Generated CSV string 200 | */ 201 | async function* dumpReplies(replies) { 202 | yield csvStringify([ 203 | ['id', 'type', 'reference', 'userIdsha256', 'appId', 'text', 'createdAt'], 204 | ]); 205 | 206 | for await (const { _source, _id } of replies) { 207 | yield csvStringify([ 208 | [ 209 | _id, 210 | _source.type, 211 | _source.reference, 212 | sha256(_source.userId), 213 | _source.appId, 214 | _source.text, 215 | _source.createdAt, 216 | ], 217 | ]); 218 | } 219 | } 220 | 221 | /** 222 | * @param {AsyncIterable} replies 223 | * @returns {Promise} Generated CSV string 224 | */ 225 | async function* dumpReplyHyperlinks(replies) { 226 | yield csvStringify([['replyId', 'url', 'normalizedUrl', 'title']]); 227 | 228 | for await (const { _source, _id } of replies) { 229 | for (const hyperlink of _source.hyperlinks || []) { 230 | yield csvStringify([ 231 | [_id, hyperlink.url, hyperlink.normalizedUrl, hyperlink.title], 232 | ]); 233 | } 234 | } 235 | } 236 | 237 | /** 238 | * @param {object[]} categories 239 | * @returns {Promise} Generated CSV string 240 | */ 241 | async function* dumpCategories(categories) { 242 | yield csvStringify([ 243 | ['id', 'title', 'description', 'createdAt', 'updatedAt'], 244 | ]); 245 | 246 | for await (const { _id, _source } of categories) { 247 | yield csvStringify([ 248 | [ 249 | _id, 250 | _source.title, 251 | _source.description, 252 | _source.createdAt, 253 | _source.updatedAt, 254 | ], 255 | ]); 256 | } 257 | } 258 | 259 | /** 260 | * @param {AsyncIterable} replyRequests 261 | * @returns {Promise} Generated CSV string 262 | */ 263 | async function* dumpReplyRequests(replyRequests) { 264 | yield csvStringify([ 265 | [ 266 | 'articleId', 267 | 'reason', 268 | 'status', 269 | 'positiveFeedbackCount', 270 | 'negativeFeedbackCount', 271 | 'userIdsha256', 272 | 'appId', 273 | 'createdAt', 274 | ], 275 | ]); 276 | 277 | for await (const { _source } of replyRequests) { 278 | yield csvStringify([ 279 | [ 280 | _source.articleId, 281 | _source.reason, 282 | _source.status, 283 | 
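/* positiveFeedbackCount and negativeFeedbackCount are tallied from the nested feedbacks' scores (+1 / -1). */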
(_source.feedbacks || []).reduce((sum, { score }) => { 284 | if (score === 1) sum += 1; 285 | return sum; 286 | }, 0), 287 | (_source.feedbacks || []).reduce((sum, { score }) => { 288 | if (score === -1) sum += 1; 289 | return sum; 290 | }, 0), 291 | sha256(_source.userId), 292 | _source.appId, 293 | _source.createdAt, 294 | ], 295 | ]); 296 | } 297 | } 298 | 299 | /** 300 | * @param {AsyncIterable} articleReplyFeedbacks 301 | * @returns {Promise} Generated CSV string 302 | */ 303 | async function* dumpArticleReplyFeedbacks(articleReplyFeedbacks) { 304 | yield csvStringify([ 305 | [ 306 | 'articleId', 307 | 'replyId', 308 | 'score', 309 | 'comment', 310 | 'status', 311 | 'userIdsha256', 312 | 'appId', 313 | 'createdAt', 314 | ], 315 | ]); 316 | 317 | for await (const { _source } of articleReplyFeedbacks) { 318 | yield csvStringify([ 319 | [ 320 | _source.articleId, 321 | _source.replyId, 322 | _source.score, 323 | _source.comment, 324 | _source.status, 325 | sha256(_source.userId), 326 | _source.appId, 327 | _source.createdAt, 328 | ], 329 | ]); 330 | } 331 | } 332 | 333 | /** 334 | * @param {AsyncIterable} analytics 335 | * @returns {Promise} Generated CSV string 336 | */ 337 | async function* dumpAnalytics(analytics) { 338 | yield csvStringify([ 339 | [ 340 | 'type', 341 | 'docId', 342 | 'date', 343 | 'lineUser', 344 | 'lineVisit', 345 | 'webUser', 346 | 'webVisit', 347 | 'liffUser', 348 | 'liffVisit', 349 | ], 350 | ]); 351 | 352 | for await (const { _source } of analytics) { 353 | yield csvStringify([ 354 | [ 355 | _source.type, 356 | _source.docId, 357 | _source.date, 358 | _source.stats.lineUser, 359 | _source.stats.lineVisit, 360 | _source.stats.webUser, 361 | _source.stats.webVisit, 362 | (_source.stats.liff || []).reduce((sum, { user }) => sum + user, 0), 363 | (_source.stats.liff || []).reduce((sum, { visit }) => sum + visit, 0), 364 | ], 365 | ]); 366 | } 367 | } 368 | 369 | /** 370 | * @param {AsyncIterable} users 371 | * @returns {Promise} Generated CSV string 372 | */ 373 | async function* dumpUsers(users) { 374 | yield csvStringify([ 375 | ['userIdsha256', 'appId', 'createdAt', 'lastActiveAt', 'blockedReason'], 376 | ]); 377 | 378 | for await (const { _id, _source } of users) { 379 | yield csvStringify([ 380 | [ 381 | sha256(_id), 382 | _source.appId, 383 | _source.createdAt, 384 | _source.lastActiveAt, 385 | _source.blockedReason, 386 | ], 387 | ]); 388 | } 389 | } 390 | 391 | /** 392 | * @param {string} fileName The name of file to be put in a zip file 393 | * @returns {(source: AsyncIterable) => void} 394 | */ 395 | function writeFile(fileName) { 396 | return (source) => { 397 | const zip = new JSZip(); 398 | zip.file(fileName, Readable.from(source), { binary: false }); 399 | 400 | return new Promise((resolve) => { 401 | // Ref: https://stuk.github.io/jszip/documentation/howto/write_zip.html#in-nodejs 402 | // 403 | zip 404 | .generateNodeStream({ 405 | type: 'nodebuffer', 406 | streamFiles: true, 407 | compression: 'DEFLATE', 408 | compressionOptions: { level: 8 }, 409 | }) 410 | .pipe(fs.createWriteStream(`${OUTPUT_DIR}/${fileName}.zip`)) 411 | .on('finish', () => { 412 | console.info(`${fileName}.zip written.`); 413 | resolve(fileName); 414 | }); 415 | }); 416 | }; 417 | } 418 | 419 | /** 420 | * Main process 421 | */ 422 | pipeline(scanIndex('articles'), dumpArticles, writeFile('articles.csv')); 423 | pipeline( 424 | scanIndex('articles'), 425 | dumpArticleReplies, 426 | writeFile('article_replies.csv') 427 | ); 428 | pipeline( 429 | scanIndex('articles'), 430 | 
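/* The articles index is scanned again here so that hyperlinks embedded in each article get their own CSV. */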
dumpArticleHyperlinks, 431 | writeFile('article_hyperlinks.csv') 432 | ); 433 | pipeline( 434 | scanIndex('articles'), 435 | dumpArticleCategories, 436 | writeFile('article_categories.csv') 437 | ); 438 | 439 | pipeline(scanIndex('replies'), dumpReplies, writeFile('replies.csv')); 440 | pipeline( 441 | scanIndex('replies'), 442 | dumpReplyHyperlinks, 443 | writeFile('reply_hyperlinks.csv') 444 | ); 445 | 446 | pipeline( 447 | scanIndex('replyrequests'), 448 | dumpReplyRequests, 449 | writeFile('reply_requests.csv') 450 | ); 451 | 452 | pipeline(scanIndex('categories'), dumpCategories, writeFile('categories.csv')); 453 | 454 | pipeline( 455 | scanIndex('articlereplyfeedbacks'), 456 | dumpArticleReplyFeedbacks, 457 | writeFile('article_reply_feedbacks.csv') 458 | ); 459 | 460 | pipeline(scanIndex('analytics'), dumpAnalytics, writeFile('analytics.csv')); 461 | pipeline(scanIndex('users'), dumpUsers, writeFile('anonymized_users.csv')); 462 | --------------------------------------------------------------------------------