Skip to content

Regenerate DB

Regenerate DB #93

Workflow file for this run

# Regenerates the per-language wiki link databases from the latest Wikimedia
# dumps, uploads each one to the `wiki-dbs` bucket and attaches it to a GitHub
# release tagged with the dump date.
name: Regenerate DB

on:
  workflow_dispatch:
  schedule:
    # Run on days 3/23 of each month (dumps start on days 1/20 and take a few
    # days to complete and get mirrored).
    - cron: '0 4 3,23 * *'

defaults:
  run:
    # Naming bash explicitly makes the runner invoke it with `-eo pipefail`;
    # the implicit default shell omits `pipefail`, which would let a failed
    # download in the middle of a pipeline go unnoticed.
    shell: bash

jobs:
  create-updated-wiki-dump:
    name: ${{ matrix.lang }}wiki Dump Gen
    runs-on: ubuntu-latest
    permissions:
      # Needed by the release step to create the release and upload assets.
      contents: write
    strategy:
      # One language failing must not cancel the others.
      fail-fast: false
      matrix:
        lang: [en, fr, de, ja, it, ru, es, zh, pl, nl, pt, ar, fi, hu]
    steps:
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          # Quoted so the version is never parsed as a float.
          go-version: '1.19'

      - name: Prerequisites
        id: setup
        run: |
          sudo apt-get update
          sudo apt-get install -y jq xz-utils libxml2-dev libgc-dev
          # The release tag is the date part of the enwiki `pagelinkstable`
          # job's `updated` timestamp; every language in the matrix uses the
          # same tag.
          dump_date="$(
            curl -fL https://dumps.wikimedia.org/index.json |
              jq --raw-output '.wikis.enwiki.jobs.pagelinkstable.updated | split(" ") | .[0]'
          )"
          if [ -z "$dump_date" ] || [ "$dump_date" = "null" ]; then
            echo "Could not determine the dump date from index.json" >&2
            exit 1
          fi
          echo "date=${dump_date}" >> "$GITHUB_OUTPUT"

      - name: Initialize swapfile
        run: |
          sudo fallocate -l 8G /swapfile
          sudo chmod 600 /swapfile
          sudo mkswap /swapfile
          sudo swapon /swapfile
          sudo swapon --show

      - name: Build
        working-directory: db_gen_c
        # The binary is written straight to the workspace root as `gen`.
        run: |
          clang++ -Ofast -flto -march=native -mtune=native -I/usr/include/libxml2 db_gen_v2.cc -o ../gen -std=c++17 -lxml2 -lgc

      - name: Generate database
        env:
          WIKI_LANG: ${{ matrix.lang }}
        run: |
          # Prefer the recombined articles dump when the wiki has one,
          # otherwise fall back to the plain articles dump; take the URL of
          # the first listed file.
          dump_path="$(
            curl -fL https://dumps.wikimedia.org/index.json |
              jq --raw-output --arg wiki "${WIKI_LANG}wiki" '
                .wikis[$wiki].jobs
                | if has("articlesdumprecombine") then .articlesdumprecombine else .articlesdump end
                | .files
                | to_entries[0].value.url'
          )"
          if [ -z "$dump_path" ] || [ "$dump_path" = "null" ]; then
            echo "No articles dump URL found for ${WIKI_LANG}wiki" >&2
            exit 1
          fi
          # Stream download -> decompress -> generate -> compress; `pipefail`
          # aborts the step if any stage fails.
          curl -fL "https://dumps.wikimedia.org/${dump_path}" |
            bzcat |
            ./gen |
            xz -C crc32 --lzma2=preset=9e,lc=4,pb=2 -f - > "${WIKI_LANG}.bin"

      - name: Upload to R2
        env:
          # Credentials reach the AWS CLI through its standard environment
          # variables instead of being interpolated into the script text.
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: auto
          AWS_DEFAULT_OUTPUT: json
          R2_ENDPOINT_URL: ${{ secrets.R2_AWS_ENDPOINT_URL }}
          WIKI_LANG: ${{ matrix.lang }}
        run: |
          aws s3 cp "${WIKI_LANG}.bin" "s3://wiki-dbs/${WIKI_LANG}.bin" --endpoint-url "$R2_ENDPOINT_URL"

      - name: Upload
        uses: softprops/action-gh-release@v1
        with:
          files: ${{ matrix.lang }}.bin
          tag_name: ${{ steps.setup.outputs.date }}