Skip to content

Commit

Permalink
Initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
felixonmars committed May 23, 2020
0 parents commit 9c3f461
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 0 deletions.
22 changes: 22 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Build the zhwiki pinyin dictionary for fcitx5 from a Wikipedia title dump.
#
# Pipeline:
#   $(FILENAME).gz  --gzip-->  $(FILENAME)  --convert.py-->  zhwiki.raw
#   zhwiki.raw  --libime_pinyindict-->  zhwiki.dict

# Date stamp of the Wikimedia dump to use; override with `make VERSION=YYYYMMDD`.
VERSION ?= 20200501
FILENAME = zhwiki-$(VERSION)-all-titles-in-ns0

# These targets are command names, not files; declaring them phony keeps them
# working even if a file with the same name appears in the directory.
.PHONY: all build download install

# Delete a target whose recipe failed, so a truncated download or a
# half-written zhwiki.raw is never mistaken for an up-to-date file.
.DELETE_ON_ERROR:

all: build

build: zhwiki.dict

download: $(FILENAME).gz

# Fetch the compressed list of main-namespace (ns0) page titles.
$(FILENAME).gz:
	wget -O $@ https://dumps.wikimedia.org/zhwiki/$(VERSION)/$(FILENAME).gz

# Decompress to stdout so the recipe writes exactly $@ and keeps the .gz.
$(FILENAME): $(FILENAME).gz
	gzip -dc $< > $@

# Turn the title list into tab-separated "word<TAB>pinyin<TAB>0" lines.
zhwiki.raw: $(FILENAME)
	./convert.py $< > $@

# Compile the raw text dictionary into libime's binary format.
zhwiki.dict: zhwiki.raw
	libime_pinyindict $< $@

# Install into the fcitx5 pinyin dictionary directory; DESTDIR supports
# staged (packaging) installs.
install: zhwiki.dict
	install -Dm644 $< -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/
14 changes: 14 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
zhwiki dictionary for fcitx5-pinyin


Build time requirements:

  Tools:
    python (version 3)
    wget
    gzip
    libime_pinyindict (shipped with libime)

  Python modules:
    opencc
    pypinyin


Installation:

make
sudo make install
29 changes: 29 additions & 0 deletions convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
"""Convert a Wikipedia title list into raw pinyin dictionary lines.

Usage: convert.py TITLES_FILE > zhwiki.raw

Reads TITLES_FILE (one title per line, UTF-8) and, for every title that
consists solely of characters in the range U+4E00..U+9FA5, writes one
tab-separated line to stdout:

    <title passed through OpenCC 't2s.json'> <TAB> <syllables joined by "'"> <TAB> 0

Progress and skipped titles are reported on stderr.
"""
import sys
import re
import opencc
from pypinyin import lazy_pinyin

converter = opencc.OpenCC('t2s.json')

# Fail with a usage message instead of a bare IndexError when the input
# path is missing; extra arguments are still ignored, as before.
if len(sys.argv) < 2:
    sys.exit("usage: convert.py TITLES_FILE")

FILE = sys.argv[1]

# Whole-line match: the title must be made up of Hanzi only.
HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
# Single-character match, used to spot Hanzi left over in pinyin output.
HANZI_CHAR_RE = re.compile('[\u4e00-\u9fa5]')

count = 0
# The dump is UTF-8; state it explicitly rather than relying on the locale's
# default encoding, which would fail on Chinese text under e.g. LANG=C.
with open(FILE, encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if not HANZI_RE.match(line):
            continue

        syllables = lazy_pinyin(line)
        # Characters that could not be converted come back unchanged. Reject
        # the title if any Hanzi survives, not only when the whole title was
        # returned verbatim, so no Hanzi ends up in the pinyin column.
        if any(HANZI_CHAR_RE.search(s) for s in syllables):
            print("Failed to convert, ignoring:", line, file=sys.stderr)
            continue

        pinyin = "'".join(syllables)
        # Third column is the constant "0" (presumably the entry weight
        # expected by libime_pinyindict -- confirm against libime docs).
        print("\t".join((converter.convert(line), pinyin, "0")))
        count += 1
        if count % 1000 == 0:
            print(str(count) + " converted", file=sys.stderr)

# Report the final total unless the last progress line already showed it.
if count % 1000 != 0:
    print(str(count) + " converted", file=sys.stderr)

0 comments on commit 9c3f461

Please sign in to comment.