Skip to content

Commit

Permalink
Add web slang generation
Browse files Browse the repository at this point in the history
  • Loading branch information
felixonmars committed May 26, 2020
1 parent 2a41f14 commit 59ff257
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
13 changes: 11 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,23 @@ download: $(FILENAME).gz
# Download the compressed dump from the 20200501 zhwiki snapshot directory.
# The file is fetched under a temporary name and renamed only after wget
# succeeds, so an interrupted download never leaves a truncated archive that
# make would consider up to date.
$(FILENAME).gz:
	wget -O $@.tmp https://dumps.wikimedia.org/zhwiki/20200501/$(FILENAME).gz
	mv -f $@.tmp $@

# Generate the web slang word list by running the scraper script.
# Output is written to a temporary file and renamed on success: with a plain
# `> $@` redirect, a failed run would leave an empty target behind that make
# treats as already built. The script is listed as a prerequisite so the list
# is regenerated when the scraper changes.
web-slang.source: zhwiki-web-slang.py
	./zhwiki-web-slang.py > $@.tmp
	mv -f $@.tmp $@

# Unpack the downloaded archive. -k keeps the .gz file in place, which this
# rule lists as its prerequisite.
$(FILENAME): $(FILENAME).gz
	gzip -k -d $<

# Concatenate the unpacked dump and the scraped web slang list into a single
# input file. Written via a temporary file so a failed `cat` cannot leave a
# partial target behind.
zhwiki.source: $(FILENAME) web-slang.source
	cat $^ > $@.tmp
	mv -f $@.tmp $@

# Run convert.py over the combined input to produce zhwiki.raw.
# This is the only rule for zhwiki.raw: the earlier `zhwiki.raw: $(FILENAME)`
# rule is removed because a target may carry only one recipe (make would warn
# about overriding it while still keeping its prerequisite).
zhwiki.raw: zhwiki.source
	./convert.py $< > $@.tmp
	mv -f $@.tmp $@

# Build the dictionary file: libime_pinyindict takes the raw word list as its
# first argument and the output path as its second.
zhwiki.dict: zhwiki.raw
	libime_pinyindict $< $@

# Install the built dictionary (mode 644) under the fcitx5 pinyin dictionary
# directory, honouring DESTDIR for staged installs.
# Declared phony so that a file named "install" cannot disable the target.
.PHONY: install
install: zhwiki.dict
	install -Dm644 zhwiki.dict -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/

# Remove everything the build generates, keeping the downloaded .gz archive.
# File names are listed explicitly: brace expansion such as
# zhwiki.{source,raw,dict} is a bash feature and is passed through literally by
# a POSIX /bin/sh (make's default shell), so those files would not be removed.
# Declared phony so that a file named "clean" cannot disable the target.
.PHONY: clean
clean:
	rm -f $(FILENAME) zhwiki.source zhwiki.raw zhwiki.dict web-slang.source
49 changes: 49 additions & 0 deletions zhwiki-web-slang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import urllib.parse
import urllib.request

# MediaWiki "parse" API endpoint that returns a page's wikitext as JSON; the
# URL-quoted page title is appended to it.
_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
# Title of the zhwiki page to scrape (a list of mainland-China internet slang).
_PAGE = "中国大陆网络用语列表"

# Fetch the page once at start-up. The context manager closes the HTTP
# response even if reading it fails.
with urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)) as _response:
    page = _response.read()
# The raw wikitext string is read from parse.wikitext in the JSON reply.
wikitext = json.loads(page)["parse"]["wikitext"]
# Collected slang terms; a set, so repeated entries collapse to one.
words = set()


def add_word(word):
    """Clean a single term and add it to the module-level ``words`` set.

    Surrounding whitespace is stripped first, so the "形容" prefix test (used
    to drop descriptive phrases rather than terms) also catches padded input.
    Leftover markup and punctuation characters are removed, and a term that
    ends up empty is discarded instead of being stored as "".

    Args:
        word: Candidate term, already split into a single entry.
    """
    word = word.strip()
    if word.startswith("形容"):
        return
    for garbage in ("、", "[", "]", "…"):
        word = word.replace(garbage, "")
    word = word.strip()
    if word:
        words.add(word)


def add_words(word):
    """Split ``word`` on list punctuation and register every piece.

    The first separator found in the text is used to split it; each piece is
    then processed recursively so that the remaining separators are handled
    too. Text containing no separator is passed to ``add_word`` as one term.

    Args:
        word: Text that may hold one term or several separated terms.
    """
    # "\uff0c" is the full-width comma, listed next to its ASCII counterpart.
    for word_separator in ("、", "/", "|", ",", "\uff0c", "。"):
        if word_separator in word:
            for w in word.split(word_separator):
                # recursively resolve
                add_words(w.strip())
            break
    else:
        # Loop finished without a break: no separator present.
        add_word(word)


# Walk the wikitext line by line and collect terms from list items and from
# the first cell of table rows.
for line in wikitext.split("\n"):
    if line.startswith("*"):
        # Lists: the term is the text before the first colon. Both the ASCII
        # colon and the full-width colon ("\uff1a") are recognised, and the
        # earliest one wins so that a later colon inside the description
        # (e.g. in a URL) does not drag the description into the term.
        positions = [line.find(sep) for sep in (":", "\uff1a")]
        positions = [pos for pos in positions if pos >= 0]
        if positions:
            word = line[:min(positions)].strip("*").strip()
            add_words(word)
    elif line.startswith("|"):
        # Tables: skip structural markup (row separator, table end, caption),
        # which carries no term.
        if line.startswith(("|-", "|}", "|+")):
            continue
        # The line starts with "|", so index 1 is the text of the first cell.
        word = line.split("|")[1]
        add_words(word)

# Emit one term per line, sorted so the output is identical between runs
# (set iteration order is not stable across interpreter invocations).
for word in sorted(words):
    print(word)

0 comments on commit 59ff257

Please sign in to comment.