Add web slang generation

felixonmars · May 26, 2020 · 59ff257 · 59ff257
1 parent 2a41f14
commit 59ff257
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 2 deletions.
diff --git a/Makefile b/Makefile
@@ -9,14 +9,23 @@ download: $(FILENAME).gz
 $(FILENAME).gz:
 	wget https://dumps.wikimedia.org/zhwiki/20200501/$(FILENAME).gz
 
+# Generate the web-slang word list by running zhwiki-web-slang.py, which
+# fetches its data over HTTP. The shell creates the target before the script
+# runs, so drop it again on failure; otherwise an empty file would look
+# up to date on the next run.
+web-slang.source:
+	./zhwiki-web-slang.py > $@ || { rm -f $@; exit 1; }
+
 $(FILENAME): $(FILENAME).gz
 	gzip -k -d $(FILENAME).gz
 
-zhwiki.raw: $(FILENAME)
-	./convert.py $(FILENAME) > zhwiki.raw
+# Concatenate the Wikipedia dump file and the web-slang list into one input.
+zhwiki.source: $(FILENAME) web-slang.source
+	cat $^ > $@
+
+# Run convert.py over the merged input to produce the file that
+# libime_pinyindict consumes.
+zhwiki.raw: zhwiki.source
+	./convert.py $< > $@
 
 zhwiki.dict: zhwiki.raw
 	libime_pinyindict zhwiki.raw zhwiki.dict
 
 install: zhwiki.dict
 	install -Dm644 zhwiki.dict -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/
+
+# Remove every generated file (the downloaded .gz archive is kept).
+# File names are spelled out because brace expansion is a bash feature and
+# recipes run under /bin/sh by default.
+.PHONY: clean
+clean:
+	rm -f $(FILENAME) zhwiki.source zhwiki.raw zhwiki.dict web-slang.source
diff --git a/zhwiki-web-slang.py b/zhwiki-web-slang.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import urllib.parse
+import urllib.request
+
+# MediaWiki "parse" API endpoint returning a page's wikitext as JSON; the
+# URL-quoted page title is appended to it.
+_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
+_PAGE = "中国大陆网络用语列表"
+
+# Use a context manager so the HTTP response is closed once the body is read.
+with urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)) as response:
+    page = response.read()
+wikitext = json.loads(page)["parse"]["wikitext"]
+# Collected terms; a set so that duplicates collapse.
+words = set()
+
+
+def add_word(word):
+    """Clean one candidate term and record it in the module-level ``words`` set.
+
+    Terms starting with "形容" are dropped (presumably explanatory text rather
+    than a term). The characters "、", "[", "]" and "…" are removed and the
+    surrounding whitespace is stripped. Nothing is recorded when the cleaned
+    term is empty, e.g. for the empty piece left by a trailing separator.
+    """
+    if word.startswith("形容"):
+        return
+    for garbage in ("、", "[", "]", "…"):
+        word = word.replace(garbage, "")
+    word = word.strip()
+    # An empty entry would be printed as a blank line in the output.
+    if word:
+        words.add(word)
+
+
+def add_words(word):
+    """Split ``word`` on the first separator it contains and add each piece.
+
+    Separators are tried in a fixed order; pieces are stripped and processed
+    recursively so that mixed separators are all resolved. Text containing no
+    separator is passed to ``add_word`` as a single term.
+    """
+    separators = ("、", "/", "|", "，", "。")
+    found = next((sep for sep in separators if sep in word), None)
+    if found is None:
+        add_word(word)
+        return
+    for piece in word.split(found):
+        add_words(piece.strip())
+
+
+# Wiki-table markup lines (row separator, caption, end of table) hold no term.
+_TABLE_MARKUP_PREFIXES = ("|-", "|+", "|}")
+
+for line in wikitext.split("\n"):
+    if line.startswith("*"):
+        # Bulleted list item: the term is the text before the first colon
+        # (full-width is checked before ASCII); items without a colon are skipped.
+        for table_separator in ("：", ":"):
+            if table_separator in line:
+                word = line.split(table_separator)[0].strip("*").strip()
+                add_words(word)
+                break
+    elif line.startswith("|"):
+        if line.startswith(_TABLE_MARKUP_PREFIXES):
+            # Otherwise "-" or "}" would be recorded as a word.
+            continue
+        # Table row: the term is the first cell.
+        word = line.split("|")[1]
+        add_words(word)
+
+# Sort so that repeated runs emit the words in the same order.
+for word in sorted(words):
+    print(word)