Skip to content

Commit

Permalink
Make web-slang generation two-pass and reproducible
Browse files Browse the repository at this point in the history
  • Loading branch information
felixonmars committed Jun 4, 2021
1 parent d17d9bc commit e6f81e8
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 38 deletions.
13 changes: 9 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Date stamp of the zhwiki dump; selects the directory on dumps.wikimedia.org
# and is embedded in FILENAME.
VERSION=20210601
# Date stamp used to name the web-slang snapshot files. It is independent of
# VERSION, so the dump and the web-slang snapshot can be bumped separately.
WEB_SLANG_VERSION=20210605
# Base name of the (uncompressed) all-titles dump.
FILENAME=zhwiki-$(VERSION)-all-titles-in-ns0
# Word list written by `zhwiki-web-slang.py --process`.
WEB_SLANG_FILE=web-slang-$(WEB_SLANG_VERSION).txt
# Raw wikitext snapshot written by `zhwiki-web-slang.py --fetch`.
WEB_SLANG_SOURCE=web-slang-$(WEB_SLANG_VERSION).source

# Default goal (first rule in the file): delegates to `build`, which is
# defined outside the part of the Makefile shown here.
all: build

Expand All @@ -11,8 +13,11 @@ download: $(FILENAME).gz
# Download the compressed title dump for $(VERSION).
# The file is fetched under a temporary name and renamed only after wget
# succeeds, so an interrupted download never leaves a truncated target that
# make would treat as up to date.
$(FILENAME).gz:
	wget -O $@.tmp https://dumps.wikimedia.org/zhwiki/$(VERSION)/$(FILENAME).gz
	mv -f $@.tmp $@

# Fetch pass: save the raw wikitext printed by `zhwiki-web-slang.py --fetch`.
# No prerequisites, so an existing snapshot is never re-downloaded.
# Output goes to a temporary file first: a plain `> $@` would create the
# target even when the script fails, hiding the failure from later runs.
$(WEB_SLANG_SOURCE):
	./zhwiki-web-slang.py --fetch > $@.tmp
	mv -f $@.tmp $@

# Process pass: turn the saved wikitext snapshot into the word list.
# `--process` only reads the snapshot file, so this step needs no network.
# Output goes to a temporary file first so a failing script cannot leave an
# empty or partial word list behind as an "up to date" target.
$(WEB_SLANG_FILE): $(WEB_SLANG_SOURCE)
	./zhwiki-web-slang.py --process $< > $@.tmp
	mv -f $@.tmp $@

$(FILENAME): $(FILENAME).gz
gzip -k -d $(FILENAME).gz
Expand Down
93 changes: 59 additions & 34 deletions zhwiki-web-slang.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,46 +5,71 @@
import urllib.parse
import urllib.request
import collections
import sys

def fetch(timeout=60):
    """Download the wikitext of the zhwiki web-slang list page.

    Calls the MediaWiki ``action=parse`` API with ``prop=wikitext`` and
    ``formatversion=2`` and returns the value stored under
    ``["parse"]["wikitext"]`` in the JSON reply (presumably a single string
    with this format version -- confirm against the API documentation).

    Args:
        timeout: Seconds to wait on the network before giving up. The
            previous code had no timeout and could block forever.

    Returns:
        The page's wikitext as decoded from the JSON response.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        KeyError: If the reply lacks the ``parse`` / ``wikitext`` keys
            (for example when the API returns an error object).
    """
    _ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
    _PAGE = "中国大陆网络用语列表"

    # The page title is non-ASCII and must be percent-encoded for the URL.
    url = _ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)
    # Context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        page = response.read()
    return json.loads(page)["parse"]["wikitext"]

def process(wikitext):
    """Extract the slang terms from the list page's wikitext.

    Two kinds of lines are considered:

    * list items (lines starting with ``*``): the text before the first
      colon is taken as the term(s);
    * table rows (lines starting with ``|``): the first cell is taken as
      the term(s).

    A candidate may hold several terms joined by separators; it is split
    recursively until no separator remains.

    Args:
        wikitext: The page source as one string, lines separated by "\n".

    Returns:
        A ``collections.OrderedDict`` whose keys are the unique terms in
        order of first appearance; every value is ``None``.
    """
    words = collections.OrderedDict()

    # "\uff0c" is the full-width comma. The rendered source listed only the
    # ASCII comma, which looks like a width-normalised full-width one, so
    # both are accepted -- NOTE(review): confirm against the repository.
    word_separators = ("、", "/", "|", ",", "\uff0c", "。")
    # Same reasoning for the colon: the rendered source had (":", ":").
    # ASCII stays first so lines containing an ASCII colon split as before.
    list_separators = (":", "\uff1a")

    def add_word(word):
        """Clean one term and record it, skipping descriptions and blanks."""
        if word.startswith("形容"):
            # Looks like a description rather than a term.
            return
        for garbage in ("、", "[", "]", "…"):
            word = word.replace(garbage, "")
        word = word.strip()
        # An empty key would later be printed as a blank dictionary entry.
        if word:
            words[word] = None

    def add_words(word):
        """Split on the first separator present and recurse on each part."""
        for word_separator in word_separators:
            if word_separator in word:
                for part in word.split(word_separator):
                    # Parts may still contain other separators.
                    add_words(part.strip())
                break
        else:
            # No separator left: this is a single term.
            add_word(word)

    for line in wikitext.split("\n"):
        if line.startswith("*"):
            # List item: "* term<colon>explanation".
            for separator in list_separators:
                if separator in line:
                    add_words(line.split(separator)[0].strip("*").strip())
                    break
        elif line.startswith("|"):
            # Table row: the first cell follows the leading "|".
            add_words(line.split("|")[1])

    return words


def print_words(words):
    """Write every word to standard output, one per line.

    Args:
        words: Iterable of terms; for the mapping returned by ``process``
            this iterates its keys in insertion order.
    """
    for word in words:
        print(word)

if __name__ == "__main__":
    # Command-line modes:
    #   (no arguments)    fetch the page and print the word list in one go
    #   --fetch           print the raw wikitext only
    #   --process FILE    print the word list built from a saved wikitext file
    if len(sys.argv) == 1:
        wikitext = fetch()
        words = process(wikitext)
        print_words(words)

    elif sys.argv[1] == "--fetch":
        print(fetch())

    elif sys.argv[1] == "--process":
        if len(sys.argv) < 3:
            # Previously this surfaced as a bare IndexError traceback.
            sys.exit("usage: zhwiki-web-slang.py --process FILE")
        # The default encoding is kept on purpose: FILE is expected to be the
        # redirected output of --fetch, written with the same locale settings.
        with open(sys.argv[2]) as source:
            wikitext = source.read()
        print_words(process(wikitext))

    else:
        raise NotImplementedError("unknown option: " + sys.argv[1])

0 comments on commit e6f81e8

Please sign in to comment.