Skip to content

Commit

Permalink
chore: experiment with loripsum.net (#3169)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S committed May 19, 2024
1 parent a506a9a commit 25dd361
Show file tree
Hide file tree
Showing 5 changed files with 6,871 additions and 2 deletions.
21 changes: 21 additions & 0 deletions dictionaries/lorem-ipsum/loripsum.net/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Words from https://loripsum.net/

<!--- cspell:locale en, la --->

This is an experiment to see how many words https://loripsum.net/ has in its dictionary.

It downloads text samples and accumulates the words they contain into a word frequency table, stored in [./words.txt](./words.txt).

Sample:

```csv
*,5116693
a,30492
ab,16790
abducam,6
abducas,144
abducat,87
abducere,155
```

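The `*` row holds the total number of words counted across all samples; every other row is a word followed by the number of times it was seen.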
6 changes: 6 additions & 0 deletions dictionaries/lorem-ipsum/loripsum.net/cspell.config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# cspell configuration for the loripsum.net experiment directory.
# Pull in the cspell extension configs from the parent directory and from ../../la.
import:
- ../cspell-ext.json
- ../../la/cspell-ext.json
# Locales to check against (presumably lorem-ipsum filler, Latin, and English - confirm).
language: lorem, la, en
# words.txt is the generated word frequency table; keep it out of spell checking.
ignorePaths:
- words.txt
93 changes: 93 additions & 0 deletions dictionaries/lorem-ipsum/loripsum.net/filter.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import fs from 'node:fs';

/**
 * Drive the experiment: perform 1000 sampling passes, one after another,
 * pausing 250 ms after each pass.
 *
 * @returns {Promise<void>} settles once every pass has finished.
 */
async function run() {
  let remaining = 1000;
  while (remaining > 0) {
    await process();
    await waitFor(250);
    remaining -= 1;
  }
}

/**
 * Run one sampling pass: load the frequency table from `./words.txt`, fetch a
 * fresh sample of words, merge it into the table, log before/after statistics,
 * and write the table back to `./words.txt`.
 *
 * @returns {Promise<void>}
 */
async function process() {
  const wordsFile = './words.txt';
  const knownWords = readWordFreq(wordsFile);

  // The `*` entry is the running total, not a word, so it is excluded from the
  // distinct-word count. Checking for its presence (instead of always
  // subtracting 1) keeps the numbers right when the table is still empty.
  const countDistinct = () => knownWords.size - (knownWords.has('*') ? 1 : 0);
  const countTotal = () => knownWords.get('*') || 0;

  const numWords = countDistinct();
  const wordCount = countTotal();

  console.log('Known words:', numWords);
  console.log('Total words:', wordCount);

  const sourceWords = await fetchWords();
  addWordsToFreqMap(knownWords, sourceWords);

  console.log('New words:', countDistinct() - numWords);
  console.log('Added words:', countTotal() - wordCount);

  writeWordFreq(wordsFile, knownWords);
}

/**
 * Read a UTF-8 text file, treating a missing file as "no content yet".
 *
 * @param {string} path - file to read.
 * @returns {string | undefined} the file content, or `undefined` when the file does not exist.
 * @throws {Error} any read failure other than the file being absent.
 */
function readFile(path) {
  try {
    return fs.readFileSync(path, 'utf8');
  } catch (err) {
    // A missing file is the only expected failure (first run). Anything else
    // (permissions, path is a directory, ...) must surface: treating it as
    // "empty" would let the caller overwrite the existing table from scratch.
    if (err?.code === 'ENOENT') return undefined;
    throw err;
  }
}

/**
 * Split text into lowercase words. A word is a maximal run of ASCII letters;
 * every other character acts as a separator and is dropped.
 *
 * @param {string} text - raw text to split.
 * @returns {string[]} the words in order of appearance, lowercased.
 */
function textToWords(text) {
  // Match on the original text first and lowercase afterwards, so only
  // characters that were ASCII letters to begin with can end up in a word.
  const letterRuns = text.match(/[a-zA-Z]+/g) ?? [];
  return letterRuns.map((run) => run.toLowerCase());
}

/**
 * Download one plain-text sample from loripsum.net and split it into words.
 *
 * @returns {Promise<string[]>} the words found in the sample, as produced by `textToWords`.
 * @throws {Error} when the server answers with a non-2xx status.
 */
async function fetchWords() {
  const url = 'https://loripsum.net/api/20/verylong/plaintext';
  const response = await fetch(url);
  if (!response.ok) {
    // An error page is not sample text; counting its words would pollute the
    // frequency table, so fail instead of parsing the body.
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }
  const text = await response.text();
  return textToWords(text);
}

/**
 * Load a word frequency table from a file of `word,count` lines.
 * Blank lines are skipped; a file that cannot be read yields an empty table.
 *
 * @param {string} path - location of the frequency file.
 * @returns {Map<string, number>} map from word to its count.
 */
function readWordFreq(path) {
  const table = new Map();
  const content = readFile(path) || '';
  for (const row of content.split('\n')) {
    if (row.trim() === '') continue;
    const fields = row.split(',');
    table.set(fields[0], Number(fields[1]));
  }
  return table;
}

/**
 * Persist a word frequency table as `word,count` lines, sorted as strings and
 * joined with newlines (no trailing newline).
 *
 * @param {string} path - file to write.
 * @param {Map<string, number>} freqMap - table to serialize.
 */
function writeWordFreq(path, freqMap) {
  const rows = [];
  for (const [word, count] of freqMap) {
    rows.push(`${word},${count}`);
  }
  rows.sort();
  fs.writeFileSync(path, rows.join('\n'));
}

/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>} resolves once the timer fires.
 */
function waitFor(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Count every word of a sample into the frequency table.
 *
 * @param {Map<string, number>} map - table to update in place.
 * @param {Iterable<string>} words - words to add, one count each.
 */
function addWordsToFreqMap(map, words) {
  // Spread first so any iterable of words is accepted, not just arrays.
  [...words].forEach((word) => {
    addWordToFreqMap(map, word);
  });
}

/**
 * Add `n` occurrences of a word to the frequency table and to the running
 * total kept under the `*` key.
 *
 * @param {Map<string, number>} map - table to update in place.
 * @param {string} key - the word being counted.
 * @param {number} [n=1] - number of occurrences to add.
 */
function addWordToFreqMap(map, key, n = 1) {
  // Bump the word's own entry first, then the `*` total, each starting from 0
  // when the entry is absent.
  for (const entry of [key, '*']) {
    map.set(entry, (map.get(entry) || 0) + n);
  }
}

// Entry point: kick off the sampling loop when this module is executed.
// The returned promise is not awaited here.
run();

0 comments on commit 25dd361

Please sign in to comment.