Skip to content

Commit 4a48fd3

Browse files
committed
Merge pull request #4 from mahmoud/master
data-driven wikipedia infos (and accompanying processing messcript)
2 parents 6dd91e9 + f7c4950 commit 4a48fd3

File tree

4 files changed

+2383
-292
lines changed

4 files changed

+2383
-292
lines changed

.gitignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ dist/*
1010
._*
1111
.\#*
1212
\#*\#
13-
*.csv
14-
*.json
1513
log*
1614
build/*
1715
*.swp

guess_server.py

Lines changed: 38 additions & 290 deletions
Original file line numberDiff line numberDiff line change
@@ -1,300 +1,46 @@
11
# -*- coding: utf-8 -*-
22

3-
from pyquery import PyQuery
4-
import wapiti
5-
import random
63
import os
74
from os.path import join as pjoin
5+
import json
6+
import random
7+
from collections import namedtuple
8+
9+
from pyquery import PyQuery
10+
11+
import wapiti
812
from clastic import Application
913
from clastic.render.mako_templates import MakoRenderFactory
1014

15+
16+
# Record describing one Wikipedia language edition. Field order matches the
# positional layout of each row in the wikis.json data file.
_WIKI_LANG_FIELDS = ('name',
                     'en_article_name',
                     'shortcode',
                     'article_count',
                     'active_user_count',
                     'depth')
WikiLangInfo = namedtuple('WikiLangInfo', _WIKI_LANG_FIELDS)
19+
20+
1121
_CURDIR = os.path.abspath(os.path.dirname(__file__))
1222
_TEMPLATE_PATH = pjoin(_CURDIR, 'templates')
1323
_STATIC_PATH = pjoin(_CURDIR, 'templates', 'assets')
14-
_LANG_DICT = {
15-
"English": "en",
16-
"German": "de",
17-
"Dutch": "nl",
18-
"French": "fr",
19-
"Italian": "it",
20-
"Spanish": "es",
21-
"Russian": "ru",
22-
"Swedish": "sv",
23-
"Polish": "pl",
24-
"Japanese": "ja",
25-
"Portuguese": "pt",
26-
"Chinese": "zh",
27-
"Vietnamese": "vi",
28-
"Ukrainian": "uk",
29-
"Catalan": "ca",
30-
"Waray-Waray": "war",
31-
"Cebuano": "ceb",
32-
"Finnish": "fi",
33-
"Persian": "fa",
34-
"Czech": "cs",
35-
"Hungarian": "hu",
36-
"Korean": "ko",
37-
"Arabic": "ar",
38-
"Romanian": "ro",
39-
"Malay": "ms",
40-
"Turkish": "tr",
41-
"Indonesian": "id",
42-
"Kazakh": "kk",
43-
"Serbian": "sr",
44-
"Slovak": "sk",
45-
"Esperanto": "eo",
46-
"Danish": "da",
47-
"Lithuanian": "lt",
48-
"Basque": "eu",
49-
"Bulgarian": "bg",
50-
"Hebrew": "he",
51-
"Croatian": "hr",
52-
"Slovenian": "sl",
53-
"Uzbek": "uz",
54-
"Volap\u00fck": "vo",
55-
"Estonian": "et",
56-
"Hindi": "hi",
57-
"Norwegian (Nynorsk)": "nn",
58-
"Galician": "gl",
59-
"Simple English ": "simple",
60-
"Azerbaijani": "az",
61-
"Latin": "la",
62-
"Greek": "el",
63-
"Serbo- Croatian": "sh",
64-
"Thai": "th",
65-
"Georgian": "ka",
66-
"Macedonian": "mk",
67-
"Occitan": "oc",
68-
"Newar/Nepal Bhasa": "new",
69-
"Piedmontese": "pms",
70-
"Tagalog": "tl",
71-
"Belarusian": "be",
72-
"Tamil": "ta",
73-
"Haitian": "ht",
74-
"Telugu": "te",
75-
"Belarusian (Tara\u0161kievica)": "be-x-old",
76-
"Welsh": "cy",
77-
"Latvian": "lv",
78-
"Bosnian": "bs",
79-
"Breton": "br",
80-
"Albanian": "sq",
81-
"Armenian": "hy",
82-
"Malagasy": "mg",
83-
"Tatar": "tt",
84-
"Javanese": "jv",
85-
"Marathi": "mr",
86-
"Luxembourgish": "lb",
87-
"Icelandic": "is",
88-
"Burmese": "my",
89-
"Yoruba": "yo",
90-
"Malayalam": "ml",
91-
"Bashkir": "ba",
92-
"Aragonese": "an",
93-
"Lombard": "lmo",
94-
"Afrikaans": "af",
95-
"West Frisian ": "fy",
96-
"Western Panjabi ": "pnb",
97-
"Bengali": "bn",
98-
"Swahili": "sw",
99-
"Bishnupriya Manipuri ": "bpy",
100-
"Ido": "io",
101-
"Kirghiz": "ky",
102-
"Urdu": "ur",
103-
"Nepali": "ne",
104-
"Sicilian": "scn",
105-
"Cantonese": "zh-yue",
106-
"Gujarati": "gu",
107-
"Low Saxon ": "nds",
108-
"Irish": "ga",
109-
"Kurdish": "ku",
110-
"Asturian": "ast",
111-
"Quechua": "qu",
112-
"Sundanese": "su",
113-
"Chuvash": "cv",
114-
"Scots": "sco",
115-
"Alemannic": "als",
116-
"Interlingua": "ia",
117-
"Neapolitan": "nap",
118-
"Buginese": "bug",
119-
"Samogitian": "bat-smg",
120-
"Kannada": "kn",
121-
"Banyumasan": "map-bms",
122-
"Walloon": "wa",
123-
"Amharic": "am",
124-
"Sorani": "ckb",
125-
"Scottish Gaelic ": "gd",
126-
"Fiji Hindi ": "hif",
127-
"Min Nan ": "zh-min-nan",
128-
"Tajik": "tg",
129-
"Egyptian Arabic ": "arz",
130-
"Mazandarani": "mzn",
131-
"Yiddish": "yi",
132-
"Venetian": "vec",
133-
"Mongolian": "mn",
134-
"Nahuatl": "nah",
135-
"Tarantino": "roa-tara",
136-
"Sakha": "sah",
137-
"Sanskrit": "sa",
138-
"Ossetian": "os",
139-
"Kapampangan": "pam",
140-
"Upper Sorbian ": "hsb",
141-
"Sinhalese": "si",
142-
"Northern Sami ": "se",
143-
"Bavarian": "bar",
144-
"Limburgish": "li",
145-
"Maori": "mi",
146-
"Corsican": "co",
147-
"Gan": "gan",
148-
"Faroese": "fo",
149-
"Ilokano": "ilo",
150-
"Tibetan": "bo",
151-
"Punjabi": "pa",
152-
"Gilaki": "glk",
153-
"Rusyn": "rue",
154-
"Central Bicolano": "bcl",
155-
"V\ u00f5ro": "fiu-vro",
156-
"Hill Mari ": "mrj",
157-
"Dutch Low Saxon": "nds-nl",
158-
"Turkmen": "tk",
159-
"Pashto": "ps",
160-
"West Flemish ": "vls",
161-
"Mingrelian": "xmf",
162-
"Manx": "gv",
163-
"Zazaki": "diq",
164-
"Oriya": "or",
165-
"Komi": "kv",
166-
"Pangasinan": "pag",
167-
"Zeelandic": "zea",
168-
"Khmer": "km",
169-
"Divehi": "dv",
170-
"Norman": "nrm",
171-
"Meadow Mari ": "mhr",
172-
"Romansh": "rm",
173-
"Komi- Permyak": "koi",
174-
"Udmurt": "udm",
175-
"Kashubian": "csb",
176-
"North Frisian ": "frr",
177-
"Vepsian": "vep",
178-
"Ladino": "lad",
179-
"Ligurian": "lij",
180-
"Wu": "wuu",
181-
"Friulian": "fur",
182-
"Classical Chinese ": "zh-classical",
183-
"Uyghur": "ug",
184-
"Sardinian": "sc",
185-
"Saterland Frisian ": "stq",
186-
"Aymara": "ay",
187-
"Maltese": "mt",
188-
"Pali": "pi",
189-
"Somali": "so",
190-
"Bihari": "bh",
191-
"Ripuarian": "ksh",
192-
"Novial": "nov",
193-
"Anglo- Saxon": "ang",
194-
"Hakka": "hak",
195-
"Cornish": "kw",
196-
"Navajo": "nv",
197-
"Picard": "pcd",
198-
"Guarani": "gn",
199-
"Extremaduran": "ext",
200-
"Assamese": "as",
201-
"Silesian": "szl",
202-
"Gagauz": "gag",
203-
"Emilian- Romagnol": "eml",
204-
"Interlingue": "ie",
205-
"Lingala": "ln",
206-
"Acehnese": "ace",
207-
"Chechen": "ce",
208-
"Karachay- Balkar": "krc",
209-
"Palatinate German ": "pfl",
210-
"Kalmyk": "xal",
211-
"Hawaiian": "haw",
212-
"Pennsylvania German ": "pdc",
213-
"Kinyarwanda": "rw",
214-
"Crimean Tatar ": "crh",
215-
"Tongan": "to",
216-
"Lower Sorbian ": "dsb",
217-
"Greenlandic": "kl",
218-
"Aramaic": "arc",
219-
"Erzya": "myv",
220-
"Kabyle": "kab",
221-
"Lezgian": "lez",
222-
"Banjar": "bjn",
223-
"Shona": "sn",
224-
"Papiamentu": "pap",
225-
"Tok Pisin ": "tpi",
226-
"Lak": "lbe",
227-
"Wolof": "wo",
228-
"Lojban": "jbo",
229-
"Moksha": "mdf",
230-
"Zamboanga Chavacano ": "cbk-zam",
231-
"Avar": "av",
232-
"Kabardian Circassian ": "kbd",
233-
"Sranan": "srn",
234-
"Mirandese": "mwl",
235-
"Tahitian": "ty",
236-
"Lao": "lo",
237-
"Abkhazian": "ab",
238-
"Tetum": "tet",
239-
"Latgalian": "ltg",
240-
"Kongo": "kg",
241-
"Nauruan": "na",
242-
"Igbo": "ig",
243-
"Buryat (Russia)": "bxr",
244-
"Northern Sotho ": "nso",
245-
"Zhuang": "za",
246-
"Karakalpak": "kaa",
247-
"Zulu": "zu",
248-
"Cheyenne": "chy",
249-
"Romani": "rmy",
250-
"Old Church Slavonic": "cu",
251-
"Aromanian": "roa-rup",
252-
"Tswana": "tn",
253-
"Cherokee": "chr",
254-
"Bislama": "bi",
255-
"Min Dong ": "cdo",
256-
"Gothic": "got",
257-
"Samoan": "sm",
258-
"Moldovan": "mo",
259-
"Bambara": "bm",
260-
"Inuktitut": "iu",
261-
"Norfolk": "pih",
262-
"Pontic": "pnt",
263-
"Sindhi": "sd",
264-
"Swati": "ss",
265-
"Kikuyu": "ki",
266-
"Ewe": "ee",
267-
"Hausa": "ha",
268-
"Oromo": "om",
269-
"Fijian": "fj",
270-
"Tigrinya": "ti",
271-
"Tsonga": "ts",
272-
"Kashmiri": "ks",
273-
"Venda": "ve",
274-
"Sango": "sg",
275-
"Kirundi": "rn",
276-
"Sesotho": "st",
277-
"Dzongkha": "dz",
278-
"Akan": "ak",
279-
"Cree": "cr",
280-
"Tumbuka": "tum",
281-
"Luganda": "lg",
282-
"Inupiak": "ik",
283-
"Fula": "ff",
284-
"Chichewa": "ny",
285-
"Twi": "tw",
286-
"Chamorro": "ch",
287-
"Xhosa": "xh",
288-
"Ndonga": "ng",
289-
"Sichuan Yi ": "ii",
290-
"Choctaw": "cho",
291-
"Marshallese": "mh",
292-
"Afar": "aa",
293-
"Kuanyama": "kj",
294-
"Hiri Motu ": "ho",
295-
"Muscogee": "mus",
296-
"Kanuri": "kr",
297-
"Herero": "hz"}
24+
25+
26+
def load_langs(path=None):
    """
    Load Wikipedia language metadata from a JSON file.

    File format:
    [[u'English', u'English language', u'en', 4234378, 129657, 763], ...]

    (Language name, English Wikipedia article, shortcode, number of articles,
    number of active users, depth metric)

    ``path`` is the JSON file to read; when omitted, ``wikis.json`` in the
    same directory as this module is used.

    Returns a dict mapping each language name to its ``WikiLangInfo``. If
    the same name appears in more than one row, the last row wins.
    """
    if path is None:
        path = pjoin(_CURDIR, 'wikis.json')
    with open(path) as f:
        wiki_lists = json.load(f)
    # Each row is unpacked positionally, so it must carry exactly the
    # WikiLangInfo fields in order; a malformed row raises TypeError here.
    infos = (WikiLangInfo(*wiki) for wiki in wiki_lists)
    return {wli.name: wli for wli in infos}
41+
42+
43+
_LANG_DICT = load_langs()
29844

29945

30046
def get_text(element):
@@ -315,8 +61,9 @@ def get_sample(page_text):
31561

31662
def get_random_page():
31763
lang = random.choice(_LANG_DICT.keys())
64+
lang_info = _LANG_DICT[lang]
31865
#lang = 'Spanish' # a good way to debug
319-
lang_url = 'http://%s.wikipedia.org/' % (_LANG_DICT[lang],)
66+
lang_url = 'http://%s.wikipedia.org/' % (lang_info.shortcode,)
32067
lang_api_url = pjoin(lang_url, 'w/api.php')
32168
wc = wapiti.WapitiClient('languagegame@hatnote.com',
32269
api_url=lang_api_url)
@@ -345,12 +92,13 @@ def language_game(attempt=0):
34592
return language_game(attempt=attempt + 1)
34693
page_text = get_text(sample_p)
34794
sample = get_sample(page_text)
95+
correct_info = _LANG_DICT[correct]._asdict()
34896
ret = {
34997
'correct': correct,
98+
'correct_info': correct_info,
35099
'choices': choices,
351100
'sample': sample,
352-
'title': title
353-
}
101+
'title': title}
354102
return ret
355103

356104

0 commit comments

Comments
 (0)