1
1
# -*- coding: utf-8 -*-
2
2
3
- from pyquery import PyQuery
4
- import wapiti
5
- import random
6
3
import os
7
4
from os .path import join as pjoin
5
+ import json
6
+ import random
7
+ from collections import namedtuple
8
+
9
+ from pyquery import PyQuery
10
+
11
+ import wapiti
8
12
from clastic import Application
9
13
from clastic .render .mako_templates import MakoRenderFactory
10
14
15
+
16
# Record describing one Wikipedia language edition. The field order matches
# the positional layout of the rows that load_langs() unpacks into it.
WikiLangInfo = namedtuple(
    'WikiLangInfo',
    ['name', 'en_article_name', 'shortcode',
     'article_count', 'active_user_count', 'depth'])
19
+
20
+
11
21
# Absolute directory containing this module; the paths below hang off it.
_CURDIR = os.path.abspath(os.path.dirname(__file__))
# Directory holding the Mako templates.
_TEMPLATE_PATH = pjoin(_CURDIR, 'templates')
# Static assets live in the 'assets' subdirectory of the template directory.
_STATIC_PATH = pjoin(_TEMPLATE_PATH, 'assets')
14
- _LANG_DICT = {
15
- "English" : "en" ,
16
- "German" : "de" ,
17
- "Dutch" : "nl" ,
18
- "French" : "fr" ,
19
- "Italian" : "it" ,
20
- "Spanish" : "es" ,
21
- "Russian" : "ru" ,
22
- "Swedish" : "sv" ,
23
- "Polish" : "pl" ,
24
- "Japanese" : "ja" ,
25
- "Portuguese" : "pt" ,
26
- "Chinese" : "zh" ,
27
- "Vietnamese" : "vi" ,
28
- "Ukrainian" : "uk" ,
29
- "Catalan" : "ca" ,
30
- "Waray-Waray" : "war" ,
31
- "Cebuano" : "ceb" ,
32
- "Finnish" : "fi" ,
33
- "Persian" : "fa" ,
34
- "Czech" : "cs" ,
35
- "Hungarian" : "hu" ,
36
- "Korean" : "ko" ,
37
- "Arabic" : "ar" ,
38
- "Romanian" : "ro" ,
39
- "Malay" : "ms" ,
40
- "Turkish" : "tr" ,
41
- "Indonesian" : "id" ,
42
- "Kazakh" : "kk" ,
43
- "Serbian" : "sr" ,
44
- "Slovak" : "sk" ,
45
- "Esperanto" : "eo" ,
46
- "Danish" : "da" ,
47
- "Lithuanian" : "lt" ,
48
- "Basque" : "eu" ,
49
- "Bulgarian" : "bg" ,
50
- "Hebrew" : "he" ,
51
- "Croatian" : "hr" ,
52
- "Slovenian" : "sl" ,
53
- "Uzbek" : "uz" ,
54
- "Volap\u00fc k" : "vo" ,
55
- "Estonian" : "et" ,
56
- "Hindi" : "hi" ,
57
- "Norwegian (Nynorsk)" : "nn" ,
58
- "Galician" : "gl" ,
59
- "Simple English " : "simple" ,
60
- "Azerbaijani" : "az" ,
61
- "Latin" : "la" ,
62
- "Greek" : "el" ,
63
- "Serbo- Croatian" : "sh" ,
64
- "Thai" : "th" ,
65
- "Georgian" : "ka" ,
66
- "Macedonian" : "mk" ,
67
- "Occitan" : "oc" ,
68
- "Newar/Nepal Bhasa" : "new" ,
69
- "Piedmontese" : "pms" ,
70
- "Tagalog" : "tl" ,
71
- "Belarusian" : "be" ,
72
- "Tamil" : "ta" ,
73
- "Haitian" : "ht" ,
74
- "Telugu" : "te" ,
75
- "Belarusian (Tara\u0161 kievica)" : "be-x-old" ,
76
- "Welsh" : "cy" ,
77
- "Latvian" : "lv" ,
78
- "Bosnian" : "bs" ,
79
- "Breton" : "br" ,
80
- "Albanian" : "sq" ,
81
- "Armenian" : "hy" ,
82
- "Malagasy" : "mg" ,
83
- "Tatar" : "tt" ,
84
- "Javanese" : "jv" ,
85
- "Marathi" : "mr" ,
86
- "Luxembourgish" : "lb" ,
87
- "Icelandic" : "is" ,
88
- "Burmese" : "my" ,
89
- "Yoruba" : "yo" ,
90
- "Malayalam" : "ml" ,
91
- "Bashkir" : "ba" ,
92
- "Aragonese" : "an" ,
93
- "Lombard" : "lmo" ,
94
- "Afrikaans" : "af" ,
95
- "West Frisian " : "fy" ,
96
- "Western Panjabi " : "pnb" ,
97
- "Bengali" : "bn" ,
98
- "Swahili" : "sw" ,
99
- "Bishnupriya Manipuri " : "bpy" ,
100
- "Ido" : "io" ,
101
- "Kirghiz" : "ky" ,
102
- "Urdu" : "ur" ,
103
- "Nepali" : "ne" ,
104
- "Sicilian" : "scn" ,
105
- "Cantonese" : "zh-yue" ,
106
- "Gujarati" : "gu" ,
107
- "Low Saxon " : "nds" ,
108
- "Irish" : "ga" ,
109
- "Kurdish" : "ku" ,
110
- "Asturian" : "ast" ,
111
- "Quechua" : "qu" ,
112
- "Sundanese" : "su" ,
113
- "Chuvash" : "cv" ,
114
- "Scots" : "sco" ,
115
- "Alemannic" : "als" ,
116
- "Interlingua" : "ia" ,
117
- "Neapolitan" : "nap" ,
118
- "Buginese" : "bug" ,
119
- "Samogitian" : "bat-smg" ,
120
- "Kannada" : "kn" ,
121
- "Banyumasan" : "map-bms" ,
122
- "Walloon" : "wa" ,
123
- "Amharic" : "am" ,
124
- "Sorani" : "ckb" ,
125
- "Scottish Gaelic " : "gd" ,
126
- "Fiji Hindi " : "hif" ,
127
- "Min Nan " : "zh-min-nan" ,
128
- "Tajik" : "tg" ,
129
- "Egyptian Arabic " : "arz" ,
130
- "Mazandarani" : "mzn" ,
131
- "Yiddish" : "yi" ,
132
- "Venetian" : "vec" ,
133
- "Mongolian" : "mn" ,
134
- "Nahuatl" : "nah" ,
135
- "Tarantino" : "roa-tara" ,
136
- "Sakha" : "sah" ,
137
- "Sanskrit" : "sa" ,
138
- "Ossetian" : "os" ,
139
- "Kapampangan" : "pam" ,
140
- "Upper Sorbian " : "hsb" ,
141
- "Sinhalese" : "si" ,
142
- "Northern Sami " : "se" ,
143
- "Bavarian" : "bar" ,
144
- "Limburgish" : "li" ,
145
- "Maori" : "mi" ,
146
- "Corsican" : "co" ,
147
- "Gan" : "gan" ,
148
- "Faroese" : "fo" ,
149
- "Ilokano" : "ilo" ,
150
- "Tibetan" : "bo" ,
151
- "Punjabi" : "pa" ,
152
- "Gilaki" : "glk" ,
153
- "Rusyn" : "rue" ,
154
- "Central Bicolano" : "bcl" ,
155
- "V\ u00f5ro" : "fiu-vro" ,
156
- "Hill Mari " : "mrj" ,
157
- "Dutch Low Saxon" : "nds-nl" ,
158
- "Turkmen" : "tk" ,
159
- "Pashto" : "ps" ,
160
- "West Flemish " : "vls" ,
161
- "Mingrelian" : "xmf" ,
162
- "Manx" : "gv" ,
163
- "Zazaki" : "diq" ,
164
- "Oriya" : "or" ,
165
- "Komi" : "kv" ,
166
- "Pangasinan" : "pag" ,
167
- "Zeelandic" : "zea" ,
168
- "Khmer" : "km" ,
169
- "Divehi" : "dv" ,
170
- "Norman" : "nrm" ,
171
- "Meadow Mari " : "mhr" ,
172
- "Romansh" : "rm" ,
173
- "Komi- Permyak" : "koi" ,
174
- "Udmurt" : "udm" ,
175
- "Kashubian" : "csb" ,
176
- "North Frisian " : "frr" ,
177
- "Vepsian" : "vep" ,
178
- "Ladino" : "lad" ,
179
- "Ligurian" : "lij" ,
180
- "Wu" : "wuu" ,
181
- "Friulian" : "fur" ,
182
- "Classical Chinese " : "zh-classical" ,
183
- "Uyghur" : "ug" ,
184
- "Sardinian" : "sc" ,
185
- "Saterland Frisian " : "stq" ,
186
- "Aymara" : "ay" ,
187
- "Maltese" : "mt" ,
188
- "Pali" : "pi" ,
189
- "Somali" : "so" ,
190
- "Bihari" : "bh" ,
191
- "Ripuarian" : "ksh" ,
192
- "Novial" : "nov" ,
193
- "Anglo- Saxon" : "ang" ,
194
- "Hakka" : "hak" ,
195
- "Cornish" : "kw" ,
196
- "Navajo" : "nv" ,
197
- "Picard" : "pcd" ,
198
- "Guarani" : "gn" ,
199
- "Extremaduran" : "ext" ,
200
- "Assamese" : "as" ,
201
- "Silesian" : "szl" ,
202
- "Gagauz" : "gag" ,
203
- "Emilian- Romagnol" : "eml" ,
204
- "Interlingue" : "ie" ,
205
- "Lingala" : "ln" ,
206
- "Acehnese" : "ace" ,
207
- "Chechen" : "ce" ,
208
- "Karachay- Balkar" : "krc" ,
209
- "Palatinate German " : "pfl" ,
210
- "Kalmyk" : "xal" ,
211
- "Hawaiian" : "haw" ,
212
- "Pennsylvania German " : "pdc" ,
213
- "Kinyarwanda" : "rw" ,
214
- "Crimean Tatar " : "crh" ,
215
- "Tongan" : "to" ,
216
- "Lower Sorbian " : "dsb" ,
217
- "Greenlandic" : "kl" ,
218
- "Aramaic" : "arc" ,
219
- "Erzya" : "myv" ,
220
- "Kabyle" : "kab" ,
221
- "Lezgian" : "lez" ,
222
- "Banjar" : "bjn" ,
223
- "Shona" : "sn" ,
224
- "Papiamentu" : "pap" ,
225
- "Tok Pisin " : "tpi" ,
226
- "Lak" : "lbe" ,
227
- "Wolof" : "wo" ,
228
- "Lojban" : "jbo" ,
229
- "Moksha" : "mdf" ,
230
- "Zamboanga Chavacano " : "cbk-zam" ,
231
- "Avar" : "av" ,
232
- "Kabardian Circassian " : "kbd" ,
233
- "Sranan" : "srn" ,
234
- "Mirandese" : "mwl" ,
235
- "Tahitian" : "ty" ,
236
- "Lao" : "lo" ,
237
- "Abkhazian" : "ab" ,
238
- "Tetum" : "tet" ,
239
- "Latgalian" : "ltg" ,
240
- "Kongo" : "kg" ,
241
- "Nauruan" : "na" ,
242
- "Igbo" : "ig" ,
243
- "Buryat (Russia)" : "bxr" ,
244
- "Northern Sotho " : "nso" ,
245
- "Zhuang" : "za" ,
246
- "Karakalpak" : "kaa" ,
247
- "Zulu" : "zu" ,
248
- "Cheyenne" : "chy" ,
249
- "Romani" : "rmy" ,
250
- "Old Church Slavonic" : "cu" ,
251
- "Aromanian" : "roa-rup" ,
252
- "Tswana" : "tn" ,
253
- "Cherokee" : "chr" ,
254
- "Bislama" : "bi" ,
255
- "Min Dong " : "cdo" ,
256
- "Gothic" : "got" ,
257
- "Samoan" : "sm" ,
258
- "Moldovan" : "mo" ,
259
- "Bambara" : "bm" ,
260
- "Inuktitut" : "iu" ,
261
- "Norfolk" : "pih" ,
262
- "Pontic" : "pnt" ,
263
- "Sindhi" : "sd" ,
264
- "Swati" : "ss" ,
265
- "Kikuyu" : "ki" ,
266
- "Ewe" : "ee" ,
267
- "Hausa" : "ha" ,
268
- "Oromo" : "om" ,
269
- "Fijian" : "fj" ,
270
- "Tigrinya" : "ti" ,
271
- "Tsonga" : "ts" ,
272
- "Kashmiri" : "ks" ,
273
- "Venda" : "ve" ,
274
- "Sango" : "sg" ,
275
- "Kirundi" : "rn" ,
276
- "Sesotho" : "st" ,
277
- "Dzongkha" : "dz" ,
278
- "Akan" : "ak" ,
279
- "Cree" : "cr" ,
280
- "Tumbuka" : "tum" ,
281
- "Luganda" : "lg" ,
282
- "Inupiak" : "ik" ,
283
- "Fula" : "ff" ,
284
- "Chichewa" : "ny" ,
285
- "Twi" : "tw" ,
286
- "Chamorro" : "ch" ,
287
- "Xhosa" : "xh" ,
288
- "Ndonga" : "ng" ,
289
- "Sichuan Yi " : "ii" ,
290
- "Choctaw" : "cho" ,
291
- "Marshallese" : "mh" ,
292
- "Afar" : "aa" ,
293
- "Kuanyama" : "kj" ,
294
- "Hiri Motu " : "ho" ,
295
- "Muscogee" : "mus" ,
296
- "Kanuri" : "kr" ,
297
- "Herero" : "hz" }
24
+
25
+
26
def load_langs():
    """
    Build the language lookup table from ``wikis.json``, which is read
    from the directory containing this module.

    File format:
    [[u'English', u'English language', u'en', 4234378, 129657, 763], ...]

    Each row holds, in order: language name, English Wikipedia article,
    shortcode, number of articles, number of active users, depth metric.

    Returns a dict mapping each language name to the WikiLangInfo built
    from its row.
    """
    wikis_path = pjoin(_CURDIR, 'wikis.json')
    with open(wikis_path) as wikis_file:
        rows = json.load(wikis_file)
    # Rows are unpacked positionally; if a name repeats, the later row wins.
    records = (WikiLangInfo(*row) for row in rows)
    return dict((record.name, record) for record in records)
41
+
42
+
43
+ _LANG_DICT = load_langs ()
298
44
299
45
300
46
def get_text (element ):
@@ -315,8 +61,9 @@ def get_sample(page_text):
315
61
316
62
def get_random_page ():
317
63
lang = random .choice (_LANG_DICT .keys ())
64
+ lang_info = _LANG_DICT [lang ]
318
65
#lang = 'Spanish' # a good way to debug
319
- lang_url = 'http://%s.wikipedia.org/' % (_LANG_DICT [ lang ] ,)
66
+ lang_url = 'http://%s.wikipedia.org/' % (lang_info . shortcode ,)
320
67
lang_api_url = pjoin (lang_url , 'w/api.php' )
321
68
wc = wapiti .WapitiClient ('languagegame@hatnote.com' ,
322
69
api_url = lang_api_url )
@@ -345,12 +92,13 @@ def language_game(attempt=0):
345
92
return language_game (attempt = attempt + 1 )
346
93
page_text = get_text (sample_p )
347
94
sample = get_sample (page_text )
95
+ correct_info = _LANG_DICT [correct ]._asdict ()
348
96
ret = {
349
97
'correct' : correct ,
98
+ 'correct_info' : correct_info ,
350
99
'choices' : choices ,
351
100
'sample' : sample ,
352
- 'title' : title
353
- }
101
+ 'title' : title }
354
102
return ret
355
103
356
104
0 commit comments