|
| 1 | +""" |
| 2 | +LICENSE |
| 3 | +
|
| 4 | + pysrd - Python scripts for working with the DND35 OGL SRD. |
| 5 | + Copyright (C) 2012, 2013 Richard Tew |
| 6 | +
|
| 7 | + This program is free software: you can redistribute it and/or modify |
| 8 | + it under the terms of the GNU General Public License as published by |
| 9 | + the Free Software Foundation, either version 3 of the License, or |
| 10 | + (at your option) any later version. |
| 11 | +
|
| 12 | + This program is distributed in the hope that it will be useful, |
| 13 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | + GNU General Public License for more details. |
| 16 | +
|
| 17 | + You should have received a copy of the GNU General Public License |
| 18 | + along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | +
|
| 20 | +OVERVIEW |
| 21 | +
|
| 22 | + The SQLite database provided by highmage, and available here (http://www.andargor.com/) lacks |
| 23 | + some data from the SRD. This script parses additional SRD information from Josh Ritter's |
| 24 | + OpenSRD (http://sourceforge.net/projects/opensrd) HTML files. |
| 25 | +
|
| 26 | + You will need to ensure the following variables have the correct values: |
| 27 | + DATABASE_FILENAME: Name of a local file containing the SQLite database created by highmage. |
| 28 | + HTML_DIR_NAME: Name of local directory immediately containing the OpenSRD html files. |
| 29 | +""" |
| 30 | + |
| 31 | +import bs4 # c:\python27\Scripts\pip.exe install beautifulsoup4 |
| 32 | +import os |
| 33 | +import re |
| 34 | +import sys |
| 35 | +import StringIO |
| 36 | +import sqlite3 |
| 37 | + |
| 38 | + |
# Name of the local SQLite database file (the one created by highmage).
DATABASE_FILENAME = "dnd35.sqlite"
# Name of the local directory that directly contains the OpenSRD html files.
HTML_DIR_NAME = "SRD-html"
| 41 | + |
| 42 | + |
# Taken from the Python wiki.
# Maps a single character to the HTML entity that replaces it.
html_escape_table = {
    # "&": "&amp;",  (deliberately left disabled, so "&" is never rewritten)
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
}


def html_escape(text):
    """Produce entities within text.

    Every character of `text` that has an entry in `html_escape_table` is
    replaced by that entry; all other characters (including "&", whose entry
    is commented out) are copied through unchanged.

    Returns the escaped text as a new string.
    """
    return "".join(html_escape_table.get(c, c) for c in text)
| 54 | + |
| 55 | + |
def escape_children(v):
    """Escape, in place, the text held beneath the bs4 tag `v`.

    - A child that is not a tag (a text node) is replaced by the result of
      html_escape() on it.
    - An "a" tag whose `.string` is None is replaced by an empty string;
      otherwise its string is escaped.
    - Tags are then processed recursively.

    Nothing is returned; the tree under `v` is modified.
    """
    # Walk a snapshot of the children: replace_with() edits v.contents, and
    # mutating a list while iterating over it is fragile.
    for child in list(v.contents):
        if not isinstance(child, bs4.Tag):
            child.replace_with(html_escape(child))
            continue
        if child.name == "a":
            if child.string is None:
                child.replace_with("")
                continue
            child.string = html_escape(child.string)
        escape_children(child)
| 67 | + |
| 68 | + |
| 69 | +##### |
| 70 | + |
def parse_special_abilities(cb):
    """Parse the special ability entries from abilitiesAndConditions.html.

    Scanning starts at the first h5 in the document body and follows its
    siblings.  Each h5 starts a new entry named after the heading text; every
    other sibling tag has its "class" attribute removed and its prettified
    markup appended to the entry text.  Scanning stops at the first h3
    sibling.  Text nodes between the tags are ignored.

    For every entry `cb(name=..., fulltext=...)` is called.  A heading that
    joins several abilities with the word "and" produces one call per
    ability, all sharing the same text.

    The file is looked up beneath the module global `html_path`, which this
    file only assigns inside its `__main__` block.
    """
    def commit(title, text):
        # Split on the whole word "and" only, so a name that merely contains
        # the letters "and" is not cut in two.
        for part in re.split(r"\band\b", title):
            cb(name=part.strip().capitalize(), fulltext=text)

    file_path = os.path.join(html_path, "abilitiesAndConditions.html")
    with open(file_path, "r") as f:
        soup = bs4.BeautifulSoup(f)

    name = ""
    parts = []
    v = soup.body.h5
    # Compare against None rather than relying on truthiness: an empty text
    # node is falsy and would otherwise end the scan early.
    while v is not None:
        if isinstance(v, bs4.Tag):
            if v.name == "h3":
                break
            if v.name == "h5":
                # Commit any current entry, then start the next one.
                if name:
                    commit(name, "".join(parts))
                name = v.get_text().lower()
                parts = []
            else:
                v.attrs.pop("class", None)
                parts.append(v.prettify())
        v = v.next_sibling

    # Commit the final entry.
    if name:
        commit(name, "".join(parts))
| 102 | + |
| 103 | + |
def parse_conditions(cb):
    """Parse the condition entries from abilitiesAndConditions.html.

    The h3 found after the first h3 of the body must read "CONDITIONS".
    Every p element after it that has no "class" attribute is then visited:
    a paragraph containing a b element starts a new entry (named after the
    bold text, lower-cased then capitalised) and is passed through
    escape_children(); a paragraph without one is appended to the current
    entry's text as-is.

    For every entry `cb(name=..., fulltext=...)` is called.

    Raises:
        Exception: if the CONDITIONS heading cannot be found.

    The file is looked up beneath the module global `html_path`, which this
    file only assigns inside its `__main__` block.
    """
    file_path = os.path.join(html_path, "abilitiesAndConditions.html")
    with open(file_path, "r") as f:
        soup = bs4.BeautifulSoup(f)

    first_h3 = soup.body.h3
    v = first_h3.find_next("h3") if first_h3 is not None else None
    if v is None or v.get_text() != "CONDITIONS":
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("unable to find CONDITIONS H3 tag")
    v = v.find_next("p")

    name = ""
    fulltext = ""
    while v is not None:
        if "class" not in v.attrs:
            b = v.find("b")
            if b is None:
                # NOTE(review): continuation paragraphs are not passed through
                # escape_children(), unlike the paragraph that starts an
                # entry - confirm that this is intended.
                fulltext += v.prettify()
            else:
                # Commit any current entry, then start the next one.
                if name:
                    cb(name=name, fulltext=fulltext)
                name = b.get_text().lower().capitalize()
                escape_children(v)
                fulltext = v.prettify()
        v = v.find_next("p")

    # Commit the final entry.
    if name:
        cb(name=name, fulltext=fulltext)
| 132 | + |
def parse_abilities(cb):
    """Parse the ability entries from basics.html.

    The h3 reading "THE ABILITIES" is located (the search starts at the h3
    after the first one in the body), then scanning begins at the next h5 and
    follows its siblings.  Each h5 heading is expected to look like
    "name (shortname)"; it is lower-cased before matching, so the short name
    is lower case and the name is capitalised.  Every other sibling tag has
    its "class" attribute removed and its prettified markup appended to the
    entry text.  Scanning stops at the first h3 sibling.

    For every entry `cb(name=..., shortname=..., fulltext=...)` is called.

    Raises:
        Exception: if the heading cannot be found, or an h5 title does not
            have the expected "name (shortname)" form.

    The file is looked up beneath the module global `html_path`, which this
    file only assigns inside its `__main__` block.
    """
    # Raw string, so the backslashes reach the regex engine untouched.
    title_re = re.compile(r"([a-zA-Z]+)[ ]+\(([a-zA-Z]+)\)")

    file_path = os.path.join(html_path, "basics.html")
    with open(file_path, "r") as f:
        soup = bs4.BeautifulSoup(f)

    first_h3 = soup.body.h3
    v = first_h3.find_next("h3") if first_h3 is not None else None
    while v is not None and v.get_text() != "THE ABILITIES":
        v = v.find_next("h3")
    if v is None:
        raise Exception("unable to find THE ABILITIES H3 tag")

    v = v.find_next("h5")
    name = ""
    shortname = ""
    parts = []
    while v is not None:
        if isinstance(v, bs4.Tag):
            if v.name == "h3":
                break
            if v.name == "h5":
                # Commit any current entry, then start the next one.
                if name:
                    cb(name=name, shortname=shortname, fulltext="".join(parts))
                title = v.get_text().lower()
                m = title_re.match(title)
                if m is None:
                    raise Exception("unexpected ability heading %r" % title)
                name = m.group(1).capitalize()
                shortname = m.group(2)
                parts = []
            else:
                v.attrs.pop("class", None)
                parts.append(v.prettify())
        v = v.next_sibling

    # Commit the final entry.
    if name:
        cb(name=name, shortname=shortname, fulltext="".join(parts))
| 170 | + |
def parse_abilities_table(cb):
    """Parse the table that follows the "ABILITY MODIFIERS" h5 in basics.html.

    Rows are read starting at the first tr after the heading and following
    its siblings.  The last row containing header cells supplies the column
    names; a data row is kept only when it yields as many values as there are
    column names.

    The first table column holds either a single score or a "min-max" range
    and is stored as two database columns, "<name>_min" and "<name>_max".
    Any other column whose name starts with digits is stored as
    "level_<digits>"; remaining columns keep their (lower-cased) header text.
    All columns are declared INTEGER.

    For every kept row `cb(**values)` is called, with the extra keyword
    `column_types_list` carrying the (column name, type) pairs.

    Raises:
        Exception: if the ABILITY MODIFIERS heading cannot be found.

    The file is looked up beneath the module global `html_path`, which this
    file only assigns inside its `__main__` block.
    """
    file_path = os.path.join(html_path, "basics.html")
    with open(file_path, "r") as f:
        soup = bs4.BeautifulSoup(f)

    heading = soup.body.h5
    while heading is not None and heading.get_text() != "ABILITY MODIFIERS":
        heading = heading.find_next("h5")
    if heading is None:
        raise Exception("unable to find ABILITY MODIFIERS H5 tag")

    tr_column_names = []
    tr_lines = []
    e = heading.find_next("tr")
    while e is not None:
        if isinstance(e, bs4.Tag) and e.name == "tr":
            if e.th is not None:
                # The last row with header cells is considered the right one.
                tr_column_names = _abilities_header_names(e.th)
            elif e.td is not None:
                line = _abilities_row_values(e.td, len(tr_column_names))
                if len(line) == len(tr_column_names):
                    tr_lines.append(line)
        e = e.next_sibling

    # Translate the table column names to database column names.
    db_column_names = []
    for i, column_name in enumerate(tr_column_names):
        if i == 0:
            db_column_names.append(column_name + "_min")
            db_column_names.append(column_name + "_max")
        else:
            # Use every leading digit, so that a two-digit level does not
            # collapse onto its first digit.
            m = re.match(r"\d+", column_name)
            db_column_names.append(("level_" + m.group(0)) if m else column_name)
    column_types_list = [(name, "INTEGER") for name in db_column_names]

    # Split the first column of every row into its minimum and maximum.
    db_lines = []
    for tr_line in tr_lines:
        score_range = tr_line[0]
        if isinstance(score_range, int):
            score_min = score_max = score_range
        else:
            score_min, score_max = [int(part) for part in score_range.split("-")]
        db_lines.append([score_min, score_max] + tr_line[1:])

    for db_line in db_lines:
        kwargs = dict(zip(db_column_names, db_line))
        kwargs["column_types_list"] = column_types_list
        cb(**kwargs)


def _abilities_header_names(th):
    """Return the lower-cased header texts of `th` and its sibling tags.

    A cell with a colspan attribute contributes that many "?" placeholders.
    """
    names = []
    while th is not None:
        if isinstance(th, bs4.Tag):
            if "colspan" in th.attrs:
                names.extend("?" for _ in range(int(th.attrs["colspan"])))
            else:
                names.append(th.get_text().lower())
        th = th.next_sibling
    return names


def _abilities_row_values(td, column_count):
    """Return the values of `td` and its sibling tags.

    A cell with a colspan attribute contributes that many "NULL" strings,
    unless it spans exactly `column_count` columns, in which case it adds
    nothing.  An em dash becomes 0, text that parses as an integer becomes
    that integer, and anything else is kept as text.
    """
    line = []
    while td is not None:
        if isinstance(td, bs4.Tag):
            if "colspan" in td.attrs:
                colspan = int(td.attrs["colspan"])
                if colspan != column_count:
                    line.extend("NULL" for _ in range(colspan))
            else:
                value = td.get_text()
                if value == u'\u2014':  # em dash
                    value = 0
                else:
                    try:
                        value = int(value)
                    except ValueError:
                        pass
                line.append(value)
        td = td.next_sibling
    return line
| 262 | + |
| 263 | + |
# This list is used to preserve column ordering.
column_types_list = [
    ("name", "TEXT NOT NULL UNIQUE"),
    ("shortname", "TEXT NOT NULL UNIQUE"),
    ("fulltext", "TEXT NOT NULL"),
]


def create_callback(table_name, statements):
    """Return a row callback that appends SQL for `table_name` to `statements`.

    The returned `cb(**kwargs)` takes one keyword per column of the row.  The
    optional keyword `column_types_list`, a list of (column name, type)
    pairs, declares columns beyond those in the module-level
    `column_types_list`; it is not treated as a column itself.

    While `statements` is still empty, the first call appends a DROP TABLE
    and a CREATE TABLE statement, with the columns in declared order.  Every
    call appends one INSERT statement.

    Raises (from `cb`):
        KeyError: if a keyword names a column with no declared type.
        RuntimeError: if a TEXT value contains a single quote, or a column
            type is neither TEXT nor INTEGER.
    """
    def cb(**kwargs):
        # Build the complete list of known column types for this table,
        # without touching the shared module-level list.
        local_column_types_list = column_types_list[:]
        local_column_types_list.extend(kwargs.pop("column_types_list", ()))
        local_column_types = dict(local_column_types_list)
        input_column_names = list(kwargs)

        # Validate text has sane values.
        # NOTE(review): the SQL below is assembled as text, so a quote is
        # rejected rather than escaped.  Bound parameters would be safer, but
        # `statements` is a list of plain SQL strings.
        for column_name in input_column_names:
            column_type = local_column_types[column_name]
            if "TEXT" in column_type and "'" in kwargs[column_name]:
                raise RuntimeError("text contains SQL quoting character")

        # Build the table definition on receiving the first row to insert.
        if not statements:
            # Drop the table if it already exists, to start fresh.
            statements.append("DROP TABLE IF EXISTS %s;" % table_name)

            # Recreate the table, preserving ideal column ordering.
            column_defs = ["id INTEGER PRIMARY KEY"]
            column_defs.extend(
                "%s %s" % entry
                for entry in local_column_types_list
                if entry[0] in kwargs
            )
            statements.append(
                "CREATE TABLE %s (%s);" % (table_name, ", ".join(column_defs)))

        values = []
        for column_name in input_column_names:
            column_type = local_column_types[column_name]
            if column_type.startswith("TEXT"):
                values.append("'%s'" % kwargs[column_name])
            elif column_type.startswith("INTEGER"):
                values.append(str(kwargs[column_name]))
            else:
                raise RuntimeError("Data-type '%s' needs handling" % column_type)

        statements.append("INSERT INTO %s (%s) VALUES (%s)" % (
            table_name, ", ".join(input_column_names), ", ".join(values)))
    return cb
| 331 | + |
def run():
    """Parse every supported table and write it into the SQLite database.

    For each (table name, parser) pair the parser is run with a callback from
    create_callback(), and the collected statements are executed against
    DATABASE_FILENAME.  Progress is written to stdout as the table name, the
    statement count and one dot per statement.  The work is committed once,
    after all tables have been processed.
    """
    conn = sqlite3.connect(DATABASE_FILENAME)
    try:
        for (table_name, func) in (
                ("conditions", parse_conditions),
                ("special_abilities", parse_special_abilities),
                ("abilities", parse_abilities),
                ("abilities_table", parse_abilities_table),
            ):
            statements = []
            func(create_callback(table_name, statements))

            # One cursor per table, closed even when a statement fails.
            c = conn.cursor()
            try:
                sys.stdout.write("%s %d [" % (table_name, len(statements)))
                for s in statements:
                    sys.stdout.write(".")
                    c.execute(s)
                # A plain "\n": stdout is a text stream, which already
                # translates it to the platform line ending.
                sys.stdout.write("]\n")
            finally:
                c.close()

        conn.commit()
    finally:
        conn.close()
| 361 | + |
| 362 | + |
if __name__ == "__main__":
    # The parse functions read the module global html_path, which is only
    # assigned here, when the file is run as a script.
    current_path = sys.path[0]
    html_path = os.path.join(current_path, HTML_DIR_NAME)

    run()

    # Wait for the user to press enter before the script exits.
    raw_input("Press enter to continue..")
0 commit comments