wikipedia/sqlite_preorder.py
#!/usr/bin/env python3
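# Walk Wikipedia's category graph in preorder, starting from the category titles given
# on the command line, reading from an SQLite database containing the `page` and
# `categorylinks` dump tables, and write the resulting tree under --outdir in one or
# more formats: txt, dot (Graphviz), html and bigb (OurBigBook).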
import argparse
import html
import os.path
import re
import sqlite3
import sys
import unicodedata
from pathlib import Path
from subprocess import Popen, PIPE
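# Escape double quotes so a title can be embedded in a quoted Graphviz .dot node name.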
def escape_dot(s):
return s.replace('"', '\\"')
def to_title_human(s):
return s.replace('_', ' ')
def to_ns_title_human(ns, title, remove_namespace):
return ns_to_txt(ns, remove_namespace) + to_title_human(title)
def get_bigb_filename(outdir, ns, title, remove_namespace):
return os.path.join(outdir, bigb_title_to_id(ns_to_txt(ns, remove_namespace) + title) + '.bigb')
def strip_accents(s):
"""
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string/518232#518232
"""
return ''.join(c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn')
azre = re.compile('[a-zA-Z0-9-]')
bigb_escape_re = re.compile('[\\\\[\\]{}<$`]')
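# Backslash-escape characters that are special in OurBigBook markup so titles can
# appear literally in headers and arguments.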
def bigb_escape(title):
l = []
for c in title:
        if bigb_escape_re.match(c):
l.append('\\')
l.append(c)
else:
l.append(c)
return ''.join(l)
NORMALIZE_PUNCTUATION_CHARACTER_MAP = {
'%': 'percent',
'&': 'and',
'+': 'plus',
'@': 'at',
}
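# Pure-Python approximation of OurBigBook's title-to-ID conversion: keep [a-zA-Z0-9-],
# spell out some punctuation, collapse runs of '-', strip accents, and lowercase.
# It is not called below; bigb_title_to_id instead delegates to the ourbigbook CLI,
# presumably so that generated IDs always match the real implementation.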
def bigb_title_to_id_native(title):
l = []
for c in title:
if ord(c) < 128:
if azre.match(c):
l.append(c)
elif c in NORMALIZE_PUNCTUATION_CHARACTER_MAP:
l.append('-')
l.append(NORMALIZE_PUNCTUATION_CHARACTER_MAP[c])
l.append('-')
else:
l.append('-')
else:
if c == '\u2013' or c == '\u2014':
l.append('-')
else:
l.append(c)
return strip_accents(re.sub('^-|-$', '', re.sub('-+', '-', ''.join(l)))).lower()
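# Convert a title to an OurBigBook ID by round-tripping it through the long-running
# `ourbigbook --title-to-id` child process started further down when bigb output is
# enabled: one title in per line, one ID out per line.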
def bigb_title_to_id(title):
global obb_title_to_id_process
obb_title_to_id_process.stdin.write(f'{title}\n'.encode())
obb_title_to_id_process.stdin.flush()
return obb_title_to_id_process.stdout.readline().decode()[:-1]
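# MediaWiki namespace numbers: 0 is the main (article) namespace, which has no prefix,
# and 14 is the Category namespace.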
NAMESPACE_TO_TEXT = {
0: '',
14: 'Category:',
}
def ns_to_txt(ns, remove_namespace=False):
if remove_namespace:
return ''
else:
if ns in NAMESPACE_TO_TEXT:
return NAMESPACE_TO_TEXT[ns]
else:
return str(ns)
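# Example invocation (hypothetical database and category names):
#   ./sqlite_preorder.py -O txt -O html -d 3 enwiki.sqlite Mathematics Physics
# expands Category:Mathematics and Category:Physics down to depth 3 and writes
# Mathematics.txt/.html and Physics.txt/.html plus an index.html under out/.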
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--depth', type=int, help='maximum depth to recurse into the category tree')
parser.add_argument('-D', '--depth-per-file', type=int, help='keep up to this depth per file, then interlink child pages beyond. Only supported by certain formats, e.g. bigb (HTML TODO)')
parser.add_argument('-i', '--index', default=False, action='store_true', help='make the (single) input title be an index.EXT file')
parser.add_argument('-M', '--max', type=int, help='max number of categories and pages to produce. To limit output size more precisely.')
parser.add_argument('-m', '--merge-article-and-category', default=False, action='store_true', help='place each article under the category of the same name if one exists, and omit the standalone article in that case')
parser.add_argument('-N', '--remove-namespace', default=False, action='store_true', help='remove namespace from the output')
parser.add_argument('-O', '--output-format', action='append', default=[], help='which output formats to generate. Can be given multiple times to generate multiple formats.')
parser.add_argument('-o', '--outdir', default='out', help='directory where to place output')
parser.add_argument('-w', '--width', type=int, help='max number of categories and pages to consider. To speed up testing mostly.')
parser.add_argument('db')
parser.add_argument('titles', nargs='+')
args = parser.parse_args()
out_dot = False
out_txt = False
out_html = False
out_bigb = False
for f in args.output_format:
if f == 'dot':
out_dot = True
elif f == 'txt':
out_txt = True
elif f == 'html':
out_html = True
elif f == 'bigb':
out_bigb = True
else:
        raise ValueError(f'Unknown format: "{f}"')
outdir = args.outdir
con = sqlite3.connect(args.db)
cur = con.cursor()
params_str = f'''{'' if args.depth_per_file is None else ' -D' + str(args.depth_per_file)}{'' if args.depth is None else ' -d' + str(args.depth)}{'' if args.max is None else ' -M' + str(args.max)}{' -m' if args.merge_article_and_category else ''}{'' if args.width is None else ' -w' + str(args.width)}'''
Path(outdir).mkdir(parents=True, exist_ok=True)
with open(os.path.join(outdir, '.gitignore'), 'w') as gitignore_f:
pass
if out_html:
if not args.index:
with open(os.path.join(outdir, 'index.html'), 'w') as html_index_f:
html_index_f.write(f'''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>Wikipedia CatTree</title>
</head>
<body>
<h1>Wikipedia CatTree</h1>
<p>Methodology: <a href="https://stackoverflow.com/questions/17432254/wikipedia-category-hierarchy-from-dumps/77313490#77313490">https://stackoverflow.com/questions/17432254/wikipedia-category-hierarchy-from-dumps/77313490#77313490</a> Params:{params_str}</p>
<ul>
''')
for t in args.titles:
html_index_f.write(f'<li><a href="{t + ".html"}">{t}</a></li>\n')
html_index_f.write('''</ul>
</body>
</html>
''')
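# For bigb output, scaffold a minimal OurBigBook project in outdir (.gitignore entry,
# CNAME, ourbigbook.json, Liquid HTML template, main SCSS) and start the long-running
# `ourbigbook --title-to-id` helper process used for ID generation.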
if out_bigb:
gitignore_f.write('out\n')
with open(os.path.join(outdir, 'CNAME'), 'w') as f:
f.write('''wikibot.ourbigbook.com\n''')
obb_title_to_id_process = Popen(['ourbigbook', '--title-to-id'], stdout=PIPE, stdin=PIPE)
with open(os.path.join(outdir, 'ourbigbook.json'), 'w') as f:
f.write('''{}''')
with open(os.path.join(outdir, 'ourbigbook.liquid.html'), 'w') as f:
f.write('''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>{{ title }}{% unless is_index_article %} - Wikipedia Bot - OurBigBook Docs{% endunless %}</title>
{% if is_index_article %}<script type="application/ld+json">{"name":"OurBigBook Docs","url":"https://docs.ourbigbook.com"}</script>
{% endif %}<meta name="viewport" content="width=device-width, initial-scale=1">
<style>{{ style }}</style>
<link rel="stylesheet" type="text/css" href="{{ raw_relpath }}/main.css">
<link rel="shortcut icon" href="{{ raw_relpath }}/../logo.svg" />
{{ head }}</head>
<body>
<header>
<a href="{{ root_page }}"><img src="https://upload.wikimedia.org/wikipedia/en/thumb/8/80/Wikipedia-logo-v2.svg/220px-Wikipedia-logo-v2.svg.png" />OurBigBook Wikipedia Bot</a>
<a href="https://docs.ourbigbook.com/wikipedia-bot"><img src="https://docs.ourbigbook.com/_raw/logo.svg" />Documentation</a>
</header>
<main class="ourbigbook">
{{ body }}</main>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-2XSEK2ND00"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-2XSEK2ND00');
</script>
{{ post_body }}</body>
</html>
''')
with open(os.path.join(outdir, 'main.scss'), 'w') as f:
f.write('''@import 'ourbigbook/ourbigbook.common.scss';
$color: black;
$header-background-color: #AA0;
body {
background-color: $color;
font-family: $font-family;
header, footer {
background-color: $header-background-color;
}
header {
align-items: center;
display: flex;
/* https://stackoverflow.com/questions/5078239/how-do-i-remove-the-space-between-inline-block-elements */
font-size: 0;
margin-bottom: 0.8 * $header-font-size;
overflow-x: hidden;
white-space: nowrap;
a {
/* Make buttons occupy the full height of the header bar.
* https://stackoverflow.com/questions/28254332/how-to-vertically-center-the-contents-of-a-flexbox-item/28254903#28254903 */
align-items: center;
align-self: stretch;
display: flex;
color: $color;
font-size: 32px;
font-weight: bold;
margin-left: 0;
margin-right: 0;
padding-left: 10px;
&:first-child {
padding-left: $toplevel-horizontal-padding-left;
}
padding-right: 10px;
text-decoration: none;
&:visited {
color: $color;
}
&:hover {
color: $header-background-color;
background-color: $color;
}
&.font-awesome-container {
font-weight: normal;
padding-left: 5px;
padding-right: 5px;
}
height: 1.2em;
img {
height: 100%;
margin-right: 0.1em;
}
}
}
footer {
word-wrap: break-word;
a {
$color: #00C;
color: $color;
text-decoration: none;
&:visited {
color: $color;
}
&:hover {
text-decoration: underline;
}
}
padding: 10px $toplevel-horizontal-padding-right 10px $toplevel-horizontal-padding-left;
div + div {
margin-top: 5px;
}
}
}
''')
if not args.index:
with open(os.path.join(outdir, 'index.bigb'), 'w') as bigb_index_f:
bigb_index_f.write(f'''= OurBigBook Wikipedia Bot
Hello! I am a bot that scrapes the category graph from Wikipedia!
Methodology: https://docs.ourbigbook.com/wikipedia-bot
Params:{params_str}
''')
for t in args.titles:
bigb_index_f.write(f'\\Include[{bigb_title_to_id(ns_to_txt(14, args.remove_namespace) + t)}]\n')
# TODO remove, just for symmetry with other broken files with an extra \n at end.
bigb_index_f.write('\n')
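# Traversal state shared across all input titles: (namespace, title) pairs already
# visited, the OurBigBook IDs already emitted (to detect collisions), the collisions
# themselves, the count of nodes produced so far for --max, and the set of root titles
# so that children which are themselves roots are not revisited.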
visited = set()
bigb_ids = set()
bigb_ids_repeated = []
n = 0
titles0_set = set(args.titles)
for title in args.titles:
visited.add((14, title))
if args.merge_article_and_category and \
cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (0, title,)).fetchone() is not None:
visited.add((0, title))
if out_bigb:
bigb_ids.add(bigb_title_to_id(ns_to_txt(14, args.remove_namespace) + title))
if args.merge_article_and_category:
bigb_ids.add(bigb_title_to_id(ns_to_txt(0, args.remove_namespace) + title))
if args.depth_per_file is None or args.index:
out_bigb_f = open(get_bigb_filename(outdir, 14, title, args.remove_namespace), 'w')
else:
for title in args.titles:
Path.unlink(get_bigb_filename(outdir, 14, title, args.remove_namespace), missing_ok=True)
for title in args.titles:
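    # Depth-first stack of nodes to visit. Each entry is:
    # (namespace, title, depth, parent_namespace, parent_title,
    #  parent_namespace_file, parent_title_file, child_index),
    # mirroring the unpacking at the top of the while loop below.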
todo = [(14, title, 0, None, None, 14, title, 0)]
if args.index:
title = 'index'
title_human = to_title_human(title) + ' - Wikipedia CatTree'
basename = os.path.join(outdir, title)
if out_txt:
out_txt_f = open(f'{basename}.txt', 'w')
if out_dot:
out_dot_f = open(f'{basename}.dot', 'w')
out_dot_f.write('digraph {\n')
if out_html:
out_html_f = open(f'{basename}.html', 'w')
out_html_f.write(f'''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>{title_human}</title>
<style>
a {{ text-decoration: none; }}
details {{ margin-left: 1em; }}
summary {{ margin-bottom: 0.4em; }}
</style>
</head>
<body>
<h1>{title_human}</h1>
<p><a href=".">Index</a></p>
''')
last_depth = 0
while len(todo):
namespace, title, depth, parent_namespace, parent_title, parent_namespace_file, parent_title_file, childi = todo.pop()
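        # When the depth does not increase relative to the previously visited node, we have
        # moved back up the tree: compute how many nested HTML <details> elements to close.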
depth_delta = depth - last_depth
if depth_delta <= 0:
repeat_close = -depth_delta + 1
else:
repeat_close = 0
last_depth = depth
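        # With --merge-article-and-category, a category that has an article of the same
        # title is rendered as the article (namespace 0) rather than as "Category:...".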
if args.merge_article_and_category and \
namespace == 14 and \
cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (0, title,)).fetchone() is not None:
namespace_eff = 0
else:
namespace_eff = namespace
if args.remove_namespace:
path_last = title
else:
path_last = ns_to_txt(namespace_eff) + title
title_human = to_title_human(path_last)
if out_txt:
out_txt_f.write('{}{} {}\n'.format(' ' * depth, depth, path_last))
if out_html:
out_html_f.write('</details>\n' * repeat_close)
out_html_f.write(f'<details open="true"><summary><a href="https://en.wikipedia.org/wiki/{html.escape(path_last)}">{html.escape(title_human)}</a></summary>\n')
if out_bigb:
if args.index and parent_title is None:
cur_bigb_f = open(os.path.join(outdir, 'index.bigb'), 'a')
else:
if args.depth_per_file is None:
cur_bigb_f = out_bigb_f
else:
cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
cur_bigb_f.write(f'= {bigb_escape(title_human)}\n')
if parent_title is not None and (args.depth_per_file is None or depth % args.depth_per_file):
cur_bigb_f.write(f'{{parent={bigb_escape(to_ns_title_human(parent_namespace, parent_title, args.remove_namespace)).replace("/", " ")}}}\n')
cur_bigb_f.write(f'{{wiki={bigb_escape(ns_to_txt(namespace_eff) + title)}}}\n')
cur_bigb_f.write(f'\n')
if args.depth_per_file is not None:
cur_bigb_f.close()
if namespace == 14:
ncats = 0
npages = 0
cat_includes = []
page_includes = []
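            # Fetch this category's members by joining categorylinks to page. Rows are
            # ordered namespace ascending, title descending so that, after being pushed
            # and later popped off the LIFO todo stack, categories are visited before
            # articles and titles come out in ascending order.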
for cur_childi, (child_namespace, child_title, page_is_redirect) in enumerate(cur.execute('''
select page_namespace, page_title, page_is_redirect from categorylinks
inner join page on cl_from = page_id and cl_to = ?
order by page_namespace asc, page_title desc
''', (title,)).fetchall()):
                # Some redirects also have categories, which is crazy, e.g. https://en.wikipedia.org/w/index.php?title=Khatri-Rao_product&action=edit contains:
#``
#REDIRECT [[Khatri–Rao product]] {{R with possibilities}}
#
#[[Category:Matrix theory]]
#``
if not page_is_redirect and (child_namespace == 0 or child_namespace == 14) and not child_title in titles0_set:
                    # We found an article that has a category with the same name, so we ignore
                    # the article and push the category instead, to be processed later.
if args.merge_article_and_category and child_namespace == 0:
if cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (14, child_title,)).fetchone() is not None:
visited.add((child_namespace, child_title))
child_namespace = 14
if out_bigb:
bigb_id = bigb_title_to_id(ns_to_txt(child_namespace, args.remove_namespace) + child_title)
# Not ideal that the bigb ID repetition changes other outputs as well. But well!!!
bigb_id_repeated = bigb_id in bigb_ids
else:
bigb_id_repeated = False
                    if bigb_id_repeated and (child_namespace, child_title) not in visited:
                        # A different page whose bigb ID collides with one already used:
                        # record it so it ends up in ids_repeated.tmp, and skip it below.
                        bigb_ids_repeated.append(bigb_id)
                    if (
                        (child_namespace, child_title) not in visited and
                        not bigb_id_repeated and
                        not (args.depth is not None and depth == args.depth)
                    ):
if out_dot:
out_dot_f.write('"{}{}"->"{}{}";\n'.format(ns_to_txt(namespace), escape_dot(title), ns_to_txt(child_namespace), escape_dot(child_title)))
append = False
if child_namespace == 14:
if args.width is None or ncats < args.width:
ncats += 1
append = True
else:
if args.width is None or npages < args.width:
npages += 1
append = True
if append:
visited.add((child_namespace, child_title))
if out_bigb:
bigb_ids.add(bigb_id)
if args.max is not None and n == args.max:
break
n += 1
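                            # If the child lands on a --depth-per-file boundary it starts a new
                            # .bigb file: remember it so an \Include line can be emitted below,
                            # and make it the file-parent of its own subtree. Otherwise the child
                            # keeps writing into the same file as this node.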
if args.depth_per_file is not None and ((depth + 1) % args.depth_per_file) == 0:
if out_bigb:
if child_namespace == 14:
cat_includes.append((child_namespace, child_title))
else:
page_includes.append((child_namespace, child_title))
Path.unlink(get_bigb_filename(outdir, child_namespace, child_title, args.remove_namespace), missing_ok=True)
child_parent_namespace_file = child_namespace
child_parent_title_file = child_title
else:
child_parent_namespace_file = namespace
if args.index and parent_title is None:
child_parent_namespace_file = 14
child_parent_title_file = 'index'
else:
child_parent_namespace_file = parent_namespace_file
child_parent_title_file = parent_title_file
print(f'{n} {ns_to_txt(child_namespace)}{child_title} depth={depth + 1} child_parent_title_file={child_parent_title_file}', file=sys.stderr)
todo.append((child_namespace, child_title, depth + 1, namespace, title, child_parent_namespace_file, child_parent_title_file, cur_childi))
if args.width is not None and ncats == args.width and npages == args.width:
break
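            # After scanning all children of this category, append \Include lines for the
            # children that start their own .bigb files to the file this node was written
            # to, categories before pages, reversed back into ascending title order.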
if out_bigb:
if parent_title_file is None:
cur_bigb_f = out_bigb_f
else:
cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
if cat_includes or page_includes:
cur_bigb_f.write(
''.join(map(
lambda t: f'\\Include[{bigb_title_to_id(ns_to_txt(t[0], args.remove_namespace) + t[1])}]\n',
(list(reversed(cat_includes)) + list(reversed(page_includes))),
)) + '\n'
)
if parent_title_file is not None:
cur_bigb_f.close()
if out_bigb:
if args.depth_per_file is None:
out_bigb_f.close()
if out_txt:
out_txt_f.close()
if out_dot:
out_dot_f.write('}\n')
out_dot_f.close()
if out_html:
out_html_f.write('</details>\n' * last_depth)
out_html_f.write('''</body>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-DEE2HEJW9X"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-DEE2HEJW9X');
</script>
</html>
''')
        out_html_f.close()
if out_bigb:
with open(os.path.join(outdir, 'ids_repeated.tmp'), 'w') as f:
f.write('\n'.join(bigb_ids_repeated))