#!/usr/bin/python3
+import locale
+
from lxml import etree
from lxml.etree import tostring
-tree = etree.parse('mekanikk-1999/meksme-utf8.xml')
+import json
+
# When true, make_glossary_docbook() renders each term as "term [topic]".
+list_topic = False
+
# Path of the XML file that is parsed as the word source.
+filemakerxml = 'meksme-utf8.xml'
+
+tree = etree.parse(filemakerxml)
root = tree.getroot()
#print(root)
#print(tostring(tree))
# Field names used as keys of a word entry: 'topic', language codes for the
# terms and 'desc-<code>' for descriptions.  The Northern Sami code changes
# from 'sme' to 'se' and the 'unknown' field is dropped.
# NOTE(review): the code that consumes `cols` is not visible in this excerpt;
# presumably the order must match the column order of the export -- confirm.
cols = (
- 'topic', 'sme', 'desc-sme', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
- 'unknown',
+ 'topic', 'se', 'desc-se', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
)
# Topic label translations, keyed first by glossary language code and then by
# the label stored in an entry's 'topic' field.  Only 'nb' has a table;
# make_glossary_docbook() leaves topics untouched for any other language.
# NOTE(review): the keys look like Northern Sami labels and the values like
# Norwegian ones -- confirm against the source data.
+topicmap = {
+ 'nb' : {
+ 'fáddá': 'tema',
+ 'ávnnas': 'emne',
+ 'eanan': 'land',
+ 'biras': 'miljø',
+ 'huksen': 'bygg',
+ 'bohcci': 'rør',
+ 'data': 'data',
+ 'hydr': 'hydraulikk',
+ 'fys': 'fysikk',
+ 'sveis': 'sveising',
+ 'mihttu': 'måling',
+ 'elektro': 'elektro',
+ 'neavvu': 'verktøy',
+ 'mohtor': 'motor',
+ 'mašiidna': 'maskin',
+ 'fuolahas': 'bearbeiding',
+ }
+}
+
# Look up the RESULTSET element in the FileMaker fmpxmlresult namespace.
resultset = root.find("{http://www.filemaker.com/fmpxmlresult}RESULTSET")
# List of word entries; read later by make_glossary_docbook().
# NOTE(review): the code that builds each `d` lies between diff hunks and is
# not visible in this excerpt.
words = []
#print(d)
words.append(d)
-def make_glossary(lang):
- print(".. glossary::")
- print()
# Dump the collected entries as JSON.
# NOTE(review): indentation is not preserved in this rendering, so it cannot
# be told from here which block this `with` statement belongs to -- confirm.
+ with open('meksme-utf8.json', 'w') as f:
+ json.dump(words, f)
- def langsort(e):
+def langsort(lang, e):
+    """Return the locale-aware sort key of entry *e* for language *lang*.
+
+    The key is built from the entry's term in *lang*; entries that have no
+    term in that language are keyed on their 'se' term instead.
+    """
+    term = e[lang] if lang in e else e['se']
+    return locale.strxfrm(term)
+
# Build a DocBook <glossary> for language `lang` from the module-level `words`
# list and write it to `output`.
#   lang      -- language code whose terms become the glossary terms; entries
#                are visited in langsort(lang, ...) order.
#   desccodes -- language codes whose 'desc-<code>' texts are appended to an
#                entry as "(code): text" definitions.
#   langcodes -- language codes of the translations listed and indexed under
#                each term; `lang` itself is skipped.
#   output    -- path the serialized XML is written to (opened in 'wb' mode).
# NOTE(review): leading indentation is not preserved in this diff rendering,
# so the nesting described in the comments below is inferred -- confirm
# against the real file.
+def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'):
+ import lxml.builder
+ E = lxml.builder.ElementMaker(
+ nsmap={
+# 'xi': "http://www.w3.org/2001/XInclude",
+ }
+ )
+
# Derive an id from a term by replacing brackets, parentheses, '/', "'" and
# spaces with '_'.
+ def word2id(word):
+ return word \
+ .replace('[', '_') \
+ .replace(']', '_') \
+ .replace('(', '_') \
+ .replace(')', '_') \
+ .replace('/', '_') \
+ .replace('\'', '_') \
+ .replace(' ', '_')
+
# Append one <indexterm><primary> to `entry` per non-empty comma-separated
# item of `wlist`; when `lang` is given, items without '[' get a "[lang]"
# suffix.
+ def indexit(entry, wlist, lang=None):
+ for w in wlist.split(","):
+ if "" != w:
+ if lang and '[' not in w:
+ w += "[%s]" % lang
+ entry.append(E.indexterm(E.primary(w)))
# `ids` records the ids already handed out; `redirects` maps a redirected
# term to the id it points at.
+ ids = {}
+ redirects = {}
+ glossary = E.glossary()
+ for e in sorted(words, key=lambda x: langsort(lang, x)):
+ ldesc = 'desc-%s' % lang
# Translate the topic label when a table exists for `lang`.  This rewrites
# e['topic'] in the shared `words` entry and raises KeyError for a label that
# is missing from the table.
+ if 'topic' in e and lang in topicmap:
+ e['topic'] = topicmap[lang][e['topic']]
if lang in e:
- return e[lang]
- else:
- return e['sme']
- for e in sorted(words, key=langsort):
- if lang in e and 'desc-%s' % lang in e:
- if 'topic' not in e:
- e['topic'] = 'n/a'
- #print(e)
- print(" %s [%s]\n %s" % (e[lang], e['topic'], e['desc-%s' % lang]))
- print()
# The first comma-separated variant names the entry; 'x' is appended to the
# id until it is unused.
+ w = e[lang].split(',')
+ id = word2id(w[0])
+ while id in ids:
+ id = id + 'x'
+ ids[id] = True
+
+ # First handle redirections with no extra info
+ if -1 != e[lang].find('>') and ldesc not in e:
+ p = e[lang].split(' > ')
+ if p[0] in redirects: # Skip if already added
+ continue
+ if -1 == p[1].find(','):
# A target ending in '-' is reported as dangling and no entry is emitted.
+ if '-' == p[1][-1]:
+ print("warning: Skipping dangling reference %s -> %s" %
+ (p[0], p[1]))
+ else:
+ seeentry = E.glossentry()
+ seeentry.append(E.glossterm(p[0]))
+ id = word2id(p[1])
+ seeentry.append(E.glosssee(otherterm=id))
+ glossary.append(seeentry)
+ redirects[p[0]] = id
+ else:
+ print("warning: skipping split refererence %s -> %s" %
+ (p[0], p[1]))
# Disabled branch (guarded by `if False`): would emit one glosssee per target.
+ if False: # Not allowed in docbook
+ seeentry = E.glossentry()
+ seeentry.append(E.glossterm(p[0]))
+ for s in p[1].split(','):
+ s = s.strip().lstrip()
+ seeentry.append(E.glosssee(otherterm=word2id(s)))
+ glossary.append(seeentry)
+ continue
+
+ # Add See also entries pointing to main entry
+ if 1 < len(w):
+ for t in w[1:]:
+ t = t.strip().lstrip()
+ if t not in redirects:
+ #print("info: Adding see also entry for %s" % t)
+ seeentry = E.glossentry()
+ seeentry.append(E.glossterm(t))
+ seeentry.append(E.glosssee(otherterm=id))
+ glossary.append(seeentry)
+ redirects[t] = id
# NOTE(review): presumably this `elif` pairs with `if 1 < len(w)`, i.e. the
# warning is for single-variant terms lacking a 'desc-<lang>' text -- confirm.
+ elif ldesc not in e:
+ print("warning: term %s missing primary language %s description" % (e[lang], lang))
# Main entry: the term, optionally followed by its topic in brackets.
+ entry = E.glossentry(id=id)
+ if list_topic and 'topic' in e:
+ entry.append(E.glossterm('%s [%s]' % (e[lang], e['topic'])))
+ else:
+ entry.append(E.glossterm(e[lang]))
+ indexit(entry, e[lang])
# Collect the translations as "term (code) " in `langcodes` order.
+ lstr = ""
+ for l in langcodes:
+ if l != lang and l in e:
+ lstr += "%s (%s) " % (e[l], l)
+ # Add foreign words to index, split on comma
+ indexit(entry, e[l], l)
+ if "" != lstr:
+ entry.append(E.glossdef(E.para(lstr)))
+ else:
+ # only single word without translations, skip it
+ continue
# Append one definition per available description language.
+ for desccode in desccodes:
+ codestr = 'desc-%s' % desccode
+ if codestr in e:
+ entry.append(E.glossdef(E.para("(%s): %s" % (desccode,
+ e[codestr]))))
+ glossary.append(entry)
+
# Sort key for a glossary child: the locale transform of the text of its
# first child element, or "" when that element has no text.
+ def glosstermlocale(x):
+ # Look up glossterm (FIXME figure out more robust way)
+ t = x.getchildren()[0].text
+ if t:
+ return locale.strxfrm(t)
else:
- # ERROR / missing definition
- pass
-
-print("Nordsamisk")
-print("==========")
-print()
-make_glossary(lang='sme')
-
-print("Norsk")
-print("=====")
-print()
-make_glossary(lang='nb')
-
-#print("Engelsk")
-#print("=====")
-#print()
-#make_glossary(lang='en')
+ return ""
+ # Sort list to mix seealso entries into their correct location.
+ glossary[:] = sorted(glossary, key=glosstermlocale)
+
+ l = len(glossary)
+ print("info: dictionary contain %d entries" % l)
+
# Serialize as UTF-8 with an XML declaration; the result is bytes, hence the
# binary write below.
+ content = lxml.etree.tostring(glossary,
+ pretty_print=True,
+ xml_declaration=True,
+ encoding='UTF-8')
+# print(content)
+ with open(output, 'wb') as f:
+ f.write(content)
+
+# Command line driver: pick the glossary language from the positional
+# argument and generate the DocBook glossary for it.
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("langcode", help="language code to generate glossary for")
+# The default mirrors make_glossary_docbook(); without it a missing --output
+# is passed on as output=None and open(None, 'wb') fails.
+parser.add_argument("--output", default='glossary.xml',
+                    help="where to store the glossary")
+args = parser.parse_args()
+
+# Adopt the user's locale so that locale.strxfrm() collates terms with it.
+locale.setlocale(locale.LC_ALL, '')
+
+if 'nb' == args.langcode:
+    print("Norsk/bokmål")
+    print()
+    make_glossary_docbook(lang='nb', desccodes=('nb',),
+                          langcodes=('en', 'se', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
+elif 'se' == args.langcode:
+    print("Nordsamisk")
+    print()
+    make_glossary_docbook(lang='se', desccodes=('se', 'nb'),
+                          langcodes=('nb', 'en', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
+elif 'en' == args.langcode:
+    print("Engelsk")
+    print()
+    # desccodes must be a tuple: ('en') is just the string 'en', which would
+    # be iterated as 'e' and 'n' and never match 'desc-en'.
+    make_glossary_docbook(lang='en', desccodes=('en',),
+                          langcodes=('en', 'nb', 'se', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
+else:
+    print("error: Unknown language code %s" % args.langcode)