X-Git-Url: https://pere.pagekite.me/gitweb/text-mekanikerord.git/blobdiff_plain/3f579fcd820586b100d6cae4dc0d02ba2309e533..aa89b6448da06c1c104f1b9d2bf6cf3678e2ed87:/make-glossary?ds=sidebyside

diff --git a/make-glossary b/make-glossary
index 6095a00..6e36003 100755
--- a/make-glossary
+++ b/make-glossary
@@ -1,19 +1,47 @@
 #!/usr/bin/python3
 
+import locale
+
 from lxml import etree
 from lxml.etree import tostring
 
-tree = etree.parse('mekanikk-1999/meksme-utf8.xml')
+import json
+
+list_topic = False
+
+filemakerxml = 'meksme-utf8.xml'
+
+tree = etree.parse(filemakerxml)
 root = tree.getroot()
 
 #print(root)
 #print(tostring(tree))
 
 cols = (
-    'topic', 'sme', 'desc-sme', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
-    'unknown',
+    'topic', 'se', 'desc-se', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
 )
 
+topicmap = {
+    'nb' : {
+        'fáddá': 'tema',
+        'ávnnas': 'emne',
+        'eanan': 'land',
+        'biras': 'miljø',
+        'huksen': 'bygg',
+        'bohcci': 'rør',
+        'data': 'data',
+        'hydr': 'hydraulikk',
+        'fys': 'fysikk',
+        'sveis': 'sveising',
+        'mihttu': 'måling',
+        'elektro': 'elektro',
+        'neavvu': 'verktøy',
+        'mohtor': 'motor',
+        'mašiidna': 'maskin',
+        'fuolahas': 'bearbeiding',
+    }
+}
+
 resultset = root.find("{http://www.filemaker.com/fmpxmlresult}RESULTSET")
 
 words = []
@@ -30,37 +58,164 @@ for row in resultset.getchildren():
     #print(d)
     words.append(d)
 
-def make_glossary(lang):
-    print(".. glossary::")
-    print()
+    with open('meksme-utf8.json', 'w') as f:
+        json.dump(words, f)
 
-    def langsort(e):
+def langsort(lang, e):
+    if lang in e:
+        return locale.strxfrm(e[lang])
+    else:
+        return locale.strxfrm(e['se'])
+
+def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'):
+    import lxml.builder
+    E = lxml.builder.ElementMaker(
+        nsmap={
+#            'xi': "http://www.w3.org/2001/XInclude",
+        }
+    )
+
+    def word2id(word):
+        return word \
+            .replace('[', '_') \
+            .replace(']', '_') \
+            .replace('(', '_') \
+            .replace(')', '_') \
+            .replace('/', '_') \
+            .replace('\'', '_') \
+            .replace(' ', '_')
+
+    def indexit(entry, wlist, lang=None):
+        for w in wlist.split(","):
+            if "" != w:
+                if lang and '[' not in w:
+                    w += "[%s]" % lang
+                entry.append(E.indexterm(E.primary(w)))
+    ids = {}
+    redirects = {}
+    glossary = E.glossary()
+    for e in sorted(words, key=lambda x: langsort(lang, x)):
+        ldesc = 'desc-%s' % lang
+        if 'topic' in e and lang in topicmap:
+            e['topic'] = topicmap[lang][e['topic']]
         if lang in e:
-            return e[lang]
-        else:
-            return e['sme']
-    for e in sorted(words, key=langsort):
-        if lang in e and 'desc-%s' % lang in e:
-            if 'topic' not in e:
-                e['topic'] = 'n/a'
-            #print(e)
-            print("  %s [%s]\n    %s" % (e[lang], e['topic'], e['desc-%s' % lang]))
-            print()
+            w = e[lang].split(',')
+            id = word2id(w[0])
+            while id in ids:
+                id = id + 'x'
+            ids[id] = True
+
+            # First handle redirections with no extra info
+            if -1 != e[lang].find('>') and ldesc not in e:
+                p = e[lang].split(' > ')
+                if p[0] in redirects: # Skip if already added
+                    continue
+                if -1 == p[1].find(','):
+                    if '-' == p[1][-1]:
+                        print("warning: Skipping dangling reference %s -> %s" %
+                              (p[0], p[1]))
+                    else:
+                        seeentry = E.glossentry()
+                        seeentry.append(E.glossterm(p[0]))
+                        id = word2id(p[1])
+                        seeentry.append(E.glosssee(otherterm=id))
+                        glossary.append(seeentry)
+                        redirects[p[0]] = id
+                else:
+                    print("warning: skipping split reference %s -> %s" %
+                          (p[0], p[1]))
+                    if False: # Not allowed in docbook
+                        seeentry = E.glossentry()
+                        seeentry.append(E.glossterm(p[0]))
+                        for s in p[1].split(','):
+                            s = s.strip().lstrip()
+                            seeentry.append(E.glosssee(otherterm=word2id(s)))
+                        glossary.append(seeentry)
+                continue
+
+            # Add See also entries pointing to main entry
+            if 1 < len(w):
+                for t in w[1:]:
+                    t = t.strip().lstrip()
+                    if t not in redirects:
+                        #print("info: Adding see also entry for %s" % t)
+                        seeentry = E.glossentry()
+                        seeentry.append(E.glossterm(t))
+                        seeentry.append(E.glosssee(otherterm=id))
+                        glossary.append(seeentry)
+                        redirects[t] = id
+        elif ldesc not in e:
+            print("warning: term %s missing primary language %s description" % (e[lang], lang))
+        entry = E.glossentry(id=id)
+        if list_topic and 'topic' in e:
+            entry.append(E.glossterm('%s [%s]' % (e[lang], e['topic'])))
+        else:
+            entry.append(E.glossterm(e[lang]))
+        indexit(entry, e[lang])
+        lstr = ""
+        for l in langcodes:
+            if l != lang and l in e:
+                lstr += "%s (%s) " % (e[l], l)
+                # Add foreign words to index, split on comma
+                indexit(entry, e[l], l)
+        if "" != lstr:
+            entry.append(E.glossdef(E.para(lstr)))
+        else:
+            # only single word without translations, skip it
+            continue
+        for desccode in desccodes:
+            codestr = 'desc-%s' % desccode
+            if codestr in e:
+                entry.append(E.glossdef(E.para("(%s): %s" % (desccode,
+                                                             e[codestr]))))
+        glossary.append(entry)
+
+    def glosstermlocale(x):
+        # Look up glossterm (FIXME figure out more robust way)
+        t = x.getchildren()[0].text
+        if t:
+            return locale.strxfrm(t)
         else:
-            # ERROR / missing definition
-            pass
-
-print("Nordsamisk")
-print("==========")
-print()
-make_glossary(lang='sme')
-
-print("Norsk")
-print("=====")
-print()
-make_glossary(lang='nb')
-
-#print("Engelsk")
-#print("=====")
-#print()
-#make_glossary(lang='en')
+            return ""
+    # Sort list to mix seealso entries into their correct location.
+    glossary[:] = sorted(glossary, key=glosstermlocale)
+
+    l = len(glossary)
+    print("info: dictionary contains %d entries" % l)
+
+    content = lxml.etree.tostring(glossary,
+                                  pretty_print=True,
+                                  xml_declaration=True,
+                                  encoding='UTF-8')
+#    print(content)
+    with open(output, 'wb') as f:
+        f.write(content)
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("langcode", help="language code to generate glossary for")
+parser.add_argument("--output", help="where to store the glossary")
+args = parser.parse_args()
+
+locale.setlocale(locale.LC_ALL, '')
+
+if 'nb' == args.langcode:
+    print("Norsk/bokmål")
+    print()
+    make_glossary_docbook(lang='nb', desccodes=('nb',),
+                          langcodes=('en', 'se', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
elif 'se' == args.langcode:
+    print("Nordsamisk")
+    print()
+    make_glossary_docbook(lang='se', desccodes=('se', 'nb'),
+                          langcodes=('nb', 'en', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
+elif 'en' == args.langcode:
+    print("Engelsk")
+    print()
+    make_glossary_docbook(lang='en', desccodes=('en',),
+                          langcodes=('en', 'nb', 'se', 'sv', 'da', 'fi', 'is',),
+                          output=args.output)
+else:
+    print("error: Unknown language code %s" % args.langcode)
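
For reference, a minimal sketch of how the rewritten script might be driven for each supported language. Only the positional "langcode" argument and the "--output" option come from the argparse setup added in the diff above; the loop, the subprocess call and the output file names are assumptions made for illustration, not part of the commit.

#!/usr/bin/python3
# Hypothetical driver, not part of the commit above.  It relies only on
# the command line interface added by the diff: a positional langcode
# argument and an optional --output flag.  The output file names are an
# assumption for this example.
import subprocess

for code in ("nb", "se", "en"):
    # Runs: ./make-glossary <langcode> --output glossary-<langcode>.xml
    subprocess.run(
        ["./make-glossary", code, "--output", "glossary-%s.xml" % code],
        check=True,
    )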