X-Git-Url: https://pere.pagekite.me/gitweb/text-mekanikerord.git/blobdiff_plain/a17b773ebf8bfc41d3dc16c2b49a2962d89cf11c..d61eea973489ef51bd5c7b00597b4f89f2619023:/make-glossary diff --git a/make-glossary b/make-glossary index 51c94bf..6e36003 100755 --- a/make-glossary +++ b/make-glossary @@ -5,6 +5,8 @@ import locale from lxml import etree from lxml.etree import tostring +import json + list_topic = False filemakerxml = 'meksme-utf8.xml' @@ -16,7 +18,7 @@ root = tree.getroot() #print(tostring(tree)) cols = ( - 'topic', 'sme', 'desc-sme', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is', + 'topic', 'se', 'desc-se', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is', ) topicmap = { @@ -55,11 +57,15 @@ for row in resultset.getchildren(): index += 1 #print(d) words.append(d) + + with open('meksme-utf8.json', 'w') as f: + json.dump(words, f) + def langsort(lang, e): if lang in e: return locale.strxfrm(e[lang]) else: - return locale.strxfrm(e['sme']) + return locale.strxfrm(e['se']) def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): import lxml.builder @@ -69,6 +75,16 @@ def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): } ) + def word2id(word): + return word \ + .replace('[', '_') \ + .replace(']', '_') \ + .replace('(', '_') \ + .replace(')', '_') \ + .replace('/', '_') \ + .replace('\'', '_') \ + .replace(' ', '_') + def indexit(entry, wlist, lang=None): for w in wlist.split(","): if "" != w: @@ -76,6 +92,7 @@ def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): w += "[%s]" % lang entry.append(E.indexterm(E.primary(w))) ids = {} + redirects = {} glossary = E.glossary() for e in sorted(words, key=lambda x: langsort(lang, x)): ldesc = 'desc-%s' % lang @@ -83,19 +100,52 @@ def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): e['topic'] = topicmap[lang][e['topic']] if lang in e: w = e[lang].split(',') - id = w[0] \ - .replace('[', '_') \ - .replace(']', '_') \ - .replace('(', '_') \ - .replace(')', '_') \ - .replace('/', '_') \ - .replace(' ', '_') + id = word2id(w[0]) while id in ids: id = id + 'x' ids[id] = True - if ldesc not in e: - print("warning: %s missing %s description" % (e[lang], lang)) + + # First handle redirections with not extra info + if -1 != e[lang].find('>') and ldesc not in e: + p = e[lang].split(' > ') + if p[0] in redirects: # Skip if already added + continue + if -1 == p[1].find(','): + if '-' == p[1][-1]: + print("warning: Skipping dangling reference %s -> %s" % + (p[0], p[1])) + else: + seeentry = E.glossentry() + seeentry.append(E.glossterm(p[0])) + id = word2id(p[1]) + seeentry.append(E.glosssee(otherterm=id)) + glossary.append(seeentry) + redirects[p[0]] = id + else: + print("warning: skipping split refererence %s -> %s" % + (p[0], p[1])) + if False: # Not allowed in docbook + seeentry = E.glossentry() + seeentry.append(E.glossterm(p[0])) + for s in p[1].split(','): + s = s.strip().lstrip() + seeentry.append(E.glosssee(otherterm=word2id(s))) + glossary.append(seeentry) continue + + # Add See also entries pointing to main entry + if 1 < len(w): + for t in w[1:]: + t = t.strip().lstrip() + if t not in redirects: + #print("info: Adding see also entry for %s" % t) + seeentry = E.glossentry() + seeentry.append(E.glossterm(t)) + seeentry.append(E.glosssee(otherterm=id)) + glossary.append(seeentry) + redirects[t] = id + elif ldesc not in e: + print("warning: term %s missing primary language %s description" % (e[lang], lang)) entry = E.glossentry(id=id) if list_topic and 'topic' in e: entry.append(E.glossterm('%s [%s]' % (e[lang], e['topic']))) @@ -110,23 +160,16 @@ def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): indexit(entry, e[l], l) if "" != lstr: entry.append(E.glossdef(E.para(lstr))) + else: + # only single word witout translations, skip it + continue for desccode in desccodes: codestr = 'desc-%s' % desccode if codestr in e: - entry.append(E.glossdef(E.para("%s: %s" % (desccode, + entry.append(E.glossdef(E.para("(%s): %s" % (desccode, e[codestr])))) glossary.append(entry) - # Add See also entries pointing to main entry - if 1 < len(w): - for t in w[1:]: - t = t.strip().lstrip() - entry = E.glossentry() - entry.append(E.glossterm(t)) - # FIXME - entry.append(E.glosssee(otherterm=id)) - glossary.append(entry) - def glosstermlocale(x): # Look up glossterm (FIXME figure out more robust way) t = x.getchildren()[0].text @@ -136,6 +179,9 @@ def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'): return "" # Sort list to mix seealso entries into their correct location. glossary[:] = sorted(glossary, key=glosstermlocale) + + l = len(glossary) + print("info: dictionary contain %d entries" % l) content = lxml.etree.tostring(glossary, pretty_print=True, @@ -157,19 +203,19 @@ if 'nb' == args.langcode: print("Norsk/bokmål") print() make_glossary_docbook(lang='nb', desccodes=('nb',), - langcodes=('en', 'sme', 'sv', 'da', 'fi', 'is',), + langcodes=('en', 'se', 'sv', 'da', 'fi', 'is',), output=args.output) -elif 'sme' == args.langcode: +elif 'se' == args.langcode: print("Nordsamisk") print() - make_glossary_docbook(lang='sme', desccodes=('sme', 'nb'), + make_glossary_docbook(lang='se', desccodes=('se', 'nb'), langcodes=('nb', 'en', 'sv', 'da', 'fi', 'is',), output=args.output) elif 'en' == args.langcode: print("Engelsk") print() - make_glossary_docbook(lang='en', desccodes=('en', 'nb'), - langcodes=('en', 'nb', 'sme', 'sv', 'da', 'fi', 'is',), + make_glossary_docbook(lang='en', desccodes=('en'), + langcodes=('en', 'nb', 'se', 'sv', 'da', 'fi', 'is',), output=args.output) else: print("error: Unknown language code %s" % args.langcode)