]> pere.pagekite.me Git - text-mekanikerord.git/blob - make-glossary
Legg til 1999-copyright.
[text-mekanikerord.git] / make-glossary
1 #!/usr/bin/python3
2
3 import locale
4
5 from lxml import etree
6 from lxml.etree import tostring
7
# Set to True to append each entry's topic label to the glossary term.
list_topic = False

# FileMaker Pro XML export (FMPXMLRESULT format) holding the dictionary.
filemakerxml = 'meksme-utf8.xml'

tree = etree.parse(filemakerxml)
root = tree.getroot()

#print(root)
#print(tostring(tree))

# Column order of the FileMaker export; the index of each COL inside a
# ROW maps to these field names ('desc-*' are per-language descriptions).
cols = (
    'topic', 'se', 'desc-se', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
)
21
# Topic label translations.  Keys of the inner dict are the Northern Sámi
# topic names used in the FileMaker database; values are the localized
# labels.  Only languages listed here get their topic labels translated
# (see make_glossary_docbook); currently only Norwegian Bokmål ('nb').
topicmap = {
    'nb' : {
        'fáddá': 'tema',
        'ávnnas': 'emne',
        'eanan': 'land',
        'biras': 'miljø',
        'huksen': 'bygg',
        'bohcci': 'rør',
        'data': 'data',
        'hydr': 'hydraulikk',
        'fys': 'fysikk',
        'sveis': 'sveising',
        'mihttu': 'måling',
        'elektro': 'elektro',
        'neavvu': 'verktøy',
        'mohtor': 'motor',
        'mašiidna': 'maskin',
        'fuolahas': 'bearbeiding',
    }
}
42
resultset = root.find("{http://www.filemaker.com/fmpxmlresult}RESULTSET")

# Flatten every ROW of the result set into a dict keyed by the column
# names in `cols` (matched by position).  The export wraps long fields
# across lines, so runs of whitespace are collapsed to single spaces.
# Empty cells are omitted from the dict; empty rows still produce an
# (empty) dict so row count is preserved.
import re  # hoisted: was re-imported for every non-empty cell
_whitespace_re = re.compile(r'\s+')

words = []
for row in resultset:  # Element iteration; getchildren() is deprecated
    d = {}
    for index, col in enumerate(
            row.findall("{http://www.filemaker.com/fmpxmlresult}COL")):
        t = col[0].text  # text of the DATA child element
        if t:
            d[cols[index]] = _whitespace_re.sub(' ', t)
    #print(d)
    words.append(d)
def langsort(lang, e):
    """Return a locale-aware sort key for dictionary entry *e*.

    Sorts on the entry's term in *lang* when present, falling back to
    the Northern Sámi ('se') term, which every entry is expected to
    carry.  Requires locale.setlocale() to have been called for
    language-correct collation.
    """
    term = e[lang] if lang in e else e['se']
    return locale.strxfrm(term)
63
def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'):
    """Write a DocBook glossary for *lang* to *output*.

    lang      -- primary language code; entries lacking a term in this
                 language, or lacking any translation, are skipped.
    desccodes -- language codes whose 'desc-*' descriptions are emitted
                 as extra glossdef paragraphs.
    langcodes -- language codes whose translations are listed in the
                 definition and added to the index.
    output    -- file name for the generated DocBook XML.
    """
    import lxml.builder
    E = lxml.builder.ElementMaker(
        nsmap={
#            'xi': "http://www.w3.org/2001/XInclude",
        }
    )

    # Tolerate a single code passed as a bare string (e.g. 'en'), which
    # would otherwise be iterated character by character and match
    # nothing.
    if isinstance(desccodes, str):
        desccodes = (desccodes,)
    if isinstance(langcodes, str):
        langcodes = (langcodes,)

    def indexit(entry, wlist, lang=None):
        # Add one indexterm per comma-separated word; tag foreign words
        # with their language code unless the word already carries a
        # bracketed annotation.
        for w in wlist.split(","):
            if "" != w:
                if lang and '[' not in w:
                    w += "[%s]" % lang
                entry.append(E.indexterm(E.primary(w)))

    ids = set()  # glossentry ids already handed out
    glossary = E.glossary()
    for e in sorted(words, key=lambda x: langsort(lang, x)):
        ldesc = 'desc-%s' % lang
        # Translate the topic label when a translation table exists for
        # this language; unknown topics pass through untranslated.  Use
        # a local so the shared `words` entries are never mutated.
        topic = e.get('topic')
        if topic is not None and lang in topicmap:
            topic = topicmap[lang].get(topic, topic)
        if lang not in e:
            continue
        w = e[lang].split(',')
        # Derive an XML id from the first term: replace characters that
        # are not valid in ids, then disambiguate duplicates by
        # appending 'x'.
        term_id = w[0].translate(str.maketrans("[]()/' ", "_______"))
        while term_id in ids:
            term_id += 'x'
        ids.add(term_id)
        if ldesc not in e:
            print("warning: term %s missing primary language %s description" % (e[lang], lang))
        entry = E.glossentry(id=term_id)
        if list_topic and topic is not None:
            entry.append(E.glossterm('%s [%s]' % (e[lang], topic)))
        else:
            entry.append(E.glossterm(e[lang]))
        indexit(entry, e[lang])
        lstr = ""
        for l in langcodes:
            if l != lang and l in e:
                lstr += "%s (%s) " % (e[l], l)
                # Add foreign words to index, split on comma
                indexit(entry, e[l], l)
        if "" != lstr:
            entry.append(E.glossdef(E.para(lstr)))
        else:
            # only single word without translations, skip it
            continue
        for desccode in desccodes:
            codestr = 'desc-%s' % desccode
            if codestr in e:
                entry.append(E.glossdef(E.para("(%s): %s" % (desccode,
                                                             e[codestr]))))
        glossary.append(entry)

        # Add "see also" entries pointing from each secondary
        # comma-separated term to the main entry.
        if 1 < len(w):
            for t in w[1:]:
                t = t.strip()
                #print("info: Adding see also entry for %s" % t)
                seeentry = E.glossentry()
                seeentry.append(E.glossterm(t))
                seeentry.append(E.glosssee(otherterm=term_id))
                glossary.append(seeentry)

    def glosstermlocale(x):
        # Sort key: text of the entry's first child, which is the
        # glossterm (FIXME figure out more robust way).
        t = x[0].text
        if t:
            return locale.strxfrm(t)
        else:
            return ""
    # Sort list to mix seealso entries into their correct location.
    glossary[:] = sorted(glossary, key=glosstermlocale)

    l = len(glossary)
    print("info: dictionary contain %d entries" % l)

    # Use the module-level `etree` import directly instead of relying on
    # `lxml.etree` being reachable as an attribute of the locally
    # imported `lxml` package.
    content = etree.tostring(glossary,
                             pretty_print=True,
                             xml_declaration=True,
                             encoding='UTF-8')
    # print(content)
    with open(output, 'wb') as f:
        f.write(content)
153
# Command-line entry point: generate a glossary for one language code.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("langcode", help="language code to generate glossary for")
parser.add_argument("--output", help="where to store the glossary")
args = parser.parse_args()

# Use the user's locale so strxfrm-based sorting collates correctly.
locale.setlocale(locale.LC_ALL, '')

if 'nb' == args.langcode:
    print("Norsk/bokmål")
    print()
    make_glossary_docbook(lang='nb', desccodes=('nb',),
                          langcodes=('en', 'se', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
elif 'se' == args.langcode:
    print("Nordsamisk")
    print()
    make_glossary_docbook(lang='se', desccodes=('se', 'nb'),
                          langcodes=('nb', 'en', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
elif 'en' == args.langcode:
    print("Engelsk")
    print()
    # Bug fix: desccodes was ('en') — a bare string, not a tuple — so the
    # description loop iterated the characters 'e' and 'n' and English
    # descriptions were never emitted.
    make_glossary_docbook(lang='en', desccodes=('en',),
                          langcodes=('en', 'nb', 'se', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
else:
    print("error: Unknown language code %s" % args.langcode)
    raise SystemExit(1)  # was falling through with exit status 0