#!/usr/bin/python3
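#
# Convert the FileMaker Pro XML export meksme-utf8.xml into a JSON dump
# and a DocBook glossary for the requested language.
#
# Example run (the output file name is only an example):
#
#   ./make-glossary se --output glossary-se.xml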

import locale
import re

from lxml import etree
from lxml.etree import tostring

import json

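# When True, each glossterm is printed with its topic code appended.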
list_topic = False

filemakerxml = 'meksme-utf8.xml'

tree = etree.parse(filemakerxml)
root = tree.getroot()

#print(root)
#print(tostring(tree))
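
# The FileMaker FMPXMLRESULT export is expected to look roughly like
# this, with one COL per field in the column order listed below
# (structure assumed from the parsing code):
#
#   <FMPXMLRESULT xmlns="http://www.filemaker.com/fmpxmlresult">
#     <RESULTSET>
#       <ROW>
#         <COL><DATA>...</DATA></COL>
#         ...
#       </ROW>
#     </RESULTSET>
#   </FMPXMLRESULT>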

cols = (
    'topic', 'se', 'desc-se', 'desc-nb', 'nb', 'sv', 'fi', 'en', 'is',
)

topicmap = {
    'nb' : {
        'fáddá': 'tema',
        'ávnnas': 'emne',
        'eanan': 'land',
        'biras': 'miljø',
        'huksen': 'bygg',
        'bohcci': 'rør',
        'data': 'data',
        'hydr': 'hydraulikk',
        'fys': 'fysikk',
        'sveis': 'sveising',
        'mihttu': 'måling',
        'elektro': 'elektro',
        'neavvu': 'verktøy',
        'mohtor': 'motor',
        'mašiidna': 'maskin',
        'fuolahas': 'bearbeiding',
    }
}
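
# The topic codes in the database are Northern Saami; topicmap maps them
# to the target language. Only a Bokmål ('nb') mapping exists, so topics
# stay untranslated in the other glossaries.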

resultset = root.find("{http://www.filemaker.com/fmpxmlresult}RESULTSET")

words = []
for row in resultset:
    d = {}
    index = 0
    for col in row.findall("{http://www.filemaker.com/fmpxmlresult}COL"):
        t = col[0].text
        if t:
            # Collapse runs of whitespace (including newlines) to single spaces.
            t = re.sub(r'\s+', ' ', t)
            d[cols[index]] = t
        index += 1
    #print(d)
    words.append(d)

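# Dump the parsed word list to JSON as well, presumably for reuse by
# other tools (nothing in this script reads it back).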
with open('meksme-utf8.json', 'w') as f:
    json.dump(words, f)

def langsort(lang, e):
    # Locale-aware sort key: use the term in the requested language,
    # falling back to the Northern Saami term when it is missing.
    if lang in e:
        return locale.strxfrm(e[lang])
    else:
        return locale.strxfrm(e['se'])

def make_glossary_docbook(lang, desccodes, langcodes, output='glossary.xml'):
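    """Write a DocBook glossary for the given primary language.

    desccodes lists the language codes whose descriptions are included,
    while langcodes lists the languages added as translations.
    """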
    import lxml.builder
    E = lxml.builder.ElementMaker(
        nsmap={
            # 'xi': "http://www.w3.org/2001/XInclude",
        }
    )

    def word2id(word):
        # Derive an XML id from a term by replacing characters that are
        # not welcome in DocBook id attributes with underscores.
        return word \
            .replace('[', '_') \
            .replace(']', '_') \
            .replace('(', '_') \
            .replace(')', '_') \
            .replace('/', '_') \
            .replace('\'', '_') \
            .replace(' ', '_')

    def indexit(entry, wlist, lang=None):
        # Add one indexterm per comma separated word, tagging foreign
        # terms with their language code in brackets.
        for w in wlist.split(","):
            if "" != w:
                if lang and '[' not in w:
                    w += "[%s]" % lang
                entry.append(E.indexterm(E.primary(w)))

    ids = {}
    redirects = {}
    glossary = E.glossary()
    for e in sorted(words, key=lambda x: langsort(lang, x)):
        ldesc = 'desc-%s' % lang
        if 'topic' in e and lang in topicmap:
            e['topic'] = topicmap[lang][e['topic']]
        if lang in e:
            w = e[lang].split(',')
            id = word2id(w[0])
            # Make the id unique by appending 'x' until it is unused.
            while id in ids:
                id = id + 'x'
            ids[id] = True

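            # Terms of the form "term > target" are redirects; they become
            # glosssee references to the target's entry instead of full
            # glossentry elements.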
            # First handle redirections with no extra info
            if -1 != e[lang].find('>') and ldesc not in e:
                p = e[lang].split(' > ')
                if p[0] in redirects: # Skip if already added
                    continue
                if -1 == p[1].find(','):
                    if '-' == p[1][-1]:
                        print("warning: Skipping dangling reference %s -> %s" %
                              (p[0], p[1]))
                    else:
                        seeentry = E.glossentry()
                        seeentry.append(E.glossterm(p[0]))
                        id = word2id(p[1])
                        seeentry.append(E.glosssee(otherterm=id))
                        glossary.append(seeentry)
                        redirects[p[0]] = id
                else:
                    print("warning: skipping split reference %s -> %s" %
                          (p[0], p[1]))
                    if False: # Not allowed in docbook
                        seeentry = E.glossentry()
                        seeentry.append(E.glossterm(p[0]))
                        for s in p[1].split(','):
                            s = s.strip()
                            seeentry.append(E.glosssee(otherterm=word2id(s)))
                        glossary.append(seeentry)
                continue

            # Add See also entries pointing to main entry
            if 1 < len(w):
                for t in w[1:]:
                    t = t.strip()
                    if t not in redirects:
                        #print("info: Adding see also entry for %s" % t)
                        seeentry = E.glossentry()
                        seeentry.append(E.glossterm(t))
                        seeentry.append(E.glosssee(otherterm=id))
                        glossary.append(seeentry)
                        redirects[t] = id
            elif ldesc not in e:
                print("warning: term %s missing primary language %s description" % (e[lang], lang))
            entry = E.glossentry(id=id)
            if list_topic and 'topic' in e:
                entry.append(E.glossterm('%s [%s]' % (e[lang], e['topic'])))
            else:
                entry.append(E.glossterm(e[lang]))
            indexit(entry, e[lang])
            lstr = ""
            for l in langcodes:
                if l != lang and l in e:
                    lstr += "%s (%s) " % (e[l], l)
                    # Add foreign words to the index, split on comma
                    indexit(entry, e[l], l)
            if "" != lstr:
                entry.append(E.glossdef(E.para(lstr)))
            else:
                # Only a single word without translations, skip it
                continue
            for desccode in desccodes:
                codestr = 'desc-%s' % desccode
                if codestr in e:
                    entry.append(E.glossdef(E.para("(%s): %s" % (desccode,
                                                                 e[codestr]))))
            glossary.append(entry)

    def glosstermlocale(x):
        # Look up the glossterm text (FIXME figure out a more robust way)
        t = x[0].text
        if t:
            return locale.strxfrm(t)
        else:
            return ""
    # Sort the list so the see-also entries end up in their proper place.
    glossary[:] = sorted(glossary, key=glosstermlocale)

    l = len(glossary)
    print("info: dictionary contains %d entries" % l)

    content = etree.tostring(glossary,
                             pretty_print=True,
                             xml_declaration=True,
                             encoding='UTF-8')
    # print(content)
    with open(output, 'wb') as f:
        f.write(content)

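# Command line interface: the positional argument selects the glossary
# language.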
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument("langcode", help="language code to generate glossary for")
parser.add_argument("--output", help="where to store the glossary")
args = parser.parse_args()

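# Use the locale from the environment so locale.strxfrm() applies the
# user's collation rules when sorting.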
locale.setlocale(locale.LC_ALL, '')

if 'nb' == args.langcode:
    print("Norsk/bokmål")  # Norwegian Bokmål
    print()
    make_glossary_docbook(lang='nb', desccodes=('nb',),
                          langcodes=('en', 'se', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
elif 'se' == args.langcode:
    print("Nordsamisk")  # Northern Saami
    print()
    make_glossary_docbook(lang='se', desccodes=('se', 'nb'),
                          langcodes=('nb', 'en', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
elif 'en' == args.langcode:
    print("Engelsk")  # English
    print()
    make_glossary_docbook(lang='en', desccodes=('en',),
                          langcodes=('en', 'nb', 'se', 'sv', 'da', 'fi', 'is',),
                          output=args.output)
else:
    print("error: Unknown language code %s" % args.langcode)
    sys.exit(1)