1 /* $Id: FrenchVerbDictionary.h,v 1.44 2012/04/24 02:46:05 sarrazip Exp $
2 FrenchVerbDictionary.h - Dictionary of verbs and conjugation templates
4 verbiste - French conjugation system
5 Copyright (C) 2003-2010 Pierre Sarrazin <http://sarrazip.com/>
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License
9 as published by the Free Software Foundation; either version 2
10 of the License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 #ifndef _H_FrenchVerbDictionary
24 #define _H_FrenchVerbDictionary
26 #include <verbiste/c-api.h>
27 #include <verbiste/misc-types.h>
28 #include <verbiste/Trie.h>
30 #include <libxml/xmlmemory.h>
31 #include <libxml/parser.h>
41 /** C++ namespace in which all of this library's names are defined. */
45 /** French verbs and conjugation knowledge base.
46 The text processing done by this class is case-sensitive.
48 class FrenchVerbDictionary
// Language identifiers used by this class (see parseLanguageCode(),
// getLanguageCode() and getLanguage()). NO_LANGUAGE is the value that
// parseLanguageCode() documents as its fallback result.
52 enum Language { NO_LANGUAGE, FRENCH, ITALIAN, GREEK };
54 /** Returns the language identifier recognized in the given string.
55 @param twoLetterCode string containing a two-letter language code
(the getXMLFilenames() documentation gives "fr" and "it"
as examples of such codes)
56 @returns a member of the 'Language' enum,
57 or NO_LANGUAGE if the string was
60 static Language parseLanguageCode(const std::string &twoLetterCode);
62 /** Returns the two-letter code that names the given language identifier.
63 @param l language identifier (a member of the 'Language' enum)
64 @returns empty string if 'l' is NO_LANGUAGE or invalid,
65 or a two-letter string otherwise
67 static std::string getLanguageCode(Language l);
69 /** Returns the full path names of the conjugation and verb XML files
70 for the given language code.
Both paths are returned through the two reference parameters.
71 @param conjFN (output) receives the full path of the conjugation template XML file
72 @param verbsFN (output) receives the full path of the verb list XML file
73 @param languageCode "fr" for French or "it" for Italian
NOTE(review): the Language enum also lists GREEK; confirm whether
a Greek code is accepted here
75 static void getXMLFilenames(std::string &conjFN, std::string &verbsFN,
78 /** Loads a conjugation database from the two given XML documents.
79 @param conjugationFilename filename of the XML document that
80 defines all the conjugation templates
81 @param verbsFilename filename of the XML document that
82 defines all the known verbs and their
83 corresponding template
84 @param includeWithoutAccents fill knowledge base with variants of
85 verbs where some or all accents are missing
86 @param lang language of the dictionary
87 @throws logic_error for invalid arguments,
88 unparseable or unexpected XML documents
NOTE(review): the throw(...) dynamic exception specification used below
was deprecated in C++11 and removed in C++17; if the code base moves to
a newer standard, drop it in lockstep with the definition
90 FrenchVerbDictionary(const std::string &conjugationFilename,
91 const std::string &verbsFilename,
92 bool includeWithoutAccents,
94 throw (std::logic_error);
96 /** Loads the French conjugation database.
97 Uses the default (hard-coded) location for the French dictionary's
99 @param includeWithoutAccents include in the knowledge base variants
100 of verbs where some or all accents are missing
101 @throws logic_error for invalid filename arguments,
102 unparseable or unexpected XML documents
103 (if verbs or template names are
104 mentioned, they are in Latin-1)
NOTE(review): this overload takes no filename argument, so "invalid
filename arguments" presumably refers to the hard-coded default
location; confirm
106 FrenchVerbDictionary(bool includeWithoutAccents) throw (std::logic_error);
108 /** Frees the memory used by this dictionary.
110 ~FrenchVerbDictionary();
112 /** Returns a conjugation template specification from its name.
113 @param templateName name of the desired template, of the form
radical:termination (e.g. "aim:er")
114 @returns a pointer to the TemplateSpec object if found,
117 const TemplateSpec *getTemplate(const std::string &templateName) const;
119 /** Returns an iterator for the start of the list of conjugation templates (pair it with endConjugSys()). */
120 ConjugationSystem::const_iterator beginConjugSys() const;
122 /** Returns an iterator for the end of the list of conjugation templates (pair it with beginConjugSys()). */
123 ConjugationSystem::const_iterator endConjugSys() const;
125 /** Returns the set of templates used by a verb.
126 @param infinitive infinitive form of the verb in Latin-1
127 (e.g., "manger", not "mange")
128 @returns a set of template names of the form radical:termination
129 in Latin-1 (e.g., "aim:er"),
130 or an empty set if the verb is unknown
131 or if 'infinitive' is NULL
133 const std::set<std::string> &getVerbTemplateSet(const char *infinitive) const;
135 /** Returns the set of templates used by a verb (std::string overload).
136 @param infinitive infinitive form of the verb in Latin-1
137 (e.g., "manger", not "mange")
138 @returns a set of template names of the form radical:termination
139 in Latin-1 (e.g., "aim:er"),
140 or an empty set if the verb is unknown
141 (a reference is returned, so the result is never NULL)
143 const std::set<std::string> &getVerbTemplateSet(const std::string &infinitive) const;
145 /** Returns an iterator for the start of the list of known verbs (pair it with endKnownVerbs()). */
146 VerbTable::const_iterator beginKnownVerbs() const;
148 /** Returns an iterator for the end of the list of known verbs (pair it with beginKnownVerbs()). */
149 VerbTable::const_iterator endKnownVerbs() const;
152 /** Describes an inflection according to a given conjugation template.
153 If the given inflection is known to the given conjugation template,
154 the list of possible modes, tenses, persons and numbers is returned.
155 For example, the inflection "es" in the "aim:er" template
156 can be the 2nd person singular of the indicative present
158 or the 2nd person singular of the subjunctive present.
160 Here, two ModeTensePersonNumber objects would be in the returned vector.
162 @param templateName name of the conjugation template to use (e.g. "aim:er")
164 @param inflection inflection to be described (e.g. "es")
166 @returns a pointer to a vector of ModeTensePersonNumber
167 objects (which must be neither modified nor
168 destroyed), or NULL if the inflection was not
169 known to the template
171 const std::vector<ModeTensePersonNumber> *getMTPNForInflection(
172 const std::string &templateName,
173 const std::string &inflection) const;
175 /** Converts an English mode name into the corresponding enumerated type.
176 @param modeName English mode name (infinitive, indicative, etc)
177 @returns a member of the Mode enumeration
178 (INVALID_MODE if 'modeName' is not known)
180 static Mode convertModeName(const char *modeName);
182 /** Converts an English tense name into the corresponding enumerated type.
183 @param tenseName English tense name (present, past, etc)
184 @returns a member of the Tense enumeration
185 (the invalid-tense member if 'tenseName' is not known)
NOTE(review): this text used to say INVALID_MODE and 'modeName',
apparently copied from convertModeName(); the member is presumably
INVALID_TENSE -- confirm against the Tense enumeration
187 static Tense convertTenseName(const char *tenseName);
189 /** Analyzes a conjugated verb and finds all known possible cases.
NOTE(review): unlike the other lookup methods this one is not const,
presumably because the search goes through VerbTrie::setDestination();
confirm against the implementation
190 @param utf8ConjugatedVerb conjugated verb in UTF-8 (documented as French;
presumably in this dictionary's language -- confirm)
192 @param results vector in which to store the inflection
193 descriptions (this vector is not emptied
194 before elements are stored in it);
195 no elements are stored in this vector
196 if the given conjugated verb is unknown
198 void deconjugate(const std::string &utf8ConjugatedVerb,
199 std::vector<InflectionDesc> &results);
201 /** Returns the English name (in ASCII) of the given mode (presumably the reverse of convertModeName(); confirm).
203 static const char *getModeName(Mode m);
205 /** Returns the English name (in ASCII) of the given tense (presumably the reverse of convertTenseName(); confirm).
207 static const char *getTenseName(Tense t);
209 /** Converts the Latin-1 characters of a wide character string to lower-case.
NOTE(review): presumably driven by the 256-entry latin1TolowerTable member,
with characters outside Latin-1 left unchanged; confirm
210 @param wideString Unicode character string to be converted
211 @returns lower-case version of the character string
213 std::wstring tolowerWide(const std::wstring &wideString) const;
215 /** Determines if a Unicode character is a vowel.
216 Only works on Latin-1 characters.
217 @param c Unicode character code
218 @returns true iff 'c' is a Latin-1 vowel.
220 static bool isWideVowel(wchar_t c);
222 /** Converts a UTF-8 string to a wide character string.
223 @param utf8String UTF-8 string to be converted
224 @returns Unicode string
@throws int NOTE(review): the meaning of the thrown int is not documented
here; the XML text getters of this class document an errno value
set by iconv(3) -- presumably the same applies; confirm
226 std::wstring utf8ToWide(const std::string &utf8String) const throw(int);
228 /** Converts a wide character string to a UTF-8 string.
229 @param wideString Unicode string to be converted
230 @returns UTF-8 string
@throws int NOTE(review): undocumented; presumably an errno value set by
iconv(3), as for the XML text getters -- confirm
232 std::string wideToUTF8(const std::wstring &wideString) const throw(int);
234 /** Removes accents from accented letters in the given string.
235 @param utf8String UTF-8 string with accented characters
236 @returns a UTF-8 string with the accents removed
238 std::string removeUTF8Accents(const std::string &utf8String);
240 /** Forms all unaccented variants of a wide character string.
241 If N letters are accented in 'wideString', then 2^N variants
242 are returned, where each accented letter either keeps or loses
244 For example, the word "été" has 3 unaccented variants:
245 "eté", "ete" and "éte".
NOTE(review): with N = 2 the example lists 3 = 2^N - 1 variants, i.e. it
leaves out the unchanged word; confirm whether the unchanged word is
also pushed into 'utf8Variants'
246 @param wideString wide character string with accented characters
247 @param index pass zero (recursive calls to this function
248 pass non-zero indices)
249 @param utf8Variants vector to which UTF-8 strings are added
250 with push_back (this function does not
251 clear the vector beforehand)
253 void formUTF8UnaccentedVariants(const std::wstring &wideString,
255 std::vector<std::string> &utf8Variants);
257 /** Forms all unaccented variants of a UTF-8 string.
258 If N letters are accented in 'utf8String', then 2^N variants
259 are returned, where each accented letter either keeps or loses
261 For example, the word "été" has 3 unaccented variants:
262 "eté", "ete" and "éte".
NOTE(review): same 2^N versus 2^N - 1 question as for the wide-string
overload; confirm
263 @param utf8String UTF-8 string with accented characters
264 @param index pass zero (recursive calls to this function
265 pass non-zero indices)
266 @param utf8Variants vector to which UTF-8 strings are added
267 with push_back (this function does not
268 clear the vector beforehand)
270 void formUTF8UnaccentedVariants(const std::string &utf8String,
272 std::vector<std::string> &utf8Variants);
275 /** Returns the content of an XML node in UTF-8.
276 @param doc the XML document
277 @param node the node of the XML document whose contents
279 @returns a UTF-8 string representing the contents
280 of the node; this string is empty if the
281 requested node does not exist
NOTE(review): this text used to say "Latin-1", which contradicts the
summary line and the function name; the @throws text below still
mentions a UTF-8 to Latin-1 conversion -- confirm against the
implementation
282 @throws int errno value set by iconv(3), in the case of a
283 UTF-8 to Latin-1 conversion error
285 std::string getUTF8XmlNodeText(
286 xmlDocPtr doc, xmlNodePtr node) throw(int);
288 /** Returns the content of an XML property in UTF-8.
289 For example, if 'node' represents <foo type='xyz'/>,
290 then passing "type" for 'propName' will return "xyz".
291 @param node the node of the XML document
292 @param propName the name of the property to extract
293 @returns a UTF-8 string representing the contents
294 of the property; this string is empty if the
295 requested property does not exist
NOTE(review): this text used to say "Latin-1", which contradicts the
summary line and the function name; the @throws text below still
mentions a UTF-8 to Latin-1 conversion -- confirm against the
implementation
296 @throws int errno value set by iconv(3), in the case of a
297 UTF-8 to Latin-1 conversion error
299 std::string getUTF8XmlProp(
300 xmlNodePtr node, const char *propName) throw(int);
303 /** Gets the radical part of an infinitive, according to a template name.
304 @param infinitive infinitive whose radical is requested
305 @param templateName name of the conjugation template that applies
(of the form radical:termination, e.g. "aim:er")
306 @returns a prefix of 'infinitive'
307 @throws logic_error if the template name is invalid (no ':' found)
309 static std::string getRadical(
310 const std::string &infinitive,
311 const std::string &templateName) throw(std::logic_error);
313 /** Generates the conjugation of a verb for a given mode and tense.
314 The generated words are complete; they are not just inflections.
Declared throw(): failure is reported through the return value,
not by an exception.
315 @param radical radical part of the verb to conjugate (see getRadical())
316 @param templ conjugation template to apply
317 @param mode mode to use
318 @param tense tense to use
319 @param dest vector of vectors of strings into which to
320 store the results; the result is a list of
321 "persons", and a person is a list of
323 @param includePronouns put pronouns before conjugated verbs in the
324 modes where pronouns are used
325 @param aspirateH notifies this function that the verb starts
326 with an aspirate h (e.g., "hacher", which
327 gives "je hache") instead of a silent h
328 (e.g., "habiter", which gives "j'habite")
329 @param isItalian language used (true for Italian, false for French)
330 @returns true for success, or false if the mode or
333 bool generateTense(const std::string &radical,
334 const TemplateSpec &templ,
337 std::vector< std::vector<std::string> > &dest,
338 bool includePronouns,
340 bool isItalian) const throw();
342 /** Indicates if the given verb starts with an aspirate h.
343 An aspirate h means that one cannot make a contraction or liaison
344 in front of the word. For example, "hacher" has an aspirate h
345 and this means that one says "je hache" and not "j'hache".
346 The verb "habiter" however does not have an aspirate h, so one
347 says "j'habite" and not "je habite".
@param infinitive infinitive form of the verb to test
@returns true if the verb starts with an aspirate h
(NOTE(review): presumably decided by a lookup in the
aspirateHVerbs member; confirm)
349 bool isVerbStartingWithAspirateH(
350 const std::string &infinitive) const throw();
352 /** Returns the member of the 'Language' enum that identifies this dictionary's language (the value of 'lang').
354 Language getLanguage() const { return lang; }
358 // User data employed in the Verb Trie.
359 // Remembers the correct spelling of the verb, in case the user
360 // reached a trie entry through tolerance of missing accents.
361 // This way, if the user enters "etaler", the displayed conjugation
362 // will show the missing acute accent on the first "e".
// The constructor below copies its two arguments into the two members:
// 't' into templateName and 'r' into correctVerbRadical.
367 TrieValue(const std::string &t, const std::string &r)
368 : templateName(t), correctVerbRadical(r) {}
370 std::string templateName;  // name of a conjugation template that can apply to the radical
371 std::string correctVerbRadical;  // radical with its correct spelling (accents included)
374 /** Trie that contains all known verb radicals.
375 The associated information is a list of template names
376 that can apply to the radical.
377 The verb radicals and the template names are stored in Latin-1.
NOTE(review): the callback below receives the searched string as a
std::wstring, so the "stored in Latin-1" statement may be outdated;
confirm against the implementation
379 class VerbTrie : public Trie< std::vector<TrieValue> >
382 const FrenchVerbDictionary &fvd;
383 std::vector<InflectionDesc> *results;
385 /** Constructs a trie that keeps a reference to the dictionary.
386 @param d reference to the verb dictionary
388 VerbTrie(const FrenchVerbDictionary &d)
389 : Trie< std::vector<TrieValue> >(true),
395 /** Callback invoked by the Trie<>::get() method.
396 Inherited from Trie<>.
397 This callback will be called for each prefix of the searched
398 string that corresponds to the radical of a known verb.
399 Stores data in the vector<InflectionDesc> designated by
400 the last call to setDestination().
401 @param conjugatedVerb the searched string
402 @param index length of the prefix
403 @param templateList list of conjugation templates that
404 might apply to the conjugated verb
406 virtual void onFoundPrefixWithUserData(
407 const std::wstring &conjugatedVerb,
408 std::wstring::size_type index,
409 const std::vector<TrieValue> *templateList) const
412 /** Sets the destination vector in which onFoundPrefixWithUserData() stores results.
413 When the Trie<>::get() method is called on this object,
414 it may invoke the onFoundPrefixWithUserData() virtual method.
415 onFoundPrefixWithUserData() will store any results in the vector designated here.
416 After calling get(), iterate through the vector to obtain
417 the possible inflections of the conjugated verb.
418 @param d destination vector designated as the
419 repository for results (may be NULL)
421 void setDestination(std::vector<InflectionDesc> *d)
427 // Forbidden operations: copy construction and copy assignment
428 VerbTrie(const VerbTrie &);
429 VerbTrie &operator = (const VerbTrie &);
// VerbTrie keeps a reference to its owning dictionary ('fvd'); the friend
// declaration gives it access to this class's non-public members.
432 friend class VerbTrie;
// NOTE(review): the member descriptions below are inferred from the public
// API of this class and should be confirmed against the implementation.
436 ConjugationSystem conjugSys;  // presumably the templates exposed by beginConjugSys()/endConjugSys()
437 VerbTable knownVerbs;  // presumably the verbs exposed by beginKnownVerbs()/endKnownVerbs()
438 std::set<std::string> aspirateHVerbs;  // presumably the verbs that start with an aspirate h
439 InflectionTable inflectionTable;
440 iconv_t wideToUTF8Conv;  // iconv descriptor, presumably used by wideToUTF8()
441 iconv_t utf8ToWideConv;  // iconv descriptor, presumably used by utf8ToWide()
442 char latin1TolowerTable[256];  // 256 entries; presumably one per Latin-1 code, used by tolowerWide()
// Internal helpers for loading the two XML documents.
// The throw(std::logic_error) specifications match the @throws documented
// on the constructors of this class.
// NOTE(review): none of these helpers is documented in this header; what
// each one does must be checked in the implementation file.
// NOTE(review): throw(...) dynamic exception specifications were deprecated
// in C++11 and removed in C++17.
448 void init(const std::string &conjugationFilename,
449 const std::string &verbsFilename,
450 bool includeWithoutAccents)
451 throw (std::logic_error);
452 void loadConjugationDatabase(const char *conjugationFilename,
453 bool includeWithoutAccents)
454 throw (std::logic_error);
455 void loadVerbDatabase(const char *verbsFilename,
456 bool includeWithoutAccents)
457 throw (std::logic_error);
458 void readConjugation(xmlDocPtr doc,
459 bool includeWithoutAccents) throw(std::logic_error);
// 'dest' is an output vector; 'mascSing' is presumably the masculine
// singular form of a past participle -- confirm.
460 static void generateOtherPastParticiple(const char *mascSing,
461 std::vector<std::string> &dest);
462 void readVerbs(xmlDocPtr doc,
463 bool includeWithoutAccents)
464 throw(std::logic_error);
// The three arguments presumably mirror the trie key (verbRadical) and the
// two fields of TrieValue (tname, correctVerbRadical) -- confirm.
465 void insertVerbRadicalInTrie(const std::string &verbRadical,
466 const std::string &tname,
467 const std::string &correctVerbRadical);
469 // Forbidden operations: copy construction and copy assignment.
// NOTE(review): presumably declared in a non-public section and never
// defined, so that copying fails to compile or link; confirm.
470 FrenchVerbDictionary(const FrenchVerbDictionary &x);
471 FrenchVerbDictionary &operator = (const FrenchVerbDictionary &x);
475 } // namespace verbiste
478 #endif /* _H_FrenchVerbDictionary */