2 * Copyright (C) 2007 by INdT
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 * @author Gustavo Sverzut Barbieri <gustavo.barbieri@openbossa.org>
21 #include "lightmediascanner_charset_conv.h"
/*
 * State of one charset conversion tool.
 * NOTE(review): the member declarations are not visible in this excerpt.
 * The code below accesses the fields check, fallback, convs, names and size.
 */
29 struct lms_charset_conv {
38 * Create a new charset conversion tool controlling its behavior.
40 * The conversion tool will try to convert provided strings to UTF-8; you just need
41 * to register known charsets with lms_charset_conv_add() and then call
44 * @return newly allocated conversion tool or NULL on error.
/*
 * Allocate a conversion tool and set up its optional UTF-8 checker and
 * fallback iconv descriptors.
 * NOTE(review): several lines of this function (branch conditions, returns,
 * error paths) are not visible in this excerpt; the comments below describe
 * only the statements that are shown.
 */
47 lms_charset_conv_new_full(int use_check, int use_fallback)
49 lms_charset_conv_t *lcc;
51 lcc = malloc(sizeof(*lcc));
/* (iconv_t)-1 marks the checker as "not opened"; presumably used when
 * use_check is zero -- the selecting condition is not visible here. */
58 lcc->check = (iconv_t)-1;
/* UTF-8 to UTF-8 descriptor: converting through it is how the tool tests
 * whether a string is already UTF-8. */
60 lcc->check = iconv_open("UTF-8", "UTF-8");
61 if (lcc->check == (iconv_t)-1) {
62 perror("ERROR: could not create conversion checker");
/* Same "not opened" marker for the fallback descriptor; presumably used when
 * use_fallback is zero -- confirm against the full source. */
68 lcc->fallback = (iconv_t)-1;
/* Last-resort descriptor; the "//IGNORE" suffix is an iconv_open() extension
 * that asks the converter to discard characters it cannot convert. */
70 lcc->fallback = iconv_open("UTF-8//IGNORE", "UTF-8");
71 if (lcc->fallback == (iconv_t)-1) {
72 perror("ERROR: could not create conversion fallback");
/* Close the checker only if it was actually opened. Presumably part of the
 * cleanup after a fallback failure -- surrounding control flow not shown. */
83 if (lcc->check != (iconv_t)-1)
84 iconv_close(lcc->check);
92 * Create a new charset conversion tool.
94 * The conversion tool will try to convert provided strings to UTF-8; you just need
95 * to register known charsets with lms_charset_conv_add() and then call
98 * @return newly allocated conversion tool or NULL on error.
/*
 * Convenience constructor: delegates to lms_charset_conv_new_full() with
 * both use_check and use_fallback set to 1.
 */
101 lms_charset_conv_new(void)
103 return lms_charset_conv_new_full(1, 1);
107 * Free existing charset conversion tool.
109 * @param lcc existing Light Media Scanner charset conversion.
/*
 * Release the iconv descriptors owned by a conversion tool.
 * NOTE(review): the lines that free the arrays and the tool itself, and any
 * argument check, are not visible in this excerpt.
 */
112 lms_charset_conv_free(lms_charset_conv_t *lcc)
/* (iconv_t)-1 means the descriptor was never opened, so it must not be
 * passed to iconv_close(). */
119 if (lcc->check != (iconv_t)-1)
120 iconv_close(lcc->check);
121 if (lcc->fallback != (iconv_t)-1)
122 iconv_close(lcc->fallback);
/* Close the descriptor of every registered charset. */
124 for (i = 0; i < lcc->size; i++) {
125 iconv_close(lcc->convs[i]);
137 * Register new charset to conversion tool.
139 * @param lcc existing Light Media Scanner charset conversion.
140 * @param charset charset name as understood by iconv_open(3).
142 * @return On success 0 is returned.
/*
 * Register a charset: open an iconv descriptor converting from it to UTF-8
 * and append the descriptor and a copy of the charset name to the tool.
 * NOTE(review): the computation of ns and idx, the handling of realloc()
 * failures and the return statements are not visible in this excerpt.
 */
145 lms_charset_conv_add(lms_charset_conv_t *lcc, const char *charset)
/* Descriptor converts FROM the given charset TO UTF-8. */
157 cd = iconv_open("UTF-8", charset);
158 if (cd == (iconv_t)-1) {
159 fprintf(stderr, "ERROR: could not add conversion charset '%s': %s\n",
160 charset, strerror(errno));
/* The realloc() result goes into a temporary rather than straight into
 * lcc->convs; ns is presumably the new element count -- confirm. */
167 convs = realloc(lcc->convs, ns * sizeof(*convs));
171 lcc->convs[idx] = cd;
/* The names array is resized to the same ns elements as convs. */
173 names = realloc(lcc->names, ns * sizeof(*names));
/* Keep a private copy of the name; strdup() may fail and return NULL. */
177 lcc->names[idx] = strdup(charset);
178 if (!lcc->names[idx])
/*
 * Linear search of the registered charset names for an exact,
 * case-sensitive match (strcmp) with the given charset.
 * NOTE(review): the return statements are not visible in this excerpt; the
 * caller treats the result as an index into lcc->convs / lcc->names.
 */
191 _find(const lms_charset_conv_t *lcc, const char *charset)
195 for (i = 0; i < lcc->size; i++)
196 if (strcmp(lcc->names[i], charset) == 0)
203 * Forget about previously registered charset in conversion tool.
205 * @param lcc existing Light Media Scanner charset conversion.
206 * @param charset charset name.
208 * @return On success 0 is returned.
/*
 * Unregister a charset: close its descriptor, free its name and compact
 * both arrays over the removed slot.
 * NOTE(review): argument checks, return statements and the handling of the
 * realloc() results are not visible in this excerpt.
 */
211 lms_charset_conv_del(lms_charset_conv_t *lcc, const char *charset)
223 idx = _find(lcc, charset);
225 fprintf(stderr, "ERROR: could not find charset '%s'\n", charset);
/* Release the resources held by the entry being removed. */
229 iconv_close(lcc->convs[idx]);
230 free(lcc->names[idx]);
/* Shift the following entries down by one slot.
 * NOTE(review): this reads element idx + 1, so lcc->size must already have
 * been decremented before the loop to stay in bounds -- the decrement is not
 * visible here; confirm against the full source. */
233 for (; idx < lcc->size; idx++) {
234 lcc->convs[idx] = lcc->convs[idx + 1];
235 lcc->names[idx] = lcc->names[idx + 1];
/* Resize convs to the current lcc->size; the result goes into a temporary. */
238 convs = realloc(lcc->convs, lcc->size * sizeof(*convs));
242 perror("could not realloc 'convs'");
/* Same resize for the names array. */
244 names = realloc(lcc->names, lcc->size * sizeof(*names));
248 perror("could not realloc 'names'");
/*
 * Run the input through the tool's UTF-8 checker descriptor, writing the
 * converted bytes into the caller-provided scratch buffer ostr.
 * NOTE(review): the initialisation of inlen, outbuf and outlen and the
 * return statements are not visible in this excerpt.
 */
254 _check(lms_charset_conv_t *lcc, const char *istr, unsigned int ilen, char *ostr, unsigned int olen)
256 char *inbuf, *outbuf;
257 size_t r, inlen, outlen;
/* The checker is optional; (iconv_t)-1 means it was never opened. */
259 if (lcc->check == (iconv_t)-1)
/* iconv() takes a non-const char **, hence the cast away from const. */
262 inbuf = (char *)istr;
/* Calling iconv() with NULL buffers resets the descriptor to its initial
 * state before the actual conversion. */
267 iconv(lcc->check, NULL, NULL, NULL, NULL);
268 r = iconv(lcc->check, &inbuf, &inlen, &outbuf, &outlen);
/*
 * Convert *p_str with descriptor cd through the scratch buffer ostr, then
 * resize *p_str for the converted text and update *p_len.
 * NOTE(review): buffer set-up, error checks, the copy of the converted bytes
 * into the resized string and the return statements are not visible here.
 */
276 _conv(iconv_t cd, char **p_str, unsigned int *p_len, char *ostr, unsigned int olen)
278 char *inbuf, *outbuf;
279 size_t r, inlen, outlen;
/* Reset the descriptor to its initial state, then convert. */
286 iconv(cd, NULL, NULL, NULL, NULL);
287 r = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);
/* New length = scratch capacity minus the space iconv() left unused
 * (assumes outlen started out equal to olen -- not visible here). */
291 *p_len = olen - outlen;
/* Resize the caller's string to the new length plus one byte for the NUL. */
295 outbuf = realloc(*p_str, *p_len + 1);
301 (*p_str)[*p_len] = '\0';
/*
 * Walk the first len bytes of s, one byte per iteration.
 * NOTE(review): the loop body is not visible in this excerpt; going by the
 * name it presumably rewrites non-ASCII bytes in place -- confirm.
 */
307 _fix_non_ascii(char *s, int len)
309 for (; len > 0; len--, s++)
315 * If required, do charset conversion to UTF-8.
317 * @param lcc existing Light Media Scanner charset conversion.
318 * @param p_str string to be converted.
319 * @param p_len string size.
321 * @note the check for string being already UTF-8 is not reliable,
322 * some cases might show false positives (UTF-16 is considered UTF-8).
323 * @see lms_charset_conv_check()
325 * @return On success 0 is returned.
/*
 * Convert *p_str to UTF-8 if needed: run the UTF-8 check, then try every
 * registered charset in registration order, then the fallback descriptor,
 * and finally _fix_non_ascii().
 * NOTE(review): argument checks, the computation of outlen, return
 * statements and the freeing of outstr are not visible in this excerpt.
 */
328 lms_charset_conv(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len)
/* A NULL or zero-length string is special-cased. */
339 if (!*p_str || !*p_len)
/* Scratch output buffer of outlen + 1 bytes. */
343 outstr = malloc(outlen + 1);
/* _check() returning 0 presumably means the string is already UTF-8. */
349 if (_check(lcc, *p_str, *p_len, outstr, outlen) == 0) {
/* Try the registered converters in order; a 0 result from _conv() is the
 * case singled out here. */
354 for (i = 0; i < lcc->size; i++)
355 if (_conv(lcc->convs[i], p_str, p_len, outstr, outlen) == 0)
/* The fallback is optional; (iconv_t)-1 means it was never opened. */
358 if (lcc->fallback == (iconv_t)-1)
/* NOTE(review): '%*s' sets a minimum field width, not a maximum length. If
 * the arguments on the following (not shown) line are the length and a
 * string that may lack a NUL terminator, '%.*s' was probably intended. */
362 "WARNING: could not convert '%*s' to any charset, use fallback\n",
364 i = _conv(lcc->fallback, p_str, p_len, outstr, outlen);
366 _fix_non_ascii(*p_str, *p_len);
373 * Forcefully do charset conversion to UTF-8.
375 * @param lcc existing Light Media Scanner charset conversion.
376 * @param p_str string to be converted.
377 * @param p_len string size.
379 * @note This function does not check for the string being in UTF-8 before
380 * doing the conversion, use it if you are sure about the charset.
381 * In this case you'll usually have just one charset added.
383 * @return On success 0 is returned.
/*
 * Convert *p_str to UTF-8 without first checking whether it already is
 * UTF-8: try every registered charset in registration order, then the
 * fallback descriptor, and finally _fix_non_ascii().
 * NOTE(review): argument checks, the computation of outlen, return
 * statements and the freeing of outstr are not visible in this excerpt.
 */
386 lms_charset_conv_force(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len)
/* A NULL or zero-length string is special-cased. */
397 if (!*p_str || !*p_len)
/* Scratch output buffer of outlen + 1 bytes. */
401 outstr = malloc(outlen + 1);
/* Try the registered converters in order; a 0 result from _conv() is the
 * case singled out here. */
407 for (i = 0; i < lcc->size; i++)
408 if (_conv(lcc->convs[i], p_str, p_len, outstr, outlen) == 0)
/* The fallback is optional; (iconv_t)-1 means it was never opened. */
411 if (lcc->fallback == (iconv_t)-1)
/* NOTE(review): '%*s' sets a minimum field width, not a maximum length. If
 * the arguments on the following (not shown) line are the length and a
 * string that may lack a NUL terminator, '%.*s' was probably intended. */
415 "WARNING: could not convert '%*s' to any charset, use fallback\n",
417 i = _conv(lcc->fallback, p_str, p_len, outstr, outlen);
419 _fix_non_ascii(*p_str, *p_len);
426 * Check if the string is not UTF-8 and conversion is required.
428 * @param lcc existing Light Media Scanner charset conversion.
429 * @param str string to be analysed.
430 * @param len string size.
432 * @note current implementation is not reliable, it tries to convert from
433 * UTF-8 to UTF-8. Some cases, like ISO-8859-1 will work, but some like
434 * UTF-16 to UTF-8 will say it's already in the correct charset,
437 * @return 0 if string is already UTF-8.
/*
 * Public wrapper around _check(): allocates a scratch buffer and runs the
 * UTF-8 check on str without modifying it.
 * NOTE(review): argument checks, the computation of outlen, the freeing of
 * outstr and the return statement are not visible in this excerpt.
 */
440 lms_charset_conv_check(lms_charset_conv_t *lcc, const char *str, unsigned int len)
/* Scratch buffer only; the converted output is not handed back to the caller
 * through this function's parameters. */
451 outstr = malloc(outlen);
457 r = _check(lcc, str, len, outstr, outlen);