2 Copyright 1998-2003 Victor Wagner
3 Copyright 2003 Alex Ott
4 This file is released under the GPL. Details can be
5 found in the file COPYING accompanying this distribution.
/* Global charset configuration. CHARSETPATH, SOURCE_CHARSET, TARGET_CHARSET */
/* and UNKNOWN_CHAR are defined outside this excerpt. */
/* Search path handed to find_file() when read_charset() locates a file. */
15 char *charset_path=CHARSETPATH;
/* Presumably the names of the source and destination charsets. */
16 char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;
/* Table passed to to_unicode() by get_8bit_char(). */
/* NOTE(review): declared uint16_t * here while to_unicode() takes */
/* short int * - confirm the intended element type. */
17 uint16_t * source_charset;
19 char bad_char[]=UNKNOWN_CHAR;
/* Reverse map consulted by convert_char(); NULL means UTF-8 output. */
20 CHARSET target_charset;
21 /************************************************************************/
22 /* Converts char in input charset into unicode representation */
23 /* Should be converted to macro */
/* charset: direct (charset -> unicode) table, presumably the 256-entry */
/* array built by read_charset() - confirm. c: character code to convert. */
/* NOTE(review): get_8bit_char() passes source_charset, which is declared */
/* uint16_t *, to this short int * parameter - confirm the element type. */
24 /************************************************************************/
25 int to_unicode (short int *charset, int c) {
28 /************************************************************************/
29 /* Searches the inverse charset record for the given unicode char and */
30 /* returns its 0-255 char value if found, -1 otherwise */
/* The high byte of u, (unsigned)u>>8, selects a sub-table of the map; */
/* a NULL sub-table fails the test below. */
/* NOTE(review): no upper bound on u is visible here; make_reverse_map() */
/* allocates 256 top-level entries, so u > 0xFFFF would index past the */
/* map - confirm callers guarantee the range. */
31 /************************************************************************/
32 int from_unicode (CHARSET charset, int u) {
34 /* This is really an assignment, not a comparison */
35 if ((p=charset[(unsigned)u>>8])) {
41 /************************************************************************/
42 /* Converts direct (charset -> unicode) to reverse map */
/* The top level holds 256 pointers, zero-filled by calloc(); each */
/* sub-table allocated below holds 256 short ints preset to -1, matching */
/* the -1 that from_unicode() documents for chars that are not found. */
43 /************************************************************************/
44 CHARSET make_reverse_map(short int *charset) {
/* NOTE(review): calloc() is declared as calloc(nmemb, size); the two */
/* arguments are swapped here, which still requests the same total size. */
45 CHARSET newmap=calloc(sizeof(short int *), 256);
56 newmap[j] = (short int *)malloc(sizeof(short int)*256);
58 fprintf(stderr,"Insufficient memory for charset\n");
/* Mark every slot of the fresh sub-table as unmapped. */
61 for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;
69 /************************************************************************/
70 /* Reads charset file (as got from ftp.unicode.org) and returns array of*/
71 /* 256 short ints (calloc'ed) mapping from charset to unicode */
72 /************************************************************************/
73 short int * read_charset(const char *filename) {
/* filename gets CHARSET_EXT appended and is looked up along charset_path. */
79 path= find_file(stradd(filename,CHARSET_EXT),charset_path);
81 fprintf(stderr,"Cannot load charset %s - file not found\n",filename);
91 setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);
/* Zero-filled table of 256 entries. */
/* NOTE(review): calloc() takes (nmemb, size); the arguments are swapped */
/* here, which still requests the same total size. */
93 new = calloc(sizeof(short int),256);
/* Each entry is a "char-code unicode-value" pair; the i conversions */
/* accept decimal, octal (leading 0) and hexadecimal (leading 0x) input. */
/* NOTE(review): the declarations of c and uc are not visible; the format */
/* requires int * and long * respectively - confirm. */
98 if (fscanf(f,"%i %li",&c,&uc)==2) {
/* Reject char codes outside 0..255 and unicode values that are negative */
/* or above 0xFEFE, with 0xFFFE allowed as the single exception. */
99 if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {
100 fprintf(stderr,"Invalid charset file %s\n",path);
/* Discard the rest of the current line (stops at newline or end of file). */
107 while((fgetc(f)!='\n')&&!feof(f)) ;
115 /************************************************************************/
116 /* Reads 8-bit char and converts it from source charset */
/* Returns EOF when catdoc_read() delivers no byte, otherwise the */
/* to_unicode() translation of that byte through source_charset. */
/* NOTE(review): the declaration of buf is not visible; if it is a plain */
/* char, bytes >= 0x80 may reach to_unicode() as negative values - */
/* confirm it is unsigned char. */
117 /************************************************************************/
119 int get_8bit_char (FILE *f,long *offset,long fileend)
122 if (catdoc_read(&buf, 1, 1, f)==0) return EOF;
124 return to_unicode(source_charset,buf);
128 /************************************************************************/
129 /* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only, */
130 /* so read lsb first always and don't care about proper bit order */
131 /************************************************************************/
133 int get_utf16lsb (FILE *f,long *offset,long fileend) {
134 unsigned char buf[2];
/* Read the two bytes of one UTF-16 code unit into buf[0], buf[1]. */
136 result=catdoc_read(buf, 1, 2, f);
/* NOTE(review): this places the first byte read (buf[0]) in the high */
/* 8 bits, i.e. it decodes MSB-first, which contradicts both the function */
/* name and the header comment. get_utf16msb() uses the opposite layout, */
/* so the two return expressions look swapped - confirm and exchange. */
145 return ((int)buf[1])|(((int)buf[0])<<8);
148 /************************************************************************/
149 /* Reads 16-bit unicode value written in MSB order. For processing
150 * non-Word files. */
151 /************************************************************************/
152 int get_utf16msb (FILE *f,long *offset,long fileend) {
153 unsigned char buf[2];
/* Read the two bytes of one UTF-16 code unit into buf[0], buf[1]. */
155 result=catdoc_read(buf, 1, 2, f);
/* NOTE(review): this places the first byte read (buf[0]) in the low */
/* 8 bits, i.e. it decodes LSB-first, which contradicts both the function */
/* name and the header comment. get_utf16lsb() uses the opposite layout, */
/* so the two return expressions look swapped - confirm and exchange. */
164 return ((int)buf[0])|(((int)buf[1])<<8);
/* Reads one UTF-8 encoded character from f and returns its code point. */
/* Visible results: EOF when nothing can be read, 0xfeff for a corrupted */
/* sequence, otherwise the value assembled from the lead byte c and the */
/* continuation bytes stored in buf[1] and buf[2]. */
167 int get_utf8 (FILE *f,long *offset,long fileend) {
168 unsigned char buf[3];
171 result=catdoc_read(buf, 1, 1, f);
176 if (result==0) return EOF;
181 return 0xfeff; /* skip corrupted sequences */
/* Two-byte form: 5 payload bits from the lead byte (mask 0x1F) and 6 */
/* from the continuation byte (mask 0x3F). */
183 if (catdoc_read(buf+1, 1, 1, f)<=0) return EOF;
184 return ((c & 0x1F)<<6 | ((char)buf[1] & 0x3F));
/* Three-byte form: 4 payload bits from the lead byte (mask 0x0F), then */
/* 6 bits from each continuation byte. */
/* NOTE(review): a two-item read cannot return more than 2 (assuming */
/* catdoc_read() counts items the way fread() does), so the "<=2" test is */
/* always true and this branch would always return EOF. The two-byte */
/* branch tests "<=0"; "<2" looks intended here - confirm and fix. */
187 if (catdoc_read(buf+1, 1, 2, f)<=2) return (int)EOF;
188 return ((c & 0x0F)<<12)|
189 ((buf[1] & 0x3f)<<6)|
195 /**************************************************************************/
196 /* Converts unicode char to output charset sequence. Conversion has */
197 /* three steps: 1. Replacement map (spec_chars) is searched for the */
198 /* character in case it is not allowed for output format (% in TeX, */
199 /* < in HTML). 2. target charset is searched for this unicode char, if */
200 /* it wasn't replaced. If not found, then 3. Substitution map */
/* (replacements) is searched. If that fails too and unknown_as_hex is */
/* set, the character is formatted as a backslash-x escape in hexbuf. */
201 /**************************************************************************/
202 char *convert_char(int uc) {
203 static char plain_char[]="a"; /* placeholder for one-char sequences */
/* The escape written below needs 7 bytes: backslash, x, four hex */
/* digits and the terminating NUL. */
204 static char hexbuf[8];
207 if ((mapped=map_subst(spec_chars,uc))) return mapped;
208 if (target_charset) {
209 c =from_unicode(target_charset,uc);
214 if ((mapped = map_subst(replacements,uc))) return mapped;
215 if (unknown_as_hex) {
216 sprintf(hexbuf,"\\x%04X",(unsigned)uc);
217 /* This sprintf is safe, because uc is a unicode character code,
218 which cannot be greater than 0xFFFE. It is ensured by routines
225 /* NULL target charset means UTF-8 output */
229 /******************************************************************/
230 /* Converts given unicode character to the utf-8 sequence */
231 /* in the static string buffer. Buffer would be overwritten upon */
233 /******************************************************************/
234 char *to_utf8(unsigned int uc) {
235 static char utfbuffer[4]; /* it shouldn't overflow because we never deal
236 with chars greater than 65535*/
/* NOTE(review): a three-byte sequence plus the terminating NUL fills */
/* utfbuffer exactly; no range check on uc is visible in these lines, */
/* so values above 0xFFFF are presumably excluded by callers - confirm. */
/* Lead byte of the two-byte form. */
243 utfbuffer[count++]=0xC0 | (uc >> 6);
/* Lead byte of the three-byte form, then the continuation byte */
/* carrying bits 6..11. */
245 utfbuffer[count++]=0xE0 | (uc >>12);
246 utfbuffer[count++]=0x80 | ((uc >>6) &0x3F);
/* Final continuation byte: the low 6 bits. */
248 utfbuffer[count++]=0x80 | (uc & 0x3F);
/* Code page number -> charset name table searched by */
/* charset_from_codepage(). That search stops at an entry whose codepage */
/* is 0, so the table has to end with such a sentinel (not visible in */
/* this excerpt). */
259 struct cp_map cp_to_charset [] = {
261 {10001,"mac-japanese"},
262 {10002,"mac-tchinese"},
263 {10003,"mac-korean"},
264 {10004,"mac-arabic"},
265 {10005,"mac-hebrew"},
266 {10006,"mac-greek1"},
267 {10007,"mac-cyrillic"},
268 {10008,"mac-schinese"},
269 {10010,"mac-romania"},
270 {10017,"mac-ukraine"},
272 {10029,"mac-centeuro"},
/* NOTE(review): "mac-iselandic" looks like a misspelling of */
/* "mac-icelandic"; the string is returned as a charset name, so only */
/* change it together with whatever consumes that name - confirm. */
273 {10079,"mac-iselandic"},
274 {10081,"mac-turkish"},
275 {10082,"mac-croatia"},
/* Maps a numeric code page to a charset name. Code pages 1200 and 1201 */
/* are special-cased (handling not visible in this excerpt), values below */
/* 10000 are formatted as "cp" followed by the number, and anything else */
/* is looked up in cp_to_charset. */
289 const char *charset_from_codepage(unsigned int codepage) {
/* Room for "cp", at most four digits (codepage < 10000) and the NUL. */
291 static char buffer[7];
293 if (codepage==1200||codepage==1201) {
297 if (codepage<10000) {
/* NOTE(review): codepage is unsigned int, so the exact conversion would */
/* be u rather than d; harmless for the values that reach this line. */
298 sprintf(buffer,"cp%d",codepage);
/* Linear search; stops at the matching entry or at the sentinel whose */
/* codepage is 0, and returns that entry's charset_name either way. */
301 for (cp = cp_to_charset;cp->codepage!=0&& cp->codepage!=codepage;cp++);
302 return cp->charset_name;