]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/charsets.c
eda5cb4caa24e413357b779c229cd8e5150766bb
[oss/catdoc.git] / src / charsets.c
1 /*
2   Copyright 1998-2003 Victor Wagner
3   Copyright 2003 Alex Ott
4   This file is released under the GPL.  Details can be
5   found in the file COPYING accompanying this distribution.
6 */
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10 #include <stdio.h>
11 #include <string.h>
12 #include <stdlib.h>
13 #include "catdoc.h"
14
15 char *charset_path=CHARSETPATH;
16 char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;
17 uint16_t * source_charset;
18 int unknown_as_hex=0;
19 char bad_char[]=UNKNOWN_CHAR;
20 CHARSET target_charset;
/************************************************************************/
/* Maps char c of the input charset to its unicode value by plain       */
/* lookup in the direct table charset. No range check is made on c.     */
/************************************************************************/
int to_unicode (uint16_t *charset, int c) {
	const uint16_t *entry = charset + c;

	return *entry;
}
28 /************************************************************************/
29 /* Search inverse charset record for given unicode char and returns     */
30 /* 0-255 char value if found, -1 otherwise                              */
31 /************************************************************************/
32 int from_unicode (CHARSET charset, int u) {
33         short int *p;
34         /* This is really assignment, not comparation */
35         if ((p=charset[(unsigned)u>>8])) {
36                 return p[u & 0xff];
37         } else {
38                 return -1;
39         }
40 }
41 /************************************************************************/
42 /*  Converts direct (charset -> unicode) to reverse map                 */
43 /************************************************************************/
44 CHARSET make_reverse_map(short int *charset) {
45         CHARSET newmap=calloc(sizeof(short int *), 256);
46         int i,j,k,l;
47         short int *p;   
48         if (! charset) {
49                 free(newmap);
50                 return NULL;
51         }       
52         for (i=0;i<256;i++) {
53                 k= charset[i];
54                 j=  (unsigned)k>>8;
55                 if (!newmap[j]) {
56                         newmap[j] = (short int *)malloc(sizeof(short int)*256);
57                         if (!newmap[j]) {
58                                 fprintf(stderr,"Insufficient memory for  charset\n");
59                                 exit(1);
60                         }
61                         for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;
62                 }
63                 p=newmap[j];
64                 p[k & 0xff]=i;
65         }
66         return newmap;
67 }
68
69 /************************************************************************/
70 /* Reads charset file (as got from ftp.unicode.org) and returns array of*/
71 /* 256 short ints (malloced) mapping from charset t unicode             */
72 /************************************************************************/int * read_charset(const char *filename) {
73 uint16_t * read_charset(const char *filename) {
74         char *path;
75         FILE *f;
76         uint16_t *new;
77         int c;
78         long int uc;
79         path= find_file(stradd(filename,CHARSET_EXT),charset_path);
80         if (!path) {
81                 fprintf(stderr,"Cannot load charset %s - file not found\n",filename);
82                 return NULL;
83         }
84         f=fopen(path,"rb");
85
86         if (!f) {
87                 perror(path); 
88                 return NULL;
89         }
90         if (input_buffer)
91                 setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);
92         /* defaults */
93         new = calloc(sizeof(short int),256);
94         for (c=0;c<32;c++) {
95                 new[c]=c;
96         }
97         while (!feof(f)) {
98                 if (fscanf(f,"%i %li",&c,&uc)==2) {
99                         if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {
100                                 fprintf(stderr,"Invalid charset file %s\n",path);
101                                 fclose(f);
102                                 return NULL;
103                         }
104                         new[c]=uc;
105                 }
106                 while((fgetc(f)!='\n')&&!feof(f)) ;
107         }
108         fclose (f);
109         free(path);
110         return new;
111 }
112
113
114 /************************************************************************/
115 /* Reads 8-bit char and convers it from source charset                  */
116 /************************************************************************/
117
118 int get_8bit_char (FILE *f,long *offset,long fileend)
119 {
120         unsigned char buf;
121         if (catdoc_read(&buf, 1, 1, f)==0) return EOF;
122         (*offset)++;  
123         return to_unicode(source_charset,buf);
124 }
125
126
/************************************************************************/
/* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  */
/* so read lsb first always and don't care about proper bit order.      */
/* Advances *offset by two on success. Returns EOF if two bytes could   */
/* not be read; exits if the read reports an error. fileend is not used.*/
/************************************************************************/

int get_utf16lsb (FILE *f,long *offset,long fileend) {
	unsigned char buf[2];
	int result;

	result = catdoc_read(buf, 1, 2, f);
	if (result < 0) {
		perror("read:");
		exit(1);
	}
	if (result != 2) {
		return EOF;
	}
	(*offset) += 2;
	/* LSB-first: the first byte read is the low-order one */
	return ((int)buf[0]) | (((int)buf[1]) << 8);
}
146
/************************************************************************/
/* Reads 16-bit unicode value written in MSB order. For processing      */
/* non-word files.                                                      */
/* Advances *offset by two on success. Returns EOF if two bytes could   */
/* not be read; exits if the read reports an error. fileend is not used.*/
/************************************************************************/
int get_utf16msb (FILE *f,long *offset,long fileend) {
	unsigned char buf[2];
	int result;

	result = catdoc_read(buf, 1, 2, f);
	if (result < 0) {
		perror("read:");
		exit(1);
	}
	if (result != 2) {
		return EOF;
	}
	(*offset) += 2;
	/* MSB-first: the first byte read is the high-order one */
	return ((int)buf[1]) | (((int)buf[0]) << 8);
}
165
/************************************************************************/
/* Reads one UTF-8 encoded char (one to three bytes) from f and returns */
/* its unicode value. Returns EOF at end of input or when a sequence is */
/* cut short, and 0xFEFF for bytes which cannot start a supported       */
/* sequence (stray continuation bytes, leads of 4-byte sequences).      */
/* Exits if reading the first byte reports an error.                    */
/* fileend is not used.                                                 */
/* NOTE(review): unlike the other readers in this file, *offset is not  */
/* advanced here - confirm whether callers depend on that.              */
/************************************************************************/
int get_utf8 (FILE *f,long *offset,long fileend) {
	unsigned char buf[3];
	int c;
	int result;

	result = catdoc_read(buf, 1, 1, f);
	if (result < 0) {
		perror("read");
		exit(1);
	}
	if (result == 0) return EOF;
	c = buf[0];
	if (c < 0x80)
		return c;
	if (c < 0xC0)
		return 0xfeff; /* skip corrupted sequences */
	if (c < 0xE0) {
		/* two-byte sequence: 110xxxxx 10xxxxxx */
		if (catdoc_read(buf+1, 1, 1, f) <= 0) return EOF;
		return (((c & 0x1F) << 6) | (buf[1] & 0x3F));
	}
	if (c < 0xF0) {
		/* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
		   Both continuation bytes must arrive; a successful read
		   yields exactly 2, so only a different count means failure. */
		if (catdoc_read(buf+1, 1, 2, f) != 2) return (int)EOF;
		return ((c & 0x0F) << 12) |
			((buf[1] & 0x3f) << 6) |
			(buf[2] & 0x3f);
	}
	return 0xFEFF;
}
193
194 /**************************************************************************/
195 /*  Converts unicode char to output charset sequence. Coversion have      */
196 /*  three steps: 1. Replacement map is searched for the character in case */
197 /* it is not allowed for output format (% in TeX, < in HTML               */
198 /* 2. target charset is searched for this unicode char, if it wasn't      */
199 /*  replaced. If not found, then 3. Substitution map is searched          */
200 /**************************************************************************/
201 char *convert_char(int uc) {
202         static char plain_char[]="a"; /*placeholder for one-char sequences */
203         static char hexbuf[8];
204         char *mapped;
205         int c;
206         if ((mapped=map_subst(spec_chars,uc))) return mapped;
207         if (target_charset) { 
208                 c =from_unicode(target_charset,uc);
209                 if (c>=0) {
210                         *plain_char=c;
211                         return plain_char;
212                 }
213                 if ((mapped = map_subst(replacements,uc))) return mapped;
214                 if (unknown_as_hex) {
215                         sprintf(hexbuf,"\\x%04X",(unsigned)uc);
216                         /* This sprintf is safe, becouse uc is unicode character code,
217                            which cannot be greater than 0xFFFE. It is ensured by routines
218                            in reader.c
219                            */
220                         return hexbuf;
221                 }   
222                 return  bad_char;
223         } else {
224                 /* NULL target charset means UTF-8 output */
225                 return to_utf8(uc);
226         }  
227 }
/******************************************************************/
/* Converts given unicode character to the utf-8 sequence         */
/* in the static string buffer. Buffer would be overwritten upon  */
/* next call.                                                     */
/* Codes up to 0x10FFFF are encoded in one to four bytes; larger  */
/* values are not unicode and come out as U+FFFD.                 */
/******************************************************************/
char *to_utf8(unsigned int uc) {
	static char utfbuffer[5]; /* longest sequence is 4 bytes, plus NUL */
	int count = 0;

	if (uc > 0x10FFFF) {
		uc = 0xFFFD; /* replacement character */
	}
	if (uc < 0x80) {
		utfbuffer[count++] = (char)uc;
	} else if (uc < 0x800) {
		utfbuffer[count++] = (char)(0xC0 | (uc >> 6));
		utfbuffer[count++] = (char)(0x80 | (uc & 0x3F));
	} else if (uc < 0x10000) {
		utfbuffer[count++] = (char)(0xE0 | (uc >> 12));
		utfbuffer[count++] = (char)(0x80 | ((uc >> 6) & 0x3F));
		utfbuffer[count++] = (char)(0x80 | (uc & 0x3F));
	} else {
		utfbuffer[count++] = (char)(0xF0 | (uc >> 18));
		utfbuffer[count++] = (char)(0x80 | ((uc >> 12) & 0x3F));
		utfbuffer[count++] = (char)(0x80 | ((uc >> 6) & 0x3F));
		utfbuffer[count++] = (char)(0x80 | (uc & 0x3F));
	}
	utfbuffer[count] = 0;
	return utfbuffer;
}
252
/* One entry of the codepage number -> charset name table */
struct cp_map {
	uint16_t codepage;
	const char *charset_name;
};

/* Charset names for codepage numbers of 10000 and above.
   The entry with codepage 0 and NULL name terminates the table. */
struct cp_map cp_to_charset [] = {
	{10000,"mac-roman"},
	{10001,"mac-japanese"},
	{10002,"mac-tchinese"},
	{10003,"mac-korean"},
	{10004,"mac-arabic"},
	{10005,"mac-hebrew"},
	{10006,"mac-greek1"},
	{10007,"mac-cyrillic"},
	{10008,"mac-schinese"},
	{10010,"mac-romania"},
	{10017,"mac-ukraine"},
	{10021,"mac-thai"},
	{10029,"mac-centeuro"},
	{10079,"mac-iselandic"},
	{10081,"mac-turkish"},
	{10082,"mac-croatia"},
	{20866,"koi8-r"},
	{28591,"8859-1"},
	{28592,"8859-2"},
	{28593,"8859-3"},
	{28594,"8859-4"},
	{28595,"8859-5"},
	{28596,"8859-6"},
	{28597,"8859-7"},
	{28598,"8859-8"},
	{28599,"8859-9"},
	{28605,"8859-15"},
	{65001,"utf-8"},
	{0,NULL}};

/******************************************************************/
/* Returns the charset name for given codepage number:            */
/*  - "" for 1200 and 1201 (UCS2),                                */
/*  - "cp<number>" for any other number below 10000; the string   */
/*    lives in a static buffer overwritten upon next call,        */
/*  - the name from cp_to_charset otherwise, or NULL if the       */
/*    number is not listed there.                                 */
/******************************************************************/
const char *charset_from_codepage(unsigned int codepage) {
	static char buffer[7]; /* "cp" + at most 4 digits + NUL */
	struct cp_map *cp;

	if (codepage == 1200 || codepage == 1201) {
		/* For UCS2 */
		return "";
	}
	if (codepage < 10000) {
		/* codepage is unsigned, so it is printed with %u */
		sprintf(buffer,"cp%u",codepage);
		return buffer;
	}
	cp = cp_to_charset;
	while (cp->codepage != 0 && cp->codepage != codepage) {
		cp++;
	}
	/* Either the matching entry, or the terminator with its NULL name */
	return cp->charset_name;
}