src/charsets.c

   1 /*
   2   Copyright 1998-2003 Victor Wagner
   3   Copyright 2003 Alex Ott
   4   This file is released under the GPL.  Details can be
   5   found in the file COPYING accompanying this distribution.
   6 */
   7 #ifdef HAVE_CONFIG_H
   8 #include <config.h>
   9 #endif
  10 #include <stdio.h>
  11 #include <string.h>
  12 #include <stdlib.h>
  13 #include "catdoc.h"
  14
  15 char *charset_path=CHARSETPATH;
  16 char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;
  17 uint16_t * source_charset;
  18 int unknown_as_hex=0;
  19 char bad_char[]=UNKNOWN_CHAR;
  20 CHARSET target_charset;
  21 /************************************************************************/
  22 /* Converts char in input charset into unicode representation           */
  23 /* Should be converted to macro                                         */
  24 /************************************************************************/
  25 int to_unicode (uint16_t *charset, int c) {
  26         return charset[c];
  27 }
  28 /************************************************************************/
  29 /* Search inverse charset record for given unicode char and returns     */
  30 /* 0-255 char value if found, -1 otherwise                              */
  31 /************************************************************************/
  32 int from_unicode (CHARSET charset, int u) {
  33         short int *p;
  34         /* This is really assignment, not comparation */
  35         if ((p=charset[(unsigned)u>>8])) {
  36                 return p[u & 0xff];
  37         } else {
  38                 return -1;
  39         }
  40 }
  41 /************************************************************************/
  42 /*  Converts direct (charset -> unicode) to reverse map                 */
  43 /************************************************************************/
  44 CHARSET make_reverse_map(short int *charset) {
  45         CHARSET newmap=calloc(sizeof(short int *), 256);
  46         int i,j,k,l;
  47         short int *p;
  48         if (! charset) {
  49                 free(newmap);
  50                 return NULL;
  51         }
  52         for (i=0;i<256;i++) {
  53                 k= charset[i];
  54                 j=  (unsigned)k>>8;
  55                 if (!newmap[j]) {
  56                         newmap[j] = (short int *)malloc(sizeof(short int)*256);
  57                         if (!newmap[j]) {
  58                                 fprintf(stderr,"Insufficient memory for  charset\n");
  59                                 exit(1);
  60                         }
  61                         for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;
  62                 }
  63                 p=newmap[j];
  64                 p[k & 0xff]=i;
  65         }
  66         return newmap;
  67 }
  68
  69 /************************************************************************/
  70 /* Reads charset file (as got from ftp.unicode.org) and returns array of*/
  71 /* 256 short ints (malloced) mapping from charset t unicode             */
  72 /************************************************************************/
  73 uint16_t * read_charset(const char *filename) {
  74         char *path;
  75         FILE *f;
  76         uint16_t *new;
  77         int c;
  78         long int uc;
  79         path= find_file(stradd(filename,CHARSET_EXT),charset_path);
  80         if (!path) {
  81                 fprintf(stderr,"Cannot load charset %s - file not found\n",filename);
  82                 return NULL;
  83         }
  84         f=fopen(path,"rb");
  85
  86         if (!f) {
  87                 perror(path);
  88                 return NULL;
  89         }
  90         if (input_buffer)
  91                 setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);
  92         /* defaults */
  93         new = calloc(sizeof(short int),256);
  94         for (c=0;c<32;c++) {
  95                 new[c]=c;
  96         }
  97         while (!feof(f)) {
  98                 if (fscanf(f,"%i %li",&c,&uc)==2) {
  99                         if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {
 100                                 fprintf(stderr,"Invalid charset file %s\n",path);
 101                                 fclose(f);
 102                                 free(new);
 103                                 return NULL;
 104                         }
 105                         new[c]=uc;
 106                 }
 107                 while((fgetc(f)!='\n')&&!feof(f)) ;
 108         }
 109         fclose (f);
 110         free(path);
 111         return new;
 112 }
 113
 114
 115 /************************************************************************/
 116 /* Reads 8-bit char and convers it from source charset                  */
 117 /************************************************************************/
 118
 119 int get_8bit_char (FILE *f,long *offset,long fileend)
 120 {
 121         unsigned char buf;
 122         if (catdoc_read(&buf, 1, 1, f)==0) return EOF;
 123         (*offset)++;
 124         return to_unicode(source_charset,buf);
 125 }
 126
 127
 128 /************************************************************************/
 129 /* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  */
 130 /* so read lsb first always and don't care about proper bit order       */
 131 /************************************************************************/
 132
 133 int get_utf16lsb (FILE *f,long *offset,long fileend) {
 134         unsigned char buf[2];
 135     int result;
 136         result=catdoc_read(buf, 1, 2, f);
 137         if (result<0) {
 138                 perror("read:");
 139                 exit(1);
 140         }
 141         if (result !=2) {
 142                 return EOF;
 143         }
 144         (*offset)+=2;
 145         return ((int)buf[1])|(((int)buf[0])<<8);
 146 }
 147
 148 /************************************************************************/
 149 /* Reads 16-bit unicode value written in MSB order. For processing
 150  * non-word files            .                                          */
 151 /************************************************************************/
 152 int get_utf16msb (FILE *f,long *offset,long fileend) {
 153         unsigned char buf[2];
 154     int result;
 155         result=catdoc_read(buf, 1, 2, f);
 156         if (result<0) {
 157                 perror("read:");
 158                 exit(1);
 159         }
 160         if (result !=2) {
 161                 return EOF;
 162         }
 163         (*offset)+=2;
 164         return ((int)buf[0])|(((int)buf[1])<<8);
 165 }
 166
 167 int get_utf8 (FILE *f,long *offset,long fileend) {
 168         unsigned char buf[3];
 169         int c;
 170     int result;
 171         result=catdoc_read(buf, 1, 1, f);
 172         if (result<0) {
 173                 perror("read");
 174                 exit(1);
 175         }
 176         if (result==0) return EOF;
 177         c=buf[0];
 178         if (c<0x80)
 179                 return c;
 180         if (c <0xC0)
 181                 return 0xfeff; /*skip corrupted sequebces*/
 182         if (c <0xE0) {
 183                 if (catdoc_read(buf+1, 1, 1, f)<=0) return EOF;
 184                 return (((c & 0x1F)<<6) | ((char)buf[1] & 0x3F));
 185         }
 186         if (c <0xF0) {
 187                 if (catdoc_read(buf+1, 1, 2, f)<=2) return (int)EOF;
 188                 return ((c & 0x0F)<<12)|
 189                         ((buf[1] & 0x3f)<<6)|
 190                                          (buf[2] & 0x3f);
 191         }
 192         return 0xFEFF;
 193 }
 194
 195 /**************************************************************************/
 196 /*  Converts unicode char to output charset sequence. Coversion have      */
 197 /*  three steps: 1. Replacement map is searched for the character in case */
 198 /* it is not allowed for output format (% in TeX, < in HTML               */
 199 /* 2. target charset is searched for this unicode char, if it wasn't      */
 200 /*  replaced. If not found, then 3. Substitution map is searched          */
 201 /**************************************************************************/
 202 char *convert_char(int uc) {
 203         static char plain_char[]="a"; /*placeholder for one-char sequences */
 204         static char hexbuf[8];
 205         char *mapped;
 206         int c;
 207         if ((mapped=map_subst(spec_chars,uc))) return mapped;
 208         if (target_charset) {
 209                 c =from_unicode(target_charset,uc);
 210                 if (c>=0) {
 211                         *plain_char=c;
 212                         return plain_char;
 213                 }
 214                 if ((mapped = map_subst(replacements,uc))) return mapped;
 215                 if (unknown_as_hex) {
 216                         sprintf(hexbuf,"\\x%04X",(unsigned)uc);
 217                         /* This sprintf is safe, becouse uc is unicode character code,
 218                            which cannot be greater than 0xFFFE. It is ensured by routines
 219                            in reader.c
 220                            */
 221                         return hexbuf;
 222                 }
 223                 return  bad_char;
 224         } else {
 225                 /* NULL target charset means UTF-8 output */
 226                 return to_utf8(uc);
 227         }
 228 }
 229 /******************************************************************/
 230 /* Converts given unicode character to the utf-8 sequence         */
 231 /* in the static string buffer. Buffer wouldbe overwritten upon   */
 232 /* next call                                                      */
 233 /******************************************************************/
 234 char *to_utf8(unsigned int uc) {
 235         static char utfbuffer[4]; /* it shouldn't overflow becouse we never deal
 236                                                                  with chars greater than 65535*/
 237         int count=0;
 238         if (uc< 0x80) {
 239                 utfbuffer[0]=uc;
 240                 count=1;
 241         } else  {
 242                 if (uc < 0x800) {
 243                         utfbuffer[count++]=0xC0 | (uc >> 6);
 244                 } else {
 245                         utfbuffer[count++]=0xE0 | (uc >>12);
 246                         utfbuffer[count++]=0x80 | ((uc >>6) &0x3F);
 247                 }
 248                 utfbuffer[count++]=0x80 | (uc & 0x3F);
 249         }
 250         utfbuffer[count]=0;
 251         return utfbuffer;
 252 }
 253
/* One entry of the codepage-number -> charset-name table that
 * charset_from_codepage() searches. */
struct cp_map {
        uint16_t codepage;      /* numeric codepage identifier; 0 marks the end of the table */
        char *charset_name;     /* charset name returned for that codepage */
};
 258
/* Codepage numbers of 10000 and above and the charset names they map to.
 * charset_from_codepage() scans this table linearly; the {0,NULL} entry
 * terminates it and doubles as the "not found" result. */
struct cp_map cp_to_charset [] = {
        {10000,"mac-roman"},
        {10001,"mac-japanese"},
        {10002,"mac-tchinese"},
        {10003,"mac-korean"},
        {10004,"mac-arabic"},
        {10005,"mac-hebrew"},
        {10006,"mac-greek1"},
        {10007,"mac-cyrillic"},
        {10008,"mac-schinese"},
        {10010,"mac-romania"},
        {10017,"mac-ukraine"},
        {10021,"mac-thai"},
        {10029,"mac-centeuro"},
        {10079,"mac-iselandic"},
        {10081,"mac-turkish"},
        {10082,"mac-croatia"},
        {20866,"koi8-r"},
        {28591,"8859-1"},
        {28592,"8859-2"},
        {28593,"8859-3"},
        {28594,"8859-4"},
        {28595,"8859-5"},
        {28596,"8859-6"},
        {28597,"8859-7"},
        {28598,"8859-8"},
        {28599,"8859-9"},
        {28605,"8859-15"},
        {65001,"utf-8"},
    {0,NULL}};
 289 const char *charset_from_codepage(unsigned int codepage) {
 290
 291         static char buffer[7];
 292         struct cp_map *cp;
 293         if (codepage==1200||codepage==1201) {
 294                 /* For UCS2 */
 295                 return "";
 296         } else
 297         if (codepage<10000) {
 298                 sprintf(buffer,"cp%d",codepage);
 299                 return buffer;
 300         } else {
 301                 for (cp = cp_to_charset;cp->codepage!=0&& cp->codepage!=codepage;cp++);
 302                 return cp->charset_name;
 303         }
 304 }