src/charsets.c

   1 /*
   2   Copyright 1998-2003 Victor Wagner
   3   Copyright 2003 Alex Ott
   4   This file is released under the GPL.  Details can be
   5   found in the file COPYING accompanying this distribution.
   6 */
   7 #ifdef HAVE_CONFIG_H
   8 #include <config.h>
   9 #endif
  10 #include <stdio.h>
  11 #include <string.h>
  12 #include <stdlib.h>
  13 #include "catdoc.h"
  14
  15 char *charset_path=CHARSETPATH;
  16 char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;
  17 uint16_t * source_charset;
  18 int unknown_as_hex=0;
  19 char bad_char[]=UNKNOWN_CHAR;
  20 CHARSET target_charset;
  21 /************************************************************************/
  22 /* Converts char in input charset into unicode representation           */
  23 /* Should be converted to macro                                         */
  24 /************************************************************************/
  25 int to_unicode (uint16_t *charset, int c) {
  26         return charset[c];
  27 }
  28 /************************************************************************/
  29 /* Search inverse charset record for given unicode char and returns     */
  30 /* 0-255 char value if found, -1 otherwise                              */
  31 /************************************************************************/
  32 int from_unicode (CHARSET charset, int u) {
  33         short int *p;
  34         /* This is really assignment, not comparation */
  35         if ((p=charset[(unsigned)u>>8])) {
  36                 return p[u & 0xff];
  37         } else {
  38                 return -1;
  39         }
  40 }
  41 /************************************************************************/
  42 /*  Converts direct (charset -> unicode) to reverse map                 */
  43 /************************************************************************/
  44 CHARSET make_reverse_map(short int *charset) {
  45         CHARSET newmap=calloc(sizeof(short int *), 256);
  46         int i,j,k,l;
  47         short int *p;
  48         if (! charset) {
  49                 free(newmap);
  50                 return NULL;
  51         }
  52         for (i=0;i<256;i++) {
  53                 k= charset[i];
  54                 j=  (unsigned)k>>8;
  55                 if (!newmap[j]) {
  56                         newmap[j] = (short int *)malloc(sizeof(short int)*256);
  57                         if (!newmap[j]) {
  58                                 fprintf(stderr,"Insufficient memory for  charset\n");
  59                                 exit(1);
  60                         }
  61                         for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;
  62                 }
  63                 p=newmap[j];
  64                 p[k & 0xff]=i;
  65         }
  66         return newmap;
  67 }
  68
  69 /************************************************************************/
  70 /* Reads charset file (as got from ftp.unicode.org) and returns array of*/
  71 /* 256 short ints (malloced) mapping from charset t unicode             */
  72 /************************************************************************/int * read_charset(const char *filename) {
  73 uint16_t * read_charset(const char *filename) {
  74         char *path;
  75         FILE *f;
  76         uint16_t *new;
  77         int c;
  78         long int uc;
  79         path= find_file(stradd(filename,CHARSET_EXT),charset_path);
  80         if (!path) {
  81                 fprintf(stderr,"Cannot load charset %s - file not found\n",filename);
  82                 return NULL;
  83         }
  84         f=fopen(path,"rb");
  85
  86         if (!f) {
  87                 perror(path);
  88                 return NULL;
  89         }
  90         if (input_buffer)
  91                 setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);
  92         /* defaults */
  93         new = calloc(sizeof(short int),256);
  94         for (c=0;c<32;c++) {
  95                 new[c]=c;
  96         }
  97         while (!feof(f)) {
  98                 if (fscanf(f,"%i %li",&c,&uc)==2) {
  99                         if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {
 100                                 fprintf(stderr,"Invalid charset file %s\n",path);
 101                                 fclose(f);
 102                                 return NULL;
 103                         }
 104                         new[c]=uc;
 105                 }
 106                 while((fgetc(f)!='\n')&&!feof(f)) ;
 107         }
 108         fclose (f);
 109         free(path);
 110         return new;
 111 }
 112
 113
 114 /************************************************************************/
 115 /* Reads 8-bit char and convers it from source charset                  */
 116 /************************************************************************/
 117
 118 int get_8bit_char (FILE *f,long *offset,long fileend)
 119 {
 120         unsigned char buf;
 121         if (catdoc_read(&buf, 1, 1, f)==0) return EOF;
 122         (*offset)++;
 123         return to_unicode(source_charset,buf);
 124 }
 125
 126
 127 /************************************************************************/
 128 /* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  */
 129 /* so read lsb first always and don't care about proper bit order       */
 130 /************************************************************************/
 131
 132 int get_utf16lsb (FILE *f,long *offset,long fileend) {
 133         unsigned char buf[2];
 134     int result;
 135         result=catdoc_read(buf, 1, 2, f);
 136         if (result<0) {
 137                 perror("read:");
 138                 exit(1);
 139         }
 140         if (result !=2) {
 141                 return EOF;
 142         }
 143         (*offset)+=2;
 144         return ((int)buf[1])|(((int)buf[0])<<8);
 145 }
 146
 147 /************************************************************************/
 148 /* Reads 16-bit unicode value written in MSB order. For processing
 149  * non-word files            .                                          */
 150 /************************************************************************/
 151 int get_utf16msb (FILE *f,long *offset,long fileend) {
 152         unsigned char buf[2];
 153     int result;
 154         result=catdoc_read(buf, 1, 2, f);
 155         if (result<0) {
 156                 perror("read:");
 157                 exit(1);
 158         }
 159         if (result !=2) {
 160                 return EOF;
 161         }
 162         (*offset)+=2;
 163         return ((int)buf[0])|(((int)buf[1])<<8);
 164 }
 165
 166 int get_utf8 (FILE *f,long *offset,long fileend) {
 167         unsigned char buf[3];
 168         int c;
 169     int result;
 170         result=catdoc_read(buf, 1, 1, f);
 171         if (result<0) {
 172                 perror("read");
 173                 exit(1);
 174         }
 175         if (result==0) return EOF;
 176         c=buf[0];
 177         if (c<0x80)
 178                 return c;
 179         if (c <0xC0)
 180                 return 0xfeff; /*skip corrupted sequebces*/
 181         if (c <0xE0) {
 182                 if (catdoc_read(buf+1, 1, 1, f)<=0) return EOF;
 183                 return (((c & 0x1F)<<6) | ((char)buf[1] & 0x3F));
 184         }
 185         if (c <0xF0) {
 186                 if (catdoc_read(buf+1, 1, 2, f)<=2) return (int)EOF;
 187                 return ((c & 0x0F)<<12)|
 188                         ((buf[1] & 0x3f)<<6)|
 189                                          (buf[2] & 0x3f);
 190         }
 191         return 0xFEFF;
 192 }
 193
 194 /**************************************************************************/
 195 /*  Converts unicode char to output charset sequence. Coversion have      */
 196 /*  three steps: 1. Replacement map is searched for the character in case */
 197 /* it is not allowed for output format (% in TeX, < in HTML               */
 198 /* 2. target charset is searched for this unicode char, if it wasn't      */
 199 /*  replaced. If not found, then 3. Substitution map is searched          */
 200 /**************************************************************************/
 201 char *convert_char(int uc) {
 202         static char plain_char[]="a"; /*placeholder for one-char sequences */
 203         static char hexbuf[8];
 204         char *mapped;
 205         int c;
 206         if ((mapped=map_subst(spec_chars,uc))) return mapped;
 207         if (target_charset) {
 208                 c =from_unicode(target_charset,uc);
 209                 if (c>=0) {
 210                         *plain_char=c;
 211                         return plain_char;
 212                 }
 213                 if ((mapped = map_subst(replacements,uc))) return mapped;
 214                 if (unknown_as_hex) {
 215                         sprintf(hexbuf,"\\x%04X",(unsigned)uc);
 216                         /* This sprintf is safe, becouse uc is unicode character code,
 217                            which cannot be greater than 0xFFFE. It is ensured by routines
 218                            in reader.c
 219                            */
 220                         return hexbuf;
 221                 }
 222                 return  bad_char;
 223         } else {
 224                 /* NULL target charset means UTF-8 output */
 225                 return to_utf8(uc);
 226         }
 227 }
 228 /******************************************************************/
 229 /* Converts given unicode character to the utf-8 sequence         */
 230 /* in the static string buffer. Buffer wouldbe overwritten upon   */
 231 /* next call                                                      */
 232 /******************************************************************/
 233 char *to_utf8(unsigned int uc) {
 234         static char utfbuffer[4]; /* it shouldn't overflow becouse we never deal
 235                                                                  with chars greater than 65535*/
 236         int count=0;
 237         if (uc< 0x80) {
 238                 utfbuffer[0]=uc;
 239                 count=1;
 240         } else  {
 241                 if (uc < 0x800) {
 242                         utfbuffer[count++]=0xC0 | (uc >> 6);
 243                 } else {
 244                         utfbuffer[count++]=0xE0 | (uc >>12);
 245                         utfbuffer[count++]=0x80 | ((uc >>6) &0x3F);
 246                 }
 247                 utfbuffer[count++]=0x80 | (uc & 0x3F);
 248         }
 249         utfbuffer[count]=0;
 250         return utfbuffer;
 251 }
 252
 253 struct cp_map {
 254         uint16_t codepage;
 255         char *charset_name;
 256 };
 257
 258 struct cp_map cp_to_charset [] = {
 259         {10000,"mac-roman"},
 260         {10001,"mac-japanese"},
 261         {10002,"mac-tchinese"},
 262         {10003,"mac-korean"},
 263         {10004,"mac-arabic"},
 264         {10005,"mac-hebrew"},
 265         {10006,"mac-greek1"},
 266         {10007,"mac-cyrillic"},
 267         {10008,"mac-schinese"},
 268         {10010,"mac-romania"},
 269         {10017,"mac-ukraine"},
 270         {10021,"mac-thai"},
 271         {10029,"mac-centeuro"},
 272         {10079,"mac-iselandic"},
 273         {10081,"mac-turkish"},
 274         {10082,"mac-croatia"},
 275         {20866,"koi8-r"},
 276         {28591,"8859-1"},
 277         {28592,"8859-2"},
 278         {28593,"8859-3"},
 279         {28594,"8859-4"},
 280         {28595,"8859-5"},
 281         {28596,"8859-6"},
 282         {28597,"8859-7"},
 283         {28598,"8859-8"},
 284         {28599,"8859-9"},
 285         {28605,"8859-15"},
 286         {65001,"utf-8"},
 287     {0,NULL}};
 288 const char *charset_from_codepage(unsigned int codepage) {
 289
 290         static char buffer[7];
 291         struct cp_map *cp;
 292         if (codepage==1200||codepage==1201) {
 293                 /* For UCS2 */
 294                 return "";
 295         } else
 296         if (codepage<10000) {
 297                 sprintf(buffer,"cp%d",codepage);
 298                 return buffer;
 299         } else {
 300                 for (cp = cp_to_charset;cp->codepage!=0&& cp->codepage!=codepage;cp++);
 301                 return cp->charset_name;
 302         }
 303 }