src/charsets.c

   1 /*
   2   Copyright 1998-2003 Victor Wagner
   3   Copyright 2003 Alex Ott
   4   This file is released under the GPL.  Details can be
   5   found in the file COPYING accompanying this distribution.
   6 */
   7 #ifdef HAVE_CONFIG_H
   8 #include <config.h>
   9 #endif
  10 #include <stdio.h>
  11 #include <string.h>
  12 #include <stdlib.h>
  13 #include "catdoc.h"
  14
  15 char *charset_path=CHARSETPATH;
  16 char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;
  17 short int * source_charset;
  18 int unknown_as_hex=0;
  19 char bad_char[]=UNKNOWN_CHAR;
  20 CHARSET target_charset;
  21 /************************************************************************/
  22 /* Converts char in input charset into unicode representation           */
  23 /* Should be converted to macro                                         */
  24 /************************************************************************/
  25 int to_unicode (short int *charset, int c) {
  26         return charset[c];
  27 }
  28 /************************************************************************/
  29 /* Search inverse charset record for given unicode char and returns     */
  30 /* 0-255 char value if found, -1 otherwise                              */
  31 /************************************************************************/
  32 int from_unicode (CHARSET charset, int u) {
  33         short int *p;
  34         /* This is really assignment, not comparation */
  35         if ((p=charset[(unsigned)u>>8])) {
  36                 return p[u & 0xff];
  37         } else {
  38                 return -1;
  39         }
  40 }
  41 /************************************************************************/
  42 /*  Converts direct (charset -> unicode) to reverse map                 */
  43 /************************************************************************/
  44 CHARSET make_reverse_map(short int *charset) {
  45         CHARSET newmap=calloc(sizeof(short int *), 256);
  46         int i,j,k,l;
  47         short int *p;
  48         if (! charset) {
  49                 return NULL;
  50         }
  51         for (i=0;i<256;i++) {
  52                 k= charset[i];
  53                 j=  (unsigned)k>>8;
  54                 if (!newmap[j]) {
  55                         newmap[j] = malloc(sizeof(short int *)*256);
  56                         if (!newmap[j]) {
  57                                 fprintf(stderr,"Insufficient memory for  charset\n");
  58                                 exit(1);
  59                         }
  60                         for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;
  61                 }
  62                 p=newmap[j];
  63                 p[k & 0xff]=i;
  64         }
  65         return newmap;
  66 }
  67
  68 /************************************************************************/
  69 /* Reads charset file (as got from ftp.unicode.org) and returns array of*/
  70 /* 256 short ints (malloced) mapping from charset t unicode             */
  71 /************************************************************************/
  72 short int * read_charset(const char *filename) {
  73         char *path;
  74         FILE *f;
  75         short int *new=calloc(sizeof(short int),256);
  76         int c;
  77         long int uc;
  78         path= find_file(stradd(filename,CHARSET_EXT),charset_path);
  79         if (!path) {
  80                 fprintf(stderr,"Cannot load charset %s - file not found\n",filename);
  81                 return NULL;
  82         }
  83         f=fopen(path,"rb");
  84
  85         if (!f) {
  86                 perror(path);
  87                 return NULL;
  88         }
  89         if (input_buffer)
  90                 setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);
  91         /* defaults */
  92         for (c=0;c<32;c++) {
  93                 new[c]=c;
  94         }
  95         while (!feof(f)) {
  96                 if (fscanf(f,"%i %li",&c,&uc)==2) {
  97                         if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {
  98                                 fprintf(stderr,"Invalid charset file %s\n",path);
  99                                 fclose(f);
 100                                 return NULL;
 101                         }
 102                         new[c]=uc;
 103                 }
 104                 while((fgetc(f)!='\n')&&!feof(f)) ;
 105         }
 106         fclose (f);
 107         free(path);
 108         return new;
 109 }
 110
 111
 112 /************************************************************************/
 113 /* Reads 8-bit char and convers it from source charset                  */
 114 /************************************************************************/
 115
 116 int get_8bit_char (FILE *f,long *offset,long fileend)
 117 {
 118         unsigned char buf;
 119         if (catdoc_read(&buf, 1, 1, f)==0) return EOF;
 120         (*offset)++;
 121         return to_unicode(source_charset,buf);
 122 }
 123
 124
 125 /************************************************************************/
 126 /* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  */
 127 /* so read lsb first always and don't care about proper bit order       */
 128 /************************************************************************/
 129
 130 int get_utf16lsb (FILE *f,long *offset,long fileend) {
 131         unsigned char buf[2];
 132     int result;
 133         result=catdoc_read(buf, 1, 2, f);
 134         if (result<0) {
 135                 perror("read:");
 136                 exit(1);
 137         }
 138         if (result !=2) {
 139                 return EOF;
 140         }
 141         (*offset)+=2;
 142         return ((int)buf[1])|(((int)buf[0])<<8);
 143 }
 144
 145 /************************************************************************/
 146 /* Reads 16-bit unicode value written in MSB order. For processing
 147  * non-word files            .                                          */
 148 /************************************************************************/
 149 int get_utf16msb (FILE *f,long *offset,long fileend) {
 150         unsigned char buf[2];
 151     int result;
 152         result=catdoc_read(buf, 1, 2, f);
 153         if (result<0) {
 154                 perror("read:");
 155                 exit(1);
 156         }
 157         if (result !=2) {
 158                 return EOF;
 159         }
 160         (*offset)+=2;
 161         return ((int)buf[0])|(((int)buf[1])<<8);
 162 }
 163
 164 int get_utf8 (FILE *f,long *offset,long fileend) {
 165         unsigned char buf[3];
 166         int d,c;
 167     int result;
 168         result=catdoc_read(buf, 1, 1, f);
 169         if (result<0) {
 170                 perror("read");
 171                 exit(1);
 172         }
 173         if (result==0) return EOF;
 174         c=buf[0];
 175         d=0;
 176         if (c<0x80)
 177                 return c;
 178         if (c <0xC0)
 179                 return 0xfeff; /*skip corrupted sequebces*/
 180         if (c <0xE0) {
 181                 if (catdoc_read(buf+1, 1, 1, f)<=0) return EOF;
 182                 return ((c & 0x1F)<<6 | ((char)buf[1] & 0x3F));
 183         }
 184         if (c <0xF0) {
 185                 if (catdoc_read(buf+1, 1, 2, f)<=2) return (int)EOF;
 186                 return ((c & 0x0F)<<12)|
 187                         ((buf[1] & 0x3f)<<6)|
 188                                          (buf[2] & 0x3f);
 189         }
 190         return 0xFEFF;
 191 }
 192
 193 /**************************************************************************/
 194 /*  Converts unicode char to output charset sequence. Coversion have      */
 195 /*  three steps: 1. Replacement map is searched for the character in case */
 196 /* it is not allowed for output format (% in TeX, < in HTML               */
 197 /* 2. target charset is searched for this unicode char, if it wasn't      */
 198 /*  replaced. If not found, then 3. Substitution map is searched          */
 199 /**************************************************************************/
 200 char *convert_char(int uc) {
 201         static char plain_char[]="a"; /*placeholder for one-char sequences */
 202         static char hexbuf[8];
 203         char *mapped;
 204         int c;
 205         if ((mapped=map_subst(spec_chars,uc))) return mapped;
 206         if (target_charset) {
 207                 c =from_unicode(target_charset,uc);
 208                 if (c>=0) {
 209                         *plain_char=c;
 210                         return plain_char;
 211                 }
 212                 if ((mapped = map_subst(replacements,uc))) return mapped;
 213                 if (unknown_as_hex) {
 214                         sprintf(hexbuf,"\\x%04X",(unsigned)uc);
 215                         /* This sprintf is safe, becouse uc is unicode character code,
 216                            which cannot be greater than 0xFFFE. It is ensured by routines
 217                            in reader.c
 218                            */
 219                         return hexbuf;
 220                 }
 221                 return  bad_char;
 222         } else {
 223                 /* NULL target charset means UTF-8 output */
 224                 return to_utf8(uc);
 225         }
 226 }
 227 /******************************************************************/
 228 /* Converts given unicode character to the utf-8 sequence         */
 229 /* in the static string buffer. Buffer wouldbe overwritten upon   */
 230 /* next call                                                      */
 231 /******************************************************************/
 232 char *to_utf8(unsigned int uc) {
 233         static char utfbuffer[4]; /* it shouldn't overflow becouse we never deal
 234                                                                  with chars greater than 65535*/
 235         int count=0;
 236         if (uc< 0x80) {
 237                 utfbuffer[0]=uc;
 238                 count=1;
 239         } else  {
 240                 if (uc < 0x800) {
 241                         utfbuffer[count++]=0xC0 | (uc >> 6);
 242                 } else {
 243                         utfbuffer[count++]=0xE0 | (uc >>12);
 244                         utfbuffer[count++]=0x80 | ((uc >>6) &0x3F);
 245                 }
 246                 utfbuffer[count++]=0x80 | (uc & 0x3F);
 247         }
 248         utfbuffer[count]=0;
 249         return utfbuffer;
 250 }
 251
 252 struct cp_map {
 253         int codepage;
 254         char *charset_name;
 255 };
 256
 257 struct cp_map cp_to_charset [] = {
 258         {10000,"mac-roman"},
 259         {10001,"mac-japanese"},
 260         {10002,"mac-tchinese"},
 261         {10003,"mac-korean"},
 262         {10004,"mac-arabic"},
 263         {10005,"mac-hebrew"},
 264         {10006,"mac-greek1"},
 265         {10007,"mac-cyrillic"},
 266         {10008,"mac-schinese"},
 267         {10010,"mac-romania"},
 268         {10017,"mac-ukraine"},
 269         {10021,"mac-thai"},
 270         {10029,"mac-centeuro"},
 271         {10079,"mac-iselandic"},
 272         {10081,"mac-turkish"},
 273         {10082,"mac-croatia"},
 274         {20866,"koi8-r"},
 275         {28591,"8859-1"},
 276         {28592,"8859-2"},
 277         {28593,"8859-3"},
 278         {28594,"8859-4"},
 279         {28595,"8859-5"},
 280         {28596,"8859-6"},
 281         {28597,"8859-7"},
 282         {28598,"8859-8"},
 283         {28599,"8859-9"},
 284         {28605,"8859-15"},
 285         {65001,"utf-8"},
 286     {0,NULL}};
 287 const char *charset_from_codepage(unsigned int codepage) {
 288
 289         static char buffer[7];
 290         struct cp_map *cp;
 291         if (codepage==1200||codepage==1201) {
 292                 /* For UCS2 */
 293                 return "";
 294         } else
 295         if (codepage<10000) {
 296                 sprintf(buffer,"cp%d",codepage);
 297                 return buffer;
 298         } else {
 299                 for (cp = cp_to_charset;cp->codepage!=0&& cp->codepage!=codepage;cp++);
 300                 return cp->charset_name;
 301         }
 302 }