src/reader.c

   1 /*****************************************************************/
   2 /* Reading routines for MS-Word, MS-Write and text files         */
   3 /*                                                               */
   4 /* This file is part of catdoc project                           */
   5 /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003                    */
   6 /*****************************************************************/
   7 #ifdef HAVE_CONFIG_H
   8 #include <config.h>
   9 #endif
  10 #include <string.h>
  11 #include <stdio.h>
  12 #include "catdoc.h"
  13 unsigned short int buffer[PARAGRAPH_BUFFER]; /* shared work buffer: process_file() collects one paragraph of 16-bit characters here; copy_out() reuses it as a raw byte buffer */
  14 static unsigned char read_buf[256]; /* the 256-byte block most recently read by get_word8_char() */
  15 static int buf_is_unicode; /* nonzero when get_word8_char() judged the block in read_buf to hold 16-bit little-endian characters */
  16
  17 /**************************************************************************/
  18 /* Just prints out content of input file. Called when file is not OLE     */
  19 /* stream                                                                 */
  20 /* Parameters - f - file to copy out. header - first few bytes of file,   */
  21 /*  which have been already read by format recognition code, but should   */
  22 /*  be output anyway                                                      */
  23 /**************************************************************************/
  24 void copy_out (FILE *f,char *header) {
  25         char *buf=(char *)buffer;
  26         int count,i;
  27         long offset;
  28         if (get_unicode_char == get_word8_char) {
  29                 /* non-word file and -u specified. Trying to guess which kind of
  30                  * unicode is used
  31                  */
  32                 if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
  33                         get_unicode_char = get_utf16msb;
  34                         fputs(convert_char(header[2]<<8|header[3]),stdout);
  35                         fputs(convert_char(header[4]<<8|header[5]),stdout);
  36                         fputs(convert_char(header[6]<<8|header[7]),stdout);
  37                 } else if ((unsigned char)header[0]!=0xFF ||
  38                                 (unsigned char)header[1]!=0xFE) {
  39                         int c,j,d;
  40                         /* if it is not utf16, assume it is UTF8. We are told -u,
  41                          * aren't we */
  42                         get_unicode_char = get_utf8;
  43                         i=0;
  44                         while (i<8) {
  45                                 c=(unsigned char)header[i++];
  46                                 if (c >=0x80) {
  47                                         if ( c<0xE0) {
  48                                                 c=(c & 0x1F);
  49                                                 count =1;
  50                                         } else {
  51                                                 c=(c & 0xF);
  52                                                 count = 2;
  53                                         }
  54                                         for (j=0;j<count;j++) {
  55                                                 if (i<7) {
  56                                                         d=(unsigned char) header[i++];
  57                                                 } else {
  58                                                         d=fgetc(f);
  59                                                 }
  60                                                 c=c<<6 | (d & 0x3F);
  61                                         }
  62                                 }
  63                                 fputs (convert_char(c),stdout);
  64                         }
  65                 } else {
  66                         get_unicode_char = get_utf16lsb;
  67                         fputs(convert_char(header[3]<<8|header[2]),stdout);
  68                         fputs(convert_char(header[5]<<8|header[4]),stdout);
  69                         fputs(convert_char(header[7]<<8|header[6]),stdout);
  70                 }
  71                 while (!catdoc_eof(f)) {
  72                         i=get_unicode_char(f,&offset,0x7FFFFFFF);
  73                         if (i!=EOF) fputs(convert_char(i),stdout);
  74                 }
  75         } else {
  76                 for (i=0;i<8;i++) {
  77                         fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
  78                 }
  79                 /* Assuming 8-bit input text */
  80                 while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
  81                         for (i=0;i<count;i++) {
  82                                 fputs(convert_char(to_unicode(source_charset,
  83                                                                 (unsigned char)buf[i])),stdout);
  84                         }
  85                 }
  86         }
  87 }
  88 /**************************************************************************/
  89 /*  process_file - main process engine. Reads word file using function,   */
  90 /*  pointed by get_unicode_char, searches for things which looks like     */
  91 /*  paragraphs and print them out                                         */
  92 /**************************************************************************/
  93 int process_file(FILE *f,long stop) {
  94         int bufptr;
  95         int tabmode=0;
  96         long offset=0;
  97         int hyperlink_mode = 0;
  98         unsigned short c;
  99         /* Now we are starting to read with get_unicode_char */
 100         while (!catdoc_eof(f) && offset<stop) {
 101                 bufptr = -1;
 102                 do {
 103                         c=get_unicode_char(f,&offset,stop);
 104                         /* Following symbols below 32 are allowed inside paragraph:
 105                            0x0002 - footnote mark
 106                            0x0007 - table separator (converted to tabmode)
 107                            0x0009 - Horizontal tab ( printed as is)
 108                            0x000B - hard return
 109                            0x000C - page break
 110                            0x000D - return - marks an end of paragraph
 111                            0x001E - IS2 for some reason means short defis in Word.
 112                            0x001F - soft hyphen in Word
 113                            0x0013 - start embedded hyperlink
 114                            0x0014 - separate hyperlink URL from text
 115                            0x0015 - end embedded hyperlink
 116                            */
 117                         if (tabmode) {
 118                                 tabmode=0;
 119                                 if (c==0x007) {
 120                                         buffer[++bufptr]=0x1E;
 121                                         continue;
 122                                 } else {
 123                                         buffer[++bufptr]=0x1C;
 124                                 }
 125                         }
 126                         if (c<32) {
 127                                 switch (c) {
 128                                         case 0x007:
 129                                                 tabmode = 1;
 130                                                 break;
 131                                         case 0x000D:
 132                                         case 0x000B:
 133                                                 buffer[++bufptr]=0x000A;
 134                                                 break;
 135                                         case 0x000C:
 136                                                 buffer[++bufptr]=c;
 137                                                 break;
 138                                         case 0x001E:
 139                                                 buffer[++bufptr]='-';
 140                                                 break;
 141                                         case 0x0002: break;
 142
 143                                         case 0x001F:
 144                                                                  buffer[++bufptr]=0xAD;/* translate to Unicode
 145                                                                                                                   soft hyphen */
 146                                                                  break;
 147                                         case 0x0009:
 148                                                                  buffer[++bufptr]=c;
 149                                                                  break;
 150                                         case 0x0013:
 151                                                                  hyperlink_mode=1;
 152                                                                  buffer[++bufptr]=' ';
 153                                                                  break;
 154                                         case 0x0014:
 155                                                                  hyperlink_mode = 0;
 156                                                                  /*fall through */
 157                                         case 0x0015:
 158                                                                  /* just treat hyperlink separators as
 159                                                                   * space */
 160                                                                  buffer[++bufptr]=' ';
 161                                                                  break;
 162                                         case 0x0001: if (hyperlink_mode)
 163                                                                                 break;
 164                                                                  /* else fall through */
 165                                         default:
 166                                                                  bufptr=-1; /* Any other control char - discard para*/
 167                                 }
 168                         } else if (c != 0xfeff) {
 169                                 /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
 170                                  * else*/
 171                                 buffer[++bufptr]=c;
 172                         }
 173                 } while (bufptr<=PARAGRAPH_BUFFER-2 &&
 174                                  !catdoc_eof(f) &&
 175                                  buffer[bufptr]!=0x000a);
 176                 if (bufptr>0) {
 177                         buffer[++bufptr]=0;
 178                         output_paragraph(buffer);
 179                 }
 180         }
 181         return 0;
 182 }
 183 /**********************************************************************/
 184 /* Reads file from MS-Word 97 and above file. Takes in account strange*
 185  * situation that unicode and non-unicode 256-byte blocks could be    *
 186  * intermixed in word file                                            *
 187  *                                                                    *
 188  * Parameters:                                                        *
 189  *                                                                    *
 190  * f - file to read                                                   *
 191  * offset - position of the character inside file (to determine       *
 192  * possible  block boundaries                                         *
 193  **********************************************************************/
 194 int get_word8_char(FILE *f,long *offset,long fileend) {
 195         int count,i,u;
 196         char c;
 197         if ((i=(*offset)%256) ==0) {
 198                 count=catdoc_read(read_buf,1,256,f);
 199                 memset(read_buf+count,0,256-count);
 200                 buf_is_unicode=0;
 201                 if (*offset+(long)count>fileend) {
 202                         count=fileend-*offset;
 203                 }
 204                 while (i<count) {
 205                         c=read_buf[i++];
 206                         if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
 207                                 buf_is_unicode=1;
 208                                 break;
 209                         }
 210                         i++;
 211                 }
 212                 i=0;
 213         }
 214         if (buf_is_unicode) {
 215                 u=read_buf[i] | read_buf[i+1]<<8;
 216                 (*offset)+=2;
 217         } else {
 218                 u=to_unicode(source_charset,read_buf[i]);
 219                 (*offset)++;
 220         }
 221         return u;
 222 }
 223
 224