]> www.wagner.pp.ru Git - oss/catdoc.git/commitdiff
Similar fix in reader.c
author Victor Wagner <vitus@wagner.pp.ru>
Tue, 18 Jul 2006 11:20:01 +0000 (11:20 +0000)
committer Victor Wagner <vitus@wagner.pp.ru>
Tue, 18 Jul 2006 11:20:01 +0000 (11:20 +0000)
src/reader.c
src/rtfread.c

index b51996e7666b5ac09f47c137fbb22dca03415dde..4db9ffb7ed49c0c39fb1ff89c12f59fec4148968 100644 (file)
@@ -170,7 +170,7 @@ int process_file(FILE *f,long stop) {
                                 * else*/
                                buffer[++bufptr]=c;
                        }
-               } while (bufptr<PARAGRAPH_BUFFER-2 &&
+               } while (bufptr<=PARAGRAPH_BUFFER-2 &&
                                 !catdoc_eof(f) &&
                                 buffer[bufptr]!=0x000a);
                if (bufptr>0) {
index 8ed1be68c78bf0634c2690241bc7886772d35c74..cbfc103b896bf953712bbf1137e6fe23fe4ae009 100644 (file)
@@ -173,6 +173,230 @@ void add_to_buffer(int *bufptr,unsigned short int c) {
        buffer[++(*bufptr)]=c;
        if (*bufptr >= PARAGRAPH_BUFFER-2) {
                buffer[++(*bufptr)]=0;
+/*****************************************************************/
+/* Reading routines for MS-Word, MS-Write and text files         */
+/*                                                               */
+/* This file is part of catdoc project                           */
+/* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003                   */
+/*****************************************************************/
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdio.h>
+#include "catdoc.h"
+unsigned short int buffer[PARAGRAPH_BUFFER];
+static unsigned char read_buf[256];
+static int buf_is_unicode;
+
+/**************************************************************************/
+/* Just prints out content of input file. Called when file is not OLE     */
+/* stream                                                                 */
+/* Parameters - f - file to copy out. header - first 8 bytes of file,     */
+/*  which have been already read by format recognition code, but should   */
+/*  be output anyway                                                      */
+/* If get_unicode_char is get_word8_char, it is re-pointed to a UTF-16    */
+/*  reader (chosen by byte order mark) or to the UTF-8 reader before the  */
+/*  rest of the file is copied; otherwise every byte is mapped through    */
+/*  source_charset. Output goes to stdout via convert_char.               */
+/**************************************************************************/
+void copy_out (FILE *f,char *header) {
+       char *buf=(char *)buffer;
+       /* header is plain char, which may be signed; read it through
+        * unsigned char so bytes >=0x80 are not sign-extended when two
+        * of them are combined into one 16-bit code unit */
+       unsigned char *h=(unsigned char *)header;
+       int count,i;
+       long offset=0; /* handed to get_unicode_char by address, so give it a value */
+       if (get_unicode_char == get_word8_char) {
+               /* non-word file and -u specified. Trying to guess which kind of
+                * unicode is used
+                */
+               if (h[0]==0xFE && h[1]==0xFF) {
+                       /* FE FF mark: the remaining six header bytes are three
+                        * code units, most significant byte first */
+                       get_unicode_char = get_utf16msb;
+                       fputs(convert_char(h[2]<<8|h[3]),stdout);
+                       fputs(convert_char(h[4]<<8|h[5]),stdout);
+                       fputs(convert_char(h[6]<<8|h[7]),stdout);
+               } else if (h[0]!=0xFF || h[1]!=0xFE) {
+                       int c,j,d;
+                       /* if it is not utf16, assume it is UTF8. We are told -u,
+                        * aren't we */
+                       get_unicode_char = get_utf8;
+                       i=0;
+                       while (i<8) {
+                               c=h[i++];
+                               if (c >=0x80) {
+                                       /* lead byte: below 0xE0 one continuation
+                                        * byte follows, otherwise two */
+                                       if ( c<0xE0) {
+                                               c=(c & 0x1F);
+                                               count =1;
+                                       } else {
+                                               c=(c & 0xF);
+                                               count = 2;
+                                       }
+                                       for (j=0;j<count;j++) {
+                                               /* use up all 8 header bytes (index 7
+                                                * included) before reading the file,
+                                                * so no byte is taken out of order */
+                                               if (i<8) {
+                                                       d=h[i++];
+                                               } else {
+                                                       d=fgetc(f);
+                                                       if (d==EOF) break; /* truncated sequence */
+                                               }
+                                               c=c<<6 | (d & 0x3F);
+                                       }
+                               }
+                               fputs (convert_char(c),stdout);
+                       }
+               } else {
+                       /* FF FE mark: three code units, least significant byte
+                        * first */
+                       get_unicode_char = get_utf16lsb;
+                       fputs(convert_char(h[3]<<8|h[2]),stdout);
+                       fputs(convert_char(h[5]<<8|h[4]),stdout);
+                       fputs(convert_char(h[7]<<8|h[6]),stdout);
+               }
+               /* header is done, the selected reader handles the rest */
+               while (!catdoc_eof(f)) {
+                       i=get_unicode_char(f,&offset,0x7FFFFFFF);
+                       if (i!=EOF) fputs(convert_char(i),stdout);
+               }
+       } else {
+               for (i=0;i<8;i++) {
+                       fputs(convert_char(to_unicode(source_charset,h[i])),stdout);
+               }
+               /* Assuming 8-bit input text */
+               while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
+                       for (i=0;i<count;i++) {
+                               fputs(convert_char(to_unicode(source_charset,
+                                                               (unsigned char)buf[i])),stdout);
+                       }
+               }
+       }
+}
+/**************************************************************************/
+/*  process_file - main process engine. Reads word file using function,   */
+/*  pointed by get_unicode_char, searches for things which looks like     */
+/*  paragraphs and print them out                                         */
+/*  Parameters - f - file to read. stop - offset at which reading ends    */
+/*  Collects characters in buffer[] and hands each paragraph to           */
+/*  output_paragraph. Always returns 0                                    */
+/**************************************************************************/
+int process_file(FILE *f,long stop) {
+       int bufptr;
+       int tabmode=0;
+       long offset=0;
+       int hyperlink_mode = 0;
+       /* NOTE(review): an EOF result from get_unicode_char is truncated to
+        * 0xFFFF by this type and would be buffered as a character - confirm
+        * whether the readers can return EOF here */
+       unsigned short c;
+       /* Now we are starting to read with get_unicode_char */
+       while (!catdoc_eof(f) && offset<stop) {
+               bufptr = -1; /* index of last stored element; -1 means empty */
+               do {
+                       c=get_unicode_char(f,&offset,stop);
+                       /* Following symbols below 32 are allowed inside paragraph:
+                          0x0002 - footnote mark
+                          0x0007 - table separator (converted to tabmode)
+                          0x0009 - Horizontal tab ( printed as is)
+                          0x000B - hard return
+                          0x000C - page break
+                          0x000D - return - marks an end of paragraph
+                          0x001E - IS2 for some reason means short defis in Word.
+                          0x001F - soft hyphen in Word
+                          0x0013 - start embedded hyperlink
+                          0x0014 - separate hyperlink URL from text
+                          0x0015 - end embedded hyperlink
+                          */
+                       if (tabmode) {
+                               /* previous character was 0x0007: a second one in
+                                * a row is stored as 0x1E, a single one as 0x1C
+                                * ahead of the current character */
+                               tabmode=0;
+                               if (c==0x007) {
+                                       buffer[++bufptr]=0x1E;
+                                       continue;
+                               } else {
+                                       buffer[++bufptr]=0x1C;
+                               }
+                       }
+                       if (c<32) {
+                               switch (c) {
+                                       case 0x007:
+                                               tabmode = 1;
+                                               break;
+                                       case 0x000D:
+                                       case 0x000B:
+                                               buffer[++bufptr]=0x000A;
+                                               break;
+                                       case 0x000C:
+                                               buffer[++bufptr]=c;
+                                               break;
+                                       case 0x001E:
+                                               buffer[++bufptr]='-';
+                                               break;
+                                       case 0x0002:
+                                               break;
+                                       case 0x001F:
+                                               /* translate to Unicode soft hyphen */
+                                               buffer[++bufptr]=0xAD;
+                                               break;
+                                       case 0x0009:
+                                               buffer[++bufptr]=c;
+                                               break;
+                                       case 0x0013:
+                                               hyperlink_mode=1;
+                                               buffer[++bufptr]=' ';
+                                               break;
+                                       case 0x0014:
+                                               hyperlink_mode = 0;
+                                               /*fall through */
+                                       case 0x0015:
+                                               /* just treat hyperlink separators as
+                                                * space */
+                                               buffer[++bufptr]=' ';
+                                               break;
+                                       case 0x0001:
+                                               if (hyperlink_mode)
+                                                       break;
+                                               /* else fall through */
+                                       default:
+                                               bufptr=-1; /* Any other control char - discard para*/
+                               }
+                       } else if (c != 0xfeff) {
+                               /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
+                                * else*/
+                               buffer[++bufptr]=c;
+                       }
+                       /* One pass may store two elements (the deferred 0x1C plus
+                        * the character itself) and a terminating 0 is appended
+                        * after the loop, so stop while three slots are still
+                        * free. buffer[] must not be indexed while bufptr is -1 */
+               } while (bufptr<PARAGRAPH_BUFFER-3 &&
+                                !catdoc_eof(f) &&
+                                (bufptr<0 || buffer[bufptr]!=0x000a));
+               if (bufptr>0) {
+                       buffer[++bufptr]=0;
+                       output_paragraph(buffer);
+               }
+       }
+       return 0;
+}
+/**********************************************************************/
+/* Reads file from MS-Word 97 and above file. Takes into account      *
+ * strange situation that unicode and non-unicode 256-byte blocks     *
+ * could be intermixed in word file                                   *
+ *                                                                    *
+ * Parameters:                                                        *
+ *                                                                    *
+ * f - file to read                                                   *
+ * offset - position of the character inside file (to determine       *
+ * possible  block boundaries); advanced by 2 for a unicode block and *
+ * by 1 otherwise                                                     *
+ * fileend - offset past which block content is not examined          *
+ *                                                                    *
+ * Returns the character as a Unicode value                           *
+ **********************************************************************/
+int get_word8_char(FILE *f,long *offset,long fileend) {
+       int count,i,u;
+       unsigned char c; /* unsigned: ispunct() is undefined for negative values */
+       if ((i=(*offset)%256) ==0) {
+               /* block boundary: load next 256 bytes and guess their encoding */
+               count=catdoc_read(read_buf,1,256,f);
+               /* NOTE(review): catdoc_read's error convention is not visible
+                * here; clamp so memset never gets a negative offset */
+               if (count<0) count=0;
+               memset(read_buf+count,0,256-count);
+               buf_is_unicode=0;
+               if (*offset+(long)count>fileend) {
+                       count=fileend-*offset;
+               }
+               /* walk the block in byte pairs: a space, CR or punctuation
+                * byte followed by a zero byte marks the block as unicode */
+               while (i<count) {
+                       c=read_buf[i++];
+                       if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
+                               buf_is_unicode=1;
+                               break;
+                       }
+                       i++;
+               }
+               i=0;
+       }
+       if (buf_is_unicode) {
+               /* low byte first; the high byte is taken as 0 if it would lie
+                * beyond the block, instead of reading past read_buf */
+               u=read_buf[i];
+               if (i+1<(int)sizeof(read_buf)) {
+                       u|=read_buf[i+1]<<8;
+               }
+               (*offset)+=2;
+       } else {
+               u=to_unicode(source_charset,read_buf[i]);
+               (*offset)++;
+       }
+       return u;
+}
+
+
                output_paragraph(buffer);
                *bufptr=-1;
        }