From 790ecc75063e718e33528060ce966088e9aa99db Mon Sep 17 00:00:00 2001
From: Victor Wagner <vitus@wagner.pp.ru>
Date: Tue, 18 Jul 2006 11:20:01 +0000
Subject: [PATCH] Similar fix in reader.c

---
 src/reader.c  |   2 +-
 src/rtfread.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 1 deletion(-)

diff --git a/src/reader.c b/src/reader.c
index b51996e..4db9ffb 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -170,7 +170,7 @@ int process_file(FILE *f,long stop) {
 				 * else*/
 				buffer[++bufptr]=c;
 			}
-		} while (bufptr<PARAGRAPH_BUFFER-2 &&
+		} while (bufptr<=PARAGRAPH_BUFFER-2 &&
 				 !catdoc_eof(f) &&
 				 buffer[bufptr]!=0x000a);
 		if (bufptr>0) {
diff --git a/src/rtfread.c b/src/rtfread.c
index 8ed1be6..cbfc103 100644
--- a/src/rtfread.c
+++ b/src/rtfread.c
@@ -173,6 +173,230 @@ void add_to_buffer(int *bufptr,unsigned short int c) {
 	buffer[++(*bufptr)]=c;
 	if (*bufptr >= PARAGRAPH_BUFFER-2) {
 		buffer[++(*bufptr)]=0;
+/*****************************************************************/
+/* Reading routines for MS-Word, MS-Write and text files         */
+/*                                                               */
+/* This file is part of catdoc project                           */
+/* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003	             */
+/*****************************************************************/
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdio.h>
+#include "catdoc.h"
+unsigned short int buffer[PARAGRAPH_BUFFER]; /* paragraph accumulator filled by process_file(); copy_out() reuses it as a byte scratch buffer */
+static unsigned char read_buf[256]; /* the 256-byte block most recently read by get_word8_char() */
+static int buf_is_unicode; /* set by get_word8_char(): nonzero when read_buf was judged to hold 16-bit characters */
+
+/**************************************************************************/
+/* copy_out - passes the content of a non-OLE input file through          */
+/* convert_char() and prints it to stdout.                                */
+/* Parameters - f - file to copy out. header - first 8 bytes of the file, */
+/*  which have already been read by format recognition code, but should   */
+/*  be output anyway                                                      */
+/**************************************************************************/
+void copy_out (FILE *f,char *header) {
+	/* unsigned view of header, so that <<8 and | never see a negative char */
+	const unsigned char *hdr=(const unsigned char *)header;
+	char *buf=(char *)buffer;
+	int count,i;
+	long offset=0;
+	if (get_unicode_char == get_word8_char) {
+		/* non-word file and -u specified: guess the kind of unicode from
+		 * the byte order mark (the mark itself is not printed) */
+		if (hdr[0]==0xFE && hdr[1]==0xFF) {
+			get_unicode_char = get_utf16msb;
+			fputs(convert_char(hdr[2]<<8|hdr[3]),stdout);
+			fputs(convert_char(hdr[4]<<8|hdr[5]),stdout);
+			fputs(convert_char(hdr[6]<<8|hdr[7]),stdout);
+		} else if (hdr[0]!=0xFF || hdr[1]!=0xFE) {
+			int c,j,d;
+			/* not utf16, so assume UTF-8. We are told -u, aren't we */
+			get_unicode_char = get_utf8;
+			i=0;
+			while (i<8) {
+				c=hdr[i++];
+				if (c >=0x80) {
+					if ( c<0xE0) {
+						c=(c & 0x1F);
+						count =1;
+					} else {
+						c=(c & 0xF);
+						count = 2;
+					}
+					for (j=0;j<count;j++) {
+						/* use up all of header[0..7] before reading from f */
+						if (i<8) {
+							d=hdr[i++];
+						} else {
+							d=fgetc(f);
+						}
+						c=c<<6 | (d & 0x3F);
+					}
+				}
+				fputs (convert_char(c),stdout);
+			}
+		} else {
+			get_unicode_char = get_utf16lsb;
+			fputs(convert_char(hdr[3]<<8|hdr[2]),stdout);
+			fputs(convert_char(hdr[5]<<8|hdr[4]),stdout);
+			fputs(convert_char(hdr[7]<<8|hdr[6]),stdout);
+		}
+		while (!catdoc_eof(f)) {
+			i=get_unicode_char(f,&offset,0x7FFFFFFF);
+			if (i!=EOF) fputs(convert_char(i),stdout);
+		}
+	} else {
+		for (i=0;i<8;i++) {
+			fputs(convert_char(to_unicode(source_charset,hdr[i])),stdout);
+		}
+		/* Assuming 8-bit input text */
+		while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
+			for (i=0;i<count;i++) {
+				fputs(convert_char(to_unicode(source_charset,
+								(unsigned char)buf[i])),stdout);
+			}
+		}
+	}
+}
+/**************************************************************************/
+/*  process_file - main process engine. Reads f using the function        */
+/*  pointed to by get_unicode_char until EOF or until offset reaches      */
+/*  stop, and prints out what looks like paragraphs. Returns 0.           */
+/**************************************************************************/
+int process_file(FILE *f,long stop) {
+	int bufptr;
+	int tabmode=0;
+	long offset=0;
+	int hyperlink_mode = 0;
+	unsigned short c;
+	/* Now we are starting to read with get_unicode_char */
+	while (!catdoc_eof(f) && offset<stop) {
+		bufptr = -1;
+		do {
+			c=get_unicode_char(f,&offset,stop);
+			/* Following symbols below 32 are allowed inside paragraph:
+			   0x0001 - ignored inside a hyperlink, else discards paragraph
+			   0x0002 - footnote mark (skipped)
+			   0x0007 - table separator (converted to tabmode)
+			   0x0009 - Horizontal tab ( printed as is)
+			   0x000B, 0x000D - hard return, return (end the paragraph)
+			   0x000C - page break
+			   0x001E - IS2 for some reason means short hyphen in Word.
+			   0x001F - soft hyphen in Word
+			   0x0013 - start embedded hyperlink
+			   0x0014 - separate hyperlink URL from text
+			   0x0015 - end embedded hyperlink
+			   */
+			if (tabmode) {
+				tabmode=0;
+				if (c==0x007) {
+					buffer[++bufptr]=0x1E;
+					continue;
+				} else {
+					buffer[++bufptr]=0x1C;
+				}
+			}
+			if (c<32) {
+				switch (c) {
+					case 0x007:
+						tabmode = 1;
+						break;
+					case 0x000D:
+					case 0x000B:
+						buffer[++bufptr]=0x000A;
+						break;
+					case 0x000C:
+						buffer[++bufptr]=c;
+						break;
+					case 0x001E:
+						buffer[++bufptr]='-';
+						break;
+					case 0x0002: break;
+					case 0x001F:
+						buffer[++bufptr]=0xAD; /* Unicode soft hyphen */
+						break;
+					case 0x0009:
+						buffer[++bufptr]=c;
+						break;
+					case 0x0013:
+						hyperlink_mode=1;
+						buffer[++bufptr]=' ';
+						break;
+					case 0x0014:
+						hyperlink_mode = 0;
+						/*fall through */
+					case 0x0015:
+						/* just treat hyperlink separators as
+						 * space */
+						buffer[++bufptr]=' ';
+						break;
+					case 0x0001: if (hyperlink_mode)
+							break;
+						/* else fall through */
+					default:
+						bufptr=-1; /* Any other control char - discard para*/
+				}
+			} else if (c != 0xfeff) {
+				/* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
+				 * else*/
+				buffer[++bufptr]=c;
+			}
+			/* A pass stores at most two cells (table mark + character) and
+			 * the final 0 needs a third: stop while three are still free. */
+		} while (bufptr<PARAGRAPH_BUFFER-3 &&
+				 !catdoc_eof(f) &&
+				 (bufptr<0 || buffer[bufptr]!=0x000a));
+		if (bufptr>0) {
+			buffer[++bufptr]=0;
+			output_paragraph(buffer);
+		}
+	}
+	return 0;
+}
+/**********************************************************************/
+/* Reads one character from an MS-Word 97 or later file. Takes into   *
+ * account the strange situation that unicode and non-unicode         *
+ * 256-byte blocks can be intermixed in a word file.                  *
+ *                                                                    *
+ * f - file to read                                                   *
+ * offset - position of the character inside the file (to determine   *
+ * possible block boundaries); advanced past the character read       *
+ * fileend - offset at which block-type detection stops looking       *
+ * Returns the decoded character                                      *
+ **********************************************************************/
+int get_word8_char(FILE *f,long *offset,long fileend) {
+	int count,i,u;
+	unsigned char c; /* unsigned: ispunct() is undefined for a negative char */
+	if ((i=(*offset)%256) ==0) { /* block boundary: load and classify a new block */
+		count=catdoc_read(read_buf,1,256,f);
+		memset(read_buf+count,0,256-count); /* zero-fill after a short read */
+		buf_is_unicode=0;
+		if (*offset+(long)count>fileend) {
+			count=fileend-*offset;
+		}
+		while (i<count) {
+			c=read_buf[i++];
+			if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
+				buf_is_unicode=1; /* space, CR or punctuation followed by a zero byte */
+				break;
+			}
+			i++; /* bytes are examined in pairs: step over the second one */
+		}
+		i=0;
+	}
+	if (buf_is_unicode) {
+		u=read_buf[i] | read_buf[i+1]<<8; /* low byte first */
+		(*offset)+=2;
+	} else {
+		u=to_unicode(source_charset,read_buf[i]);
+		(*offset)++;
+	}
+	return u;
+}
+
+
 		output_paragraph(buffer);
 		*bufptr=-1;
 	}
-- 
2.39.2