]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/reader.c
4db9ffb7ed49c0c39fb1ff89c12f59fec4148968
[oss/catdoc.git] / src / reader.c
1 /*****************************************************************/
2 /* Reading routines for MS-Word, MS-Write and text files         */
3 /*                                                               */
4 /* This file is part of catdoc project                           */
5 /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003                    */
6 /*****************************************************************/
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10 #include <string.h>
11 #include <stdio.h>
12 #include "catdoc.h"
/* Paragraph accumulator filled by process_file(); copy_out() also reuses
 * the same storage, viewed as bytes, as its read buffer for 8-bit text. */
13 unsigned short int buffer[PARAGRAPH_BUFFER];
/* The 256-byte input block most recently read by get_word8_char(). */
14 static unsigned char read_buf[256];
/* Set by get_word8_char() for each block: nonzero when the block looked
 * like 16-bit text, zero when it is treated as 8-bit text. */
15 static int buf_is_unicode;
16
17 /**************************************************************************/
18 /* Just prints out content of input file. Called when file is not OLE     */
19 /* stream                                                                 */
20 /* Parameters - f - file to copy out. header - first few bytes of file,   */
21 /*  which have been already read by format recognition code, but should   */
22 /*  be output anyway                                                      */
23 /**************************************************************************/
24 void copy_out (FILE *f,char *header) {
25         char *buf=(char *)buffer;
26         int count,i;
27         long offset;
28         if (get_unicode_char == get_word8_char) {
29                 /* non-word file and -u specified. Trying to guess which kind of
30                  * unicode is used
31                  */
32                 if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
33                         get_unicode_char = get_utf16msb;
34                         fputs(convert_char(header[2]<<8|header[3]),stdout); 
35                         fputs(convert_char(header[4]<<8|header[5]),stdout); 
36                         fputs(convert_char(header[6]<<8|header[7]),stdout); 
37                 } else if ((unsigned char)header[0]!=0xFF ||
38                                 (unsigned char)header[1]!=0xFE) {
39                         int c,j,d;
40                         /* if it is not utf16, assume it is UTF8. We are told -u,
41                          * aren't we */
42                         get_unicode_char = get_utf8;
43                         i=0;
44                         while (i<8) {
45                                 c=(unsigned char)header[i++];           
46                                 if (c >=0x80) {
47                                         if ( c<0xE0) {
48                                                 c=(c & 0x1F);
49                                                 count =1;
50                                         } else {
51                                                 c=(c & 0xF);
52                                                 count = 2;
53                                         }
54                                         for (j=0;j<count;j++) {
55                                                 if (i<7) {
56                                                         d=(unsigned char) header[i++];
57                                                 } else {
58                                                         d=fgetc(f);
59                                                 }
60                                                 c=c<<6 | (d & 0x3F);
61                                         }
62                                 }
63                                 fputs (convert_char(c),stdout);
64                         }
65                 } else {
66                         get_unicode_char = get_utf16lsb;
67                         fputs(convert_char(header[3]<<8|header[2]),stdout); 
68                         fputs(convert_char(header[5]<<8|header[4]),stdout); 
69                         fputs(convert_char(header[7]<<8|header[6]),stdout); 
70                 }           
71                 while (!catdoc_eof(f)) {
72                         i=get_unicode_char(f,&offset,0x7FFFFFFF); 
73                         if (i!=EOF) fputs(convert_char(i),stdout);
74                 }    
75         } else {
76                 for (i=0;i<8;i++) {
77                         fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
78                 }                        
79                 /* Assuming 8-bit input text */
80                 while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
81                         for (i=0;i<count;i++) {
82                                 fputs(convert_char(to_unicode(source_charset,
83                                                                 (unsigned char)buf[i])),stdout);
84                         }                      
85                 }
86         } 
87
88 /**************************************************************************/
89 /*  process_file - main process engine. Reads word file using function,   */
90 /*  pointed by get_unicode_char, searches for things which looks like     */
91 /*  paragraphs and print them out                                         */
92 /**************************************************************************/
93 int process_file(FILE *f,long stop) {
94         int bufptr;
95         int tabmode=0;
96         long offset=0;
97         int hyperlink_mode = 0;
98         unsigned short c;
99         /* Now we are starting to read with get_unicode_char */
100         while (!catdoc_eof(f) && offset<stop) {
101                 bufptr = -1;
102                 do {
103                         c=get_unicode_char(f,&offset,stop);
104                         /* Following symbols below 32 are allowed inside paragraph:
105                            0x0002 - footnote mark
106                            0x0007 - table separator (converted to tabmode)
107                            0x0009 - Horizontal tab ( printed as is)
108                            0x000B - hard return
109                            0x000C - page break
110                            0x000D - return - marks an end of paragraph
111                            0x001E - IS2 for some reason means short defis in Word.
112                            0x001F - soft hyphen in Word
113                            0x0013 - start embedded hyperlink
114                            0x0014 - separate hyperlink URL from text
115                            0x0015 - end embedded hyperlink
116                            */
117                         if (tabmode) {
118                                 tabmode=0;
119                                 if (c==0x007) {
120                                         buffer[++bufptr]=0x1E;
121                                         continue;
122                                 } else {
123                                         buffer[++bufptr]=0x1C;
124                                 }  
125                         }        
126                         if (c<32) {
127                                 switch (c) {
128                                         case 0x007:
129                                                 tabmode = 1;
130                                                 break;
131                                         case 0x000D:
132                                         case 0x000B:
133                                                 buffer[++bufptr]=0x000A;
134                                                 break;
135                                         case 0x000C:
136                                                 buffer[++bufptr]=c;
137                                                 break;
138                                         case 0x001E:
139                                                 buffer[++bufptr]='-';
140                                                 break;
141                                         case 0x0002: break;
142
143                                         case 0x001F:
144                                                                  buffer[++bufptr]=0xAD;/* translate to Unicode
145                                                                                                                   soft hyphen */
146                                                                  break;                                           
147                                         case 0x0009:
148                                                                  buffer[++bufptr]=c;
149                                                                  break;
150                                         case 0x0013:
151                                                                  hyperlink_mode=1;
152                                                                  buffer[++bufptr]=' ';
153                                                                  break;
154                                         case 0x0014:
155                                                                  hyperlink_mode = 0;
156                                                                  /*fall through */
157                                         case 0x0015:
158                                                                  /* just treat hyperlink separators as
159                                                                   * space */
160                                                                  buffer[++bufptr]=' ';
161                                                                  break;
162                                         case 0x0001: if (hyperlink_mode) 
163                                                                                 break;
164                                                                  /* else fall through */
165                                         default:
166                                                                  bufptr=-1; /* Any other control char - discard para*/
167                                 }
168                         } else if (c != 0xfeff) {
169                                 /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
170                                  * else*/
171                                 buffer[++bufptr]=c;
172                         }
173                 } while (bufptr<=PARAGRAPH_BUFFER-2 &&
174                                  !catdoc_eof(f) &&
175                                  buffer[bufptr]!=0x000a);
176                 if (bufptr>0) {
177                         buffer[++bufptr]=0;
178                         output_paragraph(buffer);
179                 }
180         }
181         return 0;
182 }
183 /**********************************************************************/
184 /* Reads file from MS-Word 97 and above file. Takes in account strange*
185  * situation that unicode and non-unicode 256-byte blocks could be    *
186  * intermixed in word file                                            *
187  *                                                                    *
188  * Parameters:                                                        *
189  *                                                                    *
190  * f - file to read                                                   *
191  * offset - position of the character inside file (to determine       * 
192  * possible  block boundaries                                         *
193  **********************************************************************/ 
194 int get_word8_char(FILE *f,long *offset,long fileend) {
195         int count,i,u;
196         char c;
197         if ((i=(*offset)%256) ==0) {
198                 count=catdoc_read(read_buf,1,256,f);
199                 memset(read_buf+count,0,256-count);
200                 buf_is_unicode=0;
201                 if (*offset+(long)count>fileend) {
202                         count=fileend-*offset;
203                 }       
204                 while (i<count) {
205                         c=read_buf[i++];
206                         if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
207                                 buf_is_unicode=1;
208                                 break;
209                         }
210                         i++;
211                 }   
212                 i=0;
213         }    
214         if (buf_is_unicode) {
215                 u=read_buf[i] | read_buf[i+1]<<8;
216                 (*offset)+=2;
217         } else {
218                 u=to_unicode(source_charset,read_buf[i]);
219                 (*offset)++;
220         }
221         return u;
222 }  
223
224