1 /*****************************************************************/
2 /* Reading routines for MS-Word, MS-Write and text files */
4 /* This file is part of catdoc project */
5 /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */
6 /*****************************************************************/
/* Shared buffer of 16-bit units: process_file appends characters to it   */
/* and passes it to output_paragraph; copy_out reuses the same storage as */
/* a plain byte buffer through a char pointer.                            */
13 unsigned short int buffer[PARAGRAPH_BUFFER];
/* One 256-byte block of raw input, refilled by get_word8_char whenever   */
/* the current offset is a multiple of 256.                               */
14 static unsigned char read_buf[256];
/* When non-zero, get_word8_char combines two bytes of read_buf (low byte */
/* first) into one character; the other visible path maps a single byte  */
/* through to_unicode(source_charset, ...).                               */
15 static int buf_is_unicode;
17 /**************************************************************************/
18 /* Just prints out the content of the input file. Called when the file is not OLE */
20 /* Parameters - f - file to copy out. header - first few bytes of the file, */
21 /* which have already been read by the format recognition code, but should */
22 /* be output anyway */
23 /**************************************************************************/
/* copy_out - write a non-OLE input file to stdout, passing every          */
/* character through convert_char.                                         */
/*   f      - stream positioned after the bytes already held in header     */
/*   header - leading bytes of the file; indices 0..7 are read here        */
/* When get_unicode_char still equals get_word8_char, the first two header */
/* bytes select the reader: FE FF installs get_utf16msb, anything other    */
/* than FF FE installs get_utf8, and the remaining visible assignment      */
/* installs get_utf16lsb. For the UTF-16 cases header[2..7] are emitted as */
/* three 16-bit characters in the matching byte order, and the rest of the */
/* file is read with get_unicode_char until catdoc_eof(f), skipping EOF    */
/* results.                                                                */
/* The 8-bit path maps each byte through to_unicode(source_charset, ...)   */
/* and reads the file in chunks of up to PARAGRAPH_BUFFER bytes.           */
/* NOTE(review): header is plain char and the shifted header[n] operands   */
/* are not cast to unsigned char, unlike the byte-order-mark tests. If     */
/* char is signed, bytes >= 0x80 would sign-extend - confirm.              */
24 void copy_out (FILE *f,char *header) {
25 char *buf=(char *)buffer;
/* buf views the global 16-bit buffer as raw bytes for the 8-bit path. */
28 if (get_unicode_char == get_word8_char) {
29 /* non-word file and -u specified. Trying to guess which kind of
32 if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
33 get_unicode_char = get_utf16msb;
34 fputs(convert_char(header[2]<<8|header[3]),stdout);
35 fputs(convert_char(header[4]<<8|header[5]),stdout);
36 fputs(convert_char(header[6]<<8|header[7]),stdout);
37 } else if ((unsigned char)header[0]!=0xFF ||
38 (unsigned char)header[1]!=0xFE) {
40 /* if it is not utf16, assume it is UTF8. We are told -u,
42 get_unicode_char = get_utf8;
45 c=(unsigned char)header[i++];
54 for (j=0;j<count;j++) {
56 d=(unsigned char) header[i++];
63 fputs (convert_char(c),stdout);
66 get_unicode_char = get_utf16lsb;
67 fputs(convert_char(header[3]<<8|header[2]),stdout);
68 fputs(convert_char(header[5]<<8|header[4]),stdout);
69 fputs(convert_char(header[7]<<8|header[6]),stdout);
71 while (!catdoc_eof(f)) {
72 i=get_unicode_char(f,&offset,0x7FFFFFFF);
73 if (i!=EOF) fputs(convert_char(i),stdout);
77 fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
79 /* Assuming 8-bit input text */
80 while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
81 for (i=0;i<count;i++) {
82 fputs(convert_char(to_unicode(source_charset,
83 (unsigned char)buf[i])),stdout);
88 /**************************************************************************/
89 /* process_file - main processing engine. Reads a Word file using the function */
90 /* pointed to by get_unicode_char, searches for things which look like */
91 /* paragraphs and prints them out */
92 /**************************************************************************/
/* process_file - read characters through get_unicode_char and emit them   */
/* paragraph by paragraph.                                                 */
/*   f    - input stream handed to catdoc_eof and get_unicode_char         */
/*   stop - offset limit: the outer loop runs only while offset < stop,    */
/*          and the same value is passed to get_unicode_char               */
/* Characters are appended to the global buffer with buffer[++bufptr].     */
/* Setting bufptr to -1 discards what has been collected so far. The       */
/* value 0xfeff takes neither visible branch of the if/else chain.         */
/* Among the visible conditions, filling stops once bufptr exceeds         */
/* PARAGRAPH_BUFFER-2 or the last stored unit is 0x000a; afterwards the    */
/* buffer is passed to output_paragraph.                                   */
/* hyperlink_mode starts at 0 and is consulted when 0x0001 is read.        */
/* The return value is not visible in this excerpt.                        */
93 int process_file(FILE *f,long stop) {
97 int hyperlink_mode = 0;
99 /* Now we are starting to read with get_unicode_char */
100 while (!catdoc_eof(f) && offset<stop) {
/* Fetch the next character; the reader receives the same stop limit. */
103 c=get_unicode_char(f,&offset,stop);
104 /* Following symbols below 32 are allowed inside paragraph:
105 0x0002 - footnote mark
106 0x0007 - table separator (converted to tabmode)
107 0x0009 - Horizontal tab ( printed as is)
110 0x000D - return - marks an end of paragraph
111 0x001E - IS2 for some reason means short defis in Word.
112 0x001F - soft hyphen in Word
113 0x0013 - start embedded hyperlink
114 0x0014 - separate hyperlink URL from text
115 0x0015 - end embedded hyperlink
120 buffer[++bufptr]=0x1E;
123 buffer[++bufptr]=0x1C;
133 buffer[++bufptr]=0x000A;
139 buffer[++bufptr]='-';
144 buffer[++bufptr]=0xAD;/* translate to Unicode
152 buffer[++bufptr]=' ';
158 /* just treat hyperlink separators as
160 buffer[++bufptr]=' ';
162 case 0x0001: if (hyperlink_mode)
164 /* else fall through */
166 bufptr=-1; /* Any other control char - discard para*/
168 } else if (c != 0xfeff) {
169 /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
173 } while (bufptr<=PARAGRAPH_BUFFER-2 &&
175 buffer[bufptr]!=0x000a);
178 output_paragraph(buffer);
183 /**********************************************************************/
184 /* Reads from an MS-Word 97 and above file. Takes into account the strange*
185 * situation that unicode and non-unicode 256-byte blocks could be *
186 * intermixed in a Word file *
191 * offset - position of the character inside the file (to determine *
192 * possible block boundaries) *
193 **********************************************************************/
194 int get_word8_char(FILE *f,long *offset,long fileend) {
197 if ((i=(*offset)%256) ==0) {
198 count=catdoc_read(read_buf,1,256,f);
199 memset(read_buf+count,0,256-count);
201 if (*offset+(long)count>fileend) {
202 count=fileend-*offset;
206 if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
214 if (buf_is_unicode) {
215 u=read_buf[i] | read_buf[i+1]<<8;
218 u=to_unicode(source_charset,read_buf[i]);