]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/rtfread.c
9cb869b05a551c0b012af178c97e7d39c869aba4
[oss/catdoc.git] / src / rtfread.c
1 /*****************************************************************/
2 /* Reading routines for rtf files                                */
3 /*                                                               */
4 /* This file is part of catdoc project                           */
5 /* (c) Victor Wagner 2003, (c) Alex Ott 2003                 */
6 /*****************************************************************/
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "catdoc.h"
16
17 /********************************************************
18  * Datatypes declaration
19  * 
20  */
/**
 * Classification of an RTF control word or control symbol.
 *
 * Most values are produced by looking a control-word name up in rtf_types[].
 * RTF_CHAR and RTF_SPEC_CHAR are assigned directly by the command reader for
 * \'hh escapes and for other non-alphabetic symbols; RTF_UNKNOWN is the
 * result for names that are not listed in rtf_types[].
 * The order of the enumerators is kept stable on purpose.
 */
typedef enum {
	RTF_CODEPAGE,          /* \ansicpg */
	RTF_FONT_CHARSET,      /* no entry in rtf_types[] */
	RTF_UC,                /* \uc */
	RTF_UNICODE_CHAR,      /* \u */
	RTF_CHAR,              /* \'hh hex-escaped character */
	RTF_PARA,              /* \par */
	RTF_TABLE_START,       /* no entry in rtf_types[] */
	RTF_TABLE_END,         /* no entry in rtf_types[] */
	RTF_ROW,               /* \row */
	RTF_CELL,              /* \cell */
	RTF_UNKNOWN,           /* name not found in rtf_types[] */
	RTF_OVERLAY,           /* \overlay */
	RTF_PICT,              /* \pict */
	RTF_F,                 /* \f */
	RTF_AUTHOR,            /* \author */
	RTF_FONTTBL,           /* \fonttbl */
	RTF_INFO,              /* \info */
	RTF_STYLESHEET,        /* \stylesheet */
	RTF_COLORTBL,          /* \colortbl */
	RTF_LISTOVERRIDETABLE, /* \listoverridetable */
	RTF_LISTTABLE,         /* \listtable */
	RTF_RSIDTBL,           /* \rsidtbl */
	RTF_GENERATOR,         /* \generator */
	RTF_DATAFIELD,         /* \datafield */
	RTF_LANG,              /* \lang */
	RTF_PARD,              /* \pard */
	RTF_TAB,               /* \tab */
	RTF_SPEC_CHAR,         /* control symbol other than \' */
	RTF_EMDASH,            /* \emdash */
	RTF_ENDASH,            /* \endash */
	RTF_EMSPACE,           /* \emspace */
	RTF_ENSPACE,           /* \enspace */
	RTF_BULLET,            /* \bullet */
	RTF_LQUOTE,            /* \lquote */
	RTF_RQUOTE,            /* \rquote */
	RTF_LDBLQUOTE,         /* \ldblquote */
	RTF_RDBLQUOTE,         /* \rdblquote */
	RTF_ZWNONJOINER,       /* \zwnj */
} RTFTypes;

/**
 * One row of the control-word lookup table: a control-word name (without
 * the leading backslash) and the type it maps to.
 */
typedef struct {
	const char *name;   /* points at a string literal; never modified */
	RTFTypes type;
} RTFTypeMap;

/**
 * Control-word lookup table. It is searched linearly with an exact string
 * comparison, so the order of the rows does not affect the result.
 *
 * "emspace" and "enspace" map to their own types (RTF_EMSPACE / RTF_ENSPACE)
 * rather than to the dash types, so that they are not confused with
 * \emdash and \endash.
 */
RTFTypeMap rtf_types[]={
	{"uc",RTF_UC},
	{"ansicpg",RTF_CODEPAGE},
	{"pard",RTF_PARD},
	{"par",RTF_PARA},
	{"cell",RTF_CELL},
	{"row",RTF_ROW},
	{"overlay",RTF_OVERLAY},
	{"pict",RTF_PICT},
	{"author",RTF_AUTHOR},
	{"f",RTF_F},
	{"fonttbl",RTF_FONTTBL},
	{"info",RTF_INFO},
	{"stylesheet",RTF_STYLESHEET},
	{"colortbl",RTF_COLORTBL},
	{"listtable",RTF_LISTTABLE},
	{"listoverridetable",RTF_LISTOVERRIDETABLE},
	{"rsidtbl",RTF_RSIDTBL},
	{"generator",RTF_GENERATOR},
	{"datafield",RTF_DATAFIELD},
	{"lang",RTF_LANG},
	{"tab",RTF_TAB},
	{"emdash",RTF_EMDASH},
	{"endash",RTF_ENDASH},
	{"emspace",RTF_EMSPACE},
	{"enspace",RTF_ENSPACE},
	{"bullet",RTF_BULLET},
	{"lquote",RTF_LQUOTE},
	{"rquote",RTF_RQUOTE},
	{"ldblquote",RTF_LDBLQUOTE},
	{"rdblquote",RTF_RDBLQUOTE},
	{"zwnj",RTF_ZWNONJOINER},
	{"u",RTF_UNICODE_CHAR}
};
103
104 #define RTFNAMEMAXLEN 32
105 #define RTFARGSMAXLEN 64
106
107 /**
108  * Structure describing rtf command
109  * 
110  */
111 typedef struct {
112         RTFTypes type;
113         char name[RTFNAMEMAXLEN+1];
114         signed int numarg;
115 /*      void *args; */
116 } RTFcommand;
117
118
/* Longest font name kept in RTFFont.fontname, not counting the NUL. */
#define MAXFONTNAME 64

/**
 * Font record: an integer identifier together with a font name.
 *
 * NOTE(review): nothing in the visible part of this file reads or writes
 * this type; `name` is presumably the font number from the font table -
 * confirm before relying on it.
 */
typedef struct {
	int name;
	char fontname[MAXFONTNAME + 1];
} RTFFont;
128
/**
 * Style description; currently it carries only a code page number.
 */
typedef struct {
	int codepage;   /**< code page number associated with the style */
} RTFStyle;
136
137 /**
138  * Structure to store values, local to rtf group
139  * 
140  */
141 typedef struct {
142         int uc;                                         /**< How much symbols to skip */
143         RTFStyle* style;                        /**< curren style */
144 } RTFGroupData;
145
146 /********************************************************
147  * Functions declaration
148  * 
149  */
150
151 extern int forced_charset;
152 signed long getNumber(FILE *f);
153
154 int getRtfCommand(FILE *f, RTFcommand *command );
155 unsigned short int rtf_to_unicode(int code);
156 RTFTypes getCommandType(char *name);
157 signed int getCharCode(FILE *f);
158 void rtfSetCharset(short int **charset_ptr,unsigned int codepage);
159
160 /********************************************************
161  * Global data
162  * 
163  */
/* Charset table used by rtf_to_unicode() to translate input character codes. */
short *current_charset;
/* NOTE(review): not referenced by any code visible in this file. */
int rtf_level = 0;
166
167 /********************************************************
168  * Functions implementation
169  * 
170  */
171 extern unsigned short int buffer[];
172 void add_to_buffer(int *bufptr,unsigned short int c) {
173         buffer[++(*bufptr)]=c;
174         if (*bufptr >= PARAGRAPH_BUFFER-2) {
175                 buffer[++(*bufptr)]=0;
176                 output_paragraph(buffer);
177                 *bufptr=-1;
178         }
179 }
180
181 void end_paragraph(int *bufptr) {
182                                    add_to_buffer(bufptr,0x000a);
183                                    add_to_buffer(bufptr,0);
184                                    output_paragraph(buffer);
185                                    *bufptr=-1;
186 }                                  
187
188 /** 
189  * Parses RTF file from file stream
190  * 
191  * @param f - file stream descriptor
192  */
193 int parse_rtf(FILE *f) {
194         int para_mode=0, data_skip_mode=0,i;
195         RTFGroupData *groups=NULL;
196         int group_count=0, group_store=20;
197         int bufptr=-1;
198         current_charset=source_charset;
199         fseek(f,0,SEEK_SET);
200         if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
201                 perror("Can\'t allocate memory: ");
202                 return 1;
203         }
204         groups[0].uc = 2; /* DEfault uc = 2 */
205         while ( !feof(f) ) {
206                 int c = fgetc(f);
207                 if ( feof( f ) )
208                         break;
209                 switch (c) {
210                 case '\\': {
211                         int code;
212                         RTFcommand com;
213                         if ((code=getRtfCommand(f, &com)) != 0)
214                                 break;
215                         switch (com.type) {
216                         case RTF_SPEC_CHAR:
217 /*                              fprintf(stderr, "Spec Char found=%s and arg=%c\n", */
218 /*                              com.name, com.numarg); */
219                                 if (com.numarg == '*' && data_skip_mode == 0) {
220                                         data_skip_mode=group_count;
221                                 } else if (com.numarg == '\r') {
222                                         end_paragraph(&bufptr);
223                                 } else if (com.numarg == '~') {
224                                         add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
225                                 } else if (com.numarg == '-') {
226                                         add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
227                                 }       
228
229                                    break;
230                         case RTF_EMDASH:
231                                    add_to_buffer(&bufptr,0x2014);/* EM DASH*/
232                                    break;
233                         case RTF_ENDASH: 
234                                    add_to_buffer(&bufptr,0x2013);break;
235                         case RTF_BULLET: 
236                                    add_to_buffer(&bufptr,0x2022);break;
237                         case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
238                         case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
239                         case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
240                         case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
241                         case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
242                         case RTF_EMSPACE:
243                         case RTF_ENSPACE:
244                                         add_to_buffer(&bufptr,' ');break;
245                         case RTF_CHAR:
246 /*                              fprintf(stderr, "RTF char %d\n", com.numarg); */
247                                 if (data_skip_mode == 0) {
248                                         add_to_buffer(&bufptr,rtf_to_unicode(com.numarg));
249                                 }       
250                                 break;
251                         case RTF_UC:
252                                 groups[group_count].uc=com.numarg;
253                                 break;
254                         case RTF_TAB:
255                                 add_to_buffer(&bufptr,0x0009);
256                                 break;
257                         case RTF_UNICODE_CHAR:
258                                 if (com.numarg < 0)
259                                         break;
260 /*                              fprintf(stderr, "Unicode char %d\n", com.numarg);  */
261                                 if (data_skip_mode == 0)
262                                         add_to_buffer(&bufptr,com.numarg);
263                                 i=groups[group_count].uc;
264                                 while((--i)>0) {
265                                         int c=fgetc(f);
266                                         if (c == '\\') {
267                                                 c = fgetc(f);
268                                                 switch (c) {
269                                                  case '\\': break;
270                                                  case '\'':
271                                                         /* skip two hex digits */
272                                                         fgetc(f);
273                                                         fgetc(f);
274                                                         break;
275                                                 default:
276                                                         break;
277                                                 }
278                                         }               
279                                 }       
280                                         
281                                 break;
282                         case RTF_PARA:
283                                 /*if (para_mode > 0) {*/
284                                         end_paragraph(&bufptr); 
285                                 /*}*/   
286                                 para_mode=group_count;
287                                 break;
288                         case RTF_PICT:
289                         case RTF_FONTTBL:
290                         case RTF_INFO:
291                         case RTF_COLORTBL:
292                         case RTF_STYLESHEET:
293                         case RTF_LISTTABLE:
294                         case RTF_LISTOVERRIDETABLE:
295                         case RTF_RSIDTBL:
296                         case RTF_GENERATOR:
297                         case RTF_DATAFIELD:
298                                 if (data_skip_mode == 0){
299                                         data_skip_mode=group_count;
300                                 }
301                                 break;
302                         case RTF_LANG:
303 /*                              fprintf(stderr, "Selected lang = %d\n",com.numarg); */
304                                 break;
305                         case RTF_CODEPAGE:
306                                 rtfSetCharset(&current_charset,com.numarg);
307                         default:
308 /*                              fprintf(stderr, "Unknown command with name %s and arg=%d\n",  */
309 /*                                              com.name, com.numarg);  */
310                         ;
311                         }
312                         break;
313                 }
314                 case '{':
315                         group_count++;
316                         if (group_count >= group_store ) {
317                                 group_store+=10;
318                                 if((groups=(RTFGroupData*)realloc(groups,
319                                                                                                   group_store*sizeof(RTFGroupData)))
320                                    == NULL ) {
321                                         perror("Can\'t allocate memory: ");
322                                         return 1;
323                                 }
324                         }
325                         if (para_mode)
326                                 add_to_buffer(&bufptr,0x20);
327                         groups[group_count]=groups[group_count-1];
328                         break;
329                 case '}':
330                         group_count--;
331                         if(group_count < 0)
332                                 group_count=0;
333                         if(para_mode > 0 && para_mode > group_count) {
334                                 /*add_to_buffer(&bufptr,0);
335                                 output_paragraph(buffer);
336                                 fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
337                                 bufptr=-1;*/
338                                 para_mode=0;
339                         }
340                         if(data_skip_mode > group_count) {
341                                 data_skip_mode=0;
342                         }
343                         break;
344                 default:
345                         if (data_skip_mode == 0)
346                                 if (c != '\n' && c != '\r')
347                                         add_to_buffer(&bufptr,rtf_to_unicode(c));
348                 }
349         }
350         if (bufptr>=0) {
351                 add_to_buffer(&bufptr,'\n');
352                 add_to_buffer(&bufptr,0);
353                 output_paragraph(buffer);
354         }       
355         free(groups);
356         return 0;
357 }  
358
/**
 * Reads a decimal number from the stream.
 *
 * Accepts an optional leading '-' followed by any number of digits. The
 * first character that does not belong to the number is pushed back onto
 * the stream. The value is accumulated digit by digit, so input of any
 * length is handled without an intermediate buffer; values outside the
 * range of long are clamped to LONG_MAX / LONG_MIN.
 *
 * @param f stream to read data from
 *
 * @return converted number, 0 when no digits are present
 */
signed long getNumber(FILE *f) {
	unsigned long magnitude = 0;
	unsigned long limit = (unsigned long)LONG_MAX;
	int negative = 0;
	int c = fgetc(f);

	if (c == '-') {
		negative = 1;
		limit += 1UL;	/* magnitude of LONG_MIN */
		c = fgetc(f);
	}
	while (c != EOF && isdigit(c)) {
		unsigned long digit = (unsigned long)(c - '0');

		/* magnitude*10 + digit would exceed limit: saturate.
		 * Once saturated the test stays true for every digit. */
		if (magnitude > (limit - digit) / 10UL)
			magnitude = limit;
		else
			magnitude = magnitude * 10UL + digit;
		c = fgetc(f);
	}
	if (c != EOF)
		ungetc(c, f);

	if (!negative)
		return (long)magnitude;
	if (magnitude == limit)
		return LONG_MIN;
	return -(long)magnitude;
}
379
380 /** 
381  * Parse command stream from rtf file and fill command structure
382  * 
383  * @param f - rtf file stream
384  * @param command - pointer to RTFcommand structure to fill
385  * 
386  * @return parse code not 0 - error, 0 - success
387  */
388 int getRtfCommand(FILE *f, RTFcommand *command ) {
389         int c=fgetc(f);
390         if (isalpha(c)) {
391                 int name_count=1;
392                 command->name[0]=(char)c;
393                 while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) {
394                         if(feof(f))
395                                 return 1;
396                         command->name[name_count++]=(char)c;
397                 }
398                 command->name[name_count]='\0';
399                 command->type=getCommandType(command->name);
400 /*              command->args=NULL; */
401                 ungetc(c,f);
402                 if (isdigit(c) || c == '-' )
403                         command->numarg=getNumber(f);
404                 else
405                         command->numarg=0;
406                 c=fgetc(f);
407                 if(!(c==' ' || c=='\t'))
408                         ungetc(c,f);
409         } else {
410                 command->name[0]=(char)c;
411                 command->name[1]='\0';
412 /*              command->args=NULL; */
413                 if (c == '\'') {
414                         command->type=RTF_CHAR;
415                         command->numarg=getCharCode(f);
416                         if(feof(f))
417                                 return -1;
418                 } else {
419                         command->type=RTF_SPEC_CHAR;
420                         command->numarg=c;
421                 }
422         }
423         
424         return 0;
425 }
426
427 /** 
428  * Converts char to unicode.
429  * 
430  * @param code - integer code of char
431  * 
432  * @return converted char
433  */
434 unsigned short int rtf_to_unicode(int code) {
435         int cc=code;
436         if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF;
437         return cc;
438 }
439
440 /** 
441  * Convert name of RTF command to RTFType
442  * 
443  * @param name name to convert
444  * 
445  * @return RTFType, if unknown command, then return RTF_UNKNOWN
446  */
447 RTFTypes getCommandType(char *name) {
448         int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap);
449         for (i = 0; i < olen ; i++) {
450                 if ( strcmp(name,rtf_types[i].name) == 0 ) {
451                         return rtf_types[i].type;
452                 }
453         }
454         return RTF_UNKNOWN;
455 }
456
/**
 * Reads the character code of a \'hh escape: up to two hexadecimal digits.
 *
 * Both lowercase and uppercase hex digits are accepted. Reading stops at
 * the first character that is not a hex digit; that character is pushed
 * back onto the stream.
 *
 * @param f stream to read data from
 *
 * @return value of the digits read (0 when there are none), or -1 when the
 *         end of the stream is reached before two digits have been read
 */
signed int getCharCode(FILE *f) {
	int value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		int c = fgetc(f);

		if (c == EOF)
			return -1;
		if (!isxdigit(c)) {
			ungetc(c, f);
			break;
		}
		if (isdigit(c))
			value = value * 16 + (c - '0');
		else
			value = value * 16 + (tolower(c) - 'a' + 10);
	}
	return value;
}
479
480 void rtfSetCharset(short int **charset_ptr,unsigned int codepage)
481 {
482         /* Do not override charset if it is specified in the command line */
483         const char *charset_name;
484         char *save_buf = input_buffer;
485         if (forced_charset) return;
486         charset_name = charset_from_codepage(codepage);
487         check_charset(&source_csname,charset_name);
488         input_buffer=NULL;
489         *charset_ptr = read_charset(source_csname);     
490         input_buffer = save_buf;
491 }