1 /*****************************************************************/
2 /* Reading routines for rtf files */
4 /* This file is part of catdoc project */
5 /* (c) Victor Wagner 2003, (c) Alex Ott 2003 */
6 /*****************************************************************/
17 /********************************************************
18 * Datatypes declaration
41 RTF_LISTOVERRIDETABLE,
67 RTFTypeMap rtf_types[]={
69 {"ansicpg",RTF_CODEPAGE},
74 {"overlay",RTF_OVERLAY},
76 {"author",RTF_AUTHOR},
78 {"fonttbl",RTF_FONTTBL},
80 {"stylesheet",RTF_STYLESHEET},
81 {"colortbl",RTF_COLORTBL},
82 {"listtable",RTF_LISTTABLE},
83 {"listoverridetable",RTF_LISTOVERRIDETABLE},
84 {"rsidtbl",RTF_RSIDTBL},
85 {"generator",RTF_GENERATOR},
86 {"datafield",RTF_DATAFIELD},
89 {"emdash",RTF_EMDASH},
90 {"endash",RTF_ENDASH},
91 {"emspace",RTF_EMDASH},
92 {"enspace",RTF_ENDASH},
93 {"bullet",RTF_BULLET},
94 {"lquote",RTF_LQUOTE},
95 {"rquote",RTF_RQUOTE},
96 {"ldblquote",RTF_LDBLQUOTE},
97 {"rdblquote",RTF_RDBLQUOTE},
98 {"zwnj",RTF_ZWNONJOINER},
101 {"u",RTF_UNICODE_CHAR}
104 #define RTFNAMEMAXLEN 32
105 #define RTFARGSMAXLEN 64
108 * Structure describing rtf command
113 char name[RTFNAMEMAXLEN+1];
119 #define MAXFONTNAME 64
126 char fontname[MAXFONTNAME+1];
130 * Structure to describe style
138 * Structure to store values, local to rtf group
142 int uc; /**< How much symbols to skip */
143 RTFStyle* style; /**< curren style */
146 /********************************************************
147 * Functions declaration
151 extern int forced_charset;
152 signed long getNumber(FILE *f);
154 int getRtfCommand(FILE *f, RTFcommand *command );
155 unsigned short int rtf_to_unicode(int code);
156 RTFTypes getCommandType(char *name);
157 signed int getCharCode(FILE *f);
158 void rtfSetCharset(short int **charset_ptr,unsigned int codepage);
160 /********************************************************
164 short int *current_charset;
167 /********************************************************
168 * Functions implementation
171 extern unsigned short int buffer[];
172 void add_to_buffer(int *bufptr,unsigned short int c) {
173 buffer[++(*bufptr)]=c;
174 if (*bufptr >= PARAGRAPH_BUFFER-2) {
175 buffer[++(*bufptr)]=0;
176 /*****************************************************************/
177 /* Reading routines for MS-Word, MS-Write and text files */
179 /* This file is part of catdoc project */
180 /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */
181 /*****************************************************************/
188 unsigned short int buffer[PARAGRAPH_BUFFER];
189 static unsigned char read_buf[256];
190 static int buf_is_unicode;
192 /**************************************************************************/
193 /* Just prints out content of input file. Called when file is not OLE */
195 /* Parameters - f - file to copy out. header - first few bytes of file, */
196 /* which have been already read by format recognition code, but should */
197 /* be output anyway */
198 /**************************************************************************/
199 void copy_out (FILE *f,char *header) {
200 char *buf=(char *)buffer;
203 if (get_unicode_char == get_word8_char) {
204 /* non-word file and -u specified. Trying to guess which kind of
207 if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
208 get_unicode_char = get_utf16msb;
209 fputs(convert_char(header[2]<<8|header[3]),stdout);
210 fputs(convert_char(header[4]<<8|header[5]),stdout);
211 fputs(convert_char(header[6]<<8|header[7]),stdout);
212 } else if ((unsigned char)header[0]!=0xFF ||
213 (unsigned char)header[1]!=0xFE) {
215 /* if it is not utf16, assume it is UTF8. We are told -u,
217 get_unicode_char = get_utf8;
220 c=(unsigned char)header[i++];
229 for (j=0;j<count;j++) {
231 d=(unsigned char) header[i++];
238 fputs (convert_char(c),stdout);
241 get_unicode_char = get_utf16lsb;
242 fputs(convert_char(header[3]<<8|header[2]),stdout);
243 fputs(convert_char(header[5]<<8|header[4]),stdout);
244 fputs(convert_char(header[7]<<8|header[6]),stdout);
246 while (!catdoc_eof(f)) {
247 i=get_unicode_char(f,&offset,0x7FFFFFFF);
248 if (i!=EOF) fputs(convert_char(i),stdout);
252 fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
254 /* Assuming 8-bit input text */
255 while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
256 for (i=0;i<count;i++) {
257 fputs(convert_char(to_unicode(source_charset,
258 (unsigned char)buf[i])),stdout);
263 /**************************************************************************/
264 /* process_file - main process engine. Reads word file using the function */
265 /* pointed to by get_unicode_char, searches for things which look like */
266 /* paragraphs and prints them out */
267 /**************************************************************************/
268 int process_file(FILE *f,long stop) {
272 int hyperlink_mode = 0;
274 /* Now we are starting to read with get_unicode_char */
275 while (!catdoc_eof(f) && offset<stop) {
278 c=get_unicode_char(f,&offset,stop);
279 /* Following symbols below 32 are allowed inside paragraph:
280 0x0002 - footnote mark
281 0x0007 - table separator (converted to tabmode)
282 0x0009 - Horizontal tab ( printed as is)
285 0x000D - return - marks an end of paragraph
286 0x001E - IS2 for some reason means short hyphen in Word.
287 0x001F - soft hyphen in Word
288 0x0013 - start embedded hyperlink
289 0x0014 - separate hyperlink URL from text
290 0x0015 - end embedded hyperlink
295 buffer[++bufptr]=0x1E;
298 buffer[++bufptr]=0x1C;
308 buffer[++bufptr]=0x000A;
314 buffer[++bufptr]='-';
319 buffer[++bufptr]=0xAD;/* translate to Unicode
327 buffer[++bufptr]=' ';
333 /* just treat hyperlink separators as
335 buffer[++bufptr]=' ';
337 case 0x0001: if (hyperlink_mode)
339 /* else fall through */
341 bufptr=-1; /* Any other control char - discard para*/
343 } else if (c != 0xfeff) {
344 /* skip ZERO WIDTH NO-BREAK SPACE (U+FEFF). Output anything
348 } while (bufptr<=PARAGRAPH_BUFFER-2 &&
350 buffer[bufptr]!=0x000a);
353 output_paragraph(buffer);
358 /**********************************************************************/
359 /* Reads from an MS-Word 97 or later file. Takes into account the strange *
360 * situation that unicode and non-unicode 256-byte blocks can be *
361 * intermixed in a word file *
366 * offset - position of the character inside file (to determine *
367 * possible block boundaries) *
368 **********************************************************************/
369 int get_word8_char(FILE *f,long *offset,long fileend) {
372 if ((i=(*offset)%256) ==0) {
373 count=catdoc_read(read_buf,1,256,f);
374 memset(read_buf+count,0,256-count);
376 if (*offset+(long)count>fileend) {
377 count=fileend-*offset;
381 if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
389 if (buf_is_unicode) {
390 u=read_buf[i] | read_buf[i+1]<<8;
393 u=to_unicode(source_charset,read_buf[i]);
400 output_paragraph(buffer);
405 void end_paragraph(int *bufptr) {
406 add_to_buffer(bufptr,0x000a);
407 add_to_buffer(bufptr,0);
408 output_paragraph(buffer);
413 * Parses RTF file from file stream
415 * @param f - file stream descriptor
417 int parse_rtf(FILE *f) {
418 int para_mode=0, data_skip_mode=0,i;
419 RTFGroupData *groups=NULL;
420 int group_count=0, group_store=20;
422 current_charset=source_charset;
424 if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
425 perror("Can\'t allocate memory: ");
428 groups[0].uc = 2; /* DEfault uc = 2 */
437 if ((code=getRtfCommand(f, &com)) != 0)
441 /* fprintf(stderr, "Spec Char found=%s and arg=%c\n", */
442 /* com.name, com.numarg); */
443 if (com.numarg == '*' && data_skip_mode == 0) {
444 data_skip_mode=group_count;
445 } else if (com.numarg == '\r') {
446 end_paragraph(&bufptr);
447 } else if (com.numarg == '~') {
448 add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
449 } else if (com.numarg == '-') {
450 add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
455 add_to_buffer(&bufptr,0x2014);/* EM DASH*/
458 add_to_buffer(&bufptr,0x2013);break;
460 add_to_buffer(&bufptr,0x2022);break;
461 case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
462 case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
463 case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
464 case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
465 case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
468 add_to_buffer(&bufptr,' ');break;
470 /* fprintf(stderr, "RTF char %d\n", com.numarg); */
471 if (data_skip_mode == 0) {
472 add_to_buffer(&bufptr,rtf_to_unicode(com.numarg));
476 groups[group_count].uc=com.numarg;
479 add_to_buffer(&bufptr,0x0009);
481 case RTF_UNICODE_CHAR:
484 /* fprintf(stderr, "Unicode char %d\n", com.numarg); */
485 if (data_skip_mode == 0)
486 add_to_buffer(&bufptr,com.numarg);
487 i=groups[group_count].uc;
492 /*if (para_mode > 0) {*/
493 end_paragraph(&bufptr);
495 para_mode=group_count;
503 case RTF_LISTOVERRIDETABLE:
507 if (data_skip_mode == 0){
508 data_skip_mode=group_count;
512 /* fprintf(stderr, "Selected lang = %d\n",com.numarg); */
515 rtfSetCharset(¤t_charset,com.numarg);
517 /* fprintf(stderr, "Unknown command with name %s and arg=%d\n", */
518 /* com.name, com.numarg); */
525 if (group_count >= group_store ) {
527 if((groups=(RTFGroupData*)realloc(groups,
528 group_store*sizeof(RTFGroupData)))
530 perror("Can\'t allocate memory: ");
535 add_to_buffer(&bufptr,0x20);
536 groups[group_count]=groups[group_count-1];
542 if(para_mode > 0 && para_mode > group_count) {
543 /*add_to_buffer(&bufptr,0);
544 output_paragraph(buffer);
545 fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
549 if(data_skip_mode > group_count) {
554 if (data_skip_mode == 0)
555 if (c != '\n' && c != '\r')
556 add_to_buffer(&bufptr,rtf_to_unicode(c));
560 add_to_buffer(&bufptr,'\n');
561 add_to_buffer(&bufptr,0);
562 output_paragraph(buffer);
569 * Convert text string to number
571 * @param f stream to read data from
573 * @return converted number
575 signed long getNumber(FILE *f) {
577 char buf[RTFARGSMAXLEN];
579 while(isdigit(c=fgetc(f)) || c=='-') {
582 buf[count++]=(char)c;
586 return strtol(buf, (char **)NULL, 10);
590 * Parse command stream from rtf file and fill command structure
592 * @param f - rtf file stream
593 * @param command - pointer to RTFcommand structure to fill
595 * @return parse code: 0 on success, non-zero on error
597 int getRtfCommand(FILE *f, RTFcommand *command ) {
601 command->name[0]=(char)c;
602 while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) {
605 command->name[name_count++]=(char)c;
607 command->name[name_count]='\0';
608 command->type=getCommandType(command->name);
609 /* command->args=NULL; */
611 if (isdigit(c) || c == '-' )
612 command->numarg=getNumber(f);
616 if(!(c==' ' || c=='\t'))
619 command->name[0]=(char)c;
620 command->name[1]='\0';
621 /* command->args=NULL; */
623 command->type=RTF_CHAR;
624 command->numarg=getCharCode(f);
628 command->type=RTF_SPEC_CHAR;
637 * Converts char to unicode.
639 * @param code - integer code of char
641 * @return converted char
643 unsigned short int rtf_to_unicode(int code) {
645 if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF;
650 * Convert name of RTF command to RTFType
652 * @param name name to convert
654 * @return RTFType, if unknown command, then return RTF_UNKNOWN
656 RTFTypes getCommandType(char *name) {
657 int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap);
658 for (i = 0; i < olen ; i++) {
659 if ( strcmp(name,rtf_types[i].name) == 0 ) {
660 return rtf_types[i].type;
667 * Return number representing char code in Hex
669 * @param f stream to read data from
671 * @return converted number
673 signed int getCharCode(FILE *f) {
675 char buf[RTFARGSMAXLEN];
677 if (isdigit(c=fgetc(f))||(c>='a' && c<='f')) {
680 buf[count++]=(char)c;
686 return strtol(buf, (char **)NULL, 16);
689 void rtfSetCharset(short int **charset_ptr,unsigned int codepage)
691 /* Do not override charset if it is specified in the command line */
692 const char *charset_name;
693 char *save_buf = input_buffer;
694 if (forced_charset) return;
695 charset_name = charset_from_codepage(codepage);
696 check_charset(&source_csname,charset_name);
698 *charset_ptr = read_charset(source_csname);
699 input_buffer = save_buf;