1 /*****************************************************************/
2 /* BIFF-stream (excel file) parsing */
4 /* This file is part of catdoc project */
5 /* (c) David Rysdam 1998 */
6 /* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */
7 /*****************************************************************/
20 #include "../compat/strftime.h"
/* File-scope state of the BIFF parser: the record payload buffer filled by
 * do_table(), the table of format indexes appended to by process_item(),
 * an optional date format that isDateFormat() returns in place of the
 * built-in one, and the printf pattern number2string() uses for numbers. */
22 static unsigned char rec[MAX_MS_RECSIZE];
24 short int *formatTable=NULL;
25 char *forced_date_format = NULL;
26 size_t formatTableIndex = 0;
27 size_t formatTableSize = 0;
/* Subtracted from the day count in float2date(); do_table() sets it back to
 * this value at the start of every file. */
28 double date_shift = 25569.0;
/* MK_FORMAT(x) expands x first and then pastes it, stringified, between
 * "%." and "g". */
29 #define FLT_FORMAT(a,b,c) a #b c
30 #define MK_FORMAT(x) FLT_FORMAT("%.",x,"g")
31 char number_format[8]=MK_FORMAT(DBL_DIG);
33 void CleanUpFormatIdxUsed(void);
/*
 * Reads a BIFF stream from input and hands its records to process_item().
 * filename is used only in the diagnostics written to stderr.
 *
 * The function first resets date_shift and the "format index used" flags.
 * It then reads two 16-bit values (record type and record length) and
 * compares the type with the BOF codes 0x0809, 0x0409, 0x0209 and 0x0009;
 * for a recognised BOF the rest of that record is consumed.
 * Afterwards records are read one at a time as type, length and payload
 * and passed to process_item(). At end of input process_item() is called
 * with MSEOF, 0 and NULL.
 *
 * NOTE(review): several lines of this function (local declarations, else
 * branches, loop header, returns) are not part of this listing; confirm
 * the exact control flow against the full source.
 */
35 void do_table(FILE *input,char *filename) {
37 long reclen,build_year=0,build_rel=0,offset=0;
40 date_shift=25569.0; /* Windows 1900 date system */
41 CleanUpFormatIdxUsed();
43 catdoc_read(rec,2,1,input);
44 biff_version=getshort(rec,0);
45 catdoc_read(rec,2,1,input);
46 reclen=getshort(rec,0);
47 if ( biff_version == 0x0809 || biff_version == 0x0409 ||
48 biff_version == 0x0209 || biff_version == 0x0009 ) {
49 if (reclen==8 || reclen==16) {
50 if (biff_version == 0x0809 ) {
51 itemsread=catdoc_read(rec,4,1,input);
52 build_year=getshort(rec+2,0);
53 build_rel=getshort(rec,0);
56 itemsread=catdoc_read(rec,8,1,input);
64 } else if (biff_version == 0x0209 ) {
67 } else if (biff_version == 0x0409 ) {
73 itemsread=catdoc_read(rec,reclen-offset,1,input);
76 fprintf(stderr,"%s: Invalid BOF record\n",filename);
80 itemsread=catdoc_read(rec,126,1,input);
83 if (catdoc_eof(input)) {
84 fprintf(stderr,"%s: No BOF record found\n",filename);
88 unsigned char buffer[2];
90 itemsread = catdoc_read(buffer, 2, 1, input);
91 if (catdoc_eof(input)) {
92 process_item(MSEOF,0,NULL);
96 rectype=getshort(buffer,0);
101 itemsread = catdoc_read(buffer, 2, 1, input);
102 reclen=getshort(buffer,0);
/* the payload is read into rec only when 0 < reclen < MAX_MS_RECSIZE */
103 if (reclen && reclen <MAX_MS_RECSIZE &&reclen >0){
104 itemsread = catdoc_read(rec, 1, reclen, input);
108 if (rectype != BOF) {
112 /* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */
113 process_item(rectype,reclen,rec);
114 if (rectype == MSEOF) {
/* State shared by process_item() and parse_sst(): the parsed shared string
 * table and its size, the raw SST bytes accumulated so far, and the
 * codepage read from the file (1251 until a codepage record changes it). */
122 unsigned char **sst=NULL;/* Shared string table parsed into array of strings in
124 int sstsize = 0; /*Number of strings in SST*/
125 unsigned char *sstBuffer=NULL; /*Unparsed sst to accumulate all its parts*/
126 int sstBytes = 0; /*Size of SST Data, already accumulated in the buffer */
127 int codepage=1251; /*default*/
/*
 * saved_reference and process_item().
 *
 * process_item() handles one BIFF record: rectype is the record type,
 * reclen the payload length and rec the payload bytes (do_table() passes
 * MSEOF, 0, NULL at end of input).
 *
 * Behaviour visible in this listing:
 *  - a record other than CONTINUE that follows an SST record triggers
 *    parse_sst() on the bytes accumulated in sstBuffer;
 *  - the codepage handler does nothing when source_charset is already
 *    set; otherwise it reads the codepage and, unless it is 1200, loads
 *    the charset named by charset_from_codepage();
 *  - an SST record is copied into a newly malloc'ed sstBuffer, and a later
 *    handler appends the payload to it with realloc; that handler returns
 *    early, without touching prev_rectype, when the previous record was
 *    not an SST;
 *  - cell records store a newly allocated string in the slot returned by
 *    allocate(row,col);
 *  - a formula whose bytes 12 and 13 are both 0xFF and whose rec[6] is 0
 *    stores the cell slot in saved_reference; the string record handler
 *    later writes the decoded string through it and prints a diagnostic
 *    when it is NULL;
 *  - the case that includes 0x43 appends the 16-bit value at offset 2 to
 *    formatTable, enlarging the table by 16 entries when it is full;
 *  - prev_rectype is set to rectype at the end.
 *
 * NOTE(review): some handlers subtract startrow from the row number and
 * others do not; confirm that this difference is intentional.
 * NOTE(review): case labels, break statements and several short lines are
 * not part of this listing, so which record type each handler belongs to
 * must be confirmed against the full source.
 */
129 /* holds a pointer to formula value, becouse value itself would be in
132 unsigned char **saved_reference = NULL;
134 void process_item (int rectype, int reclen, unsigned char *rec) {
135 if (rectype != CONTINUE && prev_rectype == SST) {
136 /* we have accumulated unparsed SST, and now encountered
137 * another record, which indicates that SST is ended */
138 /* fprintf(stderr,"parse sst!\n");*/
139 parse_sst(sstBuffer,sstBytes);
143 fprintf(stderr,"File is encrypted\n");
148 fprintf(stderr,"File is write protected\n");
/* a charset that is already set wins over the codepage stored in the file */
153 if (source_charset) break;
154 codepage=getshort(rec,0);
155 /*fprintf(stderr,"CODEPAGE %d\n",codepage); */
156 if (codepage!=1200) {
157 const char *cp = charset_from_codepage(codepage);
158 source_charset=read_charset(cp);
164 format_code=getshort(rec,0);
165 SetFormatIdxUsed(format_code);
166 /* this debug code prints format string */
170 fprintf(stderr,"Format %x \"",format_code);
/* rec[2] is used as the character count: when it equals reclen-3 and
 * rec[3] is non-zero the text is walked one byte at a time from rec+3,
 * the other loop walks it two bytes at a time from rec+5 */
171 if (rec[2] == reclen - 3 && rec[3] != 0) {
172 for (i=0,ptr=rec+3;i<rec[2];i++,ptr++) {
176 for (i=0,ptr=rec+5;i<rec[2];i++,ptr+=2) {
180 fprintf (stderr,"\"\n");
185 /* Just copy SST into buffer, and wait until we get
186 * all CONTINUE records
188 /* fprintf(stderr,"SST\n"); */
189 /* If exists first SST entry, then just drop it and start new*/
190 if (sstBuffer != NULL)
195 sstBuffer=(unsigned char*)malloc(reclen);
197 if (sstBuffer == NULL ) {
198 perror("SSTptr alloc error! ");
201 memcpy(sstBuffer,rec,reclen);
205 if (prev_rectype != SST) {
206 return; /* to avoid changing of prev_rectype;*/
/* NOTE(review): the realloc result is assigned straight back to sstBuffer;
 * if the failure path below does not terminate the program the old buffer
 * is lost. Confirm against the full source. */
208 sstBuffer=realloc(sstBuffer,sstBytes+reclen);
209 if (sstBuffer == NULL ) {
210 perror("SSTptr realloc error! ");
213 memcpy(sstBuffer+sstBytes,rec,reclen);
219 unsigned char **pcell;
220 unsigned char *src=(unsigned char *)rec+6;
222 saved_reference=NULL;
223 row = getshort(rec,0);
224 col = getshort(rec,2);
225 /* fprintf(stderr,"LABEL!\n"); */
226 pcell=allocate(row,col);
227 *pcell=copy_unicode_string(&src);
230 case BLANK: { int row,col;unsigned char **pcell;
231 row = getshort(rec,0);
232 col = getshort(rec,2);
233 pcell=allocate(row,col);
238 int row, startcol,endcol;
239 unsigned char **pcell;
240 row = getshort(rec,0);
241 startcol = getshort(rec,2);
242 endcol=getshort(rec,reclen-2);
243 pcell=allocate(row,endcol);
248 case CONSTANT_STRING: {
249 int row = getshort(rec,0);
250 int col = getshort(rec,2);
251 unsigned char **pcell;
252 int string_no=getshort(rec,6);
254 fprintf(stderr,"CONSTANT_STRING before SST parsed\n");
257 /* fprintf(stderr,"col=%d row=%d no=%d\n",col,row,string_no); */
259 saved_reference=NULL;
260 pcell=allocate(row,col);
261 if (string_no>=sstsize|| string_no < 0 ) {
262 fprintf(stderr,"string index out of boundary\n");
264 } else if (sst[string_no] !=NULL) {
266 unsigned char *outptr;
267 len=strlen((char *)sst[string_no]);
268 outptr=*pcell=malloc(len+1);
269 strcpy((char *)outptr,(char *)sst[string_no]);
281 unsigned char **pcell;
283 saved_reference=NULL;
284 row = getshort(rec,0)-startrow;
285 col = getshort(rec,2);
286 pcell=allocate(row,col);
287 *pcell=(unsigned char *)strdup(format_double(rec,6,getshort(rec,4)));
292 unsigned char **pcell;
294 row = getshort(rec,0)-startrow;
295 col = getshort(rec,2);
296 pcell=allocate(row,col);
297 *pcell=(unsigned char *)strdup(format_int(getshort(rec,7),getshort(rec,4)));
302 int row,col,format_code;
303 unsigned char **pcell;
305 saved_reference=NULL;
306 row = getshort(rec,0)-startrow;
307 col = getshort(rec,2);
308 pcell=allocate(row,col);
309 format_code = getshort(rec,4);
310 *pcell=(unsigned char *)strdup(format_rk(rec+6,format_code));
314 int row,col,startcol,endcol,offset,format_code;
315 unsigned char **pcell;
316 row = getshort(rec,0)-startrow;
317 startcol = getshort(rec,2);
318 endcol = getshort(rec,reclen-2);
319 saved_reference=NULL;
321 for (offset=4,col=startcol;col<=endcol;offset+=6,col++) {
322 pcell=allocate(row,col);
323 format_code=getshort(rec,offset);
324 *pcell=(unsigned char *)strdup(format_rk(rec+offset+2,format_code));
331 unsigned char **pcell;
332 saved_reference=NULL;
333 row = getshort(rec,0)-startrow;
334 col = getshort(rec,2);
335 pcell=allocate(row,col);
336 if (((unsigned char)rec[12]==0xFF)&&(unsigned char)rec[13]==0xFF) {
337 /* not a floating point value */
342 *pcell=(unsigned char *)strdup(buf);
343 } else if (rec[6]==2) {
346 *pcell=(unsigned char *)strdup(buf);
347 } else if (rec[6]==0) {
/* remember the slot so that the string record handler can store the text
 * of the result in this cell */
348 saved_reference=pcell;
351 int format_code=getshort(rec,4);
352 *pcell=(unsigned char *)strdup(format_double(rec,6,format_code));
357 unsigned char *src=(unsigned char *)rec;
358 if (!saved_reference) {
359 fprintf(stderr,"String record without preceeding string formula\n");
362 *saved_reference=copy_unicode_string(&src);
367 fprintf(stderr,"BOF when current sheet is not flushed\n");
373 case 0x43: /*from perl module Spreadsheet::ParseExecel */
375 short int formatIndex = getshort(rec,2);
376 /* we are interested only in format index here */
/* the table grows by 16 entries whenever it is full */
377 if (formatTableIndex >= formatTableSize) {
378 formatTable=realloc(formatTable,
379 (formatTableSize+=16)*sizeof(short int));
382 fprintf(stderr,"Out of memory for format table");
386 formatTable[formatTableIndex++] = formatIndex;
389 case MS1904: /* Macintosh 1904 date system */
401 /* fprintf(stderr,"Row! %d %d %d\n",getshort(rec,0), getshort(rec+2,0),getshort(rec+4,0)); */
405 /* fprintf(stderr,"INDEX! %d %d\n", getlong(rec+4,0), getlong(rec+8,0)); */
410 fprintf(stderr,"Unknown record 0x%x\n length %d\n",rectype,reclen);
414 prev_rectype=rectype;
418 * Extracts string from sst and returns mallocked copy of it
/*
 * Decodes the string stored at *src into a malloc'ed buffer.
 *
 * Layout as read by the visible code:
 *  - a 16-bit character count at offset 0;
 *  - a flags byte, looked for at *src+1+offset and, if the value found
 *    there is not one of 0, 1, 4, 5, 8, 9, 0x0c, 0x0d, at another position
 *    (offset is adjusted on lines that are not part of this listing);
 *  - bit 0 of flags selects 2-byte (set) or 1-byte (clear) characters;
 *  - bits 2 and 3 of flags select optional extra header fields, which fix
 *    start_offset (header length) and to_skip (bytes after the text).
 *
 * Each character is turned into a code point (2-byte characters directly,
 * 1-byte characters through source_charset) and the text returned by
 * convert_char() is appended to the destination, which is enlarged with
 * realloc when needed.
 *
 * NOTE(review): the return statements and the final update of *src are not
 * part of this listing; a commented-out debug line mentions returning NULL
 * for unrecognised flags. Confirm against the full source.
 * NOTE(review): no length limit is passed in, and none of the visible
 * reads is checked against the end of the source buffer.
 */
420 unsigned char *copy_unicode_string (unsigned char **src) {
424 int to_skip=0; /* Used to counmt data after end of string */
425 int offset = 1; /* Variable length of the first field */
427 /* char *realstart=*src; */
428 unsigned char *dest;/* where to copy string */
429 unsigned char *s,*d,*c;
433 /* for(i=0;i<20;i++) */
434 /* fprintf(stderr,"%02x ",(*src)[i]); */
435 /* fprintf(stderr,"\n"); */
437 flags = *(*src+1+offset);
438 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
439 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
441 flags = *(*src+offset);
443 flags = *(*src+1+offset);
444 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
445 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
446 /* fprintf(stderr,"Strange flags = %d, returning NULL\n", flags); */
451 count=getshort(*src,0);
453 charsize=(flags &0x01) ? 2 : 1;
455 switch (flags & 12 ) {
456 case 0x0c: /* Far East with RichText formating */
457 to_skip=4*getshort(*src,2+offset)+getlong(*src, 4+offset);
458 start_offset=2+offset+2+4;
459 /* fprintf(stderr,"Far East with RichText formating\n"); */
462 case 0x08: /* With RichText formating */
463 to_skip=4*getshort(*src,2+offset);
464 start_offset=2+offset+2;
465 /* fprintf(stderr,"With RichText formating %d\n",getshort(*src,2+offset)); */
468 case 0x04: /* Far East */
469 to_skip=getlong(*src, 2+offset);
470 start_offset=2+offset+4;
471 /* fprintf(stderr,"Far East\n"); */
476 start_offset=2+offset;
477 /* fprintf(stderr,"Default string\n"); */
480 /* fprintf(stderr,"count=%d skip=%d start_offset=%d\n", */
481 /* count, to_skip, start_offset); */
482 /* and here we copy the string */
483 if ( (dest=malloc(count+1)) == NULL ) {
484 perror("Dest string alloc error");
485 *src+=(to_skip+start_offset+(count*charsize));
491 for (s=*src,d=dest,i=0;i<count;i++,s+=charsize) {
492 /* fprintf(stderr,"l=%d len=%d count=%d charsize=%d\n",l,len,count,charsize); */
/* a byte of 0 or 1 at the current position (for 2-byte characters only
 * when the next byte is not 4) is taken as a new flags byte and charsize
 * is recomputed from its bit 0 */
493 if ( (charsize == 1 && (*s == 1 || *s == 0)) ||
494 (charsize == 2 && (*s == 1 || *s == 0) && *(s+1) != 4)) {
495 /* fprintf(stderr,"extchar (unicode)=%02x %02x\n",*s, *(s+1)); */
496 charsize=(*s &0x01) ? 2 : 1;
502 if ( charsize == 2 ){
503 u=(unsigned short)getshort(s,0);
504 c=(unsigned char *)convert_char(u);
505 /* fprintf(stderr,"char=%02x %02x\n", *s, *(s+1)); */
507 if (!source_charset) {
508 check_charset(&source_csname,source_csname);
509 /* fprintf(stderr,"charset=%s\n",source_csname);*/
510 source_charset=read_charset(source_csname);
512 u=(unsigned short)to_unicode(source_charset,(unsigned char)*s);
513 c=(unsigned char *)convert_char(u);
516 int dl = strlen((char *)c);
519 dest=realloc(dest,len+1);
522 strcpy((char *)d,(char *)c);
533 * Format code is index into format table (which is list of XF records
535 * Second word of XF record is format type index
536 * format index between 0x0E and 0x16 also between 0x2D and 0x2F denotes
537 * date if it is not used for explicitly stored formats.
538 * BuiltInDateFormatIdx converts format index into index of explicit
539 * built-in date formats suitable for strftime.
/*
 * Maps a format index onto a 1-based slot number of the built-in date
 * formats: 0x0E..0x16 give 1..9 and 0x2D..0x2F give 10..12. Index 0xA4 is
 * tested separately; its return statement is not part of this listing
 * (presumably slot 13, the last entry of the table in
 * GetBuiltInDateFormat; confirm).
 * As the comment in the body says, 0 stands for "format not found".
 */
541 int BuiltInDateFormatIdx (int index) {
543 offset=1; /* offset of date formats */
544 /* 0 is used as false -- format not found */
545 if ((index>= 0x0E) && (index<=0x16)) {
546 return offset+index-0x0E;
548 if ((index>=0x2d) && (index<=0x2F)) {
549 return offset+index-0x2d+9;
550 } else if (index==0xa4) {
557 * GetBuiltInDateFormat stores and returns
558 * built in xls2csv strftime formats.
/* Number of built-in date formats; the table below holds one more entry
 * because its slot 0 is a reserved NULL. */
560 #define NUMOFDATEFORMATS 13
/*
 * Returns the strftime pattern stored in slot dateindex of the table.
 * The table is indexed only when 0 < dateindex <= NUMOFDATEFORMATS.
 * NOTE(review): the value returned for any other dateindex is not part of
 * this listing (presumably NULL; confirm).
 */
561 char *GetBuiltInDateFormat(int dateindex) {
562 static char *formats[]={
563 /* reserved */ NULL, /* BuiltInDateFormatIdx use dateindex=0 as flag format not found */
564 /* 0x0E */ "%m-%d-%y", /* 01 */
565 /* 0x0F */ "%d-%b-%y", /* 02 */
566 /* 0x10 */ "%d-%b", /* 03 */
567 /* 0x11 */ "%b-%d", /* 04 */
568 /* 0x12 */ "%l:%M %p", /* 05 */
569 /* 0x13 */ "%l:%M:%S %p", /* 06 */
570 /* 0x14 */ "%H:%M", /* 07 */
571 /* 0x15 */ "%H:%M:%S", /* 08 */
572 /* 0x16 */ "%m-%d-%y %H:%M", /* 09 */
573 /* 0x2d */ "%M:%S", /* 10 */
574 /* 0x2e */ "%H:%M:%S", /* 11 */
575 /* 0x2f */ "%M:%S", /* 12 */
576 /* 0xa4 */ "%m.%d.%Y %l:%M:%S %p" /* 13 */
578 if (dateindex>0 && dateindex <= NUMOFDATEFORMATS) {
579 return formats[dateindex];
/*
 * One flag per built-in date format slot, indexed by the value returned
 * from BuiltInDateFormatIdx(). Slot numbers are 1-based (0 means "format
 * not found") and run up to NUMOFDATEFORMATS, so the array needs
 * NUMOFDATEFORMATS+1 elements; with only NUMOFDATEFORMATS elements the
 * highest slot would be indexed one past the end of the array.
 */
584 static char FormatIdxUsed[NUMOFDATEFORMATS+1];
/*
 * Walks every element of FormatIdxUsed, including the one for the highest
 * slot; do_table() calls this before it starts reading a stream.
 * NOTE(review): the loop body is not part of this listing (presumably it
 * clears each flag; confirm).
 */
586 void CleanUpFormatIdxUsed() {
588 for (i=0;i<=NUMOFDATEFORMATS; i++)
593 * format index between 0x0E and 0x16 also between 0x2D and 0x2F denotes
594 * date in case when they are built-in Excel97 formats.
595 * Nevertheless, those indexes can be used for explicitly stored formats,
596 * which are not dates in general.
597 * SetFormatIdxUsed marks these formats as already used
598 * and excludes them from list of built-in formats
599 * preventing misformatting of corresponding data.
/*
 * Converts format_code to a built-in date slot with BuiltInDateFormatIdx()
 * and sets the FormatIdxUsed flag of that slot to 1. process_item() calls
 * this with the code read from a format record.
 * NOTE(review): the line between the lookup and the store is not part of
 * this listing (presumably a test that the slot is non-zero; confirm).
 */
601 void SetFormatIdxUsed(int format_code) {
603 /*fprintf(stderr,"Format idx %x to be set to dirty\n",format_code);
605 dateindex=BuiltInDateFormatIdx(format_code);
607 FormatIdxUsed[dateindex]=1;
608 /*fprintf(stderr,"Date idx %d is set to be dirty\n",dateindex); */
613 * format index between 0x0E and 0x16 also between 0x2D and 0x2F denotes
614 * date in case when they are built-in Excel97 formats.
615 * Nevertheless, those indexes can be used for explicitly stored formats,
616 * which are not dates in general.
617 * SetFormatIdxUsed marks these formats as already used
618 * and excludes them from list of built-in formats
619 * preventing misformatting of corresponding data.
620 * IsFormatIdxUsed tests this case.
/*
 * Converts format_code to a built-in date slot with BuiltInDateFormatIdx()
 * and reports whether the FormatIdxUsed flag of that slot equals 1.
 * NOTE(review): the lines around the lookup are not part of this listing,
 * so the result for a format_code that maps to slot 0 cannot be stated
 * from here; confirm against the full source.
 */
622 char IsFormatIdxUsed(int format_code) {
624 dateindex=BuiltInDateFormatIdx(format_code);
626 /* fprintf(stderr,"Date idx %d is dirty\n",dateindex); */
627 return FormatIdxUsed[dateindex]==1;
/*
 * isDateFormat(format_code), as far as this listing shows:
 *  - format_code is an index into formatTable; a code that is not below
 *    formatTableIndex produces the "used before definition" diagnostic;
 *  - the table entry is a format index; if IsFormatIdxUsed() reports it
 *    as used, the "is redefined" diagnostic is printed and the format is
 *    treated as user-defined;
 *  - otherwise the entry is mapped with BuiltInDateFormatIdx(), and
 *    forced_date_format, when set, is returned in preference to the
 *    pattern from GetBuiltInDateFormat().
 * number2string() treats a NULL result as "not a date".
 * NOTE(review): format_code is an int and formatTableIndex a size_t, so
 * the range test is carried out on unsigned values.
 * NOTE(review): the remaining return statements are not part of this
 * listing; confirm them against the full source.
 */
633 /* Checks if format denoted by given code is date
634 * Format code is index into format table (which is list of XF records
636 * Second word of XF record is format type index
637 * format index between 0x0E and 0x16 also between 0x2D and 0x2F denotes
639 * If so, it returns strftime format for this date. Otherwise returns
644 char *isDateFormat(int format_code) {
647 if (format_code>=formatTableIndex) {
648 fprintf(stderr,"Format code %d is used before definition\n",format_code);
652 index = formatTable[format_code];
653 if (IsFormatIdxUsed(index)) {
654 fprintf(stderr,"Format %x is redefined\n",index);
655 /* this format is something user-defined --- not a standard built-in date*/
658 dateindex=BuiltInDateFormatIdx(index);
660 if (forced_date_format) return forced_date_format;
661 return GetBuiltInDateFormat(dateindex);
668 time_t float2date(double d);
670 * Extracts floating point value and formats it
/*
 * number2string(d, format_code) writes d into a static 128-byte buffer.
 * When isDateFormat(format_code) yields a pattern, d is converted with
 * float2date() and printed with strftime using that pattern; the other
 * visible path prints d with sprintf and number_format.
 * Because the buffer is static, each call overwrites the previous text
 * (the callers in process_item() copy the result with strdup).
 * NOTE(review): the result of gmtime() is handed to strftime() without a
 * NULL check.
 * NOTE(review): the return statement is not part of this listing
 * (presumably the static buffer; confirm).
 */
673 char *number2string(double d,short int format_code) {
674 static char buffer [128];
676 if ((datefmt=isDateFormat(format_code))!=NULL) {
677 time_t t = float2date(d);
678 strftime(buffer, 127,datefmt, gmtime(&t));
680 sprintf(buffer,number_format,d);
/*
 * Copies the 8 bytes that start at rec+offset into a union with a double
 * and formats that double with number2string(..., format_code).
 * When WORDS_BIGENDIAN is defined the bytes are copied in reverse order
 * (starting from rec+offset+8 and walking backwards); the other loop
 * copies them in stored order.
 */
685 char *format_double(unsigned char *rec,int offset,int format_code) {
686 union { unsigned char cc[8];
690 # ifdef WORDS_BIGENDIAN
691 for(s=rec+offset+8,d=dconv.cc,i=0;
692 i<8;i++) *(d++)=*(--s);
694 for(s=rec+offset,d=dconv.cc,i=0;
695 i<8;i++) *(d++)=*(s++);
697 return number2string(dconv.d,format_code);
701 * Formats integer value into static buffer
/*
 * Prints value with "%i" into a static 12-byte buffer, so each call
 * overwrites the text of the previous one. format_code is not used by
 * the lines visible here.
 * NOTE(review): the return statement is not part of this listing
 * (presumably the static buffer; confirm).
 */
703 char *format_int(int value,int format_code) {
704 static char buffer[12];
705 sprintf(buffer,"%i",value);
/*
 * Turns the 4-byte value at rec into a double and formats it with
 * number2string(value, format_code).
 * Two decodings are visible:
 *  - an integer one, which takes the 32-bit value at rec shifted right by
 *    two bits;
 *  - a floating point one, which places the 4 bytes into one half of an
 *    8-byte union with a double (reversed when WORDS_BIGENDIAN is defined)
 *    and clears the two lowest bits of the least significant copied byte
 *    with the 0xfc mask.
 * NOTE(review): the tests that choose between the two decodings, and any
 * scaling applied afterwards, are not part of this listing; confirm them
 * against the full source.
 */
711 char* format_rk(unsigned char *rec,short int format_code) {
717 value=(double)(getlong(rec,0)>>2);
720 union { unsigned char cc[8];
725 # ifdef WORDS_BIGENDIAN
726 for(s=rec+4,d=dconv.cc,i=0; i<4;i++)
728 dconv.cc[0]=dconv.cc[0] & 0xfc;
730 for(s=rec,d=dconv.cc+4,i=0;
731 i<4;i++) *(d++)=*(s++);
732 dconv.cc[3]=dconv.cc[3] & 0xfc;
738 return number2string(value,format_code);
743 * Converts excel date into time_t
/*
 * Converts a day count f to a time_t: date_shift is subtracted, the result
 * is multiplied by 86400 (seconds per day) and rounded with rint().
 * date_shift is a file-scope variable, so the epoch used depends on its
 * current value.
 */
745 time_t float2date(double f) {
746 /* Hacked version. Excell stores date as floating point count of days
747 * since 1.1.1900. or 1.1.1904
748 * We are substracting value of 1.1.1970 and multiplying
749 * by 86400 thus getting seconds from the epoch
751 return rint((f-date_shift)*86400);
755 * Parses SST into array of strings
/*
 * Builds the global string array sst from the raw SST bytes in sstbuf
 * (bufsize bytes long).
 * The string count is read as a 32-bit value at offset 4 and stored in
 * sstsize; an array of sstsize pointers is allocated and zeroed; strings
 * are then decoded one after another with copy_unicode_string(), starting
 * at offset 8, until sstsize strings have been read or the read position
 * reaches the end of the buffer. Entries that were not reached stay NULL.
 * NOTE(review): sstsize comes from the file and is used in the allocation
 * size without a range check in the visible lines; bufsize is likewise
 * not tested for being at least 8 before the reads at offsets 4 and 8.
 */
757 void parse_sst(unsigned char *sstbuf,int bufsize) {
758 int i; /* index into sst */
759 unsigned char *curString; /* pointer into unparsed buffer*/
760 unsigned char *barrier=(unsigned char *)sstbuf+bufsize; /*pointer to end of buffer*/
761 unsigned char **parsedString;/*pointer into parsed array*/
763 sstsize = getlong(sstbuf+4,0);
764 sst=malloc(sstsize*sizeof(char *));
767 perror("SST allocation error");
770 memset(sst,0,sstsize*sizeof(char *));
771 for (i=0,parsedString=sst,curString=sstbuf+8;
772 i<sstsize && curString<barrier; i++,parsedString++) {
773 /* fprintf(stderr,"copying %d string\n",i); */
774 *parsedString = copy_unicode_string(&curString);
776 /* fprintf(stderr,"end sst i=%d sstsize=%d\n",i,sstsize); */