/*
 * pdf_read.c
 *
 * Version 1.0 -- August 5, 2005 -- Pius Fischer
 */

#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Tag reported by PDF_Object::get_type() to identify the concrete class. */
typedef enum {
    PDF_ObjectType_Boolean,
    PDF_ObjectType_Number,
    PDF_ObjectType_String,
    PDF_ObjectType_Name,
    PDF_ObjectType_Array,
    PDF_ObjectType_Dictionary,
    PDF_ObjectType_Stream,
    PDF_ObjectType_Null,
    PDF_ObjectType_Indirect
} PDF_ObjectType;

/*
 * Abstract base of every parsed PDF object.  The destructor is virtual so
 * objects can be destroyed through a PDF_Object pointer.
 */
class PDF_Object {
public:
    PDF_Object() {}
    virtual ~PDF_Object() {}

    /* Returns the tag of the concrete object type. */
    virtual PDF_ObjectType get_type(void) = 0;
};

/* An integer value. */
class PDF_Number : public PDF_Object {
public:
    PDF_Number(long _number) { m_number = _number; }
    ~PDF_Number() {}

    PDF_ObjectType get_type(void) { return PDF_ObjectType_Number; }
    long get_number(void) { return m_number; }

private:
    long m_number;
};

/*
 * A string object holding a private copy of the text handed to the
 * constructor.  The constructor and destructor are defined out of line.
 */
class PDF_String : public PDF_Object {
public:
    PDF_String(const char *_string);
    ~PDF_String();

    PDF_ObjectType get_type(void) { return PDF_ObjectType_String; }

private:
    /*
     * Not copyable: the object owns m_string, so a member-wise copy would
     * release the same buffer twice.  Declared but intentionally never
     * defined.
     */
    PDF_String(const PDF_String &);
    PDF_String &operator=(const PDF_String &);

    char *m_string;
};

/*
 * A name object holding a private copy of the text handed to the
 * constructor.  The constructor and destructor are defined out of line.
 */
class PDF_Name : public PDF_Object {
public:
    PDF_Name(const char *_name);
    ~PDF_Name();

    PDF_ObjectType get_type(void) { return PDF_ObjectType_Name; }

    /* Returns the stored name; the pointer stays owned by this object. */
    char *get_name(void) { return m_name; }

private:
    /* Not copyable, for the same reason as PDF_String. */
    PDF_Name(const PDF_Name &);
    PDF_Name &operator=(const PDF_Name &);

    char *m_name;
};

/* An array object.  Elements are currently not stored: add() is a no-op. */
class PDF_Array : public PDF_Object {
public:
    PDF_Array() {}
    ~PDF_Array() {}

    PDF_ObjectType get_type(void) { return PDF_ObjectType_Array; }
    void add(PDF_Object *_obj) {}
};

/*
 * A dictionary object.  It has room for a single value pointer; get() and
 * add() are defined out of line.  The destructor does not delete the value.
 */
class PDF_Dictionary : public PDF_Object {
public:
    PDF_Dictionary() { m_value_obj = NULL; }
    ~PDF_Dictionary() {}

    PDF_ObjectType get_type(void) { return PDF_ObjectType_Dictionary; }

    PDF_Object *get(const char *_key);
    void add(PDF_Name *_key_obj, PDF_Object *_value_obj);

private:
    PDF_Object *m_value_obj;
};

/* A reference to another object, identified by its object number. */
class PDF_IndirectObject : public PDF_Object {
public:
    PDF_IndirectObject(unsigned int _obj_num) { m_obj_num = _obj_num; }
    ~PDF_IndirectObject() {}

    PDF_ObjectType get_type(void) { return PDF_ObjectType_Indirect; }
    unsigned int get_obj_num(void) { return m_obj_num; }

private:
    unsigned int m_obj_num;
};

/* Longest token the tokenizer accepts, not counting the terminating NUL. */
#define MAX_TOKEN_LEN 1023

/* Current token (always NUL-terminated) and its length in bytes. */
static char token[MAX_TOKEN_LEN + 1];
static unsigned int token_len;
static bool push_token = false; static bool lookahead = false; static int lookahead_ch; static unsigned int tab_count; typedef struct { long filepos; unsigned int obj_num; unsigned int gen_num; PDF_Object * obj; } XRefEntry; static XRefEntry * xref_array; static unsigned int xref_array_len; typedef enum { PDF_Delimiter, PDF_Whitespace, PDF_Regular } PDF_CharType; static PDF_CharType pdf_char_type[256]; void die(const char *fmt, ...) { va_list args; va_start(args, fmt); vfprintf(stderr, fmt, args); va_end(args); exit(1); } void tprintf(const char *fmt, ...) { for (int i = 0; i < tab_count; ++i) fputc('\t', stdout); va_list args; va_start(args, fmt); vfprintf(stdout, fmt, args); va_end(args); } PDF_String::PDF_String(const char *_string) { m_string = new char[strlen(_string)]; strcpy(m_string, _string); } PDF_String::~PDF_String() { delete m_string; } PDF_Name::PDF_Name(const char *_name) { m_name = new char[strlen(_name)]; strcpy(m_name, _name); } PDF_Name::~PDF_Name() { delete m_name; } PDF_Object *PDF_Dictionary::get(const char *_key) { if (strcmp(_key, "/Length") == 0) return m_value_obj; return NULL; } void PDF_Dictionary::add(PDF_Name *_key_obj, PDF_Object *_value_obj) { if (strcmp(_key_obj->get_name(), "/Length") == 0) m_value_obj = _value_obj; } void init_pdf_char_type(void) { const char *delimiter_array = "%()/<>[]{}"; const char *whitespace_array = "\t\n\f\r "; pdf_char_type[0] = PDF_Whitespace; for (int i = 1; i < 256; ++i) pdf_char_type[i] = PDF_Regular; for (const char *p = delimiter_array; *p; ++p) pdf_char_type[*p] = PDF_Delimiter; for (const char *p = whitespace_array; *p; ++p) pdf_char_type[*p] = PDF_Whitespace; } bool token_is_a_number(void) { for (int i = 0; i < token_len; ++i) if (token[i] < '0' || token[i] > '9') return false; return true; } void token_append(char ch) { if (token_len == MAX_TOKEN_LEN) die("Exceeded max token length\n"); token[token_len] = ch; token[++token_len] = 0; } void read_string(FILE *_file) { int ch; unsigned int 
paren_count = 1; for (;;) { if ((ch = fgetc(_file)) == EOF) die("Unexpected end of file\n"); token_append(ch); if (ch == '(' && token[token_len - 2] != '\\') ++paren_count; if (ch == ')' && token[token_len - 2] != '\\') if (--paren_count == 0) return; } } void get_next_token(FILE *_file) { int ch; if (push_token) { push_token = false; return; } token[0] = 0; token_len = 0; for (;;) { if (lookahead) { ch = lookahead_ch; lookahead = false; } else { if ((ch = fgetc(_file)) == EOF) die("Unexpected end of file\n"); } switch (pdf_char_type[ch]) { case PDF_Delimiter: if (token_len != 0) { lookahead_ch = ch; lookahead = true; return; } switch (ch) { case '<': case '>': token_append(ch); if ((lookahead_ch = fgetc(_file)) == EOF) die("Unexpected end of file\n"); if (lookahead_ch == ch) token_append(ch); else lookahead = true; return; case '%': for (;;) { if ((ch = fgetc(_file)) == EOF) die("Unexpected end of file\n"); if (ch == '\r' || ch == '\n') break; } break; case '/': token_append(ch); break; case '[': case ']': token_append(ch); return; case '(': token_append(ch); read_string(_file); return; case ')': case '{': case '}': default: die("Unexpected character\n"); } break; case PDF_Whitespace: if (token_len != 0) { if (ch == '\r') { if ((ch = fgetc(_file)) == EOF) die("Unexpected end of file\n"); if (ch != '\n') { lookahead_ch = ch; lookahead = true; } } return; } break; default: token_append(ch); break; } } } void seek_and_tell(FILE *_file, long _offset, int _whence) { long filepos; if (fseek(_file, _offset, _whence) != 0) { const char *whence_str = "UNKNOWN"; switch (_whence) { case SEEK_SET: whence_str = "SEEK_SET"; break; case SEEK_CUR: whence_str = "SEEK_CUR"; break; case SEEK_END: whence_str = "SEEK_END"; break; } die("fseek to %ld bytes from %s failed: %s\n", _offset, whence_str, strerror(errno)); } if ((filepos = ftell(_file)) < 0) { die("ftell failed: %s\n", strerror(errno)); } lookahead = false; // tprintf("Seek to position %ld\n", filepos); } PDF_Object 
*parse_object(FILE *_file) { get_next_token(_file); if (strcmp(token, "<<") == 0) { tprintf("<<\n"); ++tab_count; PDF_Dictionary *obj = new PDF_Dictionary(); for (;;) { get_next_token(_file); if (strcmp(token, ">>") == 0) break; if (token[0] != '/') die("Dictionary key must be a name\n"); tprintf("%s\n", token); PDF_Name *key_obj = new PDF_Name(token); PDF_Object *value_obj = parse_object(_file); obj->add(key_obj, value_obj); } --tab_count; tprintf(">>\n"); return obj; } if (token[0] == '/') { tprintf("%s\n", token); return new PDF_Name(token); } if (token_is_a_number()) { long savepos = ftell(_file); int number1 = atoi(token); get_next_token(_file); if (!token_is_a_number()) { push_token = true; tprintf("%d\n", number1); return new PDF_Number(number1); } int number2 = atoi(token); get_next_token(_file); if (strcmp(token, "R") == 0) { if (number2 != xref_array[number1].gen_num) die("Generation number does not match\n"); tprintf("%d 0 R\n", number1); return new PDF_IndirectObject(number1); } seek_and_tell(_file, savepos, SEEK_SET); tprintf("%d\n", number1); return new PDF_Number(number1); } if (strcmp(token, "[") == 0) { tprintf("[\n"); ++tab_count; PDF_Array *obj = new PDF_Array(); for (;;) { get_next_token(_file); if (strcmp(token, "]") == 0) break; push_token = true; obj->add(parse_object(_file)); } --tab_count; tprintf("]\n"); return obj; } if (token[0] == '(') { tprintf("%s\n", token); return new PDF_String(token); } tprintf("%s\n", token); return NULL; } void save_stream(FILE *_file, size_t _stream_len, unsigned int _obj_num) { static char stream_buffer[16384]; static char outfile_name[128]; FILE *outfile; snprintf(outfile_name, sizeof(outfile_name), "stream.%u", _obj_num); if ((outfile = fopen(outfile_name, "w")) == NULL) die("Cannot open %s: %s\n", outfile_name, strerror(errno)); while (_stream_len) { size_t read_len = sizeof(stream_buffer); if (_stream_len < read_len) read_len = _stream_len; _stream_len -= read_len; fread(stream_buffer, read_len, 1, _file); 
fwrite(stream_buffer, read_len, 1, outfile); } fclose(outfile); } void read_object(FILE *_file, XRefEntry *_xref_entry); void read_stream(FILE *_file, XRefEntry *_xref_entry) { tprintf("stream\n"); ++tab_count; PDF_Dictionary *_dictionary = (PDF_Dictionary *)(_xref_entry->obj); PDF_Object *len_obj = _dictionary->get("/Length"); if (len_obj->get_type() == PDF_ObjectType_Indirect) { unsigned int len_obj_num = ((PDF_IndirectObject *)len_obj)->get_obj_num(); if (xref_array[len_obj_num].obj == NULL) { long savepos; if ((savepos = ftell(_file)) < 0) die("ftell failed: %s\n", strerror(errno)); read_object(_file, &xref_array[len_obj_num]); seek_and_tell(_file, savepos, SEEK_SET); } len_obj = xref_array[len_obj_num].obj; } if (len_obj->get_type() != PDF_ObjectType_Number) die("The stream length must be a number\n"); long stream_len = ((PDF_Number *)len_obj)->get_number(); save_stream(_file, stream_len, _xref_entry->obj_num); lookahead = false; get_next_token(_file); if (strcmp(token, "endstream") != 0) die("Expected the endstream keyword\n"); --tab_count; tprintf("endstream\n"); } void read_object(FILE *_file, XRefEntry *_xref_entry) { tprintf("obj %d 0\n", _xref_entry->obj_num); ++tab_count; seek_and_tell(_file, _xref_entry->filepos, SEEK_SET); get_next_token(_file); if (!token_is_a_number()) die("Object number does not match\n"); if (_xref_entry->obj_num != atoi(token)) die("Object number does not match\n"); get_next_token(_file); if (!token_is_a_number()) die("Generation number does not match\n"); if (_xref_entry->gen_num != atoi(token)) die("Generation number does not match\n"); get_next_token(_file); if (strcmp(token, "obj") != 0) die("Expected the obj keyword\n"); _xref_entry->obj = parse_object(_file); get_next_token(_file); if (strcmp(token, "stream") == 0) { read_stream(_file, _xref_entry); get_next_token(_file); } if (strcmp(token, "endobj") != 0) die("Expected the endobj keyword\n"); --tab_count; tprintf("endobj\n"); } void read_trailer(FILE *_file) { 
get_next_token(_file); if (strcmp(token, "trailer") != 0) die("Expected the trailer keyword\n"); tprintf("trailer\n"); parse_object(_file); } void read_xref(FILE *_file) { char keyword; unsigned int obj_num; unsigned int gen_num; unsigned int max_obj; long filepos; if (fscanf(_file, "xref %u %u", &obj_num, &xref_array_len) != 2) die("read_xref: fscanf failed\n"); xref_array = new XRefEntry[xref_array_len]; for (max_obj = obj_num + xref_array_len; obj_num < max_obj; ++obj_num) { if (fscanf(_file, "%ld %u %c", &filepos, &gen_num, &keyword) != 3) die("read_xref: fscanf failed\n"); xref_array[obj_num].obj_num = obj_num; xref_array[obj_num].obj = NULL; if (keyword == 'f') { xref_array[obj_num].filepos = 0; xref_array[obj_num].gen_num = 0; continue; } if (keyword != 'n') die("read_xref: syntax error\n"); xref_array[obj_num].filepos = filepos; xref_array[obj_num].gen_num = gen_num; // fprintf(stderr, "obj %d %d starts at %ld\n", // obj_num, gen_num, filepos); } } #define EOF_BUFFER_SIZE 40 void read_startxref(FILE *_file) { static char eof_buffer[EOF_BUFFER_SIZE]; const char *p; long filepos = 0; long power_of_ten = 1; seek_and_tell(_file, 0, SEEK_END); seek_and_tell(_file, -EOF_BUFFER_SIZE, SEEK_CUR); if (fread(eof_buffer, EOF_BUFFER_SIZE, 1, _file) != 1) die("fread failed\n"); p = eof_buffer + EOF_BUFFER_SIZE - 1; if (*p == '\n') --p; if (*p == '\r') --p; if (strncmp(p - 4, "%%EOF", 5) != 0) die("Cannot read the end of file marker\n"); p -= 5; if (*p == '\n') --p; if (*p == '\r') --p; while (*p >= '0' && *p <= '9') { filepos += power_of_ten * (*p - '0'); if (--p < eof_buffer) die("Cannot read the startxref value\n"); power_of_ten *= 10; } if (filepos == 0) die("Cannot read the startxref value\n"); // fprintf(stderr, "startxref = %ld\n", filepos); if (p < eof_buffer + 10) die("Cannot read the startxref keyword\n"); if (*p == '\n') --p; if (*p == '\r') --p; if (strncmp(p - 8, "startxref", 9) != 0) die("Cannot read the startxref keyword\n"); seek_and_tell(_file, filepos, 
SEEK_SET); } int main(int argc, char **argv) { FILE *file; if (argc != 2) die("Usage: %s [filename]\n", argv[0]); if ((file = fopen(argv[1], "r")) == NULL) die("Cannot open %s: %s\n", argv[1], strerror(errno)); read_startxref(file); read_xref(file); init_pdf_char_type(); read_trailer(file); for (int i = 0; i < xref_array_len; ++i) if (xref_array[i].filepos > 0 && xref_array[i].obj == NULL) read_object(file, &xref_array[i]); fclose(file); return 0; }