// Copyright (C) 2002-2012 Nikolaus Gebhardt // This file is part of the "Irrlicht Engine" and the "irrXML" project. // For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h #ifndef IRR_ICXML_READER_IMPL_H_INCLUDED #define IRR_ICXML_READER_IMPL_H_INCLUDED #include "irrXML.h" #include "irrString.h" #include "irrArray.h" #include "fast_atof.h" #ifdef _DEBUG #define IRR_DEBUGPRINT(x) printf((x)); #else // _DEBUG #define IRR_DEBUGPRINT(x) #endif // _DEBUG namespace irr { namespace io { //! implementation of the IrrXMLReader template class CXMLReaderImpl : public IIrrXMLReader { public: //! Constructor CXMLReaderImpl(IFileReadCallBack* callback, bool deleteCallBack = true) : IgnoreWhitespaceText(true), TextData(0), P(0), TextBegin(0), TextSize(0), CurrentNodeType(EXN_NONE), SourceFormat(ETF_ASCII), TargetFormat(ETF_ASCII), IsEmptyElement(false) { if (!callback) return; storeTargetFormat(); // read whole xml file readFile(callback); // clean up if (deleteCallBack) delete callback; // create list with special characters createSpecialCharacterList(); // set pointer to text begin P = TextBegin; } //! Destructor virtual ~CXMLReaderImpl() { delete [] TextData; } //! Reads forward to the next xml node. //! \return Returns false, if there was no further node. virtual bool read() IRR_OVERRIDE { // if not end reached, parse the node if (P && ((unsigned int)(P - TextBegin) < TextSize - 1) && (*P != 0)) { return parseCurrentNode(); } return false; } //! Returns the type of the current XML node. virtual EXML_NODE getNodeType() const IRR_OVERRIDE { return CurrentNodeType; } //! Returns attribute count of the current XML node. virtual unsigned int getAttributeCount() const IRR_OVERRIDE { return Attributes.size(); } //! Returns name of an attribute. virtual const char_type* getAttributeName(int idx) const IRR_OVERRIDE { if ((u32)idx >= Attributes.size()) return 0; return Attributes[idx].Name.c_str(); } //! Returns the value of an attribute. virtual const char_type* getAttributeValue(int idx) const IRR_OVERRIDE { if ((unsigned int)idx >= Attributes.size()) return 0; return Attributes[idx].Value.c_str(); } //! Returns the value of an attribute. virtual const char_type* getAttributeValue(const char_type* name) const IRR_OVERRIDE { const SAttribute* attr = getAttributeByName(name); if (!attr) return 0; return attr->Value.c_str(); } //! Returns the value of an attribute virtual const char_type* getAttributeValueSafe(const char_type* name) const IRR_OVERRIDE { const SAttribute* attr = getAttributeByName(name); if (!attr) return EmptyString.c_str(); return attr->Value.c_str(); } //! Returns the value of an attribute as integer. virtual int getAttributeValueAsInt(const char_type* name, int defaultNotFound) const IRR_OVERRIDE { const SAttribute* attr = getAttributeByName(name); if (!attr) return defaultNotFound; core::stringc c(attr->Value.c_str()); return core::strtol10(c.c_str()); } //! Returns the value of an attribute as integer. virtual int getAttributeValueAsInt(int idx, int defaultNotFound) const IRR_OVERRIDE { const char_type* attrvalue = getAttributeValue(idx); if (!attrvalue) return defaultNotFound; core::stringc c(attrvalue); return core::strtol10(c.c_str()); } //! Returns the value of an attribute as float. virtual float getAttributeValueAsFloat(const char_type* name, float defaultNotFound) const IRR_OVERRIDE { const SAttribute* attr = getAttributeByName(name); if (!attr) return defaultNotFound; core::stringc c = attr->Value.c_str(); return core::fast_atof(c.c_str()); } //! Returns the value of an attribute as float. virtual float getAttributeValueAsFloat(int idx, float defaultNotFound) const IRR_OVERRIDE { const char_type* attrvalue = getAttributeValue(idx); if (!attrvalue) return defaultNotFound; core::stringc c = attrvalue; return core::fast_atof(c.c_str()); } //! Returns the name of the current node. virtual const char_type* getNodeName() const IRR_OVERRIDE { return NodeName.c_str(); } //! Returns data of the current node. virtual const char_type* getNodeData() const IRR_OVERRIDE { return NodeName.c_str(); } //! Returns if an element is an empty element, like virtual bool isEmptyElement() const IRR_OVERRIDE { return IsEmptyElement; } //! Returns format of the source xml file. virtual ETEXT_FORMAT getSourceFormat() const IRR_OVERRIDE { return SourceFormat; } //! Returns format of the strings returned by the parser. virtual ETEXT_FORMAT getParserFormat() const IRR_OVERRIDE { return TargetFormat; } private: // Reads the current xml node // return false if no further node is found bool parseCurrentNode() { char_type* start = P; // more forward until '<' found while(*P != L'<' && *P) ++P; // not a node, so return false if (!*P) return false; if (P - start > 0) { // we found some text, store it if (setText(start, P)) return true; } ++P; // based on current token, parse and report next element switch(*P) { case L'/': parseClosingXMLElement(); break; case L'?': ignoreDefinition(); break; case L'!': if (!parseCDATA()) parseComment(); break; default: parseOpeningXMLElement(); break; } return true; } //! sets the state that text was found. Returns true if set should be set bool setText(char_type* start, char_type* end) { // By default xml preserves all whitespace. But Irrlicht dropped some whitespace by default // in the past which did lead to OS dependent behavior. We just ignore all whitespace for now // as it's the closest to fixing behavior without breaking downward compatibility too much. if ( IgnoreWhitespaceText ) { char_type* p = start; for(; p != end; ++p) if (!isWhiteSpace(*p)) break; if (p == end) return false; } // set current text to the parsed text, and replace xml special characters core::string s(start, (int)(end - start)); NodeName = replaceSpecialCharacters(s); // current XML node type is text CurrentNodeType = EXN_TEXT; return true; } //! ignores an xml definition like void ignoreDefinition() { CurrentNodeType = EXN_UNKNOWN; // move until end marked with '>' reached while(*P != L'>') ++P; ++P; } //! parses a comment void parseComment() { CurrentNodeType = EXN_COMMENT; P += 1; char_type *pCommentBegin = P; int count = 1; // move until end of comment reached while(count) { if (*P == L'>') --count; else if (*P == L'<') ++count; ++P; } P -= 3; NodeName = core::string(pCommentBegin+2, (int)(P - pCommentBegin-2)); P += 3; } //! parses an opening xml element and reads attributes void parseOpeningXMLElement() { CurrentNodeType = EXN_ELEMENT; IsEmptyElement = false; Attributes.clear(); // find name const char_type* startName = P; // find end of element while(*P != L'>' && !isWhiteSpace(*P)) ++P; const char_type* endName = P; // find Attributes while(*P != L'>') { if (isWhiteSpace(*P)) ++P; else { if (*P != L'/') { // we've got an attribute // read the attribute names const char_type* attributeNameBegin = P; while(!isWhiteSpace(*P) && *P != L'=') ++P; const char_type* attributeNameEnd = P; ++P; // read the attribute value // check for quotes and single quotes, thx to murphy while( (*P != L'\"') && (*P != L'\'') && *P) ++P; if (!*P) // malformatted xml file return; const char_type attributeQuoteChar = *P; ++P; const char_type* attributeValueBegin = P; while(*P != attributeQuoteChar && *P) ++P; if (!*P) // malformatted xml file return; const char_type* attributeValueEnd = P; ++P; SAttribute attr; attr.Name = core::string(attributeNameBegin, (int)(attributeNameEnd - attributeNameBegin)); core::string s(attributeValueBegin, (int)(attributeValueEnd - attributeValueBegin)); attr.Value = replaceSpecialCharacters(s); Attributes.push_back(attr); } else { // tag is closed directly ++P; IsEmptyElement = true; break; } } } // check if this tag is closing directly if (endName > startName && *(endName-1) == L'/') { // directly closing tag IsEmptyElement = true; endName--; } NodeName = core::string(startName, (int)(endName - startName)); ++P; } //! parses an closing xml tag void parseClosingXMLElement() { CurrentNodeType = EXN_ELEMENT_END; IsEmptyElement = false; Attributes.clear(); ++P; const char_type* pBeginClose = P; while(*P != L'>') ++P; NodeName = core::string(pBeginClose, (int)(P - pBeginClose)); ++P; } //! parses a possible CDATA section, returns false if begin was not a CDATA section bool parseCDATA() { if (*(P+1) != L'[') return false; CurrentNodeType = EXN_CDATA; // skip '' && (*(P-1) == L']') && (*(P-2) == L']')) { cDataEnd = P - 2; } ++P; } if ( cDataEnd ) NodeName = core::string(cDataBegin, (int)(cDataEnd - cDataBegin)); else NodeName = ""; return true; } // structure for storing attribute-name pairs struct SAttribute { core::string Name; core::string Value; }; // finds a current attribute by name, returns 0 if not found const SAttribute* getAttributeByName(const char_type* name) const { if (name) { for (irr::u32 i=0; i replaceSpecialCharacters( const core::string& origstr) { int pos = origstr.findFirst(L'&'); int oldPos = 0; if (pos == -1) return origstr; core::string newstr; while(pos != -1 && pos < (int)origstr.size()-2) { // check if it is one of the special characters int specialChar = -1; for (int i=0; i<(int)SpecialCharacters.size(); ++i) { const char_type* p = &origstr.c_str()[pos]+1; if (equalsn(&SpecialCharacters[i][1], p, SpecialCharacters[i].size()-1)) { specialChar = i; break; } } if (specialChar != -1) { newstr.append(origstr.subString(oldPos, pos - oldPos)); newstr.append(SpecialCharacters[specialChar][0]); pos += SpecialCharacters[specialChar].size(); } else { newstr.append(origstr.subString(oldPos, pos - oldPos + 1)); pos += 1; } // find next & oldPos = pos; pos = origstr.findNext(L'&', pos); } if (oldPos < (int)origstr.size()) newstr.append(origstr.subString(oldPos, origstr.size()-oldPos)); return newstr; } //! reads the xml file and converts it into the wanted character format. bool readFile(IFileReadCallBack* callback) { long size = callback->getSize(); if (size<0) return false; // We need four terminating 0's at the end. // For ASCII we need 1 0's, for UTF-16 2, for UTF-32 4. size += 4; char* data8 = new char[size]; if (!callback->read(data8, size-4)) { delete [] data8; return false; } // add zeros at end memset(data8+size-4, 0, 4); char16* data16 = reinterpret_cast(data8); char32* data32 = reinterpret_cast(data8); // now we need to convert the data to the desired target format // based on the byte order mark. const unsigned char UTF8[] = {0xEF, 0xBB, 0xBF}; // 0xEFBBBF; const u16 UTF16_BE = 0xFFFE; const u16 UTF16_LE = 0xFEFF; const u32 UTF32_BE = 0xFFFE0000; const u32 UTF32_LE = 0x0000FEFF; // check source for all utf versions and convert to target data format if (size >= 4 && data32[0] == static_cast(UTF32_BE)) { // UTF-32, big endian SourceFormat = ETF_UTF32_BE; convertTextData(data32+1, data8, (size/4)-1); // data32+1 because we need to skip the header } else if (size >= 4 && data32[0] == static_cast(UTF32_LE)) { // UTF-32, little endian SourceFormat = ETF_UTF32_LE; convertTextData(data32+1, data8, (size/4)-1); // data32+1 because we need to skip the header } else if (size >= 2 && data16[0] == UTF16_BE) { // UTF-16, big endian SourceFormat = ETF_UTF16_BE; convertTextData(data16+1, data8, (size/2)-1); // data16+1 because we need to skip the header } else if (size >= 2 && data16[0] == UTF16_LE) { // UTF-16, little endian SourceFormat = ETF_UTF16_LE; convertTextData(data16+1, data8, (size/2)-1); // data16+1 because we need to skip the header } else if (size >= 3 && memcmp(data8,UTF8,3)==0) { // UTF-8 SourceFormat = ETF_UTF8; convertTextData(data8+3, data8, size-3); // data8+3 because we need to skip the header } else { // ASCII SourceFormat = ETF_ASCII; convertTextData(data8, data8, size); } return true; } //! converts the text file into the desired format. /** \param source: begin of the text (without byte order mark) \param pointerToStore: pointer to text data block which can be stored or deleted based on the necessary conversion. \param sizeWithoutHeader: Text size in characters without header */ template void convertTextData(src_char_type* source, char* pointerToStore, int sizeWithoutHeader) { // convert little to big endian if necessary if (sizeof(src_char_type) > 1 && isLittleEndian(TargetFormat) != isLittleEndian(SourceFormat)) convertToLittleEndian(source); // check if conversion is necessary: if (sizeof(src_char_type) == sizeof(char_type)) { // no need to convert TextBegin = (char_type*)source; TextData = (char_type*)pointerToStore; TextSize = sizeWithoutHeader; } else { // convert source into target data format. // TODO: implement a real conversion. This one just // copies bytes. This is a problem when there are // unicode symbols using more than one character. TextData = new char_type[sizeWithoutHeader]; if ( sizeof(src_char_type) == 1 ) { // we have to cast away negative numbers or results might add the sign instead of just doing a copy for (int i=0; i(static_cast(source[i])); } } else { for (int i=0; i(source[i]); } TextBegin = TextData; TextSize = sizeWithoutHeader; // delete original data because no longer needed delete [] pointerToStore; } } //! converts whole text buffer to little endian template void convertToLittleEndian(src_char_type* t) { if (sizeof(src_char_type) == 4) { // 32 bit while(*t) { *t = ((*t & 0xff000000) >> 24) | ((*t & 0x00ff0000) >> 8) | ((*t & 0x0000ff00) << 8) | ((*t & 0x000000ff) << 24); ++t; } } else { // 16 bit while(*t) { *t = (*t >> 8) | (*t << 8); ++t; } } } //! returns if a format is little endian inline bool isLittleEndian(ETEXT_FORMAT f) { return f == ETF_ASCII || f == ETF_UTF8 || f == ETF_UTF16_LE || f == ETF_UTF32_LE; } //! returns true if a character is whitespace inline bool isWhiteSpace(char_type c) { return (c==' ' || c=='\t' || c=='\n' || c=='\r'); } //! generates a list with xml special characters void createSpecialCharacterList() { // list of strings containing special symbols, // the first character is the special character, // the following is the symbol string without trailing &. SpecialCharacters.push_back("&"); SpecialCharacters.push_back("gt;"); SpecialCharacters.push_back("\"quot;"); SpecialCharacters.push_back("'apos;"); } //! compares the first n characters of the strings bool equalsn(const char_type* str1, const char_type* str2, int len) { int i; for(i=0; str1[i] && str2[i] && i < len; ++i) if (str1[i] != str2[i]) return false; // if one (or both) of the strings was smaller then they // are only equal if they have the same length return (i == len) || (str1[i] == 0 && str2[i] == 0); } //! stores the target text format void storeTargetFormat() { // get target format. We could have done this using template specialization, // but VisualStudio 6 don't like it and we want to support it. switch(sizeof(char_type)) { case 1: TargetFormat = ETF_UTF8; break; case 2: TargetFormat = ETF_UTF16_LE; break; case 4: TargetFormat = ETF_UTF32_LE; break; default: TargetFormat = ETF_ASCII; // should never happen. } } // instance variables: bool IgnoreWhitespaceText; // do not return EXN_TEXT nodes for pure whitespace char_type* TextData; // data block of the text file char_type* P; // current point in text to parse char_type* TextBegin; // start of text to parse unsigned int TextSize; // size of text to parse in characters, not bytes EXML_NODE CurrentNodeType; // type of the currently parsed node ETEXT_FORMAT SourceFormat; // source format of the xml file ETEXT_FORMAT TargetFormat; // output format of this parser core::string NodeName; // name of the node currently in - also used for text core::string EmptyString; // empty string to be returned by getSafe() methods bool IsEmptyElement; // is the currently parsed node empty? core::array< core::string > SpecialCharacters; // see createSpecialCharacterList() core::array Attributes; // attributes of current element }; // end CXMLReaderImpl } // end namespace } // end namespace #endif