| unicode.h | | unicode.h | |
| /** @file unicode.h | | /** @file unicode.h | |
| * @brief Unicode and UTF-8 related classes and functions. | | * @brief Unicode and UTF-8 related classes and functions. | |
| */ | | */ | |
|
| /* Copyright (C) 2006,2007,2008 Olly Betts | | /* Copyright (C) 2006,2007,2008,2009 Olly Betts | |
| * | | * | |
| * This program is free software; you can redistribute it and/or modify | | * This program is free software; you can redistribute it and/or modify | |
| * it under the terms of the GNU General Public License as published by | | * it under the terms of the GNU General Public License as published by | |
| * the Free Software Foundation; either version 2 of the License, or | | * the Free Software Foundation; either version 2 of the License, or | |
| * (at your option) any later version. | | * (at your option) any later version. | |
| * | | * | |
| * This program is distributed in the hope that it will be useful, | | * This program is distributed in the hope that it will be useful, | |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| * GNU General Public License for more details. | | * GNU General Public License for more details. | |
| | | | |
| skipping to change at line 30 | | skipping to change at line 30 | |
| | | | |
| #ifndef XAPIAN_INCLUDED_UNICODE_H | | #ifndef XAPIAN_INCLUDED_UNICODE_H | |
| #define XAPIAN_INCLUDED_UNICODE_H | | #define XAPIAN_INCLUDED_UNICODE_H | |
| | | | |
| #include <xapian/visibility.h> | | #include <xapian/visibility.h> | |
| | | | |
| #include <string> | | #include <string> | |
| | | | |
| namespace Xapian { | | namespace Xapian { | |
| | | | |
|
| /** An iterator which returns unicode character values from a UTF-8 encoded | | /** An iterator which returns Unicode character values from a UTF-8 encoded | |
| * string. | | * string. | |
| */ | | */ | |
| class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator { | | class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator { | |
| const unsigned char *p; | | const unsigned char *p; | |
| const unsigned char *end; | | const unsigned char *end; | |
| mutable unsigned seqlen; | | mutable unsigned seqlen; | |
| | | | |
| void calculate_sequence_length() const; | | void calculate_sequence_length() const; | |
| | | | |
| unsigned get_char() const; | | unsigned get_char() const; | |
| | | | |
| skipping to change at line 127 | | skipping to change at line 127 | |
| */ | | */ | |
| Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); } | | Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); } | |
| | | | |
| /** Create an iterator which is at the end of its iteration. | | /** Create an iterator which is at the end of its iteration. | |
| * | | * | |
| * This can be compared to another iterator to check if the other iterator | | * This can be compared to another iterator to check if the other iterator | |
| * has reached its end. | | * has reached its end. | |
| */ | | */ | |
| Utf8Iterator() : p(NULL), end(0), seqlen(0) { } | | Utf8Iterator() : p(NULL), end(0), seqlen(0) { } | |
| | | | |
|
| /** Get the current unicode character value pointed to by the iterator. | | /** Get the current Unicode character value pointed to by the iterator. | |
| * | | * | |
| * Returns unsigned(-1) if the iterator has reached the end of its buffer. | | * Returns unsigned(-1) if the iterator has reached the end of its buffer. | |
| */ | | */ | |
| unsigned operator*() const; | | unsigned operator*() const; | |
| | | | |
|
| /** Move forward to the next unicode character. | | /** Move forward to the next Unicode character. | |
| * | | * | |
| * @return An iterator pointing to the position before the move. | | * @return An iterator pointing to the position before the move. | |
| */ | | */ | |
| Utf8Iterator operator++(int) { | | Utf8Iterator operator++(int) { | |
| // If we've not calculated seqlen yet, do so. | | // If we've not calculated seqlen yet, do so. | |
| if (seqlen == 0) calculate_sequence_length(); | | if (seqlen == 0) calculate_sequence_length(); | |
| const unsigned char *old_p = p; | | const unsigned char *old_p = p; | |
| unsigned old_seqlen = seqlen; | | unsigned old_seqlen = seqlen; | |
| p += seqlen; | | p += seqlen; | |
| if (p == end) p = NULL; | | if (p == end) p = NULL; | |
| seqlen = 0; | | seqlen = 0; | |
| return Utf8Iterator(old_p, end, old_seqlen); | | return Utf8Iterator(old_p, end, old_seqlen); | |
| } | | } | |
| | | | |
|
| /** Move forward to the next unicode character. | | /** Move forward to the next Unicode character. | |
| * | | * | |
| * @return A reference to this object. | | * @return A reference to this object. | |
| */ | | */ | |
| Utf8Iterator & operator++() { | | Utf8Iterator & operator++() { | |
| if (seqlen == 0) calculate_sequence_length(); | | if (seqlen == 0) calculate_sequence_length(); | |
| p += seqlen; | | p += seqlen; | |
| if (p == end) p = NULL; | | if (p == end) p = NULL; | |
| seqlen = 0; | | seqlen = 0; | |
| return *this; | | return *this; | |
| } | | } | |
| | | | |
| skipping to change at line 184 | | skipping to change at line 184 | |
| typedef std::input_iterator_tag iterator_category; | | typedef std::input_iterator_tag iterator_category; | |
| typedef unsigned value_type; | | typedef unsigned value_type; | |
| typedef size_t difference_type; | | typedef size_t difference_type; | |
| typedef const unsigned * pointer; | | typedef const unsigned * pointer; | |
| typedef const unsigned & reference; | | typedef const unsigned & reference; | |
| //@} | | //@} | |
| }; | | }; | |
| | | | |
| namespace Unicode { | | namespace Unicode { | |
| | | | |
|
| /** Each unicode character is in one of these categories. */ | | /** Each Unicode character is in exactly one of these categories. */ | |
| typedef enum { | | typedef enum { | |
| UNASSIGNED, | | UNASSIGNED, | |
| UPPERCASE_LETTER, | | UPPERCASE_LETTER, | |
| LOWERCASE_LETTER, | | LOWERCASE_LETTER, | |
| TITLECASE_LETTER, | | TITLECASE_LETTER, | |
| MODIFIER_LETTER, | | MODIFIER_LETTER, | |
| OTHER_LETTER, | | OTHER_LETTER, | |
| NON_SPACING_MARK, | | NON_SPACING_MARK, | |
| ENCLOSING_MARK, | | ENCLOSING_MARK, | |
| COMBINING_SPACING_MARK, | | COMBINING_SPACING_MARK, | |
| | | | |
| skipping to change at line 227 | | skipping to change at line 227 | |
| | | | |
| namespace Internal { | | namespace Internal { | |
| /** @internal Extract the information about a character from the Unicode | | /** @internal Extract the information about a character from the Unicode | |
| * character tables. | | * character tables. | |
| * | | * | |
| * ch must be a valid Unicode character value (i.e. < 0x110000) | | * ch must be a valid Unicode character value (i.e. < 0x110000) | |
| */ | | */ | |
| XAPIAN_VISIBILITY_DEFAULT | | XAPIAN_VISIBILITY_DEFAULT | |
| int get_character_info(unsigned ch); | | int get_character_info(unsigned ch); | |
| | | | |
|
| /** @internal Extract how to convert the case of a unicode character from | | /** @internal Extract how to convert the case of a Unicode character from | |
| * its info. | | * its info. | |
| */ | | */ | |
| inline int get_case_type(int info) { return ((info & 0xe0) >> 5); } | | inline int get_case_type(int info) { return ((info & 0xe0) >> 5); } | |
| | | | |
|
| /// @internal Extract the category of a unicode character from its info. | | /// @internal Extract the category of a Unicode character from its info. | |
| inline category get_category(int info) { return static_cast<category>(info & 0x1f); } | | inline category get_category(int info) { return static_cast<category>(info & 0x1f); } | |
| | | | |
| /** @internal Extract the delta to use for case conversion of a character | | /** @internal Extract the delta to use for case conversion of a character | |
| * from its info. | | * from its info. | |
| */ | | */ | |
| inline int get_delta(int info) { | | inline int get_delta(int info) { | |
| /* It's implementation defined if sign extension happens on right shift | | /* It's implementation defined if sign extension happens on right shift | |
| * of a signed int, hence the conditional (hopefully the compiler will | | * of a signed int, hence the conditional (hopefully the compiler will | |
| * spot this and optimise it to a sign-extending shift on architectures | | * spot this and optimise it to a sign-extending shift on architectures | |
| * with a suitable instruction). | | * with a suitable instruction). | |
| */ | | */ | |
| return (info >= 0) ? (info >> 15) : (~(~info >> 15)); | | return (info >= 0) ? (info >> 15) : (~(~info >> 15)); | |
| } | | } | |
| } | | } | |
| | | | |
|
| /** Convert a single non-ASCII unicode character to UTF-8. | | /** Convert a single non-ASCII Unicode character to UTF-8. | |
| * | | * | |
| * This is intended mainly as a helper method for to_utf8(). | | * This is intended mainly as a helper method for to_utf8(). | |
| * | | * | |
| * The character @a ch (which must be > 128) is written to the buffer @a buf | | * The character @a ch (which must be > 128) is written to the buffer @a buf | |
| * and the length of the resultant UTF-8 character is returned. | | * and the length of the resultant UTF-8 character is returned. | |
| * | | * | |
| * NB buf must have space for (at least) 4 bytes. | | * NB buf must have space for (at least) 4 bytes. | |
| */ | | */ | |
| XAPIAN_VISIBILITY_DEFAULT | | XAPIAN_VISIBILITY_DEFAULT | |
| unsigned nonascii_to_utf8(unsigned ch, char * buf); | | unsigned nonascii_to_utf8(unsigned ch, char * buf); | |
| | | | |
|
| /** Convert a single unicode character to UTF-8. | | /** Convert a single Unicode character to UTF-8. | |
| * | | * | |
| * The character @a ch is written to the buffer @a buf and the length of the | | * The character @a ch is written to the buffer @a buf and the length of the | |
| * resultant UTF-8 character is returned. | | * resultant UTF-8 character is returned. | |
| * | | * | |
| * NB buf must have space for (at least) 4 bytes. | | * NB buf must have space for (at least) 4 bytes. | |
| */ | | */ | |
| inline unsigned to_utf8(unsigned ch, char *buf) { | | inline unsigned to_utf8(unsigned ch, char *buf) { | |
| if (ch < 128) { | | if (ch < 128) { | |
| *buf = static_cast<unsigned char>(ch); | | *buf = static_cast<unsigned char>(ch); | |
| return 1; | | return 1; | |
| } | | } | |
| return Xapian::Unicode::nonascii_to_utf8(ch, buf); | | return Xapian::Unicode::nonascii_to_utf8(ch, buf); | |
| } | | } | |
| | | | |
|
| /** Append the UTF-8 representation of a single unicode character to a | | /** Append the UTF-8 representation of a single Unicode character to a | |
| * std::string. | | * std::string. | |
| */ | | */ | |
| inline void append_utf8(std::string &s, unsigned ch) { | | inline void append_utf8(std::string &s, unsigned ch) { | |
| char buf[4]; | | char buf[4]; | |
| s.append(buf, to_utf8(ch, buf)); | | s.append(buf, to_utf8(ch, buf)); | |
| } | | } | |
| | | | |
|
| /// Return the category which a given unicode character falls into. | | /// Return the category which a given Unicode character falls into. | |
| inline category get_category(unsigned ch) { | | inline category get_category(unsigned ch) { | |
| // Categorise non-Unicode values as UNASSIGNED. | | // Categorise non-Unicode values as UNASSIGNED. | |
| if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED; | | if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED; | |
| return Internal::get_category(Internal::get_character_info(ch)); | | return Internal::get_category(Internal::get_character_info(ch)); | |
| } | | } | |
| | | | |
|
| /// Test is a given unicode character is a letter or number. | | /// Test if a given Unicode character is "word character". | |
| inline bool is_wordchar(unsigned ch) { | | inline bool is_wordchar(unsigned ch) { | |
| const unsigned int WORDCHAR_MASK = | | const unsigned int WORDCHAR_MASK = | |
| (1 << Xapian::Unicode::UPPERCASE_LETTER) | | | (1 << Xapian::Unicode::UPPERCASE_LETTER) | | |
| (1 << Xapian::Unicode::LOWERCASE_LETTER) | | | (1 << Xapian::Unicode::LOWERCASE_LETTER) | | |
| (1 << Xapian::Unicode::TITLECASE_LETTER) | | | (1 << Xapian::Unicode::TITLECASE_LETTER) | | |
| (1 << Xapian::Unicode::MODIFIER_LETTER) | | | (1 << Xapian::Unicode::MODIFIER_LETTER) | | |
| (1 << Xapian::Unicode::OTHER_LETTER) | | | (1 << Xapian::Unicode::OTHER_LETTER) | | |
| (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) | | | (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) | | |
| (1 << Xapian::Unicode::LETTER_NUMBER) | | | (1 << Xapian::Unicode::LETTER_NUMBER) | | |
| (1 << Xapian::Unicode::OTHER_NUMBER) | | | (1 << Xapian::Unicode::OTHER_NUMBER) | | |
| (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION); | | (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION); | |
| return ((WORDCHAR_MASK >> get_category(ch)) & 1); | | return ((WORDCHAR_MASK >> get_category(ch)) & 1); | |
| } | | } | |
| | | | |
|
| /// Test is a given unicode character is a whitespace character. | | /// Test if a given Unicode character is a whitespace character. | |
| inline bool is_whitespace(unsigned ch) { | | inline bool is_whitespace(unsigned ch) { | |
| const unsigned int WHITESPACE_MASK = | | const unsigned int WHITESPACE_MASK = | |
| (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF. | | (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF. | |
| (1 << Xapian::Unicode::SPACE_SEPARATOR) | | | (1 << Xapian::Unicode::SPACE_SEPARATOR) | | |
| (1 << Xapian::Unicode::LINE_SEPARATOR) | | | (1 << Xapian::Unicode::LINE_SEPARATOR) | | |
| (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR); | | (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR); | |
| return ((WHITESPACE_MASK >> get_category(ch)) & 1); | | return ((WHITESPACE_MASK >> get_category(ch)) & 1); | |
| } | | } | |
| | | | |
|
| /// Test is a given unicode character is a currency symbol. | | /// Test if a given Unicode character is a currency symbol. | |
| inline bool is_currency(unsigned ch) { | | inline bool is_currency(unsigned ch) { | |
| return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL); | | return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL); | |
| } | | } | |
| | | | |
|
| /// Convert a unicode character to lowercase. | | /// Convert a Unicode character to lowercase. | |
| inline unsigned tolower(unsigned ch) { | | inline unsigned tolower(unsigned ch) { | |
| int info; | | int info; | |
| // Leave non-Unicode values unchanged. | | // Leave non-Unicode values unchanged. | |
| if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2)) | | if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2)) | |
| return ch; | | return ch; | |
| return ch + Internal::get_delta(info); | | return ch + Internal::get_delta(info); | |
| } | | } | |
| | | | |
|
| /// Convert a unicode character to uppercase. | | /// Convert a Unicode character to uppercase. | |
| inline unsigned toupper(unsigned ch) { | | inline unsigned toupper(unsigned ch) { | |
| int info; | | int info; | |
| // Leave non-Unicode values unchanged. | | // Leave non-Unicode values unchanged. | |
| if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4)) | | if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4)) | |
| return ch; | | return ch; | |
| return ch - Internal::get_delta(info); | | return ch - Internal::get_delta(info); | |
| } | | } | |
| | | | |
| /// Convert a UTF-8 std::string to lowercase. | | /// Convert a UTF-8 std::string to lowercase. | |
| inline std::string | | inline std::string | |
| | | | |
End of changes. 17 change blocks. |
| 17 lines changed or deleted | | 17 lines changed or added | |
|