| common.h | | common.h | |
| | | | |
| skipping to change at line 41 | | skipping to change at line 41 | |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| */ | | */ | |
| | | | |
| #include <stdio.h> | | #include <stdio.h> | |
|
| #include <time.h> | | | |
| #include <stdlib.h> | | #include <stdlib.h> | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" | | extern "C" | |
| { | | { | |
| #endif | | #endif | |
| | | | |
| #ifdef _MSC_VER | | #ifdef _MSC_VER | |
| typedef __int8 int8_t; | | typedef __int8 int8_t; | |
| typedef unsigned __int8 uint8_t; | | typedef unsigned __int8 uint8_t; | |
| | | | |
| skipping to change at line 72 | | skipping to change at line 71 | |
| typedef uint32_t uint4; | | typedef uint32_t uint4; | |
| typedef uint16_t uint2; | | typedef uint16_t uint2; | |
| typedef uint8_t uchar; | | typedef uint8_t uchar; | |
| | | | |
| typedef int32_t sint4; | | typedef int32_t sint4; | |
| typedef int16_t sint2; | | typedef int16_t sint2; | |
| typedef int8_t schar; | | typedef int8_t schar; | |
| | | | |
| typedef int8_t boole; | | typedef int8_t boole; | |
| | | | |
|
| extern void *wg_zalloc(size_t size); | | | |
| | | | |
| extern char *wg_getline(char *line, int size, FILE * fp); | | extern char *wg_getline(char *line, int size, FILE * fp); | |
| | | | |
| extern unsigned int wg_split(char **result, char *dest, char *src, | | extern unsigned int wg_split(char **result, char *dest, char *src, | |
| int maxsegments); | | int maxsegments); | |
| extern char *wg_strgmov(char *dest, const char *src, | | extern char *wg_strgmov(char *dest, const char *src, | |
| const char *destlimit); | | const char *destlimit); | |
| extern char *wg_trim(char *dest, const char *src); | | extern char *wg_trim(char *dest, const char *src); | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| | | | |
End of changes. 2 change blocks. |
| 3 lines changed or deleted | | 0 lines changed or added | |
|
| fingerprint.h | | fingerprint.h | |
| | | | |
| skipping to change at line 38 | | skipping to change at line 38 | |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| */ | | */ | |
| #include "common.h" | | #include "common.h" | |
|
| | | #include "textcat_properties.h" | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" | | extern "C" | |
| { | | { | |
| #endif | | #endif | |
| | | | |
| extern void *fp_Init(const char *name); | | extern void *fp_Init(const char *name); | |
| extern void fp_Done(void *handle); | | extern void fp_Done(void *handle); | |
| extern int fp_Create(void *handle, const char *buffer, uint4 bufsize, | | extern int fp_Create(void *handle, const char *buffer, uint4 bufsize, | |
| uint4 maxngrams); | | uint4 maxngrams); | |
|
| | | extern int fp_SetProperty(void *handle, textcat_Property property, | |
| | | sint4 value); | |
| extern int fp_Read(void *handle, const char *fname, int maxngrams); | | extern int fp_Read(void *handle, const char *fname, int maxngrams); | |
| extern sint4 fp_Compare(void *cat, void *unknown, int cutoff); | | extern sint4 fp_Compare(void *cat, void *unknown, int cutoff); | |
| extern void fp_Show(void *handle); | | extern void fp_Show(void *handle); | |
|
| #ifdef __cplusplus | | extern const char *fp_Name(void *handle); | |
| extern "C" | | | |
| { | | | |
| #endif | | | |
| extern const char *fp_Name(void *handle); | | | |
| #ifdef __cplusplus | | | |
| } | | | |
| #endif | | | |
| extern void fp_Print(void *handle, FILE * fp); | | extern void fp_Print(void *handle, FILE * fp); | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| #endif | | #endif | |
| | | | |
| #endif | | #endif | |
| | | | |
| /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | |
| | | | |
End of changes. 3 change blocks. |
| 8 lines changed or deleted | | 4 lines changed or added | |
|
| textcat.h | | textcat.h | |
| | | | |
| skipping to change at line 40 | | skipping to change at line 40 | |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| */ | | */ | |
| #include "exttextcat-version.h" | | #include "exttextcat-version.h" | |
|
| | | #include "common.h" | |
| | | #include "textcat_properties.h" | |
| | | | |
| #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" | | #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" | |
| #define _TEXTCAT_RESULT_SHORT "SHORT" | | #define _TEXTCAT_RESULT_SHORT "SHORT" | |
|
| | | #define TEXTCAT_RESULT_UNKOWN 0 | |
| | | #define TEXTCAT_RESULT_SHORT -2 | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" | | extern "C" | |
| { | | { | |
| #endif | | #endif | |
| | | | |
|
| /** | | typedef struct | |
| * textcat_Init() - Initialize the text classifier. The textfile | | { | |
| * conffile should contain a list of fingerprint filenames and | | int score; | |
| * identification strings for the categories. The filenames should be | | const char *name; | |
| * reachable from the current working directory. The identification | | } candidate_t; | |
| * strings will be used in the classification output. | | | |
| * | | /** | |
| * Returns: handle on success, NULL on error. (At the moment, the | | * textcat_Init() - Initialize the text classifier. The textfile | |
| * only way errors can occur, is when the library cannot read the | | * conffile should contain a list of fingerprint filenames and | |
| * conffile, or one of the fingerprint files listed in it.) | | * identification strings for the categories. The filenames should be | |
| * | | * reachable from the current working directory. The identification | |
| * Replace older function (and has exactly the same behaviour) | | * strings will be used in the classification output. | |
| * see below | | * | |
| */ | | * Returns: handle on success, NULL on error. (At the moment, the | |
| | | * only way errors can occur, is when the library cannot read the | |
| | | * conffile, or one of the fingerprint files listed in it.) | |
| | | * | |
| | | * Replace older function (and has exactly the same behaviour) | |
| | | * see below | |
| | | */ | |
| extern void *textcat_Init(const char *conffile); | | extern void *textcat_Init(const char *conffile); | |
| | | | |
|
| /** | | /** | |
| * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB. | | * special_textcat_Init() - Initialize the text classifier. This function | |
| * Basically, prefix is the directory path where fingerprints are stored. | | * prepares the classifier as needed by OpenOffice.org. The textfile | |
| */ | | * conffile should contain a list of utf8 fingerprint filenames and | |
| | | * identification strings for the categories. prefix will be | |
| | | * prepended to the filenames to locate the files. The identification | |
| | | * strings will be used in the classification output. | |
| | | * | |
| | | * Returns: handle on success, NULL on error. (At the moment, the | |
| | | * only way errors can occur, is when the library cannot read the | |
| | | * conffile, or one of the fingerprint files listed in it.) | |
| | | */ | |
| extern void *special_textcat_Init(const char *conffile, | | extern void *special_textcat_Init(const char *conffile, | |
| const char *prefix); | | const char *prefix); | |
| | | | |
|
| /** | | extern int textcat_SetProperty(void *handle, textcat_Property property, | |
| * textcat_Done() - Free up resources for handle | | sint4 value); | |
| */ | | | |
| | | /** | |
| | | * textcat_Done() - Free up resources for handle | |
| | | */ | |
| extern void textcat_Done(void *handle); | | extern void textcat_Done(void *handle); | |
| | | | |
|
| /** | | /** | |
| * textcat_Classify() - Give the most likely categories for buffer | | * textcat_Classify() - Give the most likely categories for buffer | |
| * with length size. | | * with length size. | |
| * | | * | |
| * Returns: string containing a list of category id's, each one | | * Returns: string containing a list of category id's, each one | |
| * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the | | * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the | |
| * document was too short to make a reliable assessment. | | | |
| * | | * document was too short to make a reliable assessment. | |
| * Performance note: longer buffers take longer to process. However, | | * | |
| * for many uses it is not necessary to categorize the whole buffer. | | * Performace note: longer buffers take longer to process. However, | |
| * For language classification, a few hundred bytes will suffice. | | * Performance note: longer buffers take longer to process. However, | |
| */ | | * For language classification, a few hundred bytes will suffice. | |
| | | */ | |
| extern char *textcat_Classify(void *handle, const char *buffer, | | extern char *textcat_Classify(void *handle, const char *buffer, | |
| size_t size); | | size_t size); | |
| | | | |
|
| /** | | /** | |
| * textcat_Version() - Returns a string describing the version of this classifier. | | * textcat_GetClassifyFullOutput() - Create a classifier output handler | |
| | | */ | |
| */ | | extern candidate_t *textcat_GetClassifyFullOutput(void *handle); | |
| | | | |
| | | /** | |
| | | * textcat_ReleaseClassifyFullOutput() - Free up resources for the | |
| | | * classifier output handler | |
| | | */ | |
| | | extern void textcat_ReleaseClassifyFullOutput(void *handle, | |
| | | candidate_t * candidates) | |
| | | ; | |
| | | | |
| | | /** | |
| | | * textcat_ClassifyFull() - Give the most likely categories for buffer | |
| | | * with length size. | |
| | | * | |
| | | * Returns: the number of results. | |
| | | * | |
| | | * Performance note: longer buffers take longer to process. However, | |
| | | * for many uses it is not necessary to categorize the whole buffer. | |
| | | * For language classification, a few hundred bytes will suffice. | |
| | | */ | |
| | | extern int textcat_ClassifyFull(void *handle, const char *buffer, | |
| | | size_t size, candidate_t * candidates); | |
| | | | |
| | | /** | |
| | | * textcat_Version() - Returns a string describing the version of this | |
| | | * classifier. | |
| | | */ | |
| extern const char *textcat_Version(void); | | extern const char *textcat_Version(void); | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| #endif | | #endif | |
| #endif | | #endif | |
| | | | |
| /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | |
| | | | |
End of changes. 7 change blocks. |
| 38 lines changed or deleted | | 86 lines changed or added | |
|
| utf8misc.h | | utf8misc.h | |
| | | | |
| skipping to change at line 40 | | skipping to change at line 40 | |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| **************************************************************************
*/ | | **************************************************************************
*/ | |
| | | | |
| #ifndef _UTF8_MISC_H_ | | #ifndef _UTF8_MISC_H_ | |
| #define _UTF8_MISC_H_ | | #define _UTF8_MISC_H_ | |
| | | | |
|
| /** | | | |
| * These variables are used in character processing functions | | | |
| * These have been added to manage utf-8 symbols, particularly escape chars | | | |
| */ | | | |
| #define ESCAPE_MASK 0x80 | | | |
| #define WEIGHT_MASK 0xF0 | | | |
| | | | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" | | extern "C" | |
| { | | { | |
| #endif | | #endif | |
| | | | |
|
| /* | | /* | |
| * Is used to jump to the next start of char | | * Is used to jump to the next start of char | |
| * of course it's only useful when encoding is utf-8 | | * of course it's only useful when encoding is utf-8 | |
| * This function has been added by Jocelyn Merand to use libtextcat in OOo | | * This function has been added by Jocelyn Merand to use libtextcat in OOo | |
| */ | | | |
| const char* utf8_next_char(const char *str); | | */ | |
| | | const char *utf8_next_char(const char *str); | |
| /* Copy the char in str to dest of course it's only useful when encoding is | | /* | |
| utf8 and the symbol is encoded with more than 1 char return the number of | | * Copy the char in str to dest of course it's only useful when encoding | |
| char jumped This function has been added by Jocelyn Merand to use | | * is utf8 and the symbol is encoded with more than 1 char return the | |
| libtextcat in OOo */ | | * number of char jumped This function has been added by Jocelyn Merand to | |
| int charcopy(const char *str, char *dest); | | * use libtextcat in OOo | |
| /* checks if n-gram lex is a prefix of key and of length len | | */ | |
| * len is the number of unicode code points | | int utf8_charcopy(const char *str, char *dest); | |
| * strlen("€") == 3 but len == 1 | | | |
| */ | | /* | |
| int issame(char *lex, char *key, int len); | | * checks if n-gram lex is a prefix of key and of length len len is the | |
| | | * number of unicode code points strlen("€") == 3 but len == 1 | |
| /* | | */ | |
| * len is the number of unicode code points | | int utf8_issame(char *lex, char *key, int len); | |
| * strlen("€") == 3 but len == 1 | | | |
| */ | | /* | |
| extern int utfstrlen(const char *str); | | * len is the number of unicode code points | |
| | | * strlen("€") == 3 but len == 1 | |
| | | */ | |
| | | extern int utf8_strlen(const char *str); | |
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| #endif | | #endif | |
| | | | |
| #endif | | #endif | |
| | | | |
| /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ | |
| | | | |
End of changes. 2 change blocks. |
| 34 lines changed or deleted | | 29 lines changed or added | |
|