common.h   common.h 
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#ifndef _COMMON_H_ #ifndef _COMMON_H_
#define _COMMON_H_ #define _COMMON_H_
/** /**
* common.h * common.h
* *
* Copyright (C) 2003 WiseGuys Internet B.V. * Copyright (C) 2003 WiseGuys Internet B.V.
* *
* THE BSD LICENSE * THE BSD LICENSE
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
skipping to change at line 41 skipping to change at line 42
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <stdio.h> #include <stdio.h>
#include <time.h> #include <time.h>
#include <stdlib.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
#include <winsock2.h> typedef __int8 int8_t;
typedef unsigned __int8 uint8_t;
typedef __int8 int8_t; typedef __int16 int16_t;
typedef unsigned __int8 uint8_t; typedef unsigned __int16 uint16_t;
typedef __int16 int16_t; typedef __int32 int32_t;
typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t;
typedef __int32 int32_t; typedef __int64 int64_t;
typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#else #else
#include <sys/time.h> # include <stdint.h>
#include <stdint.h>
#endif #endif
typedef uint32_t uint4; typedef uint32_t uint4;
typedef uint16_t uint2; typedef uint16_t uint2;
typedef uint8_t uchar; typedef uint8_t uchar;
typedef int32_t sint4; typedef int32_t sint4;
typedef int16_t sint2; typedef int16_t sint2;
typedef int8_t schar; typedef int8_t schar;
typedef int8_t boole; typedef int8_t boole;
typedef struct wgtimer_s { extern void *wg_zalloc(size_t size);
struct timeval start;
struct timeval stop; extern char *wg_getline(char *line, int size, FILE * fp);
} wgtimer_t;
extern unsigned int wg_split(char **result, char *dest, char *src,
extern void *wg_malloc( size_t size ); int maxsegments);
extern void *wg_calloc( size_t nmemb, size_t size ); extern char *wg_strgmov(char *dest, const char *src,
extern void *wg_zalloc( size_t size ); const char *destlimit);
extern char* wg_strdup( const char *s ); extern char *wg_trim(char *dest, const char *src);
extern void* wg_realloc( void *ptr, size_t size ) ;
extern void wg_free( void *mem );
extern char *wg_getline( char *line, int size, FILE *fp );
extern void wg_timerstart(wgtimer_t *t);
extern uint4 wg_timerstop(wgtimer_t *t);
extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
extern char *wg_trim( char *dest, const char *src );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 7 change blocks. 
46 lines changed or deleted 32 lines changed or added


 constants.h   constants.h 
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#ifndef _CONSTANTS_H_ #ifndef _CONSTANTS_H_
#define _CONSTANTS_H_ #define _CONSTANTS_H_
/* /*
* constants.h -- some constants used throughout the code. Not pretty, * constants.h -- some constants used throughout the code. Not pretty,
* but certainly convenient. * but certainly convenient.
* *
* Copyright (C) 2003 WiseGuys Internet B.V. * Copyright (C) 2003 WiseGuys Internet B.V.
* *
* THE BSD LICENSE * THE BSD LICENSE
skipping to change at line 42 skipping to change at line 43
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <limits.h> #include <limits.h>
#define _UTF8_ /* Reported matches are those fingerprints with a score less than best score *
#define DESCRIPTION "out of place" THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means matches must score
within 3% from the best score.) */
/* Reported matches are those fingerprints with a score less than best
* score * THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means matches
* must score within 3% from the best score.)
*/
#define THRESHOLDVALUE 1.03 #define THRESHOLDVALUE 1.03
/* If more than MAXCANDIDATES matches are found, the classifier reports /* If more than MAXCANDIDATES matches are found, the classifier reports
* unknown, because the input is obviously confusing. unknown, because the input is obviously confusing. */
*/
#define MAXCANDIDATES 5 #define MAXCANDIDATES 5
/* The size of the buffer used to report the classification. /* The size of the buffer used to report the classification. */
*/
#define MAXOUTPUTSIZE 1024 #define MAXOUTPUTSIZE 1024
/* Maximum number of n-grams in a fingerprint */ /* Maximum number of n-grams in a fingerprint */
#define MAXNGRAMS 400 #define MAXNGRAMS 400
/* Maximum number of character of an n-gram? */ /* Maximum number of character of an n-gram? */
#define MAXNGRAMSYMBOL 5 #define MAXNGRAMSYMBOL 5
/* Maximum size of the string representing an n-gram (must be greater than /* Maximum size of the string representing an n-gram (must be greater than
number of symbol) */ number of symbol) */
#ifdef _UTF8_
#define MAXNGRAMSIZE 20 #define MAXNGRAMSIZE 20
#else
#define MAXNGRAMSIZE MAXNGRAMSYMBOL
#endif
/* Which characters are not acceptable in n-grams? */ /* Which characters are not acceptable in n-grams? */
#define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) #define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c))
/* Minimum size (in characters) for accepting a document */ /* Minimum size (in characters) for accepting a document */
#define MINDOCSIZE 6 #define MINDOCSIZE 1
/* Maximum penalty for missing an n-gram in fingerprint */ /* Maximum penalty for missing an n-gram in fingerprint */
#define MAXOUTOFPLACE 400 #define MAXOUTOFPLACE 400
/* Size of hash table is 2^TABLEPOW. */ /* Size of hash table is 2^TABLEPOW. */
#define TABLEPOW 13 #define TABLEPOW 13
#define MAXSCORE INT_MAX #define MAXSCORE INT_MAX
/* where the fingerprints files are stored */ /* where the fingerprints files are stored */
#define DEFAULT_FINGERPRINTS_PATH "" #define DEFAULT_FINGERPRINTS_PATH ""
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 8 change blocks. 
19 lines changed or deleted 10 lines changed or added


 fingerprint.h   fingerprint.h 
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#ifndef _FINGERPRINT_H_ #ifndef _FINGERPRINT_H_
#define _FINGERPRINT_H_ #define _FINGERPRINT_H_
/* /*
* Copyright (C) 2003 WiseGuys Internet B.V. * Copyright (C) 2003 WiseGuys Internet B.V.
* *
* THE BSD LICENSE * THE BSD LICENSE
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
skipping to change at line 39 skipping to change at line 40
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "common.h" #include "common.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern void *fp_Init(const char *name); extern void *fp_Init(const char *name);
extern void fp_Done( void *handle ); extern void fp_Done(void *handle);
extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ); extern int fp_Create(void *handle, const char *buffer, uint4 bufsize,
uint4 maxngrams);
extern int fp_Read( void *handle, const char *fname, int maxngrams ); extern int fp_Read(void *handle, const char *fname, int maxngrams);
extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); extern sint4 fp_Compare(void *cat, void *unknown, int cutoff);
extern void fp_Show( void *handle ); extern void fp_Show(void *handle);
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern const char *fp_Name( void *handle ); extern const char *fp_Name(void *handle);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
extern void fp_Print( void *handle, FILE *fp ); extern void fp_Print(void *handle, FILE * fp);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 8 change blocks. 
12 lines changed or deleted 15 lines changed or added


 textcat.h   textcat.h 
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#ifndef _TEXTCAT_H_ #ifndef _TEXTCAT_H_
#define _TEXTCAT_H_ #define _TEXTCAT_H_
/* /*
* textcat.h -- routines for categorizing text * textcat.h -- routines for categorizing text
* *
* Copyright (C) 2003 WiseGuys Internet B.V. * Copyright (C) 2003 WiseGuys Internet B.V.
* *
* THE BSD LICENSE * THE BSD LICENSE
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
skipping to change at line 38 skipping to change at line 39
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <stdio.h> #include "exttextcat-version.h"
#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
#define _TEXTCAT_RESULT_SHORT "SHORT" #define _TEXTCAT_RESULT_SHORT "SHORT"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
/** /**
* textcat_Init() - Initialize the text classifier. The textfile * textcat_Init() - Initialize the text classifier. The textfile
* conffile should contain a list of fingerprint filenames and * conffile should contain a list of fingerprint filenames and
* identification strings for the categories. The filenames should be * identification strings for the categories. The filenames should be
* reachable from the current working directory. The identification * reachable from the current working directory. The identification
* strings are used in the classification output. * strings are used in the classification output.
* *
* Returns: handle on success, NULL on error. (At the moment, the * Returns: handle on success, NULL on error. (At the moment, the
* only way errors can occur, is when the library cannot read the * only way errors can occur, is when the library cannot read the
* conffile, or one of the fingerprint files listed in it.) * conffile, or one of the fingerprint files listed in it.)
* *
* Replaces the older function (and has exactly the same behaviour) * Replaces the older function (and has exactly the same behaviour)
* see below * see below
*/ */
extern void *textcat_Init( const char *conffile ); extern void *textcat_Init(const char *conffile);
/** /**
* Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB. * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB.
* Basically, prefix is the directory path where fingerprints are stored. * Basically, prefix is the directory path where fingerprints are stored.
*/ */
extern void *special_textcat_Init( const char *conffile, const char *prefix ); extern void *special_textcat_Init(const char *conffile,
const char *prefix);
/** /**
* textcat_Done() - Free up resources for handle * textcat_Done() - Free up resources for handle
*/ */
extern void textcat_Done( void *handle ); extern void textcat_Done(void *handle);
/** /**
* textcat_Classify() - Give the most likely categories for buffer * textcat_Classify() - Give the most likely categories for buffer
* with length size. * with length size.
* *
* Returns: string containing a list of category id's, each one * Returns: string containing a list of category id's, each one
* between square brackets, "UNKNOWN" when not recognized, "SHORT" if the * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
* document was too short to make a reliable assessment. * document was too short to make a reliable assessment.
* *
* Performance note: longer buffers take longer to process. However, * Performance note: longer buffers take longer to process. However,
* for many uses it is not necessary to categorize the whole buffer. * for many uses it is not necessary to categorize the whole buffer.
* For language classification, a few hundred bytes will suffice. * For language classification, a few hundred bytes will suffice.
*/ */
extern char *textcat_Classify( void *handle, const char *buffer, size_t size ); extern char *textcat_Classify(void *handle, const char *buffer,
size_t size);
/** /**
* textcat_Version() - Returns a string describing the version of this classifier. * textcat_Version() - Returns a string describing the version of this classifier.
*/ */
extern char *textcat_Version(); extern const char *textcat_Version(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 9 change blocks. 
9 lines changed or deleted 11 lines changed or added


 utf8misc.h   utf8misc.h 
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/************************************************************************** * /************************************************************************** *
* Copyright (C) 2006 by Jocelyn Merand * * Copyright (C) 2006 by Jocelyn Merand *
* joc.mer@gmail.com * * joc.mer@gmail.com *
* * * *
* THE BSD LICENSE * THE BSD LICENSE
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* *
skipping to change at line 43 skipping to change at line 44
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
************************************************************************** */ ************************************************************************** */
#ifndef _UTF8_MISC_H_ #ifndef _UTF8_MISC_H_
#define _UTF8_MISC_H_ #define _UTF8_MISC_H_
/** /**
* These variables are used in character processing functions * These variables are used in character processing functions
* These have been added to manage utf-8 symbols, particularly escape chars * These have been added to manage utf-8 symbols, particularly escape chars
*/ */
#ifdef _UTF8_
#define ESCAPE_MASK 0x80 #define ESCAPE_MASK 0x80
#define WEIGHT_MASK 0xF0 #define WEIGHT_MASK 0xF0
#else
#define ESCAPE_MASK 0xFF #ifdef __cplusplus
#define WEIGHT_MASK 0x00 extern "C"
{
#endif #endif
/* /*
* Is used to jump to the next start of char * Is used to jump to the next start of char
* of course it's only useful when encoding is utf-8 * of course it's only useful when encoding is utf-8
* This function has been added by Jocelyn Merand to use libtextcat in OOo * This function has been added by Jocelyn Merand to use libtextcat in OOo
*/ */
int nextcharstart(const char *str, int position); const char* utf8_next_char(const char *str);
/*Copy the char in str to dest /* Copy the char in str to dest of course it's only useful when encoding is
* of course it's only useful when encoding is utf8 and the symbol is encoded with more than 1 char utf8 and the symbol is encoded with more than 1 char return the number of
* return the number of char jumped char jumped This function has been added by Jocelyn Merand to use
* This function has been added by Jocelyn Merand to use libtextcat in OOo libtextcat in OOo */
*/
int charcopy(const char *str, char *dest); int charcopy(const char *str, char *dest);
/* checks if n-gram lex is a prefix of key and of length len /* checks if n-gram lex is a prefix of key and of length len
* if _UTF8_ is defined, it uses escape characters and len is not really the length of lex * len is the number of unicode code points
* in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 * strlen("€") == 3 but len == 1
*/ */
int issame( char *lex, char *key, int len ); int issame(char *lex, char *key, int len);
/* Counts the number of characters /*
* if _UTF8_ is defined, it uses escape characters and the result is not really the length of str * len is the number of unicode code points
* in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 * strlen("€") == 3 but len == 1
*/ */
extern int utfstrlen(const char *str);
#ifdef __cplusplus
extern "C" {
#endif
extern int utfstrlen(const char* str);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 7 change blocks. 
28 lines changed or deleted 23 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/