common.h   common.h 
skipping to change at line 41 skipping to change at line 41
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <stdio.h> #include <stdio.h>
#include <time.h>
#include <stdlib.h> #include <stdlib.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" extern "C"
{ {
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
typedef __int8 int8_t; typedef __int8 int8_t;
typedef unsigned __int8 uint8_t; typedef unsigned __int8 uint8_t;
skipping to change at line 72 skipping to change at line 71
typedef uint32_t uint4; typedef uint32_t uint4;
typedef uint16_t uint2; typedef uint16_t uint2;
typedef uint8_t uchar; typedef uint8_t uchar;
typedef int32_t sint4; typedef int32_t sint4;
typedef int16_t sint2; typedef int16_t sint2;
typedef int8_t schar; typedef int8_t schar;
typedef int8_t boole; typedef int8_t boole;
extern void *wg_zalloc(size_t size);
extern char *wg_getline(char *line, int size, FILE * fp); extern char *wg_getline(char *line, int size, FILE * fp);
extern unsigned int wg_split(char **result, char *dest, char *src, extern unsigned int wg_split(char **result, char *dest, char *src,
int maxsegments); int maxsegments);
extern char *wg_strgmov(char *dest, const char *src, extern char *wg_strgmov(char *dest, const char *src,
const char *destlimit); const char *destlimit);
extern char *wg_trim(char *dest, const char *src); extern char *wg_trim(char *dest, const char *src);
#ifdef __cplusplus #ifdef __cplusplus
} }
 End of changes. 2 change blocks. 
3 lines changed or deleted 0 lines changed or added


 constants.h   constants.h 
skipping to change at line 63 skipping to change at line 63
#define MAXOUTPUTSIZE 1024 #define MAXOUTPUTSIZE 1024
/* Maximum number of n-grams in a fingerprint */ /* Maximum number of n-grams in a fingerprint */
#define MAXNGRAMS 400 #define MAXNGRAMS 400
/* Maximum number of character of an n-gram? */ /* Maximum number of character of an n-gram? */
#define MAXNGRAMSYMBOL 5 #define MAXNGRAMSYMBOL 5
/* Maximum size of the string representing an n-gram (must be greater than /* Maximum size of the string representing an n-gram (must be greater than
number of symbol) */ number of symbol) */
#define MAXNGRAMSIZE 20 #define MAXNGRAMSIZE (MAXNGRAMSYMBOL*4)
/* Which characters are not acceptable in n-grams? */ /* Which characters are not acceptable in n-grams? */
#define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) #define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c))
/* Minimum size (in characters) for accepting a document */ /* Minimum size (in characters) for accepting a document */
#define MINDOCSIZE 1 #define MINDOCSIZE 1
/* Maximum penalty for missing an n-gram in fingerprint */ /* Maximum penalty for missing an n-gram in fingerprint */
#define MAXOUTOFPLACE 400 #define MAXOUTOFPLACE 400
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 exttextcat-version.h   exttextcat-version.h 
#ifndef EXTTEXTCAT_VERSION_H #ifndef EXTTEXTCAT_VERSION_H
#define EXTTEXTCAT_VERSION_H #define EXTTEXTCAT_VERSION_H
#define EXTTEXTCAT_VERSION "3.2.0" #define EXTTEXTCAT_VERSION "3.3.0"
#define EXTTEXTCAT_VERSION_MAJOR 3 #define EXTTEXTCAT_VERSION_MAJOR 3
#define EXTTEXTCAT_VERSION_MINOR 2 #define EXTTEXTCAT_VERSION_MINOR 3
#define EXTTEXTCAT_VERSION_MICRO 0 #define EXTTEXTCAT_VERSION_MICRO 0
#endif #endif
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 fingerprint.h   fingerprint.h 
skipping to change at line 38 skipping to change at line 38
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "common.h" #include "common.h"
#include "textcat_properties.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" extern "C"
{ {
#endif #endif
extern void *fp_Init(const char *name); extern void *fp_Init(const char *name);
extern void fp_Done(void *handle); extern void fp_Done(void *handle);
extern int fp_Create(void *handle, const char *buffer, uint4 bufsize, extern int fp_Create(void *handle, const char *buffer, uint4 bufsize,
uint4 maxngrams); uint4 maxngrams);
extern int fp_SetProperty(void *handle, textcat_Property property,
sint4 value);
extern int fp_Read(void *handle, const char *fname, int maxngrams); extern int fp_Read(void *handle, const char *fname, int maxngrams);
extern sint4 fp_Compare(void *cat, void *unknown, int cutoff); extern sint4 fp_Compare(void *cat, void *unknown, int cutoff);
extern void fp_Show(void *handle); extern void fp_Show(void *handle);
#ifdef __cplusplus extern const char *fp_Name(void *handle);
extern "C"
{
#endif
extern const char *fp_Name(void *handle);
#ifdef __cplusplus
}
#endif
extern void fp_Print(void *handle, FILE * fp); extern void fp_Print(void *handle, FILE * fp);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 3 change blocks. 
8 lines changed or deleted 4 lines changed or added


 textcat.h   textcat.h 
skipping to change at line 40 skipping to change at line 40
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "exttextcat-version.h" #include "exttextcat-version.h"
#include "common.h"
#include "textcat_properties.h"
#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
#define _TEXTCAT_RESULT_SHORT "SHORT" #define _TEXTCAT_RESULT_SHORT "SHORT"
#define TEXTCAT_RESULT_UNKOWN 0
#define TEXTCAT_RESULT_SHORT -2
#ifdef __cplusplus #ifdef __cplusplus
extern "C" extern "C"
{ {
#endif #endif
/** typedef struct
* textcat_Init() - Initialize the text classifier. The textfile {
* conffile should contain a list of fingerprint filenames and int score;
* identification strings for the categories. The filenames should be const char *name;
* reachable from the current working directory. The identification } candidate_t;
* strings will are used in the classification output.
* /**
* Returns: handle on success, NULL on error. (At the moment, the * textcat_Init() - Initialize the text classifier. The textfile
* only way errors can occur, is when the library cannot read the * conffile should contain a list of fingerprint filenames and
* conffile, or one of the fingerprint files listed in it.) * identification strings for the categories. The filenames should be
* * reachable from the current working directory. The identification
* Replace older function (and has exacly the same behaviour) * strings will are used in the classification output.
* see below *
*/ * Returns: handle on success, NULL on error. (At the moment, the
* only way errors can occur, is when the library cannot read the
* conffile, or one of the fingerprint files listed in it.)
*
* Replace older function (and has exacly the same behaviour)
* see below
*/
extern void *textcat_Init(const char *conffile); extern void *textcat_Init(const char *conffile);
/** /**
* Originaly this function had only one parameter (conffile) it has been mo * special_textcat_Init() - Initialize the text classifier. This functi
dified since OOo must be able to load alternativ DB on
* Basicaly prefix is the directory path where fingerprints are stored * prepare the classifier as needed by OpenOffice.org. The textfile
*/ * conffile should contain a list of utf8 fingerprint filenames and
* identification strings for the categories.prefix will be
* prepended to the filenames to locate the files. The identification
* strings will be used in the classification output.
*
* Returns: handle on success, NULL on error. (At the moment, the
* only way errors can occur, is when the library cannot read the
* conffile, or one of the fingerprint files listed in it.)
*/
extern void *special_textcat_Init(const char *conffile, extern void *special_textcat_Init(const char *conffile,
const char *prefix); const char *prefix);
/** extern int textcat_SetProperty(void *handle, textcat_Property property,
* textcat_Done() - Free up resources for handle sint4 value);
*/
/**
* textcat_Done() - Free up resources for handle
*/
extern void textcat_Done(void *handle); extern void textcat_Done(void *handle);
/** /**
* textcat_Classify() - Give the most likely categories for buffer * textcat_Classify() - Give the most likely categories for buffer
* with length size. * with length size.
* *
* Returns: string containing a list of category id's, each one * Returns: string containing a list of category id's, each one
* between square brackets, "UNKNOWN" when not recognized, "SHORT" if the * between square brackets, "UNKNOWN" when not recognized, "SHORT" if t
* document was too short to make a reliable assessment. he
* * document was too short to make a reliable assessment.
* Performace note: longer buffers take longer to process. However, *
* for many uses it is not necessary to categorize the whole buffer. * Performace note: longer buffers take longer to process. However,
* For language classification, a few hundred bytes will suffice. * for many uses it is not necessary to categorize the whole buffer.
*/ * For language classification, a few hundred bytes will suffice.
*/
extern char *textcat_Classify(void *handle, const char *buffer, extern char *textcat_Classify(void *handle, const char *buffer,
size_t size); size_t size);
/** /**
* textcat_Version() - Returns a string describing the version of this clas * textcat_GetClassifyFullOutput() - Create a classifier output handler
sifier. */
*/ extern candidate_t *textcat_GetClassifyFullOutput(void *handle);
/**
* textcat_ReleaseClassifyFullOutput() - Free up resources for the
* classifier output handler
*/
extern void textcat_ReleaseClassifyFullOutput(void *handle,
candidate_t * candidates)
;
/**
* textcat_ClassifyFull() - Give the most likely categories for buffer
* with length size.
*
* Returns: the numbers of results.
*
* Performace note: longer buffers take longer to process. However,
* for many uses it is not necessary to categorize the whole buffer.
* For language classification, a few hundred bytes will suffice.
*/
extern int textcat_ClassifyFull(void *handle, const char *buffer,
size_t size, candidate_t * candidates);
/**
* textcat_Version() - Returns a string describing the version of this
* classifier.
*/
extern const char *textcat_Version(void); extern const char *textcat_Version(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 7 change blocks. 
38 lines changed or deleted 86 lines changed or added


 utf8misc.h   utf8misc.h 
skipping to change at line 40 skipping to change at line 40
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
************************************************************************** */ ************************************************************************** */
#ifndef _UTF8_MISC_H_ #ifndef _UTF8_MISC_H_
#define _UTF8_MISC_H_ #define _UTF8_MISC_H_
/**
* These variables are used in character processing functions
* These have been added to manage utf-8 symbols, particularly escape chars
*/
#define ESCAPE_MASK 0x80
#define WEIGHT_MASK 0xF0
#ifdef __cplusplus #ifdef __cplusplus
extern "C" extern "C"
{ {
#endif #endif
/* /*
* Is used to jump to the next start of char * Is used to jump to the next start of char
* of course it's only usefull when encoding is utf-8 * of course it's only usefull when encoding is utf-8
* This function have been added by Jocelyn Merand to use libtextcat in OOo * This function have been added by Jocelyn Merand to use libtextcat in
*/ OOo
const char* utf8_next_char(const char *str); */
const char *utf8_next_char(const char *str);
/* Copy the char in str to dest of course it's only usefull when encoding i
s /*
utf8 and the symbol is encoded with more than 1 char return the number o * Copy the char in str to dest of course it's only usefull when encodi
f ng
char jumped This function have been added by Jocelyn Merand to use * is utf8 and the symbol is encoded with more than 1 char return the
libtextcat in OOo */ * number of char jumped This function have been added by Jocelyn Meran
int charcopy(const char *str, char *dest); d to
* use libtextcat in OOo
/* checks if n-gram lex is a prefix of key and of length len */
* len is the number of unicode code points int utf8_charcopy(const char *str, char *dest);
* strlen("€") == 3 but len == 1
*/ /*
int issame(char *lex, char *key, int len); * checks if n-gram lex is a prefix of key and of length len len is the
* number of unicode code points strlen("€") == 3 but len == 1
/* */
* len is the number of unicode code points int utf8_issame(char *lex, char *key, int len);
* strlen("€") == 3 but len == 1
*/ /*
extern int utfstrlen(const char *str); * len is the number of unicode code points
* strlen("€") == 3 but len == 1
*/
extern int utf8_strlen(const char *str);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
 End of changes. 2 change blocks. 
34 lines changed or deleted 29 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/