diff -ruN libtextcat-2.2.part1/src/constants.h libtextcat-2.2/src/constants.h --- libtextcat-2.2.part1/src/constants.h 2007-07-25 10:46:49.000000000 +0100 +++ libtextcat-2.2/src/constants.h 2007-07-25 10:47:25.000000000 +0100 @@ -39,6 +39,8 @@ */ #include +#define _UTF8_ + #define DESCRIPTION "out of place" /* Reported matches are those fingerprints with a score less than best @@ -59,14 +61,21 @@ /* Maximum number of n-grams in a fingerprint */ #define MAXNGRAMS 400 -/* Maximum size of an n-gram? */ -#define MAXNGRAMSIZE 5 +/* Maximum number of character of an n-gram? */ +#define MAXNGRAMSYMBOL 5 + +/* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ +#ifdef _UTF8_ +#define MAXNGRAMSIZE 20 +#else +#define MAXNGRAMSIZE MAXNGRAMSYMBOL +#endif /* Which characters are not acceptable in n-grams? */ #define INVALID(c) (isspace((int)c) || isdigit((int)c)) /* Minimum size (in characters) for accepting a document */ -#define MINDOCSIZE 25 +#define MINDOCSIZE 6 /* Maximum penalty for missing an n-gram in fingerprint */ #define MAXOUTOFPLACE 400 @@ -76,4 +85,7 @@ #define MAXSCORE INT_MAX +/* where the fingerprints files are stored */ +#define DEFAULT_FINGERPRINTS_PATH "" + #endif diff -ruN libtextcat-2.2.part1/src/fingerprint.c libtextcat-2.2/src/fingerprint.c --- libtextcat-2.2.part1/src/fingerprint.c 2007-07-25 10:46:49.000000000 +0100 +++ libtextcat-2.2/src/fingerprint.c 2007-07-25 10:47:25.000000000 +0100 @@ -63,6 +63,10 @@ * - put table/heap datastructure in a separate file. */ +#ifndef _UTF8_ +#define _UTF8_ +#endif + #include "config.h" #include #ifdef HAVE_STDLIB_H @@ -80,10 +84,12 @@ #include "wg_mempool.h" #include "constants.h" +#include "utf8misc.h" #define TABLESIZE (1<table[ hash ]; - - while ( entry ) { + + while ( entry ) { if ( issame( entry->str, p, len ) ) { /*** Found it! 
***/ entry->cnt++; @@ -168,7 +159,7 @@ } /*** Not found, so create ***/ - entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); + entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); strcpy( entry->str, p ); entry->cnt = 1; @@ -181,12 +172,12 @@ #if 0 /* looks up ngram(p,len) */ -static entry_t *findfreq( table_t *t, char *p, int len ) -{ - uint4 hash = simplehash( p, len ) & TABLEMASK; +static entry_t *findfreq( table_t *t, char *p, int len ) +{ + uint4 hash = simplehash( p, len ) & TABLEMASK; entry_t *entry = t->table[ hash ]; - - while ( entry ) { + + while ( entry ) { if ( issame( entry->str, p, len ) ) { return entry; } @@ -219,7 +210,7 @@ #define GREATER(x,y) ((x).cnt > (y).cnt) #define LESS(x,y) ((x).cnt < (y).cnt) -inline static void siftup( table_t *t, unsigned int child ) +static void siftup( table_t *t, unsigned int child ) { entry_t *heap = t->heap; unsigned int parent = (child-1) >> 1; @@ -241,7 +232,7 @@ } -inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) +static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) { entry_t *heap = t->heap; unsigned int child = parent*2 + 1; @@ -458,21 +449,27 @@ return dest; } - +/** +* this function extract all n-gram from past buffer and put them into the table "t" +* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice +*/ static void createngramtable( table_t *t, const char *buf ) { char n[MAXNGRAMSIZE+1]; const char *p = buf; int i; + int pointer = 0; /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. 
***/ - for (;;p++) { + while(1) { - const char *q = p; + const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)); now it is pointer which is increased, so we have to get the new pointer on the buffer*/ char *m = n; /*** First char may be an underscore ***/ - *m++ = *q++; + int decay = charcopy(q, m); /*[modified] previously *m++ = *q++*/ + q = &(p[pointer+decay]); /*[modified] the old copying method did not handle multi-byte symbols*/ + m += decay; /*[modified]*/ *m = '\0'; increasefreq( t, n, 1 ); @@ -482,19 +479,22 @@ } /*** Let the compiler unroll this ***/ - for ( i=2; i<=MAXNGRAMSIZE; i++) { + for ( i=2; i<=MAXNGRAMSYMBOL; i++) { - *m++ = *q; + decay = charcopy(q, m); /*[modified] like above*/ + m += decay; *m = '\0'; increasefreq( t, n, i ); if ( *q == '_' ) break; - q++; + q += decay; if ( *q == '\0' ) { return; } } + + pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point to the start of the next symbol, but with UTF-8 the next start is not necessarily the next char*/ } return; } diff -ruN libtextcat-2.2.part1/src/fingerprint.h.orig libtextcat-2.2/src/fingerprint.h.orig --- libtextcat-2.2.part1/src/fingerprint.h.orig 1970-01-01 01:00:00.000000000 +0100 +++ libtextcat-2.2/src/fingerprint.h.orig 2007-07-25 10:47:22.000000000 +0100 @@ -0,0 +1,55 @@ +#ifndef _FINGERPRINT_H_ +#define _FINGERPRINT_H_ +/* + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. 
nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void *fp_Init(const char *name); +extern void fp_Done( void *handle ); +extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ); +extern int fp_Read( void *handle, const char *fname, int maxngrams ); +extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); +extern void fp_Show( void *handle ); +extern const char *fp_Name( void *handle ); +extern void fp_Print( void *handle, FILE *fp ); + +#ifdef __cplusplus +} +#endif + +#endif diff -ruN libtextcat-2.2.part1/src/textcat.c libtextcat-2.2/src/textcat.c --- libtextcat-2.2.part1/src/textcat.c 2007-07-25 10:46:49.000000000 +0100 +++ libtextcat-2.2/src/textcat.c 2007-07-25 10:47:25.000000000 +0100 @@ -74,6 +74,7 @@ typedef struct { void **fprint; + char *fprint_disable; uint4 size; uint4 maxsize; @@ -112,11 +113,21 @@ fp_Done( h->fprint[i] ); } wg_free( h->fprint ); + wg_free( h->fprint_disable ); wg_free( h ); } -extern void *textcat_Init( const char *conffile 
) +/** Replaces older function */ +extern void *textcat_Init( const char *conffile ){ + return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); +} + +/** + * Originally this function had only one parameter (conffile); it has been modified for use in OOo + * Basically prefix is the directory path where fingerprints are stored + */ +extern void *special_textcat_Init( const char *conffile, const char *prefix ) { textcat_t *h; char line[1024]; @@ -134,11 +145,13 @@ h->size = 0; h->maxsize = 16; h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); + h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ while ( wg_getline( line, 1024, fp ) ) { char *p; char *segment[4]; - int res; + char finger_print_file_name[512]; + int res; /*** Skip comments ***/ #ifdef HAVE_STRCHR @@ -156,17 +169,23 @@ /*** Ensure enough space ***/ if ( h->size == h->maxsize ) { h->maxsize *= 2; - h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); + h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); + h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); } /*** Load data ***/ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { goto ERROR; } - if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { + finger_print_file_name[0] = '\0'; + strcat(finger_print_file_name, prefix); + strcat(finger_print_file_name, segment[0]); + + if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { textcat_Done(h); goto ERROR; - } + } + h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ h->size++; } @@ -203,11 +222,18 @@ result = _TEXTCAT_RESULT_SHORT; goto READY; } - + /*** Calculate the score for each category. 
***/ for (i=0; i<h->size; i++) { - int score = fp_Compare( h->fprint[i], unknown, threshold ); - candidates[i].score = score; + int score; + if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ + score = MAXSCORE; + } + else{ + score = fp_Compare( h->fprint[i], unknown, threshold ); + /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ + } + candidates[i].score = score; candidates[i].name = fp_Name( h->fprint[i] ); if ( score < minscore ) { minscore = score; diff -ruN libtextcat-2.2.part1/src/textcat.h libtextcat-2.2/src/textcat.h --- libtextcat-2.2.part1/src/textcat.h 2007-07-25 10:46:49.000000000 +0100 +++ libtextcat-2.2/src/textcat.h 2007-07-25 10:48:18.000000000 +0100 @@ -55,10 +54,19 @@ * Returns: handle on success, NULL on error. (At the moment, the * only way errors can occur, is when the library cannot read the * conffile, or one of the fingerprint files listed in it.) + * + * Replaces the older function (and has exactly the same behaviour) + * see below */ extern void *textcat_Init( const char *conffile ); /** + * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB + * Basically prefix is the directory path where fingerprints are stored + */ +extern void *special_textcat_Init( const char *conffile, const char *prefix ); + +/** * textcat_Done() - Free up resources for handle */ extern void textcat_Done( void *handle ); diff -ruN libtextcat-2.2.part1/src/utf8misc.c libtextcat-2.2/src/utf8misc.c --- libtextcat-2.2.part1/src/utf8misc.c 1970-01-01 01:00:00.000000000 +0100 +++ libtextcat-2.2/src/utf8misc.c 2007-07-25 10:48:57.000000000 +0100 @@ -0,0 +1,132 @@ +/*************************************************************************** + * Copyright (C) 2006 by Jocelyn Merand * + * joc.mer@gmail.com * + * * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * 
are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ***************************************************************************/ + +#ifndef _UTF8_MISC_H_ +#include "utf8misc.h" +#endif + + +int nextcharstart(const char *str, int position){ + int pointer = position; + + if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ + + /*then str[pointer] is an escape character*/ + + char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/ + + while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ + escape_char = escape_char <<1; + ++pointer; + } + } + if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/ + ++pointer; + } + return pointer; +} + + +int charcopy(const char *str, char *dest){ + + int pointer = 0; + if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ + + /*then str[pointer] is an escape character*/ + + char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/ + + while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ + dest[pointer] = str[pointer]; + escape_char = escape_char <<1; + ++pointer; + } + } + if(str[pointer]){ + dest[pointer] = str[pointer]; + ++pointer; + } + + return pointer; +} + + +int issame( char *lex, char *key, int len ) +{ + /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ + int char_counter = 0; + int pointer = 0; + while(char_counter < len) { + + if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ + + /*then key[pointer] is an escap character*/ + + char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/ + + while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ + escape_char = 
escape_char <<1; + ++pointer; + } + } + ++char_counter; /*and we are on a new utf8 character*/ + if ( key[pointer] != lex[pointer] ) { + return 0; + /*printf(" NO\n", lex, key, len);*/ + } + ++pointer; + } + if ( lex[pointer] != '\0' ) { + return 0; + /*printf(" NO\n");*/ + } + + /*printf(" YES\n");*/ + + return 1; +} + + +extern int utfstrlen(const char* str){ + int char_counter = 0; + int pointer = 0; + while(str[pointer]) { + pointer = nextcharstart(str, pointer); + + ++char_counter; /*and we are on a new utf8 character*/ + } + return char_counter; +} + diff -ruN libtextcat-2.2.part1/src/utf8misc.h libtextcat-2.2/src/utf8misc.h --- libtextcat-2.2.part1/src/utf8misc.h 1970-01-01 01:00:00.000000000 +0100 +++ libtextcat-2.2/src/utf8misc.h 2007-07-25 10:48:57.000000000 +0100 @@ -0,0 +1,88 @@ +/*************************************************************************** + * Copyright (C) 2006 by Jocelyn Merand * + * joc.mer@gmail.com * + * * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +#ifndef _UTF8_MISC_H_ +#define _UTF8_MISC_H_ + +/** + * These macros are used in character processing functions + * These have been added to manage utf-8 symbols, particularly escape chars + */ +#ifdef _UTF8_ +#define ESCAPE_MASK 0x80 +#define WEIGHT_MASK 0xF0 +#else +#define ESCAPE_MASK 0xFF +#define WEIGHT_MASK 0x00 +#endif + + +/* + * Is used to jump to the next start of char + * of course it's only useful when encoding is utf-8 + * This function has been added by Jocelyn Merand to use libtextcat in OOo + */ +int nextcharstart(const char *str, int position); + + +/*Copy the char in str to dest + * of course it's only useful when encoding is utf8 and the symbol is encoded with more than 1 char + * return the number of char jumped + * This function has been added by Jocelyn Merand to use libtextcat in OOo + */ +int charcopy(const char *str, char *dest); + + +/* checks if n-gram lex is a prefix of key and of length len +* if _UTF8_ is defined, it uses escape characters and len is not really the length of lex +* in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 +*/ +int issame( char *lex, char *key, int len ); + + +/* Counts the number of characters +* if _UTF8_ is defined, it uses escape characters and the result is not really the length of str +* in this case, the result is the number of utf-8 char strlen("€") == 3 but 
utfstrlen("€") == 1 +*/ +#ifdef __cplusplus +extern "C" { +#endif +extern int utfstrlen(const char* str); +#ifdef __cplusplus +} +#endif + +#endif + --- libtextcat-2.2.part2/src/Makefile.am 2007-07-25 10:55:02.000000000 +0100 +++ libtextcat-2.2/src/Makefile.am 2007-07-25 10:55:52.000000000 +0100 @@ -12,11 +12,11 @@ libtextcat_includedir = $(includedir)/libtextcat libtextcat_include_HEADERS = \ - common.h constants.h fingerprint.h textcat.h + common.h constants.h fingerprint.h textcat.h utf8misc.h lib_LTLIBRARIES = libtextcat.la libtextcat_la_SOURCES = \ - common.c fingerprint.c textcat.c wg_mempool.c + common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c bin_PROGRAMS = createfp createfp_SOURCES = createfp.c