diff options
author | root <root@rshg047.dnsready.net> | 2011-05-08 22:33:52 +0000 |
---|---|---|
committer | root <root@rshg047.dnsready.net> | 2011-05-08 22:33:52 +0000 |
commit | 886d3d0994eb8e9f3e797155619619e0ef1353f8 (patch) | |
tree | aecae53a57082f8f3e4b72755149285659c1bee3 /extra/libtextcat | |
parent | fe6dc99352fe2b801d251e55b2b8baa71441908e (diff) |
Sun May 8 22:33:51 UTC 2011
Diffstat (limited to 'extra/libtextcat')
-rw-r--r-- | extra/libtextcat/PKGBUILD | 50 | ||||
-rw-r--r-- | extra/libtextcat/fpdb.conf | 86 | ||||
-rw-r--r-- | extra/libtextcat/libtextcat-2.2-OOo.patch | 634 | ||||
-rw-r--r-- | extra/libtextcat/libtextcat-2.2-exportapi.patch | 305 |
4 files changed, 1075 insertions, 0 deletions
diff --git a/extra/libtextcat/PKGBUILD b/extra/libtextcat/PKGBUILD new file mode 100644 index 000000000..dfccfb8db --- /dev/null +++ b/extra/libtextcat/PKGBUILD @@ -0,0 +1,50 @@ +# $Id: PKGBUILD 122957 2011-05-07 14:06:48Z andyrtr $ +# Maintainer: AndyRTR <andyrtr@archlinux.org> +# Contributor: Alexander Fehr <pizzapunk gmail com> +# Contributor: William Rea <sillywilly@gmail.com> +# Contributor: Daniel J Griffiths <ghost1227@archlinux.us> + +pkgname=libtextcat +pkgver=2.2 +pkgrel=8 +pkgdesc="Library that implements N-gram-based text categorization" +arch=('i686' 'x86_64') +url="http://software.wise-guys.nl/libtextcat/" +license=('BSD') +depends=('glibc') +options=(!libtool) +source=(http://software.wise-guys.nl/download/${pkgname}-${pkgver}.tar.gz + # from http://hg.services.openoffice.org/hg/DEV300/raw-file/tip/libtextcat/data/new_fingerprints/fpdb.conf + fpdb.conf + libtextcat-2.2-exportapi.patch + libtextcat-2.2-OOo.patch) +md5sums=('128cfc86ed5953e57fe0f5ae98b62c2e' + 'f4fafe97d3aa184f5476e4918dba045d' + '4c46fcb825ec13e9f7ae3728f5f4c834' + '1d5f1026392365c58f7a7406e923f886') + +build() { + cd ${srcdir}/${pkgname}-${pkgver} + patch -Np1 -i ${srcdir}/libtextcat-2.2-exportapi.patch + patch -Np1 -i ${srcdir}/libtextcat-2.2-OOo.patch + autoreconf -fi + ./configure --prefix=/usr \ + --disable-static + make +} + +package() { + cd ${srcdir}/${pkgname}-${pkgver} + make DESTDIR=${pkgdir} install +# install -D -m644 src/textcat.h ${pkgdir}/usr/include/textcat.h + + install -dm755 ${pkgdir}/usr/share/libtextcat/{LM,ShortTexts} + install -m644 ${srcdir}/fpdb.conf ${pkgdir}/usr/share/libtextcat + install -m644 langclass/conf.txt ${pkgdir}/usr/share/libtextcat + install -m644 langclass/LM/*.lm ${pkgdir}/usr/share/libtextcat/LM + install -m644 langclass/ShortTexts/*.txt ${pkgdir}/usr/share/libtextcat/ShortTexts + + install -D -m644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/LICENSE + sed -i 's|LM|/usr/share/libtextcat/LM|' ${pkgdir}/usr/share/libtextcat/conf.txt +} + diff --git a/extra/libtextcat/fpdb.conf b/extra/libtextcat/fpdb.conf new file mode 100644 index 000000000..329184d51 --- /dev/null +++ b/extra/libtextcat/fpdb.conf @@ -0,0 +1,86 @@ +# +# A sample config file for the language models +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - You may consider eliminating a couple of small languages from this +# list because they cause false positives with big languages and are +# bad for performance. (Do you really want to recognize Drents?) +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# + +# this file have been modified (to OOo by Jocelyn MERAND joc.mer@gmail.com) to include country and encoding +# guess strings are made as following : language-country-encoding + +afrikaans.lm af--utf8 +albanian.lm sq--utf8 +amharic_utf.lm am--utf8 +arabic.lm ar--utf8 +basque.lm eu--utf8 +belarus.lm be--utf8 +bosnian.lm bs--utf8 +breton.lm br--utf8 +catalan.lm ca--utf8 +chinese_simplified.lm zh-CN-utf8 +chinese_traditional.lm zh-TW-utf8 +croatian.lm hr--utf8 +czech.lm cs--utf8 +danish.lm da--utf8 +dutch.lm nl--utf8 +english.lm en--utf8 +esperanto.lm eo--utf8 +estonian.lm et--utf8 +finnish.lm fi--utf8 +french.lm fr--utf8 +frisian.lm fy--utf8 +georgian.lm ka--utf8 +german.lm de--utf8 +greek.lm el--utf8 +hebrew.lm he--utf8 +hindi.lm hi--utf8 +hungarian.lm hu--utf8 +icelandic.lm is--utf8 +indonesian.lm id--utf8 +irish_gaelic.lm ga--utf8 +italian.lm it--utf8 +japanese.lm ja--utf8 +korean.lm ko--utf8 +latin.lm la--utf8 +latvian.lm lv--utf8 +lithuanian.lm lt--utf8 +luxembourgish.lm lb--utf8 +malay.lm ms--utf8 +manx_gaelic.lm gv--utf8 +marathi.lm mr--utf8 +mongolian_cyrillic.lm mn--utf8 +nepali.lm ne--utf8 +norwegian.lm nb--utf8 # Norwegian (Bokmal) +persian.lm fa--utf8 # Farsi +polish.lm pl--utf8 +portuguese.lm pt-PT-utf8 +quechua.lm qu--utf8 +romanian.lm ro--utf8 +romansh.lm rm--utf8 +russian.lm ru--utf8 +sanskrit.lm sa--utf8 +scots.lm sco--utf8 +scots_gaelic.lm gd--utf8 +serbian.lm sr--utf-8 +serbian-latin.lm sh--utf-8 +slovak_ascii.lm sk-SK-utf8 +slovenian.lm sl--utf8 +spanish.lm es--utf8 +swahili.lm sw--utf8 +swedish.lm sv--utf8 +tagalog.lm tl--utf8 +tamil.lm ta--utf8 +thai.lm th--utf8 +turkish.lm tr--utf8 +ukrainian.lm uk--utf8 +vietnamese.lm vi--utf8 +welsh.lm cy--utf8 +yiddish_utf.lm yi--utf8 +zulu.lm zu--utf8 diff --git a/extra/libtextcat/libtextcat-2.2-OOo.patch b/extra/libtextcat/libtextcat-2.2-OOo.patch new file mode 100644 index 000000000..70f9d8d23 --- /dev/null +++ b/extra/libtextcat/libtextcat-2.2-OOo.patch @@ -0,0 +1,634 @@ +diff -ruN libtextcat-2.2.part1/src/constants.h libtextcat-2.2/src/constants.h +--- libtextcat-2.2.part1/src/constants.h 2007-07-25 10:46:49.000000000 +0100 ++++ libtextcat-2.2/src/constants.h 2007-07-25 10:47:25.000000000 +0100 +@@ -39,6 +39,8 @@ + */ + #include <limits.h> + ++#define _UTF8_ ++ + #define DESCRIPTION "out of place" + + /* Reported matches are those fingerprints with a score less than best +@@ -59,14 +61,21 @@ + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +-/* Maximum size of an n-gram? */ +-#define MAXNGRAMSIZE 5 ++/* Maximum number of character of an n-gram? */ ++#define MAXNGRAMSYMBOL 5 ++ ++/* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ ++#ifdef _UTF8_ ++#define MAXNGRAMSIZE 20 ++#else ++#define MAXNGRAMSIZE MAXNGRAMSYMBOL ++#endif + + /* Which characters are not acceptable in n-grams? */ + #define INVALID(c) (isspace((int)c) || isdigit((int)c)) + + /* Minimum size (in characters) for accepting a document */ +-#define MINDOCSIZE 25 ++#define MINDOCSIZE 6 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +@@ -76,4 +85,7 @@ + + #define MAXSCORE INT_MAX + ++/* where the fingerprints files are stored */ ++#define DEFAULT_FINGERPRINTS_PATH "" ++ + #endif +diff -ruN libtextcat-2.2.part1/src/fingerprint.c libtextcat-2.2/src/fingerprint.c +--- libtextcat-2.2.part1/src/fingerprint.c 2007-07-25 10:46:49.000000000 +0100 ++++ libtextcat-2.2/src/fingerprint.c 2007-07-25 10:47:25.000000000 +0100 +@@ -63,6 +63,10 @@ + * - put table/heap datastructure in a separate file. + */ + ++#ifndef _UTF8_ ++#define _UTF8_ ++#endif ++ + #include "config.h" + #include <stdio.h> + #ifdef HAVE_STDLIB_H +@@ -80,10 +84,12 @@ + #include "wg_mempool.h" + #include "constants.h" + ++#include "utf8misc.h" + + #define TABLESIZE (1<<TABLEPOW) + #define TABLEMASK ((TABLESIZE)-1) + ++ + typedef struct { + + sint2 rank; +@@ -134,29 +140,14 @@ + } + + +-/* checks if n-gram lex is a prefix of key and of length len */ +-inline int issame( char *lex, char *key, int len ) +-{ +- int i; +- for (i=0; i<len; i++) { +- if ( key[i] != lex[i] ) { +- return 0; +- } +- } +- if ( lex[i] != 0 ) { +- return 0; +- } +- return 1; +-} +- + + /* increases frequency of ngram(p,len) */ +-static inline int increasefreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static int increasefreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +@@ -168,7 +159,7 @@ + } + + /*** Not found, so create ***/ +- entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); ++ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); + strcpy( entry->str, p ); + entry->cnt = 1; + +@@ -181,12 +172,12 @@ + #if 0 + + /* looks up ngram(p,len) */ +-static entry_t *findfreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static entry_t *findfreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +@@ -219,7 +210,7 @@ + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +-inline static void siftup( table_t *t, unsigned int child ) ++static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +@@ -241,7 +232,7 @@ + } + + +-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) ++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +@@ -458,21 +449,27 @@ + return dest; + } + +- ++/** ++* this function extract all n-gram from past buffer and put them into the table "t" ++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice ++*/ + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; ++ int pointer = 0; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +- for (;;p++) { ++ while(1) { + +- const char *q = p; ++ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ + char *m = n; + + /*** First char may be an underscore ***/ +- *m++ = *q++; ++ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ ++ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ ++ m += decay; /*[modified]*/ + *m = '\0'; + + increasefreq( t, n, 1 ); +@@ -482,19 +479,22 @@ + } + + /*** Let the compiler unroll this ***/ +- for ( i=2; i<=MAXNGRAMSIZE; i++) { ++ for ( i=2; i<=MAXNGRAMSYMBOL; i++) { + +- *m++ = *q; ++ decay = charcopy(q, m); /*[modified] like above*/ ++ m += decay; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +- q++; ++ q += decay; + if ( *q == '\0' ) { + return; + } + } ++ ++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ + } + return; + } +diff -ruN libtextcat-2.2.part1/src/fingerprint.h.orig libtextcat-2.2/src/fingerprint.h.orig +--- libtextcat-2.2.part1/src/fingerprint.h.orig 1970-01-01 01:00:00.000000000 +0100 ++++ libtextcat-2.2/src/fingerprint.h.orig 2007-07-25 10:47:22.000000000 +0100 +@@ -0,0 +1,55 @@ ++#ifndef _FINGERPRINT_H_ ++#define _FINGERPRINT_H_ ++/* ++ * Copyright (C) 2003 WiseGuys Internet B.V. ++ * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include "common.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern void *fp_Init(const char *name); ++extern void fp_Done( void *handle ); ++extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ); ++extern int fp_Read( void *handle, const char *fname, int maxngrams ); ++extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); ++extern void fp_Show( void *handle ); ++extern const char *fp_Name( void *handle ); ++extern void fp_Print( void *handle, FILE *fp ); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff -ruN libtextcat-2.2.part1/src/textcat.c libtextcat-2.2/src/textcat.c +--- libtextcat-2.2.part1/src/textcat.c 2007-07-25 10:46:49.000000000 +0100 ++++ libtextcat-2.2/src/textcat.c 2007-07-25 10:47:25.000000000 +0100 +@@ -74,6 +74,7 @@ + typedef struct { + + void **fprint; ++ char *fprint_disable; + uint4 size; + uint4 maxsize; + +@@ -112,11 +113,21 @@ + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); ++ wg_free( h->fprint_disable ); + wg_free( h ); + + } + +-extern void *textcat_Init( const char *conffile ) ++/** Replaces older function */ ++extern void *textcat_Init( const char *conffile ){ ++ return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); ++} ++ ++/** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo use ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ) + { + textcat_t *h; + char line[1024]; +@@ -134,11 +145,13 @@ + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +- int res; ++ char finger_print_file_name[512]; ++ int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +@@ -156,17 +169,23 @@ + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +- h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +- if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { ++ finger_print_file_name[0] = '\0'; ++ strcat(finger_print_file_name, prefix); ++ strcat(finger_print_file_name, segment[0]); ++ ++ if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +- } ++ } ++ h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ + h->size++; + } + +@@ -203,11 +222,18 @@ + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +- ++ + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +- int score = fp_Compare( h->fprint[i], unknown, threshold ); +- candidates[i].score = score; ++ int score; ++ if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ ++ score = MAXSCORE; ++ } ++ else{ ++ score = fp_Compare( h->fprint[i], unknown, threshold ); ++ /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ ++ } ++ candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +diff -ruN libtextcat-2.2.part1/src/textcat.h libtextcat-2.2/src/textcat.h +--- libtextcat-2.2.part1/src/textcat.h 2007-07-25 10:46:49.000000000 +0100 ++++ libtextcat-2.2/src/textcat.h 2007-07-25 10:48:18.000000000 +0100 +@@ -55,10 +54,19 @@ + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) ++ * ++ * Replace older function (and has exacly the same behaviour) ++ * see below + */ + extern void *textcat_Init( const char *conffile ); + + /** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ); ++ ++/** + * textcat_Done() - Free up resources for handle + */ + extern void textcat_Done( void *handle ); +diff -ruN libtextcat-2.2.part1/src/utf8misc.c libtextcat-2.2/src/utf8misc.c +--- libtextcat-2.2.part1/src/utf8misc.c 1970-01-01 01:00:00.000000000 +0100 ++++ libtextcat-2.2/src/utf8misc.c 2007-07-25 10:48:57.000000000 +0100 +@@ -0,0 +1,132 @@ ++/*************************************************************************** ++ * Copyright (C) 2006 by Jocelyn Merand * ++ * joc.mer@gmail.com * ++ * * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ***************************************************************************/ ++ ++#ifndef _UTF8_MISC_H_ ++#include "utf8misc.h" ++#endif ++ ++ ++int nextcharstart(const char *str, int position){ ++ int pointer = position; ++ ++ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then str[pointer] is an escape character*/ ++ ++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/ ++ ++pointer; ++ } ++ return pointer; ++} ++ ++ ++int charcopy(const char *str, char *dest){ ++ ++ int pointer = 0; ++ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then str[pointer] is an escape character*/ ++ ++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ ++ dest[pointer] = str[pointer]; ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ if(str[pointer]){ ++ dest[pointer] = str[pointer]; ++ ++pointer; ++ } ++ ++ return pointer; ++} ++ ++ ++int issame( char *lex, char *key, int len ) ++{ ++ /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ ++ int char_counter = 0; ++ int pointer = 0; ++ while(char_counter < len) { ++ ++ if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then key[pointer] is an escap character*/ ++ ++ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ ++char_counter; /*and we are on a new utf8 character*/ ++ if ( key[pointer] != lex[pointer] ) { ++ return 0; ++ /*printf(" NO\n", lex, key, len);*/ ++ } ++ ++pointer; ++ } ++ if ( lex[pointer] != '\0' ) { ++ return 0; ++ /*printf(" NO\n");*/ ++ } ++ ++ /*printf(" YES\n");*/ ++ ++ return 1; ++} ++ ++ ++extern int utfstrlen(const char* str){ ++ int char_counter = 0; ++ int pointer = 0; ++ while(str[pointer]) { ++ pointer = nextcharstart(str, pointer); ++ ++ ++char_counter; /*and we are on a new utf8 character*/ ++ } ++ return char_counter; ++} ++ +diff -ruN libtextcat-2.2.part1/src/utf8misc.h libtextcat-2.2/src/utf8misc.h +--- libtextcat-2.2.part1/src/utf8misc.h 1970-01-01 01:00:00.000000000 +0100 ++++ libtextcat-2.2/src/utf8misc.h 2007-07-25 10:48:57.000000000 +0100 +@@ -0,0 +1,88 @@ ++/*************************************************************************** ++ * Copyright (C) 2006 by Jocelyn Merand * ++ * joc.mer@gmail.com * ++ * * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ***************************************************************************/ ++ ++#ifndef _UTF8_MISC_H_ ++#define _UTF8_MISC_H_ ++ ++/** ++ * These variables are used in character processing functions ++ * These have been added to manage utf-8 symbols, particularly escape chars ++ */ ++#ifdef _UTF8_ ++#define ESCAPE_MASK 0x80 ++#define WEIGHT_MASK 0xF0 ++#else ++#define ESCAPE_MASK 0xFF ++#define WEIGHT_MASK 0x00 ++#endif ++ ++ ++/* ++ * Is used to jump to the next start of char ++ * of course it's only usefull when encoding is utf-8 ++ * This function have been added by Jocelyn Merand to use libtextcat in OOo ++ */ ++int nextcharstart(const char *str, int position); ++ ++ ++/*Copy the char in str to dest ++ * of course it's only usefull when encoding is utf8 and the symbol is encoded with more than 1 char ++ * return the number of char jumped ++ * This function have been added by Jocelyn Merand to use libtextcat in OOo ++ */ ++int charcopy(const char *str, char *dest); ++ ++ ++/* checks if n-gram lex is a prefix of key and of length len ++* if _UTF8_ is defined, it uses escap characters and len is not realy the length of lex ++* in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 ++*/ ++int issame( char *lex, char *key, int len ); ++ ++ ++/* Counts the number of characters ++* if _UTF8_ is defined, it uses escap characters and the result is not realy the length of str ++* in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 ++*/ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern int utfstrlen(const char* str); ++#ifdef __cplusplus ++} ++#endif ++ ++#endif ++ +--- libtextcat-2.2.part2/src/Makefile.am 2007-07-25 10:55:02.000000000 +0100 ++++ libtextcat-2.2/src/Makefile.am 2007-07-25 10:55:52.000000000 +0100 +@@ -12,11 +12,11 @@ + + libtextcat_includedir = $(includedir)/libtextcat + libtextcat_include_HEADERS = \ +- common.h constants.h fingerprint.h textcat.h ++ common.h constants.h fingerprint.h textcat.h utf8misc.h + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +- common.c fingerprint.c textcat.c wg_mempool.c ++ common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c + + bin_PROGRAMS = createfp + createfp_SOURCES = createfp.c diff --git a/extra/libtextcat/libtextcat-2.2-exportapi.patch b/extra/libtextcat/libtextcat-2.2-exportapi.patch new file mode 100644 index 000000000..acf84e099 --- /dev/null +++ b/extra/libtextcat/libtextcat-2.2-exportapi.patch @@ -0,0 +1,305 @@ +diff -ruN libtextcat-2.2.orig/src/common.c libtextcat-2.2/src/common.c +--- libtextcat-2.2.orig/src/common.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/common.c 2007-06-27 17:45:16.000000000 +0100 +@@ -45,7 +45,7 @@ + #endif + #include <stdarg.h> + #include <ctype.h> +-#include "common.h" ++#include "common_impl.h" + + extern void wgmem_error( const char *fmt, ... ) + { +@@ -55,8 +55,6 @@ + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); +- +- exit(-1); + } + + +diff -ruN libtextcat-2.2.orig/src/common_impl.h libtextcat-2.2/src/common_impl.h +--- libtextcat-2.2.orig/src/common_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ libtextcat-2.2/src/common_impl.h 2007-06-27 17:45:16.000000000 +0100 +@@ -0,0 +1,66 @@ ++#ifndef _COMMON_IMPL_H_ ++#define _COMMON_IMPL_H_ ++/** ++ * common_impl.h -- a mixed bag of helper functions ++ * ++ * Copyright (C) 2003 WiseGuys Internet B.V. ++ * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "config.h" ++#ifndef HAVE_MALLOC ++#error "This library needs a GNU like malloc to compile. 'configure' says there isn't one." ++#endif ++#ifndef HAVE_REALLOC ++#error "This library needs a GNU like realloc to compile. 'configure' says there isn't one." ++#endif ++#ifndef HAVE_STRDUP ++#error "This library needs a GNU like strdup to compile. 'configure' says there isn't one." ++#endif ++#ifdef HAVE_INTTYPES_H ++#include <inttypes.h> ++#else ++#ifdef HAVE_STDINT_H ++#include <stdint.h> ++#endif ++#endif ++#ifdef HAVE_SYS_TIME_H ++#include <sys/time.h> ++#endif ++#include "common.h" ++ ++#define WGMIN(x,y) ((x)<=(y)?(x):(y)) ++#define WGMAX(x,y) ((x)<=(y)?(y):(x)) ++#define __STR__(x) #x ++#define WGSTR(x) __STR__(x) ++ ++#endif +diff -ruN libtextcat-2.2.orig/src/createfp.c libtextcat-2.2/src/createfp.c +--- libtextcat-2.2.orig/src/createfp.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/createfp.c 2007-06-27 17:45:16.000000000 +0100 +@@ -44,7 +44,7 @@ + #endif + + #include "fingerprint.h" +-#include "common.h" ++#include "common_impl.h" + + #define BLOCKSIZE 4096 + +diff -ruN libtextcat-2.2.orig/src/fingerprint.c libtextcat-2.2/src/fingerprint.c +--- libtextcat-2.2.orig/src/fingerprint.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/fingerprint.c 2007-06-27 17:45:16.000000000 +0100 +@@ -76,7 +76,7 @@ + #endif + #include <ctype.h> + +-#include "common.h" ++#include "common_impl.h" + #include "wg_mempool.h" + #include "constants.h" + +diff -ruN libtextcat-2.2.orig/src/Makefile.am libtextcat-2.2/src/Makefile.am +--- libtextcat-2.2.orig/src/Makefile.am 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/Makefile.am 2007-06-27 17:47:40.000000000 +0100 +@@ -8,7 +8,11 @@ + AM_LDFLAGS = -g + + noinst_HEADERS = \ +- common.h constants.h fingerprint.h textcat.h wg_mempool.h ++ common_impl.h wg_mempool.h ++ ++libtextcat_includedir = $(includedir)/libtextcat ++libtextcat_include_HEADERS = \ ++ common.h constants.h fingerprint.h textcat.h + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +diff -ruN libtextcat-2.2.orig/src/testtextcat.c libtextcat-2.2/src/testtextcat.c +--- libtextcat-2.2.orig/src/testtextcat.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/testtextcat.c 2007-06-27 17:45:16.000000000 +0100 +@@ -47,7 +47,7 @@ + #endif + + #include "textcat.h" +-#include "common.h" ++#include "common_impl.h" + + #define BLOCKSIZE 4096 + +diff -ruN libtextcat-2.2.orig/src/textcat.c libtextcat-2.2/src/textcat.c +--- libtextcat-2.2.orig/src/textcat.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/textcat.c 2007-06-27 17:45:16.000000000 +0100 +@@ -65,7 +65,7 @@ + #include <alloca.h> + #endif + +-#include "common.h" ++#include "common_impl.h" + #include "fingerprint.h" + #include "textcat.h" + #include "constants.h" +diff -ruN libtextcat-2.2.orig/src/wg_mempool.c libtextcat-2.2/src/wg_mempool.c +--- libtextcat-2.2.orig/src/wg_mempool.c 2007-06-27 17:02:34.000000000 +0100 ++++ libtextcat-2.2/src/wg_mempool.c 2007-06-27 17:45:16.000000000 +0100 +@@ -41,7 +41,7 @@ + #ifdef HAVE_STRING_H + #include <string.h> + #endif +-#include "common.h" ++#include "common_impl.h" + + typedef struct memblock_s { + char *pool; +diff -ru libtextcat-2.2.orig/src/common.h libtextcat-2.2/src/common.h +--- libtextcat-2.2.orig/src/common.h 2003-05-22 14:02:29.000000000 +0100 ++++ libtextcat-2.2/src/common.h 2007-06-28 09:10:42.000000000 +0100 +@@ -1,7 +1,7 @@ + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +- * common.h -- a mixed bag of helper functions ++ * common.h + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * +@@ -36,56 +36,25 @@ + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#include "config.h" +-#ifndef HAVE_MALLOC +-#error "This library needs a GNU like malloc to compile. 'configure' says there isn't one." +-#endif +-#ifndef HAVE_REALLOC +-#error "This library needs a GNU like realloc to compile. 'configure' says there isn't one." +-#endif +-#ifndef HAVE_STRDUP +-#error "This library needs a GNU like strdup to compile. 'configure' says there isn't one." +-#endif + #include <stdio.h> +-#ifdef HAVE_INTTYPES_H + #include <inttypes.h> +-#else +-#ifdef HAVE_STDINT_H +-#include <stdint.h> +-#endif +-#endif +-#ifdef HAVE_SYS_TIME_H +-#include <sys/time.h> +-#endif + #include <time.h> + +-#define WGMIN(x,y) ((x)<=(y)?(x):(y)) +-#define WGMAX(x,y) ((x)<=(y)?(y):(x)) +-#define __STR__(x) #x +-#define WGSTR(x) __STR__(x) +- +-#ifdef HAVE_INTTYPES_H +-typedef uint32_t uint4; +-typedef uint16_t uint2; +-typedef uint8_t uchar; +- +-typedef int32_t sint4; +-typedef int16_t sint2; +-typedef int8_t schar; +- +-typedef int8_t boole; +-#else +-typedef unsigned long uint4; +-typedef unsigned int uint2; +-typedef unsigned char uchar; +- +-typedef long sint4; +-typedef int sint2; +-typedef char schar; +- +-typedef char boole; ++#include <sys/time.h> ++#ifdef __cplusplus ++extern "C" { + #endif + ++typedef uint32_t uint4; ++typedef uint16_t uint2; ++typedef uint8_t uchar; ++ ++typedef int32_t sint4; ++typedef int16_t sint2; ++typedef int8_t schar; ++ ++typedef int8_t boole; ++ + typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; +@@ -108,6 +76,8 @@ + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + ++#ifdef __cplusplus ++} ++#endif + + #endif +- +diff -ru libtextcat-2.2.orig/src/fingerprint.h libtextcat-2.2/src/fingerprint.h +--- libtextcat-2.2.orig/src/fingerprint.h 2003-05-19 13:16:31.000000000 +0100 ++++ libtextcat-2.2/src/fingerprint.h 2007-06-28 09:11:17.000000000 +0100 +@@ -35,6 +35,10 @@ + */ + #include "common.h" + ++#ifdef __cplusplus ++extern "C" { ++#endif ++ + extern void *fp_Init(const char *name); + extern void fp_Done( void *handle ); + extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ); +@@ -44,4 +48,8 @@ + extern const char *fp_Name( void *handle ); + extern void fp_Print( void *handle, FILE *fp ); + ++#ifdef __cplusplus ++} ++#endif ++ + #endif +--- libtextcat-2.2.orig/src/textcat.h 2007-06-28 09:19:26.000000000 +0100 ++++ libtextcat-2.2/src/textcat.h 2007-06-28 09:20:19.000000000 +0100 +@@ -37,6 +37,10 @@ + */ + #include <stdio.h> + ++#ifdef __cplusplus ++extern "C" { ++#endif ++ + #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" + #define _TEXTCAT_RESULT_SHORT "SHORT" + +@@ -77,4 +81,9 @@ + * textcat_Version() - Returns a string describing the version of this classifier. + */ + extern char *textcat_Version(); ++ ++#ifdef __cplusplus ++} ++#endif ++ + #endif |