/* Copyright (C) 2000-2007 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <stdio.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_uniconv.h"
#include "udm_searchtool.h"
#include "udm_vars.h"
#include "udm_mutex.h"
#include "udm_chinese.h"


#ifdef CHASEN
#include <chasen.h>
#endif

#ifdef MECAB
#include <mecab.h>
#endif

/*
  Insert word boundaries into a Unicode string using one of the
  compiled-in segmenters.

  The segmenters are tried in a fixed order: ChaSen (if CHASEN is defined),
  MeCab (if MECAB is defined), frequency-based Chinese (if
  HAVE_CHARSET_gb2312 is defined and Conf->Chi has words) and
  frequency-based Thai (if Conf->Thai has words). The first one whose
  "seg" and "lang" conditions match handles the string and the function
  returns; later segmenters are not tried.

  Parameters:
    Indexer - agent; its Conf supplies the locks, the MeCab handle and
              the Chi/Thai frequency lists.
    ustr    - zero-terminated string of int code units. It must be heap
              memory compatible with UdmRealloc()/UDM_FREE(), because
              it may be reallocated or freed here.
    lang    - language code; only the first two characters are compared,
              case-insensitively ("ja", "zh", "cn", "th"). NULL matches
              every segmenter; an empty string additionally matches the
              Chinese segmenter.
    seg     - segmenter name ("Chasen", "Mecab" or "Freq",
              case-insensitive). NULL matches every segmenter.

  Returns the segmented string, or the original ustr unchanged when no
  segmenter matched or when segmentation failed (allocation failure or a
  NULL result from the segmenter). The caller must continue with the
  returned pointer only: the buffer passed in may have been moved or freed.
*/
int *UdmUniSegment(UDM_AGENT *Indexer, int *ustr, const char *lang, const char *seg)
{
  /* Nothing to segment; also keeps UdmUniLen() away from a NULL pointer. */
  if (ustr == NULL)
    return NULL;

#ifdef CHASEN
  if ((!seg  || !strcasecmp(seg, "Chasen")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    char        *eucstr, *eucstr_seg;
    int         *newstr;
    UDM_CHARSET *eucjp_cs;
    UDM_CONV    uni_eucjp, eucjp_uni;
    size_t      reslen;
    size_t      dstlen = UdmUniLen(ustr);

    eucjp_cs = UdmGetCharSet("euc-jp");
    if (!eucjp_cs) eucjp_cs = &udm_charset_sys_int;
    UdmConvInit(&uni_eucjp, &udm_charset_sys_int, eucjp_cs, UDM_RECODE_HTML);
    UdmConvInit(&eucjp_uni, eucjp_cs, &udm_charset_sys_int, UDM_RECODE_HTML);

    /*
      12 bytes per input character: presumably the worst case of a
      character written as an HTML entity under UDM_RECODE_HTML
      (TODO confirm against UdmConv).
    */
    eucstr = (char*)UdmMalloc(12 * dstlen + 1);
    if (eucstr == NULL)
      return ustr;
    /* dstlen + 1 so that the terminating zero is converted as well. */
    UdmConv(&uni_eucjp, eucstr, 12 * dstlen + 1, (char*)ustr, sizeof(*ustr)*(dstlen + 1));

    /*
      The string returned by chasen_sparse_tostr() is not owned by us, so
      it has to be consumed before the lock is released; otherwise another
      thread running the segmenter could overwrite it while it is read.
    */
    UDM_GETLOCK(Indexer, UDM_LOCK_SEGMENTER);
    eucstr_seg = chasen_sparse_tostr(eucstr);
    if (eucstr_seg != NULL)
    {
      reslen = strlen(eucstr_seg) + 1;
      /* Keep the old pointer until the reallocation is known to succeed. */
      newstr = (int*)UdmRealloc(ustr, reslen * sizeof(int));
      if (newstr != NULL)
      {
        ustr = newstr;
        UdmConv(&eucjp_uni, (char*)ustr, reslen * sizeof(int), eucstr_seg, reslen);
      }
    }
    UDM_RELEASELOCK(Indexer, UDM_LOCK_SEGMENTER);

    UDM_FREE(eucstr);
    return ustr;
  }
#endif


#ifdef MECAB
  if ((!seg  || !strcasecmp(seg, "Mecab")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    /* Despite the "sjis" names, the charset requested here is EUC-JP. */
    UDM_CHARSET *sjis_cs;
    UDM_CONV    uni_sjis, sjis_uni;
    char        *sjisstr, *sjisstr_seg;
    int         *newstr;
    size_t      reslen;
    size_t      dstlen = UdmUniLen(ustr);

    sjis_cs = UdmGetCharSet("euc-jp");
    if (!sjis_cs) sjis_cs = &udm_charset_sys_int;
    UdmConvInit(&uni_sjis, &udm_charset_sys_int, sjis_cs, UDM_RECODE_HTML);
    UdmConvInit(&sjis_uni, sjis_cs, &udm_charset_sys_int, UDM_RECODE_HTML);

    sjisstr= (char*)UdmMalloc(12 * dstlen + 1);
    if (sjisstr == NULL)
      return ustr;
    /* The terminator is not converted here; it is appended explicitly. */
    reslen= UdmConv(&uni_sjis, sjisstr, 12 * dstlen, (char*)ustr, sizeof(*ustr) * dstlen);
    sjisstr[reslen]= '\0';

    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
#ifdef HAVE_PTHREADS
    mecab_lock(Indexer->Conf->mecab);
#endif
    /*
      The result string belongs to the shared MeCab handle, so it is
      converted into ustr while both locks are still held.
    */
    sjisstr_seg = mecab_sparse_tostr(Indexer->Conf->mecab, sjisstr);
    if (sjisstr_seg != NULL)
    {
      reslen= strlen(sjisstr_seg);
      /* Keep the old pointer until the reallocation is known to succeed. */
      newstr= (int*)UdmRealloc(ustr, (reslen + 1) * sizeof(int));
      if (newstr != NULL)
      {
        ustr= newstr;
        reslen= UdmConv(&sjis_uni, (char*)ustr, reslen * sizeof(int),
                        sjisstr_seg, reslen) / sizeof(int);
        ustr[reslen]= 0;
      }
    }
#ifdef HAVE_PTHREADS
    mecab_unlock(Indexer->Conf->mecab);
#endif
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

    UDM_FREE(sjisstr);
    return ustr;
  }
#endif


#ifdef HAVE_CHARSET_gb2312
  /* Chinese: unlike the other branches, an empty lang matches too. */
  if ((!seg  || !strcasecmp(seg, "Freq")) && Indexer->Conf->Chi.nwords &&
      (!lang || !lang[0] || 
       !strncasecmp(lang, "zh", 2) ||
       !strncasecmp(lang, "cn", 2)))
  {
    int *seg_ustr;
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr = UdmSegmentByFreq(&Indexer->Conf->Chi, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* A NULL result leaves the input untouched and returns it as is. */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr = seg_ustr;
    }
    return ustr;
  }
#endif

  /* Thai: same frequency-based segmenter, driven by the Thai word list. */
  if ((!seg  || !strcasecmp(seg, "Freq")) && Indexer->Conf->Thai.nwords &&
      (!lang || !strncasecmp(lang, "th", 2)))
  {
    int *seg_ustr;
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr = UdmSegmentByFreq(&Indexer->Conf->Thai, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* A NULL result leaves the input untouched and returns it as is. */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr = seg_ustr;
    }
    return ustr;
  }

  /* No segmenter applies to this seg/lang combination. */
  return ustr;
}
