/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the possession or use of this file requires
 * a written license from the author.
 * Filename: Utility.c
 * Abstract:
 *           Utility functions for Chinese Language Processing
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-1-8
 *
 * Notes:
 *                
 ****************************************************************************/
#include "stdafx.h"
#include "Utility.h"
#include <stdio.h>
#include <string.h>
/*********************************************************************
 *
 *  Func Name  : GB2312_Generate
 *
 *  Description: Generate the GB2312 list file. One text line is written
 *               for every (lead, trail) byte pair with both bytes in the
 *               range 161..254, in the form
 *               "<lead byte><trail byte>,<lead value>,<trail value>".
 *
 *  Parameters : sFileName: the file name for the output GB2312 list
 *
 *  Returns    : true if every line was written and the file was closed
 *               without error; false if sFileName is NULL, the file
 *               cannot be opened, or a write/flush fails
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
bool GB2312_Generate(char *sFileName)
{
   FILE *fp;
   int nLead, nTrail;
   bool bOk = true;

   if (sFileName == NULL)
       return false;
   if ((fp = fopen(sFileName, "wt")) == NULL)
       return false; //fail while opening the file

   for (nLead = 161; bOk && nLead < 255; nLead++)
   {
       for (nTrail = 161; bOk && nTrail < 255; nTrail++)
       {
           //stop at the first failed write instead of carrying on silently
           if (fprintf(fp, "%c%c,%d,%d\n", nLead, nTrail, nLead, nTrail) < 0)
               bOk = false;
       }
   }
   if (ferror(fp))
       bOk = false;
   //fclose flushes buffered output, so a late write failure is reported here
   if (fclose(fp) != 0)
       bOk = false;
   return bOk;
}
/*********************************************************************
 *
 *  Func Name  : CC_Generate
 *
 *  Description: Generate the Chinese char list file. One text line is
 *               written for every (lead, trail) byte pair with the lead
 *               byte in 176..254 and the trail byte in 161..254, in the
 *               form "<lead byte><trail byte>,<lead value>,<trail value>".
 *               NOTE(review): other routines in this file treat only
 *               lead bytes 176..247 as Chinese; confirm that listing
 *               lead bytes up to 254 is intended.
 *
 *  Parameters : sFileName: the file name for the output CC list
 *
 *  Returns    : true if every line was written and the file was closed
 *               without error; false if sFileName is NULL, the file
 *               cannot be opened, or a write/flush fails
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
bool CC_Generate(char *sFileName)
{
   FILE *fp;
   int nLead, nTrail;
   bool bOk = true;

   if (sFileName == NULL)
       return false;
   if ((fp = fopen(sFileName, "wt")) == NULL)
       return false; //fail while opening the file

   for (nLead = 176; bOk && nLead < 255; nLead++)
   {
       for (nTrail = 161; bOk && nTrail < 255; nTrail++)
       {
           //stop at the first failed write instead of carrying on silently
           if (fprintf(fp, "%c%c,%d,%d\n", nLead, nTrail, nLead, nTrail) < 0)
               bOk = false;
       }
   }
   if (ferror(fp))
       bOk = false;
   //fclose flushes buffered output, so a late write failure is reported here
   if (fclose(fp) != 0)
       bOk = false;
   return bOk;
}
/*********************************************************************
 *
 *  Func Name  : CC_Find
 *
 *  Description: Find a sub-string in a string of two-byte characters.
 *               Only matches that start at an even byte offset are
 *               accepted, so a hit that straddles two adjacent
 *               characters is skipped and the search continues with
 *               the next occurrence.
 *
 *  Parameters :  string:Null-terminated string to search
 *
 *                   strCharSet:Null-terminated string to search for
 *
 *  Returns    : pointer to the first occurrence of strCharSet that
 *               starts at an even offset in string, or NULL if there is
 *               none (or if either argument is NULL)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
char *CC_Find(const char *string, const char *strCharSet)
{
   const char *cp;

   if (string == NULL || strCharSet == NULL)
       return NULL;

   cp = strstr(string, strCharSet);
   //An odd offset means the hit begins on the second byte of a character.
   //Do not give up there: a properly aligned occurrence may follow.
   while (cp != NULL && (cp - string) % 2 != 0)
   {
       cp = strstr(cp + 1, strCharSet);
   }
   return (char *)cp;
}
/*********************************************************************
 *
 *  Func Name  : charType
 *
 *  Description: Judge the type of the character starting at sChar,
 *               looking at sChar[0] and, when sChar[0] is 163, at
 *               sChar[1] as well.
 *
 *  Parameters : sChar: pointer to the first byte of the character;
 *                      sChar[1] must be readable when sChar[0] is 163
 *
 *  Returns    : int : the type of char, one of CT_DELIMITER, CT_SINGLE,
 *               CT_INDEX, CT_NUM, CT_LETTER, CT_CHINESE, CT_OTHER
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
int charType(unsigned char *sChar)
{
  const unsigned char cLead = sChar[0];
  unsigned char cTrail;

  if (cLead < 128)
  {
      //Single-byte character. strchr also matches the terminating NUL of
      //the set, so a 0 byte is reported as CT_DELIMITER too.
      return strchr("\042!,.?()[]{}+=", (int)cLead) != NULL ? CT_DELIMITER : CT_SINGLE;
  }

  switch (cLead)
  {
  case 162:
      return CT_INDEX;
  case 163:
      //The second byte decides between number, letter and delimiter.
      cTrail = sChar[1];
      if (cTrail > 175 && cTrail < 186)
          return CT_NUM;
      if ((cTrail >= 193 && cTrail <= 218) || (cTrail >= 225 && cTrail <= 250))
          return CT_LETTER;
      return CT_DELIMITER;
  case 161:
      return CT_DELIMITER;
  default:
      break;
  }

  if (cLead >= 176 && cLead <= 247)
      return CT_CHINESE;
  return CT_OTHER;
}
/*********************************************************************
 *
 *  Func Name  : GetCCPrefix
 *
 *  Description: Get the length of the longest prefix made up of complete
 *               two-byte Chinese characters (lead byte 176..247).
 *
 *  Parameters : sSentence: the original sentence which includes Chinese or Non-Chinese char
 *
 *  Returns    : the byte offset just past the Chinese prefix; always
 *               even and never greater than the length of sSentence
 *               (0 when sSentence is NULL or does not start with a
 *               Chinese character)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-8
 *********************************************************************/
unsigned int  GetCCPrefix(unsigned char *sSentence)
{
   unsigned int nLen, nCurPos = 0;

   if (sSentence == NULL)
       return 0;
   nLen = (unsigned int)strlen((const char *)sSentence);

   //Require a trail byte as well: a dangling lead byte at the very end
   //must not push the returned offset past the end of the string.
   while (nCurPos + 1 < nLen && sSentence[nCurPos] > 175 && sSentence[nCurPos] < 248)
   {
      nCurPos += 2; //Get next Chinese Char
   }
   return nCurPos;
}
/*********************************************************************
 *
 *  Func Name  : IsAllChinese
 *
 *  Description: Judge whether the string is made up only of two-byte
 *               characters whose lead byte is in the range 176..247.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if every byte pair starts with such a lead byte and
 *               no byte is left over (the empty string gives true);
 *               false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllChinese(unsigned char *sString)
{
    unsigned int nTotal = strlen((const char *)sString);
    unsigned int nPos = 0;

    for (;;)
    {
        //Need a complete byte pair to go on.
        if (nPos + 1 >= nTotal)
            break;
        //Lead byte outside 176..247: not a Chinese character.
        if (sString[nPos] <= 175 || sString[nPos] >= 248)
            break;
        nPos += 2;
    }
    //Anything left unconsumed means a non-Chinese or truncated character.
    return nPos >= nTotal;
}
/*********************************************************************
 *
 *  Func Name  : IsAllNonChinese
 *
 *  Description: Judge whether the string contains no Chinese character,
 *               i.e. no character whose lead byte is in 176..247.
 *               Bytes above 128 are taken as the start of a two-byte
 *               character and the following byte is skipped.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : false as soon as a Chinese lead byte is met at a
 *               character boundary, true otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllNonChinese(unsigned char *sString)
{
    unsigned int nTotal = strlen((const char *)sString);
    unsigned int nPos;
    unsigned char cLead = 0;

    //The step depends on the byte just examined: 2 for a double-byte
    //lead (value above 128), 1 otherwise.
    for (nPos = 0; nPos < nTotal; nPos += (cLead > 128) ? 2 : 1)
    {
        cLead = sString[nPos];
        if (cLead > 175 && cLead < 248)
            return false;
    }
    return true;
}
/*********************************************************************
 *
 *  Func Name  : IsAllSingleByte
 *
 *  Description: Judge whether the string is made up only of single-byte
 *               characters, i.e. every byte is below 128.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if no byte is 128 or above (the empty string gives
 *               true), false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllSingleByte(unsigned char *sString)
{
    const unsigned char *pByte;

    for (pByte = sString; *pByte != 0; pByte++)
    {
        if (*pByte >= 128)
            return false; //found a byte that cannot be a single-byte char
    }
    return true;
}
/*********************************************************************
 *
 *  Func Name  : IsAllNum
 *
 *  Description: Judge whether the whole string reads as one number.
 *               The string is scanned left to right through these
 *               optional parts, in this order:
 *                 1. a sign token (searched for in "±+—-+")
 *                 2. a run of double-byte digits (lead byte 163,
 *                    trail byte 176..185)
 *                 3. a separator token followed by another run of
 *                    double-byte digits
 *                 4. a run of single-byte digits '0'..'9'
 *                 5. a separator token followed by another run of
 *                    single-byte digits
 *                 6. one trailing unit/percent token
 *               A token is one byte, or two bytes when the first byte
 *               compares below 0 as a char.
 *               NOTE(review): the "<0" test only detects double-byte
 *               lead bytes where plain char is signed.
 *               NOTE(review): the token tables are string literals, so
 *               matching presumably requires the source file to be
 *               compiled in the same double-byte encoding as the input;
 *               confirm.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if the scan consumes the whole string, false if
 *               anything is left over
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllNum(unsigned char *sString)
{
    unsigned int nLen=strlen((const char *)sString),i=0;
    char sChar[3];//current token: one or two bytes plus terminator
    sChar[2]=0;
    if(i<nLen)//Get prefix such as + -
    {
        sChar[0]=sString[i++];
        if(sChar[0]<0)//high bit set (char signed): take the second byte too
            sChar[1]=sString[i++];
        else
            sChar[1]=0;
        if(!strstr("±+—-+",sChar))
        {
            i=0;//not a sign: rewind to the start of the string
        }
    }
    //Part 2: run of double-byte digits
    while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
    {
        i+=2;
    }
    if(i<nLen)//Get middle delimiter such as .
    {
        sChar[0]=sString[i++];
        if(sChar[0]<0)//high bit set (char signed): take the second byte too
            sChar[1]=sString[i++];
        else
            sChar[1]=0;
        if(CC_Find("∶·.",sChar)||sChar[0]=='.')
        {//e.g. 98.1% : digits after the separator
            while(i<nLen-1&&sString[i]==163&&sString[i+1]>175&&sString[i+1]<186)
            {
                i+=2;
            }
        }    
        else
        {
            i-=strlen(sChar);//not a separator: put the token back
        }
    }

    if(i>=nLen)//everything consumed by the double-byte form
        return true;
    //Part 4: run of single-byte digits
    while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
    {//single byte number char
        i+=1;
    }
    if(i<nLen)//Get middle delimiter such as .
    {
        sChar[0]=sString[i++];
        if(sChar[0]<0)//high bit set (char signed): take the second byte too
            sChar[1]=sString[i++];
        else
            sChar[1]=0;
        if(CC_Find("∶·.",sChar)||sChar[0]=='.')
        {//e.g. 98.1% : digits after the separator
            while(i<nLen&&sString[i]>'0'-1&&sString[i]<'9'+1)
            {
                i+=1;
            }
        }    
        else
        {
            i-=strlen(sChar);//not a separator: put the token back
        }
    }
    if(i<nLen)//Get the trailing unit or percent token
    {
        sChar[0]=sString[i++];
        if(sChar[0]<0)//high bit set (char signed): take the second byte too
            sChar[1]=sString[i++];
        else
            sChar[1]=0;
        if(!CC_Find("百千万亿佰仟%‰",sChar)&&sChar[0]!='%')
            i-=strlen(sChar);//not a unit: put the token back
    }
    if(i>=nLen)
        return true;
    return false;
}
/*********************************************************************
 *
 *  Func Name  : IsAllIndex
 *
 *  Description: Judge whether the string is made up only of index
 *               characters: a run of two-byte characters whose lead
 *               byte is 162, optionally followed by a run of
 *               single-byte letters A-Z / a-z.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if the two runs together cover the whole string
 *               (the empty string gives true), false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllIndex(unsigned char *sString)
{
    unsigned int nLen = (unsigned int)strlen((const char *)sString), i = 0;

    //"i + 1 < nLen" rather than "i < nLen - 1": the latter wraps around
    //for an empty string because nLen is unsigned.
    while (i + 1 < nLen && sString[i] == 162)
    {
        i += 2;
    }

    //The bounds check must guard both letter ranges, hence the extra
    //parentheses around the whole "upper or lower" test.
    while (i < nLen &&
           ((sString[i] >= 'A' && sString[i] <= 'Z') ||
            (sString[i] >= 'a' && sString[i] <= 'z')))
    {//single byte letter char
        i += 1;
    }

    return i >= nLen;
}
/*********************************************************************
 *
 *  Func Name  : IsAllLetter
 *
 *  Description: Judge whether the string is made up only of double-byte
 *               letters: lead byte 163 with a trail byte in 193..218 or
 *               225..250.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if every byte pair is such a letter and no byte is
 *               left over (the empty string gives true), false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllLetter(unsigned char *sString)
{
    unsigned int nTotal = strlen((const char *)sString);
    unsigned int nPos = 0;
    unsigned char cTrail;

    while (nPos + 1 < nTotal)
    {
        if (sString[nPos] != 163)
            break;
        cTrail = sString[nPos + 1];
        //Two trail-byte ranges are accepted; anything else ends the run.
        if (!((cTrail >= 193 && cTrail <= 218) || (cTrail >= 225 && cTrail <= 250)))
            break;
        nPos += 2;
    }
    return nPos >= nTotal;
}
/*********************************************************************
 *
 *  Func Name  : IsAllDelimiter
 *
 *  Description: Judge whether the string is made up only of two-byte
 *               characters whose lead byte is 161 or 163. Single-byte
 *               characters are never accepted here.
 *
 *  Parameters : sString: the NUL-terminated string to test
 *
 *  Returns    : true if every byte pair starts with 161 or 163 and no
 *               byte is left over (the empty string gives true),
 *               false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-24
 *********************************************************************/
bool IsAllDelimiter(unsigned char *sString)
{
    unsigned int nTotal = strlen((const char *)sString);
    unsigned int nPos = 0;
    unsigned char cLead;

    while (nPos + 1 < nTotal)
    {
        cLead = sString[nPos];
        if (cLead != 161 && cLead != 163)
            break;
        nPos += 2;
    }
    return nPos >= nTotal;
}
/*********************************************************************
 *
 *  Func Name  : BinarySearch
 *
 *  Description: Lookup the index of nVal in the table nTable which length is nTableLen
 *
 *  Parameters : nVal: the value to look for
 *               nTable: the table to search; the search assumes it is
 *                       sorted in ascending order
 *               nTableLen: the number of entries in nTable
 *
 *  Returns    : the index of an entry equal to nVal, or -1 if there is
 *               none (also -1 for a NULL or empty table)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-25
 *********************************************************************/
int BinarySearch(int nVal, int *nTable,int nTableLen)
{
    int nStart, nEnd, nMid;

    if (nTable == NULL || nTableLen <= 0)
        return -1;

    nStart = 0;
    nEnd = nTableLen - 1;
    while (nStart <= nEnd) //Binary search
    {
        //Written this way so the midpoint cannot overflow int, which
        //(nStart+nEnd)/2 can do for very large tables.
        nMid = nStart + (nEnd - nStart) / 2;
        if (nTable[nMid] == nVal)
        {
            return nMid; //find it
        }
        if (nTable[nMid] < nVal)
        {
            nStart = nMid + 1;
        }
        else
        {
            nEnd = nMid - 1;
        }
    }
    return -1; //Can not find it;
}
/*********************************************************************
 *
 *  Func Name  : IsForeign
 *
 *  Description: Decide whether the word is a foreign word, based on its
 *               byte length and on the count reported by
 *               GetForeignCharCount.
 *               NOTE(review): as written, every word longer than two
 *               bytes is reported as foreign whatever the count is;
 *               confirm that "||" rather than "&&" is intended.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : true if the word is longer than 2 bytes, or if the
 *               foreign char count is at least half of the byte length;
 *               false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-26
 *********************************************************************/
bool IsForeign(char *sWord)
{
  const int nForeign = GetForeignCharCount(sWord);
  const int nBytes = strlen(sWord);

  if (nBytes > 2)
      return true;
  //Integer division: the threshold is 0 for a 0/1-byte word, 1 for 2 bytes.
  return nForeign >= nBytes / 2;
}
/*********************************************************************
 *
 *  Func Name  : IsAllForeign
 *
 *  Description: Decide whether the word consists only of foreign
 *               (transliteration) characters, i.e. whether twice the
 *               count reported by GetForeignCharCount equals the byte
 *               length of the word.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : true if 2 * foreign char count == strlen(sWord),
 *               false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-3-25
 *********************************************************************/
bool IsAllForeign(char *sWord)
{
  const unsigned int nForeign = (unsigned int)GetForeignCharCount(sWord);
  const unsigned int nForeignBytes = 2 * nForeign; //two bytes per counted char

  return nForeignBytes == strlen(sWord);
}
/*********************************************************************
 *
 *  Func Name  : IsAllChineseNum
 *
 *  Description: Decide whether the word is a Chinese number word. The
 *               word is read two bytes at a time; every pair must be in
 *               the number table, except that the first pair may also be
 *               one of the prefix characters, and the fixed sequence
 *               compared against the "fraction" literal (4 bytes) is
 *               skipped as a whole.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : true if every pair is accepted (the empty word gives
 *               true), false at the first pair that is not
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-26
 *********************************************************************/
bool IsAllChineseNum(char *sWord)
{//Example sentence: "5.6 percent of people get up at 8:18 in the morning"
  char ChineseNum[]="零○一二三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";//
  char sPrefix[]="几第数两";
  char sPair[3];
  const size_t nBytes = strlen(sWord);
  unsigned int nPos = 0;

  while (nPos < nBytes)
  {
     //Fraction marker, as in "five percent": skip both characters at once.
     if (strncmp(sWord + nPos, "分之", 4) == 0)
     {
        nPos += 4;
        continue;
     }

     strncpy(sPair, sWord + nPos, 2);
     sPair[2] = '\0';

     if (CC_Find(ChineseNum, sPair) == NULL)
     {
        //Only the very first character may come from the prefix table.
        if (nPos != 0 || CC_Find(sPrefix, sPair) == NULL)
            return false;
     }
     nPos += 2;
  }
  return true;
}
/*********************************************************************
 *
 *  Func Name  : GetForeignCharCount
 *
 *  Description: Count the chars of sWord found in each of the three
 *               transliteration char sets (TRANS_ENGLISH,
 *               TRANS_JAPANESE, TRANS_RUSSIAN) with GetCharCount and
 *               keep the largest of the three counts.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : the largest of the three counts
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-4-4
 *              2.Modify  2002-5-21
 *********************************************************************/
int GetForeignCharCount(char *sWord)
{
  char *apCharSet[3] = { TRANS_ENGLISH, TRANS_JAPANESE, TRANS_RUSSIAN };
  unsigned int nBest, nHits, n;

  nBest = GetCharCount(apCharSet[0], sWord); //English char counts
  for (n = 1; n < 3; n++)
  {
      nHits = GetCharCount(apCharSet[n], sWord); //Japanese, then Russian
      if (nHits >= nBest)
          nBest = nHits;
  }
  return nBest;
}
/*********************************************************************
 *
 *  Func Name  : GetCharCount
 *
 *  Description: Get the count of two-byte chars which are in sWord and
 *               in sCharSet. Single-byte chars (below 128) in sWord are
 *               skipped; each following byte pair is looked up in
 *               sCharSet with CC_Find.
 *
 *  Parameters : sCharSet: the set of two-byte chars to look for
 *               sWord: the word
 * 
 *  Returns    : COUNT of two-byte chars of sWord found in sCharSet
 *               (0 if either argument is NULL)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-5-21
 *********************************************************************/
int GetCharCount(char *sCharSet,char *sWord)
{
  size_t nLen, k;
  char tchar[3];
  int nCount = 0;

  if (sCharSet == NULL || sWord == NULL)
      return 0;

  nLen = strlen(sWord); //the word is not modified, so measure it once
  k = 0;
  while (k < nLen)
  {
     //Compare as unsigned so the test does not depend on whether plain
     //char is signed on this platform.
     if ((unsigned char)sWord[k] < 128)
     {
        k++; //single-byte char: never part of a two-byte set
        continue;
     }
     if (k + 1 >= nLen)
        break; //dangling lead byte: not a complete char, do not count it

     tchar[0] = sWord[k];
     tchar[1] = sWord[k + 1];
     tchar[2] = '\0';
     //tchar is never empty here. An empty token would match any set
     //(strstr finds "" at offset 0) and inflate the count.
     if (CC_Find(sCharSet, tchar))
        nCount++;
     k += 2;
  }
  return nCount;
}
/*********************************************************************
 *
 *  Func Name  : GetForeignType
 *
 *  Description: Return the foreign type of the word: the type whose
 *               transliteration char set (English, Russian, Japanese,
 *               tried in that order) gives the highest GetCharCount.
 *               A later set replaces an earlier one only when its count
 *               is strictly higher, so ties keep the earlier type.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : TT_ENGLISH, TT_RUSSIAN or TT_JAPANESE
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-4-4
 *              2.Modify  2002-5-21
 *********************************************************************/
int GetForeignType(char *sWord)
{
  char *apCharSet[3] = { TRANS_ENGLISH, TRANS_RUSSIAN, TRANS_JAPANESE };
  unsigned int anType[3] = { TT_ENGLISH, TT_RUSSIAN, TT_JAPANESE };
  unsigned int nBest, nHits, nType, n;

  nBest = GetCharCount(apCharSet[0], sWord); //English char counts
  nType = anType[0];
  for (n = 1; n < 3; n++)
  {
      nHits = GetCharCount(apCharSet[n], sWord); //Russian, then Japanese
      if (nHits > nBest)
      {
          nBest = nHits;
          nType = anType[n];
      }
  }
  return nType;
}
/*********************************************************************
 *
 *  Func Name  : PostfixSplit
 *
 *  Description: Split a word into its stem and its postfix. The
 *               multi-char postfixes of POSTFIX_MUTIPLE are tried first,
 *               in table order (the table is expected to end with an
 *               empty entry). If none matches, the last two bytes of the
 *               word are accepted as postfix when CC_Find locates them
 *               in POSTFIX_SINGLE.
 *
 *  Parameters : sWord: the word to split
 *               sWordRet: output buffer for the word without its postfix;
 *                         must hold strlen(sWord)+1 bytes
 *               sPostfix: output buffer for the postfix (empty string if
 *                         none was found); must hold at least 9 bytes
 *
 *  Returns    : true once the split is done (also when no postfix was
 *               found); false if any argument is NULL
 *********************************************************************/
bool PostfixSplit(char *sWord, char *sWordRet, char *sPostfix)
{
    char sSinglePostfix[]=POSTFIX_SINGLE;
    char sMultiPostfix[][9]=POSTFIX_MUTIPLE;
    size_t nPostfixLen = 0, nWordLen, nCandLen;
    int i;

    if (sWord == NULL || sWordRet == NULL || sPostfix == NULL)
        return false;
    nWordLen = strlen(sWord);

    for (i = 0; sMultiPostfix[i][0] != 0; i++)
    {//Try to get the postfix of an address
        nCandLen = strlen(sMultiPostfix[i]);
        //A candidate longer than the word cannot match. Comparing anyway
        //would read memory before the start of sWord.
        if (nCandLen <= nWordLen &&
            strncmp(sWord + nWordLen - nCandLen, sMultiPostfix[i], nCandLen) == 0)
            break;
    }
    strcpy(sPostfix, sMultiPostfix[i]);
    nPostfixLen = strlen(sMultiPostfix[i]); //Get the length of place postfix

    //Fall back to a single two-byte postfix, but only when the word has
    //two bytes to look at.
    if (nPostfixLen == 0 && nWordLen >= 2)
    {
        sPostfix[2] = 0;
        strncpy(sPostfix, sWord + nWordLen - 2, 2);
        if (CC_Find(sSinglePostfix, sPostfix))
            nPostfixLen = 2;
    }

    strncpy(sWordRet, sWord, nWordLen - nPostfixLen);
    sWordRet[nWordLen - nPostfixLen] = 0; //Get the place name with the postfix erased
    sPostfix[nPostfixLen] = 0;
    return true;
}