#include "stdafx.h"
#include "Dictionary.h"
#include "Utility.h"
#include <string.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
// Builds an empty dictionary: the modification overlay is absent and every
// byte of the index table is zeroed.
CDictionary::CDictionary()
    : m_pModifyTable(NULL)
{
    memset(&m_IndexTable[0],0,sizeof(m_IndexTable));
}
// Releases every word string and item array held by the index table, then
// discards the modification overlay.
CDictionary::~CDictionary()
{
    for(int i=0;i<CC_NUM;i++)
    {
        // sWord buffers are created with new char[], so they must be
        // released with the array form of delete.
        for(int j=0;j<m_IndexTable[i].nCount;j++)
            delete [] m_IndexTable[i].pWordItemHead[j].sWord;
        delete [] m_IndexTable[i].pWordItemHead;
    }
    DelModified();
}
// Replaces the current dictionary content with the content of a binary
// dictionary file.
//
// File layout as read here: for each of the CC_NUM tables, one int holding
// the item count, followed by that many records. A record is three ints
// (frequency, word length, handle) followed by `word length` bytes of word
// text without a terminator.
//
// sFilename: path of the file to read.
// bReset:    when true every loaded frequency is stored as 0 instead of the
//            value found in the file.
//
// Returns false when the file cannot be opened or is truncated/malformed.
// On a malformed file the items that were read completely remain loaded and
// all remaining tables are left empty; the previous content and the
// modification overlay are discarded in every case once the file is open.
bool CDictionary::Load(char *sFilename,bool bReset)
{
    FILE *fp=fopen(sFilename,"rb");
    if(fp==NULL)
        return false;

    int i,j,nBuffer[3];

    // Drop the old content and leave every table in a valid empty state so
    // that an early exit below cannot expose dangling pointers.
    for(i=0;i<CC_NUM;i++)
    {
        for(j=0;j<m_IndexTable[i].nCount;j++)
            delete [] m_IndexTable[i].pWordItemHead[j].sWord;
        delete [] m_IndexTable[i].pWordItemHead;
        m_IndexTable[i].pWordItemHead=NULL;
        m_IndexTable[i].nCount=0;
    }
    DelModified();

    bool bOk=true;
    for(i=0;i<CC_NUM&&bOk;i++)
    {
        int nItems=0;
        if(fread(&nItems,sizeof(int),1,fp)!=1)
        {
            bOk=false;
            break;
        }
        if(nItems<=0)
            continue;// table stays empty

        m_IndexTable[i].pWordItemHead=new WORD_ITEM[nItems];
        for(j=0;j<nItems;j++)
        {
            // nBuffer: [0]=frequency, [1]=word length, [2]=handle
            if(fread(nBuffer,sizeof(int),3,fp)!=3||nBuffer[1]<0)
            {
                bOk=false;
                break;
            }
            char *sText=new char[nBuffer[1]+1];
            if(nBuffer[1]>0&&fread(sText,sizeof(char),nBuffer[1],fp)!=(size_t)nBuffer[1])
            {
                delete [] sText;
                bOk=false;
                break;
            }
            sText[nBuffer[1]]=0;

            m_IndexTable[i].pWordItemHead[j].sWord=sText;
            m_IndexTable[i].pWordItemHead[j].nFrequency=bReset?0:nBuffer[0];
            m_IndexTable[i].pWordItemHead[j].nWordLen=nBuffer[1];
            m_IndexTable[i].pWordItemHead[j].nHandle=nBuffer[2];

            // nCount only ever covers fully initialised items, so cleanup
            // code never touches an unset sWord pointer.
            m_IndexTable[i].nCount=j+1;
        }
    }
    fclose(fp);
    return bOk;
}
bool CDictionary::Save(char *sFilename)
{
FILE *fp;
int i,j,nCount,nBuffer[3];
PWORD_CHAIN pCur;
if((fp=fopen(sFilename,"wb"))==NULL)
return false;
for(i=0;i<CC_NUM;i++)
{
pCur=NULL;
if(m_pModifyTable)
{
nCount=m_IndexTable[i].nCount+m_pModifyTable[i].nCount-m_pModifyTable[i].nDelete;
fwrite(&nCount,sizeof(int),1,fp);
pCur=m_pModifyTable[i].pWordItemHead;
j=0;
while(pCur!=NULL&&j<m_IndexTable[i].nCount)
{
if(strcmp(pCur->data.sWord,m_IndexTable[i].pWordItemHead[j].sWord)<0||(strcmp(pCur->data.sWord,m_IndexTable[i].pWordItemHead[j].sWord)==0&&pCur->data.nHandle<m_IndexTable[i].pWordItemHead[j].nHandle))
{
nBuffer[0]=pCur->data.nFrequency;
nBuffer[1]=pCur->data.nWordLen;
nBuffer[2]=pCur->data.nHandle;
fwrite(nBuffer,sizeof(int),3,fp);
if(nBuffer[1])
fwrite(pCur->data.sWord,sizeof(char),nBuffer[1],fp);
pCur=pCur->next;
}
else if(m_IndexTable[i].pWordItemHead[j].nFrequency==-1)
{
j+=1;
}
else if(strcmp(pCur->data.sWord,m_IndexTable[i].pWordItemHead[j].sWord)>0||(strcmp(pCur->data.sWord,m_IndexTable[i].pWordItemHead[j].sWord)==0&&pCur->data.nHandle>m_IndexTable[i].pWordItemHead[j].nHandle))
{
nBuffer[0]=m_IndexTable[i].pWordItemHead[j].nFrequency;
nBuffer[1]=m_IndexTable[i].pWordItemHead[j].nWordLen;
nBuffer[2]=m_IndexTable[i].pWordItemHead[j].nHandle;
fwrite(nBuffer,sizeof(int),3,fp);
if(nBuffer[1])
fwrite(m_IndexTable[i].pWordItemHead[j].sWord,sizeof(char),nBuffer[1],fp);
j+=1;
}
}
if(j<m_IndexTable[i].nCount)
{
while(j<m_IndexTable[i].nCount)
{
if(m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
{
nBuffer[0]=m_IndexTable[i].pWordItemHead[j].nFrequency;
nBuffer[1]=m_IndexTable[i].pWordItemHead[j].nWordLen;
nBuffer[2]=m_IndexTable[i].pWordItemHead[j].nHandle;
fwrite(nBuffer,sizeof(int),3,fp);
if(nBuffer[1])
fwrite(m_IndexTable[i].pWordItemHead[j].sWord,sizeof(char),nBuffer[1],fp);
}
j+=1;
}
}
else
while(pCur!=NULL)
{
nBuffer[0]=pCur->data.nFrequency;
nBuffer[1]=pCur->data.nWordLen;
nBuffer[2]=pCur->data.nHandle;
fwrite(nBuffer,sizeof(int),3,fp);
if(nBuffer[1])
fwrite(pCur->data.sWord,sizeof(char),nBuffer[1],fp);
pCur=pCur->next;
}
}
else
{
fwrite(&m_IndexTable[i].nCount,sizeof(int),1,fp);
j=0;
while(j<m_IndexTable[i].nCount)
{
nBuffer[0]=m_IndexTable[i].pWordItemHead[j].nFrequency;
nBuffer[1]=m_IndexTable[i].pWordItemHead[j].nWordLen;
nBuffer[2]=m_IndexTable[i].pWordItemHead[j].nHandle;
fwrite(nBuffer,sizeof(int),3,fp);
if(nBuffer[1])
fwrite(m_IndexTable[i].pWordItemHead[j].sWord,sizeof(char),nBuffer[1],fp);
j+=1;
}
}
}
fclose(fp);
return true;
}
bool CDictionary::AddItem(char *sWord, int nHandle,int nFrequency)
{
char sWordAdd[WORD_MAXLENGTH-2];
int nPos,nFoundPos;
PWORD_CHAIN pRet,pTemp,pNext;
int i=0;
if(!PreProcessing(sWord, &nPos,sWordAdd,true))
return false;
if(FindInOriginalTable(nPos,sWordAdd,nHandle,&nFoundPos))
{
if(m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency==-1)
{
m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency=nFrequency;
if(!m_pModifyTable)
{
m_pModifyTable=new MODIFY_TABLE[CC_NUM];
memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
}
m_pModifyTable[nPos].nDelete-=1;
}
else
m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency+=nFrequency;
return true;
}
if(!m_pModifyTable)
{
m_pModifyTable=new MODIFY_TABLE[CC_NUM];
memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
}
if(FindInModifyTable(nPos,sWordAdd,nHandle,&pRet))
{
if(pRet!=NULL)
pRet=pRet->next;
else
pRet=m_pModifyTable[nPos].pWordItemHead;
pRet->data.nFrequency+=nFrequency;
return true;
}
pTemp=new WORD_CHAIN;
if(pTemp==NULL)
return false;
memset(pTemp,0,sizeof(WORD_CHAIN));
pTemp->data.nHandle=nHandle;
pTemp->data.nWordLen=strlen(sWordAdd);
pTemp->data.sWord=new char[1+pTemp->data.nWordLen];
strcpy(pTemp->data.sWord,sWordAdd);
pTemp->data.nFrequency=nFrequency;
pTemp->next=NULL;
if(pRet!=NULL)
{
pNext=pRet->next;
pRet->next=pTemp;
}
else
{
pNext=m_pModifyTable[nPos].pWordItemHead;
m_pModifyTable[nPos].pWordItemHead=pTemp;
}
pTemp->next=pNext;
m_pModifyTable[nPos].nCount++;
return true;
}
bool CDictionary::DelItem(char *sWord,int nHandle)
{
char sWordDel[WORD_MAXLENGTH-2];
int nPos,nFoundPos,nTemp;
PWORD_CHAIN pPre,pTemp,pCur;
if(!PreProcessing(sWord, &nPos,sWordDel))
return false;
if(FindInOriginalTable(nPos,sWordDel,nHandle,&nFoundPos))
{
if(!m_pModifyTable)
{
m_pModifyTable=new MODIFY_TABLE[CC_NUM];
memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
}
m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency=-1;
m_pModifyTable[nPos].nDelete+=1;
if(nHandle==-1)
{
nTemp=nFoundPos+1;
while(nTemp<m_IndexTable[nPos].nCount&&strcmp(m_IndexTable[nPos].pWordItemHead[nFoundPos].sWord,sWordDel)==0)
{
m_IndexTable[nPos].pWordItemHead[nTemp].nFrequency=-1;
m_pModifyTable[nPos].nDelete+=1;
nTemp+=1;
}
}
return true;
}
if(FindInModifyTable(nPos,sWordDel,nHandle,&pPre))
{
pCur=m_pModifyTable[nPos].pWordItemHead;
if(pPre!=NULL)
pCur=pPre->next;
while(pCur!=NULL && _stricmp(pCur->data.sWord,sWordDel)==0&&(pCur->data.nHandle==nHandle||nHandle<0))
{
pTemp=pCur;
if(pPre!=NULL)
pPre->next=pCur->next;
else
m_pModifyTable[nPos].pWordItemHead=pCur->next;
pCur=pCur->next;
delete pTemp->data.sWord;
delete pTemp;
}
return true;
}
return false;
}
// Frees the whole modification overlay (all chains and the table itself) and
// resets m_pModifyTable to NULL. Does nothing when no overlay exists.
// Always returns true.
bool CDictionary::DelModified()
{
    PWORD_CHAIN pTemp,pCur;
    if(!m_pModifyTable)
        return true;
    for(int i=0;i<CC_NUM;i++)
    {
        pCur=m_pModifyTable[i].pWordItemHead;
        while(pCur!=NULL)
        {
            pTemp=pCur;
            pCur=pCur->next;
            delete [] pTemp->data.sWord;// allocated with new char[]
            delete pTemp;
        }
    }
    delete [] m_pModifyTable;
    m_pModifyTable=NULL;
    return true;
}
bool CDictionary::IsExist(char *sWord, int nHandle)
{
char sWordFind[WORD_MAXLENGTH-2];
int nPos;
if(!PreProcessing(sWord, &nPos,sWordFind))
return false;
return(FindInOriginalTable(nPos,sWordFind,nHandle)||FindInModifyTable(nPos,sWordFind,nHandle));
}
bool CDictionary::GetHandle(char *sWord,int *pnCount,int *pnHandle,int *pnFrequency)
{
char sWordGet[WORD_MAXLENGTH-2];
int nPos,nFoundPos,nTemp;
PWORD_CHAIN pPre,pCur;
*pnCount=0;
if(!PreProcessing(sWord, &nPos,sWordGet))
return false;
if(FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos))
{
pnHandle[*pnCount]=m_IndexTable[nPos].pWordItemHead[nFoundPos].nHandle;
pnFrequency[*pnCount]=m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency;
*pnCount+=1;
nTemp=nFoundPos+1;
while(nTemp<m_IndexTable[nPos].nCount&&strcmp(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==0)
{
pnHandle[*pnCount]=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
pnFrequency[*pnCount]=m_IndexTable[nPos].pWordItemHead[nTemp].nFrequency;
*pnCount+=1;
nTemp+=1;
}
return true;
}
if(FindInModifyTable(nPos,sWordGet,-1,&pPre))
{
pCur=m_pModifyTable[nPos].pWordItemHead;
if(pPre!=NULL)
pCur=pPre->next;
while(pCur!=NULL && _stricmp(pCur->data.sWord,sWordGet)==0)
{
pnHandle[*pnCount]=pCur->data.nHandle;
pnFrequency[*pnCount]=pCur->data.nFrequency;
*pnCount+=1;
pCur=pCur->next;
}
return true;
}
return false;
}
bool CDictionary::FindInOriginalTable(int nInnerCode,char *sWord,int nHandle,int *nPosRet)
{
PWORD_ITEM pItems=m_IndexTable[nInnerCode].pWordItemHead;
int nStart=0,nEnd=m_IndexTable[nInnerCode].nCount-1,nMid=(nStart+nEnd)/2,nCount=0,nCmpValue;
while(nStart<=nEnd)
{
nCmpValue=strcmp(pItems[nMid].sWord,sWord);
if(nCmpValue==0&&(pItems[nMid].nHandle==nHandle||nHandle==-1))
{
if(nPosRet)
{
if(nHandle==-1)
{
nMid-=1;
while(nMid>=0&&strcmp(pItems[nMid].sWord,sWord)==0)
nMid--;
if(nMid<0||strcmp(pItems[nMid].sWord,sWord)!=0)
nMid++;
}
*nPosRet=nMid;
return true;
}
if(nPosRet)
*nPosRet=nMid;
return true;
}
else if(nCmpValue<0||(nCmpValue==0&&pItems[nMid].nHandle<nHandle&&nHandle!=-1))
{
nStart=nMid+1;
}
else if(nCmpValue>0||(nCmpValue==0&&pItems[nMid].nHandle>nHandle&&nHandle!=-1))
{
nEnd=nMid-1;
}
nMid=(nStart+nEnd)/2;
}
if(nPosRet)
{
*nPosRet=nMid-1;
}
return false;
}
bool CDictionary::FindInModifyTable(int nInnerCode,char *sWord,int nHandle,PWORD_CHAIN *pFindRet)
{
PWORD_CHAIN pCur,pPre;
if(m_pModifyTable==NULL)
return false;
pCur=m_pModifyTable[nInnerCode].pWordItemHead;
pPre=NULL;
while(pCur!=NULL&&(_stricmp(pCur->data.sWord,sWord)<0||(_stricmp(pCur->data.sWord,sWord)==0&&pCur->data.nHandle<nHandle)))
{
pPre=pCur;
pCur=pCur->next;
}
if(pFindRet)
*pFindRet=pPre;
if(pCur!=NULL && _stricmp(pCur->data.sWord,sWord)==0&&(pCur->data.nHandle==nHandle||nHandle<0))
{
return true;
}
return false;
}
// Classifies a word from the charType() of its leading character:
// WT_CHINESE when that type is CT_CHINESE and IsAllChinese() accepts the
// whole word, WT_DELIMITER when it is CT_DELIMITER, WT_OTHER otherwise
// (including the empty word).
int CDictionary::GetWordType(char *sWord)
{
    const int nLeadType=charType((unsigned char *)sWord);
    const int nLength=strlen(sWord);
    if(nLength<=0)
        return WT_OTHER;
    if(nLeadType==CT_CHINESE&&IsAllChinese((unsigned char *)sWord))
        return WT_CHINESE;
    if(nLeadType==CT_DELIMITER)
        return WT_DELIMITER;
    return WT_OTHER;
}
// Normalises a word and splits it into a table index and the key stored in
// that table.
//
// sWord:    word to process; leading and trailing blanks are stripped IN
//           PLACE, so the caller's buffer is modified.
// nId:      receives the table index: CC_ID() of the first two bytes for a
//           word whose charType() is CT_CHINESE, 3755 for CT_DELIMITER.
// sWordRet: receives the key: the word without its first two bytes for
//           CT_CHINESE, the whole word for CT_DELIMITER. The buffer must be
//           large enough for the word; no bound is checked here.
// bAdd:     not used by this implementation.
//
// Returns false for an empty or all-blank word and for any other character
// type.
bool CDictionary::PreProcessing(char *sWord, int *nId, char *sWordRet,bool bAdd)
{
    int nLen=strlen(sWord);
    int nEnd=nLen-1,nBegin=0;
    if(nLen==0)
        return false;
    while(nEnd>=0&&sWord[nEnd]==' ')
        nEnd-=1;
    while(nBegin<=nEnd&&sWord[nBegin]==' ')
        nBegin+=1;
    if(nBegin>nEnd)
        return false;

    const int nTrimmed=nEnd-nBegin+1;
    if(nTrimmed!=nLen)
    {
        // Source and destination overlap, which strncpy does not permit.
        memmove(sWord,sWord+nBegin,nTrimmed);
        sWord[nTrimmed]=0;
    }

    // Classify only after trimming, so a leading blank cannot decide the
    // type of the word.
    const int nType=charType((unsigned char *)sWord);
    if(nType==CT_CHINESE)
    {
        if(nTrimmed<2)
            return false;// &sWord[2] would lie beyond the terminator
        *nId=CC_ID(sWord[0],sWord[1]);
        strcpy(sWordRet,&sWord[2]);
        return true;
    }
    if(nType==CT_DELIMITER)
    {
        *nId=3755;
        strcpy(sWordRet,sWord);
        return true;
    }
    return false;
}
bool CDictionary::MergePOS(int nHandle)
{
int i,j,nCompare;
char sWordPrev[WORD_MAXLENGTH];
PWORD_CHAIN pPre,pCur,pTemp;
if(!m_pModifyTable)
{
m_pModifyTable=new MODIFY_TABLE[CC_NUM];
memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
}
for( i=0;i<CC_NUM;i++)
{
sWordPrev[0]=0;
for(j=0;j<m_IndexTable[i].nCount;j++)
{
nCompare=_stricmp(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);
if((j==0||nCompare<0)&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
{
m_IndexTable[i].pWordItemHead[j].nHandle=nHandle;
strcpy(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);
}
else if(nCompare==0&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
{
m_IndexTable[i].pWordItemHead[j].nFrequency=-1;
m_pModifyTable[i].nDelete+=1;
}
}
}
for( i=0;i<CC_NUM;i++)
{
pPre=NULL;
pCur=m_pModifyTable[i].pWordItemHead;
sWordPrev[0]=0;
while(pCur!=NULL)
{
if(_stricmp(pCur->data.sWord,sWordPrev)>0)
{
pCur->data.nHandle=nHandle;
strcpy(sWordPrev,pCur->data.sWord);
pPre=pCur;
pCur=pCur->next;
}
else
{
pTemp=pCur;
if(pPre!=NULL)
pPre->next=pCur->next;
else
m_pModifyTable[i].pWordItemHead=pCur->next;
pCur=pCur->next;
delete pTemp->data.sWord;
delete pTemp;
}
}
}
return true;
}
bool CDictionary::GetMaxMatch(char *sWord, char *sWordRet,int *npHandleRet)
{
char sWordGet[WORD_MAXLENGTH-2],sFirstChar[3];
int nPos,nFoundPos,nTemp;
PWORD_CHAIN pCur;
*npHandleRet=-1;
if(!PreProcessing(sWord, &nPos,sWordGet))
return false;
sWordRet[0]=0;
strncpy(sFirstChar,sWord,strlen(sWord)-strlen(sWordGet));
sFirstChar[strlen(sWord)-strlen(sWordGet)]=0;
FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos);
nTemp=nFoundPos;
if(nFoundPos==-1)
nTemp=0;
while(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)!=m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
{
nTemp+=1;
}
if(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
{
strcpy(sWordRet,sFirstChar);
strcat(sWordRet,m_IndexTable[nPos].pWordItemHead[nTemp].sWord);
*npHandleRet=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
return true;
}
if(m_pModifyTable&&m_pModifyTable[nPos].pWordItemHead)
pCur=m_pModifyTable[nPos].pWordItemHead;
else
pCur=NULL;
while(pCur!=NULL&&strcmp(pCur->data.sWord,sWordGet)<=0&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)
{
pCur=pCur->next;
}
if(pCur!=NULL&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)
{
strcpy(sWordRet,sFirstChar);
strcat(sWordRet,pCur->data.sWord);
*npHandleRet=pCur->data.nHandle;
return true;
}
return false;
}
// Encodes a part-of-speech string as an integer.
//
// Plain tag (shorter than three characters, or no '+' present): the value is
// first_char*256 + second_char, using only the first two characters; a
// missing character counts as 0.
// "TAG+NUM" (three or more characters containing '+'): the value is
// 100 * (encoding of TAG as above) + atoi of at most the first four
// characters after the '+'.
int CDictionary::GetPOSValue(char *sPOS)
{
    const size_t nLen=strlen(sPOS);
    // Strings shorter than three characters are always plain tags, even if
    // they contain a '+'.
    char *sPlusPos=(nLen<3)?NULL:strchr(sPOS,'+');

    size_t nTagLen=(sPlusPos!=NULL)?(size_t)(sPlusPos-sPOS):nLen;
    if(nTagLen>2)
        nTagLen=2;

    int nPOS=0;
    if(nTagLen>0)
        nPOS=sPOS[0]*256;
    if(nTagLen>1)
        nPOS+=sPOS[1];
    if(sPlusPos==NULL)
        return nPOS;

    char sDigits[5];
    strncpy(sDigits,sPlusPos+1,4);
    sDigits[4]=0;// strncpy does not terminate when four characters are copied
    return 100*nPOS+atoi(sDigits);
}
// Formats an integer part-of-speech value as text into sPOSRet.
//
// Values above 'a'*25600 are printed as "X+N" or "XY+N", where N is
// nPOS%100, X is nPOS/25600 and Y is (nPOS/100)%256 (omitted when zero).
// Smaller values are printed as two characters (nPOS/256, nPOS%256) when
// above 256, otherwise as the single character nPOS%256.
// Always returns true.
bool CDictionary::GetPOSString(int nPOS, char *sPOSRet)
{
    if(nPOS<='a'*25600)
    {
        if(nPOS<=256)
            sprintf(sPOSRet,"%c",nPOS%256);
        else
            sprintf(sPOSRet,"%c%c",nPOS/256,nPOS%256);
        return true;
    }

    const int nSecond=(nPOS/100)%256;
    if(nSecond==0)
        sprintf(sPOSRet,"%c+%d",nPOS/25600,nPOS%100);
    else
        sprintf(sPOSRet,"%c%c+%d",nPOS/25600,nSecond,nPOS%100);
    return true;
}
int CDictionary::GetFrequency(char *sWord, int nHandle)
{
char sWordFind[WORD_MAXLENGTH-2];
int nPos,nIndex;
PWORD_CHAIN pFound;
if(!PreProcessing(sWord, &nPos,sWordFind))
return 0;
if(FindInOriginalTable(nPos,sWordFind,nHandle,&nIndex))
{
return m_IndexTable[nPos].pWordItemHead[nIndex].nFrequency;
}
if(FindInModifyTable(nPos,sWordFind,nHandle,&pFound))
{
return pFound->data.nFrequency;
}
return 0;
}
// Dumps the original table as text: one line per word ("\n" precedes each
// new word), the word followed by the part-of-speech strings of all its
// consecutive entries, separated by blanks.
//
// Refuses to run (returns false) while a modification overlay exists; the
// check is made before the file is opened, so the target file is neither
// truncated nor left open in that case. Also returns false when the file
// cannot be opened.
bool CDictionary::Output(char *sFilename)
{
    if(m_pModifyTable)
    {
        return false;
    }

    FILE *fp;
    int i,j;
    char sPrevWord[WORD_MAXLENGTH]="", sCurWord[WORD_MAXLENGTH],sPOS[10];
    if((fp=fopen(sFilename,"wb"))==NULL)
        return false;
    for(i=0;i<CC_NUM;i++)
    {
        j=0;
        while(j<m_IndexTable[i].nCount)
        {
            GetPOSString(m_IndexTable[i].pWordItemHead[j].nHandle,sPOS);
            // Rebuild the full word: the two bytes derived from the table
            // index followed by the stored key.
            sprintf(sCurWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),m_IndexTable[i].pWordItemHead[j].sWord);
            if(strcmp(sPrevWord,sCurWord)!=0)
                fprintf(fp,"\n%s %s",sCurWord,sPOS);
            else
                fprintf(fp," %s",sPOS);
            strcpy(sPrevWord,sCurWord);
            j+=1;
        }
    }
    fclose(fp);
    return true;
}