语音合成

来源:互联网 发布:网络大学有哪些专业 编辑:程序博客网 时间:2024/06/10 21:36
节选自Sen Zhang博士的语言识别工具源码:


先做字转拼音:得到每个汉字对应的拼音,再按照规则生成音节序列;之后根据音节顺次播放声音片段,并在片段边缘做平滑处理,就能接近真人发音的效果了。






/*--------------------------------------------------------------
 HashText2Pinyin

 Convert segmented Chinese text into a pinyin sequence.

 sText   : input text whose words are separated by spaces (for
           example the result of HashWordSegBest). It must not
           contain optional segmentations such as
           [ W1 | W2 |..| Wn ].
 pPinyin : output buffer. It receives one pinyin string per word,
           separated by single spaces. No size is passed in, so the
           caller has to provide a buffer that is large enough.

 For every word:
   1. words longer than 2 bytes are looked up with CNLEX, all
      others with CNDICT;
   2. a word with exactly one pronunciation is appended as is;
   3. a word with several pronunciations is disambiguated:
        - each candidate may carry a "gram" rule of the form
          "left1|left2_right1|right2"; "*" means "any non-empty
          context". The first candidate whose rule matches both
          the text to the left and to the right of the word wins;
        - otherwise the candidate with the highest positive
          probability is taken;
        - otherwise the last candidate is taken;
   4. a word without any pronunciation is reported in a message
      box and skipped.

 Returns RTN_OK on success and RTN_ER when an argument is NULL or
 the context buffers cannot be allocated.
---------------------------------------------------------------*/
int HashText2Pinyin(char *sText, char *pPinyin)
{
    enum { WORD_MAX = 64, TOKEN_MAX = 256, FEA_MAX = 32, FEA_LEN = 16 };

    int   i, j, ii;
    int   len, num, libFlag;
    int   nLength, nIndex;
    char  cWord[WORD_MAX], pinyin[TOKEN_MAX], gram[TOKEN_MAX];
    char *lText, *rText;
    float prob;
    CString cs; // debug

    if (!sText || !pPinyin) return RTN_ER;

    pPinyin[0] = 0;
    nLength = (int)strlen(sText);

    // buffers for the left and right context of the current word
    lText = (char *)malloc(nLength + 1);
    rText = (char *)malloc(nLength + 1);
    if (!lText || !rText) {
        AfxMessageBox("Memory error in: HashText2Pinyin");
        // release whichever buffer was obtained and report the failure
        // with the same error code as the argument check above
        free(lText);
        free(rText);
        return RTN_ER;
    }
    memset(lText, 0, nLength + 1);
    memset(rText, 0, nLength + 1);
    memset(cWord, 0, sizeof(cWord));
    memset(pinyin, 0, sizeof(pinyin));

    nIndex = 0;
    ii = 0;
    while (nIndex < nLength) {
        // ---- collect the bytes of one space-delimited word ----
        if (sText[nIndex] != ' ' && nIndex < nLength - 1) {
            // over-long words are truncated instead of overflowing cWord
            if (ii < WORD_MAX - 1) {
                cWord[ii] = sText[nIndex];
                ii++;
            }
            nIndex++;
            continue;
        }

        // word boundary: either a space or the very last byte of the text
        if (nIndex == nLength - 1 && sText[nIndex] != ' ') {
            // the last byte belongs to the final word (a trailing
            // space must not become part of the word)
            if (ii < WORD_MAX - 1) {
                cWord[ii] = sText[nIndex];
                ii++;
            }
        }
        nIndex++;
        cWord[ii] = 0;
        ii = 0;

        // nothing collected (e.g. consecutive spaces)
        if (cWord[0] == '\0') {
            continue;
        }

        // ---- right context: everything after the current word ----
        memset(rText, 0, nLength + 1);
        if (nIndex < nLength) strcpy(rText, &sText[nIndex]);

        // cut the right context at the next occurrence of the same word
        // NOTE(review): an occurrence at the very end of rText
        // (i == j-ii) is not detected by this loop - confirm intent.
        ii = (int)strlen(cWord);
        j  = (int)strlen(rText);
        for (i = 0; i < j - ii; i++) {
            if (strncmp(cWord, &rText[i], ii) == 0) {
                rText[i] = 0;
                break;
            }
        }

        // ---- left context: keep only what follows the previous
        //      occurrence of the same word ----
        j = (int)strlen(lText);
        for (i = j - ii; i >= 0; i--) {
            if (strncmp(cWord, &lText[i], ii) == 0) {
                // skip the whole matched word (ii bytes), not just 2 bytes,
                // so that the copied length j-i-ii stays consistent
                memmove(lText, &lText[i + ii], j - i - ii);
                lText[j - i - ii] = 0;
                break;
            }
        }

        // ---- convert the word ----
        if (strlen(cWord) > 2) libFlag = CNLEX;
        else libFlag = CNDICT;

        num = HashTokenNumOf1Word(cWord, libFlag);
        if (num > 1) { // several candidate pronunciations
            float maxProb = 0.0;
            char  pyBest[TOKEN_MAX]; // same size as pinyin, strcpy cannot overflow
            char  lFea[FEA_MAX][FEA_LEN], rFea[FEA_MAX][FEA_LEN];
            int   lNum = 0, rNum = 0, ind = 0;
            bool  lFlag = FALSE, rFlag = FALSE;

            memset(pyBest, 0, sizeof(pyBest));
            for (i = 0; i < num; i++) {
                prob = 1.0;
                memset(gram, 0, sizeof(gram));
                memset(pinyin, 0, sizeof(pinyin));
                HashGet1TokenAndProperty(cWord, i, pinyin, &prob, gram, libFlag);

                lFlag = FALSE;
                rFlag = FALSE;

                // can the context rule decide for this candidate?
                if (gram[0] != '\0') {
                    // empty feature slots must compare as "" and not as garbage
                    memset(lFea, 0, sizeof(lFea));
                    memset(rFea, 0, sizeof(rFea));
                    len = (int)strlen(gram);

                    // left features: alternatives before '_'
                    lNum = 0;
                    ind = 0;
                    for (j = 0; j < len; j++) {
                        if (gram[j] == '_') {
                            if (j > 0) lNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            lNum++;
                        }
                        else if (lNum < FEA_MAX && ind < FEA_LEN - 1) {
                            lFea[lNum][ind] = gram[j];
                            ind++;
                            lFea[lNum][ind] = '\0';
                        }
                    }
                    if (lNum > FEA_MAX) lNum = FEA_MAX;

                    // right features: alternatives after '_'
                    rNum = 0;
                    ind = 0;
                    for (j = j + 1; j < len + 1; j++) {
                        if (j == len) {
                            if (ind > 0) rNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            rNum++;
                        }
                        else if (rNum < FEA_MAX && ind < FEA_LEN - 1) {
                            rFea[rNum][ind] = gram[j];
                            ind++;
                            rFea[rNum][ind] = '\0';
                        }
                    }
                    if (rNum > FEA_MAX) rNum = FEA_MAX;

                    // match the left context
                    for (j = 0; j < lNum; j++) {
                        if (!strcmp(lFea[j], "*")) {
                            if (strlen(lText) > 0)
                                lFlag = TRUE;
                            break;
                        }
                        if (strstr(lText, lFea[j])) {
                            lFlag = TRUE;
                            break;
                        }
                    }

                    // match the right context
                    for (j = 0; j < rNum; j++) {
                        if (!strcmp(rFea[j], "*")) {
                            if (strlen(rText) > 0)
                                rFlag = TRUE;
                            break;
                        }
                        if (strstr(rText, rFea[j])) {
                            rFlag = TRUE;
                            break;
                        }
                    }

                    // a side without features imposes no constraint
                    if (lNum == 0) lFlag = TRUE;
                    if (rNum == 0) rFlag = TRUE;

                    // both sides agree: this candidate wins outright
                    if (lFlag && rFlag) {
                        strcpy(pyBest, pinyin);
                        break;
                    }
                }

                // the rule did not decide: remember the most probable candidate
                if (!lFlag || !rFlag) {
                    if (maxProb < prob) {
                        maxProb = prob;
                        strcpy(pyBest, pinyin);
                    }
                }
            }

            len = (int)strlen(pPinyin);
            if (len > 0)
                strcat(pPinyin, " ");
            if (pyBest[0] != '\0')
                strcat(pPinyin, pyBest);
            else // nothing was selected: fall back to the last candidate
                strcat(pPinyin, pinyin);
        }
        else if (num == 1) {
            // do not let a previous word's pinyin survive a failed lookup
            memset(pinyin, 0, sizeof(pinyin));
            HashGetFirstToken(cWord, pinyin, libFlag);
            len = (int)strlen(pPinyin);
            if (len > 0)
                strcat(pPinyin, " ");
            strcat(pPinyin, pinyin);
        }
        else {
            cs.Format("No pinyin for word: %s", cWord);
            AfxMessageBox(cs);
        }

        // left context for the next word: all text consumed so far
        // (nIndex <= nLength, so the terminator stays inside the buffer)
        memcpy(lText, sText, nIndex);
        lText[nIndex] = 0;

        // reset for the next word
        memset(cWord, 0, sizeof(cWord));
        ii = 0;
    } // end while()

    free(lText);
    free(rText);

    return RTN_OK;
}


英文文本转音素算法






//-------------------------------------------------------


// To convert english text into phoneme sequence.


// The text may have more than 1 word, each word's


// phoneme sequence is delimited by space " ".


// Converted phonemes are returned by a pointer.


//-------------------------------------------------------


char* ConvertText2PhonesEn(char *pText)


{


int i, j, k;


int iLast, nLen, nFoneLen;


char word[128], fones[256];


char *lText, *rText;


char *pFone = NULL;


int  nCount = 0;


FILE *fp;






if(!pText ) return pFone;


fp = fopen("TTS-TMP-Fone.txt", "w");


if(!fp) {


AfxMessageBox("File error in:ConvertText2PhonesEn");


return pFone;


}


// initialize them


nLen = strlen(pText);


lText = (char *)malloc(nLen+1);


rText = (char *)malloc(nLen+1);


if(!lText || !rText) {


AfxMessageBox("Mem error in:ConvertText2PhonesEn");


fclose(fp);


return pFone;


}


memset(word, 0, 128);


j=0; i=0; iLast = 0;


while(i<nLen ) 


{


if(pText[i]>=0x30 && i<nLen-1) {


word[j] = pText[i];


i++; j++; 


continue;





else { // meet a word boundary or line end 


if(pText[i]>=0x30 ) { 


word[j] = pText[i];


j++; i++;


}


// save the left and right context


memset(lText, 0, nLen+1);


if(iLast>0 && iLast<nLen-1)


strncpy(lText, pText, iLast);


else {


if(iLast==0)


lText[0] = '\0';


if(iLast>nLen-1)


strcpy(lText, pText);


}


memset(rText, 0, nLen+1);


if(i<nLen-1)


strcpy(rText, &pText[i+1]);


else


rText[0] = '\0'; // null now


iLast = i;


i++; 


j=0;


} // if-else end


if(strlen(word)<=0){


j=0; // word index


memset(word, 0, 128);


continue;


}


// to convert word to phones


// first, look up in lex, then by rule


memset(fones, 0, 256);


k = EnLexPronuNumOf1Word(word);


if(k>1) { //more than 1 pronunciation


// now, use gram to choose best one, ???


// EnLexWordToPhonemeBest(word, lText, rText, fones); 


EnLexWordToPhoneme(word, fones);


}


else if(k==1) { // only 1 pronunciation


EnLexWordToPhoneme(word, fones);


}


else { // no pronunciation, using rule now


EnRuleWord2Phoneme(word, fones);


}


nFoneLen = strlen(fones);


if(nFoneLen<=0) {


j=0;


continue;


}


else {


if(fones[nFoneLen-1] != ' ') {


strcat(fones, " ");


nFoneLen++;


}


nCount = nCount+nFoneLen;


fwrite(fones, sizeof(char), nFoneLen, fp);


}


memset(word, 0, 128);


} // end while


fclose(fp);


fp = fopen("TTS-TMP-Fone.txt", "r");


if(!fp) {


AfxMessageBox("File error in:ConvertText2PhonesEn");


return pFone;


}


pFone = (char *)malloc(nCount+1);


if(pFone != NULL) {


memset(pFone, 0, nCount+1);


fread(pFone, sizeof(char), nCount, fp);


}






free(lText);


free(rText);


return pFone;


}


音素拼接,语调修饰算法 


//-------------------------------------------------------
// Helper of TTSEngineEn: read one phone name from pFone,
// starting at pos. Reading stops after the next '-' or ' '
// (which is consumed) or at nFoneLen. Digits are not part
// of the name; when pAmp is not NULL the numeric value of
// the last digit seen is stored there as the stress level.
// fone must hold 8 chars; longer names are truncated.
// Returns the position at which to continue reading.
//-------------------------------------------------------
static int TTSReadFoneEn(const char *pFone, int pos, int nFoneLen,
                         char *fone, int *pAmp)
{
    int kk = 0;

    memset(fone, 0, 8);
    while (pos < nFoneLen) {
        char c = pFone[pos];
        pos++;
        if (c == '-' || c == ' ')
            break;
        if (c >= '0' && c <= '9') {
            // store the digit's value, not its character code
            if (pAmp) *pAmp = c - '0';
        }
        else if (kk < 7) {
            fone[kk] = c;
            kk++;
        }
    }
    return pos;
}

//-------------------------------------------------------
// Helper of TTSEngineEn: fetch the speech data of one
// bifone, scale it and either append it to fp or, when fp
// is NULL, play it directly.
//
// biDefault is the plain "left-right" model name. It is
// used when FindPreciseBifone returns no name; if no data
// is found, "<cFone>-pau" is tried as a last resort.
//
// Returns the number of samples emitted, or -1 when no
// speech data exists (a message box is shown).
//-------------------------------------------------------
static int TTSEmitBifoneEn(char *biDefault, char *lFone, char *cFone,
                           char *rFone, char *r1Fone, int ampLevel,
                           short *dataBuf, int nBufLen, FILE *fp)
{
    char   biFone[16], biBak[16];
    char   tipBuf[128]; // 40 chars of text + two names of up to 15 chars
    short *pRaw;
    int    num = 0, nLen, nOffset, nCV;

    memset(biBak, 0, 16);
    strcpy(biBak, biDefault);

    // build more precise model based on context
    memset(biFone, 0, 16);
    FindPreciseBifone(biFone, lFone, cFone, rFone, r1Fone);
    if (biFone[0] == '\0')
        pRaw = TTSGetFoneRawData(biBak, ENMALEVOICE, &num);
    else
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);

    if (!pRaw || num == 0) {
        sprintf(biFone, "%s-%s", cFone, "pau");
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
        if (!pRaw || num == 0) {
            sprintf(tipBuf, "No speech data for biphones: <%s> and <%s>", biFone, biBak);
            AfxMessageBox(tipBuf);
            return -1;
        }
        else
            num = pRaw[0];
    }

    memset(dataBuf, 0, nBufLen * sizeof(short));

    // note: the real speech data is from 1, not 0
    num = num - 1;
    nCV = pRaw[0];

    // adjust duration
    nLen = (int)(num * TTSTempo);
    nOffset = (int)(nCV * (1.0 - TTSTempo)) + 1;

    // never copy more than the work buffer can hold
    if (nLen > nBufLen) nLen = nBufLen;
    if (nLen < 0) nLen = 0;
    memcpy(dataBuf, &pRaw[nOffset], nLen * sizeof(short));

    // stressed phones are amplified more than phones without a stress mark
    if (ampLevel >= 1) // more than this ???
        ScaleWaveSignal(dataBuf, nLen, (float)2.0);
    if (ampLevel == -1)
        ScaleWaveSignal(dataBuf, nLen, (float)1.5);

    if (fp != NULL)
        fwrite(dataBuf, sizeof(short), nLen, fp);
    else
        myPlaySound(dataBuf, nLen);

    return nLen;
}

//-------------------------------------------------------
// TTSEngineEn
//
// Speak a string of English words (not phonemes).
//
// The text is converted to phonemes with
// ConvertText2PhonesEn; phones are linked by "-" or " ",
// e.g. aa1-k-ax-n, where a digit is the stress level.
// For every phone the bifone "<phone>-<next phone>" is
// emitted ("<phone>-pau" when there is no next phone);
// a phone without a predecessor is additionally preceded
// by "pau-<phone>".
//
// The samples are collected in "TTS-TMP-Data.raw" and
// played in one go. If that file cannot be opened for
// writing, every bifone is played directly and a short
// silence is played at the end.
//-------------------------------------------------------
void TTSEngineEn(char *pText)
{
    enum { DATA_MAX = 9000 };

    int    k, n;
    int    nLen, nFoneLen;
    char  *pFone;
    char   cFone[8], lFone[8], rFone[8], r1Fone[8];
    char   biFone[16];
    int    ampLevel;
    short  dataBuf[DATA_MAX + 1], *pTout;
    int    nCount = 0;
    bool   isFirst;
    FILE  *fp; // temp file for saving speech data

    if (!pText) return;

    // convert the whole text into phonemes
    pFone = ConvertText2PhonesEn(pText);
    if (!pFone) return;

    fp = fopen("TTS-TMP-Data.raw", "wb");
    // if fp==NULL, dont write to temp file

    // to output each phone
    memset(cFone, 0, 8);
    memset(lFone, 0, 8);
    memset(rFone, 0, 8);
    memset(biFone, 0, 16);
    nFoneLen = (int)strlen(pFone);

    k = 0;
    while (k < nFoneLen) {
        // the previous phone becomes the left context
        strcpy(lFone, cFone);

        // current phone (with its stress level), the phone to its
        // right and the one after that
        ampLevel = -1;
        k = TTSReadFoneEn(pFone, k, nFoneLen, cFone, &ampLevel);
        n = TTSReadFoneEn(pFone, k, nFoneLen, rFone, NULL);
        TTSReadFoneEn(pFone, n, nFoneLen, r1Fone, NULL);

        // if cFone is null, no more fones
        if (cFone[0] == '\0') {
            continue;
        }

        // consider the context phones, left first
        isFirst = (lFone[0] == '\0');
        memset(biFone, 0, 16);
        if (isFirst) { // first fone
            sprintf(biFone, "%s-%s", "pau", cFone);
        }
        else if (rFone[0] != '\0') {
            sprintf(biFone, "%s-%s", cFone, rFone);
        }
        else {
            sprintf(biFone, "%s-%s", cFone, "pau");
        }

        nLen = TTSEmitBifoneEn(biFone, lFone, cFone, rFone, r1Fone,
                               ampLevel, dataBuf, DATA_MAX, fp);
        if (nLen < 0)
            continue;
        nCount += nLen;

        // for the very 1st phone, also emit its transition to the right
        if (isFirst) {
            strcpy(lFone, "pau");
            if (rFone[0] == '\0')
                sprintf(biFone, "%s-%s", cFone, "pau");
            else
                sprintf(biFone, "%s-%s", cFone, rFone);

            nLen = TTSEmitBifoneEn(biFone, lFone, cFone, rFone, r1Fone,
                                   ampLevel, dataBuf, DATA_MAX, fp);
            if (nLen < 0)
                continue;
            // counted once: nCount must equal the number of samples written
            nCount += nLen;
        }
    } // while end

    free(pFone); // no use of it now

    // after each sentence, add some silence
    memset(dataBuf, 0, DATA_MAX * sizeof(short));
    nLen = (int)(200 * TTSTempo); // 200/8000 seconds

    // read the samples back only if they were really written by this
    // call; fclose must not be called on a NULL stream
    if (fp != NULL) {
        fclose(fp);
        fp = fopen("TTS-TMP-Data.raw", "rb");
    }

    if (fp != NULL) {
        pTout = (short *)calloc(nCount + 16, sizeof(short));
        if (!pTout) {
            AfxMessageBox("Memory error in: RunEnTTSEngine");
            fclose(fp);
            return;
        }
        fread(pTout, sizeof(short), nCount, fp);
        myPlaySound(pTout, nCount);
        fclose(fp);
        free(pTout);
    }
    else {
        myPlaySound(dataBuf, nLen);
    }

    // AfxMessageBox("En TTS finished!");
    return;
}
原创粉丝点击