语音合成

来源：互联网发布：网络大学有哪些专业编辑：程序博客网时间：2024/06/10 21:36

节选自Sen Zhang博士的语言识别工具源码：

字转拼音，得到汉字对应的拼音，再按照规则生成音节序列，之后根据音节顺次播放声音片段，并在片段边缘做平滑处理，就能接近真人发音的效果了。

/*--------------------------------------------------------------
 HashText2Pinyin

 Convert segmented Chinese text into a pinyin sequence.

 sText   : input text whose words are separated by single spaces
           (the original author notes it should already be segmented,
           e.g. by HashWordSegBest, with no [ W1 | W2 ] alternatives).
 pPinyin : output buffer; receives the chosen pinyin of every word,
           separated by single spaces. No size is passed in, so the
           caller must supply a buffer large enough for the result.

 For each word:
   - words longer than 2 bytes are looked up with CNLEX, shorter
     ones with CNDICT;
   - if the word has several candidate pinyin, the first candidate
     whose context pattern ("gram", written as
     left1|left2_right1|right2, "*" meaning "any non-empty context")
     matches the text to the left and right of the word is taken;
     if none matches, the candidate with the highest probability is
     taken;
   - if the word has exactly one pinyin it is used directly;
   - if it has none, a message box is shown and nothing is appended.

 Returns RTN_ER on NULL arguments or allocation failure, RTN_OK
 otherwise.

 History: modified 26 March 2003 (original author's note).
---------------------------------------------------------------*/
int HashText2Pinyin(char *sText, char *pPinyin)
{
    enum { WORD_MAX = 64, FEA_MAX = 32, FEA_LEN = 16 };

    int i, j, ii;
    int len, num, libFlag;
    int nLength, nIndex;
    char cWord[WORD_MAX], pinyin[256], gram[256];
    char *lText, *rText;
    float prob;
    CString cs; // debug

    if(!sText || !pPinyin) return RTN_ER;

    pPinyin[0] = 0;
    nLength = strlen(sText);

    // buffers for the left and right context of the current word
    lText = (char *)malloc(nLength+1);
    rText = (char *)malloc(nLength+1);
    if(!lText || !rText) {
        AfxMessageBox("Memory error in: HashText2Pinyin");
        // one of the two may have succeeded; free(NULL) is a no-op
        free(lText);
        free(rText);
        return RTN_ER;
    }
    memset(lText, 0, nLength+1);
    memset(rText, 0, nLength+1);
    memset(cWord, 0, WORD_MAX);

    nIndex = 0; ii = 0;
    while(nIndex < nLength)
    {
        // accumulate bytes of the current word until a space or the
        // last byte of the input is reached
        if(sText[nIndex] != ' ' && nIndex < nLength-1) {
            // bytes beyond the buffer capacity are dropped rather than
            // written past the end of cWord
            if(ii < WORD_MAX-1) {
                cWord[ii] = sText[nIndex];
                ii++;
            }
            nIndex++;
            continue;
        }

        // now, meet word boundary (space or final byte)
        if(nIndex == nLength-1 && sText[nIndex] != ' ') {
            // the final byte belongs to the word unless it is the
            // delimiter itself
            if(ii < WORD_MAX-1) {
                cWord[ii] = sText[nIndex];
                ii++;
            }
        }
        nIndex++;
        cWord[ii] = 0;
        ii = 0;

        // if no real content (e.g. consecutive spaces)
        if(cWord[0] == '\0')
            continue;

        // save the right text: everything after the current word
        memset(rText, 0, nLength+1);
        if(nIndex < nLength) strcpy(rText, &sText[nIndex]);

        // cut the right context at the next occurrence of the same word
        // NOTE(review): the bound i < j-ii skips an occurrence that sits
        // at the very end of rText - confirm whether <= was intended.
        ii = strlen(cWord);
        j = strlen(rText);
        for(i = 0; i < j-ii; i++) {
            if(strncmp(cWord, &rText[i], ii) == 0) {
                rText[i] = 0;
                break;
            }
        }

        // cut the left context after the last earlier occurrence of the
        // same word: keep only the text that follows that occurrence
        j = strlen(lText);
        for(i = j-ii; i >= 0; i--) {
            if(strncmp(cWord, &lText[i], ii) == 0) {
                // the tail starts right after the matched word, i.e. at
                // i+ii; j-i-ii is exactly the number of bytes in that tail
                memmove(lText, &lText[i+ii], j-i-ii);
                lText[j-i-ii] = 0;
                break;
            }
        }

        // now one word or char is ready!
        //-------------------------------------------------
        // begin converting......
        if(ii > 2) libFlag = CNLEX;
        else libFlag = CNDICT;

        num = HashTokenNumOf1Word(cWord, libFlag);

        if(num > 1) { // has more than 1 pinyin
            float maxProb = 0.0;
            char pyBest[256];   // same capacity as pinyin, which is copied into it
            char lFea[FEA_MAX][FEA_LEN], rFea[FEA_MAX][FEA_LEN];
            int lNum = 0, rNum = 0, ind = 0;
            bool lFlag = FALSE, rFlag = FALSE;

            // which pinyin is proper based on gram?
            memset(pyBest, 0, sizeof(pyBest));
            for(i = 0; i < num; i++) {
                prob = 1.0;
                memset(gram, 0, 256);
                memset(pinyin, 0, 256);
                HashGet1TokenAndProperty(cWord, i, pinyin, &prob, gram, libFlag);

                // if gram can determine pinyin?
                if(gram[0] != '\0') {
                    len = strlen(gram);

                    // start from empty features so that skipped slots
                    // (e.g. "a||b") are never read uninitialised
                    memset(lFea, 0, sizeof(lFea));
                    memset(rFea, 0, sizeof(rFea));

                    // get the left context features: text before '_',
                    // alternatives separated by '|'
                    lNum = 0; ind = 0;
                    for(j = 0; j < len; j++) {
                        if(gram[j] == '_') {
                            if(j > 0) lNum++;
                            break;
                        }
                        else if(gram[j] == '|') {
                            ind = 0;
                            lNum++;
                        }
                        else if(lNum < FEA_MAX && ind < FEA_LEN-1) {
                            lFea[lNum][ind] = gram[j];
                            ind++;
                        }
                    } // for end

                    // get the right context features: text after '_'
                    rNum = 0; ind = 0;
                    for(j = j+1; j < len+1; j++) {
                        if(j == len) {
                            if(ind > 0) rNum++;
                            break;
                        }
                        else if(gram[j] == '|') {
                            ind = 0;
                            rNum++;
                        }
                        else if(rNum < FEA_MAX && ind < FEA_LEN-1) {
                            rFea[rNum][ind] = gram[j];
                            ind++;
                        }
                    } // for end

                    // never scan more feature slots than exist
                    if(lNum > FEA_MAX) lNum = FEA_MAX;
                    if(rNum > FEA_MAX) rNum = FEA_MAX;

                    // try to match the left context
                    lFlag = FALSE;
                    for(j = 0; j < lNum; j++) {
                        if(!strcmp(lFea[j], "*")) {
                            // wildcard: any non-empty left context matches
                            if(strlen(lText) > 0)
                                lFlag = TRUE;
                            break;
                        }
                        if(strstr(lText, lFea[j])) {
                            lFlag = TRUE;
                            break;
                        }
                    }

                    // try to match the right context
                    rFlag = FALSE;
                    for(j = 0; j < rNum; j++) {
                        if(!strcmp(rFea[j], "*")) {
                            // wildcard: any non-empty right context matches
                            if(strlen(rText) > 0)
                                rFlag = TRUE;
                            break;
                        }
                        if(strstr(rText, rFea[j])) {
                            rFlag = TRUE;
                            break;
                        }
                    }

                    // if no left or right context features
                    if(lNum == 0) lFlag = TRUE;
                    if(rNum == 0) rFlag = TRUE;

                    // if both left and right context ok?
                    if(lFlag && rFlag) {
                        strcpy(pyBest, pinyin);
                        break;
                    }
                } // if end

                // use prob to choose pinyin if gram failed
                if(!lFlag || !rFlag) {
                    if(maxProb < prob) {
                        maxProb = prob;
                        strcpy(pyBest, pinyin);
                    }
                }
            } // for end

            len = strlen(pPinyin);
            if(len > 0)
                strcat(pPinyin, " ");
            if(pyBest[0] != '\0')
                strcat(pPinyin, pyBest);
            else // nothing was selected: fall back to the last candidate
                strcat(pPinyin, pinyin);
        }
        else if(num == 1) {
            HashGetFirstToken(cWord, pinyin, libFlag);
            len = strlen(pPinyin);
            if(len > 0)
                strcat(pPinyin, " ");
            strcat(pPinyin, pinyin);
        }
        else {
            cs.Format("No pinyin for word: %s", cWord);
            AfxMessageBox(cs);
        }

        // the left context for the next word is everything consumed so far
        strncpy(lText, sText, nIndex);
        lText[nIndex] = 0;   // nIndex <= nLength, buffer holds nLength+1 bytes

        // reset for next new word
        memset(cWord, 0, WORD_MAX);
        ii = 0;
    } // end while()

    // free all
    free(lText);
    free(rText);

    return RTN_OK;
}

文本转音素算法（英文）

//-------------------------------------------------------
// Convert English text into a phoneme sequence.
//
// pText is split into words at every byte below 0x30 (space,
// punctuation, control characters; with a signed char type bytes
// above 0x7F compare below 0x30 as well). Each word is converted
// through the lexicon (EnLexWordToPhoneme) when
// EnLexPronuNumOf1Word reports at least one pronunciation, and
// through EnRuleWord2Phoneme otherwise. The phonemes of each word
// are followed by a single space " ".
//
// The phonemes are staged in the file "TTS-TMP-Fone.txt" in the
// current directory and then read back.
//
// Returns a malloc'ed, NUL-terminated string that the caller must
// free(), or NULL when pText is NULL, the temp file cannot be
// opened, or memory allocation fails.
//-------------------------------------------------------
char* ConvertText2PhonesEn(char *pText)
{
    enum { WORD_MAX = 128, FONES_MAX = 256 };

    int i, j, k;
    int iLast, nLen, nFoneLen;
    char word[WORD_MAX], fones[FONES_MAX];
    char *lText, *rText;
    char *pFone = NULL;
    int nCount = 0;
    FILE *fp;

    if(!pText) return pFone;

    fp = fopen("TTS-TMP-Fone.txt", "w");
    if(!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return pFone;
    }

    // context buffers; currently only prepared for the disabled
    // EnLexWordToPhonemeBest() call below
    nLen = strlen(pText);
    lText = (char *)malloc(nLen+1);
    rText = (char *)malloc(nLen+1);
    if(!lText || !rText) {
        AfxMessageBox("Mem error in:ConvertText2PhonesEn");
        // one of the two may have succeeded; free(NULL) is a no-op
        free(lText);
        free(rText);
        fclose(fp);
        return pFone;
    }

    memset(word, 0, WORD_MAX);
    j = 0; i = 0; iLast = 0;
    while(i < nLen)
    {
        // inside a word (and not on the last byte): keep collecting
        if(pText[i] >= 0x30 && i < nLen-1) {
            // bytes beyond the buffer capacity are dropped rather than
            // written past the end of word
            if(j < WORD_MAX-1) {
                word[j] = pText[i];
                j++;
            }
            i++;
            continue;
        }

        // meet a word boundary or line end; a final word byte still
        // belongs to the word
        if(pText[i] >= 0x30) {
            if(j < WORD_MAX-1) {
                word[j] = pText[i];
                j++;
            }
            i++;
        }

        // save the left and right context
        memset(lText, 0, nLen+1);
        if(iLast > 0 && iLast < nLen-1)
            strncpy(lText, pText, iLast);
        else if(iLast > nLen-1)
            strcpy(lText, pText);

        memset(rText, 0, nLen+1);
        if(i < nLen-1)
            strcpy(rText, &pText[i+1]);

        iLast = i;
        i++;
        j = 0;

        // consecutive delimiters produce an empty word: skip it
        if(word[0] == '\0') {
            memset(word, 0, WORD_MAX);
            continue;
        }

        // to convert word to phones
        // first, look up in lex, then by rule
        memset(fones, 0, FONES_MAX);
        k = EnLexPronuNumOf1Word(word);
        if(k > 1) { // more than 1 pronunciation
            // now, use gram to choose best one, ???
            // EnLexWordToPhonemeBest(word, lText, rText, fones);
            EnLexWordToPhoneme(word, fones);
        }
        else if(k == 1) { // only 1 pronunciation
            EnLexWordToPhoneme(word, fones);
        }
        else { // no pronunciation, using rule now
            EnRuleWord2Phoneme(word, fones);
        }

        nFoneLen = strlen(fones);
        if(nFoneLen > 0) {
            // terminate the word's phonemes with one space, unless the
            // buffer is already full
            if(fones[nFoneLen-1] != ' ' && nFoneLen < FONES_MAX-1) {
                fones[nFoneLen] = ' ';
                nFoneLen++;
                fones[nFoneLen] = '\0';
            }
            // count what actually reached the file so the read-back
            // below never asks for more than was written
            nCount += (int)fwrite(fones, sizeof(char), nFoneLen, fp);
        }

        // always start the next word from an empty buffer, including
        // when this word produced no phonemes; otherwise the tail of a
        // longer previous word would leak into a shorter next one
        memset(word, 0, WORD_MAX);
    } // end while

    fclose(fp);
    free(lText);
    free(rText);

    // read the staged phonemes back into one heap string
    fp = fopen("TTS-TMP-Fone.txt", "r");
    if(!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return pFone;
    }

    pFone = (char *)malloc(nCount+1);
    if(pFone != NULL) {
        memset(pFone, 0, nCount+1);
        fread(pFone, sizeof(char), nCount, fp);
    }

    fclose(fp);

    return pFone;
}

音素拼接，语调修饰算法

//-------------------------------------------------------
// English TTS engine: phoneme concatenation.
//
// Buffer sizes shared by TTSEngineEn and its helpers.
//-------------------------------------------------------
enum {
    TTS_FONE_LEN    = 8,     // capacity of one phoneme name, incl. NUL
    TTS_BIFONE_LEN  = 16,    // capacity of one bifone name, incl. NUL
    TTS_MAX_SAMPLES = 9000   // usable samples in the per-segment buffer
};

// Read one phoneme name from pFone starting at pos. A name ends at
// '-' or ' ' (which is consumed) or at nFoneLen. Digits are not part
// of the name; when pAmp is non-NULL the last digit seen is stored in
// *pAmp as its character code. Returns the position after the token.
static int TTSNextFone(const char *pFone, int pos, int nFoneLen,
                       char *fone, int *pAmp)
{
    int kk = 0;

    memset(fone, 0, TTS_FONE_LEN);
    while(pos < nFoneLen) {
        char c = pFone[pos];
        pos++;
        if(c == '-' || c == ' ')
            break;
        if(c >= '0' && c <= '9') {
            if(pAmp != NULL) *pAmp = c;
        }
        else if(kk < TTS_FONE_LEN-1) {
            // characters beyond the buffer capacity are dropped
            fone[kk] = c;
            kk++;
        }
    }
    return pos;
}

// Look up the speech data of one bifone, apply tempo and stress
// scaling, and send it to fp (or to myPlaySound when fp is NULL).
// biDefault is the plain bifone name used when FindPreciseBifone
// yields no more precise model. dataBuf must hold at least
// TTS_MAX_SAMPLES shorts. Returns the number of samples emitted, or
// -1 when no speech data could be found (a message box is shown).
static int TTSEmitBifone(const char *biDefault, char *lFone, char *cFone,
                         char *rFone, char *r1Fone, int ampLevel,
                         short *dataBuf, FILE *fp)
{
    char biFone[TTS_BIFONE_LEN], biBak[TTS_BIFONE_LEN];
    // 38 fixed characters plus two bifone names of up to 15 characters
    // each do not fit the original 64 bytes
    char tipBuf[128];
    short *pRaw;
    int num = 0, nCV, nLen, nOffset;

    // build more precise model based on context
    memset(biBak, 0, TTS_BIFONE_LEN);
    strcpy(biBak, biDefault);
    memset(biFone, 0, TTS_BIFONE_LEN);
    FindPreciseBifone(biFone, lFone, cFone, rFone, r1Fone);

    if(biFone[0] == '\0')
        pRaw = TTSGetFoneRawData(biBak, ENMALEVOICE, &num);
    else
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);

    if(!pRaw || num == 0) {
        // last resort: the current phone followed by a pause
        sprintf(biFone, "%s-%s", cFone, "pau");
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
        if(!pRaw || num == 0) {
            sprintf(tipBuf, "No speech data for biphones: <%s> and <%s>", biFone, biBak);
            AfxMessageBox(tipBuf);
            return -1;
        }
        else
            num = pRaw[0];  // NOTE(review): the fallback takes the length from pRaw[0] - confirm
    }

    // note: the real speech data is from 1, not 0
    num = num - 1;
    nCV = pRaw[0];

    // adjust duration
    nLen = (int)(num*TTSTempo);
    nOffset = (int)(nCV*(1.0 - TTSTempo))+1;
    // NOTE(review): nOffset is not validated against the length of pRaw;
    // presumably TTSTempo stays within (0, 1] - confirm.

    // never copy more than the segment buffer can hold
    if(nLen < 0) nLen = 0;
    if(nLen > TTS_MAX_SAMPLES) nLen = TTS_MAX_SAMPLES;

    memset(dataBuf, 0, TTS_MAX_SAMPLES*sizeof(short));
    memcpy(dataBuf, &pRaw[nOffset], nLen*sizeof(short));

    // if stress level exists? ampLevel holds the character code of the
    // stress digit, so any digit satisfies >= 1; -1 means no digit
    if(ampLevel >= 1) // more than this ???
        ScaleWaveSignal(dataBuf, nLen, (float)2.0);
    if(ampLevel == -1)
        ScaleWaveSignal(dataBuf, nLen, (float)1.5);

    if(fp != NULL)
        return (int)fwrite(dataBuf, sizeof(short), nLen, fp);

    myPlaySound(dataBuf, nLen);
    return nLen;
}

//-------------------------------------------------------
// Speak a string of English words (not a string of phonemes).
//
// The text is converted to phonemes with ConvertText2PhonesEn; each
// phoneme is then turned into a bifone name from its neighbours,
// refined by FindPreciseBifone, and its speech data is appended to
// the temp file "TTS-TMP-Data.raw". For the first phoneme two
// segments are emitted: "pau-X" and then "X-next". Finally the whole
// file is read back and played in one call. If the temp file cannot
// be opened for writing, every segment is played directly and a
// short silence is played at the end.
//-------------------------------------------------------
void TTSEngineEn(char *pText)
{
    int k, n;
    int nLen, nFoneLen, nOut;
    int toFile;
    char *pFone;
    char cFone[TTS_FONE_LEN], lFone[TTS_FONE_LEN];
    char rFone[TTS_FONE_LEN], r1Fone[TTS_FONE_LEN];
    char biFone[TTS_BIFONE_LEN];
    int ampLevel;
    short dataBuf[TTS_MAX_SAMPLES+1], *pTout;
    int nCount = 0;
    FILE *fp; // temp file for saving speech data

    if(!pText) return;

    // convert the whole text into phonemes
    pFone = ConvertText2PhonesEn(pText);
    if(!pFone) return;

    // if fp==NULL, dont write to temp file
    fp = fopen("TTS-TMP-Data.raw", "wb");
    toFile = (fp != NULL);

    memset(cFone, 0, TTS_FONE_LEN);
    memset(lFone, 0, TTS_FONE_LEN);
    memset(rFone, 0, TTS_FONE_LEN);
    memset(r1Fone, 0, TTS_FONE_LEN);
    memset(biFone, 0, TTS_BIFONE_LEN);

    nFoneLen = strlen(pFone);
    k = 0;
    while(k < nFoneLen) {
        // the previous phone becomes the left context
        strcpy(lFone, cFone);

        // get the current fone; fones are linked by "-", e.g. aa1-k-ax-n
        ampLevel = -1;
        k = TTSNextFone(pFone, k, nFoneLen, cFone, &ampLevel);

        // look ahead: the right fone and the one just after it
        n = TTSNextFone(pFone, k, nFoneLen, rFone, NULL);
        TTSNextFone(pFone, n, nFoneLen, r1Fone, NULL);

        // empty token (e.g. two delimiters in a row): nothing to speak
        if(cFone[0] == '\0')
            continue;

        // consider the context phones, left first
        memset(biFone, 0, TTS_BIFONE_LEN);
        if(lFone[0] == '\0')            // first fone
            sprintf(biFone, "%s-%s", "pau", cFone);
        else if(rFone[0] != '\0')       // other cases
            sprintf(biFone, "%s-%s", cFone, rFone);
        else
            sprintf(biFone, "%s-%s", cFone, "pau");

        nOut = TTSEmitBifone(biFone, lFone, cFone, rFone, r1Fone,
                             ampLevel, dataBuf, fp);
        if(nOut < 0)
            continue;
        nCount += nOut;

        // for the very 1st phone the segment above was "pau-X"; now
        // also emit the transition from X into what follows
        if(lFone[0] == '\0') {
            strcpy(lFone, "pau");
            if(rFone[0] == '\0')
                sprintf(biFone, "%s-%s", cFone, "pau");
            else
                sprintf(biFone, "%s-%s", cFone, rFone);

            nOut = TTSEmitBifone(biFone, lFone, cFone, rFone, r1Fone,
                                 ampLevel, dataBuf, fp);
            if(nOut < 0)
                continue;
            // counted once: nCount must equal the samples in the file
            nCount += nOut;
        }
    } // while end

    free(pFone); // no use of it now

    // only reopen what this call wrote; when the write-open failed the
    // segments were already played and a file left by an earlier run
    // must not be replayed
    if(toFile) {
        fclose(fp);
        fp = fopen("TTS-TMP-Data.raw", "rb");
    }

    // after each sentence, add some silence
    memset(dataBuf, 0, TTS_MAX_SAMPLES*sizeof(short));
    nLen = (int)(200*TTSTempo); // 200/8000 seconds
    if(nLen < 0) nLen = 0;
    if(nLen > TTS_MAX_SAMPLES) nLen = TTS_MAX_SAMPLES;

    if(fp != NULL) {
        pTout = (short *)calloc(nCount+16, sizeof(short));
        if(!pTout) {
            AfxMessageBox("Memory error in: RunEnTTSEngine");
            fclose(fp);
            return;
        }
        // play exactly what could be read back
        nCount = (int)fread(pTout, sizeof(short), nCount, fp);
        myPlaySound(pTout, nCount);
        fclose(fp);
        free(pTout);
    }
    else {
        myPlaySound(dataBuf, nLen);
    }

    // AfxMessageBox("En TTS finished!");
    return;
}