语音合成
来源:互联网 发布:网络大学有哪些专业 编辑:程序博客网 时间:2024/06/10 21:36
节选自Sen Zhang博士的语言识别工具源码:
字转拼音:先得到汉字对应的拼音,再按照规则生成音节序列,之后根据音节顺次播放声音片段,并在片段边缘做平滑处理,就能接近真人发音的效果了。
/*--------------------------------------------------------------
  Convert segmented Chinese text into a pinyin sequence.

  sText   : NUL-terminated input; words are separated by single
            spaces (the text must already be word-segmented).
  pPinyin : output buffer, receives the chosen pinyin of every
            word separated by one space. The caller must supply a
            buffer that is large enough; its size is not known
            here, so it cannot be checked.

  For every word:
    1. words longer than 2 bytes are looked up with CNLEX, all
       others with CNDICT;
    2. if the word has several pinyin candidates, the first one
       whose "gram" context rule matches the left and right
       context is taken; if no rule matches, the candidate with
       the highest probability is taken;
    3. if the word has exactly one pinyin it is used directly;
    4. if it has none, a message box is shown and the word is
       skipped.

  The gram rule has the form "l1|l2|..._r1|r2|...": features before
  '_' are searched in the left context, features after it in the
  right context; "*" matches any non-empty context.

  Returns RTN_OK on success, RTN_ER on bad arguments or when the
  context buffers cannot be allocated.
---------------------------------------------------------------*/

/* Returns non-zero when the context satisfies the feature list:
   an empty list always matches, "*" matches any non-empty context,
   any other feature matches when it occurs in the context. Only the
   first "*" or the first hit decides, as in the original loops. */
static int HashCtxFeatureMatch(char *ctx, char fea[][16], int n)
{
    int j;

    if (n == 0)
        return 1;
    for (j = 0; j < n; j++) {
        if (strcmp(fea[j], "*") == 0)
            return ctx[0] != '\0';
        if (strstr(ctx, fea[j]) != NULL)
            return 1;
    }
    return 0;
}

int HashText2Pinyin(char *sText, char *pPinyin)
{
    int i, j, ii;
    int len, num, libFlag;
    int nLength, nIndex;
    char cWord[64], pinyin[256], gram[256];
    char *lText, *rText, *hit;
    float prob;
    CString cs; // debug

    if (!sText || !pPinyin)
        return RTN_ER;
    pPinyin[0] = 0;

    nLength = strlen(sText);
    // zero-filled buffers for the left and right context
    lText = (char *)calloc(nLength + 1, 1);
    rText = (char *)calloc(nLength + 1, 1);
    if (!lText || !rText) {
        AfxMessageBox("Memory error in: HashText2Pinyin");
        free(lText);    // one of the two may have been allocated
        free(rText);
        return RTN_ER;
    }

    nIndex = 0;
    while (nIndex < nLength)
    {
        // collect the next space-delimited word; bytes that do not
        // fit into cWord are dropped instead of overflowing it
        ii = 0;
        while (nIndex < nLength && sText[nIndex] != ' ') {
            if (ii < (int)sizeof(cWord) - 1)
                cWord[ii++] = sText[nIndex];
            nIndex++;
        }
        cWord[ii] = 0;
        if (nIndex < nLength)
            nIndex++;               // step over the separating space
        if (ii == 0)
            continue;               // consecutive spaces: no real content

        // right context: everything after the word, cut at the next
        // occurrence of the same word (including one at the very end)
        rText[0] = 0;
        if (nIndex < nLength)
            strcpy(rText, &sText[nIndex]);
        hit = strstr(rText, cWord);
        if (hit != NULL)
            *hit = 0;

        // left context: keep only the text that follows the last
        // earlier occurrence of the same word
        j = strlen(lText);
        for (i = j - ii; i >= 0; i--) {
            if (strncmp(cWord, &lText[i], ii) == 0) {
                memmove(lText, &lText[i + ii], j - i - ii);
                lText[j - i - ii] = 0;
                break;
            }
        }

        // now one word or char is ready, begin converting
        libFlag = (ii > 2) ? CNLEX : CNDICT;
        num = HashTokenNumOf1Word(cWord, libFlag);
        if (num > 1) {  // more than 1 pinyin candidate
            float maxProb = 0.0;
            char pyBest[256];       // same size as pinyin, so strcpy fits
            char lFea[32][16], rFea[32][16];
            int lNum, rNum, ind;

            memset(pyBest, 0, sizeof(pyBest));
            for (i = 0; i < num; i++) {
                prob = 1.0;
                memset(gram, 0, sizeof(gram));
                memset(pinyin, 0, sizeof(pinyin));
                HashGet1TokenAndProperty(cWord, i, pinyin, &prob, gram, libFlag);

                if (gram[0] != '\0') {
                    memset(lFea, 0, sizeof(lFea));
                    memset(rFea, 0, sizeof(rFea));
                    len = strlen(gram);

                    // left features: up to the '_' separator; a list
                    // without '_' leaves its last feature uncounted
                    lNum = 0; ind = 0;
                    for (j = 0; j < len; j++) {
                        if (gram[j] == '_') {
                            if (j > 0) lNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            lNum++;
                        }
                        else if (lNum < 32 && ind < 15) {
                            lFea[lNum][ind++] = gram[j];
                        }
                    }
                    if (lNum > 32) lNum = 32;

                    // right features: from after the '_' to the end
                    rNum = 0; ind = 0;
                    for (j = j + 1; j < len + 1; j++) {
                        if (j == len) {
                            if (ind > 0) rNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            rNum++;
                        }
                        else if (rNum < 32 && ind < 15) {
                            rFea[rNum][ind++] = gram[j];
                        }
                    }
                    if (rNum > 32) rNum = 32;

                    // both sides must agree for the rule to decide
                    if (HashCtxFeatureMatch(lText, lFea, lNum) &&
                        HashCtxFeatureMatch(rText, rFea, rNum)) {
                        strcpy(pyBest, pinyin);
                        break;
                    }
                }

                // no rule, or rule failed: remember the most probable one
                if (maxProb < prob) {
                    maxProb = prob;
                    strcpy(pyBest, pinyin);
                }
            }

            if (pPinyin[0] != '\0')
                strcat(pPinyin, " ");
            if (pyBest[0] != '\0')
                strcat(pPinyin, pyBest);
            else    // nothing selected: fall back to the last candidate
                strcat(pPinyin, pinyin);
        }
        else if (num == 1) {
            memset(pinyin, 0, sizeof(pinyin));
            HashGetFirstToken(cWord, pinyin, libFlag);
            if (pPinyin[0] != '\0')
                strcat(pPinyin, " ");
            strcat(pPinyin, pinyin);
        }
        else {
            cs.Format("No pinyin for word: %s", cWord);
            AfxMessageBox(cs);
        }

        // the left context of the next word is the whole consumed prefix
        memcpy(lText, sText, nIndex);
        lText[nIndex] = 0;
    } // end while()

    free(lText);
    free(rText);
    return RTN_OK;
}
文本转音素算法(英文)
//-------------------------------------------------------
// Convert English text into a phoneme sequence.
//
// pText : NUL-terminated input text. Every char below '0' (0x30)
//         acts as a word separator.
//
// Each word is converted with EnLexWordToPhoneme when
// EnLexPronuNumOf1Word reports at least one pronunciation,
// otherwise with EnRuleWord2Phoneme. The phonemes of each word
// are followed by one space. Words that yield no phonemes are
// skipped.
//
// The phonemes are collected through the temp file
// "TTS-TMP-Fone.txt" in the current directory (kept for
// compatibility with the original behaviour).
//
// Returns a malloc'ed, NUL-terminated string that the caller
// must free(), or NULL on bad input / file / memory error.
//
// NOTE: when a word has several pronunciations the choice is
// left to EnLexWordToPhoneme; context-based selection is not
// implemented, so no left/right context is built here.
//-------------------------------------------------------
char* ConvertText2PhonesEn(char *pText)
{
    int i, j, k;
    int nLen, nFoneLen;
    char word[128], fones[256];
    char *pFone = NULL;
    int nCount = 0;     // number of phoneme chars written to the temp file
    FILE *fp;

    if (!pText)
        return NULL;
    fp = fopen("TTS-TMP-Fone.txt", "w");
    if (!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return NULL;
    }

    nLen = strlen(pText);
    i = 0;
    while (i < nLen)
    {
        // collect one word; word is cleared for every word so that
        // no bytes of an earlier word can leak into this one
        j = 0;
        memset(word, 0, sizeof(word));
        while (i < nLen && pText[i] >= 0x30) {
            if (j < (int)sizeof(word) - 1)  // drop bytes that do not fit
                word[j++] = pText[i];
            i++;
        }
        if (i < nLen)
            i++;                    // step over the separator
        if (word[0] == '\0')
            continue;               // consecutive separators

        // first look up in the lexicon, then fall back to rules
        memset(fones, 0, sizeof(fones));
        k = EnLexPronuNumOf1Word(word);
        if (k >= 1)
            EnLexWordToPhoneme(word, fones);
        else
            EnRuleWord2Phoneme(word, fones);

        nFoneLen = strlen(fones);
        if (nFoneLen <= 0)
            continue;

        // make sure the word's phonemes end with one space
        if (fones[nFoneLen - 1] != ' ' && nFoneLen < (int)sizeof(fones) - 1) {
            fones[nFoneLen++] = ' ';
            fones[nFoneLen] = '\0';
        }
        nCount += (int)fwrite(fones, sizeof(char), nFoneLen, fp);
    } // end while
    fclose(fp);

    // read everything back as one string
    fp = fopen("TTS-TMP-Fone.txt", "r");
    if (!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return NULL;
    }
    pFone = (char *)malloc(nCount + 1);
    if (pFone != NULL) {
        memset(pFone, 0, nCount + 1);   // guarantees NUL termination
        fread(pFone, sizeof(char), nCount, fp);
    }
    fclose(fp);
    return pFone;
}
音素拼接,语调修饰算法
//-------------------------------------------------------
// English TTS engine: speak a string of English words.
//
// pText : NUL-terminated English text (words, not phonemes).
//
// The text is converted to phonemes with ConvertText2PhonesEn.
// Phones are separated by '-' or ' '; digits inside a phone are
// taken as its stress mark. For every phone the bifone unit
// "<phone>-<next phone>" ("<phone>-pau" for the last one) is
// output; a phone without a left neighbour additionally gets a
// leading "pau-<phone>" unit. FindPreciseBifone may replace the
// default unit name with a more specific one.
//
// When the temp file "TTS-TMP-Data.raw" can be created, all units
// are written to it and played in one go at the end; otherwise
// each unit is played immediately and a short silence is played
// at the end.
//-------------------------------------------------------

// Reads one phone starting at pFone[pos] into out (8 bytes, always
// NUL-terminated, longer names are truncated). Digits are not stored;
// the last digit char seen is stored in *pAmp when pAmp is not NULL.
// Returns the position just behind the terminating '-' or ' '.
static int TTSReadFoneEn(char *pFone, int nFoneLen, int pos, char *out, int *pAmp)
{
    int kk = 0;

    memset(out, 0, 8);
    while (pos < nFoneLen) {
        char c = pFone[pos++];
        if (c == '-' || c == ' ')
            break;
        if (c >= '0' && c <= '9') {
            if (pAmp != NULL)
                *pAmp = c;
        }
        else if (kk < 7)
            out[kk++] = c;
    }
    return pos;
}

// Fetches the speech data of one bifone unit, time-scales it by
// TTSTempo, amplifies it and writes it to fp (or plays it when fp is
// NULL). biBak is the default unit name used when FindPreciseBifone
// yields nothing. dataBuf must hold at least nCap samples.
// Returns the number of samples output, or -1 when no speech data
// was found (a message box is shown in that case).
static int TTSEmitBifoneEn(char *biBak, char *lFone, char *cFone,
                           char *rFone, char *r1Fone, int ampLevel,
                           FILE *fp, short *dataBuf, int nCap)
{
    char biFone[16];
    char tipBuf[128];   // 38 fixed chars + two names of up to 15 chars
    short *pRaw;
    int num = 0, nCV, nLen, nOffset;

    // try to find a more precise model based on context
    memset(biFone, 0, 16);
    FindPreciseBifone(biFone, lFone, cFone, rFone, r1Fone);
    if (biFone[0] == '\0')
        pRaw = TTSGetFoneRawData(biBak, ENMALEVOICE, &num);
    else
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
    if (!pRaw || num == 0) {
        // last resort: the phone followed by a pause
        sprintf(biFone, "%s-%s", cFone, "pau");
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
        if (!pRaw || num == 0) {
            sprintf(tipBuf, "No speech data for biphones: <%s> and <%s>", biFone, biBak);
            AfxMessageBox(tipBuf);
            return -1;
        }
        num = pRaw[0];
    }

    // note: the real speech data is from 1, not 0
    num = num - 1;
    nCV = pRaw[0];
    // adjust duration
    nLen = (int)(num*TTSTempo);
    nOffset = (int)(nCV*(1.0 - TTSTempo)) + 1;
    // never copy more than the work buffer holds
    if (nLen < 0) nLen = 0;
    if (nLen > nCap) nLen = nCap;
    memset(dataBuf, 0, nCap*sizeof(short));
    memcpy(dataBuf, &pRaw[nOffset], nLen*sizeof(short));

    // NOTE(review): ampLevel holds the digit's char code ('0'..'9'),
    // so every phone carrying a stress digit is scaled by 2.0, even
    // stress '0' - confirm that this is intended.
    if (ampLevel >= 1)
        ScaleWaveSignal(dataBuf, nLen, (float)2.0);
    if (ampLevel == -1)     // phone without any stress digit
        ScaleWaveSignal(dataBuf, nLen, (float)1.5);

    if (fp != NULL)
        fwrite(dataBuf, sizeof(short), nLen, fp);
    else
        myPlaySound(dataBuf, nLen);
    return nLen;
}

void TTSEngineEn(char *pText)
{
    int k, n;
    int nLen, nFoneLen, nOut;
    char *pFone;
    char cFone[8], lFone[8], rFone[8], r1Fone[8];
    char biBak[16];     // "<7 chars>-<7 chars>" fits exactly
    int ampLevel;
    short dataBuf[9001], *pTout;
    int nCount = 0;     // samples written to the temp file / played
    FILE *fp;           // temp file for saving speech data

    if (!pText) return;
    // convert the whole text into phonemes
    pFone = ConvertText2PhonesEn(pText);
    if (!pFone) return;

    // if fp==NULL, units are played directly instead of being saved
    fp = fopen("TTS-TMP-Data.raw", "wb");

    memset(cFone, 0, 8);
    memset(lFone, 0, 8);
    nFoneLen = strlen(pFone);
    k = 0;
    while (k < nFoneLen) {
        // the previous phone becomes the left context
        strcpy(lFone, cFone);

        // current phone, the phone to its right and the one after
        // that; only the current phone is consumed (k advances)
        ampLevel = -1;
        k = TTSReadFoneEn(pFone, nFoneLen, k, cFone, &ampLevel);
        n = TTSReadFoneEn(pFone, nFoneLen, k, rFone, NULL);
        TTSReadFoneEn(pFone, nFoneLen, n, r1Fone, NULL);
        if (cFone[0] == '\0')
            continue;

        // a phone without left neighbour starts from a pause
        if (lFone[0] == '\0') {
            sprintf(biBak, "%s-%s", "pau", cFone);
            nOut = TTSEmitBifoneEn(biBak, lFone, cFone, rFone, r1Fone,
                                   ampLevel, fp, dataBuf, 9000);
            if (nOut < 0)
                continue;
            nCount += nOut;
            strcpy(lFone, "pau");
        }

        // transition from this phone into the next one (or a pause)
        if (rFone[0] != '\0')
            sprintf(biBak, "%s-%s", cFone, rFone);
        else
            sprintf(biBak, "%s-%s", cFone, "pau");
        nOut = TTSEmitBifoneEn(biBak, lFone, cFone, rFone, r1Fone,
                               ampLevel, fp, dataBuf, 9000);
        if (nOut < 0)
            continue;
        nCount += nOut;
    } // while end
    free(pFone);    // no use of it now

    // reopen the temp file for reading only if it was really written;
    // fclose(NULL) is not allowed and a stale file must not be played
    if (fp != NULL) {
        fclose(fp);
        fp = fopen("TTS-TMP-Data.raw", "rb");
    }

    if (fp != NULL) {
        // zero-filled, so a short read plays as silence
        pTout = (short *)calloc(nCount + 16, sizeof(short));
        if (!pTout) {
            AfxMessageBox("Memory error in: RunEnTTSEngine");
            fclose(fp);
            return;
        }
        fread(pTout, sizeof(short), nCount, fp);
        myPlaySound(pTout, nCount);
        fclose(fp);
        free(pTout);
    }
    else {
        // after each sentence, add some silence
        memset(dataBuf, 0, 9000*sizeof(short));
        nLen = (int)(200*TTSTempo);
        if (nLen < 0) nLen = 0;
        if (nLen > 9000) nLen = 9000;
        myPlaySound(dataBuf, nLen);
    }
    return;
}
字转拼音:先得到汉字对应的拼音,再按照规则生成音节序列,之后根据音节顺次播放声音片段,并在片段边缘做平滑处理,就能接近真人发音的效果了。
/*--------------------------------------------------------------
  Convert segmented Chinese text into a pinyin sequence.

  sText   : NUL-terminated input; words are separated by single
            spaces (the text must already be word-segmented).
  pPinyin : output buffer, receives the chosen pinyin of every
            word separated by one space. The caller must supply a
            buffer that is large enough; its size is not known
            here, so it cannot be checked.

  For every word:
    1. words longer than 2 bytes are looked up with CNLEX, all
       others with CNDICT;
    2. if the word has several pinyin candidates, the first one
       whose "gram" context rule matches the left and right
       context is taken; if no rule matches, the candidate with
       the highest probability is taken;
    3. if the word has exactly one pinyin it is used directly;
    4. if it has none, a message box is shown and the word is
       skipped.

  The gram rule has the form "l1|l2|..._r1|r2|...": features before
  '_' are searched in the left context, features after it in the
  right context; "*" matches any non-empty context.

  Returns RTN_OK on success, RTN_ER on bad arguments or when the
  context buffers cannot be allocated.
---------------------------------------------------------------*/

/* Returns non-zero when the context satisfies the feature list:
   an empty list always matches, "*" matches any non-empty context,
   any other feature matches when it occurs in the context. Only the
   first "*" or the first hit decides, as in the original loops. */
static int HashCtxFeatureMatch(char *ctx, char fea[][16], int n)
{
    int j;

    if (n == 0)
        return 1;
    for (j = 0; j < n; j++) {
        if (strcmp(fea[j], "*") == 0)
            return ctx[0] != '\0';
        if (strstr(ctx, fea[j]) != NULL)
            return 1;
    }
    return 0;
}

int HashText2Pinyin(char *sText, char *pPinyin)
{
    int i, j, ii;
    int len, num, libFlag;
    int nLength, nIndex;
    char cWord[64], pinyin[256], gram[256];
    char *lText, *rText, *hit;
    float prob;
    CString cs; // debug

    if (!sText || !pPinyin)
        return RTN_ER;
    pPinyin[0] = 0;

    nLength = strlen(sText);
    // zero-filled buffers for the left and right context
    lText = (char *)calloc(nLength + 1, 1);
    rText = (char *)calloc(nLength + 1, 1);
    if (!lText || !rText) {
        AfxMessageBox("Memory error in: HashText2Pinyin");
        free(lText);    // one of the two may have been allocated
        free(rText);
        return RTN_ER;
    }

    nIndex = 0;
    while (nIndex < nLength)
    {
        // collect the next space-delimited word; bytes that do not
        // fit into cWord are dropped instead of overflowing it
        ii = 0;
        while (nIndex < nLength && sText[nIndex] != ' ') {
            if (ii < (int)sizeof(cWord) - 1)
                cWord[ii++] = sText[nIndex];
            nIndex++;
        }
        cWord[ii] = 0;
        if (nIndex < nLength)
            nIndex++;               // step over the separating space
        if (ii == 0)
            continue;               // consecutive spaces: no real content

        // right context: everything after the word, cut at the next
        // occurrence of the same word (including one at the very end)
        rText[0] = 0;
        if (nIndex < nLength)
            strcpy(rText, &sText[nIndex]);
        hit = strstr(rText, cWord);
        if (hit != NULL)
            *hit = 0;

        // left context: keep only the text that follows the last
        // earlier occurrence of the same word
        j = strlen(lText);
        for (i = j - ii; i >= 0; i--) {
            if (strncmp(cWord, &lText[i], ii) == 0) {
                memmove(lText, &lText[i + ii], j - i - ii);
                lText[j - i - ii] = 0;
                break;
            }
        }

        // now one word or char is ready, begin converting
        libFlag = (ii > 2) ? CNLEX : CNDICT;
        num = HashTokenNumOf1Word(cWord, libFlag);
        if (num > 1) {  // more than 1 pinyin candidate
            float maxProb = 0.0;
            char pyBest[256];       // same size as pinyin, so strcpy fits
            char lFea[32][16], rFea[32][16];
            int lNum, rNum, ind;

            memset(pyBest, 0, sizeof(pyBest));
            for (i = 0; i < num; i++) {
                prob = 1.0;
                memset(gram, 0, sizeof(gram));
                memset(pinyin, 0, sizeof(pinyin));
                HashGet1TokenAndProperty(cWord, i, pinyin, &prob, gram, libFlag);

                if (gram[0] != '\0') {
                    memset(lFea, 0, sizeof(lFea));
                    memset(rFea, 0, sizeof(rFea));
                    len = strlen(gram);

                    // left features: up to the '_' separator; a list
                    // without '_' leaves its last feature uncounted
                    lNum = 0; ind = 0;
                    for (j = 0; j < len; j++) {
                        if (gram[j] == '_') {
                            if (j > 0) lNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            lNum++;
                        }
                        else if (lNum < 32 && ind < 15) {
                            lFea[lNum][ind++] = gram[j];
                        }
                    }
                    if (lNum > 32) lNum = 32;

                    // right features: from after the '_' to the end
                    rNum = 0; ind = 0;
                    for (j = j + 1; j < len + 1; j++) {
                        if (j == len) {
                            if (ind > 0) rNum++;
                            break;
                        }
                        else if (gram[j] == '|') {
                            ind = 0;
                            rNum++;
                        }
                        else if (rNum < 32 && ind < 15) {
                            rFea[rNum][ind++] = gram[j];
                        }
                    }
                    if (rNum > 32) rNum = 32;

                    // both sides must agree for the rule to decide
                    if (HashCtxFeatureMatch(lText, lFea, lNum) &&
                        HashCtxFeatureMatch(rText, rFea, rNum)) {
                        strcpy(pyBest, pinyin);
                        break;
                    }
                }

                // no rule, or rule failed: remember the most probable one
                if (maxProb < prob) {
                    maxProb = prob;
                    strcpy(pyBest, pinyin);
                }
            }

            if (pPinyin[0] != '\0')
                strcat(pPinyin, " ");
            if (pyBest[0] != '\0')
                strcat(pPinyin, pyBest);
            else    // nothing selected: fall back to the last candidate
                strcat(pPinyin, pinyin);
        }
        else if (num == 1) {
            memset(pinyin, 0, sizeof(pinyin));
            HashGetFirstToken(cWord, pinyin, libFlag);
            if (pPinyin[0] != '\0')
                strcat(pPinyin, " ");
            strcat(pPinyin, pinyin);
        }
        else {
            cs.Format("No pinyin for word: %s", cWord);
            AfxMessageBox(cs);
        }

        // the left context of the next word is the whole consumed prefix
        memcpy(lText, sText, nIndex);
        lText[nIndex] = 0;
    } // end while()

    free(lText);
    free(rText);
    return RTN_OK;
}
文本转音素算法(英文)
//-------------------------------------------------------
// Convert English text into a phoneme sequence.
//
// pText : NUL-terminated input text. Every char below '0' (0x30)
//         acts as a word separator.
//
// Each word is converted with EnLexWordToPhoneme when
// EnLexPronuNumOf1Word reports at least one pronunciation,
// otherwise with EnRuleWord2Phoneme. The phonemes of each word
// are followed by one space. Words that yield no phonemes are
// skipped.
//
// The phonemes are collected through the temp file
// "TTS-TMP-Fone.txt" in the current directory (kept for
// compatibility with the original behaviour).
//
// Returns a malloc'ed, NUL-terminated string that the caller
// must free(), or NULL on bad input / file / memory error.
//
// NOTE: when a word has several pronunciations the choice is
// left to EnLexWordToPhoneme; context-based selection is not
// implemented, so no left/right context is built here.
//-------------------------------------------------------
char* ConvertText2PhonesEn(char *pText)
{
    int i, j, k;
    int nLen, nFoneLen;
    char word[128], fones[256];
    char *pFone = NULL;
    int nCount = 0;     // number of phoneme chars written to the temp file
    FILE *fp;

    if (!pText)
        return NULL;
    fp = fopen("TTS-TMP-Fone.txt", "w");
    if (!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return NULL;
    }

    nLen = strlen(pText);
    i = 0;
    while (i < nLen)
    {
        // collect one word; word is cleared for every word so that
        // no bytes of an earlier word can leak into this one
        j = 0;
        memset(word, 0, sizeof(word));
        while (i < nLen && pText[i] >= 0x30) {
            if (j < (int)sizeof(word) - 1)  // drop bytes that do not fit
                word[j++] = pText[i];
            i++;
        }
        if (i < nLen)
            i++;                    // step over the separator
        if (word[0] == '\0')
            continue;               // consecutive separators

        // first look up in the lexicon, then fall back to rules
        memset(fones, 0, sizeof(fones));
        k = EnLexPronuNumOf1Word(word);
        if (k >= 1)
            EnLexWordToPhoneme(word, fones);
        else
            EnRuleWord2Phoneme(word, fones);

        nFoneLen = strlen(fones);
        if (nFoneLen <= 0)
            continue;

        // make sure the word's phonemes end with one space
        if (fones[nFoneLen - 1] != ' ' && nFoneLen < (int)sizeof(fones) - 1) {
            fones[nFoneLen++] = ' ';
            fones[nFoneLen] = '\0';
        }
        nCount += (int)fwrite(fones, sizeof(char), nFoneLen, fp);
    } // end while
    fclose(fp);

    // read everything back as one string
    fp = fopen("TTS-TMP-Fone.txt", "r");
    if (!fp) {
        AfxMessageBox("File error in:ConvertText2PhonesEn");
        return NULL;
    }
    pFone = (char *)malloc(nCount + 1);
    if (pFone != NULL) {
        memset(pFone, 0, nCount + 1);   // guarantees NUL termination
        fread(pFone, sizeof(char), nCount, fp);
    }
    fclose(fp);
    return pFone;
}
音素拼接,语调修饰算法
//-------------------------------------------------------
// English TTS engine: speak a string of English words.
//
// pText : NUL-terminated English text (words, not phonemes).
//
// The text is converted to phonemes with ConvertText2PhonesEn.
// Phones are separated by '-' or ' '; digits inside a phone are
// taken as its stress mark. For every phone the bifone unit
// "<phone>-<next phone>" ("<phone>-pau" for the last one) is
// output; a phone without a left neighbour additionally gets a
// leading "pau-<phone>" unit. FindPreciseBifone may replace the
// default unit name with a more specific one.
//
// When the temp file "TTS-TMP-Data.raw" can be created, all units
// are written to it and played in one go at the end; otherwise
// each unit is played immediately and a short silence is played
// at the end.
//-------------------------------------------------------

// Reads one phone starting at pFone[pos] into out (8 bytes, always
// NUL-terminated, longer names are truncated). Digits are not stored;
// the last digit char seen is stored in *pAmp when pAmp is not NULL.
// Returns the position just behind the terminating '-' or ' '.
static int TTSReadFoneEn(char *pFone, int nFoneLen, int pos, char *out, int *pAmp)
{
    int kk = 0;

    memset(out, 0, 8);
    while (pos < nFoneLen) {
        char c = pFone[pos++];
        if (c == '-' || c == ' ')
            break;
        if (c >= '0' && c <= '9') {
            if (pAmp != NULL)
                *pAmp = c;
        }
        else if (kk < 7)
            out[kk++] = c;
    }
    return pos;
}

// Fetches the speech data of one bifone unit, time-scales it by
// TTSTempo, amplifies it and writes it to fp (or plays it when fp is
// NULL). biBak is the default unit name used when FindPreciseBifone
// yields nothing. dataBuf must hold at least nCap samples.
// Returns the number of samples output, or -1 when no speech data
// was found (a message box is shown in that case).
static int TTSEmitBifoneEn(char *biBak, char *lFone, char *cFone,
                           char *rFone, char *r1Fone, int ampLevel,
                           FILE *fp, short *dataBuf, int nCap)
{
    char biFone[16];
    char tipBuf[128];   // 38 fixed chars + two names of up to 15 chars
    short *pRaw;
    int num = 0, nCV, nLen, nOffset;

    // try to find a more precise model based on context
    memset(biFone, 0, 16);
    FindPreciseBifone(biFone, lFone, cFone, rFone, r1Fone);
    if (biFone[0] == '\0')
        pRaw = TTSGetFoneRawData(biBak, ENMALEVOICE, &num);
    else
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
    if (!pRaw || num == 0) {
        // last resort: the phone followed by a pause
        sprintf(biFone, "%s-%s", cFone, "pau");
        pRaw = TTSGetFoneRawData(biFone, ENMALEVOICE, &num);
        if (!pRaw || num == 0) {
            sprintf(tipBuf, "No speech data for biphones: <%s> and <%s>", biFone, biBak);
            AfxMessageBox(tipBuf);
            return -1;
        }
        num = pRaw[0];
    }

    // note: the real speech data is from 1, not 0
    num = num - 1;
    nCV = pRaw[0];
    // adjust duration
    nLen = (int)(num*TTSTempo);
    nOffset = (int)(nCV*(1.0 - TTSTempo)) + 1;
    // never copy more than the work buffer holds
    if (nLen < 0) nLen = 0;
    if (nLen > nCap) nLen = nCap;
    memset(dataBuf, 0, nCap*sizeof(short));
    memcpy(dataBuf, &pRaw[nOffset], nLen*sizeof(short));

    // NOTE(review): ampLevel holds the digit's char code ('0'..'9'),
    // so every phone carrying a stress digit is scaled by 2.0, even
    // stress '0' - confirm that this is intended.
    if (ampLevel >= 1)
        ScaleWaveSignal(dataBuf, nLen, (float)2.0);
    if (ampLevel == -1)     // phone without any stress digit
        ScaleWaveSignal(dataBuf, nLen, (float)1.5);

    if (fp != NULL)
        fwrite(dataBuf, sizeof(short), nLen, fp);
    else
        myPlaySound(dataBuf, nLen);
    return nLen;
}

void TTSEngineEn(char *pText)
{
    int k, n;
    int nLen, nFoneLen, nOut;
    char *pFone;
    char cFone[8], lFone[8], rFone[8], r1Fone[8];
    char biBak[16];     // "<7 chars>-<7 chars>" fits exactly
    int ampLevel;
    short dataBuf[9001], *pTout;
    int nCount = 0;     // samples written to the temp file / played
    FILE *fp;           // temp file for saving speech data

    if (!pText) return;
    // convert the whole text into phonemes
    pFone = ConvertText2PhonesEn(pText);
    if (!pFone) return;

    // if fp==NULL, units are played directly instead of being saved
    fp = fopen("TTS-TMP-Data.raw", "wb");

    memset(cFone, 0, 8);
    memset(lFone, 0, 8);
    nFoneLen = strlen(pFone);
    k = 0;
    while (k < nFoneLen) {
        // the previous phone becomes the left context
        strcpy(lFone, cFone);

        // current phone, the phone to its right and the one after
        // that; only the current phone is consumed (k advances)
        ampLevel = -1;
        k = TTSReadFoneEn(pFone, nFoneLen, k, cFone, &ampLevel);
        n = TTSReadFoneEn(pFone, nFoneLen, k, rFone, NULL);
        TTSReadFoneEn(pFone, nFoneLen, n, r1Fone, NULL);
        if (cFone[0] == '\0')
            continue;

        // a phone without left neighbour starts from a pause
        if (lFone[0] == '\0') {
            sprintf(biBak, "%s-%s", "pau", cFone);
            nOut = TTSEmitBifoneEn(biBak, lFone, cFone, rFone, r1Fone,
                                   ampLevel, fp, dataBuf, 9000);
            if (nOut < 0)
                continue;
            nCount += nOut;
            strcpy(lFone, "pau");
        }

        // transition from this phone into the next one (or a pause)
        if (rFone[0] != '\0')
            sprintf(biBak, "%s-%s", cFone, rFone);
        else
            sprintf(biBak, "%s-%s", cFone, "pau");
        nOut = TTSEmitBifoneEn(biBak, lFone, cFone, rFone, r1Fone,
                               ampLevel, fp, dataBuf, 9000);
        if (nOut < 0)
            continue;
        nCount += nOut;
    } // while end
    free(pFone);    // no use of it now

    // reopen the temp file for reading only if it was really written;
    // fclose(NULL) is not allowed and a stale file must not be played
    if (fp != NULL) {
        fclose(fp);
        fp = fopen("TTS-TMP-Data.raw", "rb");
    }

    if (fp != NULL) {
        // zero-filled, so a short read plays as silence
        pTout = (short *)calloc(nCount + 16, sizeof(short));
        if (!pTout) {
            AfxMessageBox("Memory error in: RunEnTTSEngine");
            fclose(fp);
            return;
        }
        fread(pTout, sizeof(short), nCount, fp);
        myPlaySound(pTout, nCount);
        fclose(fp);
        free(pTout);
    }
    else {
        // after each sentence, add some silence
        memset(dataBuf, 0, 9000*sizeof(short));
        nLen = (int)(200*TTSTempo);
        if (nLen < 0) nLen = 0;
        if (nLen > 9000) nLen = 9000;
        myPlaySound(dataBuf, nLen);
    }
    return;
}
- 语音合成
- 语音合成
- 语音合成
- 语音合成
- 语音合成
- 语音合成
- 语音识别和语音合成
- iOS语音听写、语音合成
- 语音合成软件
- 多通道语音合成
- Android 语音合成
- 情感语音合成调研
- HTS 语音合成简述
- iOS语音合成
- AVPlayer,AVAudioPlayer,语音合成
- c# 百度语音合成
- android---简单语音合成
- 百度语音合成
- 进销存管理之盘点流程
- 由一个简单的客户端间TCP/UDP通信程序引发的关于设计模式的思考
- ACE_SOCK下的几个类
- nginx源码----queue篇
- 理解PHP5中static和const关键字(转)
- 语音合成
- 语音识别
- 王皓对自己一定要狠
- 开始
- 向量类型的
- Linux下JVM中可生成的最大Thread数量
- 用API写windows程序
- [关于性]_处女情结
- Android开发文件下载中的断点续传源码