编译原理之词法分析和语法分析

来源：互联网发布：康耐视视觉检测软件编辑：程序博客网时间：2024/06/09 20:25

这个程序只花了一天写成，很多方面没有顾及，但对理解实验的基本原理、完成实验作业还是有一定帮助的。代码实现了基于有限自动机的词法分析，并采用递归下降分析法按 EBNF 文法实现语法分析、生成中间代码。（来自山东大学，SDU）

lexAnalysis.h

[cpp] view plain copy

/*
* lexAnalysis.h
*
* Created on: 2014-12-2
* Author: liuqiushan
*/
#ifndef LEXANALYSIS_H_
#define LEXANALYSIS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/*
 * Token codes produced by the lexer: keywords first, then identifier and
 * integer literal, then operators and punctuation. The numeric values are
 * consecutive starting at 0.
 */
typedef enum _SYM
{
    _CONST = 0,   /* keyword "const" */
    _VAR,         /* keyword "var" */
    _procedure,   /* keyword "procedure" */
    _begin,       /* keyword "begin" */
    _end,         /* keyword "end" */
    _if,          /* keyword "if" */
    _then,        /* keyword "then" */
    _ID,          /* identifier, e.g. the 'a' in 'var a;' */
    _INT,         /* integer literal, e.g. the '10' in 'const a = 10;' */
    _ASSIGN,      /* '=' */
    _PLUS,        /* '+' */
    _SUB,         /* '-' */
    _STAR,        /* '*' */
    _DIV,         /* '/' */
    _LESS,        /* '<' */
    _MORE,        /* '>' */
    _LESSEQ,      /* '<=' */
    _MOREEQ,      /* '>=' */
    _DH,          /* ',' */
    _MD,          /* ':=' */
    _LEFT,        /* '(' */
    _RIGHT,       /* ')' */
    _JH,          /* '#' */
    _FH           /* ';' */
} SYM;
/* Classify a token string: a reserved word yields its keyword code, anything else _ID. */
SYM getSYM(char* str);
/* Store the character pointed to by c at index p_str of str. */
void appendStr(char* str, char* c, int p_str);
/* Return 1 when *c is a letter (a-z or A-Z), otherwise 0. */
int isABC(char* c);
/* Return 1 when *c is a decimal digit, otherwise 0. */
int isNumber(char* c);
#endif /* LEXANALYSIS_H_ */

LexAnalysis.c

[cpp] view plain copy

/*
* LexAnalysis.c
*
* Created on: 2014-12-2
* Author: liuqiushan
*/
#include "lexAnalysis.h"
/*
 * Classify a token string.
 *
 * _str is compared case-insensitively against the reserved words; a match
 * returns the corresponding keyword code, anything else is an identifier
 * (_ID).
 *
 * strcasecmp is a POSIX function declared in <strings.h>; <string.h> only
 * exposes it as a library extension, so the header must be included
 * explicitly.
 */
SYM getSYM(char* _str)
{
    static const struct {
        const char *word;
        SYM code;
    } keywords[] = {
        { "CONST",     _CONST     },
        { "VAR",       _VAR       },
        { "procedure", _procedure },
        { "begin",     _begin     },
        { "end",       _end       },
        { "if",        _if        },
        { "then",      _then      }
    };
    size_t i;

    for (i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if (strcasecmp(_str, keywords[i].word) == 0)
            return keywords[i].code;
    }
    return _ID; /* default: identifier */
}
/*
 * Write the single character *c into str at position p_str.
 * No terminator is appended and no bounds are checked; the caller owns both.
 */
void appendStr(char str[], char* c, int p_str )
{
    char *slot = str + p_str;

    *slot = *c;
}
/*
 * Return 1 when the character pointed to by c is a letter a-z or A-Z,
 * otherwise 0. The range comparison assumes a character set in which the
 * letters are contiguous (e.g. ASCII).
 */
int isABC(char* c)
{
    const char ch = *c;
    const int lower = (ch >= 'a' && ch <= 'z');
    const int upper = (ch >= 'A' && ch <= 'Z');

    return (lower || upper) ? 1 : 0;
}
/* Return 1 when the character pointed to by c is a decimal digit, otherwise 0. */
int isNumber(char* c)
{
    const char ch = *c;

    return (ch >= '0' && ch <= '9') ? 1 : 0;
}

synAnalysis.h

[cpp] view plain copy

/*
* synAnalysis.h
*
* Created on: 2014-12-2
* Author: liuqiushan
*/
#ifndef SYNANALYSIS_H_
#define SYNANALYSIS_H_
/* Kind of entry stored in the symbol table. */
typedef enum _KIND
{
    PRO = 0,  /* procedure */
    CON = 1,  /* constant */
    VAR = 2   /* variable */
} CVPKind;
/*
 * PL/0 object instruction
 *
 * Every instruction has three fields:
 *
 *   f  the function code (one of the values below)
 *   l  the difference between the level where a variable was declared and
 *      the level where it is used
 *   a  an operand whose meaning depends on f (see each code)
 *
 *   +-------+-------+-------+
 *   |   f   |   l   |   a   |
 *   +-------+-------+-------+
 */
typedef enum _FunctionCode
{
    LIT = 0,  /* push a constant onto the stack; a is the constant */
    LOD,      /* push a variable onto the stack; a is its address */
    STO,      /* store the top of the stack into a variable, e.g. c := b + 10 */
    CAL,      /* CAL (no description given in this file) */
    INT,      /* create the data area; a is the number of cells */
    JMP,      /* JMP (no description given in this file) */
    JPC,      /* JPC (no description given in this file) */
    OPR       /* operation selected by a, applied to the top two stack entries */
} FunctionCode;
/* Operation selectors carried in the a field of an OPR instruction (numbered from 1). */
typedef enum _OPR
{
    ADD   = 1,   /* + */
    SUB   = 2,   /* - */
    MUL   = 3,   /* * */
    DIV   = 4,   /* / */
    GT    = 5,
    LT    = 6,
    GE    = 7,
    LE    = 8,
    UE    = 9,
    EQ    = 10,
    WRITE = 11,
    READ  = 12,
    MINUS = 13
} OPRTYPE;
/* One row of the constant/variable/procedure table. */
typedef struct _TableItem
{
    char *name;     /* identifier text */
    CVPKind kind;   /* constant, variable or procedure */
    int level;      /* level at which the identifier was declared */
    char *address;  /* address (or constant value) kept as a decimal string */
} CVPTableItem;
/* One generated instruction: function code f, level difference l, operand a. */
typedef struct _Instruct
{
    FunctionCode f;  /* function code */
    int l;           /* level difference */
    int a;           /* operand; its meaning depends on f */
} ObjectInstruct;
#endif /* SYNANALYSIS_H_ */

SynAnalysis.c

[cpp] view plain copy

/*
============================================================================
Name : SynAnalysis.c
Author : liuqiushan
Date : Dec. 2, 2014 18:16
Version : 1.0
Copyright : Shandong University
Description : Syntax Analysis
============================================================================
*/
/*
* Expression EBNF
*--------------------------------------------
*| Expression -> Term { Addop Term } |
*| Addop -> "+" | "-" |
*| Term -> Factor { Mulop Factor } |
*| Mulop -> "*" | "/" |
*| Factor -> ID | NUM | "(" Expression ")" |
*--------------------------------------------
*/
/*
* Pl/0 Statements to be analyzed
*--------------------------------------------
*| const a=10 |
*| var b,c |
*| procedure p |
*| begin |
*| c:=b+a |
*| end |
*--------------------------------------------
*/
#include "lexAnalysis.h"
#include "synAnalysis.h"
// Data Structure
/* A source statement together with its length and a scan position. */
typedef struct Statement
{
    char *_statement;     // text of the statement
    int _length;          // number of characters in the statement
    int _locate_pointer;  // scan position; starts at the first character
} P_Statement;
// Methods
/* Set-up: echo the built-in statements and wrap each one in a P_Statement. */
void putsStatements(void);
void initStatements(void);
void binding(P_Statement* p_s, char* statement);
/* Lexical analysis: tokenize one statement / print the token tables. */
void lexical_Analysis(char* a, int len_a);
void printLexResults(void);
/* Cursors over the token, identifier and number tables built by the lexer. */
SYM getNextSYMToken();
char* getNextID();
char* getNextNumber();
/* Declaration part of a level: constant, variables, nested procedure. */
void levelProcess(int level);
void processConstant(int level);
int processVariable(int level);
void processProcedure(int level);
/* Symbol-table helper and result printing. */
void strcpyCVPTable(CVPTableItem* cvp, char* name, char* address);
void printSynResults(void);
/* Decimal string to int. */
int str2int(char *str);
/* Append one instruction to CODE. */
void GEN(FunctionCode f, int l, int a);
/* Statement and expression parsing (recursive descent). */
void Sentence(int level);
void EBNFExpression(int level);
/* Index of id in cvptable, or p_table when it is not present. */
int getIndexCVP(char* id);
void Term(int level);
void Factor(int level);
/* Global Variables */
// The built-in PL/0 program to analyze, one source line per entry
#define statementNum 6
char* statements[statementNum] = {
"const a=10",
"var b,c",
"procedure p",
"begin",
"c:=b+a",
"end"
};
// One P_Statement per entry of statements; filled in by initStatements()
P_Statement statementSet[statementNum];
// Lexical analysis state
// MAX is both the capacity of every table below and the length of one text entry
#define MAX 100
SYM array_SYM[MAX]; // token codes, in source order
int p_a_SYM = 0; // next free slot in array_SYM
char array_Id[MAX][MAX]; // text of each identifier token, in source order
int p_a_Id = 0; // next free slot in array_Id
char array_Num[MAX][MAX]; // text of each number token, in source order
int p_a_Num = 0; // next free slot in array_Num
// Scratch buffer for the token currently being assembled
char str[MAX] = ""; // characters of the current token
int p_str = 0; // next write position inside str
/* Syntax analysis state */
// Current token plus read cursors over the three lexer tables
SYM _currentSYMToken; // token code most recently fetched
int currentPointer = 0; // read cursor into array_SYM
char* _currentID; // identifier text most recently fetched
int currentPointerID = 0; // read cursor into array_Id
char* _currentNumber; // number text most recently fetched
int currentPointerNumber = 0; // read cursor into array_Num
int level = 0; // level passed to the outermost levelProcess() call, i.e. the main level
CVPTableItem cvptable[MAX]; // symbol table: every constant, variable and procedure declared in the program
int p_table = 0; // next free row of cvptable
int addrVar = 3; // address given to the next declared variable; the first one gets 3
// The following two are added together to form a procedure's address. The original author notes that their meaning is not known to him.
int indexW = 0;
int offset = 1;
ObjectInstruct CODE[MAX]; // generated intermediate code
int p_code = 0; // next free slot of CODE
OPRTYPE type_opr; // operation type of the additive operator being parsed
// Main
int main(void) {
puts("====================Initial=====================");
putsStatements(); // print which statements need to be analyzed
initStatements(); // initial all the statements to be analyzed with the struct P_Statement
// start lexical analysis
puts("================Lexical Analysis================");
int i;
for (i = 0; i < statementNum; i++) {
lexical_Analysis(statementSet[i]._statement, statementSet[i]._length);
}
array_SYM[p_a_SYM] = -1;// end with -1
// print the results of lexical analysis
printLexResults();
// Now let us start Syntax Analysis, just enjoy it
puts("================Syntax Analysis================");
_currentSYMToken = getNextSYMToken();
levelProcess(level);
//print the results of syntax analysis
printSynResults();
return EXIT_SUCCESS;
}
// Implementation
/* Print a heading, a blank line, and then every built-in statement on its own line. */
void putsStatements(void)
{
    int line = 0;

    puts("Analyzing the following statements: ");
    puts("");
    while (line < statementNum) {
        puts(statements[line]);
        line++;
    }
}
/* Bind statements[k] to statementSet[k] for every built-in statement. */
void initStatements(void)
{
    int k = 0;

    while (k < statementNum) {
        binding(&statementSet[k], statements[k]);
        k++;
    }
}
/*
 * Initialise *p_s with a private heap copy of statement.
 *
 * p_s        record to fill in
 * statement  NUL-terminated source line to copy
 *
 * The copy needs strlen(statement) + 1 bytes: the terminating NUL is copied
 * as well. _length is the length without the terminator and the scan
 * position starts at 0. The program exits if the allocation fails.
 */
void binding(P_Statement* p_s, char* statement)
{
    size_t len = strlen(statement);
    char *copy = malloc(len + 1);

    if (copy == NULL) {
        fprintf(stderr, "binding: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(copy, statement, len + 1);

    p_s->_statement = copy;
    p_s->_length = (int)len;
    p_s->_locate_pointer = 0;
}
// Lexical Analysis Main Program
void lexical_Analysis(char* a, int len_a)
{
int i;
for (i = 0; i < len_a; i++) {
// Only get one character each time
if (a[i]==' ')
continue;
// Finite Automation Method
if (isABC(&a[i])) // If the first character is letter
{
while (isABC(&a[i])||isNumber(&a[i]))
{
appendStr(str, &a[i], p_str);
p_str++;
i++;
if (i == len_a)
break;
}
i--;
SYM code = getSYM(str);
//either identifier or keyword
if (code == _ID)
{
array_SYM[p_a_SYM] = _ID;
p_a_SYM++;
strcpy(array_Id[p_a_Id], str);
p_a_Id++;
} else { // keyword or reserved word of Pl/0
array_SYM[p_a_SYM] = code;
p_a_SYM++;
}
// before next one, clean them
memset(str, 0, sizeof(str));
p_str = 0;
} else if (isNumber(&a[i])) // If the first character is number
{
while (isNumber(&a[i]))
{
appendStr(str, &a[i], p_str);
p_str++;
i++;
if (i == len_a)
break;
}
i--;
array_SYM[p_a_SYM] = _INT;
p_a_SYM++;
strcpy(array_Num[p_a_Num], str);
p_a_Num++;
// before next one, clean them
memset(str, 0, sizeof(str));
p_str = 0;
} else if (a[i] == '=') {
array_SYM[p_a_SYM] = _ASSIGN;
p_a_SYM++;
} else if (a[i] == ',') {
array_SYM[p_a_SYM] = _DH;
p_a_SYM++;
} else if (a[i] == ':') {
i++;
if (a[i] == '=')
{
array_SYM[p_a_SYM] = _MD;
p_a_SYM++;
}
} else if (a[i] == '+') {
array_SYM[p_a_SYM] = _PLUS;
p_a_SYM++;
}
}
// print the results of lexical analysis
//printLexResults();
}
void printLexResults(void)
{
int i;
puts("============Lexical Analysis Results================");
printf("SYM: [ ");
for (i = 0; i < p_a_SYM; i++)
{
switch(array_SYM[i])
{
case 0: printf("_CONST "); break;
case 1: printf("_VAR "); break;
case 2: printf("_procedure "); break;
case 3: printf("_begin "); break;
case 4: printf("_end "); break;
case 5: printf("_if "); break;
case 6: printf("_then "); break;
case 7: printf("_ID "); break;
case 8: printf("_INT "); break;
case 9: printf("_ASSIGN "); break;
case 10: printf("_PLUS "); break;
case 11: printf("_SUB "); break;
case 12: printf("_STAR "); break;
case 13: printf("_DIV "); break;
case 14: printf("_LESS "); break;
case 15: printf("_MORE "); break;
case 16: printf("_LESSEQ "); break;
case 17: printf("_MOREEQ "); break;
case 18: printf("_DH "); break;
case 19: printf("_MD "); break;
case 20: printf("_LEFT "); break;
case 21: printf("_RIGHT "); break;
case 22: printf("_JH "); break;
case 23: printf("_FH "); break;
}
}
printf("]\n");
printf("ID: [ ");
for (i = 0; i < p_a_Id; i++)
printf("%s ", array_Id[i]);
printf("]\n");
printf("NUM: [ ");
for (i = 0; i < p_a_Num; i++)
printf("%s ", array_Num[i]);
printf("]\n");
}
SYM getNextSYMToken()
{
return array_SYM[currentPointer++];
}
/* Return the identifier text at the read cursor of array_Id and advance the cursor. */
char* getNextID()
{
    char *entry = array_Id[currentPointerID];

    currentPointerID++;
    return entry;
}
/* Return the number text at the read cursor of array_Num and advance the cursor. */
char* getNextNumber()
{
    char *entry = array_Num[currentPointerNumber];

    currentPointerNumber++;
    return entry;
}
/*
 * Process one level: an optional constant declaration, an optional variable
 * declaration and an optional nested procedure, followed by the level's
 * statement. Emits INT before the statement and OPR 0 after it.
 *
 * The end-of-stream marker (-1) matches none of the three declaration
 * tokens, so no separate end-of-stream test is needed before them.
 */
void levelProcess(int level)
{
    int varsHere = 0;  // variables declared directly in this level

    if (_currentSYMToken == _CONST) {
        processConstant(level);
        _currentSYMToken = getNextSYMToken();
        if (_currentSYMToken == -1)
            return;  // input ends right after the constant: nothing is generated
    }

    if (_currentSYMToken == _VAR)  // variables follow the constant in our example
        varsHere = processVariable(level);

    if (_currentSYMToken == _procedure) {  // procedure follows the variables in our example
        _currentSYMToken = getNextSYMToken();  // the procedure's identifier
        processProcedure(level);
    }

    GEN(INT, 0, varsHere + 3);  // variable addresses start at 3, hence the +3
    Sentence(level);
    GEN(OPR, 0, 0);  // leave the data area
}
void processConstant(int level)
{
if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID) // a
if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ASSIGN) // =
if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_INT) // 10
if ((_currentID=getNextID())!=NULL) // get the identifier
if ((_currentNumber=getNextNumber())!=NULL) // get the number
{
strcpyCVPTable(&cvptable[p_table], _currentID, _currentNumber);
cvptable[p_table].kind = CON;
cvptable[p_table].level = level;
// Do not forget to add p_table
p_table++;
}
}
int processVariable(int level)
{
int count = 0; // record the number of variables
if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID)
if ((_currentID=getNextID())!=NULL) // get the identifier
{
count++;
char str[MAX];
sprintf(str, "%d", addrVar);
strcpyCVPTable(&cvptable[p_table], _currentID, str);
cvptable[p_table].kind = VAR;
cvptable[p_table].level = level;
addrVar++;
// Do not forget to add p_table
p_table++;
}
// Maybe we could meet such var a,b
while((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_DH)
{
if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID)
if ((_currentID=getNextID())!=NULL) // get the identifier
{
count++;
char str[MAX];
sprintf(str, "%d", addrVar);
strcpyCVPTable(&cvptable[p_table], _currentID, str);
cvptable[p_table].kind = VAR;
cvptable[p_table].level = level;
addrVar++;
// Do not forget to add p_table
p_table++;
}
}
return count;
}
void processProcedure(int level)
{
if (_currentSYMToken!=-1&&_currentSYMToken==_ID)
if ((_currentID=getNextID())!=NULL) // get the identifier of the procedure
{
char str[MAX];
int temp = indexW + offset;
sprintf(str, "%d", temp);
strcpyCVPTable(&cvptable[p_table], _currentID, str);
cvptable[p_table].kind = PRO;
cvptable[p_table].level = level;
// Do not forget to add p_table
p_table++;
}
// IMPORTANT PHASE: SUBLEVEL
_currentSYMToken = getNextSYMToken(); // in our example it should be begin
levelProcess(level + 1); // Recursive-descent Parsing
}
/*
 * Store private heap copies of name and address in *cvp.
 *
 * cvp      table row to fill in (only name and address are written)
 * name     NUL-terminated identifier text
 * address  NUL-terminated address text
 *
 * Each copy is allocated with room for the terminating NUL
 * (strlen + 1 bytes). The program exits if an allocation fails.
 */
void strcpyCVPTable(CVPTableItem* cvp, char* name, char* address)
{
    size_t nameSize = strlen(name) + 1;
    size_t addrSize = strlen(address) + 1;
    char *nameCopy = malloc(nameSize);
    char *addrCopy = malloc(addrSize);

    if (nameCopy == NULL || addrCopy == NULL) {
        free(nameCopy);
        free(addrCopy);
        fprintf(stderr, "strcpyCVPTable: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(nameCopy, name, nameSize);
    memcpy(addrCopy, address, addrSize);

    cvp->name = nameCopy;
    cvp->address = addrCopy;
}
void printSynResults(void)
{
int i;
puts("============Syntax Analysis Results================");
puts("The Table of Constant, Variable and Procedure");
puts("Name\tKind\tLevel\tAddress\t");
for (i = 0; i < p_table; i++)
{
//printf("%s\t%d\t%d\t%s\n", cvptable[i].name, cvptable[i].kind, cvptable[i].level, cvptable[i].address);
printf("%s\t", cvptable[i].name);
switch(cvptable[i].kind) {
case CON: printf("CON\t"); break;
case VAR: printf("VAR\t"); break;
case PRO: printf("PRO\t"); break;
}
printf("%d\t%s\n", cvptable[i].level, cvptable[i].address);
}
puts("");
puts("Intermediate Code");
puts("No.\tFunction Code\tLevel Difference\tDisplacement");
for (i = 0; i < p_code; i++)
{
printf("%d\t", i);
switch(CODE[i].f)
{
case LIT: printf("LIT\t"); break;
case INT: printf("INT\t"); break;
case LOD: printf("LOD\t"); break;
case STO: printf("STO\t"); break;
case CAL: printf("CAL\t"); break;
case JMP: printf("JMP\t"); break;
case JPC: printf("JPC\t"); break;
case OPR: printf("OPR\t"); break;
}
printf("\t%d\t\t\t%d\n", CODE[i].l, CODE[i].a);
}
}
/*
 * Convert the leading run of decimal digits of str to an int.
 *
 * Returns 0 when str is NULL, empty, or does not start with a digit.
 * Conversion stops at the first non-digit. No sign is accepted and overflow
 * is not detected.
 */
int str2int(char *str)
{
    int v = 0;

    if (str == NULL)
        return 0;

    // test each character BEFORE using it, so "" never reads past its terminator
    while (*str >= '0' && *str <= '9') {
        v = 10 * v + (*str - '0');
        str++;
    }
    return v;
}
/*
 * Append one object instruction (f, l, a) to CODE.
 *
 * CODE holds at most MAX instructions. When it is full the instruction is
 * reported on stderr and dropped instead of being written past the end of
 * the array.
 */
void GEN(FunctionCode f, int l, int a)
{
    if (p_code >= MAX) {
        fprintf(stderr, "GEN: code buffer full (%d instructions), instruction dropped\n", MAX);
        return;
    }
    CODE[p_code].f = f;
    CODE[p_code].l = l;
    CODE[p_code].a = a;
    p_code++;
}
void Sentence(int level)
{
// Here I just implement begin and Assignment i.e. _MD(:=)
if (_currentSYMToken != -1)
{
if (_currentSYMToken ==_begin)
{
_currentSYMToken = getNextSYMToken();
Sentence(level);// recurse with the sub-sentence of the sentence (i.e. nest), but they are in the same level
while(_currentSYMToken != -1)
{
if (_currentSYMToken == _end)
{
_currentSYMToken = getNextSYMToken(); // in our example, this is the end of SYM table. it should be -1
return;
}
}
} else if (_currentSYMToken == _ID) // Assignment
{
_currentID = getNextID(); // I omit the variables valid checking, but we sill need to find it in cvptable
int cvpIndex = getIndexCVP(_currentID); // Just find its index in cvptable
if((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_MD) // assignment symbol
{
_currentSYMToken = getNextSYMToken();
EBNFExpression(level);
GEN(STO, level-cvptable[cvpIndex].level, str2int(cvptable[cvpIndex].address));
return; // assignment over
}
}
}
}
void EBNFExpression(int level)
{
Term(level); // the level + and -
// Here we implement + and -
while((_currentSYMToken=getNextSYMToken())!=-1)
{
if (_currentSYMToken == _SUB || _currentSYMToken == _PLUS)
{
if (_currentSYMToken == _SUB)
type_opr = SUB;
else
type_opr = ADD;
_currentSYMToken = getNextSYMToken(); // next term
Term(level);
GEN(OPR, 0, type_opr);
}
}
}
/*
 * Return the index of the first cvptable row whose name equals id,
 * or p_table when there is no such row.
 */
int getIndexCVP(char* id)
{
    int pos = 0;

    while (pos < p_table && strcmp(cvptable[pos].name, id) != 0)
        pos++;
    return pos;
}
/*
 * Term -> Factor { Mulop Factor }
 * Only the leading Factor is parsed; the { Mulop Factor } part is not implemented.
 */
void Term(int level)
{
Factor(level); // the level of * and /
// we omit * and /
}
void Factor(int level)
{
// In fact, we omit '(' and ')', but it is easy to implement it by recursing with EBNFExpression
if(_currentSYMToken!=-1&&_currentSYMToken==_ID)
{
_currentID = getNextID();
int cvpIndex = getIndexCVP(_currentID); // Just find its index in cvptable
if(cvptable[cvpIndex].kind == VAR) // type checking
{
GEN(LOD, level-cvptable[cvpIndex].level, str2int(cvptable[cvpIndex].address));
} else if (cvptable[cvpIndex].kind == CON) {
GEN(LIT, 0, str2int(cvptable[cvpIndex].address));
}
}
}

实验结果：

0 0