Home

Awesome

TinyCompiler

序言

  1. 项目概述

    本项目是基于flex,bison以及LLVM,使用c++11实现的类C语法编译器,使用flex结合yacc对源代码进行词法、语法分析;在语法分析阶段生成整个源代码相应的抽象语法树后,根据LLVM IR(Intermediate Representation)模块中定义的中间代码语法输出符合LLVM中间语言语法、机器无关的中间代码;最后,本项目通过调用LLVM Back ends模块的接口,根据本地指令集与操作系统架构,将中间代码编译成二进制目标代码。编译生成的目标代码之后可直接编译生成可执行文件,或与其他目标代码链接生成可执行文件。

    本项目解析的语法与是C语言的一个子集,但部分语法存在区别,这些将在最后的测试用例中具体说明。目前已支持的数据类型包括:

    • void
    • int
    • float
    • double
    • char
    • string
    • bool
    • 自定义结构体
    • 数组(包括多维数组)

    支持的主要语法包括:

    • 变量的声明、初始化(包括一维数组初始化,多维数组暂不支持初始化,只能逐个元素赋值使用)
    • 函数声明,函数调用(传递参数类型可以是任意已支持类型)
    • 外部函数声明和调用
    • 控制流语句if-else、for、while及任意层级的嵌套使用
    • 单行注释(#)
    • 二元运算符、赋值、函数参数的隐式类型转换
    • 全局变量的使用
    • ...

    以上提到的类型和语法均已通过编译成目标代码并生成二进制文件运行测试。

  2. 相关术语

    • LLVM:LLVM是一个自由软件项目,它是一种编译器基础设施,以C++写成。LLVM基于静态单赋值形式(SSA)的编译策略提供了完整编译系统的中间层,它会将中间语言(Intermediate form,IF)从编译器取出与最优化,最优化后的IF接着被转换及链接到目标平台的汇编语言。LLVM支持与语言无关的指令集架构及类型系统。每个在静态单赋值形式(SSA)的指令集代表着,每个变量(被称为具有类型的寄存器)仅被赋值一次,这简化了变量间相依性的分析。LLVM允许代码被静态的编译,包含在传统的GCC系统底下,或是类似JAVA等后期编译才将IF编译成机器码所使用的即时编译(JIT)技术。
    • flex:flex(快速词法分析产生器,英语:fast lexical analyzer generator)是一种词法分析程序。它是lex的开放源代码版本,以BSD许可证发布。通常与GNU bison一同运作,但是它本身不是GNU计划的一部分。
    • GNU bison:GNU bison是一个自由软件,用于自动生成语法分析器程序,实际上可用于所有常见的操作系统。Bison把LALR形式的上下文无关文法描述转换为可做语法分析的C或C++程序。在新近版本中,Bison增加了对GLR语法分析算法的支持。
  3. 主要源代码结构

    TinyCompiler
    ├── ASTNodes.h 抽象语法树节点的定义
    ├── CodeGen.cpp 抽象语法树生成中间代码
    ├── CodeGen.h
    ├── Makefile
    ├── ObjGen.cpp 中间代码生成目标代码
    ├── ObjGen.h
    ├── Readme.md
    ├── TypeSystem.cpp 类型系统实现
    ├── TypeSystem.h
    ├── grammar.y 语法分析yacc文件
    ├── main.cpp TinyCompiler程序入口
    ├── test.input 用于测试词法、语法、语义分析的源代码
    ├── testmain.cpp 用于测试链接目标文件的源代码
    ├── token.l 词法分析lex文件
    ├── tests 测试用例及结果图
    │   ├── testArray.input
    │   ├── testArray.png
    │   ├── testArrayAST.png
    │   ├── testBasic.input
    │   ├── testBasic.png
    │   ├── testBasicAST.png
    │   ├── testStruct.input
    │   ├── testStruct.png
    │   └── testStructAST.png
    └── utils.cpp 通用函数实现

  4. 运行环境

    本项目已在 OSX Sierra 12.1(64位) 以及 Ubuntu LTS 14.04(64位)下,使用LLVM 3.9平台进行测试,测试截图见最后一节。

    Windows平台可自行搭建LLVM环境后测试。

  5. 使用说明

    注: 本项目使用LLVM 3.9.0版本的接口,据测试不同版本可能无法直接编译

    • 编译TinyCompiler
    make
    
    • 使用TinyCompiler编译test.c文件,将目标代码输出到output.o
    cat test.c | compiler
    

    用g++链接output.o生成可执行文件

    g++ output.o -o test
    ./test
    

    使用test.input, testmain.cpp文件自动测试编译、链接

    make test
    make testlink
    
  6. 参考资料

    LLVM Language Reference Manual

    LLVM Tutorial

    gnuu - writing your own toy compiler


语法定义

本编译器实现的语法是类似标准C的语法,引入的几个区别包括:

基本语法

int func(int a, int b){
    if( a > b ){
        return func(b * 0.5)
    }else if( a == b ){
        return func(a * b)
    }else{
        return 0
    }
}

int main(int argc, string[1] argv){
    int i = 0
    for( ; i<argc; i=i+1){
        func(i, argc)
    }
    while( 1 ){}
    return 0
}

结构体使用

struct Point{
    int x
    int y
}
int func(struct Point p){
    return p.x
}
int test(){
    struct Point p
    p.x = 1
    p.y = 3
    func(p)
    return 0
}

数组使用

int testArray(){
    int[10] oneDim = [1,2,3,4]
    int[3][4] twoDim
    int i, j
    for(i=0; i<3; i=i+1){
        for(j=0; j<4; j=j+1){
            twoDim[i][j] = 0
        }
    }
    return 0
}

外部函数使用


extern int printf(string format)
extern int scanf(string format)

int testExtern(){
    string input
    scanf("%s", &input)
    printf("%d, %f, input = %s", 1, 0.1, input)
    return 0
}

词法分析


语法分析

img


语义分析


中间代码生成


目标代码生成


测试用例

  1. 基本语法

测试代码

extern int printf(string format)
extern int puts(string s)

int func(int a, int b){
    int res = 0
    if( a <= 1 ) {
        res = 1
    }else if( 1 ){
        res = func(a-1, b) + func(a-2, b)
    }else{
        res = func(b, a)
    }
    return res
}

int main(int argc, string[1] argv){
    int i
    argc = 5
    for( i = 1 ; i<argc; i=i+1){
        printf("i=%d, func=%d", i, func(i, argc))
        puts("")
    }

    return 0
}

抽象语法树

img

中间代码

; ModuleID = 'main'
source_filename = "main"

@string = private unnamed_addr constant [14 x i8] c"i=%d, func=%d\00"
@string.1 = private unnamed_addr constant [1 x i8] zeroinitializer

declare i32 @printf(i8*)

declare i32 @puts(i8*)

define i32 @func(i32 %a, i32 %b) {
entry:
  %0 = alloca i32
  store i32 %a, i32* %0
  %1 = alloca i32
  store i32 %b, i32* %1
  %2 = alloca i32
  store i32 0, i32* %2
  %arrayPtr = load i32, i32* %0
  %3 = load i32, i32* %0
  %cmptmp = icmp sle i32 %3, 1
  %4 = icmp ne i1 %cmptmp, false
  br i1 %4, label %then, label %else

then:                                             ; preds = %entry
  store i32 1, i32* %2
  br label %ifcont12

else:                                             ; preds = %entry
  br i1 true, label %then1, label %else8

then1:                                            ; preds = %else
  %arrayPtr2 = load i32, i32* %0
  %5 = load i32, i32* %0
  %subtmp = sub i32 %5, 1
  %arrayPtr3 = load i32, i32* %1
  %6 = load i32, i32* %1
  %calltmp = call i32 @func(i32 %subtmp, i32 %6)
  %arrayPtr4 = load i32, i32* %0
  %7 = load i32, i32* %0
  %subtmp5 = sub i32 %7, 2
  %arrayPtr6 = load i32, i32* %1
  %8 = load i32, i32* %1
  %calltmp7 = call i32 @func(i32 %subtmp5, i32 %8)
  %addtmp = add i32 %calltmp, %calltmp7
  store i32 %addtmp, i32* %2
  br label %ifcont

else8:                                            ; preds = %else
  %arrayPtr9 = load i32, i32* %1
  %9 = load i32, i32* %1
  %arrayPtr10 = load i32, i32* %0
  %10 = load i32, i32* %0
  %calltmp11 = call i32 @func(i32 %9, i32 %10)
  store i32 %calltmp11, i32* %2
  br label %ifcont

ifcont:                                           ; preds = %else8, %then1
  br label %ifcont12

ifcont12:                                         ; preds = %ifcont, %then
  %arrayPtr13 = load i32, i32* %2
  %11 = load i32, i32* %2
  ret i32 %11
}

define i32 @main(i32 %argc, i8** %argv) {
entry:
  %0 = alloca i32
  store i32 %argc, i32* %0
  %1 = alloca i8**
  store i8** %argv, i8*** %1
  %2 = alloca i32
  store i32 5, i32* %0
  %arrayPtr = load i32, i32* %2
  %3 = load i32, i32* %2
  %arrayPtr1 = load i32, i32* %0
  %4 = load i32, i32* %0
  %cmptmp = icmp ult i32 %3, %4
  %5 = icmp ne i1 %cmptmp, false
  store i32 1, i32* %2
  br i1 %5, label %forloop, label %forcont

forloop:                                          ; preds = %forloop, %entry
  %arrayPtr2 = load i32, i32* %2
  %6 = load i32, i32* %2
  %arrayPtr3 = load i32, i32* %2
  %7 = load i32, i32* %2
  %arrayPtr4 = load i32, i32* %0
  %8 = load i32, i32* %0
  %calltmp = call i32 @func(i32 %7, i32 %8)
  %calltmp5 = call i32 @printf([14 x i8]* @string, i32 %6, i32 %calltmp)
  %calltmp6 = call i32 @puts([1 x i8]* @string.1)
  %arrayPtr7 = load i32, i32* %2
  %9 = load i32, i32* %2
  %addtmp = add i32 %9, 1
  store i32 %addtmp, i32* %2
  %arrayPtr8 = load i32, i32* %2
  %10 = load i32, i32* %2
  %arrayPtr9 = load i32, i32* %0
  %11 = load i32, i32* %0
  %cmptmp10 = icmp ult i32 %10, %11
  %12 = icmp ne i1 %cmptmp10, false
  br i1 %12, label %forloop, label %forcont

forcont:                                          ; preds = %forloop, %entry
  ret i32 0
}

运行结果

img

  1. 结构体使用
struct Point{
    int x
    int y
}
int func(struct Point p){
    return p.x
}
int test(){
    struct Point p
    p.x = 1
    p.y = 3
    func(p)
    return 0
}

抽象语法树可视化

img

中间代码

; ModuleID = 'main'
source_filename = "main"

%Point = type { i32, i32 }

@string = private unnamed_addr constant [3 x i8] c"%s\00"
@string.1 = private unnamed_addr constant [8 x i8] c"%s = %d\00"

declare i32 @printf(i8*)

declare i32 @scanf(i8*)

define i32 @func(%Point %p) {
entry:
  %0 = alloca %Point
  store %Point %p, %Point* %0
  %structPtr = load %Point, %Point* %0, align 4
  %memberPtr = getelementptr inbounds %Point, %Point* %0, i32 0, i32 0
  %1 = load i32, i32* %memberPtr
  ret i32 %1
}

define i32 @main() {
entry:
  %0 = alloca %Point
  %arraytmp = alloca [32 x i8], i32 32
  %arrayPtr = load [32 x i8], [32 x i8]* %arraytmp
  %arrayPtr1 = getelementptr inbounds [32 x i8], [32 x i8]* %arraytmp, i32 0
  %calltmp = call i32 @scanf([3 x i8]* @string, [32 x i8]* %arrayPtr1)
  %arrayPtr2 = load [32 x i8], [32 x i8]* %arraytmp
  %arrayPtr3 = getelementptr inbounds [32 x i8], [32 x i8]* %arraytmp, i32 0
  %arrayPtr4 = load %Point, %Point* %0
  %1 = load %Point, %Point* %0
  %calltmp5 = call i32 @func(%Point %1)
  %calltmp6 = call i32 @printf([8 x i8]* @string.1, [32 x i8]* %arrayPtr3, i32 %calltmp5)
  ret i32 0
}

运行结果

img

  1. 数组使用
extern int printf(string str)
extern int puts(string str)

int main(){

    int[13] arr = [ 1,2,3,4,5,6,7,8,9,10,11,12 ]
    int[3][4] arr2

    int i
    int j
    
    for(i=0; i<3; i=i+1){
        for(j=0; j<4; j=j+1){
            arr2[i][j] = arr[i*4+j]
        }
    }

    for(i=0; i<3; i=i+1){
        for(j=0; j<4; j=j+1){
            printf("%d,", arr2[i][j])
        }
        puts("")
    }
    return 0
}

抽象语法树可视化

img

中间代码

; ModuleID = 'main'
source_filename = "main"

@string = private unnamed_addr constant [4 x i8] c"%d,\00"
@string.1 = private unnamed_addr constant [1 x i8] zeroinitializer

declare i32 @printf(i8*)

declare i32 @puts(i8*)

define i32 @main() {
entry:
  %arraytmp = alloca [13 x i32], i32 13
  %arrayPtr = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 0
  store i32 1, i32* %elementPtr, align 4
  %arrayPtr1 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr2 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 1
  store i32 2, i32* %elementPtr2, align 4
  %arrayPtr3 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr4 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 2
  store i32 3, i32* %elementPtr4, align 4
  %arrayPtr5 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr6 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 3
  store i32 4, i32* %elementPtr6, align 4
  %arrayPtr7 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr8 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 4
  store i32 5, i32* %elementPtr8, align 4
  %arrayPtr9 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr10 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 5
  store i32 6, i32* %elementPtr10, align 4
  %arrayPtr11 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr12 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 6
  store i32 7, i32* %elementPtr12, align 4
  %arrayPtr13 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr14 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 7
  store i32 8, i32* %elementPtr14, align 4
  %arrayPtr15 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr16 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 8
  store i32 9, i32* %elementPtr16, align 4
  %arrayPtr17 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr18 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 9
  store i32 10, i32* %elementPtr18, align 4
  %arrayPtr19 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr20 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 10
  store i32 11, i32* %elementPtr20, align 4
  %arrayPtr21 = load [13 x i32], [13 x i32]* %arraytmp
  %elementPtr22 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 11
  store i32 12, i32* %elementPtr22, align 4
  %arraytmp23 = alloca [12 x i32], i32 12
  %0 = alloca i32
  %1 = alloca i32
  store i32 0, i32* %0
  %arrayPtr24 = load i32, i32* %0
  %2 = load i32, i32* %0
  %cmptmp = icmp ult i32 %2, 3
  %3 = icmp ne i1 %cmptmp, false
  br i1 %3, label %forloop, label %forcont45

forloop:                                          ; preds = %forcont, %entry
  store i32 0, i32* %1
  %arrayPtr26 = load i32, i32* %1
  %4 = load i32, i32* %1
  %cmptmp27 = icmp ult i32 %4, 4
  %5 = icmp ne i1 %cmptmp27, false
  br i1 %5, label %forloop25, label %forcont

forloop25:                                        ; preds = %forloop25, %forloop
  %arrayPtr28 = load [12 x i32], [12 x i32]* %arraytmp23
  %arrayPtr29 = load i32, i32* %0
  %6 = load i32, i32* %0
  %multmp = mul i32 4, %6
  %arrayPtr30 = load i32, i32* %1
  %7 = load i32, i32* %1
  %addtmp = add i32 %multmp, %7
  %elementPtr31 = getelementptr inbounds [12 x i32], [12 x i32]* %arraytmp23, i64 0, i32 %addtmp
  %arrayPtr32 = load i32, i32* %0
  %8 = load i32, i32* %0
  %multmp33 = mul i32 %8, 4
  %arrayPtr34 = load i32, i32* %1
  %9 = load i32, i32* %1
  %addtmp35 = add i32 %multmp33, %9
  %elementPtr36 = getelementptr inbounds [13 x i32], [13 x i32]* %arraytmp, i64 0, i32 %addtmp35
  %10 = load i32, i32* %elementPtr36, align 4
  store i32 %10, i32* %elementPtr31, align 4
  %arrayPtr37 = load i32, i32* %1
  %11 = load i32, i32* %1
  %addtmp38 = add i32 %11, 1
  store i32 %addtmp38, i32* %1
  %arrayPtr39 = load i32, i32* %1
  %12 = load i32, i32* %1
  %cmptmp40 = icmp ult i32 %12, 4
  %13 = icmp ne i1 %cmptmp40, false
  br i1 %13, label %forloop25, label %forcont

forcont:                                          ; preds = %forloop25, %forloop
  %arrayPtr41 = load i32, i32* %0
  %14 = load i32, i32* %0
  %addtmp42 = add i32 %14, 1
  store i32 %addtmp42, i32* %0
  %arrayPtr43 = load i32, i32* %0
  %15 = load i32, i32* %0
  %cmptmp44 = icmp ult i32 %15, 3
  %16 = icmp ne i1 %cmptmp44, false
  br i1 %16, label %forloop, label %forcont45

forcont45:                                        ; preds = %forcont, %entry
  store i32 0, i32* %0
  %arrayPtr47 = load i32, i32* %0
  %17 = load i32, i32* %0
  %cmptmp48 = icmp ult i32 %17, 3
  %18 = icmp ne i1 %cmptmp48, false
  br i1 %18, label %forloop46, label %forcont67

forloop46:                                        ; preds = %forcont61, %forcont45
  store i32 0, i32* %1
  %arrayPtr50 = load i32, i32* %1
  %19 = load i32, i32* %1
  %cmptmp51 = icmp ult i32 %19, 4
  %20 = icmp ne i1 %cmptmp51, false
  br i1 %20, label %forloop49, label %forcont61

forloop49:                                        ; preds = %forloop49, %forloop46
  %arrayPtr52 = load i32, i32* %0
  %21 = load i32, i32* %0
  %multmp53 = mul i32 4, %21
  %arrayPtr54 = load i32, i32* %1
  %22 = load i32, i32* %1
  %addtmp55 = add i32 %multmp53, %22
  %elementPtr56 = getelementptr inbounds [12 x i32], [12 x i32]* %arraytmp23, i64 0, i32 %addtmp55
  %23 = load i32, i32* %elementPtr56, align 4
  %calltmp = call i32 @printf([4 x i8]* @string, i32 %23)
  %arrayPtr57 = load i32, i32* %1
  %24 = load i32, i32* %1
  %addtmp58 = add i32 %24, 1
  store i32 %addtmp58, i32* %1
  %arrayPtr59 = load i32, i32* %1
  %25 = load i32, i32* %1
  %cmptmp60 = icmp ult i32 %25, 4
  %26 = icmp ne i1 %cmptmp60, false
  br i1 %26, label %forloop49, label %forcont61

forcont61:                                        ; preds = %forloop49, %forloop46
  %calltmp62 = call i32 @puts([1 x i8]* @string.1)
  %arrayPtr63 = load i32, i32* %0
  %27 = load i32, i32* %0
  %addtmp64 = add i32 %27, 1
  store i32 %addtmp64, i32* %0
  %arrayPtr65 = load i32, i32* %0
  %28 = load i32, i32* %0
  %cmptmp66 = icmp ult i32 %28, 3
  %29 = icmp ne i1 %cmptmp66, false
  br i1 %29, label %forloop46, label %forcont67

forcont67:                                        ; preds = %forcont61, %forcont45
  ret i32 0
}

运行结果

img