+
95
-

回答

我们模仿以下sqlite数据库,因为他很小,只有一个c文件,功能少模仿简单。

800_auto

说明:整体来说sqlite将sql命令通过分词、解析和代码生成器最后生成字节码,在虚拟机上运行,直到它完成或返回结果、或遇到致命错误、或中断。

开始

主要开发工具vscode,采用xmake做c的构建工具,主要是比较简单和方便,可以跨平台使用,不了解的可以参考我的xmake构建工具的文章。 新建个项目,最终展示如下:

800_auto

先来仿照sqlite的命令行实现个简单的解释器,它会循环执行:读取用户输入-->执行命令-->输出结果-->继续等待用户输出。 关键代码如下:

int main(int argc, char **argv)
{
InputBuffer *input_buffer = new_input_buffer();
while (true) {
print_prompt();
read_input(input_buffer);

if (strcmp(input_buffer->buffer, ".exit") == 0) {
close_input_buffer(input_buffer);
exit(EXIT_SUCCESS);
} else {
printf("unkown command: '%s'.\n", input_buffer->buffer);
}
}
return 0;
}


其中InputBuffer为自定义的保存用户输入的缓冲区,读取一行数据后判断命令是否为".exit"命令,不是退出命令则打印不识别命令,是退出命令则退出,非常简单:

typedef struct {
char* buffer;
size_t buffer_length;
ssize_t input_length;
} InputBuffer;

int getline_my(char **buffer, int *length, FILE *fd)
{
int i = 0;
char ch;
char buf[MAX_LEN] = {0};

while ((ch = fgetc(fd)) != EOF && ch != '\n') {
if (MAX_LEN - 1 == i) {
break;
}
buf[i++] = ch;
}
*length = i;
buf[i] = '\0';
*buffer = (char *)malloc(sizeof(char) * (i + 1));
assert(buffer != NULL);
strncpy(*buffer, buf, i + 1);
return i;
}



void read_input(InputBuffer *input_buffer)
{
ssize_t bytes_read =
getline_my(&(input_buffer->buffer), &(input_buffer->buffer_length), stdin);

if (bytes_read <= 0) {
printf("Error reading input\n");
exit(EXIT_FAILURE);
}

input_buffer->input_length = bytes_read ;
input_buffer->buffer[bytes_read] = 0;
}


运行提示行:

F:\ccode\microdb-my\build\windows\x86\debug>microdb-my.exe

microdb > .test
unkown command: '.test'.
microdb > .help
unkown command: '.help'.
microdb > .exit

小小的改进版本

1 支持语句识别

上个版本的功能除了识别.exit退出外,没啥其他有用的功能,这个版本先添加对语句的支持,另外我们区分下点号开头的表示是元命令,否则就是正常的sql语句,需要我们解析执行的先不管怎么解析和执行。 第2版的main函数,内容如下:

int main(int argc, char **argv)
{
InputBuffer *input_buffer = new_input_buffer();
while (true) {
print_prompt();
read_input(input_buffer);
if (input_buffer->buffer[0] == '.') {
switch (do_meta_command(input_buffer)) {
case (META_COMMAND_SUCCESS):
continue;
case (META_COMMAND_UNRECOGNIZED_COMMAND):
printf("Unrecognized command '%s'\n", input_buffer->buffer);
continue;
}
} else {
Statement statement;
switch (prepare_statement(input_buffer, &statement)) {
case (PREPARE_SUCCESS):
break;
case (PREPARE_UNRECOGNIZED_STATEMENT):
printf("Unrecognized keyword at start of '%s'.\n",
input_buffer->buffer);
continue;
}

execute_statement(&statement);
printf("Executed.\n");
}
}
return 0;
}


将整个输入分成两个部分,以点号开头的表示元命令,不然就当成sql语句进行处理, 解析好语句后,就执行,执行只是打印个输出内容:

F:\ccode\microdb-my\build\windows\x86\debug>microdb-my.exe
microdb > .test
Unrecognized command '.test'
microdb > .help
Unrecognized command '.help'
microdb > insert a
This is where we would do an insert.
Executed.
microdb > select a
This is where we would do a select.
Executed.
microdb > .exit

关于语句和解析语句的定义如下:

typedef enum {
META_COMMAND_SUCCESS,
META_COMMAND_UNRECOGNIZED_COMMAND
} MetaCommandResult;
typedef enum { STATEMENT_INSERT, STATEMENT_SELECT } StatementType;
typedef enum { PREPARE_SUCCESS, PREPARE_UNRECOGNIZED_STATEMENT } PrepareResult;

typedef struct {
StatementType type;
} Statement;


MetaCommandResult do_meta_command(InputBuffer* input_buffer) {
if (strcmp(input_buffer->buffer, ".exit") == 0) {
close_input_buffer(input_buffer);
exit(EXIT_SUCCESS);
} else {
return META_COMMAND_UNRECOGNIZED_COMMAND;
}
}

PrepareResult prepare_statement(InputBuffer *input_buffer,
Statement *statement)
{
if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
statement->type = STATEMENT_INSERT;
return PREPARE_SUCCESS;
}
else if (strncmp(input_buffer->buffer, "select",6) == 0) {
statement->type = STATEMENT_SELECT;
return PREPARE_SUCCESS;
}
return PREPARE_UNRECOGNIZED_STATEMENT;
}

void execute_statement(Statement *statement)
{
switch (statement->type) {
case (STATEMENT_INSERT):
printf("This is where we would do an insert.\n");
break;
case (STATEMENT_SELECT):
printf("This is where we would do a select.\n");
break;
}
}

2 支持在内存中插入和查询语句

首先数据保存在内存中,采用硬编码的方式来保存用户信息,用户信息如下表:
800_auto

插入语句和查询语句的语法也进行了简化,插入语句如下:

insert 1 cstack foo@bar.com
select 

整体实现思路如下:

扩展解析功能,将插入的数据以行为基本单元保存在称为页的一块内存中。
每个页面有很多行组成,行紧凑排列。
整个系统会根据需要申请多个页面,所有的页面都保存在一个大数组里面。
对row的定义如下:

typedef struct {
uint32_t id;
char username[COLUMN_USERNAME_SIZE];
char email[COLUMN_EMAIL_SIZE];
} Row;

// 判断数据类型的长度,(Struct*)0是把null强转为Struct*指针,后获取属性,这里面只获取类型 所以不会报错
#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)

// 获取ID的属性的长度
const uint32_t ID_SIZE = size_of_attribute(Row, id);
// 获取用户名的长度
const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
// 获取邮箱的长度
const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
// ID偏移量
const uint32_t ID_OFFSET = 0;
// 用户名偏移量
const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
// 邮箱的偏移量
const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
// 整行长度
const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;
800_auto
列的长度和偏移量

下面两个函数比较关键,即序列化列和反序列化列,这里面序列化并没有把数据写入到磁盘,只是放入一块连续的平铺的内存中而已,不过这个却是可以方便持久化到磁盘上。

/*序列化行,将row信息放入连续内存行中*/
void serialize_row(Row* source, void* destination) {
memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
}

/*反序列化,将内存块行转成row*/
void deserialize_row(void* source, Row* destination) {
memcpy(&(destination->id), source + ID_OFFSET, ID_SIZE);
memcpy(&(destination->username), source + USERNAME_OFFSET, USERNAME_SIZE);
memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
}

mail), source + EMAIL_OFFSET, EMAIL_SIZE);
}
定义一个页面为4KB大小,然后定义最大只能保存100个页面,每个页面再按照每个用户总的字节数大小进行划分,具体如下:

// 总页数
#define TABLE_MAX_PAGES 100
// 每页的大小
const uint32_t PAGE_SIZE = 4096;
// 每页可以保存row的总量
const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
// 整个表可以保存的总的数据量
const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;

typedef struct {
uint32_t num_rows;
void* pages[TABLE_MAX_PAGES];
} Table;


行不跨页,每页会有些浪费空间. 定位行的算法也是常用在布隆过滤器的算法,如下:

void *row_slot(Table *table, uint32_t row_num)
{
uint32_t page_num = row_num / ROWS_PER_PAGE;
void *page = table->pages[page_num];
if (page == NULL) {
page = table->pages[page_num] = malloc(PAGE_SIZE);
}
uint32_t row_offset = row_num % ROWS_PER_PAGE;
uint32_t byte_offset = row_offset * ROW_SIZE;
return (char *)page + byte_offset;
}


通过:uint32_t page_num = row_num / ROWS_PER_PAGE; 来定位具体的页, 通过uint32_t row_offset = row_num % ROWS_PER_PAGE; 来定位具体的row,然后通过offset来定义在页面的具体偏移量。

uint32_t byte_offset = row_offset * ROW_SIZE;
return (char *)page + byte_offset;

插入和查询函数定义如下:

ExecuteResult execute_insert(Statement *statement, Table *table)
{
if (table->num_rows >= TABLE_MAX_ROWS) {
return EXECUTE_TABLE_FULL;
}
Row *row_to_insert = &(statement->row_to_insert);
serialize_row(row_to_insert, row_slot(table, table->num_rows));
table->num_rows += 1;
return EXECUTE_SUCCESS;
}

ExecuteResult execute_select(Statement *statement, Table *table)
{
Row row;
for (uint32_t i = 0; i < table->num_rows; i++) {
deserialize_row(row_slot(table, i), &row);
print_row(&row);
}
return EXECUTE_SUCCESS;
}


插入比较简单,关键是定位到具体的页面的偏移量,将row的内容持久化到内存块中。 查询是每次都是全部查询没有查询全部的功能。

第一个正式版本

代码一共320行,要定义为main.cpp,用c++编译器编译才可以,虽然大部分是c的语法,但是定义const外部产量原因,c编译器无法通过。 代码实现了内存一个固定表格的插入和查询功能,和一个交互界面,下一阶段是实现文件的持久化和B+树的存储功能。

#include <stdio.h>
#if defined(_MSC_VER)
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#endif

#include <stdint.h>
#include <string.h>
#include <malloc.h>
#include <cstdlib>

#ifndef true
#define true 1
#define false 0
#endif

#define EXIT_SUCCESS 0
#define MAX_LEN 1024
#pragma warning(disable : 4819)

#define COLUMN_USERNAME_SIZE 32
#define COLUMN_EMAIL_SIZE 255
#define TABLE_MAX_PAGES 100

/*元命令的解析结果*/
typedef enum {
META_COMMAND_SUCCESS,
META_COMMAND_UNRECOGNIZED_COMMAND
} MetaCommandResult;

/*插入和查询语句*/
typedef enum { STATEMENT_INSERT, STATEMENT_SELECT } StatementType;
/*解析命令结果*/
typedef enum { PREPARE_SUCCESS, PREPARE_UNRECOGNIZED_STATEMENT, PREPARE_NEGATIVE_ID, PREPARE_STRING_TOO_LONG, PREPARE_NEGATIVE_ID, PREPARE_SYNTAX_ERROR} PrepareResult;
/*执行结果*/
typedef enum { EXECUTE_SUCCESS, EXECUTE_TABLE_FULL } ExecuteResult;
// 行定义
typedef struct {
uint32_t id;
char username[COLUMN_USERNAME_SIZE + 1];
char email[COLUMN_EMAIL_SIZE + 1];
} Row;
// 表的定义
typedef struct {
uint32_t num_rows;
void *pages[TABLE_MAX_PAGES];
} Table;
// 语句定义
typedef struct {
StatementType type;
Row row_to_insert;
} Statement;
// 缓冲区定义
typedef struct {
char *buffer;
size_t buffer_length;
ssize_t input_length;
} InputBuffer;


#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)

const uint32_t ID_SIZE = size_of_attribute(Row, id);
const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
// 偏移量
const uint32_t ID_OFFSET = 0;
const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;


const uint32_t PAGE_SIZE = 4096;
const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;


InputBuffer *new_input_buffer()
{
InputBuffer *input_buffer = (InputBuffer *)malloc(sizeof(InputBuffer));
input_buffer->buffer = NULL;
input_buffer->buffer_length = 0;
input_buffer->input_length = 0;
return input_buffer;
}


int getline_my(char **buffer, size_t *length, FILE *fd)
{
int i = 0;
char ch;
char buf[MAX_LEN] = {0};

while ((ch = fgetc(fd)) != EOF && ch != '\n') {
if (MAX_LEN - 1 == i) {
break;
}
buf[i++] = ch;
}
*length = i;
buf[i] = '\0';
*buffer = (char *)malloc(sizeof(char) * (i + 1));
strncpy(*buffer, buf, i + 1);
return i;
}

void print_row(Row *row)
{
printf("(%d,%s,%s)\n", row->id, row->username, row->email);
}
// 从命令行读取一行命令
void read_input(InputBuffer *input_buffer)
{
ssize_t bytes_read =
getline_my(&(input_buffer->buffer), &(input_buffer->buffer_length), stdin);

if (bytes_read <= 0) {
printf("Error reading input\n");
exit(EXIT_FAILURE);
}

input_buffer->input_length = bytes_read ;
input_buffer->buffer[bytes_read] = 0;
}

void close_input_buffer(InputBuffer *input_buffer)
{
free(input_buffer->buffer);
free(input_buffer);
input_buffer = NULL;
}


///////////////////////////////////////////////////////////////////////////////
// 解析元命令
MetaCommandResult do_meta_command(InputBuffer *input_buffer)
{
if (strcmp(input_buffer->buffer, ".exit") == 0) {
close_input_buffer(input_buffer);
exit(EXIT_SUCCESS);
} else {
return META_COMMAND_UNRECOGNIZED_COMMAND;
}
}

// 解析插入语句
PrepareResult prepare_insert(InputBuffer *input_buffer, Statement *statement)
{
statement->type = STATEMENT_INSERT;

char *keyword = strtok(input_buffer->buffer, " ");
char *id_string = strtok(NULL, " ");
char *username = strtok(NULL, " ");
char *email = strtok(NULL, " ");

if (id_string == NULL || username == NULL || email == NULL) {
return PREPARE_SYNTAX_ERROR;
}

int id = atoi(id_string);
if (id < 0) {
return PREPARE_NEGATIVE_ID;
}
if (strlen(username) > COLUMN_USERNAME_SIZE) {
return PREPARE_STRING_TOO_LONG;
}
if (strlen(email) > COLUMN_EMAIL_SIZE) {
return PREPARE_STRING_TOO_LONG;
}

statement->row_to_insert.id = id;
strcpy(statement->row_to_insert.username, username);
strcpy(statement->row_to_insert.email, email);

return PREPARE_SUCCESS;
}

// 语句的解析
PrepareResult prepare_statement(InputBuffer *input_buffer,
Statement *statement)
{
if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
return prepare_insert(input_buffer, statement);
} else if (strncmp(input_buffer->buffer, "select", 6) == 0) {
statement->type = STATEMENT_SELECT;
return PREPARE_SUCCESS;
}

return PREPARE_UNRECOGNIZED_STATEMENT;
}

// 持久化row
void serialize_row(Row *source, void *destination)
{
memcpy((char *)destination + ID_OFFSET, &(source->id), ID_SIZE);
memcpy((char *)destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
memcpy((char *)destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
}
// 反持久化row
void deserialize_row(void *source, Row *destination)
{
memcpy(&(destination->id), (char *)source + ID_OFFSET, ID_SIZE);
memcpy(&(destination->username), (char *)source + USERNAME_OFFSET, USERNAME_SIZE);
memcpy(&(destination->email), (char *)source + EMAIL_OFFSET, EMAIL_SIZE);
}

//核心 row的定位
void *row_slot(Table *table, uint32_t row_num)
{
uint32_t page_num = row_num / ROWS_PER_PAGE;
void *page = table->pages[page_num];
if (page == NULL) {
page = table->pages[page_num] = malloc(PAGE_SIZE);
}
uint32_t row_offset = row_num % ROWS_PER_PAGE;
uint32_t byte_offset = row_offset * ROW_SIZE;
return (char *)page + byte_offset;
}

// 核心 定位row之后持久化
ExecuteResult execute_insert(Statement *statement, Table *table)
{
if (table->num_rows >= TABLE_MAX_ROWS) {
return EXECUTE_TABLE_FULL;
}
Row *row_to_insert = &(statement->row_to_insert);
serialize_row(row_to_insert, row_slot(table, table->num_rows));
table->num_rows += 1;
return EXECUTE_SUCCESS;
}
// 查询语句执行
ExecuteResult execute_select(Statement *statement, Table *table)
{
Row row;
for (uint32_t i = 0; i < table->num_rows; i++) {
deserialize_row(row_slot(table, i), &row);
print_row(&row);
}
return EXECUTE_SUCCESS;
}

ExecuteResult execute_statement(Statement *statement, Table *table)
{
switch (statement->type) {
case (STATEMENT_INSERT):
return execute_insert(statement, table);
case (STATEMENT_SELECT):
return execute_select(statement, table);
}
}


Table *new_table()
{
Table *table = (Table *)malloc(sizeof(Table));
table->num_rows = 0;
for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
table->pages[i] = NULL;
}
return table;
}

void free_table(Table *table)
{
for (int i = 0; table->pages[i]; i++) {
free(table->pages[i]);
}
free(table);
}

void print_prompt()
{
printf("microdb > ");
}

int main(int argc, char **argv)
{
InputBuffer *input_buffer = new_input_buffer();
Table *table = new_table();
while (true) {
print_prompt();
read_input(input_buffer);
if (input_buffer->buffer[0] == '.') {
switch (do_meta_command(input_buffer)) {
case (META_COMMAND_SUCCESS):
continue;
case (META_COMMAND_UNRECOGNIZED_COMMAND):
printf("Unrecognized command '%s'\n", input_buffer->buffer);
continue;
}
} else {
Statement statement;
switch (prepare_statement(input_buffer, &statement)) {
case (PREPARE_SUCCESS):
break;
case (PREPARE_STRING_TOO_LONG):
printf("String is too long.\n");
continue;
case (PREPARE_NEGATIVE_ID):
printf("ID must be positive.\n");
continue;
case (PREPARE_SYNTAX_ERROR):
printf("Syntax error. Could not parse statement.\n");
continue;
case (PREPARE_UNRECOGNIZED_STATEMENT):
printf("Unrecognized keyword at start of '%s'.\n",
input_buffer->buffer);
continue;
}
switch (execute_statement(&statement, table)) {
case EXECUTE_SUCCESS:
printf("Executed.\n");
break;
case EXECUTE_TABLE_FULL:
printf("Error: Table full.\n");
break;
}
}
}
return 0;
}


网友回复

我知道答案,我要回答