// Files
// serein-flow/Serein.Script/SereinScriptLexer.cs
// 2025-07-16 16:16:19 +08:00
//
// 543 lines
// 16 KiB
// C#
// Raw Blame History
//
// This file contains ambiguous Unicode characters
// This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using Newtonsoft.Json.Linq;
using System.Net.Http.Headers;
using System.Runtime.CompilerServices;
using System.Xml.Linq;
using static System.Net.Mime.MediaTypeNames;
namespace Serein.Script
{
/// <summary>
/// Token kinds produced by the Serein script lexer.
/// </summary>
internal enum TokenType
{
/// <summary>
/// Unexpected value. As the first member it is also the default value of the enum,
/// and the lexer uses it for the <c>null</c> literal.
/// </summary>
Null,
/// <summary>
/// Identifier (e.g. a variable name).
/// </summary>
Identifier,
/// <summary>
/// Boolean literal.
/// </summary>
Boolean,
/// <summary>
/// int integer literal.
/// </summary>
NumberInt,
/// <summary>
/// long integer literal.
/// </summary>
NumberLong,
/// <summary>
/// float floating-point literal.
/// </summary>
NumberFloat,
/// <summary>
/// double floating-point literal.
/// </summary>
NumberDouble,
/// <summary>
/// String literal.
/// </summary>
String,
/// <summary>
/// Char literal.
/// </summary>
Char,
/// <summary>
/// Interpolated string.
/// </summary>
InterpolatedString,
/// <summary>
/// Keyword.
/// </summary>
Keyword,
/// <summary>
/// Operator.
/// </summary>
Operator,
/// <summary>
/// Left parenthesis "(".
/// </summary>
ParenthesisLeft,
/// <summary>
/// Right parenthesis ")".
/// </summary>
ParenthesisRight,
/// <summary>
/// Left square bracket "[".
/// </summary>
SquareBracketsLeft,
/// <summary>
/// Right square bracket "]".
/// </summary>
SquareBracketsRight,
/// <summary>
/// Left brace "{".
/// </summary>
BraceLeft,
/// <summary>
/// Right brace "}".
/// </summary>
BraceRight,
/// <summary>
/// Dot ".".
/// </summary>
Dot,
/// <summary>
/// Comma ",".
/// </summary>
Comma,
/// <summary>
/// Semicolon ";".
/// </summary>
Semicolon,
// Line comment (member currently disabled; kept as a plain comment so that
// its description does not attach to the EOF member below).
// RowComment,
/// <summary>
/// End of input: lexing is complete.
/// </summary>
EOF
}
/// <summary>
/// A single token produced by the Serein script lexer.
/// </summary>
internal ref struct Token
{
    /// <summary>Kind of the token.</summary>
    public TokenType Type { get; }

    /// <summary>Text value carried by the token.</summary>
    public string Value { get; }

    /// <summary>Row number recorded by the lexer for this token.</summary>
    public int Row { get; set; }

    /// <summary>Source line text recorded by the lexer for this token.</summary>
    public string Code { get; set; }

    /// <summary>Position in the source text recorded by the lexer for this token.</summary>
    public int StartIndex { get; set; }

    /// <summary>Length recorded by the lexer for this token.</summary>
    public int Length { get; set; }

    /// <summary>
    /// Creates a token of the given kind and value; the positional properties keep their defaults.
    /// </summary>
    /// <param name="type">Kind of the token.</param>
    /// <param name="value">Text value of the token.</param>
    internal Token(TokenType type, string value) => (Type, Value) = (type, value);

    /// <summary>
    /// Returns a diagnostic description containing the row, the type and the value.
    /// </summary>
    public override string ToString() =>
        $"token in {Row} row, type is \"{Type}\", value is \"{Value}\".";
}
/// <summary>
/// Serein脚本词法分析器
/// </summary>
internal ref struct SereinScriptLexer
{
/// <summary>Source text being tokenized.</summary>
private readonly ReadOnlySpan<char> _input;

/// <summary>Index of the next character to read from <see cref="_input"/>.</summary>
private int _index;

/// <summary>Zero-based row counter, incremented for every line feed that is skipped.</summary>
private int _row;

/// <summary>
/// Reserved words; an identifier matching one of these is reported as a keyword
/// so that it cannot be declared as a variable.
/// The table is constant, so it is shared by all lexer instances instead of being
/// re-allocated every time a lexer is constructed.
/// </summary>
private static readonly string[] _keywords = [
    "let",
    "func",
    "if",
    "else",
    "return",
    "while",
    "new",
    "class",
];
/// <summary>
/// Creates a lexer positioned at the start of <paramref name="input"/>.
/// </summary>
/// <param name="input">Script source text to tokenize.</param>
internal SereinScriptLexer(string input)
{
    ReadOnlySpan<char> source = input.AsSpan();
    _input = source;
    _index = 0;
}
/// <summary>
/// Looks ahead <paramref name="count"/> tokens without consuming any input.
/// </summary>
/// <param name="count">
/// How many tokens to look ahead; 1 returns the token that the next
/// <c>NextToken</c> call would produce. 0 returns a default token.
/// </param>
/// <returns>The token found <paramref name="count"/> positions ahead.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
internal Token PeekToken(int count = 1)
{
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), count, "Peek count must not be negative.");
    }

    // Remember the current position so the look-ahead leaves no trace.
    int savedIndex = _index;
    int savedRow = _row;
    try
    {
        Token peeked = new Token();
        for (var i = 0; i < count; i++)
        {
            peeked = NextToken();
        }
        return peeked;
    }
    finally
    {
        // Restore the position even when scanning throws, so that a failed
        // peek does not leave the lexer advanced into the input.
        _index = savedIndex;
        _row = savedRow;
    }
}
/// <summary>
/// Rewinds the lexer to the beginning of the input (row 0, index 0).
/// </summary>
public void Reset() => (_row, _index) = (0, 0);
/// <summary>
/// Repositions the lexer using the row and start index stored in a token.
/// </summary>
/// <param name="token">Token whose <c>Row</c> and <c>StartIndex</c> become the lexer position.</param>
public void SetToken(Token token)
{
    int row = token.Row;
    int startIndex = token.StartIndex;
    _row = row;
    _index = startIndex;
}
/// <summary>
/// Scans the next token at the current position and advances past it.
/// Whitespace and <c>//</c> line comments are skipped first; the row counter is
/// incremented for every line feed that is skipped.
/// </summary>
/// <returns>The next token, or an <see cref="TokenType.EOF"/> token at end of input.</returns>
/// <exception cref="Exception">
/// A malformed char literal, an unterminated string literal or an unsupported character was found.
/// </exception>
internal Token NextToken()
{
    // Skip whitespace and line comments iteratively (no recursion per comment line).
    while (true)
    {
        while (_index < _input.Length && char.IsWhiteSpace(_input[_index]))
        {
            if (_input[_index] == '\n')
            {
                _row++;
            }
            _index++;
        }

        if (_index >= _input.Length)
        {
            return new Token(TokenType.EOF, string.Empty); // end of program
        }

        // The bounds check keeps a trailing '/' from reading past the end of the input.
        bool isLineComment = _input[_index] == '/'
            && _index + 1 < _input.Length
            && _input[_index + 1] == '/';
        if (!isLineComment)
        {
            break;
        }

        // Consume up to (not including) the line feed; the whitespace loop counts the row.
        while (_index < _input.Length && _input[_index] != '\n')
        {
            _index++;
        }
    }

    char currentChar = _input[_index];

    // String literal.
    if (currentChar == '"')
    {
        return ReadString();
    }

    // Char literal: exactly one character between two single quotes.
    if (currentChar == '\'')
    {
        if (_index + 2 < _input.Length && _input[_index + 2] == '\'')
        {
            return ReadChar();
        }
        throw new Exception($"not is char: {currentChar},in Line.{_row}.");
    }

    // Numeric literal.
    if (char.IsDigit(currentChar))
    {
        return ReadNumber();
    }

    // Identifier, keyword, or one of the literals null / true / false.
    if (char.IsLetter(currentChar))
    {
        return ReadWord();
    }

    // Punctuation and operators.
    bool nextIsEquals = _index + 1 < _input.Length && _input[_index + 1] == '=';
    switch (currentChar)
    {
        case '(': return CreateToken(TokenType.ParenthesisLeft, "(");
        case ')': return CreateToken(TokenType.ParenthesisRight, ")");
        case '[': return CreateToken(TokenType.SquareBracketsLeft, "[");
        case ']': return CreateToken(TokenType.SquareBracketsRight, "]");
        case '{': return CreateToken(TokenType.BraceLeft, "{");
        case '}': return CreateToken(TokenType.BraceRight, "}");
        case ',': return CreateToken(TokenType.Comma, ",");
        case ';': return CreateToken(TokenType.Semicolon, ";");
        case '.': return CreateToken(TokenType.Dot, ".");
        case '+':
        case '-':
        case '*':
        case '/':
            return CreateToken(TokenType.Operator, currentChar.ToString());
        case '>':
            return CreateToken(TokenType.Operator, nextIsEquals ? ">=" : ">");
        case '<':
            return CreateToken(TokenType.Operator, nextIsEquals ? "<=" : "<");
        case '=':
            return CreateToken(TokenType.Operator, nextIsEquals ? "==" : "=");
        case '!':
            if (nextIsEquals)
            {
                return CreateToken(TokenType.Operator, "!=");
            }
            break; // a lone '!' is not supported and is reported below
    }

    throw new Exception("Unexpected character: " + currentChar);
}

/// <summary>
/// Scans a numeric literal starting at the current position: digits, at most one dot,
/// and an optional single type suffix (f, d or l, either case) that ends the literal.
/// </summary>
/// <returns>A number token whose value is the literal text including any suffix.</returns>
private Token ReadNumber()
{
    int end = _index;
    bool hasDot = false;
    while (end < _input.Length)
    {
        char ch = _input[end];
        if (char.IsDigit(ch))
        {
            end++;
        }
        else if (ch == '.' && !hasDot)
        {
            hasDot = true;
            end++;
        }
        else if (ch is 'f' or 'F' or 'd' or 'D' or 'l' or 'L')
        {
            end++;
            break; // nothing may follow the suffix
        }
        else
        {
            break;
        }
    }

    // _index is left untouched here; only CreateToken advances it.
    string raw = _input.Slice(_index, end - _index).ToString();

    // The suffix decides first, so "1f" / "1d" are float / double even without a dot.
    // An 'l' suffix on a literal containing a dot is still treated as double.
    TokenType type = raw[^1] switch
    {
        'f' or 'F' => TokenType.NumberFloat,
        'd' or 'D' => TokenType.NumberDouble,
        'l' or 'L' when !hasDot => TokenType.NumberLong,
        _ when hasDot => TokenType.NumberDouble, // decimals default to double
        // Plain integers: int when the value fits, otherwise long
        // (values too large even for long are also reported as long).
        _ => long.TryParse(raw, out var parsed) && parsed >= int.MinValue && parsed <= int.MaxValue
            ? TokenType.NumberInt
            : TokenType.NumberLong,
    };
    return CreateToken(type, raw);
}

/// <summary>
/// Scans a word (letters, digits and underscores) starting at the current position and
/// classifies it as the null literal, a boolean literal, a keyword or an identifier.
/// The whole word is compared, so names such as <c>nullable</c> or <c>trueValue</c>
/// are identifiers rather than a literal followed by another token.
/// </summary>
/// <returns>The token for the scanned word.</returns>
private Token ReadWord()
{
    int end = _index;
    while (end < _input.Length && (char.IsLetterOrDigit(_input[end]) || _input[end] == '_'))
    {
        end++;
    }

    // _index is left untouched here; only CreateToken advances it.
    string word = _input.Slice(_index, end - _index).ToString();
    TokenType type = word switch
    {
        "null" => TokenType.Null,
        "true" or "false" => TokenType.Boolean,
        _ => _keywords.Contains(word) ? TokenType.Keyword : TokenType.Identifier,
    };
    return CreateToken(type, word);
}
/// <summary>
/// Builds a token located at the current position and then advances the
/// position by the length of <paramref name="value"/>.
/// </summary>
/// <param name="tokenType">Kind of the token to create.</param>
/// <param name="value">Token text; its length is the amount the lexer advances.</param>
/// <returns>The token, stamped with the current row, start index, length and source line.</returns>
private Token CreateToken(TokenType tokenType, string value)
{
    int consumed = value.Length;
    string lineText = GetLine(_row).ToString();

    Token created = new Token(tokenType, value);
    created.Row = _row;
    created.StartIndex = _index;
    created.Length = consumed;
    created.Code = lineText;

    _index += consumed;
    return created;
}
/// <summary>
/// Reads a double-quoted string literal whose opening quote is at the current position
/// and advances past its closing quote.
/// </summary>
/// <returns>
/// A string token whose value is the raw text between the quotes (escape sequences are
/// kept as written). <c>StartIndex</c> is the position of the opening quote, so handing
/// the token to <c>SetToken</c> lets the literal be scanned again. <c>Length</c> is the
/// length of the value, and <c>Row</c> is the row on which the literal starts.
/// </returns>
/// <exception cref="Exception">The closing quote is missing.</exception>
private Token ReadString()
{
    int quoteIndex = _index;          // opening quote
    int contentStart = quoteIndex + 1;
    int cursor = contentStart;

    while (cursor < _input.Length && _input[cursor] != '"')
    {
        // An escaped quote or backslash must not terminate the literal: skip both characters.
        if (_input[cursor] == '\\'
            && cursor + 1 < _input.Length
            && (_input[cursor + 1] == '"' || _input[cursor + 1] == '\\'))
        {
            cursor++;
        }
        cursor++;
    }

    if (cursor >= _input.Length) throw new Exception("Unterminated string literal");

    string value = _input.Slice(contentStart, cursor - contentStart).ToString();
    var token = new Token(TokenType.String, value)
    {
        Row = _row,
        StartIndex = quoteIndex,
        Length = value.Length,
        Code = GetLine(_row).ToString(),
    };

    // Keep the row counter in step when the literal spans several lines.
    for (int i = contentStart; i < cursor; i++)
    {
        if (_input[i] == '\n')
        {
            _row++;
        }
    }

    _index = cursor + 1; // continue after the closing quote
    return token;
}
/// <summary>
/// Reads a single-character literal of the form <c>'x'</c> whose opening quote is at the
/// current position and advances past its closing quote. The caller is expected to have
/// verified that the closing quote is present two characters after the opening one.
/// </summary>
/// <returns>
/// A char token whose value is the one character between the quotes. <c>StartIndex</c>
/// is the position of the opening quote, so handing the token to <c>SetToken</c> lets
/// the literal be scanned again.
/// </returns>
private Token ReadChar()
{
    int quoteIndex = _index; // opening quote
    string charValue = _input.Slice(quoteIndex + 1, 1).ToString();

    var token = new Token(TokenType.Char, charValue)
    {
        Row = _row,
        StartIndex = quoteIndex,
        Length = charValue.Length,
        Code = GetLine(_row).ToString(),
    };

    _index = quoteIndex + 3; // opening quote + character + closing quote
    return token;
}
/// <summary>
/// Returns the text of the given zero-based line of the input, without its line feed.
/// </summary>
/// <param name="lineNumber">Zero-based line number.</param>
/// <returns>
/// The line's text (a carriage return before the line feed, if any, is kept),
/// or an empty span when the input has no such line.
/// </returns>
private ReadOnlySpan<char> GetLine(int lineNumber)
{
    ReadOnlySpan<char> text = _input;
    int line = 0;
    int lineStart = 0;
    for (int i = 0; i < text.Length; i++)
    {
        if (text[i] != '\n')
        {
            continue;
        }
        if (line == lineNumber)
        {
            return text.Slice(lineStart, i - lineStart);
        }
        line++;
        lineStart = i + 1; // the next line begins after the line feed
    }

    // The final line has no terminating line feed, so the loop above never returns it.
    if (line == lineNumber)
    {
        return text.Slice(lineStart);
    }

    return ReadOnlySpan<char>.Empty;
}
/// <summary>
/// Gets the lexer's current index into the input.
/// </summary>
/// <returns>The index of the next character to be read.</returns>
public int GetIndex() => _index;
/// <summary>
/// Returns the source text between <paramref name="index"/> and the lexer's current position.
/// </summary>
/// <param name="index">Start position of the text to return.</param>
/// <returns>The input characters from <paramref name="index"/> up to, but not including, the current index.</returns>
public string GetCoreContent(int index)
{
    int length = _index - index;
    return _input.Slice(index, length).ToString();
}
}
}