Add input tokenization

This commit is contained in:
Andrew Cooper
2022-02-07 22:43:31 +11:00
parent 215af218e5
commit 25c8dad512
7 changed files with 173 additions and 3 deletions

View File

@@ -0,0 +1,8 @@
<Project>
<!-- Build properties declared here: nullable reference type checking is enabled and the C# language version is set to 10.0. -->
<PropertyGroup>
<Nullable>enable</Nullable>
<LangVersion>10.0</LangVersion>
</PropertyGroup>
</Project>

View File

@@ -2,12 +2,11 @@
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="FluentAssertions" Version="6.4.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.11.0" />
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.3">

View File

@@ -0,0 +1,33 @@
using FluentAssertions;
using Xunit;
namespace Games.Common.IO
{
    /// <summary>
    /// Checks that <c>Tokenizer.ParseTokens</c> splits a raw input string into the
    /// expected sequence of tokens.
    /// </summary>
    public class TokenizerTests
    {
        /// <summary>
        /// Tokenizes <paramref name="input"/> and verifies the resulting tokens match
        /// <paramref name="expected"/> element by element, in order.
        /// </summary>
        /// <param name="input">The raw text handed to the tokenizer.</param>
        /// <param name="expected">The tokens the tokenizer must produce, in order.</param>
        [Theory]
        [MemberData(nameof(TokenizerTestCases))]
        public void ParseTokens_SplitsStringIntoExpectedTokens(string input, string[] expected)
        {
            var result = Tokenizer.ParseTokens(input);

            // Equal() compares the sequences positionally. BeEquivalentTo() ignores element
            // order by default, which would let a tokenizer that emits the right tokens in
            // the wrong order pass unnoticed.
            result.Should().Equal(expected);
        }

        /// <summary>
        /// Input strings paired with the tokens they are expected to produce.
        /// </summary>
        public static TheoryData<string, string[]> TokenizerTestCases() => new()
        {
            // Empty input still yields a single empty token.
            { "", new[] { "" } },
            // Case is preserved.
            { "aBc", new[] { "aBc" } },
            // Surrounding whitespace is dropped from unquoted tokens.
            { " Foo ", new[] { "Foo" } },
            // Whitespace inside quotes is kept; whitespace outside them is not.
            { " \" Foo \" ", new[] { " Foo " } },
            // An unterminated quote runs to the end of the input.
            { " \" Foo ", new[] { " Foo " } },
            // Text following a closing quote is discarded up to the next separator.
            { "\"\"abc", new[] { "" } },
            // A quote that does not start the token is an ordinary character.
            { "a\"\"bc", new[] { "a\"\"bc" } },
            { "\"\"", new[] { "" } },
            // A lone separator delimits two empty tokens.
            { ",", new[] { "", "" } },
            { " foo ,bar", new[] { "foo", "bar" } },
            { "\"\"bc,de", new[] { "", "de" } },
            { "a\"b,\" c,d\"e, f ,,g", new[] { "a\"b", " c,d", "f", "", "g" } }
        };
    }
}

View File

@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- NOTE(review): two TargetFramework entries are listed below; this looks like the removed (netstandard2.0) and added (netstandard2.1) lines of a diff shown together - confirm the actual project file contains only one. -->
<TargetFramework>netstandard2.0</TargetFramework>
<TargetFramework>netstandard2.1</TargetFramework>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,37 @@
using System.Text;
namespace Games.Common.IO
{
    /// <summary>
    /// Collects the characters of a single token. The string form of a plain token
    /// omits any run of whitespace at its end; a quoted token keeps every character
    /// that was appended.
    /// </summary>
    internal class Token
    {
        /// <summary>Every character appended so far, including trailing whitespace.</summary>
        protected readonly StringBuilder _builder = new StringBuilder();

        /// <summary>Number of consecutive whitespace characters at the end of <see cref="_builder"/>.</summary>
        private int _trailingWhiteSpaceCount;

        // Instances are only obtainable through the Create/CreateQuoted factories.
        private Token()
        {
        }

        /// <summary>Creates an empty token whose trailing whitespace is trimmed on output.</summary>
        public static Token Create() => new Token();

        /// <summary>Creates an empty token that is returned verbatim on output.</summary>
        public static Token CreateQuoted() => new QuotedToken();

        /// <summary>Converts a token to its string form via <see cref="ToString"/>.</summary>
        public static implicit operator string(Token token) => token.ToString();

        /// <summary>
        /// Adds <paramref name="character"/> to the token and updates the trailing-whitespace count.
        /// </summary>
        /// <param name="character">The character to add.</param>
        /// <returns>This token, so calls can be chained.</returns>
        public Token Append(char character)
        {
            _builder.Append(character);

            if (char.IsWhiteSpace(character))
            {
                _trailingWhiteSpaceCount++;
            }
            else
            {
                // A non-whitespace character ends any whitespace run seen so far.
                _trailingWhiteSpaceCount = 0;
            }

            return this;
        }

        /// <summary>Returns the appended characters without the trailing whitespace run.</summary>
        public override string ToString()
        {
            var trimmedLength = _builder.Length - _trailingWhiteSpaceCount;
            return _builder.ToString(0, trimmedLength);
        }

        /// <summary>A token whose string form keeps all appended characters.</summary>
        internal class QuotedToken : Token
        {
            /// <summary>Returns every appended character, trailing whitespace included.</summary>
            public override string ToString() => _builder.ToString();
        }
    }
}

View File

@@ -0,0 +1,90 @@
using System;
using System.Collections.Generic;
namespace Games.Common.IO
{
    /// <summary>
    /// Splits an input string into comma-separated tokens. A token that starts with a
    /// double quote runs to the matching closing quote (or the end of the input) and may
    /// contain separators; anything between the closing quote and the next separator is
    /// dropped. Whitespace before a token is skipped.
    /// </summary>
    internal class Tokenizer
    {
        private const char Quote = '"';
        private const char Separator = ',';

        /// <summary>The characters of the input that have not been consumed yet.</summary>
        private readonly Queue<char> _characters;

        private Tokenizer(string input)
        {
            _characters = new Queue<char>(input);
        }

        /// <summary>
        /// Splits <paramref name="input"/> into its tokens.
        /// </summary>
        /// <param name="input">The text to tokenize.</param>
        /// <returns>The tokens in input order; there is always at least one.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public static IEnumerable<string> ParseTokens(string input)
        {
            // The null check lives here, outside the iterator, so it fires at call time.
            var tokenizer = new Tokenizer(input ?? throw new ArgumentNullException(nameof(input)));
            return tokenizer.ParseTokens();
        }

        // Emits tokens one at a time until Consume reports the input is exhausted.
        private IEnumerable<string> ParseTokens()
        {
            var finished = false;
            while (!finished)
            {
                var (token, isLastToken) = Consume(_characters);
                yield return token;
                finished = isLastToken;
            }
        }

        /// <summary>
        /// Reads characters from <paramref name="characters"/> until one token is complete.
        /// </summary>
        /// <param name="characters">The queue to dequeue characters from.</param>
        /// <returns>
        /// The token that was read, and <c>true</c> when the queue ran out before a
        /// separator ended the token (i.e. this is the last token).
        /// </returns>
        public (Token, bool) Consume(Queue<char> characters)
        {
            Token current = Token.Create();
            ITokenizerState state = ITokenizerState.LookForStartOfToken;

            while (characters.TryDequeue(out char next))
            {
                var step = state.Consume(next, current);
                state = step.Item1;
                current = step.Item2;

                if (state is AtEndOfTokenState)
                {
                    return (current, false);
                }
            }

            return (current, true);
        }

        /// <summary>One state of the per-token state machine driven by <see cref="Consume"/>.</summary>
        private interface ITokenizerState
        {
            /// <summary>The state every token starts in.</summary>
            public static ITokenizerState LookForStartOfToken { get; } = new LookForStartOfTokenState();

            /// <summary>
            /// Handles one character and returns the state to continue in together with
            /// the token to continue building.
            /// </summary>
            (ITokenizerState, Token) Consume(char character, Token token);
        }

        /// <summary>Before the first significant character of a token.</summary>
        private struct LookForStartOfTokenState : ITokenizerState
        {
            public (ITokenizerState, Token) Consume(char character, Token token)
            {
                if (character == Separator)
                {
                    // Separator before any content: the token ends empty.
                    return (new AtEndOfTokenState(), token);
                }

                if (character == Quote)
                {
                    // Opening quote: continue with a fresh quoted token.
                    return (new InQuotedTokenState(), Token.CreateQuoted());
                }

                if (char.IsWhiteSpace(character))
                {
                    // Leading whitespace is skipped.
                    return (this, token);
                }

                return (new InTokenState(), token.Append(character));
            }
        }

        /// <summary>Inside an unquoted token; everything up to the separator is appended.</summary>
        private struct InTokenState : ITokenizerState
        {
            public (ITokenizerState, Token) Consume(char character, Token token)
            {
                if (character == Separator)
                {
                    return (new AtEndOfTokenState(), token);
                }

                return (this, token.Append(character));
            }
        }

        /// <summary>Inside a quoted token; everything up to the closing quote is appended.</summary>
        private struct InQuotedTokenState : ITokenizerState
        {
            public (ITokenizerState, Token) Consume(char character, Token token)
            {
                if (character == Quote)
                {
                    return (new LookForSeparatorState(), token);
                }

                return (this, token.Append(character));
            }
        }

        /// <summary>After a closing quote; characters are ignored until a separator arrives.</summary>
        private struct LookForSeparatorState : ITokenizerState
        {
            public (ITokenizerState, Token) Consume(char character, Token token)
            {
                if (character == Separator)
                {
                    return (new AtEndOfTokenState(), token);
                }

                return (this, token);
            }
        }

        /// <summary>Terminal marker state: the token is complete and no more characters may be fed to it.</summary>
        private struct AtEndOfTokenState : ITokenizerState
        {
            public (ITokenizerState, Token) Consume(char character, Token token)
            {
                throw new InvalidOperationException();
            }
        }
    }
}

View File

@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;
// Grants the Games.Common.Test assembly access to this assembly's internal types.
[assembly:InternalsVisibleTo("Games.Common.Test")]