Starting soon!

ACCELERATING THE ZIG PARSER with Niles Salter / @Validark

But first.. a recap of part I

& things to know

  • We discussed how computers can get faster
    • identified fundamental problems that computers must face in the physical universe
    • derived optimization strategies that should be useful across (good) hardware, across time
      • reduce cache misses
      • reduce branch misprediction
      • eliminate unnecessary instructions, fewer is usually better
      • improve data-level parallelism
        • SIMD/SWAR - Single instruction, multiple data & SIMD within a register
      • do work in the pass where it is cheapest to do it

The Zig compiler basics

  • Converts code in text files into executable binaries

Andrew Kelley - Practical DOD
https://vimeo.com/649009599

Intro to
tokenization

// For each of the 16 input bytes, computes its column index within the
// current line: counts restart at 0 after every '\n'. Uses pdep/pext
// (defined elsewhere in this file; NOTE(review): assumed to follow x86
// BMI2 semantics — confirm against their definitions).
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {
    // Bit i set where chunk[i] is a newline.
    const newlines: u16 = @bitCast(@as(@Vector(16, u8), chunk) == @as(@Vector(16, u8), @splat('\n')));
    const ones = 0x1111111111111111;
    // Nibble i holds i: packed 4-bit ascending indices 0..15.
    const ascending_indices = 0xFEDCBA9876543210;
    // Deposit the newline bits at nibble granularity, then smear each across
    // its whole nibble, marking the nibbles where a new line begins.
    const restart_nibbles_mask = pdep(newlines, ones ^ 1) *% 0xF;
    // Gather the ascending index of each line-restart position.
    const restart_nibbles_indices = pext(ascending_indices, restart_nibbles_mask);
    // Per-restart deltas so the subtraction below rebases each line's run.
    const prefix_diff = restart_nibbles_indices -% (restart_nibbles_indices << 4);
    // Subtract each position's most recent restart index from the ascending
    // indices, producing per-byte column counts in packed nibbles.
    const vec: @Vector(8, u8) = @bitCast(ascending_indices -% pdep(prefix_diff, restart_nibbles_mask) *% ones);
    // Unpack low/high nibbles back into 16 one-byte counts.
    return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}
export
fn
columnCounts
(
chunk
:
@Vector
{
    .kind = .@"export",
    .start = 0,
    .end = 6,
}
{
    .kind = .@"fn",
    .start = 7,
    .end = 9,
}
{
    .kind = .@"identifier",
    .start = 10,
    .end = 22,
}
{
    .kind = .@"(",
    .start = 22,
    .end = 23,
}
{
    .kind = .@"identifier",
    .start = 23,
    .end = 28,
}
{
    .kind = .@":",
    .start = 28,
    .end = 29,
}
{
    .kind = .@"builtin",
    .start = 30,
    .end = 37,
}

A basic tokenizer

/// Tokenizes `source` into a list of Tokens, scanning byte-by-byte.
/// `source` must be NUL-terminated (the `0` prong is the exit).
/// Caller owns the returned list (deinit with the same allocator).
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
    var cur = source[0..];
    // Heuristic preallocation: assume roughly 8 source bytes per token.
    var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);

    while (true) {
        switch (cur[0]) {
            // identifier / keyword: [A-Za-z_][A-Za-z0-9_]*
            'a'...'z', 'A'...'Z', '_' => {
                const start = cur;

                while (true) {
                    cur = cur[1..];
                    switch (cur[0]) {
                        'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                        else => break,
                    }
                }

                // our token span is start..end
                const end = cur;

                (try tokens.addOne(allocator)).* = Token{
                    .kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
                    .start = start,
                    .end = end,
                };
            },
            '0'...'9' => {
                // parse a number (elided on the slide; NOTE(review): as
                // written this prong never advances `cur`)
            },
            // single-character punctuation; NOTE(review): @bitCast of the
            // byte to the kind assumes Token.kind's layout encodes these
            // characters directly — confirm against the Token declaration.
            '~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
                const start = cur;
                cur = cur[1..];
                const end = cur;
                (try tokens.addOne(allocator)).* = Token{
                    .kind = @bitCast(c),
                    .start = start,
                    .end = end,
                };
            }, // FIX: switch prongs must be separated by commas
            '"' => while (true) { // parse a string
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if (cur[0] == '"' and !is_escaped) break;
            },
            ' ', '\t', '\n', '\r' => cur = cur[1..],
            0 => return tokens,
            // FIX: a switch over u8 must be exhaustive; skip bytes not
            // handled above (advancing avoids an infinite loop).
            else => cur = cur[1..],
        }
    }
}
source: 
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {

A better tokenizer

/// Tokenizes `source`, scanning identifier characters a machine word at a
/// time with SWAR bitstrings instead of byte-by-byte.
/// NOTE(review): the vector load reads up to @sizeOf(V) bytes past `cur` —
/// assumes the buffer carries sentinel padding; confirm with the caller.
pub fn tokenize(allocator: Allocator, source: []const u8) !std.ArrayListUnmanaged(Token) {
    var cur = source[0..];
    // Heuristic preallocation: assume roughly 8 source bytes per token.
    var tokens = try std.ArrayListUnmanaged(Token).initCapacity(allocator, source.len / 8);

    while (true) {
        switch (cur[0]) {
            'a'...'z', 'A'...'Z', '_' => {
                const start = cur;

                while (true) {
                    const V = @Vector(@bitSizeOf(usize), u8);
                    const vec: V = cur[0..@sizeOf(V)].*;

                    // One bit per byte: set where the byte is [A-Za-z0-9_].
                    const identifier_bitstring =
                        (@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
                        (@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));

                    // Jump past the run of identifier characters; if the run
                    // ended inside this word, the token is complete.
                    cur = cur[@ctz(~identifier_bitstring)..];
                    if (~identifier_bitstring != 0) break;
                }

                // our token span is start..end
                const end = cur;

                (try tokens.addOne(allocator)).* = Token{
                    .kind = if (getKeywordKind(start, end)) |kw| kw else .identifier,
                    .start = start,
                    .end = end,
                };
            },
            '0'...'9' => {
                // parse a number (elided on the slide; NOTE(review): as
                // written this prong never advances `cur`)
            },
            '~', ':', ';', '[', ']', '?', '(', ')', '{', '}', ',' => |c| {
                const start = cur;
                cur = cur[1..];
                const end = cur;
                (try tokens.addOne(allocator)).* = Token{
                    .kind = @bitCast(c),
                    .start = start,
                    .end = end,
                };
            }, // FIX: switch prongs must be separated by commas
            '"' => while (true) { // parse a string
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if (cur[0] == '"' and !is_escaped) break;
            },
            ' ', '\t', '\n', '\r' => cur = cur[1..],
            0 => return tokens,
            // FIX: a switch over u8 must be exhaustive; skip bytes not
            // handled above (advancing avoids an infinite loop).
            else => cur = cur[1..],
        }
    }
}
source: 
export fn columnCounts(chunk: @Vector(16, u8)) @Vector(16, u8) {

Bitstring generation Emit

Basic tokenization loop emit

SWAR emulation

// Produces a 64-bit mask with bit i set where src[i] is [A-Za-z0-9_],
// using SWAR (SIMD within a register): eight u64 words are processed with
// byte-parallel carry tricks, each contributing 8 mask bits.
export fn maskForAlphanumericAndUnderscoresBetterSWAR(src: [*]align(8) const u8) u64 {
    const ones: u64 = 0x0101010101010101;
    // 0x7F in every byte: keeps additions below from carrying across byte
    // boundaries, so bit 7 of each byte acts as a per-byte flag.
    const mask = comptime ones * 0x7F;
    var result: u64 = 0;

    inline for (0..8) |i| {
        const v: u64 = @bitCast(src[i*8 ..][0..8].*);
        const low_7_bits = v & mask;

        // '_' bytes xor to zero; adding 0x7F sets bit 7 only for the
        // nonzero (non-underscore) bytes.
        const non_underscore = (low_7_bits ^ comptime ones * '_') + mask;

        // alpha check:
        // Upper 3 bits must be 4 or 6. Lower 5 bits must be in 1...26
        // Clearing bit 5 folds 'a'-'z' onto 'A'-'Z'; xor 0x40 maps letters to 1..26.
        const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
        // Bit 7 set unless the byte folded to 0 (i.e. was '@' 0x40 or '`'
        // 0x60, which would otherwise pass the alpha range test below).
        const non_0x40_or_0x60 = magic_mask + mask;
        // Bit 7 set when the folded value is >= 27, i.e. not a letter.
        const non_alpha = magic_mask + comptime ones * (0x80 - 27);

        // digit check:
        // Upper nibble must be 3. Lower nibble must be in 0...9
        const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
        const non_digit = flipped_0x30 + comptime ones * (0x80 - 0xA);

        // Combine: identifier bytes end with bit 7 set. `~v` rejects bytes
        // whose own high bit was set (non-ASCII); the xor with
        // non_0x40_or_0x60 cancels the '@'/'`' false positives.
        const final = 0x8080808080808080 &
            (~v &
                ((non_digit & non_alpha & non_underscore) ^ non_0x40_or_0x60)
            );

        // Multiply-movemask constant: one bit every 7 positions, so the
        // product gathers the eight bit-7 flags into the top byte.
        // NOTE(review): reading a `comptime var` from runtime code may be
        // rejected by newer Zig compilers — consider freezing into a const.
        comptime var mult: u64 = 0;
        comptime for (0..8) |j|  { mult |= @as(u64, 1) << (7 * j); };

        const movmask = (final *% mult) >> 56;
        result |= movmask << (8 * i);
    }

    return result;
}



&

^

+


*








+

Bitstring generation Emit SWAR

Interleaved Movemask

/// Computes a 64-bit mask with bit i set where src[i] == '_', using four
/// interleaved 16-byte vectors and a shift-merge "interleaved movemask"
/// (a NEON-friendly alternative to x86 PMOVMSKB). Reads exactly 64 bytes.
fn maskForUnderscoresBetterArm(src: [*]const u8) u64 {
    // Gather bytes 0,4,8,... into vec0; 1,5,9,... into vec1; and so on.
    // FIX: @shuffle mask elements must be i32 (the original built a u6
    // vector), and the and/or groupings below were missing a closing
    // parenthesis each.
    const indices = std.simd.iota(i32, 16) * @as(@Vector(16, i32), @splat(4));

    var vec0 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(0)));
    var vec1 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(1)));
    var vec2 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(2)));
    var vec3 = @shuffle(u8, src[0..64].*, undefined, indices + @as(@Vector(16, i32), @splat(3)));

    const false_vec = std.mem.zeroes(@Vector(16, u8));
    const true_vec = ~false_vec;

    // Turn each comparison into 0x00 / 0xFF bytes.
    vec0 = @select(u8, vec0 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec1 = @select(u8, vec1 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec2 = @select(u8, vec2 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);
    vec3 = @select(u8, vec3 == @as(@Vector(16, u8), @splat('_')), true_vec, false_vec);

    // Funnel the four match vectors together one bit, then two bits, then
    // four bits at a time, so each output byte accumulates 8 match bits.
    const t0 = (vec1 & @splat(0b10000000)) | (vec0 >> @splat(1));
    const t1 = (vec3 & @splat(0b10000000)) | (vec2 >> @splat(1));
    const t2 = (t1 & @splat(0b11000000)) | (t0 >> @splat(2));
    const t3 = (t2 & @splat(0b11110000)) | (t2 >> @splat(4));

    // Each u16 lane now holds its 8 useful bits shifted up by 4; shift down
    // and keep the low byte of every lane.
    var t4: @Vector(8, u16) = @bitCast(t3);
    t4 >>= @splat(4);
    const t5: @Vector(16, u8) = @bitCast(t4);
    return @bitCast(@shuffle(u8, t5, undefined, [_]i32{ 0, 2, 4, 6, 8, 10, 12, 14 }));
}
src
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{0,4, 8,12,16,20,24,28,32,36,40,44,48,52,56,60}
{1,5, 9,13,17,21,25,29,33,37,41,45,49,53,57,61}
{2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62}
{3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63}

Interleaved Movemask Emit

My tokenizer design

  • Observation 1: We can reuse bitstrings!
    • Instead of recomputing the bitstrings every time we look at the beginning of a token, let's just compute the bitstrings once for every
      chunk of 64 characters.
    • This also fits nicely with the fact that we have to do utf8-validation for each chunk.
    • Unfortunately, this means we are going to speculatively produce bitstrings whether we need them or not, for every single chunk!
  • Bitstrings produced speculatively:
    • whitespace: '\n', '\t', ' '
    • identifier/number matcher: [A-Za-z0-9_]
    • control characters: should just be '\n' (used for jumping to the end of comments or single-line strings)
  • Bitstrings produced on-demand:
    • operator-continuation characters: [!%*+./<=>?\\|]
    • unescaped string / char_literal characters: ['"]
      • just too rare in practice to be a meaningful optimization for Zig

Idealized tokenizer code

/// Tokenizes an aligned, sentinel-terminated source buffer into a Token
/// slice. Classification bitstrings for each uword-sized chunk are computed
/// once and reused as the scanner skips runs of token characters with ctz.
/// NOTE(review): the returned slice is over-allocated; only a prefix is
/// populated — confirm how callers learn the actual token count.
pub fn tokenize(gpa: Allocator, source: [:0]align(@bitSizeOf(uword)) const u8) ![]Token {
    const end_ptr = &source.ptr[source.len];
    // Round the scannable region up to whole uword-sized chunks; assumes the
    // buffer carries EXTENDED_BACK_SENTINELS_LEN bytes of padding.
    const extended_source_len = std.mem.alignForward(usize, source.len + EXTENDED_BACK_SENTINELS_LEN, @bitSizeOf(uword));

    const extended_source = source.ptr[0..extended_source_len];
    const tokens = try gpa.alloc(Token, extended_source_len);
    errdefer gpa.free(tokens);

    var cur_token = tokens;
    var prev: []const u8 = extended_source;
    var cur = prev[0..];

    // Per-chunk classification bitstrings, selected by the low two bits of
    // a Tag value. NOTE(review): a u2 index ranges over 0..3 but only 3
    // bitstrings exist — confirm Tag's layout keeps the index < 3.
    var bitstrings: [3]uword = undefined;
    var op_type: Tag = .whitespace;
    // FIX: the original assigned the array element itself to a pointer;
    // take its address instead.
    var selected_bitmap: *uword = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
    var bitmap_index = @intFromPtr(source.ptr);
    var utf8_checker: Utf8Checker = .{};

    // Start one chunk "behind" so the first iteration refills immediately.
    bitmap_index -%= @bitSizeOf(uword);

    while (true) : (cur = cur[1..]) {
        var aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
        while (true) {
            // Refill bitstrings for every chunk between the last one
            // processed and the chunk `cur` now points into.
            while (bitmap_index != aligned_ptr) {
                bitmap_index +%= @bitSizeOf(uword);
                const V = @Vector(@bitSizeOf(uword), u8);
                const vec: V = @as([*]align(@bitSizeOf(uword)) const u8, @ptrFromInt(bitmap_index))[0..@bitSizeOf(uword)].*;

                try utf8_checker.validateChunk(vec);

                // [0]: non-control bytes (not DEL, and either >= ' ' or a
                // tab) — used to jump to the end of comments/line strings.
                bitstrings[0] = @as(usize, @bitCast(vec != @as(V, @splat('\x7F')))) &
                    (@as(usize, @bitCast(vec >= @as(V, @splat(' ')))) | @as(usize, @bitCast(vec == @as(V, @splat('\t')))));

                // [1]: whitespace. (FIX: stray `uword` token removed.)
                bitstrings[1] =
                    @as(usize, @bitCast(vec == @as(V, @splat('\t')))) |
                    @as(usize, @bitCast(vec == @as(V, @splat('\n')))) |
                    @as(usize, @bitCast(vec == @as(V, @splat(' '))));

                // [2]: identifier/number characters [A-Za-z0-9_].
                // (FIX: stray `: uword` removed.)
                bitstrings[2] = (@as(usize, @bitCast(vec == @as(V, @splat('_'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('a')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('z'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('A')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('Z'))))) |
                    (@as(usize, @bitCast(@as(V, @splat('0')) <= vec)) & @as(usize, @bitCast(vec <= @as(V, @splat('9')))));
            }

            // Skip the run of current-token characters inside this chunk.
            const cur_misalignment: std.math.Log2Int(uword) = @truncate(@intFromPtr(cur.ptr));
            const bitstring = selected_bitmap.* >> cur_misalignment;
            cur = cur[@ctz(~bitstring)..];

            // If we made it to the end of chunk(s), wrap around, grab more bits, and start again
            aligned_ptr = @intFromPtr(cur.ptr) / @bitSizeOf(uword) * @bitSizeOf(uword);
            if (bitmap_index == aligned_ptr) break;
        }

        if (op_type == .eof) break;

        while (true) {
            var len: u32 = @intCast(@intFromPtr(cur.ptr) - @intFromPtr(prev.ptr));

            switch (prev[0]) {
                // Keywords: a single trailing space is folded into the token.
                'a'...'z' => if (Keywords.lookup(prev.ptr, len)) |op_kind| {
                    op_type = op_kind;
                    const is_space = @intFromBool(cur[0] == ' ');
                    cur = cur[is_space..];
                    len += is_space;
                },
                else => {},
            }

            cur_token[0] = .{ .len = len, .kind = op_type };
            cur_token = cur_token[1..];
            prev = cur;

            if (cur[0] == '@') {
                cur = cur[1..];
                op_type = switch (cur[0]) {
                    'a'...'z', 'A'...'Z' => .builtin,
                    '"' => .string_identifier,
                    else => return error.MissingQuoteOrLetterAfterAtSymbol,
                };

                // FIX: address-of (see selected_bitmap above).
                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
            } else if (Operators.isSingleCharOp(cur[0])) {
                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(Tag.whitespace)))];
                op_type = Operators.hashOp(Operators.getOpWord(cur.ptr, 1));
            } else {
                op_type = switch (cur[0]) {
                    'a'...'z', 'A'...'Z', '_' => .identifier,
                    '0'...'9' => .number,
                    '\'' => .char_literal,
                    '"' => .string,
                    ' ', '\t', '\n' => .whitespace,
                    0 => .eof,
                    else => .unknown,
                };

                selected_bitmap = &bitstrings[@as(u2, @truncate(@intFromEnum(op_type)))];
            }

            if (cur[0] != '\'' and cur[0] != '"') break;

            // FIX: remember which quote character opened the literal (the
            // original referenced an undeclared `chr`).
            const chr = cur[0];
            // Scan to the matching unescaped quote; any control byte (< ' ',
            // e.g. a raw newline or NUL) also terminates the scan.
            while (true) {
                cur = cur[1..];
                const is_escaped = cur[0] == '\\';
                cur = cur[@intFromBool(is_escaped)..];
                if ((cur[0] == chr and !is_escaped) or cur[0] < ' ') break;
            }
        }
    }

    // A NUL before the true end of the buffer means an embedded 0 byte.
    if (@intFromPtr(cur.ptr) < @intFromPtr(end_ptr)) return error.Found0ByteInFile;
    cur_token[0] = .{ .len = 1, .kind = .eof };
    return tokens;
}

How do we emulate
count-trailing-zeroes?

How do we emulate
count-trailing-zeroes?

/// Counts trailing zero bits of `a` (returns 64 for a == 0) without a
/// dedicated instruction, using the classic de Bruijn trick: isolate the
/// lowest set bit, multiply by a de Bruijn sequence so the top 6 bits of
/// the product form a unique index, and look the position up in a table.
fn ctz(a: u64) u8 {
    if (a == 0) return 64;
    const x = a & (~a +% 1); // isolate lowest set bit: a & -a
    const deBruijn: u64 = 0b0000001000011000101000111001001011001101001111010101110110111111;
    // Build the 64-entry position table at compile time. (FIX: the original
    // read a `comptime var` from runtime code, and shifted an untruncated
    // comptime_int, which breaks the 64-bit wraparound the trick relies on;
    // freeze the table into a const and shift as u64 instead.)
    const lookup_table = comptime blk: {
        var table: [64]u8 = undefined;
        for (0..64) |i| table[(deBruijn << @intCast(i)) >> 58] = i;
        break :blk table;
    };
    return lookup_table[(deBruijn *% x) >> 58];
}

llvm/llvm-project#84763

How do we emulate
count-trailing-ones?

How do we emulate
count-trailing-ones?

/// Counts trailing one bits of `a` (returns 64 when all bits are set) by
/// applying the de Bruijn count-trailing-zeros trick to ~a.
fn cto(a: u64) u8 {
    // FIX: `a == -1` does not compile for an unsigned operand; test for
    // the all-ones value via complement instead.
    if (~a == 0) return 64;
    const not_a = ~a;
    const x = not_a & (~not_a +% 1); // isolate lowest cleared bit of a: ~a & -~a
    const deBruijn: u64 = 0b0000001000011000101000111001001011001101001111010101110110111111;
    // Build the 64-entry position table at compile time. (FIX: same
    // `comptime var`/untruncated-shift repairs as in ctz.)
    const lookup_table = comptime blk: {
        var table: [64]u8 = undefined;
        for (0..64) |i| table[(deBruijn << @intCast(i)) >> 58] = i;
        break :blk table;
    };
    return lookup_table[(deBruijn *% x) >> 58];
}

How do we emulate
count-trailing-ones?

define dso_local noundef i64 @_Z3ct1m(i64 noundef %0) local_unnamed_addr #0 {
    %2 = xor i64 %0, -1
    %3 = tail call noundef i64 @llvm.cttz.i64(i64 %2, i1 false), !range !9
    ret i64 %3
}

llvm/llvm-project#82487

Count-trailing-ones on risc-v

Count-trailing-ones on sparc

Count-trailing-ones cross-platform

Optimization: Use Vector shuffles to
match whitespace

Mask Whitespace with shuffle emit

Keyword Lookups

/// Checks whether the `len`-byte string at `kw` is a Zig keyword and, if
/// so, returns its Tag; otherwise returns null. Hashes the candidate to a
/// table slot, then compares against the padded keyword stored there with
/// a single 16-byte vector compare. Caller must guarantee 16 readable
/// bytes at `kw`.
pub fn lookup(kw: [*]const u8, len: u32) ?Tag {
    assert(len != 0);

    const raw_hash = hashKw(kw, len);
    const hash = mapToIndex(raw_hash);

    // Table padding bytes store the keyword's length, which is < '0' and
    // therefore can never collide with identifier characters.
    comptime assert(max_kw_len < '0' and '0' > 16 - 1);

    const KW_VEC = @Vector(16, u8);
    const vec1: KW_VEC = sorted_padded_kws[hash];
    const vec2: KW_VEC = kw[0..16].*; // the safety of this operation is validated at the top of this function
    const len_vec: KW_VEC = @splat(@as(u8, @truncate(len)));
    // Lanes at positions >= len are replaced by the length byte, mirroring
    // the table's padding scheme so true matches agree in all 16 lanes.
    const cd = @select(u8, len_vec > std.simd.iota(u8, 16), vec2, len_vec);

    if (std.simd.countTrues(cd != vec1) == 0) {
        // NOTE(review): mapping the table index to a Tag via `~hash`
        // presumes the keyword Tag values are laid out to match — confirm
        // against the Tag declaration.
        return @enumFromInt(~hash);
    } else {
        return null;
    }
}

/// Hashes a keyword candidate into 7 bits by mixing its first two bytes,
/// its last two bytes, and its length.
/// NOTE(review): `kw - 2 + len` reads the final two bytes, so this assumes
/// len >= 2 (the shortest Zig keyword is 2 characters) — confirm callers
/// never pass len == 1.
pub fn hashKw(kw: [*]const u8, len: u32) u7 {
    const a = std.mem.readInt(u16, kw[0..2], .little);
    const b = std.mem.readInt(u16, (kw - 2 + @as(usize, @intCast(len)))[0..2], .little);
    return @truncate(((a ^ (len << 14)) *% b) >> 8);
}

/// Maps a raw 7-bit hash to a dense index into `sorted_padded_kws` by
/// counting how many occupied hash slots (set bits in `masks`) precede it.
fn mapToIndex(raw_hash: u7) u8 {
    // Bits strictly below (raw_hash mod 64) within one mask word; the
    // truncating shift wraps hashes 64..127 into the second word's range.
    const mask = (@as(u64, 1) << @truncate(raw_hash)) -% 1;
    // Every slot in masks[0] precedes any slot counted in masks[1].
    return (if (raw_hash >= 64) @popCount(masks[0]) else 0) + @popCount(mask & masks[raw_hash / 64]);
}

// Bitmap of which raw 7-bit keyword-hash values are occupied: masks[0]
// covers hashes 0-63, masks[1] covers 64-127. mapToIndex uses popcounts
// over these to compress the sparse hash space into dense table indices.
const masks = [2]u64{
    0b0000001000000110001011010011101000001000010010001011011000100100,
    0b0001011010100000000001100000111101100010110011111001100110011100,
};

// Keyword table indexed by mapToIndex(hashKw(...)). Each entry is exactly
// 16 bytes: the keyword followed by padding bytes equal to the keyword's
// length, mirroring the padded candidate that `lookup` builds with @select.
// The final all-255 entry is a sentinel that can never match.
const sorted_padded_kws = [50][16]u8{
    ("extern"         ++ "\x06" ** 10).*,
    ("while"          ++ "\x05" ** 11).*,
    ("align"          ++ "\x05" ** 11).*,
    ("else"           ++ "\x04" ** 12).*,
    ("or"             ++ "\x02" ** 14).*,
    ("break"          ++ "\x05" ** 11).*,
    ("anytype"        ++ "\x07" **  9).*,
    ("volatile"       ++ "\x08" **  8).*,
    ("async"          ++ "\x05" ** 11).*,
    ("errdefer"       ++ "\x08" **  8).*,
    ("catch"          ++ "\x05" ** 11).*,
    ("test"           ++ "\x04" ** 12).*,
    ("struct"         ++ "\x06" ** 10).*,
    ("const"          ++ "\x05" ** 11).*,
    ("return"         ++ "\x06" ** 10).*,
    ("await"          ++ "\x05" ** 11).*,
    ("resume"         ++ "\x06" ** 10).*,
    ("opaque"         ++ "\x06" ** 10).*,
    ("packed"         ++ "\x06" ** 10).*,
    ("orelse"         ++ "\x06" ** 10).*,
    ("var"            ++ "\x03" ** 13).*,
    ("unreachable"    ++ "\x0b" **  5).*,
    ("threadlocal"    ++ "\x0b" **  5).*,
    ("anyframe"       ++ "\x08" **  8).*,
    ("noinline"       ++ "\x08" **  8).*,
    ("defer"          ++ "\x05" ** 11).*,
    ("try"            ++ "\x03" ** 13).*,
    ("pub"            ++ "\x03" ** 13).*,
    ("usingnamespace" ++ "\x0e" **  2).*,
    ("fn"             ++ "\x02" ** 14).*,
    ("and"            ++ "\x03" ** 13).*,
    ("allowzero"      ++ "\x09" **  7).*,
    ("error"          ++ "\x05" ** 11).*,
    ("addrspace"      ++ "\x09" **  7).*,
    ("if"             ++ "\x02" ** 14).*,
    ("nosuspend"      ++ "\x09" **  7).*,
    ("linksection"    ++ "\x0b" **  5).*,
    ("inline"         ++ "\x06" ** 10).*,
    ("export"         ++ "\x06" ** 10).*,
    ("asm"            ++ "\x03" ** 13).*,
    ("noalias"        ++ "\x07" **  9).*,
    ("suspend"        ++ "\x07" **  9).*,
    ("switch"         ++ "\x06" ** 10).*,
    ("union"          ++ "\x05" ** 11).*,
    ("enum"           ++ "\x04" ** 12).*,
    ("continue"       ++ "\x08" **  8).*,
    ("for"            ++ "\x03" ** 13).*,
    ("callconv"       ++ "\x08" **  8).*,
    ("comptime"       ++ "\x08" **  8).*,
    .{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
};
kw = "export", len = 6
hash = 38
raw_hash = 96
raw_hash = 96 & 63 => 32
const mask = (1 << 32) -% 1

        

        

        

        
add that 17 to @popCount(masks[0]), known at compile-time to be 21

Lookup emit

RISC-V SWAR emit

Passive production of
newline bitstrings

    • I noticed the following code in Zig's AstGen.zig:
/// Advances the AstGen source cursor from its current offset to `end`,
/// updating the tracked line and column as newlines are crossed.
fn advanceSourceCursor(astgen: *AstGen, end: usize) void {
    const source = astgen.tree.source;
    var offset = astgen.source_offset;
    var line_no = astgen.source_line;
    var col_no = astgen.source_column;
    assert(offset <= end);
    while (offset < end) : (offset += 1) {
        if (source[offset] != '\n') {
            col_no += 1;
        } else {
            // Newline: next line begins at column zero.
            line_no += 1;
            col_no = 0;
        }
    }
    astgen.source_offset = offset;
    astgen.source_line = line_no;
    astgen.source_column = col_no;
}

Want to learn more
about the tokenizer?

    • Our SIMD on-demand unescaped-quote matcher was designed by the creators of simdjson.
    • The utf8-validator was ported from simdjson / simdutf by travisstaloch and myself.
      • I wrote equivalent SWAR routines so machines without proper vectors can exploit data-level parallelism too!

Daniel Lemire - Parsing JSON Really Quickly: Lessons Learned
https://www.youtube.com/watch?v=wlvKAT7SZIQ

Zig Compiler Internals II

Andrew Kelley - Practical DOD
https://vimeo.com/649009599

I. Operator Precedence Encoder

Part II: The Parser

  • Converts tokens into a tree-like representation called the Abstract Syntax Tree
    • This is the stage where we validate that the source code fits the "language grammar"
    • We also figure out the order in which operations are supposed to be performed.
      • We need to deal with every operator in the language, including `try`, `catch`, the comparison operators, `.*`, `&`, etc.
      • We also disambiguate between the multiple ways an operator can be interpreted.
        • For instance, `-` can be a binary subtraction or a unary negative based on context.

AST Demo

Let’s Build A Simple Interpreter. Part 7: Abstract Syntax Trees
https://ruslanspivak.com/lsbasi-part7/

Pre-order vs
in-order vs
post-order in code

// Illustrative pseudocode: marks where pre-order, in-order, and post-order
// processing slot into a recursive walk of a binary tree (left subtree,
// then right subtree). NOTE(review): not compilable Zig as written — the
// optional pointer is never unwrapped and the return type is elided.
fn parse(node: ?*Node) {
    if (node == null) return;

    // Operate on the Nodes in pre-order
    parse(node.left);
    // Operate on the Nodes in in-order
    parse(node.right);
    // Operate on the Nodes in post-order
}

Zig's current Parser

Look it up yourself :)

Chandler Carruth's strategy

Shunting yard

More memory efficient strategies

  • We make the Tokenizer emit 2 bytes per token. 1 byte for a kind enum, 1 byte for a length
    • If the length is >255, we store a 0 as the length, and let the next 4 bytes be the true length.
    • This makes the worst case memory consumption scenario a lot better. Now our 1 byte tokens will be mapped to 2 bytes, and our 256+ byte tokens will be mapped to 6 bytes.
    • Drawbacks:
      • All bytes in our source file have to map to a token, even whitespace and comments.
        • Luckily, for most of our tokens, the length is irrelevant to later passes. We only care about the length of identifiers, strings, numbers, and data that we actually need to look at later.
          A `return` is a `return`, and we do not need any further information beyond that.
      • We have to visit all tokens so that we can properly add their length to our cursor.
        • We don't really want to skip over anyone's hard-fought, artisan-crafted code anyway
    • Benefits:
      • We can dramatically reduce memory usage!
      • We can dramatically reduce memory usage!
      • Did I mention we can dramatically reduce memory usage, like by a lot?!

Succinct vs implicit

    • We want to store our trees without pointers to avoid O(n log n) bits being used.
    • There are two classes of data structures which allow us to escape that regime, succinct and implicit.

    Simon Gog - Blog
    simongog.github.io

Choosing implicit

    • Disambiguation makes it so all tokens have a known arity, and are known to be a prefix/infix/suffix operator
    • Therefore, an implicit representation can be modeled as a simple re-ordering of the tokens that gives us sufficient information to check syntactic correctness and get the correct order of operations out.

Analysis of direct postfix consumption

  • Inherently, using an algorithm that rearranges data solely by pushing to and popping from a stack, we can only move data to occur later than it did originally.
    • Prefix operators can be moved to in-order, Infix operators can be moved to post-order, and Postfix operators are going to stay in post-order.
    • In order to use the `length` trick I want, we cannot use a Post-ordering of the AST/Parse Tree, because we have to know the pre-order, in-order, and post-order to pull it off.

My Design

  • We model all tokens as either an operator or an operand, and shove it into a specialized version of the Shunting Yard algorithm.
  • Sometimes, there are some oddities in well-formed Zig code.
    • E.g. `break: blk thing` has two identifiers next to each other, so the whitespace in between them will get its own token.
  • Example transformation:
    • -(-b * -c - -d.*.?) / (e + f) * g + h - -i.*.? * j * (k * (l - m) + n / o);
    • ; - + * / unary - ( - * unary - b unary - c unary - .? .* d ) ( + e f ) g h * * unary - .? .* i j ( + * k ( - l m ) / n o )
  • Everyone pays the cost of producing in post-order and then iterating over it in pre-order, we want to reduce that cost.
  • Observation:
    • Operands never change order with respect to each other. Operators are the ones we are shuffling around.
    • We can omit operands from the algorithm and decrease the amount of data our algorithm has to deal with by roughly a factor of 2.

The OpString

// The OpString: a buffer of pending operator tokens plus the bitstring
// describing how they interleave during a merge (slide fragment — the
// surrounding declaration is elided).
struct {
    // Pending operator tokens; only the first operators_len are valid.
    operators: [BUF_SIZE]Token,
    // Interleave pattern for merging. NOTE(review): presumably each bit
    // selects which side's next operator is emitted — confirm with the
    // merge routine.
    merge_bitstring: u64,
    // NOTE(review): likely a link/index to the next OpString — confirm.
    next: u32,
    // Number of valid entries in `operators`.
    operators_len: u8,
    // Number of valid bits in `merge_bitstring`.
    bitstr_len: u8,
}
// Example:
// .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
// .merge_bitstring = 1010101010;

Merging OpStrings

  • Given a bitstring and two buffers, how do we combine them efficiently?
  • struct {
        operators: [BUF_SIZE]Token,
        merge_bitstring: u64,
        next: u32,
        operators_len: u8,
        bitstr_len: u8,
    }
    // Example:
    // .operators = .{ +, *, -, /, +, undefined, undefined, undefined }
    // .merge_bitstring = 1010101010;
    // .operands = .{  a, b, c, d, e, f, g, h, i ... }
    
    // Output: a + b * c - d / e + f ...
    

    There's an instruction for that!

    Can we emulate VPEXPANDW?

    • Use vector shuffle/permute/swizzle instructions!

    Expansion & Compaction

    Where do we have vector
    shuffle instructions?

    • x86-64:
      • SSSE3 (2006+) => PSHUFB (16 byte-lookups) (Steam hardware survey found 99.73% support)
      • XOP (AMD 2011-2017, Intel never) => VPPERM (effectively 32 byte-lookups due to 2 source
        vectors)
      • AVX2 (2011+) => VPSHUFB (16 byte-lookups, optionally on both halves of a 32-byte vector
        simultaneously but cross-lane shuffling is disallowed)
      • AVX-512 (Intel 2016+/2018+, AMD Zen 4 2022+) => VPSHUF/VPERM (all your wildest dreams)
        • for (V)PSHUF{B,W,D,Q}, if bit 7 is set, the corresponding output will be 0, otherwise only the lower 4 bits are used for the lookup
    • arm64 / aarch64:
      • NEON => TBL & TBX (at least 16 byte-lookups, but can take up to 4 source vectors, so effectively we can
        do 16 lookups in a 64 byte table)
        • if out of range, the corresponding output will be zeroed (TBL) or the original value preserved (TBX)
        • arm32 has the same instructions, but the vector sizes are 8 bytes instead of 16.
    • PowerPC:
      • VSX => XXPERMX
      • AltiVec => VPERM (32 byte-lookups)
    • RISC-V:
      • V => VRGATHER (at least 16 byte-lookups)
        • My single-board RISC-V machine, the Star 64 (Sifive U74 CPU) does not have vectors,
          and therefore no vector shuffles.
    • MIPS:
      • MSA => VSHF (16 byte-lookups)
    • SPARC:
      • VIS 2.0 => BSHUFFLE (16 byte-lookups), but you can only keep the first 8 indices
    • WASM:
      • SWIZZLE (16 byte-lookups)

    How do we produce the
    shuffle vector?

    • Generic:
      • Could use a lookup table
        • Problem: lookup tables for expand/compress instructions are often prohibitively large
        • For 32 byte shuffles, mapping 16 bits => 32 bytes takes 2MiB
          • Can be worth it in tight loops, but adds bloat to the stack and the executable file
        • For 16 byte shuffles, mapping 8 bits => 16 bytes takes 4KiB
          • Could reduce @Vector(16, u8) => @Vector(16, u4), cutting size to 2KiB
            at the cost of extra instructions to decode it
    • PowerPC:
      • Power10 => XXGENPCVHM
        • Generate-Permute-Control-Vector-Halfword-according-to-Mask can produce
          a vector for expansion or compaction
    • RISC-V:
      • V => VIOTA
        • Predicated prefix-sum instruction
        • This strategy is applicable across hardware!
    /// Builds a 16-nibble shuffle-control vector from the 8-bit mask `x`,
    /// for emulating expand/compress (VPEXPANDW-style) via a byte shuffle:
    /// lanes whose mask bit is set receive ascending indices within the
    /// "set" group, the rest within the "cleared" group.
    /// NOTE(review): returns @Vector(16, u4) against a declared
    /// @Vector(16, u8) — confirm this elementwise widening coercion on the
    /// targeted compiler version.
    fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
        // Copy the byte x into all 8 byte lanes of a u64.
        const splatted = @as(u64, x) * 0x0101010101010101;
        // Byte lane i holds 1 << i, so lane i can test bit i of x.
        const bit_positions: u64 = @bitCast(@as(@Vector(8, u8), @splat(1)) << std.simd.iota(u3, 8));
        // Make lane i's MSB equal bit i of x...
        const splatted_msbs = (splatted & bit_positions) + (0x8080808080808080 - bit_positions);
        // ...then shift it down so lane i holds 0 or 1.
        const splatted_lsbs = (splatted_msbs >> 7) & 0x0101010101010101;
        // Nibble-wise exclusive prefix sums: how many set bits precede each lane.
        const prefix_sums1 = ((splatted_lsbs << 4) | splatted_lsbs) *% 0x1111111111111110;
        const splatted_lsbs_inv = splatted_lsbs ^ 0x0101010101010101;
        // Same prefix sums, counting cleared bits instead.
        const prefix_sums2 = ((splatted_lsbs_inv << 4) | splatted_lsbs_inv) *% 0x1111111111111110;
        // 0xFF in set lanes, 0x00 in cleared lanes.
        const selector = splatted_lsbs * 255;
        // Per lane: choose prefix_sums1 where the mask bit is set, else prefix_sums2.
        const interleaved_shuffle_vector = (selector & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2;
        return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
    }
    [0]00000000000000000000000000000000000000000000000000000000x = 01010111
    00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    1....... .1...... ..1..... ...1.... ....1... .....1.. ......1. .......1
    &
    
    
    
    0....... 01...... 011..... 0111.... 01111... 011111.. 0111111. 01111111
    
    
    
    +
    
    
    
    
    |
    
    
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    splatted_lsbs = 
         selector = 
     prefix_sums1 = 
     prefix_sums2 = 
     (       1 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
     (       0 & (prefix_sums1 ^ prefix_sums2)) ^ prefix_sums2
    ≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈≈
    00000001 00000001 00000001 00000001 00000001 00000001 00000001 00000001
    ^
    
    
    
    
    |
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    
    
    
    *
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    +
    
    
    
    
    
    
    
    

    More VPEXPANDW emulation?

    • x86-64:
      • AVX-512F (first mainstream support was Intel Cannon Lake 2018+) => VPEXPAND{D, Q}
        • Intel machines before Ice Lake (2019) lack AVX512_VBMI2, which introduced VPEXPAND{B, W}
        • @Vector(N, u16) => @Vector(N, u32), do our operation, then @Vector(N, u32) => @Vector(N, u16)
    • BESM Soviet mainframes:
      • BESM-6 (Designed in 1965. Produced 1968-1987) => AUX & APX
    • x86-64:
      • BMI2 (Intel Haswell 2013+, AMD Excavator 2015+, Fast on AMD since Zen 3 2020+) => PDEP & PEXT
    • arm64 / aarch64:
    • PowerPC:
    • RISC-V:
      • Zbe => BDEP & BEXT, but did not make it into ratified B instruction set
    /// Same shuffle-control construction as the SWAR version, but using
    /// pdep (defined elsewhere in this file; NOTE(review): assumed to
    /// follow x86 BMI2 bit-deposit semantics) to scatter bits directly.
    fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
        // Scatter the 8 bits of x to the LSB of each byte lane.
        const splatted = pdep(x, 0x0101010101010101);
        // 0xFF in set lanes, 0x00 in cleared lanes.
        const vec = splatted * 255;
        // The 16 nibble values 0..15 in ascending order.
        const nibble_indices: u64 = @bitCast(std.simd.iota(u4, 16));
        // Deposit ascending index bits into the set lanes, and independently
        // into the cleared lanes, then combine.
        const interleaved_shuffle_vector = pdep(nibble_indices, vec) | pdep(nibble_indices, ~vec);
        return @as(@Vector(16, u4), @bitCast(interleaved_shuffle_vector));
    }
    00000000000000000000000000000000000000000000000000000000000x = 01010111
    01010111
    
    
    
    
    *
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    uh-we-encountered-a-problem
    +
    uh-we-encountered-a-problem
    11111110 11011100 10111010 ........ 10011000 ........ 01110110 ........ 01010100 00110010 00010000
    ---F---E ---D---C ---B---A ---9---8 ---7---6 ---5---4 ---3---2 ---1---0
    11111110 11011100 10111010 10011000 01110110 01010100 ........ 00110010 ........ 00010000 ........ ........ ........
    |
    01010100 10011000 00110010 01110110 00010000 01010100 00110010 00010000
    ---5---4 ---9---8 ---3---2 ---7---6 ---1---0 ---5---4 ---3---2 ---1---0

    Conclusion

      • We discussed ways to optimize modern hardware across ISA's, and what compilers could do in the future to help.
      • We decreased the memory usage and thereby increased the density of the data with regards to the second pass of the compiler.
      • We made it feasible to combine the next two passes of the compiler, and have it do all the validation we need.
        • (aside from ensuring that the transformation of the second pass is actually correct)
      • We hopefully got somebody interested in this stuff!

    Fin