SPIRV-Tools/tools/io.cpp
Shahbaz Youssefi 02433568af
tools: Accept hex representation as binary input (#5870)
Sometimes when debugging or logging, SPIR-V may be dumped as a stream of
hex values.  There are tools to convert such a stream to binary
(such as [1]) but they create an inconvenient extra step when for
example the disassembly of that hex stream is needed.

[1]: https://www.khronos.org/spir/visualizer/hexdump.html

In this change, the binary reader used by the tools is enhanced to
detect when the binary is actually a hex stream, and parse that instead.
The following formats are accepted, detected based on how the SPIR-V
magic number is output:

=== Words

If the first token of the hex stream is one of 0x07230203, 0x7230203,
x07230203, or x7230203, the hex stream is expected to consist of 32-bit
hex words prefixed with 0x or x.  For example:

    0x7230203, 0x10400, 0x180001, 0x79, 0x0

is parsed as:

    0x07230203 0x00010400 0x00180001 0x00000079 0x00000000

Note that `,` is optional in the stream, but the hex values are expected
to be delimited by either `,` or whitespace.

=== Bytes With Prefix

If the first token of the hex stream is one of 0x07, 0x7, x07, x7, 0x03,
0x3, x03, or x3, the hex stream is expected to consist of 8-bit hex
bytes prefixed with 0x or x.  If the first token has a value of 7, the
stream is big-endian.  Otherwise it's little-endian.  For example:

    0x3, 0x2, 0x23, 0x7, 0x0, 0x4, 0x1, 0x0, 0x1, 0x0, 0x18, 0x0, 0x79, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0

is parsed as:

    0x07230203 0x00010400 0x00180001 0x00000079 0x00000000

Similar to "Words", `,` is optional in the stream, but the hex values
are expected to be delimited by either `,` or whitespace.

=== Bytes Without Prefix

If the first two characters of the hex stream are 07 or 03, the hex
stream is expected to consist of 8-bit hex bytes of 2 characters each.
If the first token is 07, the stream is big-endian.  Otherwise it's
little-endian.  Unlike the other modes, delimiter is optional (which
automatically handles 32-bit word streams), but no 0-padding is done.
For example, all of the following:

    03, 02, 23, 07, 00, 04, 01, 00, 01, 00, 18, 00, 79, 00, 00, 00, 00, 00, 00, 00
    03 02 23 07 00 04 01 00 01 00 18 00 79 00 00 00 00 00 00 00
    03022307 00040100 01001800 79000000 00000000
    07,23,02,03,00,01,04,00,00,18,00,01,00,00,00,79,00,00,00,00
    07230203, 00010400, 00180001, 00000079, 00000000

are parsed as:

    0x07230203 0x00010400 0x00180001 0x00000079 0x00000000
2024-11-04 09:57:37 -05:00

477 lines
13 KiB
C++

// Copyright (c) 2024 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "io.h"
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#if defined(SPIRV_WINDOWS)
#include <fcntl.h>
#include <io.h>
#define SET_STDIN_TO_BINARY_MODE() _setmode(_fileno(stdin), O_BINARY);
#define SET_STDIN_TO_TEXT_MODE() _setmode(_fileno(stdin), O_TEXT);
#define SET_STDOUT_TO_BINARY_MODE() _setmode(_fileno(stdout), O_BINARY);
#define SET_STDOUT_TO_TEXT_MODE() _setmode(_fileno(stdout), O_TEXT);
#define SET_STDOUT_MODE(mode) _setmode(_fileno(stdout), mode);
#else
#define SET_STDIN_TO_BINARY_MODE()
#define SET_STDIN_TO_TEXT_MODE()
#define SET_STDOUT_TO_BINARY_MODE() 0
#define SET_STDOUT_TO_TEXT_MODE() 0
#define SET_STDOUT_MODE(mode)
#endif
namespace {
// Reads |file| to exhaustion in fixed-size chunks and appends everything that
// was read to |data|, treating the file as a sequence of elements of type |T|.
// Does nothing when |file| is null.
template <typename T>
void ReadFile(FILE* file, std::vector<T>* data) {
  if (!file) {
    return;
  }

  // Roughly 4 KiB worth of elements per fread call.
  constexpr size_t kChunkElements = 4096 / sizeof(T);
  T chunk[kChunkElements];

  for (;;) {
    const size_t count = fread(chunk, sizeof(T), kChunkElements, file);
    if (count == 0) {
      // Nothing more could be read (end of file or error).
      break;
    }
    data->insert(data->end(), chunk, chunk + count);
  }
}
// Returns true if |file| was opened and read without error, false otherwise.
// A null |file| is reported as a missing file; a stream whose position cannot
// be queried and which has its error indicator set is reported as a read
// error. In both failure cases a message naming |filename| is written to
// standard error.
bool WasFileCorrectlyRead(FILE* file, const char* filename) {
  if (!file) {
    fprintf(stderr, "error: file does not exist '%s'\n", filename);
    return false;
  }

  // ftell() failing on its own is not treated as an error; only the
  // combination with a set error indicator counts as a failed read.
  const bool read_failed = ftell(file) == -1L && ferror(file);
  if (read_failed) {
    fprintf(stderr, "error: error reading file '%s'\n", filename);
    return false;
  }

  return true;
}
// Ensures the file contained an exact number of elements, whose size in bytes
// is given in |alignment|.
//
// |read_size| is the number of bytes that were read from |filename|. Returns
// true when |read_size| is a multiple of |alignment|; otherwise writes an
// error message naming |filename| to standard error and returns false.
// |alignment| must not be 1 (asserted).
bool WasFileSizeAligned(const char* filename, size_t read_size,
                        size_t alignment) {
  assert(alignment != 1);
  if ((read_size % alignment) != 0) {
    // |alignment| is a size_t, so the matching conversion is %zu (not the
    // signed %zd).
    fprintf(stderr,
            "error: file size should be a multiple of %zu; file '%s' corrupt\n",
            alignment, filename);
    return false;
  }
  return true;
}
// Different formats the hex is expected to be in. The mode is selected by
// HexTokenizer from the first token of the stream and decides both how a
// single token is read (prefixed word, prefixed byte, or bare two-digit byte)
// and in which order four byte tokens are combined into a 32-bit word.
enum class HexMode {
// 0x07230203, ...
// Each token is a whole 32-bit word prefixed with 0x or x.
Words,
// 0x07, 0x23, 0x02, 0x03, ...
// Each token is a byte prefixed with 0x or x; the first byte of every group
// of four is the most significant one.
BytesBigEndian,
// 0x03, 0x02, 0x23, 0x07, ...
// Each token is a byte prefixed with 0x or x; the first byte of every group
// of four is the least significant one.
BytesLittleEndian,
// 07 23 02 03 ...
// Each token is exactly two hex digits without prefix; delimiters are
// optional. The first byte of every group of four is the most significant.
StreamBigEndian,
// 03 02 23 07 ...
// Each token is exactly two hex digits without prefix; delimiters are
// optional. The first byte of every group of four is the least significant.
StreamLittleEndian,
};
// Whether a character should be skipped as whitespace / separator /
// end-of-file. Besides the characters classified by isspace(), a comma and
// the NUL character (which the tokenizer uses to signal end of input) count
// as separators.
bool IsSpace(char c) {
  // The <ctype.h> classifiers require a value representable as unsigned char
  // (or EOF). The input may contain arbitrary bytes, so convert explicitly to
  // avoid undefined behavior where plain char is signed.
  return isspace(static_cast<unsigned char>(c)) != 0 || c == ',' || c == '\0';
}
// Decides whether |stream| looks like a textual hex dump rather than a binary
// module. Every recognized hex format starts with either '0' or 'x' (see
// |HexMode|), so only the first character that is not a separator is
// inspected. A stream made up solely of separators (or an empty one) is not
// a hex stream.
bool IsHexStream(const std::vector<char>& stream) {
  size_t pos = 0;
  while (pos < stream.size() && IsSpace(stream[pos])) {
    ++pos;
  }
  if (pos == stream.size()) {
    return false;
  }

  const char first = stream[pos];
  return first == '0' || first == 'x' || first == 'X';
}
// Returns true if the first |len| characters of |token| and |expect| are
// equal when compared case-insensitively. Both buffers must hold at least
// |len| characters; no NUL terminator is looked for. A |len| of 0 always
// matches.
bool MatchIgnoreCase(const char* token, const char* expect, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    // tolower() requires a value representable as unsigned char; |token| may
    // hold arbitrary bytes read from a file, so convert before calling it.
    const int token_char = tolower(static_cast<unsigned char>(token[i]));
    const int expect_char = tolower(static_cast<unsigned char>(expect[i]));
    if (token_char != expect_char) {
      return false;
    }
  }
  return true;
}
// Helper class to tokenize a hex stream.
//
// The constructor inspects the first token of |stream| to select a |HexMode|;
// Parse() then converts the whole stream into 32-bit words appended to |data|.
// |stream| is held by reference and |data| by pointer, so both must outlive
// the tokenizer. Diagnostics are written to standard error and mention
// |filename|.
class HexTokenizer {
 public:
  HexTokenizer(const char* filename, const std::vector<char>& stream,
               std::vector<uint32_t>* data)
      // |filename| is only used in "%s" diagnostics. Callers reading from
      // standard input may pass nullptr, so substitute a printable name
      // instead of handing a null pointer to fprintf.
      : filename_(filename ? filename : "<stdin>"),
        stream_(stream),
        data_(data) {
    DetermineMode();
  }

  // Parses the whole stream, appending one word per iteration to |data_|.
  // Returns false if the format was not recognized or a parse error was
  // encountered; in that case |data_| may contain partial output.
  bool Parse() {
    while (current_ < stream_.size() && !encountered_error_) {
      data_->push_back(GetNextWord());

      // Make sure trailing space does not lead to parse error by skipping it
      // and exiting the loop.
      SkipSpace();
    }

    return !encountered_error_;
  }

 private:
  // Reports |reason| together with the current position. Only the first error
  // is reported; later ones are ignored.
  void ParseError(const char* reason) {
    if (!encountered_error_) {
      fprintf(stderr,
              "error: hex stream parse error at character %zu: %s in '%s'\n",
              current_, reason, filename_);
      encountered_error_ = true;
    }
  }

  // Skip whitespace until the next non-whitespace non-comma character.
  void SkipSpace() {
    while (current_ < stream_.size()) {
      char c = stream_[current_];
      if (!IsSpace(c)) {
        return;
      }

      ++current_;
    }
  }

  // Skip the 0x or x at the beginning of a hex value.
  void Skip0x() {
    // The first character must be 0 or x.
    const char first = Next();
    if (first != '0' && first != 'x' && first != 'X') {
      ParseError("expected 0x or x");
    } else if (first == '0') {
      const char second = Next();
      if (second != 'x' && second != 'X') {
        ParseError("expected 0x");
      }
    }
  }

  // Consume the next character. Returns '\0' once the end of the stream has
  // been reached.
  char Next() { return current_ < stream_.size() ? stream_[current_++] : '\0'; }

  // Determine how to read the hex stream based on the first token. Sets
  // |mode_| on success; otherwise reports the unrecognized pattern and leaves
  // |encountered_error_| set so that Parse() fails immediately.
  void DetermineMode() {
    SkipSpace();

    // Read 11 bytes, that is the size of the biggest token (10) + one more.
    char first_token[11];
    for (uint32_t i = 0; i < 11; ++i) {
      first_token[i] = Next();
    }

    // Table of how to match the first token with a mode. Order matters: the
    // longer word patterns are listed before the byte patterns they start
    // with.
    struct {
      const char* expect;
      bool must_have_delimiter;
      HexMode mode;
    } parse_info[] = {
        {"0x07230203", true, HexMode::Words},
        {"0x7230203", true, HexMode::Words},
        {"x07230203", true, HexMode::Words},
        {"x7230203", true, HexMode::Words},

        {"0x07", true, HexMode::BytesBigEndian},
        {"0x7", true, HexMode::BytesBigEndian},
        {"x07", true, HexMode::BytesBigEndian},
        {"x7", true, HexMode::BytesBigEndian},

        {"0x03", true, HexMode::BytesLittleEndian},
        {"0x3", true, HexMode::BytesLittleEndian},
        {"x03", true, HexMode::BytesLittleEndian},
        {"x3", true, HexMode::BytesLittleEndian},

        {"07", false, HexMode::StreamBigEndian},
        {"03", false, HexMode::StreamLittleEndian},
    };

    // Check to see if any of the possible first tokens are matched. If not,
    // this is not a recognized hex stream.
    encountered_error_ = true;
    for (const auto& info : parse_info) {
      const size_t expect_len = strlen(info.expect);
      const bool matches_expect =
          MatchIgnoreCase(first_token, info.expect, expect_len);
      const bool satisfies_delimeter =
          !info.must_have_delimiter || IsSpace(first_token[expect_len]);
      if (matches_expect && satisfies_delimeter) {
        mode_ = info.mode;
        encountered_error_ = false;
        break;
      }
    }

    if (encountered_error_) {
      // |first_token| is not NUL-terminated; the precision bounds the print.
      fprintf(stderr,
              "error: hex format detected, but pattern '%.11s' is not "
              "recognized '%s'\n",
              first_token, filename_);
    }

    // Reset the position to restart parsing with the determined mode.
    current_ = 0;
  }

  // Consume up to |max_len| characters and put them in |token_chars|. A
  // delimiter is expected. The resulting string is NUL-terminated.
  void NextN(char token_chars[9], size_t max_len) {
    assert(max_len < 9);

    // One extra iteration so that the delimiter following a token of exactly
    // |max_len| characters is consumed as well.
    for (size_t i = 0; i <= max_len; ++i) {
      char c = Next();
      if (IsSpace(c)) {
        token_chars[i] = '\0';
        return;
      }

      token_chars[i] = c;
      // Convert to unsigned char first: the classifier is undefined for
      // negative values and the stream may contain arbitrary bytes.
      if (!isxdigit(static_cast<unsigned char>(c))) {
        ParseError("encountered non-hex character");
      }
    }

    // If space is not reached before the maximum number of characters were
    // consumed, that's an error.
    ParseError("expected delimiter (space or comma)");
    token_chars[max_len] = '\0';
  }

  // Consume one hex digit.
  char NextHexDigit() {
    char c = Next();
    // See NextN() for why the conversion to unsigned char is needed.
    if (!isxdigit(static_cast<unsigned char>(c))) {
      ParseError("encountered non-hex character");
    }
    return c;
  }

  // Extract a token out of the stream. It could be either a word or a byte,
  // based on |mode_|. Returns 0 if a parse error has been encountered.
  uint32_t GetNextToken() {
    SkipSpace();

    // The longest token can be 8 chars (for |HexMode::Words|), add one for
    // '\0'.
    char token_chars[9];

    switch (mode_) {
      case HexMode::Words:
      case HexMode::BytesBigEndian:
      case HexMode::BytesLittleEndian:
        // Start with 0x, followed by up to 8 (for Word) or 2 (for Byte*)
        // digits.
        Skip0x();
        NextN(token_chars, mode_ == HexMode::Words ? 8 : 2);
        break;
      case HexMode::StreamBigEndian:
      case HexMode::StreamLittleEndian:
        // Always expected to see two consecutive hex digits.
        token_chars[0] = NextHexDigit();
        token_chars[1] = NextHexDigit();
        token_chars[2] = '\0';
        break;
    }

    if (encountered_error_) {
      return 0;
    }

    // Parse the hex value that was just read. The unsigned conversion is
    // required: with a 32-bit |long|, strtol() would clamp any word with the
    // top bit set (>= 0x80000000) to LONG_MAX.
    return static_cast<uint32_t>(strtoul(token_chars, nullptr, 16));
  }

  // Construct a word out of tokens: a single token in |HexMode::Words|,
  // otherwise four byte tokens combined according to the mode's endianness.
  uint32_t GetNextWord() {
    if (mode_ == HexMode::Words) {
      return GetNextToken();
    }

    // Elements of a braced initializer list are evaluated in order, so the
    // bytes are consumed from the stream in sequence.
    uint32_t tokens[4] = {
        GetNextToken(),
        GetNextToken(),
        GetNextToken(),
        GetNextToken(),
    };

    switch (mode_) {
      case HexMode::BytesBigEndian:
      case HexMode::StreamBigEndian:
        return tokens[0] << 24 | tokens[1] << 16 | tokens[2] << 8 | tokens[3];
      case HexMode::BytesLittleEndian:
      case HexMode::StreamLittleEndian:
        return tokens[3] << 24 | tokens[2] << 16 | tokens[1] << 8 | tokens[0];
      default:
        assert(false);
        return 0;
    }
  }

  // Name used in diagnostics; never null.
  const char* filename_;
  // The characters being tokenized (not owned).
  const std::vector<char>& stream_;
  // Where parsed words are appended (not owned).
  std::vector<uint32_t>* data_;

  HexMode mode_ = HexMode::Words;
  // Index of the next character of |stream_| to consume.
  size_t current_ = 0;
  bool encountered_error_ = false;
};
} // namespace
// Reads a SPIR-V module from |filename| into |data|, which must be empty on
// entry. When |filename| is nullptr or "-", standard input is read instead.
//
// The input may either be a raw binary, whose size must be a multiple of
// sizeof(uint32_t), or a textual hex dump, which is detected from its first
// non-separator character and parsed by HexTokenizer. Returns true on
// success; on failure an error is written to standard error and false is
// returned.
bool ReadBinaryFile(const char* filename, std::vector<uint32_t>* data) {
  assert(data->empty());

  const bool use_file = filename && strcmp("-", filename);
  // |filename| may be nullptr when reading from standard input; diagnostics
  // print it with "%s", so make sure they always receive a valid string.
  const char* display_name = filename ? filename : "<stdin>";

  FILE* fp = nullptr;
  if (use_file) {
    fp = fopen(filename, "rb");
  } else {
    SET_STDIN_TO_BINARY_MODE();
    fp = stdin;
  }

  // Read into a char vector first.  If this is a hex stream, it needs to be
  // processed as such.
  std::vector<char> data_raw;
  ReadFile(fp, &data_raw);
  bool succeeded = WasFileCorrectlyRead(fp, display_name);
  if (use_file && fp) fclose(fp);

  if (!succeeded) {
    return false;
  }

  if (IsHexStream(data_raw)) {
    // If a hex stream, parse it and fill |data|.
    HexTokenizer tokenizer(display_name, data_raw, data);
    succeeded = tokenizer.Parse();
  } else {
    // If not a hex stream, convert it to uint32_t via memcpy.
    succeeded =
        WasFileSizeAligned(display_name, data_raw.size(), sizeof(uint32_t));
    // Nothing to copy for empty input; skipping also avoids calling memcpy
    // with the possibly-null data() pointers of empty vectors.
    if (succeeded && !data_raw.empty()) {
      data->resize(data_raw.size() / sizeof(uint32_t), 0);
      memcpy(data->data(), data_raw.data(), data_raw.size());
    }
  }

  return succeeded;
}
// Parses the textual hex dump in |stream| and appends the resulting words to
// |data|. Returns whether parsing succeeded. Diagnostics refer to the input
// as "<input string>" since there is no file involved.
bool ConvertHexToBinary(const std::vector<char>& stream,
                        std::vector<uint32_t>* data) {
  return HexTokenizer("<input string>", stream, data).Parse();
}
// Reads the contents of the text file |filename| into |data|, which must be
// empty on entry. When |filename| is nullptr or "-", standard input is read
// instead. Returns true on success; on failure an error is written to
// standard error and false is returned.
bool ReadTextFile(const char* filename, std::vector<char>* data) {
  assert(data->empty());

  const bool use_file = filename && strcmp("-", filename);
  // |filename| may be nullptr when reading from standard input; diagnostics
  // print it with "%s", so make sure they always receive a valid string.
  const char* display_name = filename ? filename : "<stdin>";

  FILE* fp = nullptr;
  if (use_file) {
    fp = fopen(filename, "r");
  } else {
    SET_STDIN_TO_TEXT_MODE();
    fp = stdin;
  }

  ReadFile(fp, data);
  const bool succeeded = WasFileCorrectlyRead(fp, display_name);
  // Only close what was opened here; stdin is left open.
  if (use_file && fp) fclose(fp);
  return succeeded;
}
namespace {
// A class to create and manage a file for outputting data. The handle is
// released by the destructor: a named file is closed, while standard output
// is flushed and has its previous mode restored.
class OutputFile {
 public:
  // Opens |filename| in the given mode.  If |filename| is nullptr or "-",
  // stdout will be set to the given mode (binary if |mode| contains 'b', text
  // otherwise) and used as the output. If the file cannot be opened,
  // GetFileHandle() returns nullptr.
  OutputFile(const char* filename, const char* mode) : old_mode_(0) {
    const bool use_stdout =
        !filename || (filename[0] == '-' && filename[1] == '\0');
    if (use_stdout) {
      if (strchr(mode, 'b')) {
        old_mode_ = SET_STDOUT_TO_BINARY_MODE();
      } else {
        old_mode_ = SET_STDOUT_TO_TEXT_MODE();
      }
      fp_ = stdout;
    } else {
      fp_ = fopen(filename, mode);
    }
  }

  ~OutputFile() {
    if (fp_ == stdout) {
      fflush(stdout);
      SET_STDOUT_MODE(old_mode_);
    } else if (fp_ != nullptr) {
      fclose(fp_);
    }
  }

  // The object owns its file handle; a copy would close the same handle
  // twice, so copying is disallowed.
  OutputFile(const OutputFile&) = delete;
  OutputFile& operator=(const OutputFile&) = delete;

  // Returns a file handle to the file.
  FILE* GetFileHandle() const { return fp_; }

 private:
  FILE* fp_;
  // Mode stdout was in before it was changed by the constructor.
  int old_mode_;
};
}  // namespace
// Writes |count| elements of type |T| starting at |data| to the file named
// |filename|, opened with the given |mode|. If |filename| is nullptr or "-",
// standard output is written to instead. Returns true on success; on failure
// an error is written to standard error and false is returned.
template <typename T>
bool WriteFile(const char* filename, const char* mode, const T* data,
               size_t count) {
  // |filename| may be nullptr when writing to standard output; diagnostics
  // print it with "%s", so make sure they always receive a valid string.
  const char* display_name = filename ? filename : "<stdout>";

  OutputFile file(filename, mode);
  FILE* fp = file.GetFileHandle();
  if (fp == nullptr) {
    fprintf(stderr, "error: could not open file '%s'\n", display_name);
    return false;
  }

  const size_t written = fwrite(data, sizeof(T), count, fp);
  // fwrite() may only have filled the stream's buffer. Flush explicitly so
  // that a failure to actually write the data out (e.g. a full disk) is
  // reported here rather than silently lost when the file is closed.
  if (count != written || fflush(fp) != 0) {
    fprintf(stderr, "error: could not write to file '%s'\n", display_name);
    return false;
  }

  return true;
}
// Explicit instantiations of WriteFile for the two element types it is
// provided for: 32-bit words and characters. The template is defined in this
// translation unit, so these emit the definitions other files link against.
template bool WriteFile<uint32_t>(const char* filename, const char* mode,
const uint32_t* data, size_t count);
template bool WriteFile<char>(const char* filename, const char* mode,
const char* data, size_t count);