feat: reduce memory consumption with long line

`source_location` stores the whole line. Consider a short region in a long
line, like:
```
array = [1, 2, 3, ... , 100, 101, ..., 10000]
                        ^^^- the region
```
It saves the whole line as a `std::string`. This consumes a lot of memory
and slows everything down. We can omit most of the line
because we only need the region, `100` here.
This commit is contained in:
ToruNiina 2024-10-21 03:03:13 +09:00
parent 42a2628924
commit 869fdbdf8f
4 changed files with 135 additions and 27 deletions

View File

@ -82,11 +82,16 @@ class region
const_iterator cend() const noexcept; const_iterator cend() const noexcept;
std::string as_string() const; std::string as_string() const;
std::vector<std::string> as_lines() const; std::vector<std::pair<std::string, std::size_t>> as_lines() const;
source_ptr const& source() const noexcept {return this->source_;} source_ptr const& source() const noexcept {return this->source_;}
std::string const& source_name() const noexcept {return this->source_name_;} std::string const& source_name() const noexcept {return this->source_name_;}
private:
std::pair<std::string, std::size_t>
take_line(const_iterator begin, const_iterator end) const;
private: private:
source_ptr source_; source_ptr source_;

View File

@ -10,7 +10,34 @@
namespace toml namespace toml
{ {
//
// A struct to contain location in a toml file. // A struct to contain location in a toml file.
//
// To reduce memory consumption, it omits unrelated parts of long lines, like:
//
// 1. one long line, short region
// ```
// |
// 1 | ... "foo", "bar", baz, "qux", "foobar", ...
// | ^-- unknown value
// ```
// 2. long region
// ```
// |
// 1 | array = [ "foo", ... "bar" ]
// | ^^^^^^^^^^^^^^^^^^^^- in this array
// ```
// 3. many lines
// ```
// |
// 1 | array = [ "foo",
// | ^^^^^^^^
// | ...
// | ^^^
// |
// 10 | , "bar"]
// | ^^^^^^^^- in this array
// ```
//
struct source_location struct source_location
{ {
public: public:
@ -39,13 +66,19 @@ struct source_location
std::vector<std::string> const& lines() const noexcept {return line_str_;} std::vector<std::string> const& lines() const noexcept {return line_str_;}
// for internal use
std::size_t first_column_offset() const noexcept {return this->first_offset_;}
std::size_t last_column_offset() const noexcept {return this->last_offset_;}
private: private:
bool is_ok_; bool is_ok_;
std::size_t first_line_; std::size_t first_line_;
std::size_t first_column_; std::size_t first_column_; // column num in the actual file
std::size_t first_offset_; // column num in the shown line
std::size_t last_line_; std::size_t last_line_;
std::size_t last_column_; std::size_t last_column_; // column num in the actual file
std::size_t last_offset_; // column num in the shown line
std::size_t length_; std::size_t length_;
std::string file_name_; std::string file_name_;
std::vector<std::string> line_str_; std::vector<std::string> line_str_;

View File

@ -121,27 +121,65 @@ TOML11_INLINE std::string region::as_string() const
} }
} }
TOML11_INLINE std::vector<std::string> region::as_lines() const TOML11_INLINE std::pair<std::string, std::size_t>
region::take_line(const_iterator begin, const_iterator end) const
{
// Builds the text to display for the line around [begin, end) and returns it
// paired with the 0-origin offset of `begin` inside that returned text.
//
// To keep long lines short, the text kept outside [begin, end) is capped at
// 30 characters before `begin` and 30 characters after `end`. The range
// [begin, end) itself is always included in full.
const auto dist_before = std::distance(source_->cbegin(), begin);
const auto dist_after = std::distance(end, source_->cend());
// Clamp the search window to the source bounds so std::prev/std::next never
// step outside the buffer.
const const_iterator capped_begin = (dist_before <= 30) ? source_->cbegin() : std::prev(begin, 30);
const const_iterator capped_end = (dist_after <= 30) ? source_->cend() : std::next(end, 30);
const auto lf = char_type('\n');
// Search backwards from `begin` for the nearest LF inside the window.
// NOTE(review): assumes cxx::make_reverse_iterator behaves like
// std::make_reverse_iterator, so `.base()` is the position just after the
// found LF, or `capped_begin` when no LF is found -- confirm.
const auto lf_before = std::find(cxx::make_reverse_iterator(begin),
cxx::make_reverse_iterator(capped_begin), lf);
// Search forwards from `end` for the nearest LF inside the window; yields
// `capped_end` when no LF is found.
const auto lf_after = std::find(end, capped_end, lf);
auto offset = static_cast<std::size_t>(std::distance(lf_before.base(), begin));
std::string retval = make_string(lf_before.base(), lf_after);
// The kept text does not start at the beginning of the source and the
// character right before it is not an LF: the line was cut on the left.
// Mark the cut and shift the offset by the length of the marker.
if(lf_before.base() != source_->cbegin() && *lf_before != lf)
{
retval = "... " + retval;
offset += 4;
}
// The kept text stops before the end of the source and not at an LF: the
// line was cut on the right. The offset is unaffected by a trailing marker.
if(lf_after != source_->cend() && *lf_after != lf)
{
retval = retval + " ...";
}
return std::make_pair(retval, offset);
}
TOML11_INLINE std::vector<std::pair<std::string, std::size_t>> region::as_lines() const
{ {
assert(this->is_ok()); assert(this->is_ok());
if(this->length_ == 0) if(this->length_ == 0)
{ {
return std::vector<std::string>{""}; return std::vector<std::pair<std::string, std::size_t>>{
std::make_pair("", std::size_t(0))
};
} }
// Consider the following toml file // Consider the following toml file
// ``` // ```
// array = [ // array = [
// 1, 2, 3,
// ] # comment // ] # comment
// ``` // ```
// and the region represents // and the region represents
// ``` // ```
// [ // [
// 1, 2, 3,
// ] // ]
// ``` // ```
// but we want to show the following. // but we want to show the following.
// ``` // ```
// array = [ // array = [
// 1, 2, 3,
// ] # comment // ] # comment
// ``` // ```
// So we need to find LFs before `begin` and after `end`. // So we need to find LFs before `begin` and after `end`.
@ -162,25 +200,45 @@ TOML11_INLINE std::vector<std::string> region::as_lines() const
const auto begin = std::next(this->source_->cbegin(), begin_idx); const auto begin = std::next(this->source_->cbegin(), begin_idx);
const auto end = std::next(this->source_->cbegin(), end_idx); const auto end = std::next(this->source_->cbegin(), end_idx);
const auto line_begin = std::find(cxx::make_reverse_iterator(begin), this->source_->crend(), char_type('\n')).base(); assert(this->first_line_number() <= this->last_line_number());
const auto line_end = std::find(end, this->source_->cend(), char_type('\n'));
const auto reg_lines = make_string(line_begin, line_end); if(this->first_line_number() == this->last_line_number())
if(reg_lines == "") // the region is an empty line that only contains LF
{ {
return std::vector<std::string>{""}; return std::vector<std::pair<std::string, std::size_t>>{
this->take_line(begin, end)
};
} }
std::istringstream iss(reg_lines); // we have multiple lines. `begin` and `end` points different lines.
// that means that there is at least one `LF` between `begin` and `end`.
std::vector<std::string> lines; const auto after_begin = std::distance(begin, this->source_->cend());
std::string line; const auto before_end = std::distance(this->source_->cbegin(), end);
while(std::getline(iss, line))
const_iterator capped_file_end = this->source_->cend();
const_iterator capped_file_begin = this->source_->cbegin();
if(60 < after_begin) {capped_file_end = std::next(begin, 50);}
if(60 < before_end) {capped_file_begin = std::prev(end, 50);}
const auto lf = char_type('\n');
const auto first_line_end = std::find(begin, capped_file_end, lf);
const auto last_line_begin = std::find(capped_file_begin, end, lf);
const auto first_line = this->take_line(begin, first_line_end);
const auto last_line = this->take_line(last_line_begin, end);
if(this->first_line_number() + 1 == this->last_line_number())
{ {
lines.push_back(line); return std::vector<std::pair<std::string, std::size_t>>{
first_line, last_line
};
}
else
{
return std::vector<std::pair<std::string, std::size_t>>{
first_line, std::make_pair("...", 0), last_line
};
} }
return lines;
} }
} // namespace detail } // namespace detail

View File

@ -20,8 +20,10 @@ TOML11_INLINE source_location::source_location(const detail::region& r)
: is_ok_(false), : is_ok_(false),
first_line_(1), first_line_(1),
first_column_(1), first_column_(1),
first_offset_(1),
last_line_(1), last_line_(1),
last_column_(1), last_column_(1),
last_offset_(1),
length_(0), length_(0),
file_name_("unknown file") file_name_("unknown file")
{ {
@ -34,7 +36,17 @@ TOML11_INLINE source_location::source_location(const detail::region& r)
this->last_line_ = r.last_line_number(); this->last_line_ = r.last_line_number();
this->last_column_ = r.last_column_number(); this->last_column_ = r.last_column_number();
this->length_ = r.length(); this->length_ = r.length();
this->line_str_ = r.as_lines();
const auto lines = r.as_lines();
assert( ! lines.empty());
for(const auto& l : lines)
{
this->line_str_.push_back(l.first);
}
this->first_offset_ = lines.at( 0).second + 1; // to 1-origin
this->last_offset_ = lines.at(lines.size()-1).second + 1;
} }
} }
@ -145,36 +157,36 @@ TOML11_INLINE std::string format_location_impl(const std::size_t lnw,
{ {
// when column points LF, it exceeds the size of the first line. // when column points LF, it exceeds the size of the first line.
std::size_t underline_limit = 1; std::size_t underline_limit = 1;
if(loc.first_line().size() < loc.first_column_number()) if(loc.first_line().size() < loc.first_column_offset())
{ {
underline_limit = 1; underline_limit = 1;
} }
else else
{ {
underline_limit = loc.first_line().size() - loc.first_column_number() + 1; underline_limit = loc.first_line().size() - loc.first_column_offset() + 1;
} }
const auto underline_len = (std::min)(underline_limit, loc.length()); const auto underline_len = (std::min)(underline_limit, loc.length());
format_line(oss, lnw, loc.first_line_number(), loc.first_line()); format_line(oss, lnw, loc.first_line_number(), loc.first_line());
format_underline(oss, lnw, loc.first_column_number(), underline_len, msg); format_underline(oss, lnw, loc.first_column_offset(), underline_len, msg);
} }
else if(loc.lines().size() == 2) else if(loc.lines().size() == 2)
{ {
const auto first_underline_len = const auto first_underline_len =
loc.first_line().size() - loc.first_column_number() + 1; loc.first_line().size() - loc.first_column_offset() + 1;
format_line(oss, lnw, loc.first_line_number(), loc.first_line()); format_line(oss, lnw, loc.first_line_number(), loc.first_line());
format_underline(oss, lnw, loc.first_column_number(), format_underline(oss, lnw, loc.first_column_offset(),
first_underline_len, ""); first_underline_len, "");
format_line(oss, lnw, loc.last_line_number(), loc.last_line()); format_line(oss, lnw, loc.last_line_number(), loc.last_line());
format_underline(oss, lnw, 1, loc.last_column_number(), msg); format_underline(oss, lnw, 1, loc.last_column_offset(), msg);
} }
else if(loc.lines().size() > 2) else if(loc.lines().size() > 2)
{ {
const auto first_underline_len = const auto first_underline_len =
loc.first_line().size() - loc.first_column_number() + 1; loc.first_line().size() - loc.first_column_offset() + 1;
format_line(oss, lnw, loc.first_line_number(), loc.first_line()); format_line(oss, lnw, loc.first_line_number(), loc.first_line());
format_underline(oss, lnw, loc.first_column_number(), format_underline(oss, lnw, loc.first_column_offset(),
first_underline_len, "and"); first_underline_len, "and");
if(loc.lines().size() == 3) if(loc.lines().size() == 3)
@ -188,7 +200,7 @@ TOML11_INLINE std::string format_location_impl(const std::size_t lnw,
format_empty_line(oss, lnw); format_empty_line(oss, lnw);
} }
format_line(oss, lnw, loc.last_line_number(), loc.last_line()); format_line(oss, lnw, loc.last_line_number(), loc.last_line());
format_underline(oss, lnw, 1, loc.last_column_number(), msg); format_underline(oss, lnw, 1, loc.last_column_offset(), msg);
} }
// if loc is empty, do nothing. // if loc is empty, do nothing.
return oss.str(); return oss.str();