/* * Copyright (c) Facebook, Inc. and its affiliates. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef FOLLY_GEN_STRING_H_ #error This file may only be included from folly/gen/String.h #endif #include #include #include namespace folly { namespace gen { namespace detail { /** * Finds the first occurrence of delimiter in "in", advances "in" past the * delimiter. Populates "prefix" with the consumed bytes, including the * delimiter. * * Returns the number of trailing bytes of "prefix" that make up the * delimiter, or 0 if the delimiter was not found. */ inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, char delimiter) { size_t found = in.find(delimiter); if (found != StringPiece::npos) { ++found; prefix.assign(in.data(), in.data() + found); in.advance(found); return 1; } prefix.clear(); return 0; } /** * As above, but supports multibyte delimiters. */ inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, StringPiece delimiter) { auto found = in.find(delimiter); if (found != StringPiece::npos) { found += delimiter.size(); prefix.assign(in.data(), in.data() + found); in.advance(found); return delimiter.size(); } prefix.clear(); return 0; } /** * As above, but splits by any of the EOL terms: \r, \n, or \r\n. */ inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, MixedNewlines) { const auto kCRLF = "\r\n"; const size_t kLenCRLF = 2; auto p = in.find_first_of(kCRLF); if (p != std::string::npos) { const auto in_start = in.data(); size_t delim_len = 1; in.advance(p); // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time. if (in.removePrefix(kCRLF)) { delim_len = kLenCRLF; } else { in.advance(delim_len); } prefix.assign(in_start, in.data()); return delim_len; } prefix.clear(); return 0; } inline const char* ch(const unsigned char* p) { return reinterpret_cast(p); } // Chop s into pieces of at most maxLength, feed them to cb template bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) { while (!s.empty()) { auto num_to_add = s.size(); if (maxLength) { num_to_add = std::min(num_to_add, maxLength); } if (!cb(StringPiece(s.begin(), num_to_add))) { return false; } s.advance(num_to_add); } return true; } // Consumes all of buffer, plus n chars from s. template bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) { buf.reserve(0, n); memcpy(buf.writableTail(), s.data(), n); buf.append(n); s.advance(n); if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) { return false; } buf.clear(); return true; } } // namespace detail template bool StreamSplitter::flush() { CHECK(maxLength_ == 0 || buffer_.length() < maxLength_); if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) { return false; } // We are ready to handle another stream now. buffer_.clear(); return true; } template bool StreamSplitter::operator()(StringPiece in) { StringPiece prefix; // NB This code assumes a 1-byte delimiter. It's not too hard to support // multibyte delimiters, just remember that maxLength_ chunks can end up // falling in the middle of a delimiter. bool found = detail::splitPrefix(in, prefix, delimiter_); if (buffer_.length() != 0) { if (found) { uint64_t num_to_add = prefix.size(); if (maxLength_) { CHECK(buffer_.length() < maxLength_); // Consume as much of prefix as possible without exceeding maxLength_ num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add); } // Append part of the prefix to the buffer, and send it to the callback if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) { return false; } if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) { return false; } found = detail::splitPrefix(in, prefix, delimiter_); // Post-conditions: // - we consumed all of buffer_ and all of the first prefix. // - found, in, and prefix reflect the second delimiter_ search } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) { // Send all of buffer_, plus a bit of in, to the callback if (!detail::consumeBufferPlus( pieceCb_, buffer_, in, maxLength_ - buffer_.length())) { return false; } // Post-conditions: // - we consumed all of buffer, and the minimal # of bytes from in // - found is false } // Otherwise: found is false & we cannot invoke the callback this turn } // Post-condition: buffer_ is nonempty only if found is false **and** // len(buffer + in) < maxLength_. // Send lines to callback directly from input (no buffer) while (found) { // Buffer guaranteed to be empty if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) { return false; } found = detail::splitPrefix(in, prefix, delimiter_); } // No more delimiters left; consume 'in' until it is shorter than maxLength_ if (maxLength_) { while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty if (!pieceCb_(StringPiece(in.begin(), maxLength_))) { return false; } in.advance(maxLength_); } } if (!in.empty()) { // Buffer may be nonempty // Incomplete line left, append to buffer buffer_.reserve(0, in.size()); memcpy(buffer_.writableTail(), in.data(), in.size()); buffer_.append(in.size()); } CHECK(maxLength_ == 0 || buffer_.length() < maxLength_); return true; } namespace detail { class StringResplitter : public Operator { char delimiter_; bool keepDelimiter_; public: explicit StringResplitter(char delimiter, bool keepDelimiter = false) : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {} template class Generator : public GenImpl> { Source source_; char delimiter_; bool keepDelimiter_; public: Generator(Source source, char delimiter, bool keepDelimiter) : source_(std::move(source)), delimiter_(delimiter), keepDelimiter_(keepDelimiter) {} template bool apply(Body&& body) const { auto splitter = streamSplitter(this->delimiter_, [this, &body](StringPiece s) { // The stream ended with a delimiter; our contract is to swallow // the final empty piece. if (s.empty()) { return true; } if (s.back() != this->delimiter_) { return body(s); } if (!keepDelimiter_) { s.pop_back(); // Remove the 1-character delimiter } return body(s); }); if (!source_.apply(splitter)) { return false; } return splitter.flush(); } static constexpr bool infinite = Source::infinite; }; template > Gen compose(GenImpl&& source) const { return Gen(std::move(source.self()), delimiter_, keepDelimiter_); } template > Gen compose(const GenImpl& source) const { return Gen(source.self(), delimiter_, keepDelimiter_); } }; template class SplitStringSource : public GenImpl> { StringPiece source_; DelimiterType delimiter_; public: SplitStringSource(const StringPiece source, DelimiterType delimiter) : source_(source), delimiter_(std::move(delimiter)) {} template bool apply(Body&& body) const { StringPiece rest(source_); StringPiece prefix; while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) { prefix.subtract(delim_len); // Remove the delimiter if (!body(prefix)) { return false; } } if (!rest.empty()) { if (!body(rest)) { return false; } } return true; } }; /** * Unsplit - For joining tokens from a generator into a string. This is * the inverse of `split` above. * * This type is primarily used through the 'unsplit' function. */ template class Unsplit : public Operator> { Delimiter delimiter_; public: explicit Unsplit(const Delimiter& delimiter) : delimiter_(delimiter) {} template Output compose(const GenImpl& source) const { Output outputBuffer; UnsplitBuffer unsplitter(delimiter_, &outputBuffer); unsplitter.compose(source); return outputBuffer; } }; /** * UnsplitBuffer - For joining tokens from a generator into a string, * and inserting them into a custom buffer. * * This type is primarily used through the 'unsplit' function. */ template class UnsplitBuffer : public Operator> { Delimiter delimiter_; OutputBuffer* outputBuffer_; public: UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer) : delimiter_(delimiter), outputBuffer_(outputBuffer) { CHECK(outputBuffer); } template void compose(const GenImpl& source) const { // If the output buffer is empty, we skip inserting the delimiter for the // first element. bool skipDelim = outputBuffer_->empty(); source | [&](Value v) { if (skipDelim) { skipDelim = false; toAppend(std::forward(v), outputBuffer_); } else { toAppend(delimiter_, std::forward(v), outputBuffer_); } }; } }; /** * Hack for static for-like constructs */ template inline Target passthrough(Target target) { return target; } FOLLY_PUSH_WARNING #ifdef __clang__ // Clang isn't happy with eatField() hack below. #pragma GCC diagnostic ignored "-Wreturn-stack-address" #endif // __clang__ /** * ParseToTuple - For splitting a record and immediatlely converting it to a * target tuple type. Primary used through the 'eachToTuple' helper, like so: * * auto config * = split("1:a 2:b", ' ') * | eachToTuple() * | as>>(); * */ template class SplitTo { Delimiter delimiter_; public: explicit SplitTo(Delimiter delimiter) : delimiter_(delimiter) {} TargetContainer operator()(StringPiece line) const { int i = 0; StringPiece fields[sizeof...(Targets)]; // HACK(tjackson): Used for referencing fields[] corresponding to variadic // template parameters. auto eatField = [&]() -> StringPiece& { return fields[i++]; }; if (!split( delimiter_, line, detail::passthrough(eatField())...)) { throw std::runtime_error("field count mismatch"); } i = 0; return TargetContainer(To()(eatField())...); } }; FOLLY_POP_WARNING } // namespace detail } // namespace gen } // namespace folly