// osb/source/core/StarString.hpp
//
// 522 lines
// 17 KiB
// C++

#ifndef STAR_STRING_HPP
#define STAR_STRING_HPP
#include "StarUnicode.hpp"
#include "StarHash.hpp"
#include "StarByteArray.hpp"
#include "StarList.hpp"
#include "StarMap.hpp"
#include "StarSet.hpp"
namespace Star {
// Up-front declarations for the string types used throughout this header
// (StringView is only referenced here, not defined) and for the exception
// type thrown by string operations such as String::at.
// NOTE(review): STAR_CLASS and STAR_EXCEPTION are project macros defined
// outside this file; presumably STAR_CLASS forward-declares the class and
// STAR_EXCEPTION declares StringException as deriving from StarException --
// confirm against the macro definitions.
STAR_CLASS(StringList);
STAR_CLASS(String);
STAR_CLASS(StringView);
STAR_EXCEPTION(StringException, StarException);
// A Unicode string class, which is a basic UTF-8 aware wrapper around
// std::string. Provides methods for accessing UTF-32 "Char" type, which
// provides access to each individual code point. Printing, hashing, copying,
// and in-order access should be basically as fast as std::string, but the more
// complex string processing methods may be much worse.
//
// All case sensitive / insensitive functionality is based on ASCII tolower and
// toupper, and will have no effect on characters outside ASCII. Therefore,
// case insensitivity is really only appropriate for code / script processing,
// not for general strings.
class String {
public:
// A single UTF-32 code point.
typedef Utf32Type Char;
// std::basic_string equivalent that guarantees constant access time for
// operator[], etc
typedef std::basic_string<Char> WideString;
// Read-only iterator over the UTF-8 storage that yields Chars (see
// U8ToU32Iterator).
typedef U8ToU32Iterator<std::string::const_iterator> const_iterator;
typedef Char value_type;
typedef value_type const& const_reference;
// Selects ASCII-only case folding for comparisons and searches; see the
// class comment for why this is unsuitable for general text.
enum CaseSensitivity {
CaseSensitive,
CaseInsensitive
};
// Space, horizontal tab, newline, carriage return, and BOM / ZWNBSP
static bool isSpace(Char c);
static bool isAsciiNumber(Char c);
static bool isAsciiLetter(Char c);
// These methods only actually work on unicode characters below 127, i.e.
// ASCII subset.
static Char toLower(Char c);
static Char toUpper(Char c);
static bool charEqual(Char c1, Char c2, CaseSensitivity cs);
// Join two strings together with a joiner, so that only one instance of the
// joiner is in between the left and right strings. For example, joins "foo"
// and "bar" with "?" to produce "foo?bar". Gets rid of repeat joiners, so
// "foo?" and "?bar" with "?" also becomes "foo?bar". Also, if left or right
// is empty, does not add a joiner, for example "" and "baz" joined with "?"
// produces "baz".
static String joinWith(String const& join, String const& left, String const& right);
// Variadic form: folds the two-argument joinWith over three or more strings,
// left to right (defined at the bottom of this header).
template <typename... StringType>
static String joinWith(String const& join, String const& first, String const& second, String const& third, StringType const&... rest);
String();
String(String const& s);
String(String&& s);
// These assume utf8 input
String(char const* s);
String(char const* s, size_t n);
String(std::string const& s);
String(std::string&& s);
String(std::wstring const& s);
// Construction from UTF-32 data (Char is Utf32Type).
String(Char const* s);
String(Char const* s, size_t n);
// NOTE(review): presumably a string of n copies of c -- confirm in the
// implementation.
String(Char c, size_t n);
explicit String(Char c);
// const& to internal utf8 data
std::string const& utf8() const;
// NOTE(review): presumably moves the internal utf8 data out, leaving this
// String empty -- confirm in the implementation.
std::string takeUtf8();
ByteArray utf8Bytes() const;
// Pointer to internal utf8 data, null-terminated.
char const* utf8Ptr() const;
size_t utf8Size() const;
std::wstring wstring() const;
WideString wideString() const;
// Iteration yields Chars, not utf8 bytes (see const_iterator).
const_iterator begin() const;
const_iterator end() const;
// NOTE(review): size() / length() presumably count Chars (code points) while
// utf8Size() counts bytes of the utf8 encoding; the two differ for non-ASCII
// text -- confirm in the implementation.
size_t size() const;
size_t length() const;
void clear();
void reserve(size_t n);
bool empty() const;
Char operator[](size_t i) const;
// Throws StringException if i out of range.
Char at(size_t i) const;
// Case conversions returning a new String; per the class comment these only
// affect ASCII characters.
String toUpper() const;
String toLower() const;
String titleCase() const;
bool endsWith(String const& end, CaseSensitivity cs = CaseSensitive) const;
bool endsWith(Char end, CaseSensitivity cs = CaseSensitive) const;
bool beginsWith(String const& beg, CaseSensitivity cs = CaseSensitive) const;
bool beginsWith(Char beg, CaseSensitivity cs = CaseSensitive) const;
String reverse() const;
String rot13() const;
// Split on a single Char or on a whole pattern; the r-prefixed forms
// presumably count maxSplit from the end of the string -- confirm.
StringList split(Char c, size_t maxSplit = NPos) const;
StringList split(String const& pattern, size_t maxSplit = NPos) const;
StringList rsplit(Char c, size_t maxSplit = NPos) const;
StringList rsplit(String const& pattern, size_t maxSplit = NPos) const;
// Splits on any number of contiguous instances of any of the given
// characters. Behaves differently than regular split in that leading and
// trailing instances of the characters are also ignored, and in general no
// empty strings will be in the resulting split list. If chars is empty,
// then splits on any whitespace.
StringList splitAny(String const& chars = "", size_t maxSplit = NPos) const;
StringList rsplitAny(String const& chars = "", size_t maxSplit = NPos) const;
// Split any with '\n\r'
StringList splitLines(size_t maxSplit = NPos) const;
// Shorthand for splitAny("");
StringList splitWhitespace(size_t maxSplit = NPos) const;
// Splits a string once based on the given characters (defaulting to
// whitespace), and returns the first part. This string is set to the
// second part.
String extract(String const& chars = "");
String rextract(String const& chars = "");
bool hasChar(Char c) const;
// Identical to hasChar, except, if string is empty, tests if c is
// whitespace.
bool hasCharOrWhitespace(Char c) const;
String replace(String const& rplc, String const& val) const;
// Trimming returns a new String. NOTE(review): an empty chars argument
// presumably means "trim whitespace", matching splitAny -- confirm.
String trimEnd(String const& chars = "") const;
String trimBeg(String const& chars = "") const;
String trim(String const& chars = "") const;
// Searches. NOTE(review): presumably return NPos when nothing is found and
// index in Chars rather than bytes -- confirm in the implementation.
size_t find(Char c, size_t beg = 0, CaseSensitivity cs = CaseSensitive) const;
size_t find(String const& s, size_t beg = 0, CaseSensitivity cs = CaseSensitive) const;
size_t findLast(Char c, CaseSensitivity cs = CaseSensitive) const;
size_t findLast(String const& s, CaseSensitivity cs = CaseSensitive) const;
// If pattern is empty, finds first whitespace
size_t findFirstOf(String const& chars = "", size_t beg = 0) const;
// If pattern is empty, finds first non-whitespace
size_t findFirstNotOf(String const& chars = "", size_t beg = 0) const;
// Finds the start of the next 'boundary' in a string. Used for quickly
// scanning a string
size_t findNextBoundary(size_t index, bool backwards = false) const;
String slice(SliceIndex a = SliceIndex(), SliceIndex b = SliceIndex(), int i = 1) const;
// In-place append / prepend overloads for String, std::string, UTF-32
// (Char const*), char const* and single-Char operands.
void append(String const& s);
void append(std::string const& s);
void append(Char const* s);
void append(Char const* s, size_t n);
void append(char const* s);
void append(char const* s, size_t n);
void append(Char c);
void prepend(String const& s);
void prepend(std::string const& s);
void prepend(Char const* s);
void prepend(Char const* s, size_t n);
void prepend(char const* s);
void prepend(char const* s, size_t n);
void prepend(Char c);
void push_back(Char c);
void push_front(Char c);
bool contains(String const& s, CaseSensitivity cs = CaseSensitive) const;
// Does this string match the given regular expression?
bool regexMatch(String const& regex, bool full = true, bool caseSensitive = true) const;
int compare(String const& s, CaseSensitivity cs = CaseSensitive) const;
bool equals(String const& s, CaseSensitivity cs = CaseSensitive) const;
// Synonym for equals(s, String::CaseInsensitive)
bool equalsIgnoreCase(String const& s) const;
String substr(size_t position, size_t n = NPos) const;
void erase(size_t pos = 0, size_t n = NPos);
String padLeft(size_t size, String const& filler) const;
String padRight(size_t size, String const& filler) const;
// Replace angle bracket tags in the string with values given by the given
// lookup function. Will be called as:
// String lookup(String const& key);
template <typename Lookup>
String lookupTags(Lookup&& lookup) const;
// StringView variant. The lookup is called with a std::string_view of the
// tag name and takeUtf8() is called on whatever it returns.
// maybeLookupTagsView returns an empty Maybe when a non-empty string
// contains no '<' at all.
template <typename Lookup>
Maybe<String> maybeLookupTagsView(Lookup&& lookup) const;
template <typename Lookup>
String lookupTagsView(Lookup&& lookup) const;
// Replace angle bracket tags in the string with values given by the tags
// map. If replaceWithDefault is true, then values that are not found in the
// tags map are replaced with the default string. If replaceWithDefault is
// false, tags that are not found are not replaced at all.
template <typename MapType>
String replaceTags(MapType const& tags, bool replaceWithDefault = false, String defaultValue = "") const;
String& operator=(String const& s);
String& operator=(String&& s);
String& operator+=(String const& s);
String& operator+=(std::string const& s);
String& operator+=(Char const* s);
String& operator+=(char const* s);
String& operator+=(Char c);
// Equality, inequality and ordering are provided for every pairing of String
// with String, std::string, Char const* and char const*, in both operand
// orders.
friend bool operator==(String const& s1, String const& s2);
friend bool operator==(String const& s1, std::string const& s2);
friend bool operator==(String const& s1, Char const* s2);
friend bool operator==(String const& s1, char const* s2);
friend bool operator==(std::string const& s1, String const& s2);
friend bool operator==(Char const* s1, String const& s2);
friend bool operator==(char const* s1, String const& s2);
friend bool operator!=(String const& s1, String const& s2);
friend bool operator!=(String const& s1, std::string const& s2);
friend bool operator!=(String const& s1, Char const* s2);
friend bool operator!=(String const& s1, char const* c);
friend bool operator!=(std::string const& s1, String const& s2);
friend bool operator!=(Char const* s1, String const& s2);
friend bool operator!=(char const* s1, String const& s2);
friend bool operator<(String const& s1, String const& s2);
friend bool operator<(String const& s1, std::string const& s2);
friend bool operator<(String const& s1, Char const* s2);
friend bool operator<(String const& s1, char const* s2);
friend bool operator<(std::string const& s1, String const& s2);
friend bool operator<(Char const* s1, String const& s2);
friend bool operator<(char const* s1, String const& s2);
// Concatenation. The left-hand String is taken by value in the overloads
// below that have a String on the left.
friend String operator+(String s1, String const& s2);
friend String operator+(String s1, std::string const& s2);
friend String operator+(String s1, Char const* s2);
friend String operator+(String s1, char const* s2);
friend String operator+(std::string const& s1, String const& s2);
friend String operator+(Char const* s1, String const& s2);
friend String operator+(char const* s1, String const& s2);
friend String operator+(String s, Char c);
friend String operator+(Char c, String const& s);
// NOTE(review): presumably repeats s the given number of times -- confirm.
friend String operator*(String const& s, unsigned times);
friend String operator*(unsigned times, String const& s);
friend std::ostream& operator<<(std::ostream& os, String const& s);
friend std::istream& operator>>(std::istream& is, String& s);
// String view functions
String(StringView s);
String(std::string_view s);
String& operator+=(StringView s);
String& operator+=(std::string_view s);
private:
// Range-based comparison helper: compares a sub-range of this string with a
// sub-range of other. NOTE(review): the units of the offsets and lengths
// (Chars vs bytes) are not visible here -- confirm in the implementation.
int compare(size_t selfOffset,
size_t selfLen,
String const& other,
size_t otherOffset,
size_t otherLen,
CaseSensitivity cs) const;
// The utf8 encoded contents; exposed read-only through utf8().
std::string m_string;
};
// A List<String> extended with string-specific helpers (case-aware contains,
// join, trimAll) and with non-mutating filtered / sorted variants.
class StringList : public List<String> {
public:
  // Re-export the container aliases of the underlying List<String>.
  using Base = List<String>;
  using iterator = Base::iterator;
  using const_iterator = Base::const_iterator;
  using value_type = Base::value_type;
  using reference = Base::reference;
  using const_reference = Base::const_reference;

  // Builds a StringList from the [begin(), end()) range of any container
  // whose elements can be used to construct Strings.
  template <typename Container>
  static StringList from(Container const& m);

  StringList();
  StringList(Base const& l);
  StringList(Base&& l);
  StringList(StringList const& l);
  StringList(StringList&& l);
  // Construct from a C array of len UTF-32 / char strings.
  StringList(size_t len, String::Char const* const* list);
  StringList(size_t len, char const* const* list);
  // NOTE(review): presumably len copies of s1 -- confirm in the
  // implementation.
  explicit StringList(size_t len, String const& s1 = String());
  StringList(std::initializer_list<String> list);
  // Range constructor; forwards directly to the List<String> range
  // constructor.
  template <typename InputIterator>
  StringList(InputIterator beg, InputIterator end) : Base(beg, end) {}

  StringList& operator=(Base const& rhs);
  StringList& operator=(Base&& rhs);
  StringList& operator=(StringList const& rhs);
  StringList& operator=(StringList&& rhs);
  StringList& operator=(initializer_list<String> list);

  // Membership test with optional ASCII case-insensitivity.
  bool contains(String const& s, String::CaseSensitivity cs = String::CaseSensitive) const;
  // NOTE(review): presumably applies String::trim(chars) to every element --
  // confirm in the implementation.
  StringList trimAll(String const& chars = "") const;
  String join(String const& separator = "") const;
  StringList slice(SliceIndex a = SliceIndex(), SliceIndex b = SliceIndex(), int i = 1) const;

  // Const counterparts of Base::filter / Base::sort that leave this list
  // untouched and return a new StringList; the templates are defined at the
  // bottom of this header.
  template <typename Filter>
  StringList filtered(Filter&& filter) const;
  template <typename Comparator>
  StringList sorted(Comparator&& comparator) const;
  StringList sorted() const;
};
// Stream output for a whole StringList; the definition is not in this header.
std::ostream& operator<<(std::ostream& os, StringList const& list);
// Specialization of the project's hash template for String. operator() is
// defined inline further down in this header and feeds the string's utf8
// bytes to a PLHasher.
template <>
struct hash<String> {
size_t operator()(String const& s) const;
};
// Hash functor plugged into CaseInsensitiveStringMap below.
// NOTE(review): presumably hashes a case-folded form of the string so that
// strings equal under CaseInsensitiveStringCompare hash alike -- the
// definition is not in this header; confirm.
struct CaseInsensitiveStringHash {
size_t operator()(String const& s) const;
};
// Comparison functor plugged into CaseInsensitiveStringMap below. It fills
// the template slot whose default is std::equal_to<String>, so it is
// presumably an equality test (not an ordering) -- confirm in the
// implementation.
struct CaseInsensitiveStringCompare {
bool operator()(String const& lhs, String const& rhs) const;
};
// Convenience aliases for hash containers keyed by String.
typedef HashSet<String> StringSet;
// HashMap keyed by String; the hash and key-equality functors can be
// overridden and default to hash<String> / std::equal_to<String>.
template <typename MappedT, typename HashT = hash<String>, typename ComparatorT = std::equal_to<String>>
using StringMap = HashMap<String, MappedT, HashT, ComparatorT>;
// Same parameters as StringMap, but backed by StableHashMap.
template <typename MappedT, typename HashT = hash<String>, typename ComparatorT = std::equal_to<String>>
using StableStringMap = StableHashMap<String, MappedT, HashT, ComparatorT>;
// StringMap whose keys are hashed and compared with the case-insensitive
// functors declared above.
template <typename MappedT>
using CaseInsensitiveStringMap = StringMap<MappedT, CaseInsensitiveStringHash, CaseInsensitiveStringCompare>;
// Specialization of the project's hash template for StringList, so that a
// StringList can be used as a hash container key. The definition of
// operator() is not in this header.
template <>
struct hash<StringList> {
size_t operator()(StringList const& s) const;
};
// Variadic joinWith: joins the first two strings with the three-argument
// overload, then recurses with that partial result in front of the remaining
// strings until only two operands are left.
template <typename... StringType>
String String::joinWith(
    String const& join, String const& first, String const& second, String const& third, StringType const&... rest) {
  String const head = joinWith(join, first, second);
  return joinWith(join, head, third, rest...);
}
// Returns a copy of this string in which every "<key>" tag has been replaced
// by lookup(key). lookup is called as `String lookup(String const& key)`,
// once per tag, in left-to-right order. Text outside of tags is copied
// through unchanged, and an unterminated '<' (no following '>') is copied
// through verbatim together with the rest of the string.
template <typename Lookup>
String String::lookupTags(Lookup&& lookup) const {
  // Operates directly on the utf8 representation of the strings, rather than
  // using unicode find / replace methods. Every offset below is therefore a
  // byte offset into m_string, and the scan has to be bounded by the byte
  // length. Bounding it by String::size() instead would stop early and drop
  // trailing text whenever that count is smaller than the byte length (as a
  // code point count is for any non-ASCII content).
  size_t const size = m_string.size();

  std::string finalString;
  finalString.reserve(size);

  // Reused across iterations so the key buffer is not reallocated per tag.
  String key;

  size_t start = 0;
  while (start < size) {
    size_t const beginTag = m_string.find('<', start);
    // When beginTag is NPos this search starts past the end and yields NPos.
    size_t const endTag = m_string.find('>', beginTag);

    if (beginTag == NPos || endTag == NPos) {
      // No complete tag remains: copy the tail and stop.
      finalString.append(m_string, start, NPos);
      break;
    }

    // start < size and beginTag < endTag < size hold here, so none of the
    // positions passed to assign / append can be out of range.
    key.m_string.assign(m_string, beginTag + 1, endTag - beginTag - 1);
    finalString.append(m_string, start, beginTag - start);
    finalString += lookup(key).m_string;

    start = endTag + 1;
  }

  return move(finalString);
}
// std::string_view based variant of lookupTags. lookup is called with a
// string_view of each tag name (the text between '<' and '>'), and
// takeUtf8() is called on whatever it returns to obtain the replacement
// text.
//
// Returns an empty Maybe when this string is non-empty and contains no '<'
// at all, so the caller can keep using the original string without a copy.
// Otherwise returns the string with every complete tag replaced; an
// unterminated '<' is copied through verbatim with the rest of the string.
template <typename Lookup>
Maybe<String> String::maybeLookupTagsView(Lookup&& lookup) const {
  std::string_view const view(utf8());

  // Each piece is appended to the output as soon as it is produced. Keeping
  // string_views of the replacements around for a later pass would leave
  // them dangling whenever takeUtf8() returns an owning temporary (as
  // String::takeUtf8 does) or the lookup reuses a buffer between calls.
  std::string finalString;

  size_t start = 0;
  while (start < view.size()) {
    size_t const beginTag = view.find('<', start);
    if (beginTag == NPos && start == 0)
      return Maybe<String>();

    // Allocate only once we know there is something to build, so the
    // "no tags" fast path above stays allocation free.
    if (start == 0)
      finalString.reserve(view.size());

    // When beginTag is NPos this search starts past the end and yields NPos.
    size_t const endTag = view.find('>', beginTag);

    if (beginTag == NPos || endTag == NPos) {
      // No complete tag remains: copy the tail and stop.
      finalString.append(view.substr(start));
      break;
    }

    finalString.append(view.substr(start, beginTag - start));
    finalString.append(lookup(view.substr(beginTag + 1, endTag - beginTag - 1)).takeUtf8());

    start = endTag + 1;
  }

  return move(finalString);
}
// Convenience wrapper around maybeLookupTagsView that always yields a
// String: the tag-replaced text when replacement took place, otherwise a
// copy of this string. maybeLookupTagsView reports "no tags found" with an
// empty Maybe; that case must produce the original text (as lookupTags does
// for tag-free input) rather than an empty string.
template <typename Lookup>
String String::lookupTagsView(Lookup&& lookup) const {
  if (auto result = maybeLookupTagsView(lookup))
    return result.take();
  return *this;
}
// Replaces "<key>" tags using the given map (see lookupTags for the tag
// syntax). A key found in tags is replaced by its mapped value. A missing
// key is replaced by defaultValue when replaceWithDefault is true, and is
// otherwise re-emitted unchanged as "<key>".
template <typename MapType>
String String::replaceTags(MapType const& tags, bool replaceWithDefault, String defaultValue) const {
  auto resolve = [&](String const& key) -> String {
    auto found = tags.find(key);
    if (found != tags.end())
      return found->second;
    if (replaceWithDefault)
      return defaultValue;
    // Put the angle brackets back so the unknown tag survives untouched.
    return "<" + key + ">";
  };
  return lookupTags(resolve);
}
// Hashes a String by feeding each byte of its utf8 representation, in order,
// into a PLHasher and returning the hasher's final value.
inline size_t hash<String>::operator()(String const& s) const {
  PLHasher hasher;
  std::string const& bytes = s.utf8();
  for (size_t i = 0; i < bytes.size(); ++i) {
    char c = bytes[i];
    hasher.put(c);
  }
  return hasher.hash();
}
// Builds a StringList from the elements of m by handing its begin() / end()
// range to the iterator-pair constructor.
template <typename Container>
StringList StringList::from(Container const& m) {
  StringList result(m.begin(), m.end());
  return result;
}
// Returns a copy of this list with Base::filter(filter) applied to it,
// leaving this list unmodified.
template <typename Filter>
StringList StringList::filtered(Filter&& filter) const {
  // Start from a copy of the current contents; filtering a default
  // constructed list would always yield an empty result.
  StringList l(*this);
  l.filter(forward<Filter>(filter));
  return l;
}
// Returns a copy of this list sorted with Base::sort(comparator), leaving
// this list unmodified.
template <typename Comparator>
StringList StringList::sorted(Comparator&& comparator) const {
  // Start from a copy of the current contents; sorting a default constructed
  // list would always yield an empty result.
  StringList l(*this);
  l.sort(forward<Comparator>(comparator));
  return l;
}
}
#endif