Helpers: Added Support For Removing HTML Encoded Characters

This commit is contained in:
Moss 2022-09-30 22:49:03 -04:00
parent 3f592696d6
commit 350249aa16
3 changed files with 77 additions and 3 deletions

View File

@ -38,6 +38,62 @@ namespace dropout_dl {
return str.substr(start, end - start + 1);
}
/// Appends the UTF-8 encoding of the Unicode code point <b>code_point</b> to <b>out</b>.
/// The caller guarantees code_point <= 0x10FFFF and that it is not a surrogate.
static void append_code_point_as_utf8(std::string& out, unsigned long code_point) {
    if (code_point < 0x80) {
        out += static_cast<char>(code_point);
    }
    else if (code_point < 0x800) {
        out += static_cast<char>(0xC0 | (code_point >> 6));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
    else if (code_point < 0x10000) {
        out += static_cast<char>(0xE0 | (code_point >> 12));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
    else {
        out += static_cast<char>(0xF0 | (code_point >> 18));
        out += static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
}

/**
 * Replaces decimal HTML character codes ("&#" + digits + ";") with the character they encode.
 *
 * @param str - A string that may contain decimal HTML character codes
 * @return A copy of <b>str</b> with every well-formed code decoded. Codes below 128 become the
 *         matching ASCII character; larger codes are written as UTF-8.
 *
 * Anything that is not a well-formed code (no digits, no terminating ';', a value above 0x10FFFF
 * or a surrogate) is copied to the output unchanged, as is hexadecimal notation ("&#x..;").
 *
 * A code below 32 (a control character) is considered an error: a message is printed to stderr
 * and the program exits with status 11.
 */
std::string replace_html_character_codes(const std::string& str) {
    const std::size_t length = str.size();
    std::string out;
    out.reserve(length);

    std::size_t i = 0;
    while (i < length) {
        // compare() clamps to the end of the string, so a trailing '&' is handled safely.
        if (str.compare(i, 2, "&#") != 0) {
            out += str[i++];
            continue;
        }

        // Accumulate the run of decimal digits that follows "&#".
        std::size_t pos = i + 2;
        unsigned long code = 0;
        bool in_range = true;
        while (pos < length && str[pos] >= '0' && str[pos] <= '9') {
            if (in_range) {
                code = code * 10 + static_cast<unsigned long>(str[pos] - '0');
                // Stop accumulating once past the last Unicode code point so the value cannot overflow.
                in_range = code <= 0x10FFFF;
            }
            pos++;
        }

        const bool has_digits = pos > i + 2;
        const bool terminated = pos < length && str[pos] == ';';
        const bool is_surrogate = code >= 0xD800 && code <= 0xDFFF;
        if (!has_digits || !terminated || !in_range || is_surrogate) {
            // Not a decodable character code: keep the text exactly as it was.
            out += str[i++];
            continue;
        }

        if (code < 32) {
            std::cerr << "HTML CHAR CODE ERROR: Control Character Decoded. This is not supported and likely an error.\n";
            exit(11);
        }

        append_code_point_as_utf8(out, code);
        // Resume after the terminating ';'.
        i = pos + 1;
    }
    return out;
}
/**
 * Cleans up a name string: surrounding whitespace is stripped first, then
 * HTML character codes in the trimmed text are replaced.
 */
std::string format_name_string(const std::string& str) {
    const std::string trimmed = remove_leading_and_following_whitespace(str);
    return replace_html_character_codes(trimmed);
}
#if defined(__WIN32__)
#include <windows.h>
msec_t time_ms(void)
@ -105,7 +161,7 @@ namespace dropout_dl {
k++;
for (int l = 0; l < html_data.size() - k; l++) {
if (substr_is(html_data, k + l, close_a)) {
return remove_leading_and_following_whitespace(html_data.substr(k, l));
return format_name_string(html_data.substr(k, l));
}
}
}
@ -132,7 +188,7 @@ namespace dropout_dl {
}
for (int j = 0; j < html_data.size() - title_start; j++) {
if (substr_is(html_data, title_start + j, close_strong)) {
return remove_leading_and_following_whitespace(html_data.substr(title_start, j));
return format_name_string(html_data.substr(title_start, j));
}
}
}

View File

@ -171,6 +171,24 @@ namespace dropout_dl {
*/
std::string remove_leading_and_following_whitespace(const std::string& str);
/**
 * Decodes numeric html character codes (such as \&#39;) found in a string.
 *
 * @param str - A string which may contain numeric html character codes
 * @return A copy of <b>str</b> with any html character codes replaced with their ascii equivalent.
 *
 * E.G. \&#39; would be replaced with '
 *
 * NOTE: Decoding a code below 32 (a control character) is treated as an error:
 * a message is written to stderr and the program exits with status 11.
 */
std::string replace_html_character_codes(const std::string& str);
/**
 * Cleans up a name string for further use.
 *
 * @param str - A string
 * @return <b>str</b> with junk removed or replaced
 *
 * Removes leading and following whitespace, then replaces html character codes
 * in the trimmed string.
 */
std::string format_name_string(const std::string& str);
#if defined(__WIN32__)
#include <windows.h>
msec_t time_ms(void);

View File

@ -18,7 +18,7 @@ namespace dropout_dl {
if (substr_is(html_data, j, close_tag)) {
for (int l = 0; l < html_data.size() - j; l++) {
if (substr_is(html_data, j + l, close_a)) {
return remove_leading_and_following_whitespace(html_data.substr(j + 1, l - 1));
return format_name_string(html_data.substr(j + 1, l - 1));
}
}
}