Helpers: Added Support For Removing HTML Encoded Characters
This commit is contained in:
parent
3f592696d6
commit
350249aa16
|
@ -38,6 +38,62 @@ namespace dropout_dl {
|
||||||
return str.substr(start, end - start + 1);
|
return str.substr(start, end - start + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces decimal HTML numeric character references in str with the ASCII
 * character they encode, e.g. "&#39;" becomes "'" and "&#126;" becomes "~".
 *
 * A reference is "&#", two or more decimal digits, and a terminating ';'.
 * All other text is copied through unchanged.
 *
 * As before, invalid input is treated as fatal: the function prints a
 * diagnostic to std::cerr and terminates the process with exit code 11 when
 *   - the reference is malformed (no digits, or no terminating ';'),
 *   - the reference has a single digit,
 *   - the decoded value is a control character (< 32), or
 *   - the decoded value is outside the ASCII range (> 127).
 *
 * @param str - A string that may contain numeric character references
 * @return A copy of str with every reference replaced by its ASCII character
 */
std::string replace_html_character_codes(const std::string& str) {
	std::string out;
	out.reserve(str.size());

	for (std::size_t i = 0; i < str.size(); i++) {
		// compare() clamps the length to the end of the string, so this is
		// safe even when fewer than two characters remain.
		if (str.compare(i, 2, "&#") != 0) {
			out += str[i];
			continue;
		}

		// Accumulate the decimal digits that follow "&#".
		std::size_t pos = i + 2;
		std::size_t digit_count = 0;
		int code = 0;
		while (pos < str.size() && str[pos] >= '0' && str[pos] <= '9') {
			// Stop growing once the value is past the largest code point so an
			// absurdly long digit run cannot overflow the int.
			if (code <= 0x10FFFF) {
				code = code * 10 + (str[pos] - '0');
			}
			digit_count++;
			pos++;
		}

		if (digit_count == 0 || pos >= str.size() || str[pos] != ';') {
			std::cerr << "HTML CHAR CODE ERROR: Malformed numerical character code\n";
			exit(11);
		}

		if (digit_count == 1) {
			std::cerr << "HTML CHAR CODE ERROR: Code with numerical length of one used\n";
			exit(11);
		}

		if (code < 32) {
			std::cerr << "HTML CHAR CODE ERROR: Control Character Decoded. This is not supported and likely an error.\n";
			exit(11);
		}

		if (code > 127) {
			// The result is a plain ASCII string; a larger value cannot be
			// stored in a single char without corrupting it.
			std::cerr << "HTML CHAR CODE ERROR: Non-ASCII Character Decoded. This is not supported.\n";
			exit(11);
		}

		out += static_cast<char>(code);

		// Leave i on the ';' so the loop increment moves past the reference.
		i = pos;
	}

	return out;
}
|
||||||
|
|
||||||
|
|
||||||
|
std::string format_name_string(const std::string& str) {
|
||||||
|
return replace_html_character_codes(remove_leading_and_following_whitespace(str));
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(__WIN32__)
|
#if defined(__WIN32__)
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
msec_t time_ms(void)
|
msec_t time_ms(void)
|
||||||
|
@ -105,7 +161,7 @@ namespace dropout_dl {
|
||||||
k++;
|
k++;
|
||||||
for (int l = 0; l < html_data.size() - k; l++) {
|
for (int l = 0; l < html_data.size() - k; l++) {
|
||||||
if (substr_is(html_data, k + l, close_a)) {
|
if (substr_is(html_data, k + l, close_a)) {
|
||||||
return remove_leading_and_following_whitespace(html_data.substr(k, l));
|
return format_name_string(html_data.substr(k, l));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -132,7 +188,7 @@ namespace dropout_dl {
|
||||||
}
|
}
|
||||||
for (int j = 0; j < html_data.size() - title_start; j++) {
|
for (int j = 0; j < html_data.size() - title_start; j++) {
|
||||||
if (substr_is(html_data, title_start + j, close_strong)) {
|
if (substr_is(html_data, title_start + j, close_strong)) {
|
||||||
return remove_leading_and_following_whitespace(html_data.substr(title_start, j));
|
return format_name_string(html_data.substr(title_start, j));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -171,6 +171,24 @@ namespace dropout_dl {
|
||||||
*/
|
*/
|
||||||
std::string remove_leading_and_following_whitespace(const std::string& str);
|
std::string remove_leading_and_following_whitespace(const std::string& str);
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param str - A string
|
||||||
|
* @return <b>str</b> with any html character codes replaced with their ascii equivalent.
|
||||||
|
*
|
||||||
|
* E.G. \&#39; would be replaced with '
|
||||||
|
*/
|
||||||
|
std::string replace_html_character_codes(const std::string& str);
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param str - A string
|
||||||
|
* @return <b>str</b> with junk removed or replaced
|
||||||
|
*
|
||||||
|
* Removes leading and following whitespace and replaces HTML character codes
|
||||||
|
*/
|
||||||
|
std::string format_name_string(const std::string& str);
|
||||||
|
|
||||||
#if defined(__WIN32__)
|
#if defined(__WIN32__)
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
msec_t time_ms(void);
|
msec_t time_ms(void);
|
||||||
|
|
|
@ -18,7 +18,7 @@ namespace dropout_dl {
|
||||||
if (substr_is(html_data, j, close_tag)) {
|
if (substr_is(html_data, j, close_tag)) {
|
||||||
for (int l = 0; l < html_data.size() - j; l++) {
|
for (int l = 0; l < html_data.size() - j; l++) {
|
||||||
if (substr_is(html_data, j + l, close_a)) {
|
if (substr_is(html_data, j + l, close_a)) {
|
||||||
return remove_leading_and_following_whitespace(html_data.substr(j + 1, l - 1));
|
return format_name_string(html_data.substr(j + 1, l - 1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue