From 0e8ad8c44d823659698e1c0c914cd0bdcf1b463c Mon Sep 17 00:00:00 2001 From: Moss Date: Wed, 21 Dec 2022 21:55:16 -0800 Subject: [PATCH] Episode: Update Episode, Season, and Series Name Parsing Now getting json metadata and parsing that. --- src/episode.cpp | 147 +++++++++++++++++++++++++++--------------------- src/episode.h | 72 ++++++++++++++++-------- src/main.cpp | 6 +- src/season.cpp | 2 +- 4 files changed, 133 insertions(+), 94 deletions(-) diff --git a/src/episode.cpp b/src/episode.cpp index eb3b086..ca71d56 100644 --- a/src/episode.cpp +++ b/src/episode.cpp @@ -166,51 +166,61 @@ namespace dropout_dl { return size * nmemb; } + std::string episode::get_meta_data_json(const std::string& html_data) { + std::string data_start("window.Page = {"); + char data_open = '{'; + char data_close = '}'; + char current_char; + // The current grouping depth. 1 because we only use it after we're inside the data brackets + int grouping_depth = 1; + for (int i = 0; i < html_data.size(); i++) { + if (substr_is(html_data, i, data_start)) { + i += data_start.size(); + for (int j = 0; j + i < html_data.size(); j++) { + current_char = html_data[j + i]; + if (current_char == data_open) { + grouping_depth++; + } + else if(current_char == data_close) { + grouping_depth--; + } + + if (grouping_depth == 0) { + return(html_data.substr(i, j)); + } + } + } + } + return "ERROR"; + } + // episode statics - std::string episode::get_series_name(const std::string& html_data) { - std::string series_title("series-title"); - std::string open_a_tag(""); - std::string close_a(""); - - for (int i = 0; i < html_data.size(); i++) { - if (substr_is(html_data, i, series_title)) { - for (int j = i + series_title.size(); j < html_data.size(); j++) { - if (html_data[j] == '\n' || html_data[j] == ' ' || html_data[j] == '\t') continue; - if (substr_is(html_data, j, open_a_tag)) { - for (int k = j + open_a_tag.size(); k < html_data.size(); k++) { - if (substr_is(html_data, k, close_tag)) { - k++; - for (int l = 0; l < html_data.size() - k; l++) { - if (substr_is(html_data, k + l, close_a)) { - return format_name_string(html_data.substr(k, l)); - } - } - } - } - } - } - } - } - return "ERROR"; - } - - std::string episode::get_episode_name(const std::string& html_data) { + std::string episode::get_series_name(const std::string& meta_data) { int title_start = -1; - std::string video_title("video-title"); - std::string open_strong(""); - std::string close_strong(""); - for (int i = 0; i < html_data.size(); i++) { - if (substr_is(html_data, i, video_title)) { - for (int j = i; j < html_data.size(); j++) { - if (substr_is(html_data, j, open_strong)) { - title_start = j + open_strong.size(); - break; - } - } - for (int j = 0; j < html_data.size() - title_start; j++) { - if (substr_is(html_data, title_start + j, close_strong)) { - return format_name_string(html_data.substr(title_start, j)); + std::string parent_title("\"parent\""); + std::string series_title_title("\"name\""); + for (int i = 0; i < meta_data.size(); i++) { + if (substr_is(meta_data, i, parent_title)) { + // Skip "VIDEO_TITLE", the following colon, and the opening quotation mark. + i += parent_title.size() + 2; + + + int j; + for (j = 0; meta_data[i + j] != '}' && i + j < meta_data.size(); j++); + + std::string series_data = meta_data.substr(i, j); + + std::cout << "series_data: " << series_data << '\n'; + + for (j = 0; j < series_data.size(); j++) { + if (substr_is(series_data, j, series_title_title)) { + // Skip "name", the following colon, and the opening quotation mark. + j += series_title_title.size() + 2; + + int k; + for (k = 0; j + k < series_data.size() && series_data[j + k] != '"'; k++); + + return series_data.substr(j, k); } } } @@ -218,22 +228,38 @@ namespace dropout_dl { return "ERROR"; } - std::string episode::get_episode_number(const std::string& html_data) { - std::string episode("Episode"); - std::string close_a(""); - std::string episode_num; - for (int i = 0; i < html_data.size(); i++) { - if (substr_is(html_data, i, episode)) { - for (int j = i + 8; j < html_data.size(); j++) { - if (html_data[j] == '\n' || html_data[j] == ' ' || html_data[j] == '\t') continue; - if (substr_is(html_data, j, close_a)) { - return episode_num; - } - episode_num += html_data[j]; - } + std::string episode::get_season_name(const std::string& meta_data) { + std::string season_title_title("\"COLLECTION_TITLE\""); + for (int i = 0; i < meta_data.size(); i++) { + if (substr_is(meta_data, i, season_title_title)) { + // Skip "VIDEO_TITLE", the following colon, and the opening quotation mark. + i += season_title_title.size() + 2; + + + int j; + for (j = 0; meta_data[i + j] != '"' && i + j < meta_data.size(); j++); + + return meta_data.substr(i, j); } } - return "-1"; + return "ERROR"; + } + + std::string episode::get_episode_name(const std::string& meta_data) { + std::string video_title_title("\"VIDEO_TITLE\""); + for (int i = 0; i < meta_data.size(); i++) { + if (substr_is(meta_data, i, video_title_title)) { + // Skip "VIDEO_TITLE", the following colon, and the opening quotation mark. + i += video_title_title.size() + 2; + + + int j; + for (j = 0; meta_data[i + j] != '"' && i + j < meta_data.size(); j++); + + return meta_data.substr(i, j); + } + } + return "ERROR"; } std::string episode::get_embed_url(const std::string& html_data) { @@ -459,8 +485,6 @@ namespace dropout_dl { if(curl) { std::string out; - - curl_easy_setopt(curl, CURLOPT_URL, get_video_url(quality).c_str()); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, dropout_dl::WriteCallback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out); @@ -478,10 +502,7 @@ namespace dropout_dl { void episode::download(const std::string& quality, const std::string& series_directory, std::string filename) { if (filename.empty()) { - filename = "E" + (this->episode_number.size() < 2 ? "0" + this->episode_number : this->episode_number) + this->name + - ".mp4"; - - filename = format_filename(filename); + filename = this->series + " - " + this->season + " - " + this->name + ".mp4"; } if (quality == "all") { diff --git a/src/episode.h b/src/episode.h index 0c6b8fe..b18d9b2 100644 --- a/src/episode.h +++ b/src/episode.h @@ -256,10 +256,12 @@ namespace dropout_dl { std::string series; /// The directory for the series std::string series_directory; + /// The name of the season that the episode belongs to + std::string season; + /// The json metadata of the episode + std::string metadata; /// The name of the episode std::string name; - /// The number of the episode in the season. This can be a number or a string - std::string episode_number; /// The url for the main episode page std::string episode_url; /// The data of the main episode page @@ -292,33 +294,43 @@ namespace dropout_dl { */ static std::string get_episode_page(const std::string& url, const std::string& auth_cookie, const std::string& session_cookie, bool verbose = false); + /** + * + * @param html_data - Episode page data + * @return The json data for the episode + * + * Gets the json metadata of the episode + */ + static std::string get_meta_data_json(const std::string& html_data); + // Parsing /** * - * @param html_data - Episode page data + * @param meta_data - Episode metadata in json format * @return The name of the series * - * Get the name of the series from the episode page + * Get the name of the series from the metadata */ - static std::string get_series_name(const std::string& html_data); + static std::string get_series_name(const std::string& meta_data); + /** * - * @param html_data - Episode page data + * @param meta_data - Episode metadata in json format + * @return The name of the season + * + * Get the name of the season from the metadata + */ + static std::string get_season_name(const std::string& meta_data); + + /** + * + * @param meta_data - Episode metadata in json format * @return The name of the episode * - * Get the name of the episode from the episode page + * Get the name of the episode from the metadata */ - static std::string get_episode_name(const std::string& html_data); - - /** - * - * @param html_data - Episode page data - * @return The number of the episode - * - * Get the number of the episode from the episode page - */ - static std::string get_episode_number(const std::string& html_data); + static std::string get_episode_name(const std::string& meta_data); /** * @@ -393,7 +405,17 @@ namespace dropout_dl { episode_data = get_episode_page(episode_url, cookies[0].value, cookies[1].value); - name = get_episode_name(episode_data); + if (verbose) { + std::cout << "Got page data\n"; + } + + metadata = get_meta_data_json(episode_data); + + if (verbose) { + std::cout << "Got episode metadata: " << metadata << '\n'; + } + + name = get_episode_name(metadata); if (verbose) { std::cout << "Got name: " << name << '\n'; @@ -404,18 +426,18 @@ namespace dropout_dl { exit(6); } - this->episode_number = get_episode_number(episode_data); - - if (verbose) { - std::cout << "Got episode: " << this->episode_number << '\n'; - } - - this->series = get_series_name(episode_data); + this->series = get_series_name(metadata); if (verbose) { std::cout << "Got series: " << this->series << '\n'; } + this->season = get_season_name(metadata); + + if (verbose) { + std::cout << "Got season: " << this->season << '\n'; + } + this->series_directory = format_filename(this->series); if (verbose) { diff --git a/src/main.cpp b/src/main.cpp index 3f21464..1ddd88b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -289,7 +289,7 @@ std::vector get_cookies_from_chrome(const std::filesystem::p */ std::vector get_cookies(bool verbose = false) { - std::filesystem::path firefox_profile("_firefox_profile"); + std::filesystem::path firefox_profile("firefox_profile"); std::filesystem::path chrome_profile("chrome_profile"); if (std::filesystem::exists(firefox_profile)) { @@ -369,10 +369,6 @@ int main(int argc, char** argv) { } } - if (options.filename.empty()) { - options.filename = dropout_dl::format_filename(ep.name + ".mp4"); - } - ep.download(options.quality, options.output_directory, options.filename); } diff --git a/src/season.cpp b/src/season.cpp index 7aa13aa..6a5f074 100644 --- a/src/season.cpp +++ b/src/season.cpp @@ -45,7 +45,7 @@ namespace dropout_dl { if (e.episode_url.empty()) { continue; } - std::cout << '\t' << e.episode_number << ": " << e.name << '\n'; + std::cout << '\t' << e.name << '\n'; out.push_back(e); } }