/* NOTE(review): the original #include list was garbled in this copy (bare
 * `#include` directives with the header names stripped); the standard headers
 * below were reconstructed from usage — verify against the project build. */
#include "constants.h"
#include "status.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>

#ifdef _WIN32
#include "c11threads.h"
#else
#include <threads.h>
#endif

#include "nuklear.h"

#include "extractors/extractor.h"
#include "logger.h"
#include "process_url.h"
#include "utils/types.h"
#include "utils/utils.h"

/* NOTICE: the global curl_conf pointer will only stay valid during
 * downloading, otherwise, ALWAYS point it to NULL. */
static curl_conf_t *curl_conf;

extern Site_map site_map;

/* Shared options handed to the extractor thread. */
Options options;

/* Queue of pending curl_conf_t download tasks. */
static queue_t dl_queue;

/* Characters that are invalid in filenames (NUL-terminated list). */
const char illegal_char[] = {'/', '\\', '|', '<', '>', ':', '"', '?', '*', '\0'};

thrd_t tid[MAX_THREAD];
mtx_t mtx;
cnd_t cnd;

/* Set to true to make all worker threads abort their transfer. */
bool corrupted;

/* Cached download parameters; outdir_g/referer_g are borrowed pointers,
 * cookie_g is owned by this module (freed in curl_cleanup). */
static const char *outdir_g, *referer_g;
static char *cookie_g;
static callback_t callback_g;
static callback_struct_t *p_callback_struct_g;

/* Reusable URL parser handle, created in curl_init(). */
static CURLU *h;

/* NOTE: Use logger(X) (defined as a generic macro) to log errors. */

/* Log a libcurl easy-interface error, if any. Returns nonzero on error. */
static bool logerr_b(CURLcode r) {
  if (r && !corrupted) {
    LOG("libcurl", "Error %d: %s\n", r, ERRTOSTRING(r));
  }
  return r;
}

/* Log a libcurl header-API error, if any. Returns nonzero on error.
 * CURLHE_MISSING is only reported at debug level (a header may legitimately
 * be absent), but is still returned to the caller. */
static bool logerr_h(CURLHcode r) {
  if (r) {
    const char *err_str;
    switch (r) {
    case CURLHE_BADINDEX:
      err_str = "header exists but not with this index";
      break;
    case CURLHE_MISSING:
      // Allow no headers
      err_str = "no such header exists";
      DEBUG_PRINT("Header Error %d: %s\n", r, err_str);
      return r;
    case CURLHE_NOHEADERS:
      err_str = "no headers at all exist (yet)";
      break;
    case CURLHE_NOREQUEST:
      err_str = "no request with this number was used";
      break;
    case CURLHE_OUT_OF_MEMORY:
      err_str = "out of memory while processing";
      break;
    case CURLHE_BAD_ARGUMENT:
      err_str = "a function argument was not okay";
      break;
    case CURLHE_NOT_BUILT_IN:
      err_str = "if API was disabled in the build";
      break;
    default:
      err_str = "unknown error";
      break;
    }
    LOG("libcurl", "Header Error %d: %s\n", r, err_str);
  }
  return r;
}

/* Log a libcurl URL-API error, if any. Returns nonzero on error.
 * CURLUE_NO_QUERY is tolerated (a URL without a query string is valid). */
static bool logerr_u(CURLUcode r) {
  switch (r) {
  case CURLUE_NO_QUERY:
    // Accept no query
    DEBUG_PRINT("The URL has no query.\n");
    break;
  case 0:
    break;
  default:
    LOG("libcurl", "Parse Error %d: Invalid URL\n", r);
    break;
  }
  return r;
}

/* Apply the options shared by every transfer: redirects, UA, referer,
 * compression and cookies. */
static void curl_easy_setcommonopts(CURL *curl) {
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
  curl_easy_setopt(
      curl, CURLOPT_USERAGENT,
      "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0");
  curl_easy_setopt(curl, CURLOPT_REFERER, referer_g);
  /* enable all supported built-in compressions,
   * since several sites enable gzip encoding */
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(curl, CURLOPT_COOKIE, cookie_g);
}

/* libcurl progress callback: publish per-thread progress so the UI thread
 * (poll_status) can aggregate it. Returning nonzero aborts the transfer. */
static int progress_callback(void *clientp, curl_off_t dltotal,
                             curl_off_t dlnow, curl_off_t ultotal,
                             curl_off_t ulnow) {
  (void)ultotal;
  (void)ulnow;
  thrd_info_t *ti = (thrd_info_t *)clientp;
  ti->curl_c->dlnow_per_thrd[ti->no] = dlnow;
  if (ti->curl_c->total_thrd == 1) {
    ti->curl_c->dltotal = dltotal;
  }
  // Return non-zero to abort download
  return corrupted;
}

/* libcurl write callback: append the received chunk to a growable,
 * NUL-terminated string buffer. */
static size_t write2str(void *ptr, size_t size, size_t nmemb, str_data_t *s) {
  size_t chunk = size * nmemb;
  size_t new_len = s->len + chunk;
  /* Fix: never overwrite the buffer pointer with realloc's result directly —
   * on failure the original allocation would leak and NULL would be
   * dereferenced. Returning 0 makes libcurl abort with CURLE_WRITE_ERROR. */
  char *tmp = realloc(s->string, new_len + 1);
  if (!tmp) {
    return 0;
  }
  s->string = tmp;
  memcpy(s->string + s->len, ptr, chunk);
  s->string[new_len] = '\0';
  s->len = new_len;
  return chunk;
}

/* Join outdir and fn into fullpathfn, inserting a path separator only when
 * outdir does not already end with one. The caller sizes the buffer
 * (strlen(outdir) + strlen(fn) + 2 is always sufficient). */
static void gen_fullpathfn(char *fullpathfn, const char *outdir,
                           const char *fn) {
  sprintf(fullpathfn, "%s%s%s", outdir,
          outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR, fn);
}

/* Parse URL and either dispatch it to a site extractor (when the host is in
 * site_map) or enqueue a regular download task.
 *
 * Takes ownership of fn (heap string or NULL); it is always released before
 * returning. Returns 0 on success, 1 on failure. */
static int parse_url(const char *URL, const char *outdir, char *fn) {
  CURLUcode ue = logerr(curl_url_set(h, CURLUPART_URL, URL, 0));
  if (ue && ue != CURLUE_NO_QUERY) {
    FREE_AND_NULLIFY(fn);
    return 1;
  }
  /* Fix: initialize to NULL — curl_url_get() does not set the out pointer on
   * failure, so a URL without a query string previously left `query`
   * indeterminate (UB at the later `if (query)` / curl_free(query)). */
  char *domain = NULL, *path = NULL, *query = NULL;
  ue = logerr(curl_url_get(h, CURLUPART_QUERY, &query, 0));
  if (ue) {
    query = NULL; /* tolerate a missing query; errors were logged above */
  }
  ue = curl_url_get(h, CURLUPART_HOST, &domain, 0);
  if (ue) {
    curl_free(query);
    FREE_AND_NULLIFY(fn);
    return 1;
  }
  ue = logerr(curl_url_get(h, CURLUPART_PATH, &path, 0));
  if (ue) {
    curl_free(domain);
    curl_free(query);
    FREE_AND_NULLIFY(fn);
    return 1;
  }
  DEBUG_PRINT("Domain: %s\n", domain);
  DEBUG_PRINT("Path: %s\n", path);
  DEBUG_PRINT("Query: %s\n", query);

  /* Known site? Hand the URL to the matching extractor on its own thread. */
  for (unsigned short i = 0; i < site_map.size; i++) {
    if (!strcmp(domain, site_map.pairs[i].domain)) {
      append_log("Got site: %s\n", domain);
      thrd_t t;
      options.site = site_map.pairs[i].site;
      options.URL = malloc(strlen(URL) + 1);
      strcpy(options.URL, URL);
      options.path = malloc(strlen(path) + 1);
      strcpy(options.path, path);
      if (query) {
        options.query = malloc(strlen(query) + 1);
        strcpy(options.query, query);
      } else {
        options.query = calloc(1, sizeof(char));
      }
      append_log("pagedata URL: %s\n", options.URL);
      thrd_create(&t, extract, &options);
      thrd_detach(t);
      FREE_AND_NULLIFY(fn); /* fix: was leaked on the extractor path */
      curl_free(domain);
      curl_free(path);
      curl_free(query);
      return 0;
    }
  }

  /* Regular URL: build a download config and enqueue it. */
  curl_conf_t *curl_c = calloc(1, sizeof(curl_conf_t));
  curl_c->URL = malloc(strlen(URL) + 1);
  strcpy(curl_c->URL, URL);

  /* filename */
  if (!fn) {
    /* Infer the filename from the last path component. */
    const char *patterns_str[1] = {"(?:.+\\/)([^#/?]+)"};
    str_array_t results = create_str_array(0);
    const str_array_t patterns = {(char **)patterns_str, 1};
    regex_match(path, patterns, &results);
    for (unsigned short i = 0; i < results.n; i++) {
      if (results.str[i]) {
        DEBUG_PRINT("[%d] %s\n", i, results.str[i]);
        free(curl_c->outfn); /* fix: drop a previous match instead of leaking */
        curl_c->outfn = malloc(strlen(outdir) + strlen(results.str[i]) + 2);
        gen_fullpathfn(curl_c->outfn, outdir, results.str[i]);
      }
    }
    free_str_array(&results);
    if (!curl_c->outfn || curl_c->outfn[0] == '\0') {
      LOG("libcurl",
          "Infer filename failed, please specify a valid filename.\n");
      /* Fix: release the half-built config (leaked in the original). */
      FREE_AND_NULLIFY(curl_c->URL);
      FREE_AND_NULLIFY(curl_c->outfn);
      FREE_AND_NULLIFY(curl_c);
      curl_free(domain);
      curl_free(path);
      curl_free(query);
      return 1;
    }
  } else {
    curl_c->outfn = malloc(strlen(outdir) + strlen(fn) + 2);
    gen_fullpathfn(curl_c->outfn, outdir, fn);
    FREE_AND_NULLIFY(fn);
  }
  DEBUG_PRINT("File will be saved as: %s\n", curl_c->outfn);
  DEBUG_PRINT("Got regular URL: %s\n", curl_c->URL);
  enqueue(&dl_queue, (void *)curl_c);
  curl_free(domain);
  curl_free(path);
  curl_free(query);
  return 0;
}

/* Probe URL with a HEAD request.
 *
 * Outputs (each optional, pass NULL to skip; strings are heap copies owned by
 * the caller): psize = Content-Length, p_content_type, p_cookie = first
 * Set-Cookie header. Returns true when the server advertises
 * "Accept-Ranges: bytes" (i.e. multi-threaded download is possible). */
bool get_info(const char *URL, curl_off_t *psize, char **p_content_type,
              char **p_cookie) {
  CURL *curl;
  bool support_range = false;
  struct curl_header *pch;
  curl = curl_easy_init();
  curl_easy_setopt(curl, CURLOPT_URL, URL);
  curl_easy_setcommonopts(curl);
  curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); /* header-only request */
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
  CURLcode r = curl_easy_perform(curl);
  if (logerr(r)) {
    goto end;
  }
  char *ct = NULL;
  r = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &ct);
  if (logerr(r)) {
    goto end;
  }
  DEBUG_PRINT("Content-Type: %s\n", ct);
  if (p_content_type && ct) {
    *p_content_type = malloc(strlen(ct) + 1);
    strcpy(*p_content_type, ct);
  }
  CURLHcode rh =
      curl_easy_header(curl, "Set-Cookie", 0, CURLH_HEADER, -1, &pch);
  if (!logerr(rh)) {
    DEBUG_PRINT("Set-Cookie: %s\n", pch->value);
    if (p_cookie) {
      *p_cookie = malloc(strlen(pch->value) + 1);
      strcpy(*p_cookie, pch->value);
    }
  } else if (rh != CURLHE_MISSING) {
    /* Fix: a missing Set-Cookie header is not an error — previously it
     * aborted here, skipping Content-Length and the range probe and forcing
     * every cookieless server into single-thread mode. */
    goto end;
  }
  if (psize) {
    r = curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T,
                          (curl_off_t *)psize);
    if (logerr(r)) {
      goto end;
    }
  }
  rh = curl_easy_header(curl, "Accept-Ranges", 0, CURLH_HEADER, -1, &pch);
  if (logerr(rh) || strcmp(pch->value, "bytes")) {
    goto end;
  }
  support_range = true;
end:
  curl_easy_cleanup(curl);
  return support_range;
}

/* Worker thread body: download one byte range of curl_c->URL into the part
 * file "<outfn>.<n>", retrying up to MAX_RETRY times. Always increments
 * success_thrd and signals the condition variable before returning so the
 * cleanup path can account for every thread. */
static int pull_part(void *a) {
  CURLcode res = CURLE_OK;
  thrd_info_t *ti = (thrd_info_t *)a;
  curl_conf_t *curl_c = ti->curl_c;
  unsigned char n = ti->no;
  // Here we need to manually control str_array_t
  /* Fix: size for the worst case — n is an unsigned char, so ".255" needs
   * strlen(outfn) + 5 bytes (the original +4 overflowed for n >= 100). */
  size_t partfn_size = strlen(curl_c->outfn) + sizeof(".255");
  curl_c->partfn.str[n] = malloc(partfn_size);
  snprintf(curl_c->partfn.str[n], partfn_size, "%s.%d", curl_c->outfn, n);
  DEBUG_PRINT("[THRD %hhu] partfn: %s, range: %s\n", n,
              get_str_element(&curl_c->partfn, n), ti->range);
  for (unsigned char retry = 0;; retry++) {
    curl_c->fplist[n] = fopen(get_str_element(&curl_c->partfn, n), "wb+");
    if (!curl_c->fplist[n]) {
      /* Fix: the fopen result was previously handed to libcurl unchecked. */
      append_log("[THRD %hhu] Cannot open part file for writing.\n", n);
      corrupted = true;
      res = CURLE_WRITE_ERROR;
      break;
    }
    CURL *curl;
    curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_URL, curl_c->URL);
    curl_easy_setcommonopts(curl);
    /* Abort transfers stuck below 30 B/s for 30 s. */
    curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 30L);
    curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, curl_c->fplist[n]);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
    if (ti->curl_c->total_thrd != 1) {
      curl_easy_setopt(curl, CURLOPT_RANGE, ti->range);
    }
    curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
    curl_easy_setopt(curl, CURLOPT_XFERINFODATA, ti);
    res = curl_easy_perform(curl);
    /* Rewind so merge_and_cleanup() can fread() from the start. */
    rewind(curl_c->fplist[n]);
    append_log("[THRD %hhu] File downloaded.\n", n);
    curl_easy_cleanup(curl);
    logerr(res);
    if (!res) {
      break;
    }
    if (retry + 1 == MAX_RETRY) {
      append_log("Error after %d retries, exiting...\n", MAX_RETRY);
      corrupted = true;
      break;
    }
  }
  mtx_lock(&mtx);
  curl_c->success_thrd += 1;
  cnd_signal(&cnd); // Unblocks the waiting cleanup thread. If no threads are
                    // blocked, does nothing and returns thrd_success.
  mtx_unlock(&mtx);
  return (int)res;
}

/* Merge all part files into the destination file (or delete everything when
 * corrupted), then free the curl_conf_t and fire the user callback if one was
 * requested. p_curl_c points at the global curl_conf pointer, which is
 * NULLed immediately so the UI thread stops polling this config. */
static int merge_and_cleanup(void *p_curl_c) {
  curl_conf_t *curl_c = *(curl_conf_t **)p_curl_c;
  // Now sets the global (curl_conf_t *)curl_conf to NULL
  *(curl_conf_t **)p_curl_c = NULL;
  if (corrupted) {
    append_log("Cancelling...\n");
  } else {
    append_log("Merging files...\n");
  }
  FILE *fop;
  fop = fopen(curl_c->outfn, "wb");
  if (fop == NULL) {
    // User quit before downloading, return directly
    /* NOTE(review): this path leaks curl_c and any open part files — confirm
     * it is only reachable before pull_part ever runs. */
    return 1;
  }
  for (unsigned short i = 0; i < curl_c->total_thrd; i++) {
    if (!curl_c->fplist[i]) {
      /* Fix: part file never opened (fopen failed in pull_part); the
       * original called fclose(NULL) here, which is UB. */
      continue;
    }
    if (!corrupted) {
      char buffer[1024];
      size_t bytes_read = 0;
      while ((bytes_read =
                  fread(buffer, 1, sizeof(buffer), curl_c->fplist[i])) > 0) {
        fwrite(buffer, 1, bytes_read, fop);
      }
    }
    fclose(curl_c->fplist[i]);
    if (remove(get_str_element(&curl_c->partfn, i)) != 0) {
      append_log("Error deleting partial file %s\n",
                 get_str_element(&curl_c->partfn, i));
    }
  }
  fclose(fop);
  if (corrupted) {
    // Also delete dst file
    if (remove(curl_c->outfn) != 0) {
      append_log("Error deleting file %s\n", curl_c->outfn);
    }
  }
  append_log("Download %s finished.\n", curl_c->outfn);
  bool need_callback = curl_c->need_callback;
  // Cleanup stat before creating the callback thread to avoid mem overlap
  curl_c->success_thrd = 0;
  curl_c->total_thrd = 0;
  free_str_array(&curl_c->partfn); /* fix: part filenames were leaked */
  FREE_AND_NULLIFY(curl_c->URL);
  FREE_AND_NULLIFY(curl_c->outfn);
  FREE_AND_NULLIFY(curl_c);
  // Perform the callback (if any)
  if (need_callback) {
    thrd_t cb_thrd;
    thrd_create(&cb_thrd, callback_g, p_callback_struct_g);
    thrd_detach(cb_thrd);
  }
  return 0;
}

/* Start downloading curl_c->URL: probe the server, decide the thread count
 * (multi-threaded only when ranges are supported and the size is known),
 * split the byte ranges and spawn one pull_part worker per range.
 * Returns 0. */
static int download(curl_conf_t *curl_c) {
  /* Reset thrd info. */
  curl_c->success_thrd = 0;
  CURL *curl;
  curl_off_t cl = 0L, begin = 0L, end;
  static thrd_info_t thrd_info[MAX_THREAD] = {0};
  bool support_range = get_info(curl_c->URL, &cl, NULL, NULL);
  /* Fix: %ld is the wrong format for curl_off_t on LLP64 targets. */
  DEBUG_PRINT("Size: %" CURL_FORMAT_CURL_OFF_T " bytes.\n", cl);
  if (support_range && cl > 0L) {
    curl_c->dltotal = cl;
    curl_c->total_thrd = (unsigned char)CEIL_DIV(cl, MAX_THREAD_SIZE);
    if (curl_c->total_thrd > MAX_THREAD) {
      curl_c->total_thrd = MAX_THREAD;
    }
    LOG("libcurl", "Server supports range header, setting threads to %hhu\n",
        curl_c->total_thrd);
  } else {
    LOG("libcurl", "Server doesn't claim range header "
                   "support, falling back to single thread.\n");
    curl_c->total_thrd = 1;
  }
  curl_off_t size_per_thrd = (cl / curl_c->total_thrd);
  curl_c->partfn = create_str_array(curl_c->total_thrd);
  for (unsigned char i = 0; i < curl_c->total_thrd; i++) {
    curl_off_t chunk_size;
    thrd_info[i].no = i;
    /* The last thread takes the remainder of the division. */
    if (i + 1 == curl_c->total_thrd)
      chunk_size = cl - (curl_c->total_thrd - 1) * size_per_thrd;
    else
      chunk_size = size_per_thrd;
    end = begin + chunk_size - 1;
    if (curl_c->total_thrd != 1) {
      sprintf(thrd_info[i].range,
              "%" CURL_FORMAT_CURL_OFF_T "-%" CURL_FORMAT_CURL_OFF_T, begin,
              end);
    }
    thrd_info[i].curl_c = curl_c;
    int error = thrd_create(&tid[i], pull_part, &thrd_info[i]);
    if (error)
      append_log("Couldn't run thread number %d, errno %d\n", i, error);
    begin = end + 1;
  }
  return 0;
}

/* Replace every character that is illegal in filenames with a space. */
static void replace_illegal_char(char *str) {
  for (unsigned char i = 0; illegal_char[i] != '\0'; i++) {
    if (repchr(str, illegal_char[i], ' '))
      DEBUG_PRINT("Found illegal character '%c', replacing ...\n",
                  illegal_char[i]);
  }
}

/* Turn *p_filename into a sanitized full path under outdir_g.
 * Frees the old string and returns a freshly allocated one. */
static char *callback_struct_convert_fullpath(char **p_filename) {
  char *tmp = malloc(strlen(outdir_g) + strlen(*p_filename) + 2);
  replace_illegal_char(*p_filename);
  gen_fullpathfn(tmp, outdir_g, *p_filename);
  FREE_AND_NULLIFY(*p_filename);
  return tmp;
}

/* Append *p_cookie to the module-wide cookie string. Takes ownership of the
 * heap string *p_cookie (it is either adopted directly or merged and the
 * combined string kept; cookie_g is released in curl_cleanup). */
void add_cookie(char **p_cookie) {
  char *cookie = *p_cookie;
  if (cookie_g) {
    /* "old; new" + NUL */
    char *tmp = malloc(strlen(cookie_g) + strlen(cookie) + 3);
    sprintf(tmp, "%s; %s", cookie_g, cookie);
    FREE_AND_NULLIFY(cookie_g);
    cookie_g = tmp;
  } else {
    cookie_g = cookie;
  }
}

/* Remember the Referer header to send with every request (borrowed pointer). */
void set_referer(char *referer) { referer_g = referer; }

/* Initialize libcurl, the URL parser handle, the download queue and the
 * synchronization primitives. `cookie` must be a heap string (or NULL);
 * ownership transfers to this module. */
void curl_init(char *cookie) {
  curl_global_init(CURL_GLOBAL_ALL);
  h = curl_url();
  add_cookie(&cookie);
  dl_queue = create_queue();
  mtx_init(&mtx, mtx_plain);
  cnd_init(&cnd);
}

/* Tear everything down: cancel an in-flight download (waiting for all worker
 * threads to acknowledge), then release queue, cookie and libcurl state. */
void curl_cleanup(status_t *stat) {
  /* We only need to cleanup
   * the currently active thread. */
  if (curl_conf) {
    corrupted = true; // In case libcurl is still downloading
    /* Now Wait for all threads to cancel... */
    mtx_lock(&mtx);
    while (curl_conf->success_thrd != curl_conf->total_thrd) {
      cnd_wait(&cnd, &mtx);
    }
    mtx_unlock(&mtx);
    if (!stat->is_done) {
      merge_and_cleanup(&curl_conf);
    }
  }
  /* Fix: the mutex/condvar are always created in curl_init(), so destroy them
   * unconditionally (previously skipped when no download was active). */
  mtx_destroy(&mtx);
  cnd_destroy(&cnd);
  free_queue(&dl_queue);
  FREE_AND_NULLIFY(cookie_g);
  curl_url_cleanup(h);
  curl_global_cleanup();
}

/* UI-thread poll: start the next queued download when idle, aggregate
 * per-thread progress into stat, and kick off merge/cleanup once every worker
 * thread has finished. */
void poll_status(status_t *stat) {
  if (!is_empty_queue(&dl_queue) && stat->is_done) {
    /* extract_done is a flag used to signal that
     * the extractor process is done. */
    curl_conf = (curl_conf_t *)dequeue(&dl_queue);
    if (download(curl_conf)) {
      // Something went wrong when creating download task
      DEBUG_PRINT("Creating download task failed.\n");
    }
    stat->is_done = false;
    stat->type = SIZE_BYTES;
  }
  if (curl_conf) {
    curl_conf->dlnow = 0L;
    for (unsigned char i = 0; i < curl_conf->total_thrd; i++) {
      curl_conf->dlnow += curl_conf->dlnow_per_thrd[i];
    }
    if (stat->type == SIZE_BYTES) {
      stat->cur = (unsigned long long)curl_conf->dlnow;
      stat->total = (unsigned long long)curl_conf->dltotal;
    }
    DEBUG_PRINT("success_thrd: %hhu, total_thrd: %hhu, is_done: %s\n",
                curl_conf->success_thrd, curl_conf->total_thrd,
                stat->is_done ? "yes" : "no");
    mtx_lock(&mtx);
    if (curl_conf->success_thrd == curl_conf->total_thrd &&
        (curl_conf->total_thrd && !stat->is_done)) {
      stat->is_done = true;
      for (unsigned short i = 0; i < curl_conf->total_thrd; i++) {
        int r;
        thrd_join(tid[i], &r);
      }
      // Do we need to perform callback?
      if (is_empty_queue(&dl_queue) && callback_g && !corrupted)
        curl_conf->need_callback = true;
      // Do not block ui thread while merging
      thrd_t mg_thrd;
      thrd_create(&mg_thrd, merge_and_cleanup, (void *)&curl_conf);
      thrd_detach(mg_thrd);
      corrupted = false;
    }
    mtx_unlock(&mtx);
  }
}

/* Fetch URL into a heap string stored at *pdstr (always allocated, empty on
 * failure; caller frees). Returns the CURLcode of the transfer. */
int get(const char *URL, char **pdstr) {
  CURL *curl = curl_easy_init();
  str_data_t pagedata = {0};
  pagedata.string = malloc(1);
  pagedata.string[0] = '\0';
  curl_easy_setopt(curl, CURLOPT_URL, URL);
  curl_easy_setcommonopts(curl);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write2str);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&pagedata);
  CURLcode res = logerr(curl_easy_perform(curl));
  *pdstr = malloc(pagedata.len + 1);
  strcpy(*pdstr, pagedata.string);
  FREE_AND_NULLIFY(pagedata.string);
  curl_easy_cleanup(curl);
  return res;
}

/* Add an URL to dl_queue and register all the stuff.
 * - If outdir is NULL or a empty string, reuse the cached outdir_g
 * - If fn is NULL or a empty string, infer the filename from URL (otherwise
 *   fail and quit)
 * - If callback || callback_struct is valid, execute the callback function
 *   after download */
void add_url(const char *URL, const char *outdir, const char *fn,
             callback_t callback, callback_struct_t *p_callback_struct) {
  char *filename;
  if (fn == NULL || fn[0] == '\0') {
    filename = NULL;
  } else {
    filename = malloc(strlen(fn) + 1);
    strcpy(filename, fn);
    replace_illegal_char(filename);
  }
  if (outdir && outdir[0] != '\0') {
    outdir_g = outdir;
  }
  callback_g = callback;
  if (p_callback_struct) {
    for (unsigned short i = 0; i < p_callback_struct->n; i++) {
      p_callback_struct->str[i] =
          callback_struct_convert_fullpath(&p_callback_struct->str[i]);
    }
    p_callback_struct_g = p_callback_struct;
  }
  // Pass our cache (outdir_g) to parse_url(); it takes ownership of filename.
  if (parse_url(URL, outdir_g, filename)) {
    DEBUG_PRINT("parse_url() failed with error.\n");
    return; // Parse failed, quit the task directly
  }
}