From 0d65dcd24c8090dcc719be599cd3ef4dc2220e9b Mon Sep 17 00:00:00 2001 From: frosty Date: Thu, 12 Mar 2026 18:05:09 -0400 Subject: [PATCH] refactor: put HTTP and XML logic into reusable modules --- src/Infobox/Dictionary.c | 74 ++--------- src/Infobox/Wikipedia.c | 122 ++++------------- src/Routes/Images.c | 72 ++-------- src/Scraping/Scraping.c | 278 ++++++++++++++++++++------------------- src/Utility/HttpClient.c | 81 ++++++++++++ src/Utility/HttpClient.h | 16 +++ src/Utility/XmlHelper.c | 65 +++++++++ src/Utility/XmlHelper.h | 14 ++ 8 files changed, 366 insertions(+), 356 deletions(-) create mode 100644 src/Utility/HttpClient.c create mode 100644 src/Utility/HttpClient.h create mode 100644 src/Utility/XmlHelper.c create mode 100644 src/Utility/XmlHelper.h diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c index 768c2c6..58d0dfa 100644 --- a/src/Infobox/Dictionary.c +++ b/src/Infobox/Dictionary.c @@ -1,7 +1,8 @@ #include "Dictionary.h" #include "../Cache/Cache.h" -#include "../Proxy/Proxy.h" #include "../Scraping/Scraping.h" +#include "../Utility/HttpClient.h" +#include "../Utility/XmlHelper.h" #include #include #include @@ -52,44 +53,6 @@ static const char *strcasestr_impl(const char *haystack, const char *needle) { return NULL; } -struct MemStruct { - char *memory; - size_t size; -}; - -static size_t WriteCallback(void *contents, size_t size, size_t nmemb, - void *userp) { - size_t realsize = size * nmemb; - struct MemStruct *mem = (struct MemStruct *)userp; - char *ptr = realloc(mem->memory, mem->size + realsize + 1); - if (!ptr) - return 0; - mem->memory = ptr; - memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - return realsize; -} - -static char *xpath_text(xmlDocPtr doc, const char *xpath) { - xmlXPathContextPtr ctx = xmlXPathNewContext(doc); - if (!ctx) - return NULL; - xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); - xmlXPathFreeContext(ctx); - if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { - if (obj) - xmlXPathFreeObject(obj); - return NULL; - } - xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); - char *result = content ? strdup((char *)content) : NULL; - if (content) - xmlFree(content); - xmlXPathFreeObject(obj); - return result; -} - static char *build_html(const char *word, const char *pron, const char *pos, const char *def, const char *ex) { char html[4096]; @@ -240,13 +203,7 @@ char *construct_dictionary_url(const char *query) { if (!word) return NULL; - CURL *curl = curl_easy_init(); - if (!curl) { - free(word); - return NULL; - } - - char *escaped = curl_easy_escape(curl, word, 0); + char *escaped = curl_easy_escape(NULL, word, 0); const char *base = "https://dictionary.cambridge.org/dictionary/english/"; char *url = malloc(strlen(base) + strlen(escaped) + 1); if (url) { @@ -255,7 +212,6 @@ char *construct_dictionary_url(const char *query) { } curl_free(escaped); - curl_easy_cleanup(curl); free(word); return url; } @@ -309,28 +265,15 @@ InfoBox fetch_dictionary_data(const char *query) { } free(cache_key); - CURL *curl = curl_easy_init(); - if (!curl) { - free(url); - return info; - } - - struct MemStruct chunk = {malloc(1), 0}; - curl_easy_setopt(curl, CURLOPT_URL, url); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - apply_proxy_settings(curl); - - if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { + HttpResponse resp = http_get(url, "Mozilla/5.0"); + if (resp.memory && resp.size > 0) { cache_key = cache_compute_key(url, 0, "dictionary"); if (cache_key && get_cache_ttl_infobox() > 0) { - cache_set(cache_key, chunk.memory, chunk.size); + cache_set(cache_key, resp.memory, resp.size); } free(cache_key); - htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, + htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, url, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (doc) { @@ -358,8 +301,7 @@ InfoBox fetch_dictionary_data(const char *query) { } } - curl_easy_cleanup(curl); - free(chunk.memory); + http_response_free(&resp); free(url); return info; } diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c index ca7238d..b29b678 100644 --- a/src/Infobox/Wikipedia.c +++ b/src/Infobox/Wikipedia.c @@ -1,7 +1,7 @@ #include "Wikipedia.h" #include "../Cache/Cache.h" -#include "../Proxy/Proxy.h" #include "../Scraping/Scraping.h" +#include "../Utility/HttpClient.h" #include #include #include @@ -9,11 +9,6 @@ #include #include -struct WikiMemoryStruct { - char *memory; - size_t size; -}; - static void shorten_summary(char **extract_ptr, int max_chars) { if (!extract_ptr || !*extract_ptr) return; @@ -43,25 +38,6 @@ static void shorten_summary(char **extract_ptr, int max_chars) { } } -static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb, - void *userp) { - size_t realsize = size * nmemb; - struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp; - - char *ptr = realloc(mem->memory, mem->size + realsize + 1); - if (ptr == NULL) { - fprintf(stderr, "Not enough memory (realloc returned NULL)\n"); - return 0; - } - - mem->memory = ptr; - memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - - return realsize; -} - static void extract_wiki_info(xmlNode *node, InfoBox *info) { xmlNode *cur_node = NULL; @@ -113,9 +89,6 @@ static void extract_wiki_info(xmlNode *node, InfoBox *info) { } InfoBox fetch_wiki_data(char *api_url) { - CURL *curl_handle; - CURLcode res; - struct WikiMemoryStruct chunk; InfoBox info = {NULL, NULL, NULL, NULL}; if (!api_url) { @@ -144,47 +117,31 @@ InfoBox fetch_wiki_data(char *api_url) { } free(cache_key); - chunk.memory = malloc(1); - chunk.size = 0; - - curl_handle = curl_easy_init(); - - if (curl_handle) { - curl_easy_setopt(curl_handle, CURLOPT_URL, api_url); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, - WikiWriteMemoryCallback); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); - curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - apply_proxy_settings(curl_handle); - - res = curl_easy_perform(curl_handle); - - if (res == CURLE_OK && chunk.size > 0) { - cache_key = cache_compute_key(api_url, 0, "wikipedia"); - if (cache_key && get_cache_ttl_infobox() > 0) { - cache_set(cache_key, chunk.memory, chunk.size); - } - free(cache_key); - - xmlDocPtr doc = - xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); - if (doc != NULL) { - xmlNode *root_element = xmlDocGetRootElement(doc); - extract_wiki_info(root_element, &info); - xmlFreeDoc(doc); - } + HttpResponse resp = http_get(api_url, "libcurl-agent/1.0"); + if (resp.memory && resp.size > 0) { + cache_key = cache_compute_key(api_url, 0, "wikipedia"); + if (cache_key && get_cache_ttl_infobox() > 0) { + cache_set(cache_key, resp.memory, resp.size); } + free(cache_key); - curl_easy_cleanup(curl_handle); - free(chunk.memory); + xmlDocPtr doc = + xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0); + if (doc != NULL) { + xmlNode *root_element = xmlDocGetRootElement(doc); + extract_wiki_info(root_element, &info); + xmlFreeDoc(doc); + } } + http_response_free(&resp); return info; } static xmlNode *find_node_recursive(xmlNode *node, const char *target_name) { for (xmlNode *cur = node; cur; cur = cur->next) { - if (cur->type == XML_ELEMENT_NODE && strcmp((const char *)cur->name, target_name) == 0) { + if (cur->type == XML_ELEMENT_NODE && + strcmp((const char *)cur->name, target_name) == 0) { return cur; } xmlNode *found = find_node_recursive(cur->children, target_name); @@ -195,21 +152,15 @@ static xmlNode *find_node_recursive(xmlNode *node, const char *target_name) { } static char *get_first_search_result(const char *search_term) { - CURL *curl = curl_easy_init(); - if (!curl) - return NULL; - - char *escaped_term = curl_easy_escape(curl, search_term, 0); + char *escaped_term = curl_easy_escape(NULL, search_term, 0); const char *search_base = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch="; - const char *search_suffix = - "&format=xml&origin=*&srlimit=1"; + const char *search_suffix = "&format=xml&origin=*&srlimit=1"; char *search_url = malloc(strlen(search_base) + strlen(escaped_term) + - strlen(search_suffix) + 1); + strlen(search_suffix) + 1); if (!search_url) { curl_free(escaped_term); - curl_easy_cleanup(curl); return NULL; } @@ -219,22 +170,13 @@ static char *get_first_search_result(const char *search_term) { curl_free(escaped_term); - struct WikiMemoryStruct chunk = {malloc(1), 0}; - if (!chunk.memory) { - free(search_url); - curl_easy_cleanup(curl); - return NULL; - } - - curl_easy_setopt(curl, CURLOPT_URL, search_url); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WikiWriteMemoryCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - apply_proxy_settings(curl); + HttpResponse resp = http_get(search_url, "libcurl-agent/1.0"); + free(search_url); char *first_title = NULL; - if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { - xmlDocPtr doc = xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); + if (resp.memory && resp.size > 0) { + xmlDocPtr doc = + xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0); if (doc) { xmlNode *root = xmlDocGetRootElement(doc); xmlNode *search_node = find_node_recursive(root, "search"); @@ -255,10 +197,7 @@ static char *get_first_search_result(const char *search_term) { } } - free(chunk.memory); - free(search_url); - curl_easy_cleanup(curl); - + http_response_free(&resp); return first_title; } @@ -267,13 +206,7 @@ char *construct_wiki_url(const char *search_term) { if (!first_title) return NULL; - CURL *curl = curl_easy_init(); - if (!curl) { - free(first_title); - return NULL; - } - - char *escaped_title = curl_easy_escape(curl, first_title, 0); + char *escaped_title = curl_easy_escape(NULL, first_title, 0); const char *base = "https://en.wikipedia.org/w/" "api.php?action=query&prop=extracts|pageimages&exintro&" "explaintext&pithumbsize=400&format=xml&origin=*&titles="; @@ -285,7 +218,6 @@ char *construct_wiki_url(const char *search_term) { } curl_free(escaped_title); - curl_easy_cleanup(curl); free(first_title); return full_url; } diff --git a/src/Routes/Images.c b/src/Routes/Images.c index 5057f81..5f8cf2c 100644 --- a/src/Routes/Images.c +++ b/src/Routes/Images.c @@ -1,7 +1,8 @@ #include "Images.h" -#include "../Proxy/Proxy.h" #include "../Scraping/Scraping.h" +#include "../Utility/HttpClient.h" #include "../Utility/Unescape.h" +#include "../Utility/XmlHelper.h" #include #include @@ -9,61 +10,6 @@ #include #include #include -#include - -struct MemoryBlock { - char *response; - size_t size; -}; - -static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb, - void *userp) { - size_t realsize = size * nmemb; - struct MemoryBlock *mem = (struct MemoryBlock *)userp; - char *ptr = (char *)realloc(mem->response, mem->size + realsize + 1); - if (ptr == NULL) { - return 0; - } - mem->response = ptr; - memcpy(&(mem->response[mem->size]), data, realsize); - mem->size += realsize; - mem->response[mem->size] = 0; - return realsize; -} - -static char *fetch_images_html(const char *url) { - CURL *curl_handle; - struct MemoryBlock chunk = {.response = malloc(1), .size = 0}; - if (!chunk.response) { - return NULL; - } - - curl_handle = curl_easy_init(); - if (!curl_handle) { - free(chunk.response); - return NULL; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, ImageWriteCallback); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); - curl_easy_setopt( - curl_handle, CURLOPT_USERAGENT, - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); - curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L); - apply_proxy_settings(curl_handle); - - CURLcode res = curl_easy_perform(curl_handle); - if (res != CURLE_OK) { - free(chunk.response); - curl_easy_cleanup(curl_handle); - return NULL; - } - - curl_easy_cleanup(curl_handle); - return chunk.response; -} int images_handler(UrlParams *params) { TemplateContext ctx = new_context(); @@ -127,8 +73,10 @@ int images_handler(UrlParams *params) { snprintf(url, sizeof(url), "https://www.bing.com/images/search?q=%s&first=%d", encoded_query, first); - char *html = fetch_images_html(url); - if (!html) { + HttpResponse resp = http_get( + url, + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); + if (!resp.memory) { send_response("

Error fetching images

"); free(encoded_query); free(display_query); @@ -136,10 +84,10 @@ int images_handler(UrlParams *params) { return -1; } - htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL, + htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); if (!doc) { - free(html); + http_response_free(&resp); free(encoded_query); free(display_query); free_context(&ctx); @@ -150,7 +98,7 @@ int images_handler(UrlParams *params) { if (!xpathCtx) { xmlFreeDoc(doc); - free(html); + http_response_free(&resp); free(encoded_query); free(display_query); free_context(&ctx); @@ -325,7 +273,7 @@ int images_handler(UrlParams *params) { xmlXPathFreeContext(xpathCtx); if (doc) xmlFreeDoc(doc); - free(html); + http_response_free(&resp); curl_free(encoded_query); free(display_query); free_context(&ctx); diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 7ba2d97..692377e 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -2,6 +2,7 @@ #include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "../Utility/Unescape.h" +#include "../Utility/XmlHelper.h" #include #include #include @@ -57,15 +58,15 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { return 0; } - const char *link_xpath = "//tr[not(contains(@class, " - "'result-sponsored'))]//a[@class='result-link']"; - xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx); + xmlXPathObjectPtr xpathObj = xml_xpath_eval( + xpathCtx, "//tr[not(contains(@class, " + "'result-sponsored'))]//a[@class='result-link']"); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { if (xpathObj) @@ -75,9 +76,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, } int num_links = xpathObj->nodesetval->nodeNr; - - int actual_alloc = (num_links < max_results) ? num_links : max_results; - *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + *out_results = xml_result_alloc(num_links, max_results); if (!*out_results) { xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); @@ -86,7 +85,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, for (int i = 0; i < num_links && found_count < max_results; i++) { xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; - char *title = (char *)xmlNodeGetContent(linkNode); + char *title = xml_node_content(linkNode); char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); char *snippet_text = NULL; @@ -100,13 +99,11 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) snippetRow = snippetRow->next; if (snippetRow) { - xpathCtx->node = snippetRow; - xmlXPathObjectPtr sObj = xmlXPathEvalExpression( - (xmlChar *)".//td[@class='result-snippet']", xpathCtx); + xmlXPathObjectPtr sObj = + xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']"); if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { - snippet_text = - (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]); + snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]); } if (sObj) xmlXPathFreeObject(sObj); @@ -118,7 +115,6 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, (*out_results)[found_count].title = strdup(title ? title : "No Title"); (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : ""); - found_count++; if (title) @@ -138,14 +134,14 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { return 0; } - const char *container_xpath = "//div[contains(@class, 'result')]"; xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]"); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { if (xpathObj) @@ -155,9 +151,7 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, } int num_results = xpathObj->nodesetval->nodeNr; - - int actual_alloc = (num_results < max_results) ? num_results : max_results; - *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + *out_results = xml_result_alloc(num_results, max_results); if (!*out_results) { xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); @@ -168,27 +162,27 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; xpathCtx->node = resultNode; - xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( - (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx); + xmlXPathObjectPtr linkObj = + xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]"); char *url = (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], (xmlChar *)"href") : NULL; - xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( - (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx); + xmlXPathObjectPtr titleObj = + xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]"); char *title = (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + ? xml_node_content(titleObj->nodesetval->nodeTab[0]) : NULL; - xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( - (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx); + xmlXPathObjectPtr snippetObj = + xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]"); char *snippet_text = (snippetObj && snippetObj->nodesetval && snippetObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + ? xml_node_content(snippetObj->nodesetval->nodeTab[0]) : NULL; if (url && title) { @@ -214,7 +208,6 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, } xpathCtx->node = NULL; - xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); return found_count; @@ -224,14 +217,14 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { return 0; } - const char *container_xpath = "//div[contains(@class, 'algo-sr')]"; xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]"); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { if (xpathObj) @@ -241,9 +234,7 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, } int num_results = xpathObj->nodesetval->nodeNr; - - int actual_alloc = (num_results < max_results) ? num_results : max_results; - *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + *out_results = xml_result_alloc(num_results, max_results); if (!*out_results) { xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); @@ -254,28 +245,27 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; xpathCtx->node = resultNode; - xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( - (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']", - xpathCtx); + xmlXPathObjectPtr linkObj = xml_xpath_eval( + xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']"); char *url = (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], (xmlChar *)"href") : NULL; - xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( - (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx); + xmlXPathObjectPtr titleObj = + xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]"); char *title = (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + ? xml_node_content(titleObj->nodesetval->nodeTab[0]) : NULL; - xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( - (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx); + xmlXPathObjectPtr snippetObj = + xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p"); char *snippet_text = (snippetObj && snippetObj->nodesetval && snippetObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + ? xml_node_content(snippetObj->nodesetval->nodeTab[0]) : NULL; if (url && title) { @@ -344,11 +334,8 @@ static void configure_curl_handle(CURL *curl, const char *full_url, curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent()); curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); - curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); @@ -357,6 +344,102 @@ static void configure_curl_handle(CURL *curl, const char *full_url, apply_proxy_settings(curl); } +static char *build_search_url(const char *base_url, const char *page_param, + int page_multiplier, int page_base, + const char *encoded_query, int page) { + int page_value = (page < 1 ? 1 : page - 1) * page_multiplier + page_base; + char *url = malloc(1024); + if (!url) { + return NULL; + } + snprintf(url, 1024, "%s%s&%s=%d", base_url, encoded_query, page_param, + page_value); + return url; +} + +static struct curl_slist *build_request_headers(const char *host_header, + const char *referer) { + struct curl_slist *headers = NULL; + char host_buf[256], ref_buf[256]; + + snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header); + snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer); + + headers = curl_slist_append(headers, host_buf); + headers = curl_slist_append(headers, ref_buf); + headers = curl_slist_append( + headers, + "Accept: " + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); + headers = curl_slist_append(headers, "DNT: 1"); + + return headers; +} + +static int check_cache_for_job(ScrapeJob *job) { + if (get_cache_ttl_search() <= 0) { + return 0; + } + + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (!key) { + return 0; + } + + char *cached_data = NULL; + size_t cached_size = 0; + + if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data, + &cached_size) == 0 && + cached_data && cached_size > 0) { + xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); + } + free(cached_data); + free(key); + return 1; + } + + free(key); + return 0; +} + +static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { + if (msg->data.result == CURLE_OK && job->response.size > 0) { + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) { + cache_set(key, job->response.memory, job->response.size); + free(key); + } + + xmlDocPtr doc = htmlReadMemory( + job->response.memory, job->response.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); + } + } else { + job->results_count = 0; + } + + struct curl_slist *headers = NULL; + curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); + if (headers) + curl_slist_free_all(headers); + + free(job->response.memory); + job->response.memory = NULL; +} + int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { int retries = 0; @@ -369,10 +452,6 @@ retry: for (int i = 0; i < num_jobs; i++) { ScrapeJob *job = &jobs[i]; - char cache_key[64]; - char full_url[1024]; - char *encoded_query = NULL; - if (job->handle) { curl_easy_cleanup(job->handle); job->handle = NULL; @@ -381,57 +460,29 @@ retry: free(job->response.memory); } - encoded_query = curl_easy_escape(NULL, job->query, 0); + if (check_cache_for_job(job)) { + job->results_count = job->results_count > 0 ? job->results_count : 0; + continue; + } + + char *encoded_query = curl_easy_escape(NULL, job->query, 0); if (!encoded_query) { continue; } - int page = (job->page < 1) ? 1 : job->page; - int page_value = - (page - 1) * job->engine->page_multiplier + job->engine->page_base; + char *full_url = + build_search_url(job->engine->base_url, job->engine->page_param, + job->engine->page_multiplier, job->engine->page_base, + encoded_query, job->page); + free(encoded_query); - snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url, - encoded_query, job->engine->page_param, page_value); - - char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (key) { - strncpy(cache_key, key, sizeof(cache_key) - 1); - cache_key[sizeof(cache_key) - 1] = '\0'; - free(key); - } else { - snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i, - job->engine->name); - } - - char *cached_data = NULL; - size_t cached_size = 0; - int cache_hit = 0; - - if (get_cache_ttl_search() > 0 && - cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data, - &cached_size) == 0 && - cached_data && cached_size > 0) { - xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | - HTML_PARSE_NOWARNING); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - cache_hit = 1; - } - free(cached_data); - } - - if (cache_hit) { - free(encoded_query); - job->results_count = job->results_count > 0 ? job->results_count : 0; + if (!full_url) { continue; } job->handle = curl_easy_init(); if (!job->handle) { - free(encoded_query); + free(full_url); continue; } @@ -439,23 +490,13 @@ retry: job->response.size = 0; job->response.capacity = 16384; - struct curl_slist *headers = NULL; - char host_buf[256], ref_buf[256]; - snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header); - snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer); - headers = curl_slist_append(headers, host_buf); - headers = curl_slist_append(headers, ref_buf); - headers = curl_slist_append( - headers, - "Accept: " - "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); - headers = curl_slist_append(headers, "DNT: 1"); + struct curl_slist *headers = + build_request_headers(job->engine->host_header, job->engine->referer); configure_curl_handle(job->handle, full_url, &job->response, headers); - curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); + free(full_url); curl_multi_add_handle(multi_handle, job->handle); } @@ -485,37 +526,8 @@ retry: if (jobs[i].handle && jobs[i].handle == handle) { ScrapeJob *job = &jobs[i]; - long response_code; - curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); + process_job_response(job, handle, msg); - if (msg->data.result == CURLE_OK && job->response.size > 0) { - char *key = - cache_compute_key(job->query, job->page, job->engine->name); - if (key && get_cache_ttl_search() > 0) { - cache_set(key, job->response.memory, job->response.size); - free(key); - } - - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); - - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } - } else { - job->results_count = 0; - } - - struct curl_slist *headers; - curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); - if (headers) - curl_slist_free_all(headers); - - free(job->response.memory); - job->response.memory = NULL; curl_multi_remove_handle(multi_handle, handle); if (handle) curl_easy_cleanup(handle); diff --git a/src/Utility/HttpClient.c b/src/Utility/HttpClient.c new file mode 100644 index 0000000..150b228 --- /dev/null +++ b/src/Utility/HttpClient.c @@ -0,0 +1,81 @@ +#include "HttpClient.h" +#include "../Proxy/Proxy.h" +#include +#include + +static size_t write_callback(void *contents, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + HttpResponse *mem = (HttpResponse *)userp; + + if (mem->size + realsize + 1 > mem->capacity) { + size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2; + while (new_cap < mem->size + realsize + 1) + new_cap *= 2; + + char *ptr = realloc(mem->memory, new_cap); + if (!ptr) { + return 0; + } + mem->memory = ptr; + mem->capacity = new_cap; + } + + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + + return realsize; +} + +HttpResponse http_get(const char *url, const char *user_agent) { + HttpResponse resp = {.memory = NULL, .size = 0, .capacity = 0}; + + if (!url) { + return resp; + } + + resp.memory = malloc(16384); + if (!resp.memory) { + return resp; + } + resp.capacity = 16384; + + CURL *curl = curl_easy_init(); + if (!curl) { + free(resp.memory); + resp.memory = NULL; + return resp; + } + + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp); + curl_easy_setopt(curl, CURLOPT_USERAGENT, + user_agent ? user_agent : "libcurl-agent/1.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); + apply_proxy_settings(curl); + + CURLcode res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + + if (res != CURLE_OK) { + free(resp.memory); + resp.memory = NULL; + resp.size = 0; + resp.capacity = 0; + } + + return resp; +} + +void http_response_free(HttpResponse *resp) { + if (!resp) { + return; + } + free(resp->memory); + resp->memory = NULL; + resp->size = 0; + resp->capacity = 0; +} diff --git a/src/Utility/HttpClient.h b/src/Utility/HttpClient.h new file mode 100644 index 0000000..6eb002c --- /dev/null +++ b/src/Utility/HttpClient.h @@ -0,0 +1,16 @@ +#ifndef HTTPCLIENT_H +#define HTTPCLIENT_H + +#include +#include + +typedef struct { + char *memory; + size_t size; + size_t capacity; +} HttpResponse; + +HttpResponse http_get(const char *url, const char *user_agent); +void http_response_free(HttpResponse *resp); + +#endif diff --git a/src/Utility/XmlHelper.c b/src/Utility/XmlHelper.c new file mode 100644 index 0000000..4fed96a --- /dev/null +++ b/src/Utility/XmlHelper.c @@ -0,0 +1,65 @@ +#include "XmlHelper.h" +#include +#include + +SearchResult *xml_result_alloc(int count, int max_results) { + if (count <= 0 || max_results <= 0) { + return NULL; + } + int actual = (count < max_results) ? count : max_results; + return (SearchResult *)calloc(actual, sizeof(SearchResult)); +} + +void xml_result_free(SearchResult *results, int count) { + if (!results) { + return; + } + for (int i = 0; i < count; i++) { + free(results[i].url); + free(results[i].title); + free(results[i].snippet); + } + free(results); +} + +xmlXPathObjectPtr xml_xpath_eval(xmlXPathContextPtr ctx, const char *xpath) { + if (!ctx || !xpath) { + return NULL; + } + return xmlXPathEvalExpression((const xmlChar *)xpath, ctx); +} + +char *xml_node_content(xmlNodePtr node) { + if (!node) { + return NULL; + } + char *content = (char *)xmlNodeGetContent(node); + return content; +} + +char *xpath_text(xmlDocPtr doc, const char *xpath) { + if (!doc || !xpath) { + return NULL; + } + + xmlXPathContextPtr ctx = xmlXPathNewContext(doc); + if (!ctx) { + return NULL; + } + + xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); + xmlXPathFreeContext(ctx); + + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + if (obj) + xmlXPathFreeObject(obj); + return NULL; + } + + xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); + char *result = content ? strdup((char *)content) : NULL; + if (content) + xmlFree(content); + xmlXPathFreeObject(obj); + return result; +} diff --git a/src/Utility/XmlHelper.h b/src/Utility/XmlHelper.h new file mode 100644 index 0000000..95cbcd6 --- /dev/null +++ b/src/Utility/XmlHelper.h @@ -0,0 +1,14 @@ +#ifndef XMLHELPER_H +#define XMLHELPER_H + +#include "../Scraping/Scraping.h" +#include + +SearchResult *xml_result_alloc(int count, int max_results); +void xml_result_free(SearchResult *results, int count); + +xmlXPathObjectPtr xml_xpath_eval(xmlXPathContextPtr ctx, const char *xpath); +char *xml_node_content(xmlNodePtr node); +char *xpath_text(xmlDocPtr doc, const char *xpath); + +#endif