diff --git a/src/Config.h b/src/Config.h index 24dafe6..e0e242c 100644 --- a/src/Config.h +++ b/src/Config.h @@ -23,6 +23,13 @@ #define INFOBOX_FIELD_COUNT 4 #define MAX_RESULTS_PER_ENGINE 10 +#define CURL_TIMEOUT_SECS 15L +#define CURL_DNS_TIMEOUT_SECS 300L + +#define BING_IMAGE_URL "https://www.bing.com/images/search" +#define IMAGE_RESULTS_PER_PAGE 32 +#define IMAGE_RESULT_FIELDS 4 + typedef struct { char host[256]; int port; diff --git a/src/Routes/Images.c b/src/Routes/Images.c index 7536f6b..ae25cf8 100644 --- a/src/Routes/Images.c +++ b/src/Routes/Images.c @@ -1,15 +1,7 @@ #include "Images.h" -#include "../Scraping/Scraping.h" -#include "../Utility/HttpClient.h" +#include "../Scraping/ImageScraping.h" #include "../Utility/Unescape.h" -#include "../Utility/XmlHelper.h" - -#include -#include -#include -#include -#include -#include +#include "Config.h" int images_handler(UrlParams *params) { TemplateContext ctx = new_context(); @@ -28,12 +20,12 @@ int images_handler(UrlParams *params) { } } - context_set(&ctx, "query", raw_query); - char page_str[16], prev_str[16], next_str[16]; snprintf(page_str, sizeof(page_str), "%d", page); snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0); snprintf(next_str, sizeof(next_str), "%d", page + 1); + + context_set(&ctx, "query", raw_query); context_set(&ctx, "page", page_str); context_set(&ctx, "prev_page", prev_str); context_set(&ctx, "next_page", next_str); @@ -49,208 +41,41 @@ int images_handler(UrlParams *params) { return -1; } - CURL *tmp = curl_easy_init(); - if (!tmp) { - send_response("

Error initializing curl

"); - if (display_query) - free(display_query); - free_context(&ctx); - return -1; - } - char *encoded_query = curl_easy_escape(tmp, raw_query, 0); - curl_easy_cleanup(tmp); + ImageResult *results = NULL; + int result_count = 0; - if (!encoded_query) { - send_response("

Error encoding query

"); - if (display_query) - free(display_query); - free_context(&ctx); - return -1; - } - - char url[1024]; - int first = (page - 1) * 32 + 1; - snprintf(url, sizeof(url), "https://www.bing.com/images/search?q=%s&first=%d", - encoded_query, first); - - HttpResponse resp = http_get( - url, - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); - if (!resp.memory) { + if (scrape_images(raw_query, page, &results, &result_count) != 0 || + !results) { send_response("

Error fetching images

"); - free(encoded_query); free(display_query); free_context(&ctx); return -1; } - htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); - if (!doc) { - http_response_free(&resp); - free(encoded_query); + char ***image_matrix = malloc(sizeof(char **) * result_count); + int *inner_counts = malloc(sizeof(int) * result_count); + + if (!image_matrix || !inner_counts) { + if (image_matrix) + free(image_matrix); + if (inner_counts) + free(inner_counts); + free_image_results(results, result_count); free(display_query); free_context(&ctx); return -1; } - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - - if (!xpathCtx) { - xmlFreeDoc(doc); - http_response_free(&resp); - free(encoded_query); - free(display_query); - free_context(&ctx); - return -1; + for (int i = 0; i < result_count; i++) { + image_matrix[i] = malloc(sizeof(char *) * IMAGE_RESULT_FIELDS); + image_matrix[i][0] = strdup(results[i].thumbnail_url); + image_matrix[i][1] = strdup(results[i].title); + image_matrix[i][2] = strdup(results[i].page_url); + image_matrix[i][3] = strdup(results[i].full_url); + inner_counts[i] = IMAGE_RESULT_FIELDS; } - xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx); - - int image_count = 0; - char ***image_matrix = NULL; - int *inner_counts = NULL; - - if (xpathObj && xpathObj->nodesetval) { - int nodes = xpathObj->nodesetval->nodeNr; - - int max_images = (nodes < 32) ? 
nodes : 32; - image_matrix = malloc(sizeof(char **) * max_images); - inner_counts = malloc(sizeof(int) * max_images); - if (!image_matrix || !inner_counts) { - if (image_matrix) free(image_matrix); - if (inner_counts) free(inner_counts); - image_matrix = NULL; - inner_counts = NULL; - } - - for (int i = 0; i < nodes; i++) { - if (image_count >= 32) - break; - - xmlNodePtr node = xpathObj->nodesetval->nodeTab[i]; - xmlNodePtr img_node = NULL; - xmlNodePtr tit_node = NULL; - xmlNodePtr des_node = NULL; - xmlNodePtr thumb_link = NULL; - - for (xmlNodePtr child = node->children; child; child = child->next) { - if (child->type != XML_ELEMENT_NODE) - continue; - - if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) { - xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); - if (class) { - if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) { - thumb_link = child; - for (xmlNodePtr thumb_child = child->children; thumb_child; - thumb_child = thumb_child->next) { - if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) { - xmlChar *div_class = - xmlGetProp(thumb_child, (const xmlChar *)"class"); - if (div_class && - xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) { - for (xmlNodePtr cico_child = thumb_child->children; - cico_child; cico_child = cico_child->next) { - if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == - 0) { - img_node = cico_child; - break; - } - } - } - if (div_class) - xmlFree(div_class); - } - } - } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) { - tit_node = child; - } - xmlFree(class); - } - } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) { - xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); - if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) { - for (xmlNodePtr meta_child = child->children; meta_child; - meta_child = meta_child->next) { - if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) { - xmlChar *div_class = - xmlGetProp(meta_child, (const xmlChar 
*)"class"); - if (div_class) { - if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) { - des_node = meta_child; - } - xmlFree(div_class); - } - } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == - 0) { - xmlChar *a_class = - xmlGetProp(meta_child, (const xmlChar *)"class"); - if (a_class && - xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) { - tit_node = meta_child; - } - if (a_class) - xmlFree(a_class); - } - } - } - if (class) - xmlFree(class); - } - } - - xmlChar *iurl = - img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL; - xmlChar *full_url = - thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL; - xmlChar *title = des_node - ? xmlNodeGetContent(des_node) - : (tit_node ? xmlNodeGetContent(tit_node) : NULL); - xmlChar *rurl = - tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL; - - if (iurl && strlen((char *)iurl) > 0) { - char *proxy_url = NULL; - CURL *esc_curl = curl_easy_init(); - if (esc_curl) { - char *encoded = curl_easy_escape(esc_curl, (char *)iurl, 0); - if (encoded) { - size_t proxy_len = strlen("/proxy?url=") + strlen(encoded) + 1; - proxy_url = malloc(proxy_len); - if (proxy_url) { - snprintf(proxy_url, proxy_len, "/proxy?url=%s", encoded); - } - curl_free(encoded); - } - curl_easy_cleanup(esc_curl); - } - - image_matrix[image_count] = malloc(sizeof(char *) * 4); - image_matrix[image_count][0] = - proxy_url ? strdup(proxy_url) : strdup((char *)iurl); - free(proxy_url); - image_matrix[image_count][1] = strdup(title ? (char *)title : "Image"); - image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#"); - image_matrix[image_count][3] = - strdup(full_url ? 
(char *)full_url : "#"); - inner_counts[image_count] = 4; - image_count++; - } - - if (iurl) - xmlFree(iurl); - if (title) - xmlFree(title); - if (rurl) - xmlFree(rurl); - if (full_url) - xmlFree(full_url); - } - } - - context_set_array_of_arrays(&ctx, "images", image_matrix, image_count, + context_set_array_of_arrays(&ctx, "images", image_matrix, result_count, inner_counts); char *rendered = render_template("images.html", &ctx); @@ -261,27 +86,15 @@ int images_handler(UrlParams *params) { send_response("

Error rendering image results

"); } - if (image_matrix) { - for (int i = 0; i < image_count; i++) { - for (int j = 0; j < 4; j++) { - free(image_matrix[i][j]); - } - free(image_matrix[i]); - } - free(image_matrix); - } - if (inner_counts) { - free(inner_counts); + for (int i = 0; i < result_count; i++) { + for (int j = 0; j < IMAGE_RESULT_FIELDS; j++) + free(image_matrix[i][j]); + free(image_matrix[i]); } + free(image_matrix); + free(inner_counts); - if (xpathObj) - xmlXPathFreeObject(xpathObj); - if (xpathCtx) - xmlXPathFreeContext(xpathCtx); - if (doc) - xmlFreeDoc(doc); - http_response_free(&resp); - curl_free(encoded_query); + free_image_results(results, result_count); free(display_query); free_context(&ctx); diff --git a/src/Scraping/ImageScraping.c b/src/Scraping/ImageScraping.c new file mode 100644 index 0000000..33f710a --- /dev/null +++ b/src/Scraping/ImageScraping.c @@ -0,0 +1,239 @@ +#include "ImageScraping.h" +#include "../Utility/HttpClient.h" +#include "Config.h" +#include +#include +#include +#include +#include + +static char *build_proxy_url(const char *image_url) { + if (!image_url) + return NULL; + + char *proxy_url = NULL; + CURL *curl = curl_easy_init(); + if (curl) { + char *encoded = curl_easy_escape(curl, (char *)image_url, 0); + if (encoded) { + size_t len = strlen("/proxy?url=") + strlen(encoded) + 1; + proxy_url = malloc(len); + if (proxy_url) + snprintf(proxy_url, len, "/proxy?url=%s", encoded); + curl_free(encoded); + } + curl_easy_cleanup(curl); + } + + return proxy_url; +} + +static int parse_image_node(xmlNodePtr node, ImageResult *result) { + xmlNodePtr img_node = NULL; + xmlNodePtr tit_node = NULL; + xmlNodePtr des_node = NULL; + xmlNodePtr thumb_link = NULL; + + for (xmlNodePtr child = node->children; child; child = child->next) { + if (child->type != XML_ELEMENT_NODE) + continue; + + if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class) { + if (xmlStrstr(class, (const xmlChar 
*)"thumb") != NULL) { + thumb_link = child; + for (xmlNodePtr thumb_child = child->children; thumb_child; + thumb_child = thumb_child->next) { + if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = + xmlGetProp(thumb_child, (const xmlChar *)"class"); + if (div_class && + xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) { + for (xmlNodePtr cico_child = thumb_child->children; cico_child; + cico_child = cico_child->next) { + if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == + 0) { + img_node = cico_child; + break; + } + } + } + if (div_class) + xmlFree(div_class); + } + } + } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) { + tit_node = child; + } + xmlFree(class); + } + } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) { + for (xmlNodePtr meta_child = child->children; meta_child; + meta_child = meta_child->next) { + if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = + xmlGetProp(meta_child, (const xmlChar *)"class"); + if (div_class) { + if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) { + des_node = meta_child; + } + xmlFree(div_class); + } + } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) { + xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class"); + if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) { + tit_node = meta_child; + } + if (a_class) + xmlFree(a_class); + } + } + } + if (class) + xmlFree(class); + } + } + + xmlChar *iurl = + img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL; + xmlChar *full_url = + thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL; + xmlChar *title = des_node ? xmlNodeGetContent(des_node) + : (tit_node ? xmlNodeGetContent(tit_node) : NULL); + xmlChar *rurl = + tit_node ? 
xmlGetProp(tit_node, (const xmlChar *)"href") : NULL; + + if (!iurl || strlen((char *)iurl) == 0) { + if (iurl) + xmlFree(iurl); + if (title) + xmlFree(title); + if (rurl) + xmlFree(rurl); + if (full_url) + xmlFree(full_url); + return 0; + } + + char *proxy_url = build_proxy_url((char *)iurl); + result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup((char *)iurl); + free(proxy_url); + result->title = strdup(title ? (char *)title : "Image"); + result->page_url = strdup(rurl ? (char *)rurl : "#"); + result->full_url = strdup(full_url ? (char *)full_url : "#"); + + if (iurl) + xmlFree(iurl); + if (title) + xmlFree(title); + if (rurl) + xmlFree(rurl); + if (full_url) + xmlFree(full_url); + + return 1; +} + +int scrape_images(const char *query, int page, ImageResult **out_results, + int *out_count) { + *out_results = NULL; + *out_count = 0; + + if (!query || strlen(query) == 0) + return -1; + + CURL *tmp = curl_easy_init(); + if (!tmp) + return -1; + + char *encoded_query = curl_easy_escape(tmp, query, 0); + curl_easy_cleanup(tmp); + + if (!encoded_query) + return -1; + + char url[BUFFER_SIZE_LARGE]; + int first = (page - 1) * IMAGE_RESULTS_PER_PAGE + 1; + snprintf(url, sizeof(url), "%s?q=%s&first=%d", BING_IMAGE_URL, encoded_query, + first); + free(encoded_query); + + HttpResponse resp = http_get( + url, + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); + if (!resp.memory) { + return -1; + } + + htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); + if (!doc) { + http_response_free(&resp); + return -1; + } + + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + xmlFreeDoc(doc); + http_response_free(&resp); + return -1; + } + + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval) { + if (xpathObj) + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); 
+ xmlFreeDoc(doc); + http_response_free(&resp); + return 0; + } + + int nodes = xpathObj->nodesetval->nodeNr; + int max_images = + (nodes < IMAGE_RESULTS_PER_PAGE) ? nodes : IMAGE_RESULTS_PER_PAGE; + + ImageResult *results = malloc(sizeof(ImageResult) * max_images); + if (!results) { + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + xmlFreeDoc(doc); + http_response_free(&resp); + return -1; + } + + int count = 0; + for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) { + xmlNodePtr node = xpathObj->nodesetval->nodeTab[i]; + if (parse_image_node(node, &results[count])) { + count++; + } + } + + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + xmlFreeDoc(doc); + http_response_free(&resp); + + *out_results = results; + *out_count = count; + return 0; +} + +void free_image_results(ImageResult *results, int count) { + if (!results) + return; + + for (int i = 0; i < count; i++) { + free(results[i].thumbnail_url); + free(results[i].title); + free(results[i].page_url); + free(results[i].full_url); + } + free(results); +} diff --git a/src/Scraping/ImageScraping.h b/src/Scraping/ImageScraping.h new file mode 100644 index 0000000..d244a63 --- /dev/null +++ b/src/Scraping/ImageScraping.h @@ -0,0 +1,18 @@ +#ifndef IMAGESCRAPING_H +#define IMAGESCRAPING_H + +#include +#include + +typedef struct { + char *thumbnail_url; + char *title; + char *page_url; + char *full_url; +} ImageResult; + +int scrape_images(const char *query, int page, ImageResult **out_results, + int *out_count); +void free_image_results(ImageResult *results, int count); + +#endif diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 4c87890..baf536c 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -1,395 +1,20 @@ #include "Scraping.h" #include "../Cache/Cache.h" #include "../Proxy/Proxy.h" -#include "../Utility/Unescape.h" -#include "../Utility/XmlHelper.h" #include "Config.h" #include #include -#include #include #include -#include 
#include -#include -static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, - void *userp) { - size_t realsize = size * nmemb; - MemoryBuffer *mem = (MemoryBuffer *)userp; - - if (mem->size + realsize + 1 > mem->capacity) { - size_t new_cap = - mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2; - while (new_cap < mem->size + realsize + 1) - new_cap *= 2; - - char *ptr = (char *)realloc(mem->memory, new_cap); - if (!ptr) { - return 0; - } - mem->memory = ptr; - mem->capacity = new_cap; - } - - memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - - return realsize; -} - -static const char *get_random_user_agent(void) { - static const char *agents[] = { - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " - "Gecko) " - "Chrome/120.0.0.0` Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " - "Firefox/121.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " - "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"}; - return agents[rand() % 5]; -} - -static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { +int check_cache_for_job(ScrapeJob *job) { + if (get_cache_ttl_search() <= 0) return 0; - } - - xmlXPathObjectPtr xpathObj = xml_xpath_eval( - xpathCtx, "//tr[not(contains(@class, " - "'result-sponsored'))]//a[@class='result-link']"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - 
xmlXPathFreeContext(xpathCtx); - return 0; - } - - int num_links = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_links, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - for (int i = 0; i < num_links && found_count < max_results; i++) { - xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; - char *title = xml_node_content(linkNode); - char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); - char *snippet_text = NULL; - - xmlNodePtr current = linkNode->parent; - while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) - current = current->parent; - - if (current && current->next) { - xmlNodePtr snippetRow = current->next; - while (snippetRow && - xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) - snippetRow = snippetRow->next; - if (snippetRow) { - xpathCtx->node = snippetRow; - xmlXPathObjectPtr sObj = - xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']"); - if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { - snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]); - } - if (sObj) - xmlXPathFreeObject(sObj); - xpathCtx->node = NULL; - } - } - - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title ? title : "No Title"); - (*out_results)[found_count].snippet = - strdup(snippet_text ? 
snippet_text : ""); - found_count++; - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - } - - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -static int parse_startpage(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { - return 0; - } - - xmlXPathObjectPtr xpathObj = - xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - int num_results = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_results, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = - xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]"); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = - xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]"); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? xml_node_content(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = - xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]"); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? 
xml_node_content(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = strdup(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - if (linkObj) - xmlXPathFreeObject(linkObj); - if (titleObj) - xmlXPathFreeObject(titleObj); - if (snippetObj) - xmlXPathFreeObject(snippetObj); - } - - xpathCtx->node = NULL; - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -static int parse_yahoo(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { - return 0; - } - - xmlXPathObjectPtr xpathObj = - xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - int num_results = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_results, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = xml_xpath_eval( - xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']"); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? 
(char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = - xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]"); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? xml_node_content(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = - xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p"); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? xml_node_content(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - if (linkObj) - xmlXPathFreeObject(linkObj); - if (titleObj) - xmlXPathFreeObject(titleObj); - if (snippetObj) - xmlXPathFreeObject(snippetObj); - } - - xpathCtx->node = NULL; - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -const SearchEngine ENGINE_REGISTRY[] = { - {.name = "DuckDuckGo Lite", - .base_url = "https://lite.duckduckgo.com/lite/?q=", - .host_header = "lite.duckduckgo.com", - .referer = "https://lite.duckduckgo.com/", - .page_param = "s", - .page_multiplier = 30, - .page_base = 0, - .parser = parse_ddg_lite}, - {.name = "Startpage", - .base_url = "https://www.startpage.com/sp/search?q=", - .host_header = "www.startpage.com", - .referer = "https://www.startpage.com/", - .page_param = "page", - .page_multiplier = 1, - .page_base = 1, - .parser = parse_startpage}, - {.name = "Yahoo", - .base_url = "https://search.yahoo.com/search?p=", - .host_header = "search.yahoo.com", - .referer = "https://search.yahoo.com/", - .page_param = "b", - .page_multiplier = 10, - .page_base 
= 1, - .parser = parse_yahoo}}; - -const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); - -#define CURL_TIMEOUT 15L -#define CURL_DNS_TIMEOUT 300L - -static void configure_curl_handle(CURL *curl, const char *full_url, - MemoryBuffer *chunk, - struct curl_slist *headers) { - curl_easy_setopt(curl, CURLOPT_URL, full_url); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk); - curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent()); - - curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); - curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT); - curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); - curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); - - apply_proxy_settings(curl); -} - -static char *build_search_url(const char *base_url, const char *page_param, - int page_multiplier, int page_base, - const char *encoded_query, int page) { - int page_value = (page < 1 ? 
1 : page - 1) * page_multiplier + page_base; - char *url = malloc(BUFFER_SIZE_LARGE); - if (!url) { - return NULL; - } - snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url, encoded_query, - page_param, page_value); - return url; -} - -static struct curl_slist *build_request_headers(const char *host_header, - const char *referer) { - struct curl_slist *headers = NULL; - char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM]; - - snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header); - snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer); - - headers = curl_slist_append(headers, host_buf); - headers = curl_slist_append(headers, ref_buf); - headers = curl_slist_append( - headers, - "Accept: " - "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); - headers = curl_slist_append(headers, "DNT: 1"); - - return headers; -} - -static int check_cache_for_job(ScrapeJob *job) { - if (get_cache_ttl_search() <= 0) { - return 0; - } char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (!key) { + if (!key) return 0; - } char *cached_data = NULL; size_t cached_size = 0; @@ -414,27 +39,31 @@ static int check_cache_for_job(ScrapeJob *job) { return 0; } -static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { - if (msg->data.result == CURLE_OK && job->response.size > 0) { - char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (key && get_cache_ttl_search() > 0) { - cache_set(key, job->response.memory, job->response.size); - free(key); - } +void parse_and_cache_response(ScrapeJob *job) { + if (job->response.size == 0) { + job->results_count = 0; + return; + } - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key && 
get_cache_ttl_search() > 0) + cache_set(key, job->response.memory, job->response.size); + free(key); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } + xmlDocPtr doc = htmlReadMemory( + job->response.memory, job->response.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); } else { job->results_count = 0; } +} +void cleanup_job_handle(ScrapeJob *job, CURL *handle) { struct curl_slist *headers = NULL; curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); if (headers) @@ -444,67 +73,112 @@ static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { job->response.memory = NULL; } +void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { + if (msg->data.result == CURLE_OK) + parse_and_cache_response(job); + else + job->results_count = 0; + + cleanup_job_handle(job, handle); +} + +int setup_job(ScrapeJob *job, CURLM *multi_handle) { + if (job->handle) + curl_easy_cleanup(job->handle); + if (job->response.memory) + free(job->response.memory); + + if (check_cache_for_job(job)) { + job->results_count = job->results_count > 0 ? 
job->results_count : 0; + return 0; + } + + char *encoded_query = curl_easy_escape(NULL, job->query, 0); + if (!encoded_query) + return -1; + + char *full_url = + build_search_url(job->engine->base_url, job->engine->page_param, + job->engine->page_multiplier, job->engine->page_base, + encoded_query, job->page); + free(encoded_query); + + if (!full_url) + return -1; + + job->handle = curl_easy_init(); + if (!job->handle) { + free(full_url); + return -1; + } + + job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE); + job->response.size = 0; + job->response.capacity = INITIAL_BUFFER_SIZE; + + struct curl_slist *headers = + build_request_headers(job->engine->host_header, job->engine->referer); + + configure_curl_handle(job->handle, full_url, &job->response, headers); + curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); + + free(full_url); + curl_multi_add_handle(multi_handle, job->handle); + return 0; +} + +int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) { + CURLMsg *msg; + int msgs_left; + + while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { + if (msg->msg != CURLMSG_DONE) + continue; + + CURL *handle = msg->easy_handle; + + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].handle && jobs[i].handle == handle) { + process_response(&jobs[i], handle, msg); + curl_multi_remove_handle(multi_handle, handle); + curl_easy_cleanup(handle); + jobs[i].handle = NULL; + break; + } + } + } + + return 0; +} + +int should_retry(ScrapeJob *jobs, int num_jobs) { + if (proxy_count <= 0) + return 0; + + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].results_count == 0 && jobs[i].response.size == 0) + return 1; + } + return 0; +} + int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { int retries = 0; retry: CURLM *multi_handle = curl_multi_init(); - if (!multi_handle) { + if (!multi_handle) return -1; - } for (int i = 0; i < num_jobs; i++) { - ScrapeJob *job = &jobs[i]; - - if (job->handle) { - curl_easy_cleanup(job->handle); 
- job->handle = NULL; + if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) { + curl_multi_remove_handle(multi_handle, jobs[i].handle); + curl_easy_cleanup(jobs[i].handle); + jobs[i].handle = NULL; } - if (job->response.memory) { - free(job->response.memory); - } - - if (check_cache_for_job(job)) { - job->results_count = job->results_count > 0 ? job->results_count : 0; - continue; - } - - char *encoded_query = curl_easy_escape(NULL, job->query, 0); - if (!encoded_query) { - continue; - } - - char *full_url = - build_search_url(job->engine->base_url, job->engine->page_param, - job->engine->page_multiplier, job->engine->page_base, - encoded_query, job->page); - free(encoded_query); - - if (!full_url) { - continue; - } - - job->handle = curl_easy_init(); - if (!job->handle) { - free(full_url); - continue; - } - - job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE); - job->response.size = 0; - job->response.capacity = INITIAL_BUFFER_SIZE; - - struct curl_slist *headers = - build_request_headers(job->engine->host_header, job->engine->referer); - - configure_curl_handle(job->handle, full_url, &job->response, headers); - curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); - - free(full_url); - curl_multi_add_handle(multi_handle, job->handle); } - usleep(100000 + (rand() % 100000)); + http_delay(); int still_running = 0; curl_multi_perform(multi_handle, &still_running); @@ -512,50 +186,17 @@ retry: do { int numfds = 0; CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); - - if (mc != CURLM_OK) { + if (mc != CURLM_OK) break; - } - curl_multi_perform(multi_handle, &still_running); } while (still_running); - CURLMsg *msg; - int msgs_left; - while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { - if (msg->msg == CURLMSG_DONE) { - CURL *handle = msg->easy_handle; - - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].handle && jobs[i].handle == handle) { - ScrapeJob *job = &jobs[i]; - - process_job_response(job, handle, msg); - 
- curl_multi_remove_handle(multi_handle, handle); - if (handle) - curl_easy_cleanup(handle); - job->handle = NULL; - break; - } - } - } - } - + handle_responses(multi_handle, jobs, num_jobs); curl_multi_cleanup(multi_handle); - if (retries < max_proxy_retries && proxy_count > 0) { - int any_failed = 0; - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].results_count == 0 && jobs[i].response.size == 0) { - any_failed = 1; - break; - } - } - if (any_failed) { - retries++; - goto retry; - } + if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) { + retries++; + goto retry; } return 0; diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h index f1ad2c4..1439118 100644 --- a/src/Scraping/Scraping.h +++ b/src/Scraping/Scraping.h @@ -3,6 +3,7 @@ #include #include +#include typedef struct { char *url; @@ -45,6 +46,25 @@ typedef struct { extern const SearchEngine ENGINE_REGISTRY[]; extern const int ENGINE_COUNT; +size_t write_memory_callback(void *contents, size_t size, size_t nmemb, + void *userp); +const char *get_random_user_agent(void); +void configure_curl_handle(CURL *curl, const char *full_url, + MemoryBuffer *chunk, struct curl_slist *headers); +char *build_search_url(const char *base_url, const char *page_param, + int page_multiplier, int page_base, + const char *encoded_query, int page); +struct curl_slist *build_request_headers(const char *host_header, + const char *referer); +void http_delay(void); + +xmlXPathContextPtr create_xpath_context(xmlDocPtr doc); +void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj); +SearchResult *alloc_results_array(int capacity, int max_results); +void assign_result(SearchResult *result, char *url, char *title, char *snippet, + int unescape); +void free_xml_node_list(char *title, char *url, char *snippet); + int scrape_engine(const SearchEngine *engine, const char *query, SearchResult **out_results, int max_results); diff --git a/src/Scraping/ScrapingHttp.c b/src/Scraping/ScrapingHttp.c new 
file mode 100644
index 0000000..1a6a292
--- /dev/null
+++ b/src/Scraping/ScrapingHttp.c
@@ -0,0 +1,204 @@
+#include "../Proxy/Proxy.h"
+#include "Config.h"
+#include "Scraping.h"
+/* NOTE(review): the system header names were missing from this patch text and
+ * are restored here from the symbols the file uses; confirm against the repo. */
+#include <curl/curl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Lower bound and jitter range, in microseconds, of the http_delay() pause. */
+#define HTTP_DELAY_MIN_US 100000
+#define HTTP_DELAY_RANGE_US 100000
+
+/* Browser User-Agent strings handed out by get_random_user_agent(). */
+static const char *USER_AGENTS[] = {
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+    "like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
+    "Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
+    "Firefox/121.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
+    "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
+
+#define USER_AGENT_COUNT (sizeof(USER_AGENTS) / sizeof(USER_AGENTS[0]))
+
+/**
+ * libcurl write callback that appends a chunk of the response body to a
+ * growable, always NUL-terminated MemoryBuffer.
+ *
+ * @param contents bytes received for this call
+ * @param size     size of one element
+ * @param nmemb    number of elements in contents
+ * @param userp    the MemoryBuffer to append to
+ * @return the number of bytes stored, or 0 when the buffer is missing, the
+ *         sizes would overflow, or the buffer cannot be grown
+ */
+size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
+                             void *userp) {
+  MemoryBuffer *mem = (MemoryBuffer *)userp;
+
+  /* Reject a size * nmemb product that does not fit in size_t. */
+  if (!mem || (size != 0 && nmemb > SIZE_MAX / size))
+    return 0;
+  size_t realsize = size * nmemb;
+
+  /* One byte beyond the payload is always reserved for the terminator. */
+  if (realsize >= SIZE_MAX - mem->size)
+    return 0;
+  size_t needed = mem->size + realsize + 1;
+
+  if (needed > mem->capacity) {
+    size_t new_cap = mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity;
+    /* Double until the chunk fits; fall back to the exact size when doubling
+     * cannot make progress (zero start) or would overflow. */
+    while (new_cap < needed) {
+      if (new_cap == 0 || new_cap > SIZE_MAX / 2) {
+        new_cap = needed;
+        break;
+      }
+      new_cap *= 2;
+    }
+
+    /* Keep the old pointer until realloc succeeds so nothing is leaked. */
+    char *grown = realloc(mem->memory, new_cap);
+    if (!grown)
+      return 0;
+    mem->memory = grown;
+    mem->capacity = new_cap;
+  }
+
+  if (realsize > 0)
+    memcpy(mem->memory + mem->size, contents, realsize);
+  mem->size += realsize;
+  mem->memory[mem->size] = '\0';
+
+  return realsize;
+}
+
+/** Returns one of the built-in User-Agent strings, chosen with rand(). */
+const char *get_random_user_agent(void) {
+  return USER_AGENTS[rand() % USER_AGENT_COUNT];
+}
+
+/**
+ * Applies the request options shared by all scraping transfers to curl.
+ *
+ * @param curl     easy handle to configure
+ * @param full_url URL to fetch
+ * @param chunk    buffer that write_memory_callback() fills with the body
+ * @param headers  request header list; it is only referenced, not freed, here
+ */
+void configure_curl_handle(CURL *curl, const char *full_url,
+                           MemoryBuffer *chunk, struct curl_slist *headers) {
+  curl_easy_setopt(curl, CURLOPT_URL, full_url);
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+  /* An empty string asks libcurl to offer every encoding it supports. */
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+  /* An empty file name turns on the in-memory cookie engine. */
+  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+  apply_proxy_settings(curl);
+}
+
+/**
+ * Builds base_url + encoded_query + "&" + page_param + "=" + value.
+ *
+ * The value is (page - 1) * page_multiplier + page_base; a page below 1 is
+ * treated as page 1.
+ *
+ * @return a malloc'd string the caller must free(), or NULL when an argument
+ *         is NULL, allocation fails, or the URL does not fit in
+ *         BUFFER_SIZE_LARGE bytes
+ */
+char *build_search_url(const char *base_url, const char *page_param,
+                       int page_multiplier, int page_base,
+                       const char *encoded_query, int page) {
+  if (!base_url || !page_param || !encoded_query)
+    return NULL;
+
+  int page_index = page < 1 ? 0 : page - 1;
+  int page_value = page_index * page_multiplier + page_base;
+
+  char *url = malloc(BUFFER_SIZE_LARGE);
+  if (!url)
+    return NULL;
+
+  int written = snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url,
+                         encoded_query, page_param, page_value);
+  /* A truncated URL would query something else entirely, so give up. */
+  if (written < 0 || (size_t)written >= (size_t)BUFFER_SIZE_LARGE) {
+    free(url);
+    return NULL;
+  }
+  return url;
+}
+
+/* Appends line to *list; on failure *list is unchanged and -1 is returned. */
+static int append_header(struct curl_slist **list, const char *line) {
+  struct curl_slist *grown = curl_slist_append(*list, line);
+  if (!grown)
+    return -1;
+  *list = grown;
+  return 0;
+}
+
+/**
+ * Builds the request header list for one engine.
+ *
+ * @param host_header value for the Host header; skipped when NULL
+ * @param referer     value for the Referer header; skipped when NULL
+ * @return a list the caller releases with curl_slist_free_all(), or NULL if
+ *         any header could not be appended
+ */
+struct curl_slist *build_request_headers(const char *host_header,
+                                         const char *referer) {
+  static const char *const FIXED_HEADERS[] = {
+      "Accept: "
+      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+      "Accept-Language: en-US,en;q=0.5", "DNT: 1"};
+  const size_t fixed_count = sizeof(FIXED_HEADERS) / sizeof(FIXED_HEADERS[0]);
+  struct curl_slist *headers = NULL;
+  char line[BUFFER_SIZE_MEDIUM];
+
+  if (host_header) {
+    snprintf(line, sizeof(line), "Host: %s", host_header);
+    if (append_header(&headers, line) != 0)
+      goto fail;
+  }
+  if (referer) {
+    snprintf(line, sizeof(line), "Referer: %s", referer);
+    if (append_header(&headers, line) != 0)
+      goto fail;
+  }
+  for (size_t i = 0; i < fixed_count; i++) {
+    if (append_header(&headers, FIXED_HEADERS[i]) != 0)
+      goto fail;
+  }
+  return headers;
+
+fail:
+  /* Free whatever was already appended instead of leaking it. */
+  curl_slist_free_all(headers);
+  return NULL;
+}
+
+/* Pauses for HTTP_DELAY_MIN_US plus a random part below HTTP_DELAY_RANGE_US. */
+void http_delay(void) {
+  usleep(HTTP_DELAY_MIN_US + (rand() % HTTP_DELAY_RANGE_US));
+}
diff --git a/src/Scraping/ScrapingParsers.c b/src/Scraping/ScrapingParsers.c
new file mode 100644
index 0000000..818d333
--- /dev/null
+++ b/src/Scraping/ScrapingParsers.c
@@ -0,0 +1,270 @@
+#include "../Utility/Unescape.h"
+#include "../Utility/XmlHelper.h"
+#include "Config.h"
+#include "Scraping.h"
+/* NOTE(review): the system header names were missing from this patch text and
+ * are a best guess from the symbols the file uses; confirm against the repo. */
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Creates an XPath evaluation context for doc; NULL on failure. */
+xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) {
+  return xmlXPathNewContext(doc);
+}
+
+/** Frees an XPath result object and its context; either may be NULL. */
+void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) {
+  if (obj)
+    xmlXPathFreeObject(obj);
+  if (ctx)
+    xmlXPathFreeContext(ctx);
+}
+
+/**
+ * Allocates the result array through xml_result_alloc(), passing capacity and
+ * the smaller of capacity and max_results.
+ */
+SearchResult *alloc_results_array(int capacity, int max_results) {
+  int count = capacity < max_results ? capacity : max_results;
+  return xml_result_alloc(capacity, count);
+}
+
+/**
+ * Fills one result with heap copies of the given strings.
+ *
+ * @param result   slot to fill
+ * @param url      link target; passed through unescape_search_url() when
+ *                 unescape is non-zero, otherwise copied ("" when NULL)
+ * @param title    copied; "No Title" when NULL
+ * @param snippet  copied; "" when NULL
+ * @param unescape non-zero to unescape url instead of copying it
+ */
+void assign_result(SearchResult *result, char *url, char *title, char *snippet,
+                   int unescape) {
+  result->url = unescape ? unescape_search_url(url) : strdup(url ? url : "");
+  result->title = strdup(title ? title : "No Title");
+  result->snippet = strdup(snippet ? snippet : "");
+}
+
+/** Releases the given strings with xmlFree(); NULL arguments are skipped. */
+void free_xml_node_list(char *title, char *url, char *snippet) {
+  if (title)
+    xmlFree(title);
+  if (url)
+    xmlFree(url);
+  if (snippet)
+    xmlFree(snippet);
+}
+
+/* XPath expressions describing one engine's result markup. */
+typedef struct {
+  const char *container; /* one match per search result */
+  const char *link;      /* relative to container; its href is the URL */
+  const char *title;     /* relative to container */
+  const char *snippet;   /* relative to container */
+  int unescape;          /* forwarded to assign_result() */
+} ResultSelectors;
+
+/*
+ * Evaluates expr against ctx and returns the first matching node, or NULL.
+ * The evaluated object is stored in *out_obj (possibly NULL) and must be freed
+ * by the caller once the node is no longer needed.
+ */
+static xmlNodePtr first_match(xmlXPathContextPtr ctx, const char *expr,
+                              xmlXPathObjectPtr *out_obj) {
+  xmlXPathObjectPtr obj = xml_xpath_eval(ctx, expr);
+  *out_obj = obj;
+  if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr <= 0)
+    return NULL;
+  return obj->nodesetval->nodeTab[0];
+}
+
+/*
+ * Shared parser for engines whose results are self-contained blocks: every
+ * container match yields a result when both a link href and a title are found.
+ * Returns the number of results stored in *out_results.
+ */
+static int parse_result_blocks(xmlDocPtr doc, const ResultSelectors *sel,
+                               SearchResult **out_results, int max_results) {
+  xmlXPathContextPtr ctx = create_xpath_context(doc);
+  if (!ctx)
+    return 0;
+
+  xmlXPathObjectPtr blocks = xml_xpath_eval(ctx, sel->container);
+  if (!blocks || !blocks->nodesetval || blocks->nodesetval->nodeNr == 0) {
+    free_xpath_objects(ctx, blocks);
+    return 0;
+  }
+
+  int block_count = blocks->nodesetval->nodeNr;
+  *out_results = alloc_results_array(block_count, max_results);
+  if (!*out_results) {
+    free_xpath_objects(ctx, blocks);
+    return 0;
+  }
+
+  int found_count = 0;
+  for (int i = 0; i < block_count && found_count < max_results; i++) {
+    /* The per-result expressions are relative, so anchor the context. */
+    ctx->node = blocks->nodesetval->nodeTab[i];
+
+    xmlXPathObjectPtr link_obj = NULL, title_obj = NULL, snippet_obj = NULL;
+    xmlNodePtr link_node = first_match(ctx, sel->link, &link_obj);
+    xmlNodePtr title_node = first_match(ctx, sel->title, &title_obj);
+    xmlNodePtr snippet_node = first_match(ctx, sel->snippet, &snippet_obj);
+
+    char *url = link_node ? (char *)xmlGetProp(link_node, (xmlChar *)"href")
+                          : NULL;
+    char *title = title_node ? xml_node_content(title_node) : NULL;
+    char *snippet_text = snippet_node ? xml_node_content(snippet_node) : NULL;
+
+    if (url && title) {
+      assign_result(&(*out_results)[found_count], url, title, snippet_text,
+                    sel->unescape);
+      found_count++;
+    }
+
+    free_xml_node_list(title, url, snippet_text);
+    if (link_obj)
+      xmlXPathFreeObject(link_obj);
+    if (title_obj)
+      xmlXPathFreeObject(title_obj);
+    if (snippet_obj)
+      xmlXPathFreeObject(snippet_obj);
+  }
+
+  ctx->node = NULL;
+  free_xpath_objects(ctx, blocks);
+  return found_count;
+}
+
+/*
+ * DuckDuckGo Lite parser: each result-link anchor outside a sponsored row is a
+ * result; its snippet is read from the next tr sibling of the anchor's row.
+ * Returns the number of results stored in *out_results.
+ */
+static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
+                          SearchResult **out_results, int max_results) {
+  (void)engine_name;
+  int found_count = 0;
+
+  xmlXPathContextPtr ctx = create_xpath_context(doc);
+  if (!ctx)
+    return 0;
+
+  xmlXPathObjectPtr obj =
+      xml_xpath_eval(ctx, "//tr[not(contains(@class, "
+                          "'result-sponsored'))]//a[@class='result-link']");
+
+  if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
+    free_xpath_objects(ctx, obj);
+    return 0;
+  }
+
+  int num_links = obj->nodesetval->nodeNr;
+  *out_results = alloc_results_array(num_links, max_results);
+  if (!*out_results) {
+    free_xpath_objects(ctx, obj);
+    return 0;
+  }
+
+  for (int i = 0; i < num_links && found_count < max_results; i++) {
+    xmlNodePtr link_node = obj->nodesetval->nodeTab[i];
+    char *url = (char *)xmlGetProp(link_node, (xmlChar *)"href");
+    /* An anchor without href is not a usable result; the other parsers skip
+     * such entries too. */
+    if (!url)
+      continue;
+
+    char *title = xml_node_content(link_node);
+    char *snippet_text = NULL;
+
+    /* Climb to the table row that holds the link. */
+    xmlNodePtr current = link_node->parent;
+    while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
+      current = current->parent;
+
+    if (current && current->next) {
+      /* The snippet cell sits in the following tr sibling. */
+      xmlNodePtr snippet_row = current->next;
+      while (snippet_row &&
+             xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0)
+        snippet_row = snippet_row->next;
+      if (snippet_row) {
+        ctx->node = snippet_row;
+        xmlXPathObjectPtr s_obj =
+            xml_xpath_eval(ctx, ".//td[@class='result-snippet']");
+        if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0)
+          snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]);
+        if (s_obj)
+          xmlXPathFreeObject(s_obj);
+        ctx->node = NULL;
+      }
+    }
+
+    assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
+    free_xml_node_list(title, url, snippet_text);
+    found_count++;
+  }
+
+  free_xpath_objects(ctx, obj);
+  return found_count;
+}
+
+/* Startpage parser: result blocks, with URLs stored without unescaping. */
+static int parse_startpage(const char *engine_name, xmlDocPtr doc,
+                           SearchResult **out_results, int max_results) {
+  static const ResultSelectors SELECTORS = {
+      .container = "//div[contains(@class, 'result')]",
+      .link = ".//a[contains(@class, 'result-link')]",
+      .title = ".//h2[contains(@class, 'wgl-title')]",
+      .snippet = ".//p[contains(@class, 'description')]",
+      .unescape = 0};
+  (void)engine_name;
+  return parse_result_blocks(doc, &SELECTORS, out_results, max_results);
+}
+
+/* Yahoo parser: result blocks, with URLs run through unescape_search_url(). */
+static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
+                       SearchResult **out_results, int max_results) {
+  static const ResultSelectors SELECTORS = {
+      .container = "//div[contains(@class, 'algo-sr')]",
+      .link = ".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
+      .title = ".//h3[contains(@class, 'title')]",
+      .snippet = ".//div[contains(@class, 'compText')]//p",
+      .unescape = 1};
+  (void)engine_name;
+  return parse_result_blocks(doc, &SELECTORS, out_results, max_results);
+}
+
+/* Table of supported engines: request URL layout plus the parser for each. */
+const SearchEngine ENGINE_REGISTRY[] = {
+    {.name = "DuckDuckGo Lite",
+     .base_url = "https://lite.duckduckgo.com/lite/?q=",
+     .host_header = "lite.duckduckgo.com",
+     .referer = "https://lite.duckduckgo.com/",
+     .page_param = "s",
+     .page_multiplier = 30,
+     .page_base = 0,
+     .parser = parse_ddg_lite},
+    {.name = "Startpage",
+     .base_url = "https://www.startpage.com/sp/search?q=",
+     .host_header = "www.startpage.com",
+     .referer = "https://www.startpage.com/",
+     .page_param = "page",
+     .page_multiplier = 1,
+     .page_base = 1,
+     .parser = parse_startpage},
+    {.name = "Yahoo",
+     .base_url = "https://search.yahoo.com/search?p=",
+     .host_header = "search.yahoo.com",
+     .referer = "https://search.yahoo.com/",
+     .page_param = "b",
+     .page_multiplier = 10,
+     .page_base = 1,
+     .parser = parse_yahoo}};
+
+/* Element count of ENGINE_REGISTRY, derived from the array itself. */
+const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(ENGINE_REGISTRY[0]);