Files
omnisearch/src/Scraping/ScrapingParsers.c
2026-03-17 13:51:12 -04:00

270 lines
8.3 KiB
C

#include "../Utility/Unescape.h"
#include "../Utility/XmlHelper.h"
#include "Config.h"
#include "Scraping.h"
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <stdlib.h>
#include <string.h>
/*
 * Create an XPath evaluation context for `doc`.
 *
 * Thin wrapper around xmlXPathNewContext(); its return value is handed back
 * unchanged. The parsers in this file treat a null result as failure and
 * release the context with free_xpath_objects().
 */
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) {
  xmlXPathContextPtr xpath_ctx = xmlXPathNewContext(doc);
  return xpath_ctx;
}
/*
 * Release an XPath result object and its evaluation context.
 *
 * Either argument may be null, in which case it is skipped. The result
 * object is released before the context.
 */
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) {
  if (obj != NULL) {
    xmlXPathFreeObject(obj);
  }
  if (ctx != NULL) {
    xmlXPathFreeContext(ctx);
  }
}
/*
 * Allocate the result array for a parser run.
 *
 * `capacity` is the number of candidate nodes the parser matched and
 * `max_results` is the caller's limit. The smaller of the two is forwarded,
 * together with `capacity`, to xml_result_alloc(), whose return value is
 * passed back unchanged.
 * NOTE(review): xml_result_alloc() is defined outside this file; its exact
 * sizing rule and failure value should be confirmed there.
 */
SearchResult *alloc_results_array(int capacity, int max_results) {
  int limit = max_results;
  if (capacity < max_results) {
    limit = capacity;
  }
  return xml_result_alloc(capacity, limit);
}
/*
 * Fill one SearchResult slot with copies of the scraped strings.
 *
 * result   - slot to populate; its url/title/snippet members are overwritten.
 * url      - link target; when `unescape` is non-zero it is passed as-is to
 *            unescape_search_url(), otherwise it is duplicated (a null url
 *            becomes an empty string).
 * title    - result title; null is replaced by "No Title".
 * snippet  - result description; null is replaced by an empty string.
 * unescape - non-zero selects unescape_search_url() for the url.
 *
 * The inputs are not freed or retained here; the caller keeps ownership of
 * `url`, `title` and `snippet`.
 */
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
                   int unescape) {
  const char *title_src = title ? title : "No Title";
  const char *snippet_src = snippet ? snippet : "";

  if (unescape) {
    result->url = unescape_search_url(url);
  } else {
    result->url = strdup(url ? url : "");
  }
  result->title = strdup(title_src);
  result->snippet = strdup(snippet_src);
}
/*
 * Release the temporary strings extracted for one result.
 *
 * Each non-null argument is handed to xmlFree(), in the order title, url,
 * snippet. Null arguments are skipped.
 */
void free_xml_node_list(char *title, char *url, char *snippet) {
  char *owned[] = {title, url, snippet};

  for (size_t i = 0; i < sizeof owned / sizeof owned[0]; i++) {
    if (owned[i] != NULL) {
      xmlFree(owned[i]);
    }
  }
}
/*
 * Extract search results from a DuckDuckGo Lite result page.
 *
 * engine_name - unused.
 * doc         - parsed HTML document to query.
 * out_results - receives the array from alloc_results_array() once at least
 *               one result link has been matched; it is not written on the
 *               earlier return paths.
 * max_results - upper bound on the number of results stored.
 *
 * Returns the number of entries filled in *out_results (0 when the context
 * cannot be created, no link matches, or allocation fails).
 *
 * Result links are the <a class='result-link'> elements that are not inside a
 * row whose class contains 'result-sponsored'. The snippet is looked up in
 * the next <tr> sibling following the row that holds the link.
 *
 * Links without an href attribute are skipped, matching the other parsers in
 * this file, which only store a result when a URL was found. This also avoids
 * handing a null pointer to unescape_search_url() through assign_result().
 */
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
                          SearchResult **out_results, int max_results) {
  (void)engine_name;

  xmlXPathContextPtr ctx = create_xpath_context(doc);
  if (!ctx) {
    return 0;
  }

  xmlXPathObjectPtr obj =
      xml_xpath_eval(ctx, "//tr[not(contains(@class, "
                          "'result-sponsored'))]//a[@class='result-link']");
  if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
    free_xpath_objects(ctx, obj);
    return 0;
  }

  int num_links = obj->nodesetval->nodeNr;
  *out_results = alloc_results_array(num_links, max_results);
  if (!*out_results) {
    free_xpath_objects(ctx, obj);
    return 0;
  }

  int found_count = 0;
  for (int i = 0; i < num_links && found_count < max_results; i++) {
    xmlNodePtr link_node = obj->nodesetval->nodeTab[i];

    /* A link with no href cannot produce a usable result; skip it before
     * doing any further work so nothing needs to be released. */
    char *url = (char *)xmlGetProp(link_node, (const xmlChar *)"href");
    if (!url) {
      continue;
    }

    char *title = xml_node_content(link_node);
    char *snippet_text = NULL;

    /* Climb to the table row that contains the link. */
    xmlNodePtr current = link_node->parent;
    while (current &&
           xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) {
      current = current->parent;
    }

    if (current && current->next) {
      /* Walk forward over siblings until the next <tr> is reached. */
      xmlNodePtr snippet_row = current->next;
      while (snippet_row &&
             xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0) {
        snippet_row = snippet_row->next;
      }

      if (snippet_row) {
        /* Evaluate the relative query against the snippet row, then reset
         * the context node so later queries are not tied to it. */
        ctx->node = snippet_row;
        xmlXPathObjectPtr s_obj =
            xml_xpath_eval(ctx, ".//td[@class='result-snippet']");
        if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0) {
          snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]);
        }
        if (s_obj) {
          xmlXPathFreeObject(s_obj);
        }
        ctx->node = NULL;
      }
    }

    /* assign_result() stores its own copies, so the extracted strings are
     * released right afterwards. */
    assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
    free_xml_node_list(title, url, snippet_text);
    found_count++;
  }

  free_xpath_objects(ctx, obj);
  return found_count;
}
/*
 * Extract search results from a Startpage result page.
 *
 * engine_name - unused.
 * doc         - parsed HTML document to query.
 * out_results - receives the array from alloc_results_array() once at least
 *               one candidate <div> has been matched; it is not written on
 *               the earlier return paths.
 * max_results - upper bound on the number of results stored.
 *
 * Returns the number of entries filled in *out_results (0 when the context
 * cannot be created, nothing matches, or allocation fails).
 *
 * Every <div> whose class contains 'result' is a candidate. Within each
 * candidate the first matching link, title heading and description paragraph
 * are read; the candidate is stored only when both a URL and a title were
 * found. URLs are stored without unescaping.
 */
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
                           SearchResult **out_results, int max_results) {
  (void)engine_name;

  xmlXPathContextPtr ctx = create_xpath_context(doc);
  if (!ctx) {
    return 0;
  }

  xmlXPathObjectPtr hits =
      xml_xpath_eval(ctx, "//div[contains(@class, 'result')]");
  if (!hits || !hits->nodesetval || hits->nodesetval->nodeNr == 0) {
    free_xpath_objects(ctx, hits);
    return 0;
  }

  int hit_count = hits->nodesetval->nodeNr;
  *out_results = alloc_results_array(hit_count, max_results);
  if (!*out_results) {
    free_xpath_objects(ctx, hits);
    return 0;
  }

  int stored = 0;
  for (int idx = 0; idx < hit_count && stored < max_results; idx++) {
    /* All relative queries below are evaluated against this candidate. */
    ctx->node = hits->nodesetval->nodeTab[idx];

    char *url = NULL;
    xmlXPathObjectPtr link_obj =
        xml_xpath_eval(ctx, ".//a[contains(@class, 'result-link')]");
    if (link_obj && link_obj->nodesetval &&
        link_obj->nodesetval->nodeNr > 0) {
      url = (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
                               (const xmlChar *)"href");
    }

    char *title = NULL;
    xmlXPathObjectPtr title_obj =
        xml_xpath_eval(ctx, ".//h2[contains(@class, 'wgl-title')]");
    if (title_obj && title_obj->nodesetval &&
        title_obj->nodesetval->nodeNr > 0) {
      title = xml_node_content(title_obj->nodesetval->nodeTab[0]);
    }

    char *snippet_text = NULL;
    xmlXPathObjectPtr snippet_obj =
        xml_xpath_eval(ctx, ".//p[contains(@class, 'description')]");
    if (snippet_obj && snippet_obj->nodesetval &&
        snippet_obj->nodesetval->nodeNr > 0) {
      snippet_text = xml_node_content(snippet_obj->nodesetval->nodeTab[0]);
    }

    /* A candidate without both URL and title is dropped. */
    if (url && title) {
      assign_result(&(*out_results)[stored], url, title, snippet_text, 0);
      stored++;
    }

    /* Temporaries are released whether or not the candidate was stored. */
    free_xml_node_list(title, url, snippet_text);
    if (link_obj) {
      xmlXPathFreeObject(link_obj);
    }
    if (title_obj) {
      xmlXPathFreeObject(title_obj);
    }
    if (snippet_obj) {
      xmlXPathFreeObject(snippet_obj);
    }
  }

  ctx->node = NULL;
  free_xpath_objects(ctx, hits);
  return stored;
}
/*
 * Extract search results from a Yahoo result page.
 *
 * engine_name - unused.
 * doc         - parsed HTML document to query.
 * out_results - receives the array from alloc_results_array() once at least
 *               one candidate <div> has been matched; it is not written on
 *               the earlier return paths.
 * max_results - upper bound on the number of results stored.
 *
 * Returns the number of entries filled in *out_results (0 when the context
 * cannot be created, nothing matches, or allocation fails).
 *
 * Every <div> whose class contains 'algo-sr' is a candidate. Within each
 * candidate the first matching title link, <h3> title and description
 * paragraph are read; the candidate is stored only when both a URL and a
 * title were found. URLs go through unescape_search_url() via
 * assign_result().
 */
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
                       SearchResult **out_results, int max_results) {
  (void)engine_name;

  xmlXPathContextPtr ctx = create_xpath_context(doc);
  if (!ctx) {
    return 0;
  }

  xmlXPathObjectPtr hits =
      xml_xpath_eval(ctx, "//div[contains(@class, 'algo-sr')]");
  if (!hits || !hits->nodesetval || hits->nodesetval->nodeNr == 0) {
    free_xpath_objects(ctx, hits);
    return 0;
  }

  int hit_count = hits->nodesetval->nodeNr;
  *out_results = alloc_results_array(hit_count, max_results);
  if (!*out_results) {
    free_xpath_objects(ctx, hits);
    return 0;
  }

  int stored = 0;
  for (int idx = 0; idx < hit_count && stored < max_results; idx++) {
    /* All relative queries below are evaluated against this candidate. */
    ctx->node = hits->nodesetval->nodeTab[idx];

    char *url = NULL;
    xmlXPathObjectPtr link_obj = xml_xpath_eval(
        ctx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
    if (link_obj && link_obj->nodesetval &&
        link_obj->nodesetval->nodeNr > 0) {
      url = (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
                               (const xmlChar *)"href");
    }

    char *title = NULL;
    xmlXPathObjectPtr title_obj =
        xml_xpath_eval(ctx, ".//h3[contains(@class, 'title')]");
    if (title_obj && title_obj->nodesetval &&
        title_obj->nodesetval->nodeNr > 0) {
      title = xml_node_content(title_obj->nodesetval->nodeTab[0]);
    }

    char *snippet_text = NULL;
    xmlXPathObjectPtr snippet_obj =
        xml_xpath_eval(ctx, ".//div[contains(@class, 'compText')]//p");
    if (snippet_obj && snippet_obj->nodesetval &&
        snippet_obj->nodesetval->nodeNr > 0) {
      snippet_text = xml_node_content(snippet_obj->nodesetval->nodeTab[0]);
    }

    /* A candidate without both URL and title is dropped. */
    if (url && title) {
      assign_result(&(*out_results)[stored], url, title, snippet_text, 1);
      stored++;
    }

    /* Temporaries are released whether or not the candidate was stored. */
    free_xml_node_list(title, url, snippet_text);
    if (link_obj) {
      xmlXPathFreeObject(link_obj);
    }
    if (title_obj) {
      xmlXPathFreeObject(title_obj);
    }
    if (snippet_obj) {
      xmlXPathFreeObject(snippet_obj);
    }
  }

  ctx->node = NULL;
  free_xpath_objects(ctx, hits);
  return stored;
}
/*
 * Table of supported search engines.
 *
 * Each entry names the engine, gives the URL prefix the query is appended
 * to, the Host and Referer values for the request, and the parser function
 * from this file that handles the engine's result page.
 * NOTE(review): page_param, page_multiplier and page_base presumably drive
 * how the pagination query parameter is built; the formula lives outside
 * this file and should be confirmed where SearchEngine is consumed.
 */
const SearchEngine ENGINE_REGISTRY[] = {
    {
        .name = "DuckDuckGo Lite",
        .base_url = "https://lite.duckduckgo.com/lite/?q=",
        .host_header = "lite.duckduckgo.com",
        .referer = "https://lite.duckduckgo.com/",
        .page_param = "s",
        .page_multiplier = 30,
        .page_base = 0,
        .parser = parse_ddg_lite,
    },
    {
        .name = "Startpage",
        .base_url = "https://www.startpage.com/sp/search?q=",
        .host_header = "www.startpage.com",
        .referer = "https://www.startpage.com/",
        .page_param = "page",
        .page_multiplier = 1,
        .page_base = 1,
        .parser = parse_startpage,
    },
    {
        .name = "Yahoo",
        .base_url = "https://search.yahoo.com/search?p=",
        .host_header = "search.yahoo.com",
        .referer = "https://search.yahoo.com/",
        .page_param = "b",
        .page_multiplier = 10,
        .page_base = 1,
        .parser = parse_yahoo,
    },
};
/* Number of entries in ENGINE_REGISTRY, derived from the array itself. */
const int ENGINE_COUNT =
    (int)(sizeof(ENGINE_REGISTRY) / sizeof(ENGINE_REGISTRY[0]));