mirror of
https://git.bwaaa.monster/omnisearch
synced 2026-03-25 17:19:02 +02:00
270 lines
8.3 KiB
C
270 lines
8.3 KiB
C
#include "../Utility/Unescape.h"
|
|
#include "../Utility/XmlHelper.h"
|
|
#include "Config.h"
|
|
#include "Scraping.h"
|
|
#include <libxml/HTMLparser.h>
|
|
#include <libxml/xpath.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
/*
 * Create an XPath evaluation context for `doc`.
 *
 * Thin wrapper around xmlXPathNewContext(); the result is returned unchanged.
 * Callers in this file treat a NULL return as failure and release the context
 * with free_xpath_objects().
 */
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) {
  xmlXPathContextPtr xpath_ctx = xmlXPathNewContext(doc);
  return xpath_ctx;
}
|
|
|
|
/*
 * Release an XPath result object and its evaluation context.
 *
 * Either argument may be NULL, in which case it is skipped. The object is
 * released before the context.
 */
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) {
  if (obj != NULL) {
    xmlXPathFreeObject(obj);
  }
  if (ctx != NULL) {
    xmlXPathFreeContext(ctx);
  }
}
|
|
|
|
/*
 * Allocate a SearchResult array through xml_result_alloc().
 *
 * `capacity` is forwarded as the first argument and the smaller of `capacity`
 * and `max_results` as the second. The exact meaning of the second argument is
 * defined by xml_result_alloc() (not visible here).
 *
 * Returns whatever xml_result_alloc() returns; callers in this file treat
 * NULL as allocation failure.
 */
SearchResult *alloc_results_array(int capacity, int max_results) {
  int smaller = max_results;
  if (capacity < max_results) {
    smaller = capacity;
  }
  return xml_result_alloc(capacity, smaller);
}
|
|
|
|
/*
 * Fill one SearchResult with copies of the scraped strings.
 *
 * result    entry to populate; its url/title/snippet fields are overwritten.
 * url       link target, may be NULL.
 * title     link title, may be NULL ("No Title" is stored instead).
 * snippet   description text, may be NULL ("" is stored instead).
 * unescape  non-zero to run a non-NULL `url` through unescape_search_url()
 *           instead of copying it verbatim.
 *
 * The inputs are not retained: title and snippet are always strdup() copies,
 * and url is either a strdup() copy or the value returned by
 * unescape_search_url() (presumably also heap-allocated; confirm in
 * Unescape.h). A field can end up NULL if strdup() fails.
 */
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
                   int unescape) {
  /*
   * A missing URL (e.g. an anchor without href) is never forwarded to
   * unescape_search_url(): its NULL handling is not visible from here, so a
   * NULL url gets the same empty-string copy as on the non-unescape path.
   */
  if (unescape && url) {
    result->url = unescape_search_url(url);
  } else {
    result->url = strdup(url ? url : "");
  }

  result->title = strdup(title ? title : "No Title");
  result->snippet = strdup(snippet ? snippet : "");
}
|
|
|
|
/*
 * Release the three temporary strings collected for one result.
 *
 * Despite the name, this frees strings rather than nodes: each non-NULL
 * argument is handed to xmlFree(), in the order title, url, snippet.
 */
void free_xml_node_list(char *title, char *url, char *snippet) {
  char *owned[] = {title, url, snippet};

  for (size_t i = 0; i < sizeof owned / sizeof owned[0]; i++) {
    if (owned[i] != NULL) {
      xmlFree(owned[i]);
    }
  }
}
|
|
|
|
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
|
|
SearchResult **out_results, int max_results) {
|
|
(void)engine_name;
|
|
int found_count = 0;
|
|
|
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
|
if (!ctx)
|
|
return 0;
|
|
|
|
xmlXPathObjectPtr obj =
|
|
xml_xpath_eval(ctx, "//tr[not(contains(@class, "
|
|
"'result-sponsored'))]//a[@class='result-link']");
|
|
|
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
int num_links = obj->nodesetval->nodeNr;
|
|
*out_results = alloc_results_array(num_links, max_results);
|
|
if (!*out_results) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
for (int i = 0; i < num_links && found_count < max_results; i++) {
|
|
xmlNodePtr link_node = obj->nodesetval->nodeTab[i];
|
|
char *title = xml_node_content(link_node);
|
|
char *url = (char *)xmlGetProp(link_node, (xmlChar *)"href");
|
|
char *snippet_text = NULL;
|
|
|
|
xmlNodePtr current = link_node->parent;
|
|
while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
|
|
current = current->parent;
|
|
|
|
if (current && current->next) {
|
|
xmlNodePtr snippet_row = current->next;
|
|
while (snippet_row &&
|
|
xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0)
|
|
snippet_row = snippet_row->next;
|
|
if (snippet_row) {
|
|
ctx->node = snippet_row;
|
|
xmlXPathObjectPtr s_obj =
|
|
xml_xpath_eval(ctx, ".//td[@class='result-snippet']");
|
|
if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0)
|
|
snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]);
|
|
if (s_obj)
|
|
xmlXPathFreeObject(s_obj);
|
|
ctx->node = NULL;
|
|
}
|
|
}
|
|
|
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
|
|
free_xml_node_list(title, url, snippet_text);
|
|
found_count++;
|
|
}
|
|
|
|
free_xpath_objects(ctx, obj);
|
|
return found_count;
|
|
}
|
|
|
|
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
|
|
SearchResult **out_results, int max_results) {
|
|
(void)engine_name;
|
|
int found_count = 0;
|
|
|
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
|
if (!ctx)
|
|
return 0;
|
|
|
|
xmlXPathObjectPtr obj =
|
|
xml_xpath_eval(ctx, "//div[contains(@class, 'result')]");
|
|
|
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
int num_results = obj->nodesetval->nodeNr;
|
|
*out_results = alloc_results_array(num_results, max_results);
|
|
if (!*out_results) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
|
|
ctx->node = result_node;
|
|
|
|
xmlXPathObjectPtr link_obj =
|
|
xml_xpath_eval(ctx, ".//a[contains(@class, 'result-link')]");
|
|
char *url =
|
|
(link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
|
|
? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
|
|
(xmlChar *)"href")
|
|
: NULL;
|
|
|
|
xmlXPathObjectPtr title_obj =
|
|
xml_xpath_eval(ctx, ".//h2[contains(@class, 'wgl-title')]");
|
|
char *title = (title_obj && title_obj->nodesetval &&
|
|
title_obj->nodesetval->nodeNr > 0)
|
|
? xml_node_content(title_obj->nodesetval->nodeTab[0])
|
|
: NULL;
|
|
|
|
xmlXPathObjectPtr snippet_obj =
|
|
xml_xpath_eval(ctx, ".//p[contains(@class, 'description')]");
|
|
char *snippet_text =
|
|
(snippet_obj && snippet_obj->nodesetval &&
|
|
snippet_obj->nodesetval->nodeNr > 0)
|
|
? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
|
|
: NULL;
|
|
|
|
if (url && title) {
|
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 0);
|
|
found_count++;
|
|
}
|
|
|
|
free_xml_node_list(title, url, snippet_text);
|
|
if (link_obj)
|
|
xmlXPathFreeObject(link_obj);
|
|
if (title_obj)
|
|
xmlXPathFreeObject(title_obj);
|
|
if (snippet_obj)
|
|
xmlXPathFreeObject(snippet_obj);
|
|
}
|
|
|
|
ctx->node = NULL;
|
|
free_xpath_objects(ctx, obj);
|
|
return found_count;
|
|
}
|
|
|
|
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
|
|
SearchResult **out_results, int max_results) {
|
|
(void)engine_name;
|
|
int found_count = 0;
|
|
|
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
|
if (!ctx)
|
|
return 0;
|
|
|
|
xmlXPathObjectPtr obj =
|
|
xml_xpath_eval(ctx, "//div[contains(@class, 'algo-sr')]");
|
|
|
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
int num_results = obj->nodesetval->nodeNr;
|
|
*out_results = alloc_results_array(num_results, max_results);
|
|
if (!*out_results) {
|
|
free_xpath_objects(ctx, obj);
|
|
return 0;
|
|
}
|
|
|
|
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
|
|
ctx->node = result_node;
|
|
|
|
xmlXPathObjectPtr link_obj = xml_xpath_eval(
|
|
ctx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
|
|
char *url =
|
|
(link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
|
|
? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
|
|
(xmlChar *)"href")
|
|
: NULL;
|
|
|
|
xmlXPathObjectPtr title_obj =
|
|
xml_xpath_eval(ctx, ".//h3[contains(@class, 'title')]");
|
|
char *title = (title_obj && title_obj->nodesetval &&
|
|
title_obj->nodesetval->nodeNr > 0)
|
|
? xml_node_content(title_obj->nodesetval->nodeTab[0])
|
|
: NULL;
|
|
|
|
xmlXPathObjectPtr snippet_obj =
|
|
xml_xpath_eval(ctx, ".//div[contains(@class, 'compText')]//p");
|
|
char *snippet_text =
|
|
(snippet_obj && snippet_obj->nodesetval &&
|
|
snippet_obj->nodesetval->nodeNr > 0)
|
|
? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
|
|
: NULL;
|
|
|
|
if (url && title) {
|
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
|
|
found_count++;
|
|
}
|
|
|
|
free_xml_node_list(title, url, snippet_text);
|
|
if (link_obj)
|
|
xmlXPathFreeObject(link_obj);
|
|
if (title_obj)
|
|
xmlXPathFreeObject(title_obj);
|
|
if (snippet_obj)
|
|
xmlXPathFreeObject(snippet_obj);
|
|
}
|
|
|
|
ctx->node = NULL;
|
|
free_xpath_objects(ctx, obj);
|
|
return found_count;
|
|
}
|
|
|
|
const SearchEngine ENGINE_REGISTRY[] = {
|
|
{.name = "DuckDuckGo Lite",
|
|
.base_url = "https://lite.duckduckgo.com/lite/?q=",
|
|
.host_header = "lite.duckduckgo.com",
|
|
.referer = "https://lite.duckduckgo.com/",
|
|
.page_param = "s",
|
|
.page_multiplier = 30,
|
|
.page_base = 0,
|
|
.parser = parse_ddg_lite},
|
|
{.name = "Startpage",
|
|
.base_url = "https://www.startpage.com/sp/search?q=",
|
|
.host_header = "www.startpage.com",
|
|
.referer = "https://www.startpage.com/",
|
|
.page_param = "page",
|
|
.page_multiplier = 1,
|
|
.page_base = 1,
|
|
.parser = parse_startpage},
|
|
{.name = "Yahoo",
|
|
.base_url = "https://search.yahoo.com/search?p=",
|
|
.host_header = "search.yahoo.com",
|
|
.referer = "https://search.yahoo.com/",
|
|
.page_param = "b",
|
|
.page_multiplier = 10,
|
|
.page_base = 1,
|
|
.parser = parse_yahoo}};
|
|
|
|
/* Number of entries in ENGINE_REGISTRY, derived from the array itself. */
const int ENGINE_COUNT = sizeof ENGINE_REGISTRY / sizeof ENGINE_REGISTRY[0];
|