mirror of
https://git.bwaaa.monster/omnisearch
synced 2026-03-25 17:19:02 +02:00
237 lines
6.7 KiB
C
237 lines
6.7 KiB
C
#include "Wikipedia.h"
|
|
#include "../Cache/Cache.h"
|
|
#include "../Scraping/Scraping.h"
|
|
#include "../Utility/HttpClient.h"
|
|
#include "Config.h"
|
|
#include <curl/curl.h>
|
|
#include <libxml/parser.h>
|
|
#include <libxml/tree.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
static void shorten_summary(char **extract_ptr, int max_chars) {
|
|
if (!extract_ptr || !*extract_ptr)
|
|
return;
|
|
|
|
char *text = *extract_ptr;
|
|
int len = strlen(text);
|
|
|
|
if (len <= max_chars)
|
|
return;
|
|
|
|
int end_pos = max_chars;
|
|
for (int i = max_chars; i > (max_chars / 2); i--) {
|
|
if (text[i] == '.' || text[i] == '!' || text[i] == '?') {
|
|
end_pos = i + 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
char *new_text = (char *)malloc(end_pos + 4);
|
|
|
|
if (new_text) {
|
|
strncpy(new_text, text, end_pos);
|
|
new_text[end_pos] = '\0';
|
|
strcat(new_text, "...");
|
|
free(*extract_ptr);
|
|
*extract_ptr = new_text;
|
|
}
|
|
}
|
|
|
|
static void extract_wiki_info(xmlNode *node, InfoBox *info) {
|
|
xmlNode *cur_node = NULL;
|
|
|
|
for (cur_node = node; cur_node; cur_node = cur_node->next) {
|
|
if (cur_node->type == XML_ELEMENT_NODE) {
|
|
if (strcmp((const char *)cur_node->name, "page") == 0) {
|
|
xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title");
|
|
if (title) {
|
|
if (info->title) {
|
|
free(info->title);
|
|
}
|
|
info->title = strdup((const char *)title);
|
|
|
|
const char *base_article_url = "https://en.wikipedia.org/wiki/";
|
|
char *formatted_title = strdup((const char *)title);
|
|
for (int i = 0; formatted_title[i]; i++) {
|
|
if (formatted_title[i] == ' ')
|
|
formatted_title[i] = '_';
|
|
}
|
|
|
|
if (info->url) {
|
|
free(info->url);
|
|
}
|
|
info->url =
|
|
malloc(strlen(base_article_url) + strlen(formatted_title) + 1);
|
|
if (info->url) {
|
|
strcpy(info->url, base_article_url);
|
|
strcat(info->url, formatted_title);
|
|
}
|
|
free(formatted_title);
|
|
xmlFree(title);
|
|
}
|
|
}
|
|
|
|
if (strcmp((const char *)cur_node->name, "thumbnail") == 0) {
|
|
xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source");
|
|
if (source) {
|
|
if (info->thumbnail_url) {
|
|
free(info->thumbnail_url);
|
|
}
|
|
info->thumbnail_url = strdup((const char *)source);
|
|
xmlFree(source);
|
|
}
|
|
}
|
|
|
|
if (strcmp((const char *)cur_node->name, "extract") == 0) {
|
|
xmlChar *content = xmlNodeGetContent(cur_node);
|
|
if (content) {
|
|
if (info->extract) {
|
|
free(info->extract);
|
|
}
|
|
info->extract = strdup((const char *)content);
|
|
|
|
shorten_summary(&(info->extract), WIKI_SUMMARY_MAX_CHARS);
|
|
xmlFree(content);
|
|
}
|
|
}
|
|
}
|
|
extract_wiki_info(cur_node->children, info);
|
|
}
|
|
}
|
|
|
|
InfoBox fetch_wiki_data(char *api_url) {
|
|
InfoBox info = {NULL, NULL, NULL, NULL};
|
|
|
|
if (!api_url) {
|
|
return info;
|
|
}
|
|
|
|
char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
|
if (cache_key && get_cache_ttl_infobox() > 0) {
|
|
char *cached_data = NULL;
|
|
size_t cached_size = 0;
|
|
if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
|
|
&cached_size) == 0 &&
|
|
cached_data && cached_size > 0) {
|
|
xmlDocPtr doc =
|
|
xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
|
|
if (doc != NULL) {
|
|
xmlNode *root_element = xmlDocGetRootElement(doc);
|
|
extract_wiki_info(root_element, &info);
|
|
xmlFreeDoc(doc);
|
|
}
|
|
free(cached_data);
|
|
free(cache_key);
|
|
return info;
|
|
}
|
|
free(cached_data);
|
|
}
|
|
free(cache_key);
|
|
|
|
HttpResponse resp = http_get(api_url, "libcurl-agent/1.0");
|
|
if (resp.memory && resp.size > 0) {
|
|
cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
|
if (cache_key && get_cache_ttl_infobox() > 0) {
|
|
cache_set(cache_key, resp.memory, resp.size);
|
|
}
|
|
free(cache_key);
|
|
|
|
xmlDocPtr doc =
|
|
xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0);
|
|
if (doc != NULL) {
|
|
xmlNode *root_element = xmlDocGetRootElement(doc);
|
|
extract_wiki_info(root_element, &info);
|
|
xmlFreeDoc(doc);
|
|
}
|
|
}
|
|
|
|
http_response_free(&resp);
|
|
return info;
|
|
}
|
|
|
|
static xmlNode *find_node_recursive(xmlNode *node, const char *target_name) {
|
|
for (xmlNode *cur = node; cur; cur = cur->next) {
|
|
if (cur->type == XML_ELEMENT_NODE &&
|
|
strcmp((const char *)cur->name, target_name) == 0) {
|
|
return cur;
|
|
}
|
|
xmlNode *found = find_node_recursive(cur->children, target_name);
|
|
if (found)
|
|
return found;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static char *get_first_search_result(const char *search_term) {
|
|
char *escaped_term = curl_easy_escape(NULL, search_term, 0);
|
|
const char *search_base =
|
|
"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=";
|
|
const char *search_suffix = "&format=xml&origin=*&srlimit=1";
|
|
|
|
char *search_url = malloc(strlen(search_base) + strlen(escaped_term) +
|
|
strlen(search_suffix) + 1);
|
|
if (!search_url) {
|
|
curl_free(escaped_term);
|
|
return NULL;
|
|
}
|
|
|
|
strcpy(search_url, search_base);
|
|
strcat(search_url, escaped_term);
|
|
strcat(search_url, search_suffix);
|
|
|
|
curl_free(escaped_term);
|
|
|
|
HttpResponse resp = http_get(search_url, "libcurl-agent/1.0");
|
|
free(search_url);
|
|
|
|
char *first_title = NULL;
|
|
if (resp.memory && resp.size > 0) {
|
|
xmlDocPtr doc =
|
|
xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0);
|
|
if (doc) {
|
|
xmlNode *root = xmlDocGetRootElement(doc);
|
|
xmlNode *search_node = find_node_recursive(root, "search");
|
|
if (search_node) {
|
|
for (xmlNode *sr = search_node->children; sr; sr = sr->next) {
|
|
if (sr->type == XML_ELEMENT_NODE &&
|
|
strcmp((const char *)sr->name, "p") == 0) {
|
|
xmlChar *title = xmlGetProp(sr, (const xmlChar *)"title");
|
|
if (title) {
|
|
first_title = strdup((const char *)title);
|
|
xmlFree(title);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
xmlFreeDoc(doc);
|
|
}
|
|
}
|
|
|
|
http_response_free(&resp);
|
|
return first_title;
|
|
}
|
|
|
|
char *construct_wiki_url(const char *search_term) {
|
|
char *first_title = get_first_search_result(search_term);
|
|
if (!first_title)
|
|
return NULL;
|
|
|
|
char *escaped_title = curl_easy_escape(NULL, first_title, 0);
|
|
const char *base = "https://en.wikipedia.org/w/"
|
|
"api.php?action=query&prop=extracts|pageimages&exintro&"
|
|
"explaintext&pithumbsize=400&format=xml&origin=*&titles=";
|
|
|
|
char *full_url = malloc(strlen(base) + strlen(escaped_title) + 1);
|
|
if (full_url) {
|
|
strcpy(full_url, base);
|
|
strcat(full_url, escaped_title);
|
|
}
|
|
|
|
curl_free(escaped_title);
|
|
free(first_title);
|
|
return full_url;
|
|
}
|