diff --git a/.env.local.example b/.env.local.example
index 064b5394..34088f40 100644
--- a/.env.local.example
+++ b/.env.local.example
@@ -23,6 +23,11 @@ SEARXNG_PORT=8080 # default port
 SEARXNG_BIND_ADDRESS=0.0.0.0 # default address
 SEARXNG_IMAGE_PROXY=true # enable image proxy
 SEARXNG_LIMITER=false # can be enabled to limit the number of requests per IP address
+SEARXNG_DEFAULT_DEPTH=basic # Set to 'basic' or 'advanced', only affects SearXNG searches
+SEARXNG_MAX_RESULTS=50 # Maximum number of results to return from SearXNG
+SEARXNG_ENGINES=google,bing,duckduckgo,wikipedia # Search engines to use
+SEARXNG_TIME_RANGE=None # Time range for search results: day, week, month, year, or None (for all time)
+SEARXNG_SAFESEARCH=0 # Safe search setting: 0 (off), 1 (moderate), 2 (strict)
 
 # Optional
 # The settings below can be used optionally as needed.
@@ -59,4 +64,6 @@ SEARXNG_LIMITER=false # can be enabled to limit the number of requests per IP ad
 # enable the video search tool
 # Serper API Key retrieved here: https://serper.dev/api-key
-# SERPER_API_KEY=[YOUR_SERPER_API_KEY]
\ No newline at end of file
+# SERPER_API_KEY=[YOUR_SERPER_API_KEY]
+
+#NEXT_PUBLIC_BASE_URL=http://localhost:3000
\ No newline at end of file
diff --git a/README.md b/README.md
index be09c864..7080f5a0 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ An AI-powered search engine with a generative UI.
 - Specify the model to generate answers
 - Groq API support [※](https://github.com/miurla/morphic/pull/58)
 - Local Redis support
-- SearXNG Search API support
+- SearXNG Search API support with customizable depth (basic or advanced)
 
 ## 🧱 Stack
 
@@ -99,6 +101,18 @@ To use Upstash Redis:
 1. Set `USE_LOCAL_REDIS=false` or leave it unset in your `.env.local` file.
 2. Set `UPSTASH_REDIS_REST_URL` and `UPSTASH_REDIS_REST_TOKEN` with your Upstash credentials.
+
+# SearXNG Configuration
+SEARXNG_API_URL=http://localhost:8080 # Replace with your local SearXNG API URL or docker http://searxng:8080
+SEARCH_API=tavily # use searxng, tavily or exa
+SEARXNG_SECRET="" # generate a secret key e.g. openssl rand -base64 32
+SEARXNG_PORT=8080 # default port
+SEARXNG_BIND_ADDRESS=0.0.0.0 # default address
+SEARXNG_IMAGE_PROXY=true # enable image proxy
+SEARXNG_LIMITER=false # can be enabled to limit the number of requests per IP address
+SEARXNG_DEFAULT_DEPTH=basic # Set to 'basic' or 'advanced', only affects SearXNG searches
+SEARXNG_MAX_RESULTS=50 # Maximum number of results to return from SearXNG
+
 ```
 
 ### 5. Run app locally
@@ -160,38 +174,51 @@ This will allow you to use Morphic as your default search engine in the browser.
 
 ### Using SearXNG as an Alternative Search Backend
 
-Morphic now supports SearXNG as an alternative search backend. To use SearXNG:
+Morphic now supports SearXNG as an alternative search backend with advanced search capabilities. To use SearXNG:
 
 1. Ensure you have Docker and Docker Compose installed on your system.
 2. In your `.env.local` file, set the following variables:
 
+   - NEXT_PUBLIC_BASE_URL=http://localhost:3000 # Base URL for local development
    - SEARXNG_API_URL=http://localhost:8080 # Replace with your local SearXNG API URL or docker http://searxng:8080
    - SEARXNG_SECRET=your_secret_key_here
    - SEARXNG_PORT=8080
    - SEARXNG_IMAGE_PROXY=true
    - SEARCH_API=searxng
    - SEARXNG_LIMITER=false # can be enabled to limit the number of requests per IP
+   - SEARXNG_DEFAULT_DEPTH=basic # Set to 'basic' or 'advanced'
+   - SEARXNG_MAX_RESULTS=50 # Maximum number of results to return from SearXNG
+   - SEARXNG_ENGINES=google,bing,duckduckgo,wikipedia # can be overridden in searxng config
+   - SEARXNG_TIME_RANGE=None # Time range for search results
+   - SEARXNG_SAFESEARCH=0 # Safe search setting
+   - SEARXNG_CRAWL_MULTIPLIER=4 # Multiplier for the number of results to crawl in advanced search
 3. Two configuration files are provided in the root directory:
-- `searxng-settings.yml`: This file contains the main configuration for SearXNG, including engine settings and server options.
-- `searxng-limiter.toml`: This file configures the rate limiting and bot detection features of SearXNG.
+   - `searxng-settings.yml`: This file contains the main configuration for SearXNG, including engine settings and server options.
+   - `searxng-limiter.toml`: This file configures the rate limiting and bot detection features of SearXNG.
 4. Run `docker-compose up` to start the Morphic stack with SearXNG included.
 5. SearXNG will be available at `http://localhost:8080` and Morphic will use it as the search backend.
 
+#### Advanced Search Configuration
+
+- `NEXT_PUBLIC_BASE_URL`: Set this to your local development URL (http://localhost:3000) or your production URL when deploying.
+- `SEARXNG_DEFAULT_DEPTH`: Set to 'basic' or 'advanced' to control the default search depth.
+- `SEARXNG_MAX_RESULTS`: Maximum number of results to return from SearXNG.
+- `SEARXNG_CRAWL_MULTIPLIER`: In advanced search mode, this multiplier determines how many results to crawl. For example, if `SEARXNG_MAX_RESULTS=10` and `SEARXNG_CRAWL_MULTIPLIER=4`, up to 40 results will be crawled before filtering and ranking (see the sketch below).
+- `SEARXNG_ENGINES`: Comma-separated list of search engines to use.
+- `SEARXNG_TIME_RANGE`: Time range for search results (e.g., 'day', 'week', 'month', 'year', or 'None' for all time).
+- `SEARXNG_SAFESEARCH`: Safe search setting (0 for off, 1 for moderate, 2 for strict).
+
+The advanced search feature includes content crawling, relevance scoring, and filtering to provide more accurate and comprehensive results.
+
 #### Customizing SearXNG
 
 - You can modify `searxng-settings.yml` to enable/disable specific search engines, change UI settings, or adjust server options.
 - The `searxng-limiter.toml` file allows you to configure rate limiting and bot detection. This is useful if you're exposing SearXNG directly to the internet.
 - If you prefer not to use external configuration files, you can set these options using environment variables in the `docker-compose.yml` file or directly in the SearXNG container.
 
-#### Advanced Configuration
-
-- To disable the limiter entirely, set `LIMITER=false` in the SearXNG service environment variables.
-- For production use, consider adjusting the `SEARXNG_SECRET_KEY` to a secure, randomly generated value.
-- The `SEARXNG_IMAGE_PROXY` option allows SearXNG to proxy image results, enhancing privacy. Set to `true` to enable this feature.
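+
+As a rough sketch, the advanced-mode crawl budget follows from the two settings above (this mirrors the logic in `app/api/advanced-search/route.ts`, which also clamps `SEARXNG_MAX_RESULTS` to the range 10-100):
+
+```ts
+const maxResults = Math.max(
+  10,
+  Math.min(100, parseInt(process.env.SEARXNG_MAX_RESULTS || '50', 10))
+)
+const crawlMultiplier = parseInt(process.env.SEARXNG_CRAWL_MULTIPLIER || '4', 10)
+
+// Advanced mode crawls up to this many pages, then scores, filters,
+// and trims the list back down to maxResults.
+const crawlBudget = maxResults * crawlMultiplier // e.g. 10 * 4 = 40
+```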
-
 #### Troubleshooting
 
 - If you encounter issues with specific search engines (e.g., Wikidata), you can disable them in `searxng-settings.yml`:
diff --git a/app/api/advanced-search/route.ts b/app/api/advanced-search/route.ts
new file mode 100644
index 00000000..afa5b89e
--- /dev/null
+++ b/app/api/advanced-search/route.ts
@@ -0,0 +1,649 @@
+import { NextResponse } from 'next/server'
+import http from 'http'
+import https from 'https'
+import { JSDOM, VirtualConsole } from 'jsdom'
+import {
+  SearXNGSearchResults,
+  SearXNGResponse,
+  SearXNGResult,
+  SearchResultItem
+} from '@/lib/types'
+import { Agent } from 'http'
+import { Redis } from '@upstash/redis'
+import { createClient } from 'redis'
+
+/**
+ * Maximum number of results to fetch from SearXNG.
+ * Increasing this value can improve result quality but may impact performance.
+ * In advanced search mode, this is multiplied by SEARXNG_CRAWL_MULTIPLIER for initial fetching.
+ */
+const SEARXNG_MAX_RESULTS = Math.max(
+  10,
+  Math.min(100, parseInt(process.env.SEARXNG_MAX_RESULTS || '50', 10))
+)
+
+const CACHE_TTL = 3600 // Cache time-to-live in seconds (1 hour)
+const CACHE_EXPIRATION_CHECK_INTERVAL = 3600000 // 1 hour in milliseconds
+
+let redisClient: Redis | ReturnType<typeof createClient> | null = null
+
+// Initialize Redis client based on environment variables
+async function initializeRedisClient() {
+  if (redisClient) return redisClient
+
+  const useLocalRedis = process.env.USE_LOCAL_REDIS === 'true'
+
+  if (useLocalRedis) {
+    const localRedisUrl =
+      process.env.LOCAL_REDIS_URL || 'redis://localhost:6379'
+    redisClient = createClient({ url: localRedisUrl })
+    await redisClient.connect()
+  } else {
+    const upstashRedisRestUrl = process.env.UPSTASH_REDIS_REST_URL
+    const upstashRedisRestToken = process.env.UPSTASH_REDIS_REST_TOKEN
+
+    if (upstashRedisRestUrl && upstashRedisRestToken) {
+      redisClient = new Redis({
+        url: upstashRedisRestUrl,
+        token: upstashRedisRestToken
+      })
+    }
+  }
+
+  return redisClient
+}
+
+// Function to get cached results
+async function getCachedResults(
+  cacheKey: string
+): Promise<SearXNGSearchResults | null> {
+  try {
+    const client = await initializeRedisClient()
+    if (!client) return null
+
+    let cachedData: string | null
+    if (client instanceof Redis) {
+      cachedData = await client.get(cacheKey)
+    } else {
+      cachedData = await client.get(cacheKey)
+    }
+
+    if (cachedData) {
+      console.log(`Cache hit for key: ${cacheKey}`)
+      return JSON.parse(cachedData)
+    } else {
+      console.log(`Cache miss for key: ${cacheKey}`)
+      return null
+    }
+  } catch (error) {
+    console.error('Redis cache error:', error)
+    return null
+  }
+}
+
+// Function to set cached results with error handling and logging
+async function setCachedResults(
+  cacheKey: string,
+  results: SearXNGSearchResults
+): Promise<void> {
+  try {
+    const client = await initializeRedisClient()
+    if (!client) return
+
+    const serializedResults = JSON.stringify(results)
+    if (client instanceof Redis) {
+      await client.set(cacheKey, serializedResults, { ex: CACHE_TTL })
+    } else {
+      await client.set(cacheKey, serializedResults, { EX: CACHE_TTL })
+    }
+    console.log(`Cached results for key: ${cacheKey}`)
+  } catch (error) {
+    console.error('Redis cache error:', error)
+  }
+}
+
+// Function to periodically clean up expired cache entries
+async function cleanupExpiredCache() {
+  try {
+    const client = await initializeRedisClient()
+    if (!client) return
+
+    const keys = await client.keys('search:*')
+    for (const key of keys) {
+      const ttl = await client.ttl(key)
+      if (ttl <= 0) {
+        await client.del(key)
+        console.log(`Removed expired cache entry: ${key}`)
+      }
+    }
+  } catch (error) {
+    console.error('Cache cleanup error:', error)
+  }
+}
+
+// Set up periodic cache cleanup
+setInterval(cleanupExpiredCache, CACHE_EXPIRATION_CHECK_INTERVAL)
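+
+// Note: results are cached under
+// `search:<query>:<maxResults>:<searchDepth>:<includeDomains>:<excludeDomains>`
+// for CACHE_TTL seconds, so repeated identical searches are served from Redis.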
+export async function POST(request: Request) {
+  const { query, maxResults, searchDepth, includeDomains, excludeDomains } =
+    await request.json()
+
+  const SEARXNG_DEFAULT_DEPTH = process.env.SEARXNG_DEFAULT_DEPTH || 'basic'
+
+  try {
+    const cacheKey = `search:${query}:${maxResults}:${searchDepth}:${
+      Array.isArray(includeDomains) ? includeDomains.join(',') : ''
+    }:${Array.isArray(excludeDomains) ? excludeDomains.join(',') : ''}`
+
+    // Try to get cached results
+    const cachedResults = await getCachedResults(cacheKey)
+    if (cachedResults) {
+      return NextResponse.json(cachedResults)
+    }
+
+    // If not cached, perform the search
+    const results = await advancedSearchXNGSearch(
+      query,
+      Math.min(maxResults, SEARXNG_MAX_RESULTS),
+      searchDepth || SEARXNG_DEFAULT_DEPTH,
+      Array.isArray(includeDomains) ? includeDomains : [],
+      Array.isArray(excludeDomains) ? excludeDomains : []
+    )
+
+    // Cache the results
+    await setCachedResults(cacheKey, results)
+
+    return NextResponse.json(results)
+  } catch (error) {
+    console.error('Advanced search error:', error)
+    return NextResponse.json(
+      {
+        message: 'Internal Server Error',
+        error: error instanceof Error ? error.message : String(error),
+        query: query,
+        results: [],
+        images: [],
+        number_of_results: 0
+      },
+      { status: 500 }
+    )
+  }
+}
+
+async function advancedSearchXNGSearch(
+  query: string,
+  maxResults: number = 10,
+  searchDepth: 'basic' | 'advanced' = 'advanced',
+  includeDomains: string[] = [],
+  excludeDomains: string[] = []
+): Promise<SearXNGSearchResults> {
+  const apiUrl = process.env.SEARXNG_API_URL
+  if (!apiUrl) {
+    throw new Error('SEARXNG_API_URL is not set in the environment variables')
+  }
+
+  const SEARXNG_ENGINES =
+    process.env.SEARXNG_ENGINES || 'google,bing,duckduckgo,wikipedia'
+  const SEARXNG_TIME_RANGE = process.env.SEARXNG_TIME_RANGE || 'None'
+  const SEARXNG_SAFESEARCH = process.env.SEARXNG_SAFESEARCH || '0'
+  const SEARXNG_CRAWL_MULTIPLIER = parseInt(
+    process.env.SEARXNG_CRAWL_MULTIPLIER || '4',
+    10
+  )
+
+  try {
+    const url = new URL(`${apiUrl}/search`)
+    url.searchParams.append('q', query)
+    url.searchParams.append('format', 'json')
+    url.searchParams.append('categories', 'general,images')
+
+    // Add time_range if it's not 'None'
+    if (SEARXNG_TIME_RANGE !== 'None') {
+      url.searchParams.append('time_range', SEARXNG_TIME_RANGE)
+    }
+
+    url.searchParams.append('safesearch', SEARXNG_SAFESEARCH)
+    url.searchParams.append('engines', SEARXNG_ENGINES)
+
+    const resultsPerPage = 10
+    const pageno = Math.ceil(maxResults / resultsPerPage)
+    url.searchParams.append('pageno', String(pageno))
+
+    //console.log('SearXNG API URL:', url.toString()) // Log the full URL for debugging
+
+    const data:
+      | SearXNGResponse
+      | { error: string; status: number; data: string } =
+      await fetchJsonWithRetry(url.toString(), 3)
+
+    if ('error' in data) {
+      console.error('Invalid response from SearXNG:', data)
+      throw new Error(
+        `Invalid response from SearXNG: ${data.error}. Status: ${data.status}. Data: ${data.data}`
+      )
+    }
+
+    if (!data || !Array.isArray(data.results)) {
+      console.error('Invalid response structure from SearXNG:', data)
+      throw new Error('Invalid response structure from SearXNG')
+    }
+
+    let generalResults = data.results.filter(
+      (result: SearXNGResult) => result && !result.img_src
+    )
+
+    // Apply domain filtering manually
+    if (includeDomains.length > 0 || excludeDomains.length > 0) {
+      generalResults = generalResults.filter(result => {
+        const domain = new URL(result.url).hostname
+        return (
+          (includeDomains.length === 0 ||
+            includeDomains.some(d => domain.includes(d))) &&
+          (excludeDomains.length === 0 ||
+            !excludeDomains.some(d => domain.includes(d)))
+        )
+      })
+    }
+
+    if (searchDepth === 'advanced') {
+      const crawledResults = await Promise.all(
+        generalResults
+          .slice(0, maxResults * SEARXNG_CRAWL_MULTIPLIER)
+          .map(result => crawlPage(result, query))
+      )
+      generalResults = crawledResults
+        .filter(result => result !== null && isQualityContent(result.content))
+        .map(result => result as SearXNGResult)
+
+      const MIN_RELEVANCE_SCORE = 10
+      generalResults = generalResults
+        .map(result => ({
+          ...result,
+          score: calculateRelevanceScore(result, query)
+        }))
+        .filter(result => result.score >= MIN_RELEVANCE_SCORE)
+        .sort((a, b) => b.score - a.score)
+        .slice(0, maxResults)
+    }
+
+    generalResults = generalResults.slice(0, maxResults)
+
+    const imageResults = (data.results || [])
+      .filter((result: SearXNGResult) => result && result.img_src)
+      .slice(0, maxResults)
+
+    return {
+      results: generalResults.map(
+        (result: SearXNGResult): SearchResultItem => ({
+          title: result.title || '',
+          url: result.url || '',
+          content: result.content || ''
+        })
+      ),
+      query: data.query || query,
+      images: imageResults
+        .map((result: SearXNGResult) => {
+          const imgSrc = result.img_src || ''
+          return imgSrc.startsWith('http') ? imgSrc : `${apiUrl}${imgSrc}`
+        })
+        .filter(Boolean),
+      number_of_results: data.number_of_results || generalResults.length
+    }
+  } catch (error) {
+    console.error('SearXNG API error:', error)
+    return {
+      results: [],
+      query: query,
+      images: [],
+      number_of_results: 0
+    }
+  }
+}
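+
+// crawlPage fetches a result's HTML, strips script/style/nav boilerplate,
+// extracts headings, paragraphs, and meta tags, and truncates the text to
+// 10,000 characters before query-term highlighting.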
+async function crawlPage(
+  result: SearXNGResult,
+  query: string
+): Promise<SearXNGResult | null> {
+  try {
+    const html = await fetchHtmlWithTimeout(result.url, 20000)
+
+    // virtual console to suppress JSDOM warnings
+    const virtualConsole = new VirtualConsole()
+    virtualConsole.on('error', () => {})
+    virtualConsole.on('warn', () => {})
+
+    const dom = new JSDOM(html, {
+      runScripts: 'outside-only',
+      resources: 'usable',
+      virtualConsole
+    })
+    const document = dom.window.document
+
+    // Remove script, style, nav, header, and footer elements
+    document
+      .querySelectorAll('script, style, nav, header, footer')
+      .forEach((el: Element) => el.remove())
+
+    const mainContent =
+      document.querySelector('main') ||
+      document.querySelector('article') ||
+      document.querySelector('.content') ||
+      document.querySelector('#content') ||
+      document.body
+
+    if (mainContent) {
+      // Prioritize specific content elements
+      const priorityElements = mainContent.querySelectorAll('h1, h2, h3, p')
+      let extractedText = Array.from(priorityElements)
+        .map(el => el.textContent?.trim())
+        .filter(Boolean)
+        .join('\n\n')
+
+      // If not enough content, fall back to other elements
+      if (extractedText.length < 500) {
+        const contentElements = mainContent.querySelectorAll(
+          'h4, h5, h6, li, td, th, blockquote, pre, code'
+        )
+        extractedText +=
+          '\n\n' +
+          Array.from(contentElements)
+            .map(el => el.textContent?.trim())
+            .filter(Boolean)
+            .join('\n\n')
+      }
+
+      // Extract metadata
+      const metaDescription =
+        document
+          .querySelector('meta[name="description"]')
+          ?.getAttribute('content') || ''
+      const metaKeywords =
+        document
+          .querySelector('meta[name="keywords"]')
+          ?.getAttribute('content') || ''
+      const ogTitle =
+        document
+          .querySelector('meta[property="og:title"]')
+          ?.getAttribute('content') || ''
+      const ogDescription =
+        document
+          .querySelector('meta[property="og:description"]')
+          ?.getAttribute('content') || ''
+
+      // Combine metadata with extracted text
+      extractedText = `${result.title}\n\n${ogTitle}\n\n${metaDescription}\n\n${ogDescription}\n\n${metaKeywords}\n\n${extractedText}`
+
+      // Limit the extracted text to 10000 characters
+      extractedText = extractedText.substring(0, 10000)
+
+      // Highlight query terms in the content
+      result.content = highlightQueryTerms(extractedText, query)
+
+      // Extract publication date
+      const publishedDate = extractPublicationDate(document)
+      if (publishedDate) {
+        result.publishedDate = publishedDate.toISOString()
+      }
+    }
+
+    return result
+  } catch (error) {
+    console.error(`Error crawling ${result.url}:`, error)
+    return {
+      ...result,
+      content: result.content || 'Content unavailable due to crawling error.'
+    }
+  }
+}
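+
+// highlightQueryTerms wraps whole-word query matches in <mark> tags;
+// calculateRelevanceScore counts those tags below to boost results that
+// contain more matched terms.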
+function highlightQueryTerms(content: string, query: string): string {
+  try {
+    const terms = query
+      .toLowerCase()
+      .split(/\s+/)
+      .filter(term => term.length > 2)
+      .map(term => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) // Escape special characters
+
+    let highlightedContent = content
+
+    terms.forEach(term => {
+      const regex = new RegExp(`\\b${term}\\b`, 'gi')
+      highlightedContent = highlightedContent.replace(
+        regex,
+        match => `<mark>${match}</mark>`
+      )
+    })
+
+    return highlightedContent
+  } catch (error) {
+    //console.error('Error in highlightQueryTerms:', error)
+    return content // Return original content if highlighting fails
+  }
+}
+
+function calculateRelevanceScore(result: SearXNGResult, query: string): number {
+  try {
+    const lowercaseContent = result.content.toLowerCase()
+    const lowercaseQuery = query.toLowerCase()
+    const queryWords = lowercaseQuery
+      .split(/\s+/)
+      .filter(word => word.length > 2)
+      .map(word => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) // Escape special characters
+
+    let score = 0
+
+    // Check for exact phrase match
+    if (lowercaseContent.includes(lowercaseQuery)) {
+      score += 30
+    }
+
+    // Check for individual word matches
+    queryWords.forEach(word => {
+      const regex = new RegExp(`\\b${word}\\b`, 'g')
+      const wordCount = (lowercaseContent.match(regex) || []).length
+      score += wordCount * 3
+    })
+
+    // Boost score for matches in the title
+    const lowercaseTitle = result.title.toLowerCase()
+    if (lowercaseTitle.includes(lowercaseQuery)) {
+      score += 20
+    }
+
+    queryWords.forEach(word => {
+      const regex = new RegExp(`\\b${word}\\b`, 'g')
+      if (lowercaseTitle.match(regex)) {
+        score += 10
+      }
+    })
+
+    // Boost score for recent content (if available)
+    if (result.publishedDate) {
+      const publishDate = new Date(result.publishedDate)
+      const now = new Date()
+      const daysSincePublished =
+        (now.getTime() - publishDate.getTime()) / (1000 * 3600 * 24)
+      if (daysSincePublished < 30) {
+        score += 15
+      } else if (daysSincePublished < 90) {
+        score += 10
+      } else if (daysSincePublished < 365) {
+        score += 5
+      }
+    }
+
+    // Penalize very short content
+    if (result.content.length < 200) {
+      score -= 10
+    } else if (result.content.length > 1000) {
+      score += 5
+    }
+
+    // Boost score for content with more highlighted terms
+    const highlightCount = (result.content.match(/<mark>/g) || []).length
+    score += highlightCount * 2
+
+    return score
+  } catch (error) {
+    //console.error('Error in calculateRelevanceScore:', error)
+    return 0 // Return 0 if scoring fails
+  }
+}
+
+function extractPublicationDate(document: Document): Date | null {
+  const dateSelectors = [
+    'meta[name="article:published_time"]',
+    'meta[property="article:published_time"]',
+    'meta[name="publication-date"]',
+    'meta[name="date"]',
+    'time[datetime]',
+    'time[pubdate]'
+  ]
+
+  for (const selector of dateSelectors) {
+    const element = document.querySelector(selector)
+    if (element) {
+      const dateStr =
+        element.getAttribute('content') ||
+        element.getAttribute('datetime') ||
+        element.getAttribute('pubdate')
+      if (dateStr) {
+        const date = new Date(dateStr)
+        if (!isNaN(date.getTime())) {
+          return date
+        }
+      }
+    }
+  }
+
+  return null
+}
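+
+// Shared keep-alive agents are reused for all outbound requests;
+// fetchJsonWithRetry retries with linear backoff (1s, 2s, ...) before
+// surfacing the last error.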
+const httpAgent = new http.Agent({ keepAlive: true })
+const httpsAgent = new https.Agent({
+  keepAlive: true,
+  rejectUnauthorized: true // change to false to ignore SSL certificate errors, but use this with caution
+})
+
+async function fetchJsonWithRetry(url: string, retries: number): Promise<any> {
+  for (let i = 0; i < retries; i++) {
+    try {
+      return await fetchJson(url)
+    } catch (error) {
+      if (i === retries - 1) throw error
+      await new Promise(resolve => setTimeout(resolve, 1000 * (i + 1)))
+    }
+  }
+}
+
+function fetchJson(url: string): Promise<any> {
+  return new Promise((resolve, reject) => {
+    const protocol = url.startsWith('https:') ? https : http
+    const agent = url.startsWith('https:') ? httpsAgent : httpAgent
+    const request = protocol.get(url, { agent }, res => {
+      let data = ''
+      res.on('data', chunk => {
+        data += chunk
+      })
+      res.on('end', () => {
+        try {
+          // Check if the response is JSON
+          if (res.headers['content-type']?.includes('application/json')) {
+            resolve(JSON.parse(data))
+          } else {
+            // If not JSON, return an object with the raw data and status
+            resolve({
+              error: 'Invalid JSON response',
+              status: res.statusCode,
+              data: data.substring(0, 200) // Include first 200 characters of the response
+            })
+          }
+        } catch (e) {
+          reject(e)
+        }
+      })
+    })
+    request.on('error', reject)
+    request.on('timeout', () => {
+      request.destroy()
+      reject(new Error('Request timed out'))
+    })
+    request.setTimeout(15000) // 15 second timeout
+  })
+}
+
+async function fetchHtmlWithTimeout(
+  url: string,
+  timeoutMs: number
+): Promise<string> {
+  try {
+    return await Promise.race([
+      fetchHtml(url),
+      timeout(timeoutMs, `Fetching ${url} timed out after ${timeoutMs}ms`)
+    ])
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error)
+    const errorMessage = error instanceof Error ? error.message : String(error)
+    return `Error fetching content: ${errorMessage}`
+  }
+}
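+
+// fetchHtml follows 3xx redirects recursively and resolves to an empty
+// string on timeout, so a single slow page does not fail the whole crawl.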
+function fetchHtml(url: string): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const protocol = url.startsWith('https:') ? https : http
+    const agent = url.startsWith('https:') ? httpsAgent : httpAgent
+    const request = protocol.get(url, { agent }, res => {
+      if (
+        res.statusCode &&
+        res.statusCode >= 300 &&
+        res.statusCode < 400 &&
+        res.headers.location
+      ) {
+        // Handle redirects
+        fetchHtml(new URL(res.headers.location, url).toString())
+          .then(resolve)
+          .catch(reject)
+        return
+      }
+      let data = ''
+      res.on('data', chunk => {
+        data += chunk
+      })
+      res.on('end', () => resolve(data))
+    })
+    request.on('error', error => {
+      //console.error(`Error fetching ${url}:`, error)
+      reject(error)
+    })
+    request.on('timeout', () => {
+      request.destroy()
+      //reject(new Error(`Request timed out for ${url}`))
+      resolve('')
+    })
+    request.setTimeout(10000) // 10 second timeout
+  })
+}
+
+function timeout(ms: number, message: string): Promise<never> {
+  return new Promise((_, reject) => {
+    setTimeout(() => {
+      reject(new Error(message))
+    }, ms)
+  })
+}
+
+function isQualityContent(text: string): boolean {
+  const words = text.split(/\s+/).length
+  const sentences = text.split(/[.!?]+/).length
+  const avgWordsPerSentence = words / sentences
+
+  return (
+    words > 50 &&
+    sentences > 3 &&
+    avgWordsPerSentence > 5 &&
+    avgWordsPerSentence < 30 &&
+    !text.includes('Content unavailable due to crawling error') &&
+    !text.includes('Error fetching content:')
+  )
+}
diff --git a/bun.lockb b/bun.lockb
index 66c54b66..4cc856e3 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/lib/agents/tools/search.tsx b/lib/agents/tools/search.tsx
index 79129549..4c53538b 100644
--- a/lib/agents/tools/search.tsx
+++ b/lib/agents/tools/search.tsx
@@ -40,20 +40,52 @@ export const searchTool = ({ uiStream, fullResponse }: ToolProps) =>
     let searchResult: SearchResults
     const searchAPI =
       (process.env.SEARCH_API as 'tavily' | 'exa' | 'searxng') || 'tavily'
-    console.log(`Using search API: ${searchAPI}`)
+
+    const effectiveSearchDepth =
+      searchAPI === 'searxng' &&
+      process.env.SEARXNG_DEFAULT_DEPTH === 'advanced'
+        ? 'advanced'
+        : search_depth || 'basic'
+
+    console.log(
+      `Using search API: ${searchAPI}, Search Depth: ${effectiveSearchDepth}`
+    )
 
     try {
-      searchResult = await (searchAPI === 'tavily'
-        ? tavilySearch
-        : searchAPI === 'exa'
-        ? exaSearch
-        : searxngSearch)(
-        filledQuery,
-        max_results,
-        search_depth,
-        include_domains,
-        exclude_domains
-      )
+      if (searchAPI === 'searxng' && effectiveSearchDepth === 'advanced') {
+        // API route for advanced SearXNG search
+        const baseUrl =
+          process.env.NEXT_PUBLIC_BASE_URL || 'http://localhost:3000'
+        const response = await fetch(`${baseUrl}/api/advanced-search`, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            query: filledQuery,
+            maxResults: max_results,
+            searchDepth: effectiveSearchDepth,
+            includeDomains: include_domains,
+            excludeDomains: exclude_domains
+          })
+        })
+        if (!response.ok) {
+          throw new Error(
+            `Advanced search API error: ${response.status} ${response.statusText}`
+          )
+        }
+        searchResult = await response.json()
+      } else {
+        searchResult = await (searchAPI === 'tavily'
+          ? tavilySearch
+          : searchAPI === 'exa'
+          ? exaSearch
+          : searxngSearch)(
+          filledQuery,
+          max_results,
+          effectiveSearchDepth,
+          include_domains,
+          exclude_domains
+        )
+      }
     } catch (error) {
       console.error('Search API error:', error)
       hasError = true
@@ -171,9 +203,9 @@ async function exaSearch(
 async function searxngSearch(
   query: string,
   maxResults: number = 10,
-  _searchDepth: string,
-  includeDomains: string[] = [], //keep for future use
-  excludeDomains: string[] = [] //keep for future use
+  searchDepth: string,
+  includeDomains: string[] = [],
+  excludeDomains: string[] = []
 ): Promise<SearchResults> {
   const apiUrl = process.env.SEARXNG_API_URL
   if (!apiUrl) {
@@ -185,9 +217,19 @@ async function searxngSearch(
   const url = new URL(`${apiUrl}/search`)
   url.searchParams.append('q', query)
   url.searchParams.append('format', 'json')
-  // Enable both general and image results
   url.searchParams.append('categories', 'general,images')
 
+  // Apply search depth settings
+  if (searchDepth === 'advanced') {
+    url.searchParams.append('time_range', '')
+    url.searchParams.append('safesearch', '0')
+    url.searchParams.append('engines', 'google,bing,duckduckgo,wikipedia')
+  } else {
+    url.searchParams.append('time_range', 'year')
+    url.searchParams.append('safesearch', '1')
+    url.searchParams.append('engines', 'google,bing')
+  }
+
   // Fetch results from SearXNG
   const response = await fetch(url.toString(), {
     method: 'GET',
@@ -236,4 +278,4 @@ async function searxngSearch(
     console.error('SearXNG API error:', error)
     throw error
   }
-}
+}
\ No newline at end of file
diff --git a/lib/types/index.ts b/lib/types/index.ts
index ff7c5da7..079963f8 100644
--- a/lib/types/index.ts
+++ b/lib/types/index.ts
@@ -87,6 +87,8 @@ export interface SearXNGResult {
   url: string
   content: string
   img_src?: string
+  publishedDate?: string
+  score?: number
 }
 
 export interface SearXNGResponse {
@@ -94,3 +96,14 @@ export interface SearXNGResponse {
   number_of_results: number
   results: SearXNGResult[]
 }
+
+export type SearXNGImageResult = string
+
+export type SearXNGSearchResults = {
+  images: SearXNGImageResult[]
+  results: SearchResultItem[]
+  number_of_results?: number
+  query: string
+}
diff --git a/package.json b/package.json
index ab4a87ed..aa02039e 100644
--- a/package.json
+++ b/package.json
@@ -32,10 +32,12 @@
     "clsx": "^2.1.0",
     "embla-carousel-react": "^8.0.0",
     "exa-js": "^1.0.12",
+    "jsdom": "^22.1.0",
     "katex": "^0.16.10",
     "lucide-react": "^0.363.0",
     "next": "^14.2.3",
     "next-themes": "^0.3.0",
+    "node-html-parser": "^6.1.13",
     "ollama-ai-provider": "^0.7.0",
     "react-dom": "^18",
     "react-icons": "^5.0.1",
@@ -53,6 +55,7 @@
     "zod": "^3.23.8"
   },
   "devDependencies": {
+    "@types/jsdom": "^21.1.7",
     "@types/node": "^20",
     "@types/react": "^18",
     "@types/react-dom": "^18",
diff --git a/searxng-settings.yml b/searxng-settings.yml
index 410aa40b..685db6f3 100755
--- a/searxng-settings.yml
+++ b/searxng-settings.yml
@@ -29,3 +29,7 @@ server:
   X-Download-Options: noopen
   X-Robots-Tag: noindex, nofollow
   Referrer-Policy: no-referrer
+
+search:
+  formats:
+    - json
\ No newline at end of file
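
For reference, once the stack is running the new route can be exercised directly. A minimal sketch (the request fields mirror the `POST` handler in `app/api/advanced-search/route.ts` above; the localhost URL assumes the default `NEXT_PUBLIC_BASE_URL`):

```ts
// Hypothetical manual check of the advanced-search route added in this PR.
const res = await fetch('http://localhost:3000/api/advanced-search', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    query: 'open source search engines',
    maxResults: 10,
    searchDepth: 'advanced', // falls back to SEARXNG_DEFAULT_DEPTH when omitted
    includeDomains: [],
    excludeDomains: []
  })
})

// Shape matches the SearXNGSearchResults type added in lib/types/index.ts.
const { results, images, number_of_results } = await res.json()
```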