"""Tool for the Tavily Crawl API."""
from typing import Any, Dict, List, Literal, Optional, Type
from langchain_core.callbacks import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool, ToolException
from pydantic import BaseModel, Field
from langchain_tavily._utilities import TavilyCrawlAPIWrapper
def _generate_suggestions(params: dict) -> list:
"""Generate helpful suggestions based on the failed crawl parameters."""
suggestions = []
instructions = params.get("instructions")
select_paths = params.get("select_paths")
select_domains = params.get("select_domains")
exclude_paths = params.get("exclude_paths")
exclude_domains = params.get("exclude_domains")
categories = params.get("categories")
if instructions:
suggestions.append("Try more consice instructions")
if select_paths:
suggestions.append("Remove select_paths argument")
if select_domains:
suggestions.append("Remove select_domains argument")
if exclude_paths:
suggestions.append("Remove exclude_paths argument")
if exclude_domains:
suggestions.append("Remove exclude_domains argument")
if categories:
suggestions.append("Remove categories argument")
return suggestions
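

# The tool below references a `TavilyCrawlInput` schema for its `args_schema`.
# That schema ships with langchain-tavily in this same module but is not shown
# in this excerpt, so the class below is a minimal, assumed sketch: only the
# required `url` field is spelled out, and the description text is illustrative
# rather than the library's exact wording. The published schema additionally
# exposes the optional crawl-control parameters (max_depth, max_breadth, limit,
# instructions, select_paths, select_domains, exclude_paths, exclude_domains,
# allow_external, include_images, categories, extract_depth, include_favicon)
# so an agent can set them per call; they are omitted here for brevity.
class TavilyCrawlInput(BaseModel):
    """Input schema for the Tavily Crawl tool (minimal sketch)."""

    url: str = Field(description="The root URL from which to begin the crawl.")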
class TavilyCrawl(BaseTool): # type: ignore[override]
"""Tool that sends requests to the Tavily Crawl API with dynamically settable parameters.""" # noqa: E501
name: str = "tavily_crawl"
description: str = """A powerful web crawler that initiates a structured web crawl starting from a specified
    base URL. The crawler uses BFS depth, meaning the number of link hops from the root URL.
A page directly linked from the root is at BFS depth 1, regardless of its URL structure.
You can control how deep and wide it goes, and guide it to focus on specific sections of the site.
""" # noqa: E501
args_schema: Type[BaseModel] = TavilyCrawlInput
handle_tool_error: bool = True
max_depth: Optional[int] = None
"""Max depth of the crawl. Defines how far from the base URL the crawler can explore.
max_depth must be greater than 0
default is 1
""" # noqa: E501
max_breadth: Optional[int] = None
"""The maximum number of links to follow per level of the tree (i.e., per page).
max_breadth must be greater than 0
default is 20
"""
limit: Optional[int] = None
"""Total number of links the crawler will process before stopping.
limit must be greater than 0
default is 50
"""
instructions: Optional[str] = None
"""Natural language instructions for the crawler.
ex. "Python SDK"
"""
select_paths: Optional[List[str]] = None
"""Regex patterns to select only URLs with specific path patterns.
ex. ["/api/v1.*"]
"""
select_domains: Optional[List[str]] = None
"""Regex patterns to select only URLs from specific domains or subdomains.
ex. ["^docs\.example\.com$"]
"""
    exclude_paths: Optional[List[str]] = None
    """Regex patterns to exclude URLs with specific path patterns.
    ex. ["/private/.*", "/admin/.*"]
    """
    exclude_domains: Optional[List[str]] = None
    """Regex patterns to exclude specific domains or subdomains from crawling.
    ex. ["^private\.example\.com$"]
    """
allow_external: Optional[bool] = None
"""Whether to allow following links that go to external domains.
default is False
"""
include_images: Optional[bool] = None
"""Whether to include images in the crawl results.
default is False
"""
categories: Optional[
List[
Literal[
"Careers",
"Blogs",
"Documentation",
"About",
"Pricing",
"Community",
"Developers",
"Contact",
"Media",
]
]
] = None
"""Filter URLs using predefined categories like 'Documentation', 'Blogs', etc.
"""
extract_depth: Optional[Literal["basic", "advanced"]] = None
"""Advanced extraction retrieves more data, including tables and embedded content,
with higher success but may increase latency.
default is basic
"""
format: Optional[str] = None
"""
The format of the extracted web page content. markdown returns content in markdown
format. text returns plain text and may increase latency.
default is markdown
"""
include_favicon: Optional[bool] = None
"""Whether to include the favicon URL for each result.
Default is False.
"""
api_wrapper: TavilyCrawlAPIWrapper = Field(default_factory=TavilyCrawlAPIWrapper) # type: ignore[arg-type]
def __init__(self, **kwargs: Any) -> None:
# Create api_wrapper with tavily_api_key if provided
if "tavily_api_key" in kwargs:
kwargs["api_wrapper"] = TavilyCrawlAPIWrapper(
tavily_api_key=kwargs["tavily_api_key"]
)
super().__init__(**kwargs)
def _run(
self,
url: str,
max_depth: Optional[int] = None,
max_breadth: Optional[int] = None,
limit: Optional[int] = None,
instructions: Optional[str] = None,
select_paths: Optional[List[str]] = None,
select_domains: Optional[List[str]] = None,
exclude_paths: Optional[List[str]] = None,
exclude_domains: Optional[List[str]] = None,
allow_external: Optional[bool] = None,
include_images: Optional[bool] = None,
categories: Optional[
List[
Literal[
"Careers",
"Blogs",
"Documentation",
"About",
"Pricing",
"Community",
"Developers",
"Contact",
"Media",
]
]
] = None,
extract_depth: Optional[Literal["basic", "advanced"]] = None,
include_favicon: Optional[bool] = None,
run_manager: Optional[CallbackManagerForToolRun] = None,
) -> Dict[str, Any]:
"""Execute a crawl using the Tavily Crawl API.
Returns:
- base_url (str): The base URL that was crawled
Example: "https://tavily.com/"
- results (List[Dict]): A list of extracted content from the crawled URLs
- url (str): The URL that was crawled
Example: "https://tavily.com/#features"
- raw_content (str): The full content extracted from the page
- images (List[str]): A list of image URLs extracted from the page
- response_time (float): Time in seconds it took to complete the request
"""
try:
            # Execute the crawl; instance-level settings take precedence over
            # per-call arguments when both are provided.
            raw_results = self.api_wrapper.raw_results(
                url=url,
                max_depth=self.max_depth if self.max_depth is not None else max_depth,
                max_breadth=self.max_breadth
                if self.max_breadth is not None
                else max_breadth,
                limit=self.limit if self.limit is not None else limit,
                instructions=self.instructions
                if self.instructions is not None
                else instructions,
                select_paths=self.select_paths
                if self.select_paths is not None
                else select_paths,
                select_domains=self.select_domains
                if self.select_domains is not None
                else select_domains,
                exclude_paths=self.exclude_paths
                if self.exclude_paths is not None
                else exclude_paths,
                exclude_domains=self.exclude_domains
                if self.exclude_domains is not None
                else exclude_domains,
                allow_external=self.allow_external
                if self.allow_external is not None
                else allow_external,
                include_images=self.include_images
                if self.include_images is not None
                else include_images,
                categories=self.categories
                if self.categories is not None
                else categories,
                extract_depth=self.extract_depth
                if self.extract_depth is not None
                else extract_depth,
                include_favicon=self.include_favicon
                if self.include_favicon is not None
                else include_favicon,
                format=self.format,
            )
# Check if results are empty and raise a specific exception
if not raw_results.get("results", []):
search_params = {
"instructions": instructions,
"select_paths": select_paths,
"select_domains": select_domains,
"exclude_paths": exclude_paths,
"exclude_domains": exclude_domains,
"categories": categories,
"format": self.format,
}
suggestions = _generate_suggestions(search_params)
# Construct a detailed message for the agent
error_message = (
f"No crawl results found for '{url}'. "
f"Suggestions: {', '.join(suggestions)}. "
f"Try modifying your crawl parameters with one of these approaches." # noqa: E501
)
raise ToolException(error_message)
return raw_results
except ToolException:
# Re-raise tool exceptions
raise
except Exception as e:
return {"error": e}
async def _arun(
self,
url: str,
max_depth: Optional[int] = None,
max_breadth: Optional[int] = None,
limit: Optional[int] = None,
instructions: Optional[str] = None,
select_paths: Optional[List[str]] = None,
select_domains: Optional[List[str]] = None,
exclude_paths: Optional[List[str]] = None,
exclude_domains: Optional[List[str]] = None,
allow_external: Optional[bool] = None,
include_images: Optional[bool] = None,
categories: Optional[
List[
Literal[
"Careers",
"Blogs",
"Documentation",
"About",
"Pricing",
"Community",
"Developers",
"Contact",
"Media",
]
]
] = None,
extract_depth: Optional[Literal["basic", "advanced"]] = None,
include_favicon: Optional[bool] = None,
run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
) -> Dict[str, Any]:
"""Use the tool asynchronously."""
try:
            # Same precedence rule as the synchronous path: instance-level
            # settings win over per-call arguments when both are provided.
            raw_results = await self.api_wrapper.raw_results_async(
                url=url,
                max_depth=self.max_depth if self.max_depth is not None else max_depth,
                max_breadth=self.max_breadth
                if self.max_breadth is not None
                else max_breadth,
                limit=self.limit if self.limit is not None else limit,
                instructions=self.instructions
                if self.instructions is not None
                else instructions,
                select_paths=self.select_paths
                if self.select_paths is not None
                else select_paths,
                select_domains=self.select_domains
                if self.select_domains is not None
                else select_domains,
                exclude_paths=self.exclude_paths
                if self.exclude_paths is not None
                else exclude_paths,
                exclude_domains=self.exclude_domains
                if self.exclude_domains is not None
                else exclude_domains,
                allow_external=self.allow_external
                if self.allow_external is not None
                else allow_external,
                include_images=self.include_images
                if self.include_images is not None
                else include_images,
                categories=self.categories
                if self.categories is not None
                else categories,
                extract_depth=self.extract_depth
                if self.extract_depth is not None
                else extract_depth,
                include_favicon=self.include_favicon
                if self.include_favicon is not None
                else include_favicon,
                format=self.format,
            )
# Check if results are empty and raise a specific exception
if not raw_results.get("results", []):
                search_params = {
                    "instructions": instructions,
                    "select_paths": select_paths,
                    "select_domains": select_domains,
                    "exclude_paths": exclude_paths,
                    "exclude_domains": exclude_domains,
                    "categories": categories,
                    "format": self.format,
                }
suggestions = _generate_suggestions(search_params)
# Construct a detailed message for the agent
error_message = (
f"No crawl results found for '{url}'. "
f"Suggestions: {', '.join(suggestions)}. "
f"Try modifying your crawl parameters with one of these approaches." # noqa: E501
)
raise ToolException(error_message)
return raw_results
except ToolException:
# Re-raise tool exceptions
raise
except Exception as e:
return {"error": e}