diff options
| field | value | date |
|---|---|---|
| author | mo khan <mo@mokhan.ca> | 2025-08-18 15:04:35 -0600 |
| committer | mo khan <mo@mokhan.ca> | 2025-08-18 15:04:35 -0600 |
| commit | 1fc7c287b0c64d90261f3149705cb3fe8e3c56ed (patch) | |
| tree | f58a9f63cf57da35402ef73e85c32e4ad650139b | |
| parent | dee76608b0dc5267d4f794bf3a628a36e71214cf (diff) | |
refactor: simplify the fetch server
| -rw-r--r-- | pkg/fetch/server.go | 224 | ||||
| -rw-r--r-- | pkg/htmlprocessor/processor.go | 98 |
2 files changed, 36 insertions, 286 deletions
diff --git a/pkg/fetch/server.go b/pkg/fetch/server.go index 3384276..1ae6779 100644 --- a/pkg/fetch/server.go +++ b/pkg/fetch/server.go @@ -1,6 +1,7 @@ package fetch import ( + "encoding/base64" "encoding/json" "fmt" "io" @@ -9,35 +10,27 @@ import ( "strings" "time" - "github.com/xlgmokha/mcp/pkg/htmlprocessor" "github.com/xlgmokha/mcp/pkg/mcp" ) -// FetchResult represents the result of a fetch operation type FetchResult struct { URL string `json:"url"` Content string `json:"content"` ContentType string `json:"content_type"` - Length int `json:"length"` - Truncated bool `json:"truncated,omitempty"` - NextIndex int `json:"next_index,omitempty"` + IsBinary bool `json:"is_binary"` } -// FetchOperations provides HTTP client operations for fetching content type FetchOperations struct { - httpClient *http.Client - userAgent string - htmlProcessor *htmlprocessor.ContentExtractor + httpClient *http.Client + userAgent string } -// NewFetchOperations creates a new FetchOperations helper func NewFetchOperations() *FetchOperations { return &FetchOperations{ httpClient: &http.Client{ Timeout: 30 * time.Second, }, - userAgent: "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)", - htmlProcessor: htmlprocessor.NewContentExtractor(), + userAgent: "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)", } } @@ -46,8 +39,7 @@ func New() *mcp.Server { fetch := NewFetchOperations() builder := mcp.NewServerBuilder("mcp-fetch", "1.0.0") - // Add fetch tool - builder.AddTool(mcp.NewTool("fetch", "Fetches a URL from the internet and extracts its contents as markdown. Always returns successful response with content or error details.", map[string]interface{}{ + builder.AddTool(mcp.NewTool("fetch", "Fetches a URL and returns the content. 
Text content is returned as-is, binary content is base64 encoded.", map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ "url": map[string]interface{}{ @@ -55,24 +47,6 @@ func New() *mcp.Server { "description": "URL to fetch", "format": "uri", }, - "max_length": map[string]interface{}{ - "type": "integer", - "description": "Maximum number of characters to return. Defaults to 5000", - "minimum": 1, - "maximum": 999999, - "default": 5000, - }, - "start_index": map[string]interface{}{ - "type": "integer", - "description": "Start reading content from this character index. Defaults to 0", - "minimum": 0, - "default": 0, - }, - "raw": map[string]interface{}{ - "type": "boolean", - "description": "Get raw HTML content without markdown conversion. Defaults to false", - "default": false, - }, }, "required": []string{"url"}, }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) { @@ -81,57 +55,16 @@ func New() *mcp.Server { return mcp.NewToolError("url is required"), nil } - // Parse and validate URL parsedURL, err := url.Parse(urlStr) if err != nil || parsedURL.Scheme == "" || parsedURL.Host == "" { return mcp.NewToolError("Invalid URL format"), nil } - // Get optional parameters - maxLength := 5000 - if ml, ok := req.Arguments["max_length"]; ok { - switch v := ml.(type) { - case float64: - maxLength = int(v) - case int: - maxLength = v - default: - return mcp.NewToolError("max_length must be a number"), nil - } - if maxLength < 1 || maxLength > 999999 { - return mcp.NewToolError("max_length must be between 1 and 999999"), nil - } - } - - startIndex := 0 - if si, ok := req.Arguments["start_index"]; ok { - switch v := si.(type) { - case float64: - startIndex = int(v) - case int: - startIndex = v - default: - return mcp.NewToolError("start_index must be a number"), nil - } - if startIndex < 0 { - return mcp.NewToolError("start_index must be >= 0"), nil - } - } - - raw := false - if r, ok := req.Arguments["raw"]; ok { - if rBool, ok := 
r.(bool); ok { - raw = rBool - } - } - - // Fetch the content - result, err := fetch.fetchContent(parsedURL.String(), maxLength, startIndex, raw) + result, err := fetch.fetchContent(parsedURL.String()) if err != nil { return mcp.NewToolError(err.Error()), nil } - // Format result as JSON jsonResult, err := json.MarshalIndent(result, "", " ") if err != nil { return mcp.NewToolError(fmt.Sprintf("Failed to marshal result: %v", err)), nil @@ -140,155 +73,70 @@ func New() *mcp.Server { return mcp.NewToolResult(mcp.NewTextContent(string(jsonResult))), nil })) - // Add fetch prompt - builder.AddPrompt(mcp.NewPrompt("fetch", "Prompt for manually entering a URL to fetch content from", []mcp.PromptArgument{ - { - Name: "url", - Description: "The URL to fetch content from", - Required: true, - }, - { - Name: "reason", - Description: "Why you want to fetch this URL (optional context)", - Required: false, - }, - }, func(req mcp.GetPromptRequest) (mcp.GetPromptResult, error) { - url, hasURL := req.Arguments["url"].(string) - reason, hasReason := req.Arguments["reason"].(string) - - if !hasURL || url == "" { - return mcp.GetPromptResult{}, fmt.Errorf("url argument is required") - } - - // Create the prompt messages - var messages []mcp.PromptMessage - - // User message with the URL and optional reason - userContent := fmt.Sprintf("Please fetch the content from this URL: %s", url) - if hasReason && reason != "" { - userContent += fmt.Sprintf("\n\nReason: %s", reason) - } - - messages = append(messages, mcp.PromptMessage{ - Role: "user", - Content: mcp.NewTextContent(userContent), - }) - - // Assistant message suggesting the fetch tool usage - assistantContent := fmt.Sprintf(`I'll fetch the content from %s for you. 
- -Let me use the fetch tool to retrieve and process the content:`, url) - - messages = append(messages, mcp.PromptMessage{ - Role: "assistant", - Content: mcp.NewTextContent(assistantContent), - }) - - description := "Manual URL fetch prompt" - if hasReason && reason != "" { - description = fmt.Sprintf("Manual URL fetch: %s", reason) - } - - return mcp.GetPromptResult{ - Description: description, - Messages: messages, - }, nil - })) - return builder.Build() } -// Helper methods for FetchOperations - -func (fetch *FetchOperations) fetchContent(urlStr string, maxLength, startIndex int, raw bool) (*FetchResult, error) { - // Create HTTP request +func (fetch *FetchOperations) fetchContent(urlStr string) (*FetchResult, error) { req, err := http.NewRequest("GET", urlStr, nil) if err != nil { return nil, fmt.Errorf("Failed to create request: %v", err) } req.Header.Set("User-Agent", fetch.userAgent) - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - // Perform HTTP request resp, err := fetch.httpClient.Do(req) if err != nil { return nil, fmt.Errorf("Failed to fetch URL: %v", err) } defer resp.Body.Close() - // Check for HTTP errors if resp.StatusCode >= 400 { return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) } - // Read response body body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("Failed to read response body: %v", err) } - // Get content type contentType := resp.Header.Get("Content-Type") - - // Process content + isBinary := isBinaryContent(contentType) + var content string - if raw || !isHTMLContent(string(body), contentType) { - content = string(body) + if isBinary { + content = base64.StdEncoding.EncodeToString(body) } else { - // Convert HTML to markdown using improved processor - var err error - content, err = fetch.htmlProcessor.ToMarkdown(string(body)) - if err != nil { - // Fallback to raw content if markdown conversion fails - content = string(body) - } - } - - // Apply 
start index first - originalLength := len(content) - if startIndex > 0 { - if startIndex >= originalLength { - return nil, fmt.Errorf("start_index (%d) is beyond content length (%d)", startIndex, originalLength) - } - content = content[startIndex:] - } - - // Apply max length and check for truncation - truncated := false - nextIndex := 0 - if len(content) > maxLength { - content = content[:maxLength] - truncated = true - nextIndex = startIndex + maxLength + content = string(body) } - result := &FetchResult{ + return &FetchResult{ URL: urlStr, Content: content, ContentType: contentType, - Length: len(content), - } - - if truncated { - result.Truncated = true - result.NextIndex = nextIndex - } - - return result, nil + IsBinary: isBinary, + }, nil } -func isHTMLContent(content, contentType string) bool { - // Check content type header - if strings.Contains(strings.ToLower(contentType), "text/html") { - return true +func isBinaryContent(contentType string) bool { + if contentType == "" { + return false } - - // Check if content starts with HTML tags (first 100 chars) - prefix := content - if len(prefix) > 100 { - prefix = prefix[:100] + + contentType = strings.ToLower(strings.Split(contentType, ";")[0]) + + textTypes := []string{ + "text/", + "application/json", + "application/xml", + "application/javascript", + "application/x-javascript", } - - return strings.Contains(strings.ToLower(prefix), "<html") + + for _, textType := range textTypes { + if strings.HasPrefix(contentType, textType) { + return false + } + } + + return true } diff --git a/pkg/htmlprocessor/processor.go b/pkg/htmlprocessor/processor.go deleted file mode 100644 index b0b47cd..0000000 --- a/pkg/htmlprocessor/processor.go +++ /dev/null @@ -1,98 +0,0 @@ -package htmlprocessor - -import ( - "strings" - - "github.com/JohannesKaufmann/html-to-markdown" - "github.com/PuerkitoBio/goquery" -) - -// ContentExtractor handles HTML content extraction and conversion -type ContentExtractor struct { - converter 
*md.Converter -} - -// NewContentExtractor creates a new ContentExtractor with default settings -func NewContentExtractor() *ContentExtractor { - converter := md.NewConverter("", true, nil) - - // Add custom rules to remove unwanted elements - converter.AddRules( - md.Rule{ - Filter: []string{"script", "style", "nav", "header", "footer", "aside"}, - Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string { - // Remove these elements entirely - empty := "" - return &empty - }, - }, - ) - - return &ContentExtractor{ - converter: converter, - } -} - -// ExtractReadableContent extracts the main readable content from HTML -// It removes navigation, ads, scripts, styles, and other non-content elements -func (e *ContentExtractor) ExtractReadableContent(html string) (string, error) { - doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) - if err != nil { - return "", err - } - - // Remove unwanted elements - doc.Find("script, style, nav, header, footer, aside, .sidebar, .ads, .advertisement").Remove() - - // Try to find main content areas in order of preference - var contentSelection *goquery.Selection - - // Look for semantic HTML5 elements first - if main := doc.Find("main"); main.Length() > 0 { - contentSelection = main.First() - } else if article := doc.Find("article"); article.Length() > 0 { - contentSelection = article.First() - } else if content := doc.Find(".content, .main-content, #content, #main"); content.Length() > 0 { - contentSelection = content.First() - } else { - // Fallback to body - contentSelection = doc.Find("body") - } - - // Extract text content - var textParts []string - contentSelection.Find("h1, h2, h3, h4, h5, h6, p, li").Each(func(i int, s *goquery.Selection) { - text := strings.TrimSpace(s.Text()) - if text != "" { - textParts = append(textParts, text) - } - }) - - return strings.Join(textParts, "\n"), nil -} - -// ToMarkdown converts HTML to markdown format -func (e *ContentExtractor) ToMarkdown(html 
string) (string, error) { - markdown, err := e.converter.ConvertString(html) - if err != nil { - return "", err - } - - // Clean up extra whitespace - lines := strings.Split(markdown, "\n") - var cleanLines []string - - for _, line := range lines { - trimmed := strings.TrimSpace(line) - if trimmed != "" || (len(cleanLines) > 0 && cleanLines[len(cleanLines)-1] != "") { - cleanLines = append(cleanLines, trimmed) - } - } - - // Remove trailing empty lines - for len(cleanLines) > 0 && cleanLines[len(cleanLines)-1] == "" { - cleanLines = cleanLines[:len(cleanLines)-1] - } - - return strings.Join(cleanLines, "\n"), nil -} |
