如何用Java写一个简单的搜索引擎爬虫?

wen java案例 62

本文目录导读:

如何用Java写一个简单的搜索引擎爬虫?

  1. 基础爬虫框架
  2. 增强版本 - 带索引功能
  3. 测试代码
  4. 多线程版本
  5. 使用建议

本文介绍如何用 Java 写一个简单的搜索引擎爬虫:先实现一个基础的单线程爬虫,再逐步加入索引与搜索功能,最后给出多线程版本的骨架。

基础爬虫框架

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A minimal single-threaded, breadth-first web crawler.
 *
 * <p>Starting from a seed URL it downloads pages one at a time, hands each page to
 * {@link #processPage(String, String)}, and queues every http(s) link found on the page
 * until {@code maxPages} pages have been fetched successfully or no URLs remain.
 *
 * <p>The class is designed for extension: subclasses may override {@code processPage},
 * {@code extractLinks} and {@code extractTitle}. It is not thread-safe.
 */
public class SimpleCrawler {
    /** Matches the quoted value of an {@code href} attribute; compiled once and reused. */
    private static final Pattern HREF_PATTERN =
        Pattern.compile("href\\s*=\\s*[\"']([^\"']+)[\"']", Pattern.CASE_INSENSITIVE);

    /** Matches the text of a {@code <title>} element regardless of case or attributes. */
    private static final Pattern TITLE_PATTERN =
        Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** URLs that have already been taken from the queue and handed to {@link #crawl}. */
    private final Set<String> visitedUrls;
    /** Every URL that has ever been queued; gives an O(1) "already queued?" check. */
    private final Set<String> queuedUrls;
    /** FIFO frontier of URLs still waiting to be fetched. */
    private final Queue<String> urlsToVisit;
    /** Upper bound on the number of successfully fetched pages. */
    private final int maxPages;
    /** Number of pages fetched and processed successfully so far. */
    private int currentPageCount;

    /**
     * Creates a crawler that stops after {@code maxPages} successfully fetched pages.
     *
     * @param maxPages maximum number of pages to fetch; a value of zero or less fetches nothing
     */
    public SimpleCrawler(int maxPages) {
        this.visitedUrls = new HashSet<>();
        this.queuedUrls = new HashSet<>();
        this.urlsToVisit = new LinkedList<>();
        this.maxPages = maxPages;
        this.currentPageCount = 0;
    }

    /**
     * Crawls breadth-first from {@code seedUrl} until the page limit is reached or the
     * frontier is empty, then prints a summary line.
     *
     * @param seedUrl absolute URL to start from
     */
    public void start(String seedUrl) {
        queuedUrls.add(seedUrl);
        urlsToVisit.add(seedUrl);
        while (!urlsToVisit.isEmpty() && currentPageCount < maxPages) {
            String url = urlsToVisit.poll();
            // Set.add returns false for a URL that was already visited, so this both
            // tests and records the visit in one step.
            if (visitedUrls.add(url)) {
                crawl(url);
            }
        }
        System.out.println("抓取完成,共抓取了 " + currentPageCount + " 个页面");
    }

    /**
     * Fetches one page, processes it and queues the links it contains. A page only counts
     * towards the limit when it was downloaded successfully. Any exception is reported on
     * stderr and the crawl continues with the next URL.
     */
    private void crawl(String url) {
        try {
            System.out.println("正在抓取: " + url);
            String content = fetchPage(url);
            if (content == null) {
                return;
            }
            processPage(url, content);
            for (String link : extractLinks(url, content)) {
                if (!visitedUrls.contains(link) && queuedUrls.add(link)) {
                    urlsToVisit.add(link);
                }
            }
            currentPageCount++;
            System.out.println("当前进度: " + currentPageCount + "/" + maxPages);
        } catch (Exception e) {
            System.err.println("抓取 " + url + " 时出错: " + e.getMessage());
        }
    }

    /**
     * Downloads the body of {@code urlStr} and decodes it as UTF-8 text.
     *
     * @return the page content with lines joined by {@code '\n'}, or {@code null} if the
     *         URL is malformed or the download fails
     */
    private String fetchPage(String urlStr) {
        try {
            URLConnection connection = new URL(urlStr).openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(5000);
            connection.setReadTimeout(5000);
            // try-with-resources closes the stream even when reading fails part-way.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                StringBuilder content = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
                return content.toString();
            }
        } catch (Exception e) {
            System.err.println("获取页面内容失败: " + e.getMessage());
            return null;
        }
    }

    /**
     * Hook invoked once for every successfully downloaded page. The default implementation
     * prints the page title and size; subclasses override it to index or store the page.
     *
     * @param url     URL the content was downloaded from
     * @param content decoded page content
     */
    protected void processPage(String url, String content) {
        System.out.println("页面标题: " + extractTitle(content));
        System.out.println("页面大小: " + content.length() + " 字符");
    }

    /**
     * Extracts all http/https links from {@code content}, resolving relative links against
     * the URL of the page they were found on.
     *
     * <p>Fragments ({@code #...}) are stripped so that anchors within one page do not show up
     * as distinct URLs. Links with other schemes (mailto, javascript, ftp, ...) and links
     * that cannot be parsed are skipped. The returned list may contain duplicates.
     *
     * @param baseUrl URL of the page being parsed; if it is not a valid URL only absolute
     *                links can be resolved
     * @param content page content to scan
     * @return absolute http(s) URLs in order of appearance
     */
    protected List<String> extractLinks(String baseUrl, String content) {
        URL base = null;
        try {
            base = new URL(baseUrl);
        } catch (java.net.MalformedURLException ignored) {
            // Without a usable base, absolute links still resolve; relative ones are dropped.
        }
        List<String> links = new ArrayList<>();
        Matcher matcher = HREF_PATTERN.matcher(content);
        while (matcher.find()) {
            String resolved = resolveLink(base, matcher.group(1).trim());
            if (resolved != null) {
                links.add(resolved);
            }
        }
        return links;
    }

    /**
     * Resolves {@code href} against {@code base}.
     *
     * @return the absolute URL without its fragment, or {@code null} if the link is
     *         malformed or does not use http/https
     */
    private static String resolveLink(URL base, String href) {
        try {
            URL target = new URL(base, href);
            String protocol = target.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                return null;
            }
            String external = target.toExternalForm();
            int fragmentStart = external.indexOf('#');
            return fragmentStart >= 0 ? external.substring(0, fragmentStart) : external;
        } catch (java.net.MalformedURLException ignored) {
            // Unknown scheme (e.g. javascript:) or a relative link with no base.
            return null;
        }
    }

    /**
     * Returns the trimmed text of the first {@code <title>} element in {@code content}.
     *
     * @return the title, or the placeholder {@code "未知标题"} when the page has no title element
     */
    protected String extractTitle(String content) {
        Matcher matcher = TITLE_PATTERN.matcher(content);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "未知标题";
    }
}

增强版本 - 带索引功能

import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Crawler that builds a small in-memory search index while it crawls.
 *
 * <p>Two structures are maintained: a per-page summary and an inverted index mapping each
 * lower-cased word to the URLs of the pages containing it. Both can be written to and read
 * from disk with Java serialization.
 *
 * <p>NOTE(review): this class overrides {@code processPage}, so the superclass must declare
 * that method as {@code protected} (or wider) for the override to compile.
 */
public class SearchEngineCrawler extends SimpleCrawler {
    /** Number of leading characters of the raw page content kept as its summary. */
    private static final int SUMMARY_LENGTH = 200;
    /** Words shorter than this are neither indexed nor searched. */
    private static final int MIN_WORD_LENGTH = 3;
    /**
     * Separator used for tokenizing. {@code \W} is ASCII-only by default, so runs of
     * non-ASCII characters (for example CJK text) act as separators and are not indexed.
     */
    private static final String WORD_SEPARATOR_REGEX = "\\W+";

    private Map<String, String> pageIndex;  // URL -> page summary
    private Map<String, List<String>> wordIndex;  // word -> URLs of pages containing it

    /**
     * @param maxPages maximum number of pages to fetch, passed on to the base crawler
     */
    public SearchEngineCrawler(int maxPages) {
        super(maxPages);
        this.pageIndex = new ConcurrentHashMap<>();
        this.wordIndex = new ConcurrentHashMap<>();
    }

    /**
     * Runs the base-class processing, then stores a summary of the page and adds its words
     * to the inverted index.
     */
    @Override
    protected void processPage(String url, String content) {
        super.processPage(url, content);
        String summary = content.substring(0, Math.min(SUMMARY_LENGTH, content.length()));
        pageIndex.put(url, summary);
        indexContent(url, content);
    }

    /**
     * Adds {@code url} to the posting list of every distinct word on the page.
     */
    private void indexContent(String url, String content) {
        // Replace tags with spaces so words on either side of a tag stay separate.
        String text = content.replaceAll("<[^>]+>", " ");
        // De-duplicate first: a word occurring many times on one page must add the URL once.
        Set<String> distinctWords = new LinkedHashSet<>(tokenize(text));
        for (String word : distinctWords) {
            wordIndex.computeIfAbsent(word, k -> new ArrayList<>()).add(url);
        }
    }

    /**
     * Splits text into lower-cased words, dropping those shorter than
     * {@link #MIN_WORD_LENGTH}. Used for both indexing and querying so the two always agree.
     */
    private static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        for (String token : text.toLowerCase(Locale.ROOT).split(WORD_SEPARATOR_REGEX)) {
            if (token.length() >= MIN_WORD_LENGTH) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /**
     * Returns the URLs of all pages containing at least one word of {@code query}
     * (OR semantics, no ranking).
     *
     * @param query free-text query; {@code null} or blank yields an empty result
     * @return matching URLs without duplicates, ordered by first match
     */
    public List<String> search(String query) {
        if (query == null) {
            return new ArrayList<>();
        }
        Set<String> result = new LinkedHashSet<>();
        for (String keyword : tokenize(query)) {
            List<String> urls = wordIndex.get(keyword);
            if (urls != null) {
                result.addAll(urls);
            }
        }
        return new ArrayList<>(result);
    }

    /**
     * Writes both indexes to {@code filePath} using Java serialization.
     *
     * @throws IOException if the file cannot be written
     */
    public void saveIndex(String filePath) throws IOException {
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new FileOutputStream(filePath))) {
            oos.writeObject(pageIndex);
            oos.writeObject(wordIndex);
        }
    }

    /**
     * Replaces both indexes with the ones stored in {@code filePath}. The current indexes
     * are left untouched if reading fails.
     *
     * <p>SECURITY: Java deserialization can execute code embedded in the stream. Only load
     * files previously written by {@link #saveIndex(String)} from a trusted location.
     *
     * @throws IOException            if the file cannot be read or does not contain two maps
     * @throws ClassNotFoundException if a serialized class is missing from the classpath
     */
    public void loadIndex(String filePath) throws IOException, ClassNotFoundException {
        try (ObjectInputStream ois = new ObjectInputStream(
                new FileInputStream(filePath))) {
            // Read everything before assigning so a failure cannot leave a half-loaded index.
            Object rawPages = ois.readObject();
            Object rawWords = ois.readObject();
            if (!(rawPages instanceof Map) || !(rawWords instanceof Map)) {
                throw new IOException("索引文件格式不正确: " + filePath);
            }
            @SuppressWarnings("unchecked") // element types are erased and cannot be checked here
            Map<String, String> pages = (Map<String, String>) rawPages;
            @SuppressWarnings("unchecked") // element types are erased and cannot be checked here
            Map<String, List<String>> words = (Map<String, List<String>>) rawWords;
            pageIndex = new ConcurrentHashMap<>(pages);
            wordIndex = new ConcurrentHashMap<>(words);
        }
    }
}

测试代码

/**
 * Command-line demo: runs the basic crawler, then the indexing crawler, and finally
 * queries the index that was built. Requires network access to the seed site.
 */
public class CrawlerDemo {
    public static void main(String[] args) {
        // Basic crawler
        System.out.println("=== 基础爬虫测试 ===");
        SimpleCrawler simpleCrawler = new SimpleCrawler(10);
        simpleCrawler.start("https://example.com");
        // Indexing crawler
        System.out.println("\n=== 搜索引擎爬虫测试 ===");
        SearchEngineCrawler engineCrawler = new SearchEngineCrawler(10);
        engineCrawler.start("https://example.com");
        // Query the index built above. The type is fully qualified because this
        // snippet has no import section of its own.
        System.out.println("\n=== 搜索测试 ===");
        java.util.List<String> results = engineCrawler.search("example");
        System.out.println("搜索 'example' 结果:");
        for (String url : results) {
            System.out.println(" - " + url);
        }
    }
}

多线程版本

import java.util.Set;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Skeleton of a multi-threaded crawler: a fixed pool of workers takes URLs from a shared
 * queue, skips URLs that were already seen, and stops once a page limit is reached.
 *
 * <p>The page counter is shared by all workers and is cumulative for the lifetime of this
 * instance, so {@code maxPages} bounds the total number of pages rather than the number
 * per worker. The actual download/parse step is left as a placeholder.
 *
 * <p>Workers that find the queue empty keep polling until {@link #stop()} is called, and
 * the pool's threads stay alive until then; callers must invoke {@code stop()} when done.
 */
public class ConcurrentCrawler {
    private final Set<String> visitedUrlSet = ConcurrentHashMap.newKeySet();
    private final BlockingQueue<String> urlQueue = new LinkedBlockingQueue<>();
    /** Pages claimed so far across all workers. */
    private final AtomicInteger crawledCount = new AtomicInteger();
    private final ExecutorService executorService;
    private final int threadCount;
    /** Cleared by {@link #stop()}; volatile so every worker sees the change. */
    private volatile boolean isRunning = true;

    /**
     * @param threadCount number of worker threads; must be positive
     * @throws IllegalArgumentException if {@code threadCount} is zero or negative
     */
    public ConcurrentCrawler(int threadCount) {
        this.threadCount = threadCount;
        this.executorService = Executors.newFixedThreadPool(threadCount);
    }

    /**
     * Queues {@code seedUrl} and submits one worker task per configured thread.
     *
     * @param seedUrl  URL to add to the work queue
     * @param maxPages total number of pages all workers together may crawl
     */
    public void start(String seedUrl, int maxPages) {
        urlQueue.add(seedUrl);
        for (int i = 0; i < threadCount; i++) {
            executorService.submit(new CrawlerTask(maxPages));
        }
    }

    /**
     * Asks all workers to finish and shuts the pool down. Does not block; a worker that is
     * waiting on the queue notices the request when its current poll returns.
     */
    public void stop() {
        isRunning = false;
        executorService.shutdown();
    }

    /**
     * @return number of pages claimed for crawling so far, across all workers
     */
    public int getCrawledCount() {
        return crawledCount.get();
    }

    /**
     * Waits for all workers to finish after {@link #stop()} has been called.
     *
     * @param timeout maximum time to wait
     * @param unit    unit of {@code timeout}
     * @return {@code true} if the pool terminated, {@code false} if the timeout elapsed first
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
        return executorService.awaitTermination(timeout, unit);
    }

    /**
     * Atomically claims one slot of the shared page budget.
     *
     * @return {@code true} if a slot was claimed, {@code false} if {@code maxPages} has
     *         already been reached
     */
    private boolean tryReservePage(int maxPages) {
        while (true) {
            int current = crawledCount.get();
            if (current >= maxPages) {
                return false;
            }
            // Compare-and-set retries on contention, so two workers can never both take
            // the last remaining slot.
            if (crawledCount.compareAndSet(current, current + 1)) {
                return true;
            }
        }
    }

    /** Worker loop: polls the shared queue until stopped or the page budget is used up. */
    private class CrawlerTask implements Runnable {
        private final int maxPages;

        public CrawlerTask(int maxPages) {
            this.maxPages = maxPages;
        }

        @Override
        public void run() {
            while (isRunning && crawledCount.get() < maxPages) {
                try {
                    // Poll with a timeout so the loop condition is re-checked regularly.
                    String url = urlQueue.poll(1, TimeUnit.SECONDS);
                    if (url != null && visitedUrlSet.add(url) && tryReservePage(maxPages)) {
                        // Crawl logic goes here: fetch the page, process it, and add any
                        // newly discovered links to urlQueue.
                        System.out.println(Thread.currentThread().getName() +
                                         " 正在爬取: " + url);
                    }
                } catch (InterruptedException e) {
                    // Restore the interrupt flag for the pool and end this worker.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
    }
}

使用建议

  1. 遵守robots.txt:检查网站的爬虫协议
  2. 设置请求间隔:避免对服务器造成负担
  3. 处理异常:对连接超时、HTTP 错误状态码和编码错误分别处理,必要时进行有限次数的重试
  4. 去重机制:避免重复爬取
  5. 存储优化:考虑使用数据库存储索引

这个简单的爬虫可以作为学习参考,实际生产环境需要考虑更多因素,如分布式爬取、动态页面渲染等。

抱歉,评论功能暂时关闭!