Web Crawler & Indexer in Python
About the project: In this project, we build a simple web crawler and indexer. It is a fantastic way to learn about networking, HTML parsing, and data structures in Python.
We have broken the project down into three essential files:
- crawler.py: Handles fetching web pages and extracting links and text content.
- indexer.py: Takes the extracted content, cleans it, tokenizes it, and builds a simple inverted index.
- main.py: Orchestrates the entire process, starting the crawl and displaying the final index.
Prerequisites:
You will need the following libraries installed:
pip install requests beautifulsoup4
Now, we will create the three Python files.
The Crawler Logic (crawler.py)
This file contains the WebCrawler class, which is responsible for fetching URLs and extracting the text content and internal links from each page.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import deque
import time


class WebCrawler:
    """
    A simple web crawler that fetches pages, extracts text, and finds internal links.
    """

    def __init__(self, start_url, max_pages=10):
        self.start_url = start_url
        self.max_pages = max_pages
        self.domain = urlparse(start_url).netloc
        self.visited = set()
        self.queue = deque([start_url])
        self.crawled_data = []  # Stores list of {'url': ..., 'content': ...}

    def _fetch_page(self, url):
        """Fetches the HTML content for a given URL and handles basic errors."""
        try:
            # Basic user agent to identify the crawler
            headers = {'User-Agent': 'SimplePythonCrawler/1.0'}
            response = requests.get(url, headers=headers, timeout=5)
            # Raise HTTPError for bad responses (4xx or 5xx)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f" [Error] Could not fetch {url}: {e}")
            return None

    def _get_page_content(self, html):
        """Extracts and cleans visible text content from the HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script and style tags before extracting text
        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()
        # Get the text, collapse whitespace, and drop empty lines
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        # Break multi-phrase lines on double spaces (not single spaces,
        # which would put every word on its own line)
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text

    def _extract_links(self, html, base_url):
        """Finds and returns a set of valid, internal URLs."""
        soup = BeautifulSoup(html, 'html.parser')
        links = set()
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            absolute_url = urljoin(base_url, href)
            # 1. Normalize the URL (remove the fragment/anchor part)
            parsed_url = urlparse(absolute_url)._replace(fragment='')
            clean_url = parsed_url.geturl()
            # 2. Keep only URLs on the same domain that have not been visited yet
            if parsed_url.netloc == self.domain and clean_url not in self.visited:
                links.add(clean_url)
        return links

    def crawl(self):
        """The main crawling loop."""
        print(f"Starting crawl on {self.start_url} (Max pages: {self.max_pages})...")
        while self.queue and len(self.visited) < self.max_pages:
            current_url = self.queue.popleft()
            if current_url in self.visited:
                continue
            self.visited.add(current_url)
            print(f" [Crawl] Visiting page {len(self.visited)}/{self.max_pages}: {current_url}")
            html = self._fetch_page(current_url)
            if not html:
                continue
            # Extract content for indexing
            content = self._get_page_content(html)
            self.crawled_data.append({'url': current_url, 'content': content})
            # Extract links and add new ones to the queue
            new_links = self._extract_links(html, current_url)
            for link in new_links:
                if len(self.visited) + len(self.queue) < self.max_pages:
                    self.queue.append(link)
            # Be polite to the server (optional but good practice)
            time.sleep(0.5)
        print(f"Crawl finished. {len(self.crawled_data)} pages successfully crawled.")
        return self.crawled_data
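Before wiring everything together, you can sanity-check the crawler on its own. The snippet below is a minimal sketch, assuming crawler.py sits in the same directory; the start URL and the page limit are placeholders you would swap for a site you control.
from crawler import WebCrawler

if __name__ == "__main__":
    # Crawl at most 3 pages so the check stays fast (placeholder URL)
    crawler = WebCrawler("http://example.com", max_pages=3)
    pages = crawler.crawl()
    for page in pages:
        # Show each URL and the first 80 characters of its extracted text
        print(page['url'], '->', page['content'][:80].replace('\n', ' '))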
The Indexer Logic (indexer.py)
This file defines the Indexer class, which processes the raw text content to build a frequency-based inverted index.
import re
from collections import defaultdict, Counter


class Indexer:
    """
    Processes content from the crawler and creates an inverted index.
    The inverted index maps:
    { 'word': { 'url1': count_in_url1, 'url2': count_in_url2, ... } }
    """

    def __init__(self):
        # The core data structure for the index
        self.inverted_index = defaultdict(lambda: defaultdict(int))
        # A small list of common English stop words
        self.stop_words = set([
            'the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were',
            'in', 'on', 'at', 'to', 'of', 'for', 'with', 'this', 'that', 'it'
        ])

    def _tokenize(self, text):
        """
        Converts text to lowercase, splits it into word tokens, and
        filters out stop words and short tokens.
        """
        # Convert to lowercase
        text = text.lower()
        # Extract alphanumeric word tokens (punctuation is ignored)
        tokens = re.findall(r'\b\w+\b', text)
        # Filter out stop words and very short tokens
        filtered_tokens = [
            token for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]
        return filtered_tokens

    def build_index(self, crawled_data):
        """
        Iterates over all crawled documents and populates the inverted index.
        :param crawled_data: List of dicts, e.g., [{'url': url, 'content': text}]
        """
        print("\nBuilding inverted index...")
        for doc in crawled_data:
            url = doc['url']
            content = doc['content']
            # 1. Tokenize the content
            tokens = self._tokenize(content)
            # 2. Count word frequency in the current document
            word_counts = Counter(tokens)
            # 3. Add to the global inverted index
            for word, count in word_counts.items():
                self.inverted_index[word][url] = count
        print(f"Index built successfully with {len(self.inverted_index)} unique terms.")
        return self.inverted_index

    def search(self, query):
        """
        Performs a simple Boolean AND search for the terms in the query.
        Returns a list of (url, score) tuples, where the score is the sum of term counts.
        """
        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []
        # Start with all documents containing the first term
        results = self.inverted_index.get(query_tokens[0], {})
        # Intersect documents for each subsequent term (Boolean AND)
        for term in query_tokens[1:]:
            term_docs = self.inverted_index.get(term, {})
            # Find URLs common to the results so far and the current term
            common_urls = set(results.keys()) & set(term_docs.keys())
            # Update scores for the common URLs
            new_results = {}
            for url in common_urls:
                # Score is the running sum of counts for all query terms
                new_results[url] = results.get(url, 0) + term_docs.get(url, 0)
            results = new_results
            if not results:
                break  # No intersection, stop searching
        # Return a list of (url, score) tuples sorted by score, highest first
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        return sorted_results
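The Indexer can also be exercised in isolation, without crawling anything. The snippet below is a minimal sketch that feeds it two hand-written "documents" and runs two queries; the URLs and text are made up purely for illustration.
from indexer import Indexer

# Two fake documents standing in for crawled pages (illustrative data only)
fake_pages = [
    {'url': 'http://example.com/python', 'content': 'Python crawlers fetch pages and parse links.'},
    {'url': 'http://example.com/index', 'content': 'An inverted index maps words to pages. Python makes this easy.'},
]

indexer = Indexer()
indexer.build_index(fake_pages)

# Both documents mention "python"; only the second also mentions "index"
print(indexer.search('python'))        # both URLs, each with its term count
print(indexer.search('python index'))  # only the second URL (Boolean AND)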
The Main Execution Script (main.py)
This file brings the crawler and indexer together and executes the full workflow.
import json
from collections import Counter

from crawler import WebCrawler
from indexer import Indexer

# IMPORTANT NOTE:
# Replace START_URL with a local development server or a small website you control
# to avoid hitting rate limits or consuming excessive resources on public sites.
# For demonstration purposes, we use example.com, a reserved placeholder domain.
START_URL = "http://example.com"
MAX_PAGES = 5  # Keep this low for a simple, quick test


def run_spider():
    """
    Main function to run the crawler, indexer, and perform a search.
    """
    print("--- Web Crawler and Indexer Project ---")

    # --- 1. Crawl the Web ---
    crawler = WebCrawler(START_URL, max_pages=MAX_PAGES)
    crawled_data = crawler.crawl()
    if not crawled_data:
        print("\n[INFO] No data crawled. Exiting.")
        return

    # --- 2. Build the Index ---
    indexer = Indexer()
    inverted_index = indexer.build_index(crawled_data)
    # Optionally, save the index to a file for persistence
    # with open('inverted_index.json', 'w') as f:
    #     json.dump(inverted_index, f, indent=4)

    # --- 3. Perform a Sample Search ---
    # Find the most frequent words to create a meaningful sample query
    all_content = ' '.join(item['content'] for item in crawled_data)
    sample_tokens = indexer._tokenize(all_content)
    # Get the top 3 most common words (adjust for a real search)
    top_words = Counter(sample_tokens).most_common(3)
    # Use the top two words as the sample query
    if len(top_words) >= 2:
        sample_query = f"{top_words[0][0]} {top_words[1][0]}"
    elif top_words:
        sample_query = top_words[0][0]
    else:
        sample_query = "domain content"  # Default fallback

    print(f"\n--- Running Search Query: '{sample_query}' ---")
    search_results = indexer.search(sample_query)
    if search_results:
        print(f"Found {len(search_results)} documents matching the query, sorted by relevance (score):")
        for rank, (url, score) in enumerate(search_results, 1):
            print(f" {rank}. Score: {score} | URL: {url}")
    else:
        print("No results found for the sample query.")


if __name__ == "__main__":
    run_spider()
This structure provides a clean separation of concerns, making the crawler responsible for retrieval and the indexer responsible for data processing and searching.
Run this project:
To run this, simply execute: python main.py.
We now have a foundational web crawler and indexer! To make it more robust, you might consider adding features like the following (rough sketches of each idea follow below):
- Database Persistence: Storing the index and crawled data in a file (like the commented-out json.dump) or in a proper database (such as SQLite or Redis).
- Ranking: Implementing more sophisticated ranking algorithms (e.g., TF-IDF or PageRank) instead of simple term-count summation.
- Error Handling: Adding more granular logging and managing redirects and transient failures more effectively.
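For persistence, one lightweight option is to write the index out as JSON and reload it later. This is a minimal sketch assuming the index was already built by Indexer.build_index; the file path is arbitrary, and a real project might prefer SQLite or Redis instead.
import json

def save_index(inverted_index, path='inverted_index.json'):
    # Nested defaultdicts serialize like plain dicts, so a direct dump works
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(inverted_index, f, indent=2)

def load_index(path='inverted_index.json'):
    # Returns plain nested dicts; re-wrap in defaultdicts if you need that behavior back
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)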
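For ranking, a common step up from raw term counts is TF-IDF, which down-weights terms that appear in many documents. The sketch below shows one way a TF-IDF score could be computed on top of the existing {word: {url: count}} index shape; it uses a smoothed IDF and OR semantics (any matching term contributes), unlike the strict AND search in Indexer.search.
import math

def tfidf_score(inverted_index, query_tokens, total_docs):
    """Score every document containing at least one query term (OR semantics)."""
    scores = {}
    for term in query_tokens:
        postings = inverted_index.get(term, {})
        if not postings:
            continue
        # Smoothed inverse document frequency: rarer terms contribute more
        idf = math.log((1 + total_docs) / (1 + len(postings))) + 1
        for url, count in postings.items():
            scores[url] = scores.get(url, 0.0) + count * idf
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)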
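For error handling, the standard logging module gives more granular output than print, and a requests Session with a retry adapter copes better with transient failures. This is only a sketch of how _fetch_page could be adapted; the retry counts, backoff factor, and status codes are arbitrary choices, not values from the original project.
import logging
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger('crawler')

def make_session():
    # Retry GETs on connection errors and common transient status codes
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.headers.update({'User-Agent': 'SimplePythonCrawler/1.0'})
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

def fetch_page(session, url):
    try:
        response = session.get(url, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as exc:
        logger.warning("Could not fetch %s: %s", url, exc)
        return None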