Scrape Dokumentasi FilamentPHP 5.x — Menggunakan BeautifulSoup4 dan Markdownify

Di https://filamentphp.com/docs/5.x/, buka submenu yang tertutup agar tampil semua di layar.
Scrape Dokumentasi FilamentPHP 5.x — Menggunakan BeautifulSoup4 dan Markdownify 20260409184449907.png

Inspect element bagian sidebar dan copy-paste elemennya, lalu disimpan dalam file bernama FilamentPHP5DocsSidebar.html. Contoh element saat snippet ini dibuat: FilamentPHP5DocsSidebar.html

Ini dimasukkan dan dijalankan dalam satu folder.

File filament_doc_builder.py:

import os
import time
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urljoin, urlparse

# Configuration
SIDEBAR_FILE = 'FilamentPHP5DocsSidebar.html'  # sidebar HTML saved from the browser inspector
BASE_URL = 'https://filamentphp.com'           # base for resolving relative doc/image URLs
OUTPUT_MD = 'FilamentPHP_5_DocBook.md'         # single-file compiled markdown output
ASSETS_DIR = 'filament_assets'                 # local folder for downloaded images

# Create the assets directory up front. exist_ok=True makes this idempotent
# and avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(ASSETS_DIR, exist_ok=True)

def get_links_from_sidebar(sidebar_file=SIDEBAR_FILE, base_url=BASE_URL):
    """Extract unique documentation links from a saved sidebar HTML file.

    Args:
        sidebar_file: Path to the saved sidebar HTML (defaults to SIDEBAR_FILE).
        base_url: Base used to resolve relative hrefs (defaults to BASE_URL).

    Returns:
        A list of ``{'title': str, 'url': str}`` dicts in sidebar order with
        duplicate URLs removed, or an empty list if the file is missing.
    """
    if not os.path.exists(sidebar_file):
        print(f"Error: {sidebar_file} not found.")
        return []

    with open(sidebar_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    links = []
    seen_urls = set()  # O(1) dedupe instead of rescanning `links` per anchor
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if '/docs/5.x/' not in href:
            continue
        full_url = urljoin(base_url, href)
        if full_url not in seen_urls:
            seen_urls.add(full_url)
            links.append({'title': a_tag.get_text(strip=True), 'url': full_url})
    return links

def download_and_localize_image(img_url):
    """Download an image once and return its local relative path.

    The filename is derived from the URL path (slashes become underscores)
    so images from different pages don't collide. On any failure the
    original URL is returned so the markdown still renders a remote image.
    """
    try:
        parsed_url = urlparse(img_url)
        # Flatten the URL path into a single collision-resistant filename
        filename = parsed_url.path.strip('/').replace('/', '_')
        if not filename:
            # Edge case: URL path is '' or '/'; fall back to the host name
            filename = parsed_url.netloc.replace('.', '_')
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.svg', '.gif')):
            filename += ".jpg"

        local_path = os.path.join(ASSETS_DIR, filename)

        # Only download if we haven't already (makes re-runs cheap)
        if not os.path.exists(local_path):
            resp = requests.get(img_url, timeout=10)
            # Bug fix: without this, 404/500 HTML pages get saved as "images"
            resp.raise_for_status()
            with open(local_path, 'wb') as handler:
                handler.write(resp.content)

        return local_path
    except Exception as e:
        print(f"      ! Failed to download image {img_url}: {e}")
        return img_url

def build_doc_book():
    """Compile every sidebar-linked doc page into one markdown book.

    For each page: fetch HTML, strip duplicate dark-mode images, download
    and localize embedded images, absolutize internal links, convert to
    markdown, and append to OUTPUT_MD. Failures on a single page are
    logged and skipped so the rest of the book still builds.
    """
    links = get_links_from_sidebar()
    print(f"Found {len(links)} pages. Starting compilation...")

    with open(OUTPUT_MD, 'w', encoding='utf-8') as book:
        book.write("# FilamentPHP 5.x Reference Book\n\n")
        book.write(f"> Compiled on {time.ctime()}\n\n")

        for i, item in enumerate(links):
            print(f"[{i+1}/{len(links)}] Processing: {item['title']}")

            try:
                # timeout prevents a single dead connection from hanging
                # the whole build indefinitely
                res = requests.get(
                    item['url'],
                    headers={'User-Agent': 'Mozilla/5.0'},
                    timeout=30,
                )
                res.raise_for_status()
                soup = BeautifulSoup(res.text, 'html.parser')

                # Target the main content area
                content = soup.find('main') or soup.find(class_='prose')

                if not content:
                    print(f"   ! Content not found for {item['title']}")
                    continue

                # 1. CLEANUP: Remove duplicate Dark Mode images.
                # Filament hides these in spans with data-rmiz-content="not-found"
                for dark_span in content.find_all('span', {'data-rmiz-content': 'not-found'}):
                    dark_span.decompose()

                # 2. IMAGES: Download and fix paths
                for img in content.find_all('img'):
                    src = img.get('src')
                    if src:
                        full_img_url = urljoin(BASE_URL, src)
                        local_img_path = download_and_localize_image(full_img_url)
                        # Markdown expects forward slashes even on Windows
                        img['src'] = local_img_path.replace(os.sep, '/')

                # 3. LINKS: Ensure all internal links remain absolute so they work in MD
                for a in content.find_all('a', href=True):
                    if a['href'].startswith('/'):
                        a['href'] = urljoin(BASE_URL, a['href'])

                # 4. CONVERT TO MARKDOWN
                # We use ATX headers (###) to keep the structure clean
                markdown_text = md(str(content), heading_style="ATX")

                # Write to book
                book.write("\n\n---\n")
                book.write(f"# {item['title']}\n")
                book.write(f"**URL:** {item['url']}\n\n")
                book.write(markdown_text)
                book.write("\n\n")

            except Exception as e:
                print(f"   ! Error processing {item['url']}: {e}")

            # Pause to be respectful to the server
            time.sleep(1)

    print(f"\nSuccess! Your book is ready: {OUTPUT_MD}")
    print(f"All images are saved in: {ASSETS_DIR}")

# Entry point: only run the build when executed as a script, not on import.
if __name__ == "__main__":
    build_doc_book()
$ python3 -m venv venv
$ source venv/bin/activate
(venv) $ pip install requests beautifulsoup4 markdownify
(venv) $ python3 filament_doc_builder.py

Screenshot:
Scrape Dokumentasi FilamentPHP 5.x — Menggunakan BeautifulSoup4 dan Markdownify 20260409184449967.png