Converting a WordPress blog to Markdown for Astro

Our original blog has been active since 2004. During this time we originally used blogger.com, then converted to WordPress, and completely abandoned it in 2015. I found an old MySQL backup of the blog and wanted to check whether I could convert each post to a Markdown file and then inject some YAML front matter to get it to work in our new Astro design. Again, this gets back to the philosophy that data should be future-proof. The code below is super simple: it just uses the MySQL Python client and markdownify.

import mysql.connector
import markdownify
import os
import re

def slugify(value):
    """
    Turn a post title into a filename-safe slug.

    Drops every character that is not a word character, whitespace or a
    hyphen, lowercases the result, collapses runs of whitespace, underscores
    and hyphens into a single hyphen, and trims hyphens from both ends.
    Slugs that start with a digit are prefixed with ``post-``.

    Args:
        value: The title to convert.

    Returns:
        The slug, or ``'untitled'`` when nothing usable is left (empty title
        or a title made only of punctuation).
    """
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # Trim the separators so a title such as "_draft_" does not yield "-draft-".
    value = re.sub(r'[\s_-]+', '-', value).strip('-')
    if not value:
        # Indexing value[0] below would raise IndexError on an empty slug.
        return 'untitled'
    if value[0].isdigit():
        value = f'post-{value}'
    return value

# Connect to your WordPress database (replace the placeholder credentials)
cnx = mysql.connector.connect(
    user='your_username',
    password='your_password',
    host='your_host',
    database='your_database'
)

# Query to retrieve published blog posts together with their author names
query = """
SELECT p.post_title, p.post_content, p.post_name, p.post_date, u.display_name 
FROM wp_posts p 
JOIN wp_users u ON p.post_author = u.ID
WHERE p.post_type = 'post' AND p.post_status = 'publish'
"""

# try/finally guarantees the cursor and the connection are released even
# when executing the query or fetching the rows raises.
try:
    # Create a cursor to execute queries
    cursor = cnx.cursor()
    try:
        # Execute the query and fetch all the rows into memory so the
        # connection is no longer needed afterwards
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    cnx.close()

# Create a directory to store the Markdown files (no error if it already exists)
markdown_dir = 'markdown_files'
os.makedirs(markdown_dir, exist_ok=True)


def _yaml_single_quoted(text):
    """Return text as a YAML single-quoted scalar, doubling embedded quotes."""
    # Inside a single-quoted YAML scalar the only escape is '' for a literal '.
    return "'" + str(text).replace("'", "''") + "'"


# Loop through each blog entry; `slug` (the WordPress post_name) is unpacked
# but not used because the filename is derived from the title instead.
for title, content, slug, date, author in rows:
    # Convert HTML content to Markdown using markdownify
    md_content = markdownify.markdownify(content, heading_style='ATX')

    # Build the YAML front matter. Title and author are escaped so that an
    # apostrophe (e.g. "It's done") cannot terminate the quoted value early.
    yaml_title = _yaml_single_quoted(title)
    yaml_block = (
        '---\n'
        f'title: {yaml_title}\n'
        f'description: {yaml_title}\n'
        f"pubDate: '{date.strftime('%b %d %Y')}'\n"
        "heroImage: '/img/blog/og_logo.png'\n"
        f'author: {_yaml_single_quoted(author)}\n'
        '---\n'
    )

    # Write the Markdown content to a file named after the slugified title.
    # UTF-8 is set explicitly so the output does not depend on the platform's
    # default encoding.
    file_slug = slugify(title)
    with open(os.path.join(markdown_dir, f'{file_slug}.md'), 'w', encoding='utf-8') as f:
        f.write(yaml_block + f'\n\n# {title}\n\n' + md_content)  # Title right after YAML block

Another interesting problem I ran into was updating the heroImage if a post had an image of its own. Here is my Python script for it:

import re
import os
import requests

def replace_hero_image(file_path, timeout=10):
    """
    Point a post's ``heroImage`` front-matter entry at its first inline image.

    Reads the Markdown file, finds the first ``![alt](url)`` image, checks
    with an HTTP HEAD request that the URL answers with status 200 and, if
    so, rewrites the first quoted ``heroImage:`` value in the file to that
    URL. The file is left untouched when it has no image, the URL cannot be
    fetched, the status is not 200, or there is no ``heroImage`` entry.
    Progress is reported with ``print``.

    Args:
        file_path: Path of the Markdown file to update in place.
        timeout: Seconds to wait for the HEAD request before giving up.

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Regex to find image URLs, ensuring it only matches images; an optional
    # quoted title after the URL is skipped.
    images = re.findall(r'!\[.*?\]\((.*?)\s*(?:["\'].*?["\'])?\)', content)
    if not images:
        print(f"No images found in {file_path}")
        return

    first_image_url = images[0]
    print(f"Checking image URL: {first_image_url}")  # Debug output

    try:
        # HEAD request to check that the image URL works. Without a timeout
        # a single unresponsive host would stall the whole run.
        response = requests.head(first_image_url, allow_redirects=True, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve image URL {first_image_url}: {e}")
        return

    if response.status_code != 200:
        print(f"Image URL returned {response.status_code}, not updating heroImage.")
        return

    print(f"Image URL is valid: {first_image_url}")

    # Replace the heroImage value. A function is used as the replacement so
    # backslashes or digits in the URL are never read as regex escapes or
    # group references.
    new_content, replaced = re.subn(
        r'(heroImage:\s*["\'])(.*?)(["\'])',
        lambda match: match.group(1) + first_image_url + match.group(3),
        content,
        count=1,
    )
    if not replaced:
        print(f"No heroImage entry found in {file_path}, nothing to update.")
        return

    hero_line = re.search(r'heroImage:.*', new_content).group()
    print(f"New heroImage line: {hero_line}")  # Debug output

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(new_content)
    print(f"Updated heroImage in {file_path}")

# Walk the current working directory and update every Markdown file in it
for entry in os.listdir('.'):
    if not entry.endswith('.md'):
        continue  # skip anything that is not a Markdown post
    replace_hero_image(entry)

I put this on GitHub, and contributions are welcome.