Converting a WordPress blog to Markdown for Astro
Our original blog has been active since 2004. During that time we first used blogger.com, then converted to WordPress, and completely abandoned it in 2015. I found an old MySQL backup of the blog and wanted to check whether I could convert each post to a Markdown file and then inject some YAML front matter to get it to work in our new Astro design. Again, this gets back to the philosophy that data should be future-proof. The code below is super simple: it just uses the MySQL Python client and markdownify.
import mysql.connector
import markdownify
import os
import re
def slugify(value):
    """
    Turn a post title into a filename-friendly slug.

    Characters other than word characters, whitespace and hyphens are
    removed, the result is stripped and lowercased, and every run of
    whitespace, underscores or hyphens is collapsed into a single hyphen.
    A slug that starts with a digit is prefixed with ``post-``.

    Returns an empty string when nothing usable is left (for example an
    empty title or one made only of punctuation) instead of raising.
    """
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[\s_-]+', '-', value)
    # The emptiness check must come first: indexing value[0] on an empty
    # string raises IndexError.
    if value and value[0].isdigit():
        value = f'post-{value}'
    return value
# Connect to your WordPress database
cnx = mysql.connector.connect(
    user='your_username',
    password='your_password',
    host='your_host',
    database='your_database'
)

# Published posts joined with the display name of their author
query = """
SELECT p.post_title, p.post_content, p.post_name, p.post_date, u.display_name
FROM wp_posts p
JOIN wp_users u ON p.post_author = u.ID
WHERE p.post_type = 'post' AND p.post_status = 'publish'
"""

# Fetch every row up front so the connection can be released before any
# conversion work starts; try/finally closes the cursor and the connection
# even when the query itself fails.
try:
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    cnx.close()

# Create a directory to store the Markdown files (no error if it exists)
markdown_dir = 'markdown_files'
os.makedirs(markdown_dir, exist_ok=True)

# Loop through each blog entry.
# NOTE: `slug` (post_name) is selected but not used; the filename is
# derived from the title instead.
for title, content, slug, date, author in rows:
    # Convert HTML content to Markdown using markdownify
    md_content = markdownify.markdownify(content, heading_style='ATX')

    # Inside a YAML single-quoted scalar a literal quote is written as two
    # quotes; without this, a title such as "What's new" breaks the front
    # matter.
    yaml_title = title.replace("'", "''")
    yaml_author = author.replace("'", "''")
    pub_date = date.strftime('%b %d %Y')

    # YAML front matter block with dynamic values
    yaml_block = (
        "---\n"
        f"title: '{yaml_title}'\n"
        f"description: '{yaml_title}'\n"
        f"pubDate: '{pub_date}'\n"
        "heroImage: '/img/blog/og_logo.png'\n"
        f"author: '{yaml_author}'\n"
        "---\n"
    )

    # Generate a slug from the title for the filename
    file_slug = slugify(title)

    # Write explicitly as UTF-8 so non-ASCII post content does not depend
    # on the platform's default encoding.
    output_path = os.path.join(markdown_dir, f'{file_slug}.md')
    with open(output_path, 'w', encoding='utf-8') as f:
        # Title heading goes right after the YAML block
        f.write(yaml_block + f'\n\n# {title}\n\n' + md_content)
Another interesting problem I ran into was updating the heroImage when a post contained an image. Here is my Python script for it:
import re
import os
import requests
def replace_hero_image(file_path):
    """
    Point the ``heroImage`` front-matter entry at the post's first image.

    Reads the Markdown file at ``file_path``, extracts the URL of the first
    Markdown image (``![alt](url "optional title")``), and checks it with an
    HTTP HEAD request. Only when the request returns status 200 is the
    first quoted ``heroImage:`` value replaced with that URL and the file
    rewritten in place. Progress and failures are reported with ``print``;
    the function returns ``None`` in every case.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Regex to find image URLs, ignoring an optional quoted title
    images = re.findall(r'!\[.*?\]\((.*?)\s*(?:["\'].*?["\'])?\)', content)
    if not images:
        print(f"No images found in {file_path}")
        return

    first_image_url = images[0]
    print(f"Checking image URL: {first_image_url}")  # Debug output

    try:
        # HEAD request to check that the image URL works. The timeout keeps
        # one unresponsive host from stalling the whole run.
        response = requests.head(first_image_url, allow_redirects=True, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve image URL {first_image_url}: {e}")
        return

    if response.status_code != 200:
        print(f"Image URL returned {response.status_code}, not updating heroImage.")
        return

    print(f"Image URL is valid: {first_image_url}")

    # Use a replacement function rather than a replacement template so that
    # backslashes or group-like sequences in the URL are inserted literally.
    new_content, replaced = re.subn(
        r'(heroImage:\s*["\'])(.*?)(["\'])',
        lambda match: match.group(1) + first_image_url + match.group(3),
        content,
        count=1,
    )
    if not replaced:
        # Nothing to rewrite; avoid claiming an update that did not happen.
        print(f"No quoted heroImage entry found in {file_path}, not updating.")
        return

    print(f"New heroImage line: {re.search(r'heroImage:.*', new_content).group()}")  # Debug output
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(new_content)
    print(f"Updated heroImage in {file_path}")
# Collect the Markdown files in the current directory, then refresh the
# heroImage entry of each one in listing order.
markdown_names = [entry for entry in os.listdir('.') if entry.endswith('.md')]
for markdown_name in markdown_names:
    replace_hero_image(markdown_name)
I put this on GitHub, and contributions are welcome.
By: Gavi Narra on: