# VehiclesPlus/Description/html_to_bbcode.py
#
# Repository viewer metadata: 358 lines, 13 KiB, Python.
# Last change: 2025-02-12 19:31:52 +01:00
import os
import re
import sys
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
class HTMLToBBCode:
    """Convert an HTML document or fragment into BBCode.

    The converter walks the BeautifulSoup tree recursively
    (``_process_tag`` / ``_get_inner_content``) and emits BBCode for the tags
    it knows about; unknown tags contribute only their children's output.
    """

    # Inline CSS properties that have a BBCode equivalent.
    _FONT_SIZE_RE = re.compile(r'font-size:\s*(\d+(?:\.\d+)?)(pt|em)')
    # The lookbehind keeps "background-color:" (or any other "*-color:"
    # property) from being mistaken for the text colour.
    _COLOR_RE = re.compile(r'(?<![\w-])color:\s*rgb\((\d+),\s*(\d+),\s*(\d+)\)')
    _CENTER_STYLE_RE = re.compile(r'text-align:\s*center')

    def __init__(self):
        # Tag name -> [opening BBCode, closing BBCode] for simple wrappers.
        self.conversion_map = {
            'b': ['[b]', '[/b]'],
            'i': ['[i]', '[/i]'],
            'u': ['[u]', '[/u]'],
            'strong': ['[b]', '[/b]'],
            'em': ['[i]', '[/i]'],
            'strike': ['[s]', '[/s]'],
            'h1': ['[size=7]', '[/size]'],
            'h2': ['[size=6]', '[/size]'],
            'h3': ['[size=5]', '[/size]'],
            'h4': ['[size=4]', '[/size]'],
            'h5': ['[size=3]', '[/size]'],
            'h6': ['[size=2]', '[/size]'],
            'center': ['[center]', '[/center]'],
            'div': ['', ''],  # Handle div attributes separately
        }
        # URL replacements (from -> to), applied to link hrefs and image srcs.
        self.url_replacements = {
            'https://polymart.org/resource/vehiclesplus-1-12-1-20-2.633?purchase=1':
                'https://www.spigotmc.org/resources/vehiclesplus-1-12-1-20-2.70523/purchase',
            'https://polymart.org/resource?spigot_id=1997':
                'https://www.spigotmc.org/resources/protocollib.1997/',
            'https://polymart.org/resource?spigot_id=34315':
                'https://www.spigotmc.org/resources/vault.34315/',
            'https://polymart.org/resource?spigot_id=9089':
                'https://www.spigotmc.org/resources/essentialsx.9089/',
            'https://sbdevelopment.tech/images/buttons/buy2.png':
                'https://sbdevelopment.tech/images/buttons/buy.png',
            'https://sbdevelopment.tech/images/buttons/wiki2.png':
                'https://sbdevelopment.tech/images/buttons/wiki.png',
            'https://sbdevelopment.tech/images/buttons/discord2.png':
                'https://sbdevelopment.tech/images/buttons/discord.png',
            'https://sbdevelopment.tech/images/buttons/website2.png':
                'https://sbdevelopment.tech/images/buttons/website.png',
        }

    def _convert_font_size(self, size, unit):
        """Map a CSS font size onto the BBCode size scale 1-7.

        Args:
            size: Numeric size as a string or number (decimals allowed).
            unit: ``'em'`` or ``'pt'``; any other unit yields the default.

        Returns:
            int: BBCode size between 1 and 7; 3 for unsupported units.
        """
        # Upper bounds (inclusive) for BBCode sizes 1..6; anything above is 7.
        thresholds = {
            'em': (0.8, 1, 1.5, 2, 2.5, 3),
            'pt': (8, 10, 12, 14, 16, 20),
        }.get(unit)
        if thresholds is None:
            return 3  # Default to normal size

        # float() rather than int(): the style regex admits values such as
        # "10.5pt", which int() would reject with ValueError.
        value = float(size)
        for level, upper_bound in enumerate(thresholds, start=1):
            if value <= upper_bound:
                return level
        return 7

    @staticmethod
    def _convert_youtube_url(url):
        """Return a ``[MEDIA=youtube]`` tag for a YouTube video URL.

        Args:
            url: The link target to inspect.

        Returns:
            str | None: The media BBCode, or ``None`` when *url* is not a
            YouTube link or carries no recognisable video id (for example a
            channel page), so the caller can fall back to a plain link.
        """
        if 'youtube.com' in url:
            # watch?v=ID first, then the /embed/ID and /shorts/ID path forms.
            found = (re.search(r'[?&]v=([^&#]+)', url)
                     or re.search(r'/(?:embed|shorts)/([^/?#&]+)', url))
        elif 'youtu.be' in url:
            found = re.search(r'youtu\.be/([^/?#&]+)', url)
        else:
            return None
        if not found:
            return None
        return f'[MEDIA=youtube]{found.group(1)}[/MEDIA]'

    def convert(self, html_content):
        """Convert *html_content* (an HTML string) to BBCode.

        Script/style elements and HTML comments are removed before the tree
        is converted; the result is whitespace-normalised and stripped.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove script and style elements
        for element in soup(['script', 'style']):
            element.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        bbcode = self._process_tag(soup)

        # Ensure exactly one newline before [CENTER]
        bbcode = re.sub(r'\n+\[CENTER\]', '\n[CENTER]', bbcode)
        bbcode = re.sub(r'([^\n])\[CENTER\]', r'\1\n[CENTER]', bbcode)
        # Ensure [/CENTER] is followed by a newline
        bbcode = re.sub(r'\[/CENTER\]([^\n])', r'[/CENTER]\n\1', bbcode)
        # Collapse runs of three or more newlines into one blank line
        bbcode = re.sub(r'\n{3,}', '\n\n', bbcode)
        return bbcode.strip()

    def _apply_inline_style(self, content, style):
        """Wrap *content* in SIZE/COLOR tags derived from an inline style.

        Only ``font-size`` in pt/em and ``color: rgb(r, g, b)`` are
        recognised. Tags are emitted properly nested, e.g.
        ``[SIZE=3][COLOR=#ff0000]text[/COLOR][/SIZE]``.
        """
        if not style:
            return content

        openers = []
        closers = []
        size_match = self._FONT_SIZE_RE.search(style)
        if size_match:
            level = self._convert_font_size(size_match.group(1), size_match.group(2))
            openers.append(f'[SIZE={level}]')
            closers.append('[/SIZE]')
        color_match = self._COLOR_RE.search(style)
        if color_match:
            red, green, blue = (int(part) for part in color_match.groups())
            openers.append(f'[COLOR=#{red:02x}{green:02x}{blue:02x}]')
            closers.append('[/COLOR]')
        # Close in the reverse of the opening order so the tags nest.
        return ''.join(openers) + content + ''.join(reversed(closers))

    def _process_tag(self, element):
        """Return the BBCode for *element* (a text node or a tag)."""
        if isinstance(element, NavigableString):
            text = str(element)
            # Collapse all whitespace runs; leading whitespace is dropped but
            # a single trailing space is kept so following inline content
            # does not run into this text.
            keep_trailing_space = text.endswith(' ')
            text = ' '.join(text.split())
            if keep_trailing_space:
                text += ' '
            return text

        # Skip elements with hidden-bbcode class
        if isinstance(element, Tag) and 'hidden-bbcode' in element.get('class', []):
            return ''

        name = element.name

        if name == 'br':
            return '\n'

        # Paragraphs.
        # NOTE(review): the original had a later "warning paragraph" branch
        # (<p> containing a.externalLink.ProxyLink, rendered orange). It could
        # never run because every <p> returns here, so it was dropped; add it
        # above this branch if that highlighting is actually wanted.
        if name == 'p':
            content = self._apply_inline_style(
                self._get_inner_content(element), element.get('style', ''))
            # Only add newlines if there's actual content and we're not in a
            # special container
            if content.strip() and not element.find_parent(['blockquote', 'center']):
                # A paragraph holding just one IMG/MEDIA tag gets a single
                # newline instead of a blank line.
                if re.match(r'^\[(?:IMG|MEDIA)[^\]]*\][^\[]*\[/(?:IMG|MEDIA)\]$', content.strip()):
                    return f'{content}\n'
                return f'{content}\n\n'
            return content

        # Styled inline elements
        if name == 'span':
            return self._apply_inline_style(
                self._get_inner_content(element), element.get('style', ''))

        # Centered divs; other divs fall through to the conversion map.
        if name == 'div' and self._CENTER_STYLE_RE.search(element.get('style', '')):
            inner_content = self._get_inner_content(element).strip()
            if inner_content:
                # Both opening and closing tags on their own lines
                return f'[CENTER]\n{inner_content}\n[/CENTER]'
            return ''

        # Blockquotes: raw text only, one stripped string per line
        if name == 'blockquote':
            quoted = '\n'.join(element.stripped_strings)
            return f'[QUOTE]\n{quoted}\n[/QUOTE]'

        # Links
        if name == 'a':
            href = element.get('href', '')
            if not href:
                return ''
            # YouTube links become embedded media instead of a URL tag
            youtube_bbcode = self._convert_youtube_url(href)
            if youtube_bbcode:
                return youtube_bbcode
            href = self.url_replacements.get(href, href)
            content = self._get_inner_content(element)
            # Text nodes lose their leading whitespace, so re-add the space
            # that separated the link from the text following it.
            next_sibling = element.next_sibling
            has_space = (isinstance(next_sibling, NavigableString)
                         and next_sibling.startswith(' '))
            # Use single quotes for URLs and preserve space
            return f"[URL='{href}']{content}[/URL]" + (' ' if has_space else '')

        # Images
        if name == 'img':
            src = element.get('src', '')
            if not src:
                return ''
            src = self.url_replacements.get(src, src)
            return f'[IMG]{src}[/IMG]'

        # Lists: only direct <li> children become items
        if name in ('ul', 'ol'):
            opener = '[LIST]' if name == 'ul' else '[LIST=1]'
            items = ''.join(
                f'[*]{self._get_inner_content(item).strip()}\n'
                for item in element.find_all('li', recursive=False)
            )
            return f'{opener}\n{items}[/LIST]\n'

        # pre/code blocks: the code text is emitted verbatim
        if name == 'pre':
            code = element.find('code')
            if code:
                code_content = ''.join(code.strings)
                if code_content:
                    if 'language-markup' in element.get('class', []):
                        return f'[html]\n{code_content}\n[/html]'
                    return f'[code]\n{code_content}\n[/code]'
            return ''

        # Basic formatting; tags with blank content and all unknown tags
        # contribute only their children's output.
        content = self._get_inner_content(element)
        if name in self.conversion_map and content.strip():
            opening, closing = self.conversion_map[name]
            return f'{opening}{content}{closing}'
        return content

    def _get_inner_content(self, element):
        """Return the concatenated BBCode of all children of *element*."""
        return ''.join(self._process_tag(child) for child in element.children)
def convert_html_to_bbcode(html_content):
    """Translate an HTML string into BBCode.

    Convenience wrapper that builds a fresh ``HTMLToBBCode`` converter for
    each call and runs its ``convert`` method.

    Args:
        html_content (str): The HTML markup to translate.

    Returns:
        str: The BBCode produced for ``html_content``.
    """
    return HTMLToBBCode().convert(html_content)
def convert_file(input_file):
    """Convert an HTML file and write the result next to it as ``.bbcode``.

    The output path is the input path with its ``.html`` suffix replaced by
    ``.bbcode``. Progress and errors are reported with ``print``.

    Args:
        input_file (str): Path to the HTML file; must end in ``.html``.

    Returns:
        bool: ``True`` when the output file was written, ``False`` when the
        extension is wrong or any exception occurred during conversion.
    """
    html_suffix = '.html'
    if not input_file.endswith(html_suffix):
        print(f"Error: Input file '{input_file}' must have .html extension")
        return False

    # Swap the .html suffix for .bbcode
    output_file = f"{input_file[:-len(html_suffix)]}.bbcode"
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            markup = source.read()
        converted = convert_html_to_bbcode(markup)
        with open(output_file, 'w', encoding='utf-8') as target:
            target.write(converted)
        print(f"Successfully converted '{input_file}' to '{output_file}'")
        return True
    except Exception as error:
        # Top-level boundary: report the failure and signal it via the
        # return value instead of propagating.
        print(f"Error converting file: {error!s}")
        return False
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the path of
    # the HTML file to convert. Exits with 0 on success and 1 otherwise.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python html_to_bbcode.py <input_file.html>")
        sys.exit(1)

    (input_file,) = cli_args
    if not os.path.exists(input_file):
        print(f"Error: File '{input_file}' does not exist")
        sys.exit(1)

    # convert_file reports its own errors; only the exit status is set here.
    success = convert_file(input_file)
    if success:
        sys.exit(0)
    sys.exit(1)