# html_to_bbcode.py - convert an HTML file to BBCode.
|
import os
import re
import sys
from urllib.parse import parse_qs, urlparse

from bs4 import (
    BeautifulSoup,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)
|
||
|
|
||
|
|
||
|
class HTMLToBBCode:
    """Convert an HTML document or fragment into BBCode markup.

    The HTML is parsed with BeautifulSoup and walked recursively; every node
    is turned into its BBCode equivalent by ``_process_tag``.

    Attributes:
        conversion_map: Maps simple HTML tag names to their
            ``[opening, closing]`` BBCode pair.
        url_replacements: Exact-match URL substitutions applied to link
            ``href`` and image ``src`` values.
    """

    # Inclusive upper bounds for BBCode sizes 1-6, per CSS unit. A value
    # above the last bound maps to size 7.
    _FONT_SIZE_STEPS = {
        'em': (0.8, 1, 1.5, 2, 2.5, 3),
        'pt': (8, 10, 12, 14, 16, 20),
    }
    _DEFAULT_FONT_SIZE = 3

    _FONT_SIZE_RE = re.compile(r'font-size:\s*(\d+(?:\.\d+)?)(pt|em)')
    # The lookbehind keeps "background-color: rgb(...)" from being read as
    # the text colour.
    _COLOR_RE = re.compile(
        r'(?<![\w-])color:\s*rgb\((\d+),\s*(\d+),\s*(\d+)\)'
    )
    _TEXT_ALIGN_CENTER_RE = re.compile(r'text-align:\s*center')
    # Content that consists of exactly one [IMG] or [MEDIA] element.
    _SINGLE_EMBED_RE = re.compile(
        r'^\[(?:IMG|MEDIA)[^\]]*\][^\[]*\[/(?:IMG|MEDIA)\]$'
    )
    _URL_PREFIX_RE = re.compile(r'(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//')

    def __init__(self):
        self.conversion_map = {
            'b': ['[b]', '[/b]'],
            'i': ['[i]', '[/i]'],
            'u': ['[u]', '[/u]'],
            'strong': ['[b]', '[/b]'],
            'em': ['[i]', '[/i]'],
            'strike': ['[s]', '[/s]'],
            'h1': ['[size=7]', '[/size]'],
            'h2': ['[size=6]', '[/size]'],
            'h3': ['[size=5]', '[/size]'],
            'h4': ['[size=4]', '[/size]'],
            'h5': ['[size=3]', '[/size]'],
            'h6': ['[size=2]', '[/size]'],
            'center': ['[center]', '[/center]'],
            'div': ['', ''],  # Handle div attributes separately
        }

        # URL replacements (from -> to)
        self.url_replacements = {
            'https://polymart.org/resource/vehiclesplus-1-12-1-20-2.633?purchase=1':
                'https://www.spigotmc.org/resources/vehiclesplus-1-12-1-20-2.70523/purchase',
            'https://polymart.org/resource?spigot_id=1997':
                'https://www.spigotmc.org/resources/protocollib.1997/',
            'https://polymart.org/resource?spigot_id=34315':
                'https://www.spigotmc.org/resources/vault.34315/',
            'https://polymart.org/resource?spigot_id=9089':
                'https://www.spigotmc.org/resources/essentialsx.9089/',
            'https://sbdevelopment.tech/images/buttons/buy2.png':
                'https://sbdevelopment.tech/images/buttons/buy.png',
            'https://sbdevelopment.tech/images/buttons/wiki2.png':
                'https://sbdevelopment.tech/images/buttons/wiki.png',
            'https://sbdevelopment.tech/images/buttons/discord2.png':
                'https://sbdevelopment.tech/images/buttons/discord.png',
            'https://sbdevelopment.tech/images/buttons/website2.png':
                'https://sbdevelopment.tech/images/buttons/website.png',
        }

    def _convert_font_size(self, size, unit):
        """Convert a CSS font size to the BBCode size scale 1-7.

        Args:
            size: Numeric size, as a number or numeric string. Decimal
                values such as ``"10.5"`` are accepted for every unit.
            unit: CSS unit, ``'em'`` or ``'pt'``.

        Returns:
            int: The BBCode size, or 3 (normal size) for any other unit.
        """
        steps = self._FONT_SIZE_STEPS.get(unit)
        if steps is None:
            return self._DEFAULT_FONT_SIZE

        # float() rather than int(): the style regex allows "10.5pt".
        value = float(size)
        for level, upper_bound in enumerate(steps, start=1):
            if value <= upper_bound:
                return level
        return 7

    @staticmethod
    def _convert_youtube_url(url):
        """Return a ``[MEDIA=youtube]`` tag for a YouTube video URL.

        Recognises ``youtube.com`` URLs carrying a ``v`` query parameter and
        ``youtu.be/<id>`` short links, with or without a scheme.

        Returns:
            str | None: The MEDIA tag, or ``None`` when the URL is not a
            YouTube URL or carries no video id (e.g. a channel page), so the
            caller can fall back to a plain link.
        """
        candidate = url.strip()
        # urlparse only fills in the host when the URL has a "//" authority
        # marker, so add one for scheme-less input like "youtube.com/...".
        if not HTMLToBBCode._URL_PREFIX_RE.match(candidate):
            candidate = f'//{candidate}'

        try:
            parts = urlparse(candidate)
            host = (parts.hostname or '').lower()
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): not a video link.
            return None

        if host == 'youtube.com' or host.endswith('.youtube.com'):
            video_id = parse_qs(parts.query).get('v', [''])[0]
        elif host in ('youtu.be', 'www.youtu.be'):
            video_id = parts.path.strip('/').split('/')[0]
        else:
            return None

        return f'[MEDIA=youtube]{video_id}[/MEDIA]' if video_id else None

    def convert(self, html_content):
        """Convert HTML markup to BBCode.

        Args:
            html_content (str): The HTML to convert.

        Returns:
            str: The BBCode, with surrounding whitespace stripped.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style elements entirely.
        for element in soup(['script', 'style']):
            element.decompose()

        # Remove comments and other non-content string nodes. A doctype
        # would otherwise be emitted as plain text ("html").
        non_content = (Comment, Doctype, Declaration, ProcessingInstruction)
        for node in soup.find_all(
                string=lambda text: isinstance(text, non_content)):
            node.extract()

        bbcode = self._process_tag(soup)

        # Exactly one newline before every [CENTER].
        bbcode = re.sub(r'\n+\[CENTER\]', '\n[CENTER]', bbcode)
        bbcode = re.sub(r'([^\n])\[CENTER\]', r'\1\n[CENTER]', bbcode)

        # Ensure [/CENTER] is followed by a newline.
        bbcode = re.sub(r'\[/CENTER\]([^\n])', r'[/CENTER]\n\1', bbcode)

        # Collapse runs of three or more newlines into one blank line.
        bbcode = re.sub(r'\n{3,}', '\n\n', bbcode)

        return bbcode.strip()

    def _wrap_with_style(self, content, style):
        """Wrap ``content`` in SIZE/COLOR tags derived from an inline style.

        Only ``font-size`` in pt/em and ``color: rgb(r, g, b)`` are
        recognised. Tags are properly nested:
        ``[SIZE=n][COLOR=#rrggbb]content[/COLOR][/SIZE]``.
        """
        opening = []
        closing = []

        size_match = self._FONT_SIZE_RE.search(style)
        if size_match:
            level = self._convert_font_size(*size_match.groups())
            opening.append(f'[SIZE={level}]')
            closing.append('[/SIZE]')

        color_match = self._COLOR_RE.search(style)
        if color_match:
            # Clamp to 255 so every channel stays a two-digit hex value.
            red, green, blue = (
                min(int(channel), 255) for channel in color_match.groups()
            )
            opening.append(f'[COLOR=#{red:02x}{green:02x}{blue:02x}]')
            closing.append('[/COLOR]')

        return ''.join(opening) + content + ''.join(reversed(closing))

    @staticmethod
    def _normalize_text(element):
        """Collapse whitespace in a text node, keeping one trailing space."""
        text = element.string if element.string else ''
        has_trailing_space = text.endswith(' ')
        text = ' '.join(text.split())
        return text + ' ' if has_trailing_space else text

    def _process_paragraph(self, element):
        """Convert a ``<p>`` element, applying inline style and spacing."""
        content = self._get_inner_content(element)
        style = element.get('style', '')
        if style:
            content = self._wrap_with_style(content, style)

        # No paragraph spacing for empty content or inside special containers.
        if not content.strip() or element.find_parent(['blockquote', 'center']):
            return content

        # A paragraph holding a single IMG/MEDIA element gets one newline.
        if self._SINGLE_EMBED_RE.match(content.strip()):
            return f'{content}\n'
        return f'{content}\n\n'

    def _process_link(self, element):
        """Convert an ``<a>`` element to MEDIA (YouTube) or URL BBCode."""
        href = element.get('href', '')
        if not href:
            return ''

        youtube_bbcode = self._convert_youtube_url(href)
        if youtube_bbcode:
            return youtube_bbcode

        href = self.url_replacements.get(href, href)
        content = self._get_inner_content(element)

        # Text nodes lose their leading whitespace during normalisation, so
        # re-add the space that separated the link from the following text.
        following = element.next_sibling
        has_space = (
            isinstance(following, NavigableString)
            and following.startswith(' ')
        )
        return f"[URL='{href}']{content}[/URL]" + (' ' if has_space else '')

    def _process_list(self, element):
        """Convert a ``<ul>``/``<ol>`` element and its direct ``<li>`` items."""
        lines = ['[LIST]' if element.name == 'ul' else '[LIST=1]']
        lines.extend(
            '[*]' + self._get_inner_content(item).strip()
            for item in element.find_all('li', recursive=False)
        )
        lines.append('[/LIST]')
        return '\n'.join(lines) + '\n'

    @staticmethod
    def _process_pre(element):
        """Convert ``<pre><code>`` to a [code] (or [html]) block, verbatim."""
        code = element.find('code')
        if code is None:
            return ''

        # Raw strings are joined untouched to preserve the original layout.
        code_content = ''.join(code.strings)
        if not code_content:
            return ''

        if 'language-markup' in element.get('class', []):
            return f'[html]\n{code_content}\n[/html]'
        return f'[code]\n{code_content}\n[/code]'

    def _process_tag(self, element):
        """Recursively convert one parse-tree node to BBCode.

        Args:
            element: A BeautifulSoup ``NavigableString`` or ``Tag``.

        Returns:
            str: The BBCode for the node and its descendants.
        """
        if isinstance(element, NavigableString):
            return self._normalize_text(element)

        # Skip elements with the hidden-bbcode class.
        if isinstance(element, Tag) and 'hidden-bbcode' in element.get('class', []):
            return ''

        name = element.name

        if name == 'br':
            return '\n'

        # NOTE(review): a special case for paragraphs containing an
        # 'externalLink ProxyLink' anchor used to follow the generic <p>
        # handling. It could never run because that handling always
        # returns, so it was removed; behaviour is unchanged.
        if name == 'p':
            return self._process_paragraph(element)

        if name == 'span':
            return self._wrap_with_style(
                self._get_inner_content(element), element.get('style', '')
            )

        # Centered divs; every other div falls through to conversion_map.
        if name == 'div' and self._TEXT_ALIGN_CENTER_RE.search(
                element.get('style', '')):
            inner_content = self._get_inner_content(element).strip()
            if inner_content:
                return f'[CENTER]\n{inner_content}\n[/CENTER]'
            return ''

        # Blockquotes keep only their text, one stripped string per line.
        if name == 'blockquote':
            quoted = '\n'.join(element.stripped_strings)
            return f'[QUOTE]\n{quoted}\n[/QUOTE]'

        if name == 'a':
            return self._process_link(element)

        if name == 'img':
            src = element.get('src', '')
            if not src:
                return ''
            return f'[IMG]{self.url_replacements.get(src, src)}[/IMG]'

        if name in ('ul', 'ol'):
            return self._process_list(element)

        if name == 'pre':
            return self._process_pre(element)

        # Basic formatting tags; unknown tags contribute only their content.
        content = self._get_inner_content(element)
        markup = self.conversion_map.get(name)
        if markup is not None and content.strip():
            opening, closing = markup
            return f'{opening}{content}{closing}'
        return content

    def _get_inner_content(self, element):
        """Return the concatenated BBCode of all direct children."""
        return ''.join(self._process_tag(child) for child in element.children)
|
||
|
|
||
|
|
||
|
def convert_html_to_bbcode(html_content):
    """
    Translate an HTML string into BBCode using a fresh HTMLToBBCode.

    Args:
        html_content (str): HTML markup to translate

    Returns:
        str: The resulting BBCode text
    """
    return HTMLToBBCode().convert(html_content)
|
||
|
|
||
|
|
||
|
def convert_file(input_file):
    """
    Convert an HTML file to BBCode and save it with a .bbcode extension.

    The output is written next to the input, with the trailing ``.html``
    replaced by ``.bbcode``. Progress and errors are reported on stdout.

    Args:
        input_file (str): Path to the HTML file. The ``.html`` extension is
            matched case-insensitively, so ``PAGE.HTML`` is accepted.

    Returns:
        bool: True when the file was converted and written, False when the
        extension is wrong or reading, converting or writing failed.
    """
    extension = '.html'
    if not input_file.lower().endswith(extension):
        print(f"Error: Input file '{input_file}' must have .html extension")
        return False

    # Swap the extension for .bbcode, whatever its letter case.
    output_file = input_file[:-len(extension)] + '.bbcode'

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        bbcode = convert_html_to_bbcode(html_content)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(bbcode)
    except Exception as e:
        # Command-line boundary: report any failure and signal it to the
        # caller through the return value instead of a traceback.
        print(f"Error converting file: {str(e)}")
        return False

    print(f"Successfully converted '{input_file}' to '{output_file}'")
    return True
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the HTML file path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python html_to_bbcode.py <input_file.html>")
        sys.exit(1)

    input_file = cli_args[0]
    if not os.path.exists(input_file):
        print(f"Error: File '{input_file}' does not exist")
        sys.exit(1)

    # Exit status mirrors the conversion result: 0 on success, 1 on failure.
    exit_code = 0 if convert_file(input_file) else 1
    sys.exit(exit_code)
|