```python
import os
import re
import json
import yaml # Ensure you have PyYAML installed: pip install pyyaml
# Frontmatter block: an opening '---' line, the YAML body, then a closing '---'
# line. The closing delimiter may be followed by a newline OR by end of input,
# so a file that ends right after its frontmatter is still recognised.
_FRONTMATTER_RE = re.compile(r'^---\s*\n(.*?)\n---\s*(?:\n|\Z)', re.DOTALL)

# Templating placeholders such as {{ page.title }} that can make YAML unparseable.
_TEMPLATE_MARKER_RE = re.compile(r'{{.*?}}')


def extract_frontmatter(file_content, file_path):
    """
    Extract YAML frontmatter from the text of a Markdown file.

    The frontmatter must start on the very first line and be enclosed between
    two '---' lines. If the YAML fails to parse, templating markers of the
    form {{...}} are stripped and parsing is retried once.

    Args:
        file_content: Full text of the Markdown file.
        file_path: Path of the file; used only in the error message.

    Returns:
        The parsed frontmatter as a dict. An empty dict is returned when there
        is no frontmatter block, when the YAML does not parse even after
        sanitization, or when the parsed value is not a mapping.
    """
    match = _FRONTMATTER_RE.match(file_content)
    if match is None:
        return {}

    fm_content = match.group(1)
    try:
        frontmatter = yaml.safe_load(fm_content)
    except yaml.YAMLError:
        # First attempt failed: drop templating markers and retry once.
        sanitized = _TEMPLATE_MARKER_RE.sub('', fm_content)
        try:
            frontmatter = yaml.safe_load(sanitized)
        except yaml.YAMLError as e2:
            print(f"Error parsing YAML in file '{file_path}' after sanitization: {e2}")
            return {}

    # safe_load may yield a scalar, list or None; callers expect a mapping.
    return frontmatter if isinstance(frontmatter, dict) else {}
def scan_directory(directory, permalink_exclude_regex, exclude_paths, include_paths):
    """
    Walk a directory tree and collect the permalinks declared in Markdown files.

    Filtering rules:
    - A file is skipped if its full path contains any substring in exclude_paths.
    - If include_paths is non-empty, a file is processed only when its full
      path contains at least one of those substrings.
    - Only files ending in ".md" are read.
    - Files without a permalink in their frontmatter are ignored.

    Args:
        directory: Root directory to walk.
        permalink_exclude_regex: Regex source; permalinks it matches (via
            search) go into the excluded list instead of the included list.
        exclude_paths: Iterable of path substrings to exclude (None = none).
        include_paths: Iterable of path substrings to require (None = no
            restriction).

    Returns:
        A tuple (included_permalinks, excluded_permalinks) of lists. Both are
        empty if permalink_exclude_regex is not a valid regular expression.
    """
    included_permalinks = []
    excluded_permalinks = []
    try:
        permalink_exclude_pattern = re.compile(permalink_exclude_regex)
    except re.error as e:
        print(f"Invalid regex pattern '{permalink_exclude_regex}': {e}")
        return included_permalinks, excluded_permalinks

    # Treat None as "no filter" so callers may omit either list.
    exclude_paths = exclude_paths or []
    include_paths = include_paths or []

    for root, dirs, files in os.walk(directory):
        # Prune excluded directories in place so os.walk never descends into
        # them: every file below would contain the same excluded substring.
        # Include substrings are deliberately NOT used for pruning. A parent
        # directory (e.g. "docs/guides") need not contain the include
        # substring (e.g. "guides/api") even though files beneath it do, so
        # pruning on includes would hide matching files.
        dirs[:] = [
            d for d in dirs
            if not any(excl in os.path.join(root, d) for excl in exclude_paths)
        ]

        for file in files:
            if not file.endswith(".md"):
                continue
            file_path = os.path.join(root, file)
            if include_paths and not any(inc in file_path for inc in include_paths):
                continue
            if any(excl in file_path for excl in exclude_paths):
                continue

            try:
                # utf-8-sig drops a leading BOM, which would otherwise stop
                # the frontmatter's opening '---' from matching at position 0.
                with open(file_path, 'r', encoding='utf-8-sig') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                print(f"Error reading file '{file_path}': {e}")
                continue

            frontmatter = extract_frontmatter(content, file_path)
            permalink = frontmatter.get('permalink')
            if not permalink:
                continue
            if not isinstance(permalink, str):
                # YAML can yield numbers or lists here; the regex needs text.
                print(f"Skipping non-string permalink {permalink!r} in file '{file_path}'")
                continue

            if permalink_exclude_pattern.search(permalink):
                excluded_permalinks.append(permalink)
            else:
                included_permalinks.append(permalink)

    return included_permalinks, excluded_permalinks
def main():
# Enter multiple directory paths separated by commas (e.g., "folder1, folder2")
directories_input = input("Enter the directory paths to scan (separated by commas): ").strip()
directories = [d.strip() for d in directories_input.split(',') if d.strip()]
default_regex = r'.*\.(namzu|sud|enigmas)