└── scrape_loras.py


/scrape_loras.py:
--------------------------------------------------------------------------------
  1 | from huggingface_hub import HfApi, list_models
  2 | from datasets import Dataset
  3 | import json
  4 | import logging
  5 | import re
  6 | 
# Configure the root logger at INFO level so the progress messages logged
# below are emitted, then create the logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
 10 | 
 11 | def fetch_lora_models():
 12 |     """Fetch LoRA models from Hugging Face that match the criteria."""
 13 |     logger.info("Fetching LoRA models from Hugging Face Hub...")
 14 |     
 15 |     # Initialize the Hugging Face API
 16 |     api = HfApi()
 17 |     
 18 |     # Search for models with LoRA in the tags or description
 19 |     # Focus on models for SDXL Flux
 20 |     models = list_models(
 21 |         filter="lora",  # Look for LoRA models
 22 |         tags=["flux"],  # Filter for flux tagged models
 23 |         cardData=True   # Get additional card data
 24 |     )
 25 |     
 26 |     logger.info(f"Found {len(models)} potential models")
 27 |     
 28 |     lora_data = []
 29 |     for model in models:
 30 |         try:
 31 |             # Get model card data to extract more information
 32 |             model_info = {
 33 |                 "repo": model.id,
 34 |                 "title": model.id.split("/")[-1].replace("-", " ").title()
 35 |             }
 36 |             
 37 |             # Try to get image example from model card metadata
 38 |             image_url = None
 39 |             if hasattr(model, "cardData") and model.cardData:
 40 |                 card_data = model.cardData
 41 |                 
 42 |                 # Extract trigger words from model card if available
 43 |                 if "tags" in card_data and card_data["tags"]:
 44 |                     potential_triggers = [tag for tag in card_data["tags"] if "trigger" in tag or "prompt" in tag]
 45 |                     if potential_triggers:
 46 |                         model_info["trigger_word"] = potential_triggers[0].split(":")[-1].strip()
 47 |                 
 48 |                 # Try to find example image URL
 49 |                 if "widgets" in card_data:
 50 |                     for widget in card_data["widgets"]:
 51 |                         if widget.get("type") == "image" and "src" in widget:
 52 |                             image_url = widget["src"]
 53 |                             break
 54 |             
 55 |             # If no image found in metadata, try to get first image from repo
 56 |             if not image_url:
 57 |                 try:
 58 |                     files = api.list_repo_files(model.id)
 59 |                     image_files = [f for f in files if f.endswith(('.png', '.jpg', '.jpeg', '.webp'))]
 60 |                     if image_files:
 61 |                         image_url = f"https://huggingface.co/{model.id}/resolve/main/{image_files[0]}"
 62 |                 except Exception as e:
 63 |                     logger.warning(f"Error fetching files for {model.id}: {e}")
 64 |             
 65 |             if image_url:
 66 |                 model_info["image"] = image_url
 67 |             
 68 |             # Extract trigger word from README if not found in tags
 69 |             if "trigger_word" not in model_info:
 70 |                 try:
 71 |                     readme_content = api.get_repo_file_content(model.id, "README.md")
 72 |                     trigger_matches = re.findall(r"trigger word[s]?[\s]*:[\s]*[\"']?([^\"'\n]+)[\"']?", 
 73 |                                                readme_content, re.IGNORECASE)
 74 |                     if trigger_matches:
 75 |                         model_info["trigger_word"] = trigger_matches[0].strip()
 76 |                     else:
 77 |                         # Look for other common patterns
 78 |                         prompt_matches = re.findall(r"prompt[s]?[\s]*:[\s]*[\"']?([^\"'\n]+)[\"']?", 
 79 |                                                   readme_content, re.IGNORECASE)
 80 |                         if prompt_matches:
 81 |                             model_info["trigger_word"] = prompt_matches[0].strip()
 82 |                 except Exception as e:
 83 |                     logger.warning(f"Error reading README for {model.id}: {e}")
 84 |             
 85 |             # Default empty string if no trigger word found
 86 |             if "trigger_word" not in model_info:
 87 |                 model_info["trigger_word"] = ""
 88 |             
 89 |             # Default trigger position
 90 |             if "prepend" in str(model_info.get("trigger_word")).lower():
 91 |                 model_info["trigger_position"] = "prepend"
 92 |             
 93 |             lora_data.append(model_info)
 94 |             logger.info(f"Processed {model.id}")
 95 |             
 96 |         except Exception as e:
 97 |             logger.error(f"Error processing model {model.id}: {e}")
 98 |     
 99 |     logger.info(f"Successfully gathered information for {len(lora_data)} LoRA models")
100 |     return lora_data
101 | 
def create_and_push_dataset(lora_data, dataset_name):
    """Create a dataset from LoRA data and push to Hugging Face Hub.

    Args:
        lora_data: List of dicts, one per model. Records may carry different
            optional keys; missing values are filled with ``None``.
        dataset_name: Target repository id, e.g. ``"user/flux-lora-models"``.

    Returns:
        None. When ``lora_data`` is empty nothing is pushed and a warning is
        logged instead.
    """
    logger.info(f"Creating dataset with {len(lora_data)} entries")

    if not lora_data:
        logger.warning(f"No entries to publish; skipping push to {dataset_name}")
        return

    # Dataset.from_list takes its columns from the first record only, so a
    # first record without e.g. "image" would drop that column for every row.
    # Normalise every record to the union of keys, in first-seen order.
    columns = list(dict.fromkeys(key for record in lora_data for key in record))
    rows = [{key: record.get(key) for key in columns} for record in lora_data]

    # Create dataset
    dataset = Dataset.from_list(rows)

    # Print dataset info
    logger.info(f"Dataset created with schema: {dataset.features}")
    logger.info(f"Dataset contains {len(dataset)} examples")

    # Push to Hub
    logger.info(f"Pushing dataset to {dataset_name}")
    dataset.push_to_hub(dataset_name, private=False)
    logger.info(f"Dataset successfully pushed to https://huggingface.co/datasets/{dataset_name}")
117 | 
def main(username="your-username", backup_path="loras_fetched.json"):
    """Scrape LoRA models, back them up as JSON, and publish them as a dataset.

    Args:
        username: Hugging Face account or organisation that will own the
            dataset. The default is a placeholder and must be replaced for
            the push to succeed.
        backup_path: Path of the local JSON backup written before pushing.
    """
    # Fetch LoRA models
    lora_data = fetch_lora_models()

    # Save locally as JSON (backup). Write UTF-8 explicitly and keep non-ASCII
    # text readable instead of relying on the platform default encoding.
    with open(backup_path, "w", encoding="utf-8") as f:
        json.dump(lora_data, f, indent=2, ensure_ascii=False)

    if username == "your-username":
        logger.warning(
            "Using placeholder username 'your-username'; "
            "pass your Hugging Face username to main()"
        )

    # Create and push dataset
    dataset_name = f"{username}/flux-lora-models"
    create_and_push_dataset(lora_data, dataset_name)
130 | 
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
133 | 


--------------------------------------------------------------------------------