# Matches an href attribute whose value mentions the site domain.
# Group 1 is the attribute value (the URL) up to the next quote character.
_SITE_HREF_RE = re.compile(r'href=["\']([^"\']*www\.evolution-101\.com[^"\']*)')


def is_exception_file(filename):
    """Return True if the base name of *filename* (lowercased) is in EXCEPTION_FILES."""
    return os.path.basename(filename).lower() in EXCEPTION_FILES


def is_exception_extension(filename):
    """Return True if the extension of *filename* (lowercased) is in EXCEPTION_EXTENSIONS."""
    _, ext = os.path.splitext(filename)
    return ext.lower() in EXCEPTION_EXTENSIONS


def clear_output_file():
    """Truncate the report file at OUTPUT_FILE_PATH (creating it if needed)."""
    with open(OUTPUT_FILE_PATH, "w", encoding="utf-8"):
        pass


def get_files_to_process(folder, file_filter):
    """Return the regular files in *folder* matching the glob *file_filter*.

    Files whose name or extension is listed as an exception are left out.
    """
    pattern = os.path.join(folder, file_filter)
    return [
        path
        for path in glob.glob(pattern)
        if os.path.isfile(path)
        and not is_exception_file(path)
        and not is_exception_extension(path)
    ]


def is_root_domain_link(href):
    """Return True if *href* is the bare site domain, optionally with a query string.

    A leading ``http://`` or ``https://`` is ignored. Anything with a path
    (including a lone trailing slash) is not considered a root-domain link.
    """
    host = re.sub(r'^https?://', '', href)
    return (
        host == 'www.evolution-101.com'
        or host.startswith('www.evolution-101.com?')
    )


def check_link_validity(url):
    """Return True if a HEAD request to *url* ends with a 2xx or 3xx status.

    ``http://`` is prepended when *url* has no http(s) scheme. Redirects are
    followed and the request times out after 5 seconds. Any failure while
    issuing the request is reported as an invalid link (False).
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
    except Exception:
        # Deliberately broad: whatever goes wrong (DNS, timeout, malformed
        # URL, ...) the link is simply reported as broken.
        return False
    return 200 <= response.status_code < 400


def _print_summary(file_filter, files_processed, invalid_links):
    """Print the end-of-run summary lines."""
    print(f"FIN DU TRAITEMENT sélectionné selon : {file_filter}")
    print(f"Fichiers traités avec succès : {files_processed}")
    print(f"Nombre total de broken links : {invalid_links}")


def main():
    """Check site links found in the selected files and report the broken ones.

    Command line: ``lien.py <file_filter> [target_folder]``; the folder
    defaults to the current working directory. The user is prompted for the
    maximum number of links to check (3000 when the answer is not an integer).
    Each broken link is appended to OUTPUT_FILE_PATH as
    ``<file name>, LOC : <line number>, <url>``.
    """
    # Read parameters
    if len(sys.argv) < 2:
        # The file filter is mandatory; only the target folder is optional.
        print("Usage: python lien.py <file_filter> [target_folder]")
        sys.exit(1)
    file_filter = sys.argv[1]
    target_folder = sys.argv[2] if len(sys.argv) > 2 else os.getcwd()

    clear_output_file()

    # Ask for the link limit; fall back to the default when no usable
    # answer is available (e.g. no interactive stdin).
    try:
        maxlien_input = input("Nombre de liens: ")
    except Exception:
        maxlien_input = ""
    try:
        maxlien = int(maxlien_input)
    except (ValueError, TypeError):
        maxlien = 3000

    files = get_files_to_process(target_folder, file_filter)
    total_files_processed = 0
    total_invalid_links = 0
    total_links_checked = 0

    with open(OUTPUT_FILE_PATH, "a", encoding="utf-8") as output_file:
        for filepath in files:
            file_name = os.path.basename(filepath)
            # Read the file up front so an unreadable or non-UTF-8 file
            # does not abort the whole run; undecodable bytes are replaced.
            try:
                with open(filepath, "r", encoding="utf-8", errors="replace") as f:
                    lines = f.readlines()
            except OSError as exc:
                print(f"Fichier illisible : {file_name} ({exc})")
                continue

            invalid_links_in_file = 0
            for line_number, line in enumerate(lines, start=1):
                for url in _SITE_HREF_RE.findall(line):
                    if is_root_domain_link(url):
                        continue
                    if total_links_checked >= maxlien:
                        print("Max number of links reached, exiting.")
                        _print_summary(
                            file_filter, total_files_processed, total_invalid_links
                        )
                        return
                    total_links_checked += 1
                    if not check_link_validity(url):
                        invalid_links_in_file += 1
                        total_invalid_links += 1
                        output_file.write(f"{file_name}, LOC : {line_number}, {url}\n")

            print(f"Fichier traité : {file_name} {invalid_links_in_file}")
            total_files_processed += 1

    _print_summary(file_filter, total_files_processed, total_invalid_links)


if __name__ == "__main__":
    main()