def is_exception_file(filename):
    """Return True when the file's base name is listed in EXCEPTION_FILES.

    Only the final path component is considered, and it is lower-cased
    before the membership test.
    """
    base_name = os.path.basename(filename)
    return base_name.lower() in EXCEPTION_FILES
def is_exception_extension(filename):
    """Return True when the file's extension is listed in EXCEPTION_EXTENSIONS.

    The extension is whatever ``os.path.splitext`` yields (leading dot
    included) and is lower-cased before the membership test.
    """
    extension = os.path.splitext(filename)[1]
    return extension.lower() in EXCEPTION_EXTENSIONS
def clear_output_file():
    """Empty the report file at OUTPUT_FILE_PATH, creating it if it is missing."""
    # Opening in "w" mode truncates the file; nothing needs to be written.
    with open(OUTPUT_FILE_PATH, "w", encoding="utf-8"):
        pass
def get_files_to_process(folder, file_filter):
    """List the regular files in *folder* that match *file_filter*.

    *file_filter* is a glob pattern joined onto *folder*. Matches that are
    not regular files, or that are excluded by ``is_exception_file`` or
    ``is_exception_extension``, are dropped. The order is the one returned
    by ``glob.glob``.
    """
    candidates = glob.glob(os.path.join(folder, file_filter))
    return [
        path
        for path in candidates
        if os.path.isfile(path)
        and not is_exception_file(path)
        and not is_exception_extension(path)
    ]
def is_root_domain_link(href):
    """Return True when *href* points at the bare site root.

    A leading ``http://`` or ``https://`` is ignored. The remainder must be
    exactly the host name, or the host name immediately followed by a query
    string. Anything else (including a trailing slash or a path) is not
    considered a root link.
    """
    root = 'www.evolution-101.com'
    bare = re.sub(r'^https?://', '', href)
    return bare == root or bare.startswith(root + '?')
def check_link_validity(url):
    """Return True when a HEAD request to *url* ends with a 2xx or 3xx status.

    A URL without an ``http://`` or ``https://`` prefix gets ``http://``
    prepended. Redirects are followed and the request uses ``timeout=5``.
    Any exception raised while requesting (connection failure, timeout,
    malformed URL, ...) is treated as an invalid link rather than propagated.
    Note that a 4xx/5xx answer to HEAD also counts as invalid, even if the
    server would have served the page on GET.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return 200 <= response.status_code < 400
    except Exception:
        # Best-effort check: every failure simply marks the link as broken.
        return False
def _print_summary(file_filter, files_processed, invalid_links):
    """Print the three-line end-of-run report shared by both exit paths of main()."""
    print(f"FIN DU TRAITEMENT sélectionné selon : {file_filter}")
    print(f"Fichiers traités avec succès : {files_processed}")
    print(f"Nombre total de broken links : {invalid_links}")


def main():
    """Check www.evolution-101.com links in the selected files and log broken ones.

    Command line: ``lien.py <file_filter> [target_folder]``. Without a file
    filter the usage line is printed and the process exits with status 1.
    ``target_folder`` defaults to the current working directory.

    The output file is emptied first, then the user is prompted for the
    maximum number of links to check (3000 when the answer is missing or
    not an integer). Every line of every selected file is searched for
    ``href`` attributes containing ``www.evolution-101.com``; links to the
    bare site root are skipped, the others are checked. Each broken link is
    appended to the output file as ``<file name>, LOC : <line>, <url>``.

    When the link limit is reached the run stops immediately: the summary is
    printed and the file being scanned is neither reported nor counted.
    A file that cannot be read or decoded as UTF-8 is reported on the
    console and skipped instead of aborting the whole run.
    """
    # Read parameters
    if len(sys.argv) < 2:
        print("Usage: python lien.py <file_filter> [target_folder]")
        sys.exit(1)
    file_filter = sys.argv[1]
    target_folder = sys.argv[2] if len(sys.argv) > 2 else os.getcwd()
    clear_output_file()

    # Ask for the link limit. Deliberately best-effort: a closed stdin or a
    # non-numeric answer both fall back to the default.
    try:
        maxlien = int(input("Nombre de liens: "))
    except Exception:
        maxlien = 3000

    files = get_files_to_process(target_folder, file_filter)
    # The capture group yields the URL itself (text after the opening quote).
    link_pattern = re.compile(
        r'href=["\']([^"\']*www\.evolution-101\.com[^"\']*)'
    )
    total_files_processed = 0
    total_invalid_links = 0
    total_links_checked = 0

    with open(OUTPUT_FILE_PATH, "a", encoding="utf-8") as output_file:
        for filepath in files:
            name = os.path.basename(filepath)
            # Read the whole file up front so it is either fully scanned or
            # skipped as a unit.
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    lines = f.readlines()
            except (OSError, UnicodeDecodeError) as exc:
                print(f"Fichier ignoré (lecture impossible) : {name} ({exc})")
                continue

            invalid_links_in_file = 0
            for line_number, line in enumerate(lines, start=1):
                for url in link_pattern.findall(line):
                    if is_root_domain_link(url):
                        continue
                    if total_links_checked >= maxlien:
                        print("Max number of links reached, exiting.")
                        _print_summary(
                            file_filter, total_files_processed, total_invalid_links
                        )
                        return
                    total_links_checked += 1
                    if not check_link_validity(url):
                        invalid_links_in_file += 1
                        total_invalid_links += 1
                        output_file.write(f"{name}, LOC : {line_number}, {url}\n")
            print(f"Fichier traité : {name} {invalid_links_in_file}")
            total_files_processed += 1

    _print_summary(file_filter, total_files_processed, total_invalid_links)
# Run the link checker only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()