# check-relative-doc-links.py — executable script, 130 lines (108 loc), 3.71 KB
#!/usr/bin/env python
from __future__ import print_function
import os
import sys
import re
# Repository root: the parent of the directory containing this script.
SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# The Markdown documentation tree that will be scanned for relative links.
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
def main():
    """Scan every Markdown file under docs/ and report broken relative links.

    Returns the total number of broken links found, which doubles as the
    process exit status (0 means all links resolved).
    """
    os.chdir(SOURCE_ROOT)

    markdown_files = []
    subdir_count = 0
    try:
        # Walk the docs tree, collecting every .md file and counting
        # the subdirectories visited along the way.
        for root, dirs, files in os.walk(DOCS_DIR):
            subdir_count += len(dirs)
            markdown_files.extend(
                os.path.join(root, name)
                for name in files
                if name.endswith('.md')
            )
    except KeyboardInterrupt:
        print('Keyboard interruption. Please try again.')
        return 0

    broken_total = sum(getBrokenLinks(path) for path in markdown_files)

    print('Parsed through ' + str(len(markdown_files)) +
          ' files within docs directory and its ' +
          str(subdir_count) + ' subdirectories.')
    print('Found ' + str(broken_total) + ' broken relative links.')
    return broken_total
def getBrokenLinks(filepath):
    """Return the number of broken relative links in the Markdown file at
    *filepath*, printing each broken link (via print_errors) as a side effect.

    Both inline links ``[text](target)`` and reference-style definitions
    ``[label]: target`` are checked; links starting with 'http' are skipped.
    Targets with a ``#section`` fragment are verified against the headings
    of the current or referenced file.
    """
    currentDir = os.path.dirname(filepath)
    brokenLinks = []

    # 'with' guarantees the handle is closed on any exit path; the previous
    # open()/finally-close() pattern raised NameError when open() itself
    # failed, and left 'lines' undefined after a KeyboardInterrupt.
    try:
        with open(filepath, 'r', encoding="utf-8") as f:
            lines = f.readlines()
    except KeyboardInterrupt:
        print('Keyboard interruption while parsing. Please try again.')
        return 0

    # Raw strings: the patterns contain backslash escapes (\[, \s, ...)
    # that are invalid string escapes without the r'' prefix.
    linkRegexLink = re.compile(r'\[(.*?)\]\((?P<link>(.*?))\)')
    referenceLinkRegex = re.compile(
        r'^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
    )

    # Collect every non-http link target found in the file.
    links = []
    for line in lines:
        matchLinks = linkRegexLink.search(line)
        matchReferenceLinks = referenceLinkRegex.search(line)
        if matchLinks:
            relativeLink = matchLinks.group('link')
            if not str(relativeLink).startswith('http'):
                links.append(relativeLink)
        if matchReferenceLinks:
            # Reference targets may be wrapped in angle brackets.
            referenceLink = matchReferenceLinks.group('link').strip('<>')
            if not str(referenceLink).startswith('http'):
                links.append(referenceLink)

    for link in links:
        sections = link.split('#')
        if len(sections) < 2:
            # Plain file link: the target must exist relative to this file.
            if not os.path.isfile(os.path.join(currentDir, link)):
                brokenLinks.append(link)
        elif str(link).startswith('#'):
            # In-page anchor: the heading must exist in the current file.
            if not checkSections(sections, lines):
                brokenLinks.append(link)
        else:
            # file#anchor: the file must exist and contain the heading.
            tempFile = os.path.join(currentDir, sections[0])
            if os.path.isfile(tempFile):
                try:
                    with open(tempFile, 'r', encoding="utf-8") as newFile:
                        newLines = newFile.readlines()
                except KeyboardInterrupt:
                    print('Keyboard interruption while parsing. Please try again.')
                    # No content to check against; the anchor counts as broken.
                    newLines = []
                if not checkSections(sections, newLines):
                    brokenLinks.append(link)
            else:
                brokenLinks.append(link)

    print_errors(filepath, brokenLinks)
    return len(brokenLinks)
def checkSections(sections, lines):
    """Return True if the anchor in ``sections[1]`` matches a slugified
    heading found in *lines* (the content of a Markdown file).

    A heading is slugified by replacing spaces with dashes, stripping
    characters that are not alphanumeric, dash, or space, and lowercasing.
    Text quoted in backticks keeps its underscores; underscores elsewhere
    are stripped.
    """
    # Pre-compile the strip patterns (raw strings: '\-' is an invalid
    # string escape without the r'' prefix).
    invalidChars = re.compile(r'[^A-Za-z0-9_ \-]')
    invalidCharsOrUnderscore = re.compile(r'[^A-Za-z0-9_ \-]|_')
    sectionHeader = sections[1]
    # NOTE(review): search() also matches '# ' occurring mid-line, not just
    # Markdown headings at the start of a line — confirm this is intended.
    regexSectionTitle = re.compile('# (?P<header>.*)')
    for line in lines:
        matchHeader = regexSectionTitle.search(line)
        if matchHeader:
            # Slugify: spaces -> dashes, then strip invalid characters.
            # Backtick-quoted spans are handled separately so that their
            # underscores survive; the backticks themselves are stripped.
            slug = str(matchHeader.group('header')).replace(' ', '-')
            slug = ''.join(
                invalidChars.sub('', quoted)
                + invalidCharsOrUnderscore.sub('', plain)
                for quoted, plain in re.findall('(`[^`]+`)|([^`]+)', slug)
            )
            if slug.lower() == sectionHeader:
                return True
    return False
def print_errors(filepath, brokenLink):
    """Print the file's path followed by each broken link it contains.

    Prints nothing when *brokenLink* is empty.
    """
    if not brokenLink:
        return
    print("File Location: " + filepath)
    for link in brokenLink:
        print("\tBroken links: " + link)
# Exit status is the number of broken links found (0 means success).
if __name__ == '__main__':
    sys.exit(main())