-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsampleFAQScript.py
More file actions
89 lines (71 loc) · 3.13 KB
/
sampleFAQScript.py
File metadata and controls
89 lines (71 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script extracts all FAQ questions and their corresponding answers from
the REDCap.html file and saves them to a text file. It uses the
BeautifulSoup library to parse the HTML.
"""
from bs4 import BeautifulSoup
def extract_qa_pairs_from_html(file_path="REDCap.html"):
    """
    Parses an HTML file and extracts pairs of questions and answers.

    Each Q&A pair is looked up inside a <div> that carries the CSS classes
    'panel', 'panel-default' and 'card'. Within such a container the question
    is the first <a class="accordion-toggle"> and the answer is the first
    <div class="panel-body">. Containers missing either element are skipped.

    Args:
        file_path (str): The path to the HTML file.

    Returns:
        list: A list of dictionaries, where each dictionary contains
        a 'question' and an 'answer' key. An empty list is returned
        (after printing a message) when the file does not exist, when no
        Q&A container is found, or when any other error occurs.
    """
    qa_pairs = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'html.parser')

        # Use a CSS selector rather than class_='panel panel-default card':
        # the string form only matches when the class attribute is exactly
        # that string, so a reordered or additional class would make every
        # container invisible. The selector matches on the set of classes.
        qa_containers = soup.select('div.panel.panel-default.card')
        if not qa_containers:
            print("Warning: Could not find any Q&A containers. The HTML structure might have changed.")
            return []

        for container in qa_containers:
            question_tag = container.find('a', class_='accordion-toggle')
            answer_tag = container.find('div', class_='panel-body')
            # Skip containers that do not hold both halves of a pair.
            if question_tag is None or answer_tag is None:
                continue
            # separator=" " keeps words from adjacent elements apart; with
            # strip=True and the default empty separator the stripped text
            # fragments are concatenated directly ("Helloworld").
            qa_pairs.append({
                'question': question_tag.get_text(separator=" ", strip=True),
                'answer': answer_tag.get_text(separator=" ", strip=True),
            })
        return qa_pairs
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return []
    except Exception as e:
        # Top-level boundary for this script: report the problem and fall
        # back to "nothing extracted" instead of crashing.
        print(f"An unexpected error occurred: {e}")
        return []
def save_qa_to_file(qa_pairs, output_file="sample_data.txt"):
    """
    Writes question-answer pairs to a text file, one numbered entry per pair.

    Every entry consists of a "Q<n>: ..." line, an "A<n>: ..." line and a
    dashed separator followed by a blank line. Numbering starts at 1. A
    success message is printed afterwards; if the file cannot be written an
    error message is printed instead.

    Args:
        qa_pairs (list): A list of dictionaries, each providing a
            'question' and an 'answer' key.
        output_file (str): The name of the file to save the data to.
    """
    divider = "-" * 20 + "\n\n"  # Separator for readability

    def render_lines():
        # Lines are produced lazily, so they reach the file one at a time
        # in the same order as individual write() calls would.
        for number, entry in enumerate(qa_pairs, start=1):
            yield f"Q{number}: {entry['question']}\n"
            yield f"A{number}: {entry['answer']}\n"
            yield divider

    try:
        with open(output_file, 'w', encoding='utf-8') as out:
            out.writelines(render_lines())
        print(f"\n✅ Success! {len(qa_pairs)} Q&A pairs have been saved to '{output_file}'.")
    except IOError as e:
        print(f"\n❌ Error: Could not write to the file '{output_file}'. Reason: {e}")
if __name__ == "__main__":
    # Step 1: pull the Q&A pairs out of the default HTML file.
    faq_data = extract_qa_pairs_from_html()
    # Step 2: write them out, unless nothing was extracted.
    if not faq_data:
        print("No Q&A pairs were extracted, so no file was created.")
    else:
        save_qa_to_file(faq_data)