From 417223d58354da0ce64c1b1306e918fa42b9eb50 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 7 Jun 2026 11:51:47 -0400 Subject: [PATCH 01/18] Refactor parseArtSci function --- app/WebParsing/ArtSciParser.hs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 74f5b36ab..115bde9ad 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -34,13 +34,17 @@ parseCalendar = do parseArtSci :: IO () parseArtSci = do programs <- programsUrl - bodyTags <- httpBodyTags programs - let deptInfo = getDeptList bodyTags + deptInfo <- parseDepartmentList programs runDb $ do liftIO $ putStrLn "Inserting departments" insertDepts $ map snd deptInfo mapM_ parseDepartment (nubBy (\(x, _) (y, _) -> x == y) deptInfo) +parseDepartmentList :: String -> IO [(T.Text, T.Text)] +parseDepartmentList url = do + bodyTags <- httpBodyTags url + return $ getDeptList bodyTags + -- | Converts the processed main page and extracts a list of department html pages -- and department names getDeptList :: [Tag T.Text] -> [(T.Text, T.Text)] From 622c635e143f7e7237af6e4c325e88e9ecc5234e Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 7 Jun 2026 11:52:45 -0400 Subject: [PATCH 02/18] Update parseDepartmentList to ignore department names --- app/WebParsing/ArtSciParser.hs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 115bde9ad..328eb784a 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -42,8 +42,15 @@ parseArtSci = do parseDepartmentList :: String -> IO [(T.Text, T.Text)] parseDepartmentList url = do + let ignoredDepts = ["ASIP (Arts & Science Internship Program)", + "Biology", "Combined Degree Programs", + "Data Science", + "Faculty of Arts & Science Programs (299/398/399)", + "Pathobiology (see Laboratory Medicine and Pathobiology)", + "Research Opportunity/Research Excursions (299/398/399)"] bodyTags <- httpBodyTags url - return $ getDeptList bodyTags + let deptList = getDeptList bodyTags + return $ filter (\(_, deptName) -> deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) deptList -- | Converts the processed main page and extracts a list of department html pages -- and department names From 6f84688fecfd595ed3e5192e7fe52e02f7529346 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 7 Jun 2026 12:00:38 -0400 Subject: [PATCH 03/18] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f3cdd88d..28ce413ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ - Refactor functions for performing matrix operations from `app/Svg/Parser.hs` to `app/Util/Matrix.hs` - Updated documentation in `app/Util/Blaze.hs` - Removed `SvgJSON` data type in favour of `([Text], [Shape], [Path])` +- Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` ## [0.7.2] - 2025-12-10 From d208de13c3dc019ef69b3b4c017af95865a2899b Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 7 Jun 2026 13:50:47 -0400 Subject: [PATCH 04/18] Add comment for parseDepartmentList --- app/WebParsing/ArtSciParser.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 328eb784a..f36e585b5 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -40,6 +40,8 @@ parseArtSci = do insertDepts $ map snd deptInfo mapM_ parseDepartment (nubBy (\(x, _) (y, _) -> x == y) deptInfo) +-- | Parse the list of all departments, given the URL of the program/subject areas page. +-- Exclude departments with no courses, duplicate courses, and program areas belonging to a college. parseDepartmentList :: String -> IO [(T.Text, T.Text)] parseDepartmentList url = do let ignoredDepts = ["ASIP (Arts & Science Internship Program)", From ec392614cfd9e705fea6ae844b99a8e7313f19d6 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Tue, 9 Jun 2026 09:09:47 -0400 Subject: [PATCH 05/18] Update indentation for ignoredDepts variable --- app/WebParsing/ArtSciParser.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index f36e585b5..e672232f5 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -45,7 +45,8 @@ parseArtSci = do parseDepartmentList :: String -> IO [(T.Text, T.Text)] parseDepartmentList url = do let ignoredDepts = ["ASIP (Arts & Science Internship Program)", - "Biology", "Combined Degree Programs", + "Biology", + "Combined Degree Programs", "Data Science", "Faculty of Arts & Science Programs (299/398/399)", "Pathobiology (see Laboratory Medicine and Pathobiology)", From 309e3132d404e45cb257d1191f94c978c0ceaa6c Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Fri, 12 Jun 2026 10:58:56 -0400 Subject: [PATCH 06/18] Add parseDepartmentList to module export list --- app/WebParsing/ArtSciParser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index e672232f5..eeb0213f5 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -1,5 +1,5 @@ module WebParsing.ArtSciParser - (parseCalendar, getDeptList) where + (parseCalendar, getDeptList, parseDepartmentList) where import Config (fasCalendarUrl, programsUrl, runDb) import Control.Monad.IO.Class (liftIO) From 67684a474479461b167ac90aef4628f5fdfa52e0 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:25:02 -0400 Subject: [PATCH 07/18] Update department name in ignoredDepts --- app/WebParsing/ArtSciParser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index eeb0213f5..2fdace914 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -48,7 +48,7 @@ parseDepartmentList url = do "Biology", "Combined Degree Programs", "Data Science", - "Faculty of Arts & Science Programs (299/398/399)", + "Faculty of Arts and Science Programs (299/398/399)", "Pathobiology (see Laboratory Medicine and Pathobiology)", "Research Opportunity/Research Excursions (299/398/399)"] bodyTags <- httpBodyTags url From 6752a68447e79e44ad878db8e6dd368bbbd0ec5e Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:28:50 -0400 Subject: [PATCH 08/18] Add test for WebParsing.ArtSciParser --- backend-test/WebParsing/ArtSciParserTests.hs | 128 +++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 backend-test/WebParsing/ArtSciParserTests.hs diff --git a/backend-test/WebParsing/ArtSciParserTests.hs b/backend-test/WebParsing/ArtSciParserTests.hs new file mode 100644 index 000000000..d5b734649 --- /dev/null +++ b/backend-test/WebParsing/ArtSciParserTests.hs @@ -0,0 +1,128 @@ +{-| +Description: ArtSciParser module tests. + +Module that contains the tests for the functions in the ArtSciParser module. + +-} + +module WebParsing.ArtSciParserTests +( test_artSciParser +) where + +import qualified Data.Text as T +import WebParsing.ArtSciParser (parseDepartmentList) +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.HUnit (assertEqual, testCase) + +parsedDepts :: [(T.Text, T.Text)] +parsedDepts = + [ ("/section/Academic-Bridging-Program", "Academic Bridging Program"), + ("/section/Actuarial-Science", "Actuarial Science"), + ("/section/African-Studies", "African Studies Centre"), + ("/section/American-Studies", "American Studies"), + ("/section/Anatomy", "Anatomy"), + ("/section/Anthropology", "Anthropology"), + ("/section/Archaeology", "Archaeology"), + ("/section/Architecture-and-Visual-Studies", "Architecture and Visual Studies"), + ("/section/Art-History", "Art History"), + ("/section/Astronomy-and-Astrophysics", "Astronomy and Astrophysics"), + ("/section/Biochemistry", "Biochemistry"), + ("/section/Business-Fundamentals", "Business Fundamentals"), + ("/section/Centre-for-Caribbean-Studies", "Caribbean Studies, Centre for"), + ("/section/Cell-and-Systems-Biology", "Cell and Systems Biology"), + ("/section/Chemistry", "Chemistry"), + ("/section/Cinema-Studies-Institute", "Cinema Studies (Cinema Studies Institute)"), + ("/section/Classics", "Classics"), + ("/section/Computer-Science", "Computer Science"), + ("/section/Contemporary-Asian-Studies", "Contemporary Asian Studies, Dr. David Chu Program in"), + ("/section/Criminology-and-Sociolegal-Studies", "Criminology and Sociolegal Studies, Centre for"), + ("/section/Diaspora-and-Transnational-Studies", "Diaspora and Transnational Studies"), + ("/section/Drama,-Theatre-and-Performance-Studies", "Drama, Theatre and Performance Studies, Centre for"), + ("/section/Earth-Sciences", "Earth Sciences"), + ("/section/East-Asian-Studies", "East Asian Studies"), + ("/section/Ecology-and-Evolutionary-Biology", "Ecology and Evolutionary Biology"), + ("/section/Economics", "Economics"), + ("/section/English", "English"), + ("/section/Centre-for-Entrepreneurship", "Entrepreneurship, Centre for"), + ("/section/School-of-the-Environment", "Environment (School of the Environment)"), + ("/section/Slavic-and-East-European-Languages-and-Cultures", "Estonian"), + ("/section/Centre-for-Ethics", "Ethics, Centre for"), + ("/section/European-Affairs", "European Affairs"), + ("/section/Slavic-and-East-European-Languages-and-Cultures", "Finnish"), + ("/section/First-Year-Foundations", "First-Year Foundations"), + ("/section/Forest-Conservation-and-Forest-Biomaterials-Science", "Forest Conservation and Forest Biomaterials Science"), + ("/section/French", "French"), + ("/section/Geography-and-Planning", "Geography and Planning"), + ("/section/German", "German"), + ("/section/History", "History"), + ("/section/History-and-Philosophy-of-Science-and-Technology", "History and Philosophy of Science and Technology"), + ("/section/Human-Biology", "Human Biology"), + ("/section/Hungarian", "Hungarian"), + ("/section/Immunology", "Immunology"), + ("/section/Indigenous-Studies", "Indigenous Studies"), + ("/section/Industrial-Relations-and-Human-Resources", "Industrial Relations and Human Resources, Centre for"), + ("/section/Innis-College", "Innis College"), + ("/section/Italian", "Italian"), + ("/section/Centre-for-Jewish-Studies", "Jewish Studies, Centre for"), + ("/section/Laboratory-Medicine-and-Pathobiology", "Laboratory Medicine and Pathobiology"), + ("/section/Latin-American-Studies", "Latin American Studies"), + ("/section/Linguistics", "Linguistics"), + ("/section/Materials-Science", "Materials Science"), + ("/section/Mathematics", "Mathematics"), + ("/section/Centre-for-Medieval-Studies", "Medieval Studies, Centre for"), + ("/section/Molecular-Genetics-and-Microbiology", "Molecular Genetics and Microbiology"), + ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Munk School of Global Affairs and Public Policy"), + ("/section/Music", "Music"), + ("/section/Near-and-Middle-Eastern-Civilizations", "Near and Middle Eastern Civilizations"), + ("/section/New-College", "New College"), + ("/section/Nutritional-Sciences", "Nutritional Sciences"), + ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Peace, Conflict and Justice"), + ("/section/Pharmacology-and-Toxicology", "Pharmacology and Toxicology"), + ("/section/Philosophy", "Philosophy"), + ("/section/Physics", "Physics"), + ("/section/Physiology", "Physiology"), + ("/section/Planetary-Science", "Planetary Science"), + ("/section/Political-Science", "Political Science"), + ("/section/Portuguese", "Portuguese"), + ("/section/Psychology", "Psychology"), + ("/section/Munk-School-of-Global-Affairs-and-Public-Policy", "Public Policy"), + ("/section/Religion", "Religion"), + ("/section/Rotman-Commerce", "Rotman Commerce"), + ("/section/St.-Michael's-College", "St. Michael's College"), + ("/section/Sexual-Diversity-Studies", "Sexual Diversity Studies, Mark S. Bonham Centre for"), + ("/section/Slavic-and-East-European-Languages-and-Cultures", "Slavic and East European Languages and Cultures"), + ("/section/Sociology", "Sociology"), + ("/section/South-Asian-Studies", "South Asian Studies"), + ("/section/Spanish", "Spanish"), + ("/section/Statistical-Sciences", "Statistical Sciences"), + ("/section/Canadian-Institute-for-Theoretical-Astrophysics", "Theoretical Astrophysics (Canadian Institute for Theoretical Astrophysics)"), + ("/section/Trinity-College", "Trinity College"), + ("/section/University-College", "University College"), + ("/section/Geography-and-Planning", "Urban Studies"), + ("/section/Victoria-College", "Victoria College"), + ("/section/Women-and-Gender-Studies", "Women and Gender Studies"), + ("/section/Woodsworth-College", "Woodsworth College"), + ("/writing-faculty-arts-science", "Writing in the Faculty of Arts & Science"), + ("/section/Yiddish-Studies", "Yiddish Studies") + ] + +-- | List of test cases as (label, input URL, expected output) +parseDeptListTestCases :: [(String, String, [(T.Text, T.Text)])] +parseDeptListTestCases = + [ ("Program/subject areas page", "https://artsci.calendar.utoronto.ca/listing-program-subject-areas", parsedDepts) ] + +-- | Run a test case (label, input URL, expected output) on the parseDepartmentList function. +runParseDeptListTest :: (String, String, [(T.Text, T.Text)]) -> TestTree +runParseDeptListTest (label, input, expected) = + testCase label $ do + actual <- parseDepartmentList input + assertEqual ("Unexpected parsing result for " ++ label) expected actual + +-- | Run all the parseDeptList test cases +runParseDeptListTests :: [TestTree] +runParseDeptListTests = map runParseDeptListTest parseDeptListTestCases + +-- | Test suite for ArtSciParser module +test_artSciParser :: TestTree +test_artSciParser = + testGroup "ArtSciParser tests" runParseDeptListTests From caa36b4bdb276eeb2534a900aa3b4fe9c6e0d031 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:29:47 -0400 Subject: [PATCH 09/18] Update parseDepartmentList in WebParsing.ArtSciParser --- app/WebParsing/ArtSciParser.hs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 2fdace914..0ccbd629a 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -6,6 +6,7 @@ import Control.Monad.IO.Class (liftIO) import Data.List (findIndex, nubBy) import Data.Maybe (fromMaybe, mapMaybe) import qualified Data.Text as T +import qualified Data.Bifunctor as BF import Data.Text.Lazy (toStrict) import Data.Text.Lazy.Encoding (decodeUtf8) import Database.Persist (insertUnique) @@ -44,16 +45,17 @@ parseArtSci = do -- Exclude departments with no courses, duplicate courses, and program areas belonging to a college. parseDepartmentList :: String -> IO [(T.Text, T.Text)] parseDepartmentList url = do - let ignoredDepts = ["ASIP (Arts & Science Internship Program)", - "Biology", - "Combined Degree Programs", - "Data Science", - "Faculty of Arts and Science Programs (299/398/399)", - "Pathobiology (see Laboratory Medicine and Pathobiology)", + let ignoredDepts = ["ASIP (Arts & Science Internship Program)", + "Biology", + "Combined Degree Programs", + "Data Science", + "Faculty of Arts and Science Programs (299/398/399)", + "Laboratory Medicine and Pathobiology)", -- | Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on program areas page "Research Opportunity/Research Excursions (299/398/399)"] bodyTags <- httpBodyTags url let deptList = getDeptList bodyTags - return $ filter (\(_, deptName) -> deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) deptList + let cleaned = map (BF.second (T.replace "\160" " ")) deptList + return $ filter (\(deptPage, deptName) -> "/" `T.isPrefixOf` deptPage && deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) cleaned -- | Converts the processed main page and extracts a list of department html pages -- and department names From 12d1582404031dfe965941862798f4e6fd4e06ab Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:32:59 -0400 Subject: [PATCH 10/18] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28ce413ad..708fdd800 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ - Updated documentation in `app/Util/Blaze.hs` - Removed `SvgJSON` data type in favour of `([Text], [Shape], [Path])` - Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` +- Added test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` ## [0.7.2] - 2025-12-10 From 7965eb65798da012c40879f008e722d8eb625c39 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:55:21 -0400 Subject: [PATCH 11/18] Update CHANGELOG again --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3d7b6061..51dec3c1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ ### 🔧 Internal changes +- Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` +- Added test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` + ## [0.8.0] - 2026-06-09 ### ✨ New features/enhancements @@ -47,8 +50,6 @@ - Updated documentation in `app/Util/Blaze.hs` - Moved the `Course` data type from `Database/Tables.hs` into `Models/Course.hs`, renamed it to `CourseData` - Removed `SvgJSON` data type in favour of `([Text], [Shape], [Path])` -- Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` -- Added test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` ## [0.7.2] - 2025-12-10 From 38f79d6a567304c5fbdfe24345a39278253ac1dc Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Sun, 14 Jun 2026 16:56:05 -0400 Subject: [PATCH 12/18] Update CHANGELOG bullet tense --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51dec3c1a..e5d8d7f0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ ### 🔧 Internal changes - Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` -- Added test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` +- Add test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` ## [0.8.0] - 2026-06-09 From d63629a8c8f9e90fa7de153ca6422145fefa6b8e Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Wed, 17 Jun 2026 11:17:37 -0400 Subject: [PATCH 13/18] Revert CHANGELOG --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b3e7ebb6..4820c2740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,7 @@ ### 🔧 Internal changes - Refactored the `Courses` table to `Course` with a database migration - - Refactor `parseArtSci` function in `app/WebParsing/ArtSciParser.hs` by introducing `parseDepartmentList` -- Add test cases for the `parseDepartmentList` function in `backend-test/WebParsing/ArtSciParserTests.hs` ## [0.8.0] - 2026-06-09 From 433c2697c59717ac781ffcace3c52e26958f3d1b Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Wed, 17 Jun 2026 11:19:48 -0400 Subject: [PATCH 14/18] Remove getDeptList export --- app/WebParsing/ArtSciParser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 234ca56e1..41c4d0be5 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -1,5 +1,5 @@ module WebParsing.ArtSciParser - (parseCalendar, getDeptList, parseDepartmentList) where + (parseCalendar, parseDepartmentList) where import Config (fasCalendarUrl, programsUrl, runDb) import Control.Monad.IO.Class (liftIO) From 10e8a0fbda81b9df620b0d3bca802a51bea48ee0 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Mon, 22 Jun 2026 20:05:35 -0400 Subject: [PATCH 15/18] Refactor parseDepartmentList by moving code to getDeptList and creating a local helper function --- app/WebParsing/ArtSciParser.hs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 41c4d0be5..3138b66ed 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -54,8 +54,13 @@ parseDepartmentList url = do "Research Opportunity/Research Excursions (299/398/399)"] bodyTags <- httpBodyTags url let deptList = getDeptList bodyTags - let cleaned = map (BF.second (T.replace "\160" " ")) deptList - return $ filter (\(deptPage, deptName) -> "/" `T.isPrefixOf` deptPage && deptName `notElem` ignoredDepts && not (" College)" `T.isSuffixOf` deptName)) cleaned + return $ filter (isValidDepartment ignoredDepts) deptList + where + isValidDepartment :: [T.Text] -> (T.Text, T.Text) -> Bool + isValidDepartment ignoredDepts (deptPage, deptName) = + "/" `T.isPrefixOf` deptPage && + deptName `notElem` ignoredDepts && + not (" College)" `T.isSuffixOf` deptName) -- | Converts the processed main page and extracts a list of department html pages -- and department names @@ -64,7 +69,7 @@ getDeptList tags = let tables = TS.partitions (TS.isTagOpenName "table") tags -- every partition is a table tables' = map (takeWhile (not . TS.isTagCloseName "table")) tables depts = concatMap extractDepartments tables' - in depts + in map (BF.second cleanText) depts where extractDepartments :: [Tag T.Text] -> [(T.Text, T.Text)] extractDepartments tableTags = @@ -77,6 +82,7 @@ getDeptList tags = getDept :: [Tag T.Text] -> Maybe (T.Text, T.Text) getDept [] = Nothing getDept (x:xs) = Just (TS.fromAttrib "href" x, T.strip $ TS.innerText (x:xs)) + -- | Insert department names to database insertDepts :: [T.Text] -> SqlPersistM () @@ -172,4 +178,4 @@ httpBodyTags url = do -- | Remove odd characters from text cleanText :: T.Text -> T.Text -cleanText = T.replace "\n" "" . T.replace "\8203" "" . T.replace "\160" "" . T.strip +cleanText = T.replace "\n" "" . T.replace "\8203" "" . T.replace "\160" " " . T.strip From e747852ee5e44930db792d5720d3c616bf0f7d71 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Mon, 22 Jun 2026 20:36:34 -0400 Subject: [PATCH 16/18] Remove extra blank line --- app/WebParsing/ArtSciParser.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 3138b66ed..10dcb19e7 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -82,7 +82,6 @@ getDeptList tags = getDept :: [Tag T.Text] -> Maybe (T.Text, T.Text) getDept [] = Nothing getDept (x:xs) = Just (TS.fromAttrib "href" x, T.strip $ TS.innerText (x:xs)) - -- | Insert department names to database insertDepts :: [T.Text] -> SqlPersistM () From 5b7145208329a515e2e449f646feda9ccd959e99 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Mon, 22 Jun 2026 20:40:12 -0400 Subject: [PATCH 17/18] Add comments for clarity --- app/WebParsing/ArtSciParser.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 10dcb19e7..54ce76fc2 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -50,7 +50,7 @@ parseDepartmentList url = do "Combined Degree Programs", "Data Science", "Faculty of Arts and Science Programs (299/398/399)", - "Laboratory Medicine and Pathobiology)", -- | Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on program areas page + "Laboratory Medicine and Pathobiology)", -- Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on program areas page "Research Opportunity/Research Excursions (299/398/399)"] bodyTags <- httpBodyTags url let deptList = getDeptList bodyTags @@ -58,9 +58,9 @@ parseDepartmentList url = do where isValidDepartment :: [T.Text] -> (T.Text, T.Text) -> Bool isValidDepartment ignoredDepts (deptPage, deptName) = - "/" `T.isPrefixOf` deptPage && - deptName `notElem` ignoredDepts && - not (" College)" `T.isSuffixOf` deptName) + "/" `T.isPrefixOf` deptPage && -- Ignore footer links + deptName `notElem` ignoredDepts && -- Ignore departments in ignoredDepts + not (" College)" `T.isSuffixOf` deptName) -- Ignore departments belonging to a college -- | Converts the processed main page and extracts a list of department html pages -- and department names From 43eb734ecfb6e74c15b045a157b5bb5157e661f0 Mon Sep 17 00:00:00 2001 From: Rui Weng Date: Wed, 24 Jun 2026 09:12:38 -0400 Subject: [PATCH 18/18] Add styling and other changes --- app/WebParsing/ArtSciParser.hs | 24 +++++++++++--------- backend-test/WebParsing/ArtSciParserTests.hs | 1 - 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/app/WebParsing/ArtSciParser.hs b/app/WebParsing/ArtSciParser.hs index 54ce76fc2..a4567f57e 100644 --- a/app/WebParsing/ArtSciParser.hs +++ b/app/WebParsing/ArtSciParser.hs @@ -3,10 +3,10 @@ module WebParsing.ArtSciParser import Config (fasCalendarUrl, programsUrl, runDb) import Control.Monad.IO.Class (liftIO) +import qualified Data.Bifunctor as BF import Data.List (findIndex, nubBy) import Data.Maybe (fromMaybe, mapMaybe) import qualified Data.Text as T -import qualified Data.Bifunctor as BF import Data.Text.Lazy (toStrict) import Data.Text.Lazy.Encoding (decodeUtf8) import Database.Persist (insertUnique) @@ -45,22 +45,24 @@ parseArtSci = do -- Exclude departments with no courses, duplicate courses, and program areas belonging to a college. parseDepartmentList :: String -> IO [(T.Text, T.Text)] parseDepartmentList url = do - let ignoredDepts = ["ASIP (Arts & Science Internship Program)", + bodyTags <- httpBodyTags url + let deptList = getDeptList bodyTags + return $ filter (isValidDepartment ignoredDepts) deptList + where + ignoredDepts = ["ASIP (Arts & Science Internship Program)", "Biology", "Combined Degree Programs", "Data Science", "Faculty of Arts and Science Programs (299/398/399)", "Laboratory Medicine and Pathobiology)", -- Displayed as "Pathobiology (see Laboratory Medicine and Pathobiology)" on program areas page - "Research Opportunity/Research Excursions (299/398/399)"] - bodyTags <- httpBodyTags url - let deptList = getDeptList bodyTags - return $ filter (isValidDepartment ignoredDepts) deptList - where + "Research Opportunity/Research Excursions (299/398/399)", + "Writing in the Faculty of Arts & Science"] + isValidDepartment :: [T.Text] -> (T.Text, T.Text) -> Bool - isValidDepartment ignoredDepts (deptPage, deptName) = - "/" `T.isPrefixOf` deptPage && -- Ignore footer links - deptName `notElem` ignoredDepts && -- Ignore departments in ignoredDepts - not (" College)" `T.isSuffixOf` deptName) -- Ignore departments belonging to a college + isValidDepartment ignoredDepartments (deptPage, deptName) = + "/" `T.isPrefixOf` deptPage && -- Ignore footer links + deptName `notElem` ignoredDepartments && -- Ignore departments in ignoredDepartments + not (" College)" `T.isSuffixOf` deptName) -- Ignore departments belonging to a college -- | Converts the processed main page and extracts a list of department html pages -- and department names diff --git a/backend-test/WebParsing/ArtSciParserTests.hs b/backend-test/WebParsing/ArtSciParserTests.hs index d5b734649..4b645c2a7 100644 --- a/backend-test/WebParsing/ArtSciParserTests.hs +++ b/backend-test/WebParsing/ArtSciParserTests.hs @@ -102,7 +102,6 @@ parsedDepts = ("/section/Victoria-College", "Victoria College"), ("/section/Women-and-Gender-Studies", "Women and Gender Studies"), ("/section/Woodsworth-College", "Woodsworth College"), - ("/writing-faculty-arts-science", "Writing in the Faculty of Arts & Science"), ("/section/Yiddish-Studies", "Yiddish Studies") ]