diff options
author: Justin Bedo <> 2022-09-29 14:02:21 +1000
committer: Justin Bedo <> 2022-09-30 13:36:15 +1000
commit: 52cda6932504f13f3ea0f933c58de97a296f19bf (patch)
parent: 16012d5e9b1ecf9887db715eb87f414c4dd9a6fa (diff)
add filtering on estimated labelling proportions
1 file changed, 18 insertions, 9 deletions
diff --git a/src/scrape.hs b/src/scrape.hs
index c9ff8e1..56f4631 100644
--- a/src/scrape.hs
+++ b/src/scrape.hs
@@ -32,20 +32,29 @@ schema =
PRIMARY KEY (gene, hgvs_p));
-newtype Config = Config FilePath
+data Config = Config FilePath Double
main = configured =<< execParser opts
config =
<$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into")
+ <*> option
+ auto
+ ( long "min-pi"
+ <> short 'p'
+ <> metavar "π"
+ <> help "minimum estimated labelling proportion"
+ <> showDefault
+ <> value 0.1
+ )
opts =
(config <**> helper)
(fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper")
-configured (Config out) = do
+configured (Config out minpi) = do
urns <- queryURNs
scores <-
M.fromListWith (M.unionWith (++)) . catMaybes
@@ -53,7 +62,7 @@ configured (Config out) = do
( \(x, u) -> do
g <- geneName u
T.putStrLn g
- s <- normalise <$> getScores x
+ s <- normalise minpi <$> getScores x
pure $ (g,) <$> s
@@ -73,12 +82,12 @@ insertGene conn (gene, muts) = do
insertMut conn gene (mut, score) =
execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score)
-normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double])
-normalise xs =
+normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double])
+normalise minpi xs =
let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]
(hgvs, y) = unzip $ map snd ord
y' = pava y
- in M.fromList . zip hgvs . map pure <$> puCorrection y y'
+ in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y'
notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double
@@ -89,9 +98,9 @@ weightedMean w0 w1 xs =
n0 = fromIntegral (length xs) - n1
in w1 * n1 / (w1 * n1 + w0 * n0)
-puCorrection lab ys =
- M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 <&> \nonsc ->
+puCorrection minpi lab ys =
+ M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc ->
let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter
w0 = 2 / cf
w1 = 1 / mean lab
- in pava' (weightedMean w0 w1) lab
+ in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab