From 52cda6932504f13f3ea0f933c58de97a296f19bf Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Thu, 29 Sep 2022 14:02:21 +1000 Subject: add filtering on estimated labelling proportions --- src/scrape.hs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/scrape.hs b/src/scrape.hs index c9ff8e1..56f4631 100644 --- a/src/scrape.hs +++ b/src/scrape.hs @@ -32,20 +32,29 @@ schema = PRIMARY KEY (gene, hgvs_p)); |] -newtype Config = Config FilePath +data Config = Config FilePath Double main = configured =<< execParser opts where config = Config <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") + <*> option + auto + ( long "min-pi" + <> short 'p' + <> metavar "π" + <> help "minimum estimated labelling proportion" + <> showDefault + <> value 0.1 + ) opts = info (config <**> helper) (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") -configured (Config out) = do +configured (Config out minpi) = do urns <- queryURNs scores <- M.fromListWith (M.unionWith (++)) . catMaybes @@ -53,7 +62,7 @@ configured (Config out) = do ( \(x, u) -> do g <- geneName u T.putStrLn g - s <- normalise <$> getScores x + s <- normalise minpi <$> getScores x pure $ (g,) <$> s ) urns @@ -73,12 +82,12 @@ insertGene conn (gene, muts) = do insertMut conn gene (mut, score) = execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) -normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) -normalise xs = +normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double]) +normalise minpi xs = let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] (hgvs, y) = unzip $ map snd ord y' = pava y - in M.fromList . zip hgvs . map pure <$> puCorrection y y' + in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y' notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double @@ -89,9 +98,9 @@ weightedMean w0 w1 xs = n0 = fromIntegral (length xs) - n1 in w1 * n1 / (w1 * n1 + w0 * n0) -puCorrection lab ys = - M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 <&> \nonsc -> +puCorrection minpi lab ys = + M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc -> let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter w0 = 2 / cf w1 = 1 / mean lab - in pava' (weightedMean w0 w1) lab + in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab -- cgit v1.2.3