diff options
| -rw-r--r-- | src/scrape.hs | 27 | 
1 files changed, 18 insertions, 9 deletions
diff --git a/src/scrape.hs b/src/scrape.hs index c9ff8e1..56f4631 100644 --- a/src/scrape.hs +++ b/src/scrape.hs @@ -32,20 +32,29 @@ schema =        PRIMARY KEY (gene, hgvs_p));    |] -newtype Config = Config FilePath +data Config = Config FilePath Double  main = configured =<< execParser opts    where      config =        Config          <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") +        <*> option +          auto +          ( long "min-pi" +              <> short 'p' +              <> metavar "π" +              <> help "minimum estimated labelling proportion" +              <> showDefault +              <> value 0.1 +          )      opts =        info          (config <**> helper)          (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") -configured (Config out) = do +configured (Config out minpi) = do    urns <- queryURNs    scores <-      M.fromListWith (M.unionWith (++)) . catMaybes @@ -53,7 +62,7 @@ configured (Config out) = do          ( \(x, u) -> do              g <- geneName u              T.putStrLn g -            s <- normalise <$> getScores x +            s <- normalise minpi <$> getScores x              pure $ (g,) <$> s          )          urns @@ -73,12 +82,12 @@ insertGene conn (gene, muts) = do  insertMut conn gene (mut, score) =    execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) -normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) -normalise xs = +normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double]) +normalise minpi xs =    let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]        (hgvs, y) = unzip $ map snd ord        y' = pava y -   in M.fromList . zip hgvs . map pure <$> puCorrection y y' +   in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y'  notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double @@ -89,9 +98,9 @@ weightedMean w0 w1 xs =        n0 = fromIntegral (length xs) - n1     in w1 * n1 / (w1 * n1 + w0 * n0) -puCorrection lab ys = -  M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 <&> \nonsc -> +puCorrection minpi lab ys = +  M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc ->      let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter          w0 = 2 / cf          w1 = 1 / mean lab -     in pava' (weightedMean w0 w1) lab +     in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab  | 
