diff options
Diffstat (limited to 'src/scrape.hs')
-rw-r--r-- | src/scrape.hs | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/src/scrape.hs b/src/scrape.hs new file mode 100644 index 0000000..7a5a280 --- /dev/null +++ b/src/scrape.hs @@ -0,0 +1,98 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} + +module Main where + +import Control.Lens +import Data.List +import qualified Data.Map as M +import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Database.SQLite.Simple +import Database.SQLite3 as S3 hiding (Statement) +import MaveDB +import Options.Applicative +import PAVA +import PyF +import Uniprot + +schema = + [fmt| + PRAGMA journal_mode = OFF; + PRAGMA synchronous = OFF; + PRAGMA temp_store = memory; + PRAGMA mmap_size = 30000000000; + PRAGMA page_size = 32768; + + CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL, + PRIMARY KEY (gene, hgvs_p)); + |] + +newtype Config = Config FilePath + +main = configured =<< execParser opts + where + config = + Config + <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") + + opts = + info + (config <**> helper) + (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") + +configured (Config out) = do + urns <- queryURNs + scores <- + M.fromListWith (M.unionWith (++)) . catMaybes + <$> mapM + ( \(x, u) -> do + g <- geneName u + T.putStrLn g + s <- normalise <$> getScores x + pure $ (g,) <$> s + ) + urns + + let cScores = M.map (M.toList . M.map geomean) scores + + withConnection out $ \conn@Connection {..} -> do + S3.exec connectionHandle (fromQuery schema) + withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores + +geomean :: [Double] -> Double +geomean = exp . mean . map log + +insertGene conn (gene, muts) = do + mapM_ (insertMut conn gene) muts + +insertMut conn gene (mut, score) = + execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) + +normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) +normalise xs = + let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] + (hgvs, y) = unzip $ map snd ord + y' = pava y + in M.fromList . zip hgvs . map pure <$> puCorrection hgvs y' + +notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double + +-- Assumes ∈ {0,1} +weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a +weightedMean w0 w1 xs = + let n1 = sum xs + n0 = fromIntegral (length xs) - n1 + in w1 * n1 / (w1 * n1 + w0 * n0) + +puCorrection hgvs ys = + let lab = map notTer hgvs + in M.fromListWith (++) (zip lab $ map pure ys) ^. at 1 <&> \nons -> + let cf = 1 - mean nons -- NB: reversed due to dms score being lower when Ter + w0 = 2 / cf + w1 = 1 / mean lab + in pava' (weightedMean w0 w1) lab |