summaryrefslogtreecommitdiff
path: root/src/scrape.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/scrape.hs')
-rw-r--r--src/scrape.hs98
1 files changed, 98 insertions, 0 deletions
diff --git a/src/scrape.hs b/src/scrape.hs
new file mode 100644
index 0000000..7a5a280
--- /dev/null
+++ b/src/scrape.hs
@@ -0,0 +1,98 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE TupleSections #-}
+
+module Main where
+
+import Control.Lens
+import Data.List
+import qualified Data.Map as M
+import Data.Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import Database.SQLite.Simple
+import Database.SQLite3 as S3 hiding (Statement)
+import MaveDB
+import Options.Applicative
+import PAVA
+import PyF
+import Uniprot
+
+schema =
+ [fmt|
+ PRAGMA journal_mode = OFF;
+ PRAGMA synchronous = OFF;
+ PRAGMA temp_store = memory;
+ PRAGMA mmap_size = 30000000000;
+ PRAGMA page_size = 32768;
+
+ CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL,
+ PRIMARY KEY (gene, hgvs_p));
+ |]
+
+newtype Config = Config FilePath
+
+main = configured =<< execParser opts
+ where
+ config =
+ Config
+ <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into")
+
+ opts =
+ info
+ (config <**> helper)
+ (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper")
+
+configured (Config out) = do
+ urns <- queryURNs
+ scores <-
+ M.fromListWith (M.unionWith (++)) . catMaybes
+ <$> mapM
+ ( \(x, u) -> do
+ g <- geneName u
+ T.putStrLn g
+ s <- normalise <$> getScores x
+ pure $ (g,) <$> s
+ )
+ urns
+
+ let cScores = M.map (M.toList . M.map geomean) scores
+
+ withConnection out $ \conn@Connection {..} -> do
+ S3.exec connectionHandle (fromQuery schema)
+ withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores
+
+geomean :: [Double] -> Double
+geomean = exp . mean . map log
+
+insertGene conn (gene, muts) = do
+ mapM_ (insertMut conn gene) muts
+
+insertMut conn gene (mut, score) =
+ execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score)
+
+normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double])
+normalise xs =
+ let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]
+ (hgvs, y) = unzip $ map snd ord
+ y' = pava y
+ in M.fromList . zip hgvs . map pure <$> puCorrection hgvs y'
+
+notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double
+
+-- Assumes ∈ {0,1}
+weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a
+weightedMean w0 w1 xs =
+ let n1 = sum xs
+ n0 = fromIntegral (length xs) - n1
+ in w1 * n1 / (w1 * n1 + w0 * n0)
+
+puCorrection hgvs ys =
+ let lab = map notTer hgvs
+ in M.fromListWith (++) (zip lab $ map pure ys) ^. at 1 <&> \nons ->
+ let cf = 1 - mean nons -- NB: reversed due to dms score being lower when Ter
+ w0 = 2 / cf
+ w1 = 1 / mean lab
+ in pava' (weightedMean w0 w1) lab