diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/MaveDB.hs | 58 | ||||
-rw-r--r-- | src/PAVA.hs | 34 | ||||
-rw-r--r-- | src/Uniprot.hs | 26 | ||||
-rw-r--r-- | src/scrape.hs | 98 |
4 files changed, 216 insertions, 0 deletions
diff --git a/src/MaveDB.hs b/src/MaveDB.hs new file mode 100644 index 0000000..b662231 --- /dev/null +++ b/src/MaveDB.hs @@ -0,0 +1,58 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} + +module MaveDB where + +import Control.Lens +import Data.Aeson.Lens +import qualified Data.ByteString.Lazy.Char8 as BS +import qualified Data.Map.Strict as M +import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T +import Network.HTTP.Client (managerResponseTimeout, responseTimeoutNone) +import Network.HTTP.Client.TLS (tlsManagerSettings) +import Network.Wreq +import Uniprot (UniprotID) + +type URN = Text + +type Gene = Text + +type HGVSP = Text + +type Score = (HGVSP, Double) + +-- Standard options for requests +opts = defaults & manager .~ Left (tlsManagerSettings {managerResponseTimeout = responseTimeoutNone}) + +--opts = defaults & manager .~ Left tlsManagerSettings + +-- Retrieve a list of URNs which are protein coding human genes +queryURNs :: IO [(URN, UniprotID)] +queryURNs = do + rep <- getWith opts "https://mavedb.org/api/scoresets/" + pure $ + rep ^. responseBody + & (^.. traverse . runFold ((,) <$> Fold (key "urn" . _String) <*> Fold (key "target" . key "uniprot" . key "identifier" . _String))) + . (^.. folded . filtered (allOf (key "target" . key "reference_maps" . _Array . traverse . key "genome" . key "short_name" . _String) (== "hg38"))) + . (^.. folded . filtered (allOf (key "target" . key "type" . _String) (== "Protein coding"))) + . (^.. _Array . folded . filtered (allOf (key "target" . key "score_columns" . _Array) (\((^.. traverse . _String) -> scs) -> "hgvs_pro" `elem` scs && "score" `elem` scs))) + +getScores :: URN -> IO [Score] +getScores urn = do + let urn' = T.unpack urn + rep <- getWith opts $ "https://mavedb.org/scoreset/" <> urn' <> "/scores/" + let hdr : rows = dropComments . BS.lines . BS.filter (/= '\r') $ rep ^. responseBody + rows' = map (M.fromList . zip (BS.split ',' hdr) . BS.split ',') rows + pure $ + rows' + ^.. traverse + . runFold + ( (,) + <$> Fold (at "hgvs_pro" . to (fromMaybe $ error ("no hgvs_pro column in " <> urn')) . to (T.pack . BS.unpack)) + <*> Fold (at "score" . traverse . _Double) + ) + where + dropComments (a : as) = if BS.head a == '#' then dropComments as else a : as + dropComments x = x diff --git a/src/PAVA.hs b/src/PAVA.hs new file mode 100644 index 0000000..35ea4ec --- /dev/null +++ b/src/PAVA.hs @@ -0,0 +1,34 @@ +module PAVA where + +data Block a = Block ([a] -> a) [a] + +instance (Fractional a, Eq a) => Eq (Block a) where + x == y = mu x == mu y + +instance (Fractional a, Ord a) => Ord (Block a) where + x <= y = mu x <= mu y + +mu :: Fractional a => Block a -> a +mu (Block f xs) = f xs + +mean :: Fractional a => [a] -> a +mean = (\(a, b) -> b / a) . foldl (\(a, b) c -> (a + 1, b + c)) (0, 0) + +pava' :: (Show a, Eq a, Ord a, Fractional a) => ([a] -> a) -> [a] -> [a] +pava' f = concatMap unblock . untilFixed fixViol . map (Block f . pure) + where + untilFixed f x = + let y = f x + in if y == x then y else untilFixed f y + + fixViol (a : b : xs) + | a > b = merge a b : xs + | otherwise = a : fixViol (b : xs) + fixViol x = x + + merge (Block _ a) (Block _ b) = Block f (a <> b) + +pava = pava' mean + +unblock :: Fractional a => Block a -> [a] +unblock (Block f xs) = replicate (length xs) (f xs) diff --git a/src/Uniprot.hs b/src/Uniprot.hs new file mode 100644 index 0000000..9458cfb --- /dev/null +++ b/src/Uniprot.hs @@ -0,0 +1,26 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Uniprot where + +import Codec.Compression.GZip +import Control.Lens +import Data.Aeson.Lens +import qualified Data.ByteString.Char8 as BSS +import qualified Data.ByteString.Lazy.Char8 as BS +import Data.Text (Text) +import qualified Data.Text as T +import Network.Wreq + +type UniprotID = Text + +type GeneName = Text + +geneName :: UniprotID -> IO GeneName +geneName id = do + rep <- getWith (defaults & header "Accept" .~ ["application/json"]) $ "https://rest.uniprot.org/uniprotkb/" <> T.unpack id + let encoding = rep ^. responseHeader "content-encoding" + case encoding of + "gzip" -> + let json = decompress $ rep ^. responseBody + in pure $ json ^. key "genes" . _Array . traverse . key "geneName" . key "value" . _String + _ -> error $ "unkown content encoding " <> BSS.unpack encoding diff --git a/src/scrape.hs b/src/scrape.hs new file mode 100644 index 0000000..7a5a280 --- /dev/null +++ b/src/scrape.hs @@ -0,0 +1,98 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} + +module Main where + +import Control.Lens +import Data.List +import qualified Data.Map as M +import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Database.SQLite.Simple +import Database.SQLite3 as S3 hiding (Statement) +import MaveDB +import Options.Applicative +import PAVA +import PyF +import Uniprot + +schema = + [fmt| + PRAGMA journal_mode = OFF; + PRAGMA synchronous = OFF; + PRAGMA temp_store = memory; + PRAGMA mmap_size = 30000000000; + PRAGMA page_size = 32768; + + CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL, + PRIMARY KEY (gene, hgvs_p)); + |] + +newtype Config = Config FilePath + +main = configured =<< execParser opts + where + config = + Config + <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") + + opts = + info + (config <**> helper) + (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") + +configured (Config out) = do + urns <- queryURNs + scores <- + M.fromListWith (M.unionWith (++)) . catMaybes + <$> mapM + ( \(x, u) -> do + g <- geneName u + T.putStrLn g + s <- normalise <$> getScores x + pure $ (g,) <$> s + ) + urns + + let cScores = M.map (M.toList . M.map geomean) scores + + withConnection out $ \conn@Connection {..} -> do + S3.exec connectionHandle (fromQuery schema) + withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores + +geomean :: [Double] -> Double +geomean = exp . mean . map log + +insertGene conn (gene, muts) = do + mapM_ (insertMut conn gene) muts + +insertMut conn gene (mut, score) = + execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) + +normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) +normalise xs = + let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] + (hgvs, y) = unzip $ map snd ord + y' = pava y + in M.fromList . zip hgvs . map pure <$> puCorrection hgvs y' + +notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double + +-- Assumes ∈ {0,1} +weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a +weightedMean w0 w1 xs = + let n1 = sum xs + n0 = fromIntegral (length xs) - n1 + in w1 * n1 / (w1 * n1 + w0 * n0) + +puCorrection hgvs ys = + let lab = map notTer hgvs + in M.fromListWith (++) (zip lab $ map pure ys) ^. at 1 <&> \nons -> + let cf = 1 - mean nons -- NB: reversed due to dms score being lower when Ter + w0 = 2 / cf + w1 = 1 / mean lab + in pava' (weightedMean w0 w1) lab |