author    Justin Bedo <cu@cua0.org>  2022-09-29 08:53:40 +1000
committer Justin Bedo <cu@cua0.org>  2022-09-29 08:55:35 +1000
commit    8ecdab68587c07cf88ebc0b8a8f2bd0e1a703e37 (patch)
tree      258b2b64362b5891e268f2761e6a798f2072759c
init
-rw-r--r--  .gitignore       1
-rw-r--r--  default.nix     23
-rw-r--r--  flake.lock      42
-rw-r--r--  flake.nix        8
-rw-r--r--  src/MaveDB.hs   58
-rw-r--r--  src/PAVA.hs     34
-rw-r--r--  src/Uniprot.hs  26
-rw-r--r--  src/scrape.hs   98
8 files changed, 290 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fcfc4a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+result*
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..0a9f8b6
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,23 @@
+{pkgs ? import <nixpkgs> {}}:
+with pkgs; let
+ ghc = haskellPackages.ghcWithPackages (ps:
+ with ps; [
+ lens-aeson
+ wreq
+ sqlite-simple
+ PyF
+ optparse-applicative
+ ]);
+in
+ stdenv.mkDerivation {
+ name = "mavedb-scraper";
+ src = ./src;
+
+ buildInputs = [ghc];
+
+ buildPhase = ''
+ ghc -o scrape *.hs -O -rtsopts -threaded
+ '';
+
+ installPhase = "install -Dm755 ./scrape $out/bin/scrape";
+ }
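The derivation above bakes the five Haskell dependencies into a single GHC via ghcWithPackages and compiles everything under src/ in one ghc invocation; built standalone with nix-build, the binary should land at ./result/bin/scrape (the result* pattern in .gitignore covers the symlink).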
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..f22c045
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,42 @@
+{
+ "nodes": {
+ "flake-utils": {
+ "locked": {
+ "lastModified": 1659877975,
+ "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1664405354,
+ "narHash": "sha256-kGTmqe/6GTfP2dbvYSH2MnoGSTwY4C7SL7ofcYf3/N0=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "37280688cb4221215696003baf8797858c8c546a",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "flake-utils": "flake-utils",
+ "nixpkgs": "nixpkgs"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..37741f3
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,8 @@
+{
+  inputs.nixpkgs.url = "github:nixos/nixpkgs";
+  inputs.flake-utils.url = "github:numtide/flake-utils";
+  outputs = {self, nixpkgs, flake-utils}:
+    flake-utils.lib.eachDefaultSystem (system: let pkgs = import nixpkgs {inherit system;}; in {
+      defaultPackage = import ./default.nix {inherit pkgs;};
+    });
+}
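The flake is a thin wrapper over default.nix: for every default system it instantiates the pinned nixpkgs and exposes the scraper as defaultPackage, so nix build should produce the same derivation as a plain nix-build.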
diff --git a/src/MaveDB.hs b/src/MaveDB.hs
new file mode 100644
index 0000000..b662231
--- /dev/null
+++ b/src/MaveDB.hs
@@ -0,0 +1,58 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+
+module MaveDB where
+
+import Control.Lens
+import Data.Aeson.Lens
+import qualified Data.ByteString.Lazy.Char8 as BS
+import qualified Data.Map.Strict as M
+import Data.Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import Network.HTTP.Client (managerResponseTimeout, responseTimeoutNone)
+import Network.HTTP.Client.TLS (tlsManagerSettings)
+import Network.Wreq
+import Uniprot (UniprotID)
+
+type URN = Text
+
+type Gene = Text
+
+type HGVSP = Text
+
+type Score = (HGVSP, Double)
+
+-- Standard options for requests: disable the response timeout
+opts :: Options
+opts = defaults & manager .~ Left (tlsManagerSettings {managerResponseTimeout = responseTimeoutNone})
+
+
+-- Retrieve the URNs of scoresets targeting protein-coding human (hg38) genes, with their UniProt IDs
+queryURNs :: IO [(URN, UniprotID)]
+queryURNs = do
+ rep <- getWith opts "https://mavedb.org/api/scoresets/"
+ pure $
+ rep ^. responseBody
+      & (^.. traverse . runFold ((,) <$> Fold (key "urn" . _String) <*> Fold (key "target" . key "uniprot" . key "identifier" . _String))) -- 4. extract (URN, UniProt ID) pairs
+        . (^.. folded . filtered (allOf (key "target" . key "reference_maps" . _Array . traverse . key "genome" . key "short_name" . _String) (== "hg38"))) -- 3. keep hg38 reference maps
+        . (^.. folded . filtered (allOf (key "target" . key "type" . _String) (== "Protein coding"))) -- 2. keep protein-coding targets
+        . (^.. _Array . folded . filtered (allOf (key "target" . key "score_columns" . _Array) (\((^.. traverse . _String) -> scs) -> "hgvs_pro" `elem` scs && "score" `elem` scs))) -- 1. keep scoresets with hgvs_pro and score columns (composition applies bottom-up)
+
+getScores :: URN -> IO [Score]
+getScores urn = do
+ let urn' = T.unpack urn
+ rep <- getWith opts $ "https://mavedb.org/scoreset/" <> urn' <> "/scores/"
+ let hdr : rows = dropComments . BS.lines . BS.filter (/= '\r') $ rep ^. responseBody
+ rows' = map (M.fromList . zip (BS.split ',' hdr) . BS.split ',') rows
+ pure $
+ rows'
+ ^.. traverse
+ . runFold
+ ( (,)
+ <$> Fold (at "hgvs_pro" . to (fromMaybe $ error ("no hgvs_pro column in " <> urn')) . to (T.pack . BS.unpack))
+ <*> Fold (at "score" . traverse . _Double)
+ )
+ where
+      dropComments (a : as) | "#" `BS.isPrefixOf` a = dropComments as -- safe on empty lines, unlike BS.head
+ dropComments x = x
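MaveDB.hs splits the scrape into two calls: queryURNs enumerates eligible scoresets and getScores fetches one scoreset's per-variant CSV. A minimal driver sketch (the demo helper is hypothetical; it assumes the MaveDB module and its qualified Data.Text import are in scope, plus network access to mavedb.org):

    -- Hypothetical demo: list a couple of eligible scoresets and
    -- show the first few (HGVSP, score) pairs of each.
    demo :: IO ()
    demo = do
      urns <- queryURNs               -- [(URN, UniprotID)]
      mapM_ step (take 2 urns)        -- cap requests for a quick look
      where
        step (urn, uid) = do
          scores <- getScores urn     -- [(HGVSP, Double)]
          putStrLn $ T.unpack urn <> " / " <> T.unpack uid <> ": " <> show (take 3 scores)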
diff --git a/src/PAVA.hs b/src/PAVA.hs
new file mode 100644
index 0000000..35ea4ec
--- /dev/null
+++ b/src/PAVA.hs
@@ -0,0 +1,34 @@
+module PAVA where
+
+data Block a = Block ([a] -> a) [a]
+
+instance (Fractional a, Eq a) => Eq (Block a) where
+ x == y = mu x == mu y
+
+instance (Fractional a, Ord a) => Ord (Block a) where
+ x <= y = mu x <= mu y
+
+mu :: Fractional a => Block a -> a
+mu (Block f xs) = f xs
+
+mean :: Fractional a => [a] -> a
+mean = (\(a, b) -> b / a) . foldl (\(a, b) c -> (a + 1, b + c)) (0, 0)
+
+pava' :: (Ord a, Fractional a) => ([a] -> a) -> [a] -> [a]
+pava' f = concatMap unblock . untilFixed fixViol . map (Block f . pure)
+ where
+ untilFixed f x =
+ let y = f x
+ in if y == x then y else untilFixed f y
+
+ fixViol (a : b : xs)
+ | a > b = merge a b : xs
+ | otherwise = a : fixViol (b : xs)
+ fixViol x = x
+
+ merge (Block _ a) (Block _ b) = Block f (a <> b)
+
+pava = pava' mean
+
+unblock :: Fractional a => Block a -> [a]
+unblock (Block f xs) = replicate (length xs) (f xs)
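PAVA.hs is a straightforward pool-adjacent-violators implementation: every value starts as its own block, adjacent blocks whose means are out of order get merged, and the passes repeat until the block means are non-decreasing. A worked check of the mean-based solver:

    -- [1,3,2,4] has one violation (3 > 2); pooling those two points
    -- into a block with mean 2.5 gives the isotonic fit:
    --
    -- >>> pava [1, 3, 2, 4]
    -- [1.0,2.5,2.5,4.0]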
diff --git a/src/Uniprot.hs b/src/Uniprot.hs
new file mode 100644
index 0000000..9458cfb
--- /dev/null
+++ b/src/Uniprot.hs
@@ -0,0 +1,26 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Uniprot where
+
+import Codec.Compression.GZip
+import Control.Lens
+import Data.Aeson.Lens
+import qualified Data.ByteString.Char8 as BSS
+import qualified Data.ByteString.Lazy.Char8 as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Network.Wreq
+
+type UniprotID = Text
+
+type GeneName = Text
+
+geneName :: UniprotID -> IO GeneName
+geneName uid = do
+  rep <- getWith (defaults & header "Accept" .~ ["application/json"]) $ "https://rest.uniprot.org/uniprotkb/" <> T.unpack uid
+ let encoding = rep ^. responseHeader "content-encoding"
+ case encoding of
+ "gzip" ->
+ let json = decompress $ rep ^. responseBody
+ in pure $ json ^. key "genes" . _Array . traverse . key "geneName" . key "value" . _String
+    _ -> error $ "unknown content encoding " <> BSS.unpack encoding
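geneName folds every genes[].geneName.value in the UniProtKB JSON into one Text (names would concatenate if an entry listed several genes), and insists on a gzip-encoded response. A usage sketch against a well-known accession (requires network access):

    -- >>> geneName "P38398"   -- the BRCA1_HUMAN accession
    -- "BRCA1"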
diff --git a/src/scrape.hs b/src/scrape.hs
new file mode 100644
index 0000000..7a5a280
--- /dev/null
+++ b/src/scrape.hs
@@ -0,0 +1,98 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE TupleSections #-}
+
+module Main where
+
+import Control.Lens
+import Data.List
+import qualified Data.Map as M
+import Data.Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import Database.SQLite.Simple
+import Database.SQLite3 as S3 hiding (Statement)
+import MaveDB
+import Options.Applicative
+import PAVA
+import PyF
+import Uniprot
+
+schema =
+ [fmt|
+ PRAGMA journal_mode = OFF;
+ PRAGMA synchronous = OFF;
+ PRAGMA temp_store = memory;
+ PRAGMA mmap_size = 30000000000;
+ PRAGMA page_size = 32768;
+
+ CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL,
+ PRIMARY KEY (gene, hgvs_p));
+ |]
+
+newtype Config = Config FilePath
+
+main = configured =<< execParser opts
+ where
+ config =
+ Config
+ <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into")
+
+ opts =
+ info
+ (config <**> helper)
+ (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper")
+
+configured (Config out) = do
+ urns <- queryURNs
+ scores <-
+ M.fromListWith (M.unionWith (++)) . catMaybes
+ <$> mapM
+ ( \(x, u) -> do
+ g <- geneName u
+ T.putStrLn g
+ s <- normalise <$> getScores x
+ pure $ (g,) <$> s
+ )
+ urns
+
+ let cScores = M.map (M.toList . M.map geomean) scores
+
+ withConnection out $ \conn@Connection {..} -> do
+ S3.exec connectionHandle (fromQuery schema)
+ withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores
+
+geomean :: [Double] -> Double
+geomean = exp . mean . map log
+
+insertGene conn (gene, muts) =
+  mapM_ (insertMut conn gene) muts
+
+insertMut conn gene (mut, score) =
+ execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score)
+
+normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double])
+normalise xs =
+ let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]
+ (hgvs, y) = unzip $ map snd ord
+ y' = pava y
+ in M.fromList . zip hgvs . map pure <$> puCorrection hgvs y'
+
+notTer str = if T.takeEnd 3 str == "Ter" then 0 else 1 :: Double
+
+-- Assumes all elements are in {0,1}
+weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a
+weightedMean w0 w1 xs =
+ let n1 = sum xs
+ n0 = fromIntegral (length xs) - n1
+ in w1 * n1 / (w1 * n1 + w0 * n0)
+
+puCorrection hgvs ys =
+ let lab = map notTer hgvs
+ in M.fromListWith (++) (zip lab $ map pure ys) ^. at 1 <&> \nons ->
+      let cf = 1 - mean nons -- NB: reversed, as DMS scores are lower for Ter (nonsense) variants
+ w0 = 2 / cf
+ w1 = 1 / mean lab
+ in pava' (weightedMean w0 w1) lab
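weightedMean carries the positive-unlabeled correction: it reduces a 0/1 label vector to a weighted positive fraction, which pava' then fits monotonically over the score-ordered variants. A worked check of the formula, with weights picked purely for illustration:

    -- With w0 = 2, w1 = 1 and labels [1,1,0,0]:
    --   n1 = 2, n0 = 2, so w1*n1 / (w1*n1 + w0*n0) = 2/6
    --
    -- >>> weightedMean 2 1 [1, 1, 0, 0]
    -- 0.3333333333333333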