upd

Argusmocny · Argusmocny · commit f42507a13e88 · 2021-04-25T23:16:28.000+02:00
diff --git a/__init__.py b/__init__.py
@@ -1,2 +1,4 @@
 from .creation import cm_to_eids
-from .tensor_ops import delete_indices
+from .tensor_ops import delete_indices
+from .parsing import parse_graph_data
+from .parsing import parse_graph_data_torch
diff --git a/examples.ipynb b/examples.ipynb
@@ -0,0 +1,288 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 193,
+   "id": "measured-andrew",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import math\n",
+    "\n",
+    "import numba\n",
+    "import numpy as np\n",
+    "import atomium\n",
+    "import Bio\n",
+    "import torch as th\n",
+    "\n",
+    "from scipy.spatial import distance_matrix\n",
+    "protein_letters_3to1 = Bio.SeqUtils.IUPACData.protein_letters_3to1_extended\n",
+    "protein_letters_3to1 = {k.upper() : v for k,v in protein_letters_3to1.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 239,
+   "id": "common-kingdom",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@numba.njit(parallel=True)\n",
+    "def numba_jit_scalar_distance_parallel(xyz):\n",
+    "    rows = xyz.shape[0]\n",
+    "    output = np.empty((rows, rows), dtype=np.float32)\n",
+    "    for i in numba.prange(rows):\n",
+    "        cols = rows - i\n",
+    "        for j in numba.prange(cols):\n",
+    "            tmp = 0.0\n",
+    "            tmp += (xyz[i, 0] - xyz[j, 0])**2\n",
+    "            tmp += (xyz[i, 1] - xyz[j, 1])**2\n",
+    "            tmp += (xyz[i, 2] - xyz[j, 2])**2\n",
+    "            tmp = math.sqrt(tmp) \n",
+    "            output[i,j] = tmp\n",
+    "            output[j,i] = tmp\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "id": "norwegian-sport",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_atom_xyz(atoms, atom_name):\n",
+    "    for a in atoms:\n",
+    "        if a.name == atom_name:\n",
+    "            return a.location\n",
+    "    return (np.nan, np.nan, np.nan)\n",
+    "\n",
+    "def get_ss_label(residue):\n",
+    "    '''\n",
+    "    E, H or C label from atomium\n",
+    "    '''\n",
+    "    if residue.helix:\n",
+    "        return 'H'\n",
+    "    elif residue.strand:\n",
+    "        return 'E'\n",
+    "    else:\n",
+    "        return 'C'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 237,
+   "id": "muslim-smoke",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_graph_data_numba(path_pdb, chain):\n",
+    "    \n",
+    "    if not os.path.isfile(path_pdb):\n",
+    "        FileNotFoundError('no such file', path_pdb)\n",
+    "    file = atomium.open(path_pdb)\n",
+    "    chain = file.model.chain(chain)\n",
+    "    preparation_dict = dict()\n",
+    "    for i, r in enumerate(chain.residues()):\n",
+    "        r_atoms = r.atoms()\n",
+    "        preparation_dict[i] = {'aa' : protein_letters_3to1[r.name],\n",
+    "                                    'charge' : r.charge,\n",
+    "                                    'CA' : get_atom_xyz(r_atoms, 'CA'),\n",
+    "                                    'CB' : get_atom_xyz(r_atoms, 'CB'),\n",
+    "                                    'ss_label' : get_ss_label(r)\n",
+    "                                   }\n",
+    "\n",
+    "        ca_xyz = np.asarray(list(map(lambda v : v['CA'], preparation_dict.values())), dtype=np.float32)\n",
+    "        sequence = list(map(lambda v : v['aa'], preparation_dict.values()))\n",
+    "        ca_ca_matrix = numba_jit_scalar_distance_parallel(ca_xyz)\n",
+    "    return ca_ca_matrix, sequence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 176,
+   "id": "hidden-thanksgiving",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_graph_data_torch(path_pdb, chain):\n",
+    "    \n",
+    "    if not os.path.isfile(path_pdb):\n",
+    "        FileNotFoundError('no such file', path_pdb)\n",
+    "    file = atomium.open(path_pdb)\n",
+    "    chain = file.model.chain(chain)\n",
+    "    preparation_dict = dict()\n",
+    "    for i, r in enumerate(chain.residues()):\n",
+    "        r_atoms = r.atoms()\n",
+    "        preparation_dict[i] = {'aa' : protein_letters_3to1[r.name],\n",
+    "                                    'charge' : r.charge,\n",
+    "                                    'CA' : get_atom_xyz(r_atoms, 'CA'),\n",
+    "                                    'CB' : get_atom_xyz(r_atoms, 'CB'),\n",
+    "                                    'ss_label' : get_ss_label(r)\n",
+    "                                   }\n",
+    "\n",
+    "        ca_xyz = th.FloatTensor(list(map(lambda v : v['CA'], preparation_dict.values())))\n",
+    "        sequence = list(map(lambda v : v['aa'], preparation_dict.values()))\n",
+    "\n",
+    "        ca_ca_matrix = th.cdist(ca_xyz, ca_xyz)\n",
+    "    return ca_ca_matrix, sequence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "id": "demographic-marijuana",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_graph_data(path_pdb, chain):\n",
+    "    \n",
+    "    if not os.path.isfile(path_pdb):\n",
+    "        FileNotFoundError('no such file', path_pdb)\n",
+    "    file = atomium.open(path_pdb)\n",
+    "    chain = file.model.chain(chain)\n",
+    "    preparation_dict = dict()\n",
+    "    for i, r in enumerate(chain.residues()):\n",
+    "        r_atoms = r.atoms()\n",
+    "        preparation_dict[i] = {'aa' : protein_letters_3to1[r.name],\n",
+    "                                    'charge' : r.charge,\n",
+    "                                    'CA' : get_atom_xyz(r_atoms, 'CA'),\n",
+    "                                    'CB' : get_atom_xyz(r_atoms, 'CB'),\n",
+    "                                    'ss_label' : get_ss_label(r)\n",
+    "                                   }\n",
+    "\n",
+    "        ca_xyz = np.asarray(list(map(lambda v : v['CA'], preparation_dict.values())), dtype=np.float32)\n",
+    "        sequence = list(map(lambda v : v['aa'], preparation_dict.values()))\n",
+    "\n",
+    "        ca_ca_matrix = distance_matrix(ca_xyz, ca_xyz)\n",
+    "    return ca_ca_matrix, sequence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 169,
+   "id": "laden-chart",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = '/home/db/localpdb/mirror/ea/pdb6eac.ent.gz'\n",
+    "chain = 'A'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 185,
+   "id": "monetary-wrapping",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4.11 s ± 3.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit parse_graph_data(path, chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 186,
+   "id": "armed-syndication",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.99 s ± 7.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit parse_graph_data_torch(path, chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 240,
+   "id": "rapid-accountability",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.05 s ± 9.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit parse_graph_data_numba(path, chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 241,
+   "id": "foster-output",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a,b = parse_graph_data_torch(path, chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 244,
+   "id": "broke-heart",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "885.0625"
+      ]
+     },
+     "execution_count": 244,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a.element_size()*a.nelement() / 1024"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "union-injection",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/files_io.py b/files_io.py
@@ -0,0 +1,62 @@
+import json
+import os
+import pickle
+import gzip
+
+'''
+strorage for quick function
+'''
+
+
+def get_json(file):
+    '''
+    read json to object
+    '''
+    assert isinstance(file, str)
+    
+    with open(file, 'r') as f:
+        data = json.load(f)
+    return data
+
+def save_json(file, data):
+    '''
+    save json to file
+    '''
+    assert isinstance(file, str)
+    assert isinstance(data, dict)
+    
+    with open(file, 'w') as f:
+        json.dump(data, f)
+        
+        
+def load_gpickle(file):
+    '''
+    returns content of gziped (optional requires .gz extension) pickle file
+    '''
+    
+    assert isinstance(file, str)
+    assert os.path.isfile(file), f'no such file {file}'
+    
+    if file.endswith('.gz'):
+        with gzip.open(file, 'rb') as f:
+            data = pickle.load(f)
+    else:
+        with open(file, 'rb') as f:
+            data = pickle.load(f)
+    return data
+
+def save_gpickle(obj, file):
+    '''
+    pickles `obj` if file endswith .gz then zip pickle
+    '''
+    
+    assert isinstance(file, str)
+    assert os.path.isdir(os.path.dirname(file)), f'no such directory: {file}'
+    
+    if file.endswith('.gz'):
+        with gzip.open(file, 'wb') as f:
+            pickle.dump(obj, f)
+    else:
+        with open(file, 'rb') as f:
+            pickle.dump(obj, f)
+            
diff --git a/parsing.py b/parsing.py