gptel-gemini.el (13221B)
;;; gptel-gemini.el --- Gemini support for gptel     -*- lexical-binding: t; -*-

;; Copyright (C) 2023  Karthik Chikmagalur

;; Author: Karthik Chikmagalur <karthikchikmagalur@gmail.com>

;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <https://www.gnu.org/licenses/>.

;;; Commentary:

;; This file adds support for the Gemini API to gptel

;;; Code:
(require 'gptel)
(require 'cl-generic)
(require 'map)
(eval-when-compile (require 'cl-lib))

(declare-function prop-match-value "text-property-search")
(declare-function text-property-search-backward "text-property-search")
(declare-function json-read "json")
(declare-function gptel-context--wrap "gptel-context")
(declare-function gptel-context--collect-media "gptel-context")
(defvar json-object-type)

;;; Gemini
(cl-defstruct
    (gptel-gemini (:constructor gptel--make-gemini)
                  (:copier nil)
                  (:include gptel-backend)))

(cl-defmethod gptel-curl--parse-stream ((_backend gptel-gemini) _info)
  "Parse a Gemini streaming response in the current buffer from point.

Repeatedly search forward for an opening brace, read one JSON
object starting there with `gptel--json-read', and collect the
text of the first part of the first candidate when present.

If reading signals an error, point is moved back to the start of
the last opening brace found and parsing stops.

Return the collected text fragments concatenated in the order
they were read."
  (let* ((content-strs))
    (condition-case nil
        ;; while-let is Emacs 29.1+ only
        (while (prog1 (search-forward "{" nil t)
                 ;; Step back onto the brace so the JSON reader sees it.
                 (backward-char 1))
          ;; Keep the match data of the brace search intact so the error
          ;; handler below can return to it.
          (save-match-data
            (when-let*
                ((response (gptel--json-read))
                 (text (map-nested-elt
                        response '(:candidates 0 :content :parts 0 :text))))
              (push text content-strs))))
      (error
       (goto-char (match-beginning 0))))
    (apply #'concat (nreverse content-strs))))

(cl-defmethod gptel--parse-response ((_backend gptel-gemini) response _info)
  "Return the text of the first part of the first candidate in RESPONSE.

The value is looked up with `map-nested-elt' and is nil when
RESPONSE has no such entry."
  (map-nested-elt response '(:candidates 0 :content :parts 0 :text)))

(cl-defmethod gptel--request-data ((_backend gptel-gemini) prompts)
  "Build the request data for sending PROMPTS to Gemini.

PROMPTS is placed, as a vector, under :contents.  All four safety
categories are sent with the threshold \"BLOCK_NONE\".  The system
message, `gptel-temperature' and `gptel-max-tokens' are added when
set.

Return the result of merging this plist with the request
parameters of `gptel-backend' and `gptel-model' using
`gptel--merge-plists'."
  ;; Build the plist from fresh conses on every call.  `plist-put' below
  ;; appends to the tail of the list destructively; with a backquoted
  ;; template that tail may be shared literal structure, in which case keys
  ;; added for one request would still be present in later ones.
  (let ((prompts-plist
         (list :contents (vconcat prompts)
               :safetySettings
               (vconcat
                (mapcar (lambda (category)
                          (list :category category
                                :threshold "BLOCK_NONE"))
                        '("HARM_CATEGORY_HARASSMENT"
                          "HARM_CATEGORY_SEXUALLY_EXPLICIT"
                          "HARM_CATEGORY_DANGEROUS_CONTENT"
                          "HARM_CATEGORY_HATE_SPEECH")))))
        params)
    ;; HACK only gemini-pro doesn't support system messages.  Need a less hacky
    ;; way to do this.
    (when (and gptel--system-message
               (not (gptel--model-capable-p 'nosystem))
               (not (equal gptel-model 'gemini-pro)))
      (setq prompts-plist
            (plist-put prompts-plist :system_instruction
                       (list :parts (list :text gptel--system-message)))))
    (when gptel-temperature
      ;; Cap the temperature instead of raising it: the previous
      ;; `(max gptel-temperature 1.0)' turned every value below 1.0 into
      ;; 1.0.  2.0 is the upper bound documented for the Gemini
      ;; generation config.
      (setq params
            (plist-put params
                       :temperature (min gptel-temperature 2.0))))
    (when gptel-max-tokens
      (setq params
            (plist-put params
                       :maxOutputTokens gptel-max-tokens)))
    (when params
      (setq prompts-plist
            (plist-put prompts-plist
                       :generationConfig params)))
    ;; Merge request params with model and backend params.
    (gptel--merge-plists
     prompts-plist
     (gptel-backend-request-params gptel-backend)
     (gptel--model-request-params gptel-model))))

(cl-defmethod gptel--parse-buffer ((_backend gptel-gemini) &optional max-entries)
  "Collect prompts for Gemini from the current buffer, up to point.

When `gptel-mode' or `gptel-track-response' is non-nil, search
backward from point with `text-property-search-backward' on the
`gptel' text property.  A region with a non-nil property value
becomes a prompt with role \"model\", any other region a prompt
with role \"user\".  User regions are parsed for media links when
`gptel-track-media' is non-nil and the model has the `media' or
`url' capability.  When MAX-ENTRIES is non-nil, it is decremented
after each region and the search stops once it drops below zero.

Otherwise the trimmed text of the whole accessible buffer is sent
as a single user prompt.

Return the list of prompts, earliest region first."
  (let ((prompts) (prop)
        (include-media (and gptel-track-media (or (gptel--model-capable-p 'media)
                                                 (gptel--model-capable-p 'url)))))
    (if (or gptel-mode gptel-track-response)
        (while (and
                (or (not max-entries) (>= max-entries 0))
                (setq prop (text-property-search-backward
                            'gptel 'response
                            (when (get-char-property (max (point-min) (1- (point)))
                                                     'gptel)
                              t))))
          (if (prop-match-value prop)   ;assistant role
              (push (list :role "model"
                          :parts
                          (list :text (buffer-substring-no-properties (prop-match-beginning prop)
                                                                      (prop-match-end prop))))
                    prompts)
            (if include-media
                (push (list :role "user"
                            :parts (gptel--gemini-parse-multipart
                                    (gptel--parse-media-links
                                     major-mode (prop-match-beginning prop) (prop-match-end prop))))
                      prompts)
              (push (list :role "user"
                          :parts
                          `[(:text ,(gptel--trim-prefixes
                                     (buffer-substring-no-properties (prop-match-beginning prop)
                                                                     (prop-match-end prop))))])
                    prompts)))
          (and max-entries (cl-decf max-entries)))
      (push (list :role "user"
                  :parts
                  `[(:text ,(string-trim (buffer-substring-no-properties (point-min) (point-max))))])
            prompts))
    ;; HACK Prepend the system message to the first user prompt, but only for
    ;; this model.
    (when (and (equal gptel-model 'gemini-pro)
               gptel--system-message)
      (cl-callf
          (lambda (msg)
            (vconcat `((:text ,(concat gptel--system-message "\n\n"))) msg))
          (thread-first (car prompts)
                        (plist-get :parts))))
    prompts))

(defun gptel--gemini-parse-multipart (parts)
  "Convert a multipart prompt PARTS to the Gemini API format.

The input is a list of plists of the form
 ((:text \"some text\")
  (:media \"/path/to/media.png\" :mime \"image/png\")
  (:text \"More text\")).

Text in the first and last entries is passed through
`gptel--trim-prefixes', and text entries that are empty are
dropped.  Media entries are converted to :inline_data entries
carrying the MIME type and the base64-encoded data.

The output is a vector of entries in a backend-appropriate
format."
  (cl-loop
   for part in parts
   for n upfrom 1
   with last = (length parts)
   for text = (plist-get part :text)
   for media = (plist-get part :media)
   if text do
   (and (or (= n 1) (= n last)) (setq text (gptel--trim-prefixes text))) and
   unless (string-empty-p text)
   collect (list :text text) into parts-array end
   else if media
   collect
   `(:inline_data
     (:mime_type ,(plist-get part :mime)
      :data ,(gptel--base64-encode media)))
   into parts-array
   finally return (vconcat parts-array)))

(cl-defmethod gptel--wrap-user-prompt ((_backend gptel-gemini) prompts
                                       &optional inject-media)
  "Wrap a user prompt in PROMPTS with the gptel context.

If INJECT-MEDIA is non-nil, prepend the media files collected by
`gptel-context--collect-media', base64-encoded, to the :parts of
the first element of PROMPTS.  Nothing is changed when no media
is collected.

Otherwise prepend the context string returned by
`gptel-context--wrap' to the :parts of the last element of
PROMPTS, leaving the parts unchanged when that string is nil.

PROMPTS is modified in place."
  (if inject-media
      ;; Wrap the first user prompt with included media files/contexts
      (when-let* ((media-list (gptel-context--collect-media)))
        (cl-callf (lambda (current)
                    (vconcat (gptel--gemini-parse-multipart media-list)
                             current))
            (plist-get (car prompts) :parts)))
    ;; Wrap the last user prompt with included text contexts
    (cl-callf (lambda (current)
                (if-let* ((wrapped (gptel-context--wrap nil)))
                    (vconcat `((:text ,wrapped)) current)
                  current))
        (plist-get (car (last prompts)) :parts))))

(defconst gptel--gemini-models
  '((gemini-pro
     :description "The previous generation of Google's multimodal AI model"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 32
     :input-cost 0.50
     :output-cost 1.50
     :cutoff-date "2023-02")
    (gemini-1.5-flash
     :description "A faster, more efficient version of Gemini 1.5 optimized for speed"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 1000
     ;; input & output price is halved for prompts of 128k tokens or less
     :input-cost 0.15
     :output-cost 0.60
     :cutoff-date "2024-05")
    (gemini-1.5-pro-latest
     :description "Google's latest model with enhanced capabilities across various tasks"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 2000
     ;; input & output price is halved for prompts of 128k tokens or less
     :input-cost 2.50
     :output-cost 10
     :cutoff-date "2024-05"))
  "List of available Gemini models and associated properties.
Keys:

- `:description': a brief description of the model.

- `:capabilities': a list of capabilities supported by the model.

- `:mime-types': a list of supported MIME types for media files.

- `:context-window': the context window size, in thousands of tokens.

- `:input-cost': the input cost, in US dollars per million tokens.

- `:output-cost': the output cost, in US dollars per million tokens.

- `:cutoff-date': the knowledge cutoff date.

- `:request-params': a plist of additional request parameters to
  include when using this model.

Information about the Gemini models was obtained from the following
sources:

- <https://ai.google.dev/pricing>
- <https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models>")

;;;###autoload
(cl-defun gptel-make-gemini
    (name &key curl-args header key request-params
          (stream nil)
          (host "generativelanguage.googleapis.com")
          (protocol "https")
          (models gptel--gemini-models)
          (endpoint "/v1beta/models"))

  "Register a Gemini backend for gptel with NAME.

Keyword arguments:

CURL-ARGS (optional) is a list of additional Curl arguments.

HOST (optional) is the API host, defaults to
\"generativelanguage.googleapis.com\".

MODELS is a list of available model names, as symbols.
Additionally, you can specify supported LLM capabilities like
vision or tool-use by appending a plist to the model with more
information, in the form

 (model-name . plist)

For a list of currently recognized plist keys, see
`gptel--gemini-models'.  An example of a model specification
including both kinds of specs:

:models
\\='(gemini-pro                            ;Simple specs
  gemini-1.5-flash
  (gemini-1.5-pro-latest                ;Full spec
   :description
   \"Complex reasoning tasks, problem solving and data extraction\"
   :capabilities (tool json)
   :mime-types
   (\"image/jpeg\" \"image/png\" \"image/webp\" \"image/heic\")))


STREAM is a boolean to enable streaming responses, defaults to
false.

PROTOCOL (optional) specifies the protocol, \"https\" by default.

ENDPOINT (optional) is the API endpoint for completions, defaults to
\"/v1beta/models\".

HEADER (optional) is for additional headers to send with each
request.  It should be an alist or a function that returns an
alist, like:
 ((\"Content-Type\" . \"application/json\"))

KEY (optional) is a variable whose value is the API key, or
function that returns the key.

REQUEST-PARAMS (optional) is a plist of additional HTTP request
parameters (as plist keys) and values supported by the API.  Use
these to set parameters that gptel does not provide user options
for."
  (declare (indent 1))
  (let ((backend (gptel--make-gemini
                  :curl-args curl-args
                  :name name
                  :host host
                  :header header
                  :models (gptel--process-models models)
                  :protocol protocol
                  :endpoint endpoint
                  :stream stream
                  :request-params request-params
                  :key key
                  ;; The URL is computed per request: it embeds the model
                  ;; in use, the method (streaming or not) and the API key.
                  :url (lambda ()
                         (let ((method (if (and stream
                                                gptel-stream)
                                           "streamGenerateContent"
                                         "generateContent")))
                           (format "%s://%s%s/%s:%s?key=%s"
                                   protocol
                                   host
                                   endpoint
                                   gptel-model
                                   method
                                   (gptel--get-api-key)))))))
    ;; Register the backend under NAME, replacing any earlier backend
    ;; registered with an `equal' name, and return it.
    (prog1 backend
      (setf (alist-get name gptel--known-backends
                       nil nil #'equal)
            backend))))

(provide 'gptel-gemini)
;;; gptel-gemini.el ends here