gptel-gemini.el (13592B)
;;; gptel-gemini.el --- Gemini support for gptel     -*- lexical-binding: t; -*-

;; Copyright (C) 2023  Karthik Chikmagalur

;; Author: Karthik Chikmagalur <karthikchikmagalur@gmail.com>

;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <https://www.gnu.org/licenses/>.

;;; Commentary:

;; This file adds support for the Gemini API to gptel

;;; Code:
(require 'gptel)
(require 'cl-generic)
(require 'map)
(eval-when-compile (require 'cl-lib))

(declare-function prop-match-value "text-property-search")
(declare-function text-property-search-backward "text-property-search")
(declare-function json-read "json")
(declare-function gptel-context--wrap "gptel-context")
(declare-function gptel-context--collect-media "gptel-context")
(defvar json-object-type)

;;; Gemini
(cl-defstruct
    (gptel-gemini (:constructor gptel--make-gemini)
                  (:copier nil)
                  (:include gptel-backend)))

(cl-defmethod gptel-curl--parse-stream ((_backend gptel-gemini) _info)
  "Parse JSON objects after point and return their concatenated text.

Each object found by searching forward for \"{\" is read with
`gptel--json-read'; the text of the first part of the first
candidate is collected.  If an error is signaled, point is moved
to the start of the most recent \"{\" match (presumably so that
an incomplete object can be read again later) and the text
collected so far is returned."
  (let* ((content-strs))
    (condition-case nil
        ;; while-let is Emacs 29.1+ only
        (while (prog1 (search-forward "{" nil t)
                 (backward-char 1))
          ;; Keep the match data of the "{" search intact for the error
          ;; handler below.
          (save-match-data
            (when-let*
                ((response (gptel--json-read))
                 (text (map-nested-elt
                        response '(:candidates 0 :content :parts 0 :text))))
              (push text content-strs))))
      (error
       (goto-char (match-beginning 0))))
    (apply #'concat (nreverse content-strs))))

(cl-defmethod gptel--parse-response ((_backend gptel-gemini) response _info)
  "Return the text of the first part of the first candidate in RESPONSE."
  (map-nested-elt response '(:candidates 0 :content :parts 0 :text)))

(cl-defmethod gptel--request-data ((_backend gptel-gemini) prompts)
  "JSON encode PROMPTS for sending to Gemini.

Return a plist holding the prompts, the safety settings, the
system instruction (when applicable) and the generation config,
merged with the backend and model request parameters."
  ;; HACK (backwards compatibility) Prepend the system message to the first user
  ;; prompt, but only for gemini-pro.
  (when (and (equal gptel-model 'gemini-pro) gptel--system-message)
    (cl-callf
        (lambda (msg)
          (vconcat `((:text ,(concat gptel--system-message "\n\n"))) msg))
        (thread-first (car prompts)
                      (plist-get :parts))))
  (let ((prompts-plist
         ;; Build the top-level plist with `list' so that it is freshly
         ;; allocated on every call.  A backquoted template may share its
         ;; constant tail between calls, in which case the `plist-put' calls
         ;; below would modify that constant and leak `:system_instruction'
         ;; and `:generationConfig' into later requests.
         (list :contents (vconcat prompts)
               :safetySettings [(:category "HARM_CATEGORY_HARASSMENT"
                                 :threshold "BLOCK_NONE")
                                (:category "HARM_CATEGORY_SEXUALLY_EXPLICIT"
                                 :threshold "BLOCK_NONE")
                                (:category "HARM_CATEGORY_DANGEROUS_CONTENT"
                                 :threshold "BLOCK_NONE")
                                (:category "HARM_CATEGORY_HATE_SPEECH"
                                 :threshold "BLOCK_NONE")]))
        params)
    ;; HACK only gemini-pro doesn't support system messages.  Need a less hacky
    ;; way to do this.
    (when (and gptel--system-message
               (not (gptel--model-capable-p 'nosystem))
               (not (equal gptel-model 'gemini-pro)))
      (setq prompts-plist
            (plist-put prompts-plist :system_instruction
                       `(:parts (:text ,gptel--system-message)))))
    (when gptel-temperature
      (setq params
            (plist-put params
                       ;; Clamp to the range [0.0, 2.0].  The previous
                       ;; `(max gptel-temperature 1.0)' raised every value
                       ;; below 1.0 to 1.0, ignoring low temperatures.
                       :temperature (min (max gptel-temperature 0.0) 2.0))))
    (when gptel-max-tokens
      (setq params
            (plist-put params
                       :maxOutputTokens gptel-max-tokens)))
    (when params
      (setq prompts-plist
            (plist-put prompts-plist
                       :generationConfig params)))
    ;; Merge request params with model and backend params.
    (gptel--merge-plists
     prompts-plist
     (gptel-backend-request-params gptel-backend)
     (gptel--model-request-params gptel-model))))

(cl-defmethod gptel--parse-list ((_backend gptel-gemini) prompt-list)
  "Convert PROMPT-LIST, a list of strings, into a list of Gemini prompts.

Entries alternate between the \"user\" and \"model\" roles,
starting with \"user\".  A nil entry produces no prompt but still
takes up its turn in the alternation."
  (cl-loop for text in prompt-list
           for role = t then (not role)
           if text
           if role
           collect (list :role "user" :parts `[(:text ,text)]) into prompts
           else collect (list :role "model" :parts `(:text ,text)) into prompts
           finally return prompts))

(cl-defmethod gptel--parse-buffer ((_backend gptel-gemini) &optional max-entries)
  "Parse the buffer backwards from point into a list of Gemini prompts.

When `gptel-mode' or `gptel-track-response' is non-nil, search
backwards over regions of the `gptel' text property: a region
with a non-nil property value becomes a \"model\" prompt, any
other region a \"user\" prompt.  If MAX-ENTRIES is non-nil the
search stops once that counter has dropped below zero.

Otherwise the whole accessible portion of the buffer is sent as a
single \"user\" prompt."
  (let ((prompts) (prop)
        (include-media (and gptel-track-media (or (gptel--model-capable-p 'media)
                                                  (gptel--model-capable-p 'url)))))
    (if (or gptel-mode gptel-track-response)
        (while (and
                (or (not max-entries) (>= max-entries 0))
                (setq prop (text-property-search-backward
                            'gptel 'response
                            (when (get-char-property (max (point-min) (1- (point)))
                                                     'gptel)
                              t))))
          (if (prop-match-value prop)   ;assistant role
              (push (list :role "model"
                          :parts
                          (list :text (buffer-substring-no-properties (prop-match-beginning prop)
                                                                      (prop-match-end prop))))
                    prompts)
            (if include-media
                (push (list :role "user"
                            :parts (gptel--gemini-parse-multipart
                                    (gptel--parse-media-links
                                     major-mode (prop-match-beginning prop) (prop-match-end prop))))
                      prompts)
              (push (list :role "user"
                          :parts
                          `[(:text ,(gptel--trim-prefixes
                                     (buffer-substring-no-properties (prop-match-beginning prop)
                                                                     (prop-match-end prop))))])
                    prompts)))
          (and max-entries (cl-decf max-entries)))
      (push (list :role "user"
                  :parts
                  `[(:text ,(string-trim (buffer-substring-no-properties (point-min) (point-max))))])
            prompts))
    prompts))

(defun gptel--gemini-parse-multipart (parts)
  "Convert a multipart prompt PARTS to the Gemini API format.

The input is a list of plists of the form
 ((:text \"some text\")
  (:media \"/path/to/media.png\" :mime \"image/png\")
  (:text \"More text\")).

Text in the first and last parts is passed through
`gptel--trim-prefixes', and text parts that are empty are
dropped.  Media parts are converted to inline data encoded with
`gptel--base64-encode'.

The output is a vector of entries in a backend-appropriate
format."
  (cl-loop
   for part in parts
   for n upfrom 1
   with last = (length parts)
   for text = (plist-get part :text)
   for media = (plist-get part :media)
   if text do
   (and (or (= n 1) (= n last)) (setq text (gptel--trim-prefixes text))) and
   unless (string-empty-p text)
   collect (list :text text) into parts-array end
   else if media
   collect
   `(:inline_data
     (:mime_type ,(plist-get part :mime)
      :data ,(gptel--base64-encode media)))
   into parts-array
   finally return (vconcat parts-array)))

(cl-defmethod gptel--wrap-user-prompt ((_backend gptel-gemini) prompts
                                       &optional inject-media)
  "Wrap the last user prompt in PROMPTS with the context string.

If INJECT-MEDIA is non-nil, instead prepend the media files in
the context, converted with `gptel--gemini-parse-multipart', to
the parts of the first prompt in PROMPTS."
  (if inject-media
      ;; Wrap the first user prompt with included media files/contexts
      (when-let* ((media-list (gptel-context--collect-media)))
        (cl-callf (lambda (current)
                    (vconcat (gptel--gemini-parse-multipart media-list)
                             current))
            (plist-get (car prompts) :parts)))
    ;; Wrap the last user prompt with included text contexts
    (cl-callf (lambda (current)
                (if-let* ((wrapped (gptel-context--wrap nil)))
                    (vconcat `((:text ,wrapped)) current)
                  current))
        (plist-get (car (last prompts)) :parts))))

(defconst gptel--gemini-models
  '((gemini-pro
     :description "The previous generation of Google's multimodal AI model"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 32
     :input-cost 0.50
     :output-cost 1.50
     :cutoff-date "2023-02")
    (gemini-1.5-flash
     :description "A faster, more efficient version of Gemini 1.5 optimized for speed"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 1000
     ;; input & output price is halved for prompts of 128k tokens or less
     :input-cost 0.15
     :output-cost 0.60
     :cutoff-date "2024-05")
    (gemini-1.5-pro-latest
     :description "Google's latest model with enhanced capabilities across various tasks"
     :capabilities (tool json media)
     :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
                  "application/pdf" "text/plain" "text/csv" "text/html")
     :context-window 2000
     ;; input & output price is halved for prompts of 128k tokens or less
     :input-cost 2.50
     :output-cost 10
     :cutoff-date "2024-05"))
  "List of available Gemini models and associated properties.
Keys:

- `:description': a brief description of the model.

- `:capabilities': a list of capabilities supported by the model.

- `:mime-types': a list of supported MIME types for media files.

- `:context-window': the context window size, in thousands of tokens.

- `:input-cost': the input cost, in US dollars per million tokens.

- `:output-cost': the output cost, in US dollars per million tokens.

- `:cutoff-date': the knowledge cutoff date.

- `:request-params': a plist of additional request parameters to
  include when using this model.

Information about the Gemini models was obtained from the following
sources:

- <https://ai.google.dev/pricing>
- <https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models>")

;;;###autoload
(cl-defun gptel-make-gemini
    (name &key curl-args header key request-params
          (stream nil)
          (host "generativelanguage.googleapis.com")
          (protocol "https")
          (models gptel--gemini-models)
          (endpoint "/v1beta/models"))

  "Register a Gemini backend for gptel with NAME.

Keyword arguments:

CURL-ARGS (optional) is a list of additional Curl arguments.

HOST (optional) is the API host, defaults to
\"generativelanguage.googleapis.com\".

MODELS is a list of available model names, as symbols.
Additionally, you can specify supported LLM capabilities like
vision or tool-use by appending a plist to the model with more
information, in the form

 (model-name . plist)

For a list of currently recognized plist keys, see
`gptel--gemini-models'.  An example of a model specification
including both kinds of specs:

:models
\\='(gemini-pro                            ;Simple specs
  gemini-1.5-flash
  (gemini-1.5-pro-latest                ;Full spec
   :description
   \"Complex reasoning tasks, problem solving and data extraction\"
   :capabilities (tool json)
   :mime-types
   (\"image/jpeg\" \"image/png\" \"image/webp\" \"image/heic\")))


STREAM is a boolean to enable streaming responses, defaults to
false.

PROTOCOL (optional) specifies the protocol, \"https\" by default.

ENDPOINT (optional) is the API endpoint for completions, defaults to
\"/v1beta/models\".

HEADER (optional) is for additional headers to send with each
request.  It should be an alist or a function that returns an
alist, like:
 ((\"Content-Type\" . \"application/json\"))

KEY (optional) is a variable whose value is the API key, or
function that returns the key.

REQUEST-PARAMS (optional) is a plist of additional HTTP request
parameters (as plist keys) and values supported by the API.  Use
these to set parameters that gptel does not provide user options
for."
  (declare (indent 1))
  (let ((backend (gptel--make-gemini
                  :curl-args curl-args
                  :name name
                  :host host
                  :header header
                  :models (gptel--process-models models)
                  :protocol protocol
                  :endpoint endpoint
                  :stream stream
                  :request-params request-params
                  :key key
                  ;; The URL is computed at request time: the method depends
                  ;; on the current value of `gptel-stream', and the model
                  ;; and API key are read when the request is made.
                  :url (lambda ()
                         (let ((method (if (and stream
                                                gptel-stream)
                                           "streamGenerateContent"
                                         "generateContent")))
                           (format "%s://%s%s/%s:%s?key=%s"
                                   protocol
                                   host
                                   endpoint
                                   gptel-model
                                   method
                                   (gptel--get-api-key)))))))
    (prog1 backend
      (setf (alist-get name gptel--known-backends
                       nil nil #'equal)
            backend))))

(provide 'gptel-gemini)
;;; gptel-gemini.el ends here